ext4: serialize unaligned asynchronous DIO

author Eric Sandeen <sandeen@redhat.com>
Sat, 12 Feb 2011 13:17:34 +0000 (08:17 -0500)

committer Theodore Ts'o <tytso@mit.edu>
Sat, 12 Feb 2011 13:17:34 +0000 (08:17 -0500)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index 0c8d97b56f34858c09943407916aefdbcc80ef7f..3aa0b72b3b94b42bea4a499e836068ed991814ce 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
         atomic_t i_ioend_count; /* Number of outstanding io_end structs */
         /* current io_end structure for async DIO write*/
         ext4_io_end_t *cur_aio_dio;
+       atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
  
         spinlock_t i_block_reservation_lock;
  
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
  
  #define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
  
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ                37
+#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
+                                           EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
+                                            EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
  #endif /* __KERNEL__ */
  
  #endif /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c

index 63a75810b7c3ab1b58b802d1d456275160180fa9..ccce8a7e94edc9ca2d23b76e94bd45339869970d 100644 (file)
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                  * that this IO needs to convertion to written when IO is
                  * completed
                  */
-               if (io)
+               if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
                         io->flag = EXT4_IO_END_UNWRITTEN;
-               else
+                       atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+               } else
                         ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                 if (ext4_should_dioread_nolock(inode))
                         map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                  * that we need to perform convertion when IO is done.
                  */
                 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                       if (io)
+                       if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
                                 io->flag = EXT4_IO_END_UNWRITTEN;
-                       else
+                               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+                       } else
                                 ext4_set_inode_state(inode,
                                                      EXT4_STATE_DIO_UNWRITTEN);
                 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index 2e8322c8aa882f977410322b62d88346104b86c6..7b80d543b89e71c41764b54a5d6eee59992c712c 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
         return 0;
  }
  
+static void ext4_aiodio_wait(struct inode *inode)
+{
+       wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+       wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete.  Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block.  If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
+                  unsigned long nr_segs, loff_t pos)
+{
+       struct super_block *sb = inode->i_sb;
+       int blockmask = sb->s_blocksize - 1;
+       size_t count = iov_length(iov, nr_segs);
+       loff_t final_size = pos + count;
+
+       if (pos >= inode->i_size)
+               return 0;
+
+       if ((pos & blockmask) || (final_size & blockmask))
+               return 1;
+
+       return 0;
+}
+
  static ssize_t
  ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
                 unsigned long nr_segs, loff_t pos)
  {
         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+       int unaligned_aio = 0;
+       int ret;
  
         /*
          * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
                         nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
                                               sbi->s_bitmap_maxbytes - pos);
                 }
+       } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
+                  !is_sync_kiocb(iocb))) {
+               unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
         }
  
-       return generic_file_aio_write(iocb, iov, nr_segs, pos);
+       /* Unaligned direct AIO must be serialized; see comment above */
+       if (unaligned_aio) {
+               static unsigned long unaligned_warn_time;
+
+               /* Warn about this once per day */
+               if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+                       ext4_msg(inode->i_sb, KERN_WARNING,
+                                "Unaligned AIO/DIO on inode %ld by %s; "
+                                "performance will be poor.",
+                                inode->i_ino, current->comm);
+               mutex_lock(ext4_aio_mutex(inode));
+               ext4_aiodio_wait(inode);
+       }
+
+       ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+       if (unaligned_aio)
+               mutex_unlock(ext4_aio_mutex(inode));
+
+       return ret;
  }
  
  static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c

index 4e9b0a242f4c8a53ce7f5d1c6817e74eebf35afc..955cc309142fe983dc41d95467b1f5a66d7a6b69 100644 (file)
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
  
  static struct kmem_cache *io_page_cachep, *io_end_cachep;
  
-#define WQ_HASH_SZ             37
-#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
-static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
-
  int __init ext4_init_pageio(void)
  {
-       int i;
-
         io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
         if (io_page_cachep == NULL)
                 return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
                 kmem_cache_destroy(io_page_cachep);
                 return -ENOMEM;
         }
-       for (i = 0; i < WQ_HASH_SZ; i++)
-               init_waitqueue_head(&ioend_wq[i]);
-
         return 0;
  }
  
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
  
  void ext4_ioend_wait(struct inode *inode)
  {
-       wait_queue_head_t *wq = to_ioend_wq(inode);
+       wait_queue_head_t *wq = ext4_ioend_wq(inode);
  
         wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
  }
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
         for (i = 0; i < io->num_io_pages; i++)
                 put_io_page(io->pages[i]);
         io->num_io_pages = 0;
-       wq = to_ioend_wq(io->inode);
+       wq = ext4_ioend_wq(io->inode);
         if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
             waitqueue_active(wq))
                 wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
         struct inode *inode = io->inode;
         loff_t offset = io->offset;
         ssize_t size = io->size;
+       wait_queue_head_t *wq;
         int ret = 0;
  
         ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
         if (io->iocb)
                 aio_complete(io->iocb, io->result, 0);
         /* clear the DIO AIO unwritten flag */
-       io->flag &= ~EXT4_IO_END_UNWRITTEN;
+       if (io->flag & EXT4_IO_END_UNWRITTEN) {
+               io->flag &= ~EXT4_IO_END_UNWRITTEN;
+               /* Wake up anyone waiting on unwritten extent conversion */
+               wq = ext4_ioend_wq(io->inode);
+               if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
+                   waitqueue_active(wq)) {
+                       wake_up_all(wq);
+               }
+       }
+
         return ret;
  }
  
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index 86b05486dc631af4292ff2d70a0051bd692a131b..f6a318f836b2cd7d5c2e70af3ae1bde6b6d301ce 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -833,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
         ei->i_sync_tid = 0;
         ei->i_datasync_tid = 0;
         atomic_set(&ei->i_ioend_count, 0);
+       atomic_set(&ei->i_aiodio_unwritten, 0);
  
         return &ei->vfs_inode;
  }
@@ -4800,11 +4801,21 @@ static void ext4_exit_feat_adverts(void)
         kfree(ext4_feat);
  }
  
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
  static int __init ext4_init_fs(void)
  {
-       int err;
+       int i, err;
  
         ext4_check_flag_values();
+
+       for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+               mutex_init(&ext4__aio_mutex[i]);
+               init_waitqueue_head(&ext4__ioend_wq[i]);
+       }
+
         err = ext4_init_pageio();
         if (err)
                 return err;
author	Eric Sandeen <sandeen@redhat.com>
	Sat, 12 Feb 2011 13:17:34 +0000 (08:17 -0500)
committer	Theodore Ts'o <tytso@mit.edu>
	Sat, 12 Feb 2011 13:17:34 +0000 (08:17 -0500)
fs/ext4/ext4.h		patch | blob | blame | history
fs/ext4/extents.c		patch | blob | blame | history
fs/ext4/file.c		patch | blob | blame | history
fs/ext4/page-io.c		patch | blob | blame | history
fs/ext4/super.c		patch | blob | blame | history