ext4: refactor direct IO code

author Jan Kara <jack@suse.cz>
Fri, 13 May 2016 04:44:16 +0000 (00:44 -0400)

committer Theodore Ts'o <tytso@mit.edu>
Fri, 13 May 2016 04:44:16 +0000 (00:44 -0400)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index ba5aecc07fbc842e61a5176be8b693fed3e840d3..89e1bcb21341030145d76678d328695fe2a54585 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2587,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
  /* indirect.c */
  extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                 struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-                                 loff_t offset);
  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
  extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
  extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c

index 3027fa681de537c586289a26959f8d4f37ae025b..bc15c2c17633079a54de855baf1272b0124f19eb 100644 (file)
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -648,133 +648,6 @@ out:
         return err;
  }
  
-/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-                          loff_t offset)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       handle_t *handle;
-       ssize_t ret;
-       int orphan = 0;
-       size_t count = iov_iter_count(iter);
-       int retries = 0;
-
-       if (iov_iter_rw(iter) == WRITE) {
-               loff_t final_size = offset + count;
-
-               if (final_size > inode->i_size) {
-                       /* Credits for sb + inode write */
-                       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-                       if (IS_ERR(handle)) {
-                               ret = PTR_ERR(handle);
-                               goto out;
-                       }
-                       ret = ext4_orphan_add(handle, inode);
-                       if (ret) {
-                               ext4_journal_stop(handle);
-                               goto out;
-                       }
-                       orphan = 1;
-                       ei->i_disksize = inode->i_size;
-                       ext4_journal_stop(handle);
-               }
-       }
-
-retry:
-       if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
-               /*
-                * Nolock dioread optimization may be dynamically disabled
-                * via ext4_inode_block_unlocked_dio(). Check inode's state
-                * while holding extra i_dio_count ref.
-                */
-               inode_dio_begin(inode);
-               smp_mb();
-               if (unlikely(ext4_test_inode_state(inode,
-                                                   EXT4_STATE_DIOREAD_LOCK))) {
-                       inode_dio_end(inode);
-                       goto locked;
-               }
-               if (IS_DAX(inode))
-                       ret = dax_do_io(iocb, inode, iter, offset,
-                                       ext4_dio_get_block, NULL, 0);
-               else
-                       ret = __blockdev_direct_IO(iocb, inode,
-                                                  inode->i_sb->s_bdev, iter,
-                                                  offset, ext4_dio_get_block,
-                                                  NULL, NULL, 0);
-               inode_dio_end(inode);
-       } else {
-locked:
-               if (IS_DAX(inode))
-                       ret = dax_do_io(iocb, inode, iter, offset,
-                                       ext4_dio_get_block, NULL, DIO_LOCKING);
-               else
-                       ret = blockdev_direct_IO(iocb, inode, iter, offset,
-                                                ext4_dio_get_block);
-
-               if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
-                       loff_t isize = i_size_read(inode);
-                       loff_t end = offset + count;
-
-                       if (end > isize)
-                               ext4_truncate_failed_write(inode);
-               }
-       }
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-
-       if (orphan) {
-               int err;
-
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       /* This is really bad luck. We've written the data
-                        * but cannot extend i_size. Bail out and pretend
-                        * the write failed... */
-                       ret = PTR_ERR(handle);
-                       if (inode->i_nlink)
-                               ext4_orphan_del(NULL, inode);
-
-                       goto out;
-               }
-               if (inode->i_nlink)
-                       ext4_orphan_del(handle, inode);
-               if (ret > 0) {
-                       loff_t end = offset + ret;
-                       if (end > inode->i_size) {
-                               ei->i_disksize = end;
-                               i_size_write(inode, end);
-                               /*
-                                * We're going to return a positive `ret'
-                                * here due to non-zero-length I/O, so there's
-                                * no way of reporting error returns from
-                                * ext4_mark_inode_dirty() to userspace.  So
-                                * ignore it.
-                                */
-                               ext4_mark_inode_dirty(handle, inode);
-                       }
-               }
-               err = ext4_journal_stop(handle);
-               if (ret == 0)
-                       ret = err;
-       }
-out:
-       return ret;
-}
-
  /*
   * Calculate the number of metadata blocks need to reserve
   * to allocate a new block at @lblocks for non extent file based file
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 32825dee81d43a5d020f9a4dccf4f6dbb6b0ebb8..4879e93c91d320e533eb9b36ef82c06ecf37cd9d 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3295,7 +3295,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  }
  
  /*
- * For ext4 extent files, ext4 will do direct-io write to holes,
+ * Handling of direct IO writes.
+ *
+ * For ext4 extent files, ext4 will do direct-io write even to holes,
   * preallocated extents, and those write extend the file, no need to
   * fall back to buffered IO.
   *
@@ -3313,21 +3315,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
   * if the machine crashes during the write.
   *
   */
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-                                 loff_t offset)
+static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
+                                   loff_t offset)
  {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
+       struct ext4_inode_info *ei = EXT4_I(inode);
         ssize_t ret;
         size_t count = iov_iter_count(iter);
         int overwrite = 0;
         get_block_t *get_block_func = NULL;
         int dio_flags = 0;
         loff_t final_size = offset + count;
+       int orphan = 0;
+       handle_t *handle;
  
-       /* Use the old path for reads and writes beyond i_size. */
-       if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
-               return ext4_ind_direct_IO(iocb, iter, offset);
+       if (final_size > inode->i_size) {
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               ret = ext4_orphan_add(handle, inode);
+               if (ret) {
+                       ext4_journal_stop(handle);
+                       goto out;
+               }
+               orphan = 1;
+               ei->i_disksize = inode->i_size;
+               ext4_journal_stop(handle);
+       }
  
         BUG_ON(iocb->private == NULL);
  
@@ -3336,8 +3354,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
          * conversion. This also disallows race between truncate() and
          * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
          */
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_begin(inode);
+       inode_dio_begin(inode);
  
         /* If we do a overwrite dio, i_mutex locking can be released */
         overwrite = *((int *)iocb->private);
@@ -3346,7 +3363,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                 inode_unlock(inode);
  
         /*
-        * We could direct write to holes and fallocate.
+        * For extent mapped files we could direct write to holes and fallocate.
          *
          * Allocated blocks to fill the hole are marked as unwritten to prevent
          * parallel buffered read to expose the stale data before DIO complete
@@ -3368,7 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
         iocb->private = NULL;
         if (overwrite)
                 get_block_func = ext4_dio_get_block_overwrite;
-       else if (is_sync_kiocb(iocb)) {
+       else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+                round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+               get_block_func = ext4_dio_get_block;
+               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+       } else if (is_sync_kiocb(iocb)) {
                 get_block_func = ext4_dio_get_block_unwritten_sync;
                 dio_flags = DIO_LOCKING;
         } else {
@@ -3378,10 +3399,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
         BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
  #endif
-       if (IS_DAX(inode))
+       if (IS_DAX(inode)) {
+               dio_flags &= ~DIO_SKIP_HOLES;
                 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
                                 ext4_end_io_dio, dio_flags);
-       else
+       } else
                 ret = __blockdev_direct_IO(iocb, inode,
                                            inode->i_sb->s_bdev, iter, offset,
                                            get_block_func,
@@ -3401,12 +3423,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
         }
  
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_end(inode);
+       inode_dio_end(inode);
         /* take i_mutex locking again if we do a ovewrite dio */
         if (overwrite)
                 inode_lock(inode);
  
+       if (ret < 0 && final_size > inode->i_size)
+               ext4_truncate_failed_write(inode);
+
+       /* Handle extending of i_size after direct IO write */
+       if (orphan) {
+               int err;
+
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       /* This is really bad luck. We've written the data
+                        * but cannot extend i_size. Bail out and pretend
+                        * the write failed... */
+                       ret = PTR_ERR(handle);
+                       if (inode->i_nlink)
+                               ext4_orphan_del(NULL, inode);
+
+                       goto out;
+               }
+               if (inode->i_nlink)
+                       ext4_orphan_del(handle, inode);
+               if (ret > 0) {
+                       loff_t end = offset + ret;
+                       if (end > inode->i_size) {
+                               ei->i_disksize = end;
+                               i_size_write(inode, end);
+                               /*
+                                * We're going to return a positive `ret'
+                                * here due to non-zero-length I/O, so there's
+                                * no way of reporting error returns from
+                                * ext4_mark_inode_dirty() to userspace.  So
+                                * ignore it.
+                                */
+                               ext4_mark_inode_dirty(handle, inode);
+                       }
+               }
+               err = ext4_journal_stop(handle);
+               if (ret == 0)
+                       ret = err;
+       }
+out:
+       return ret;
+}
+
+static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
+                                  loff_t offset)
+{
+       int unlocked = 0;
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+       ssize_t ret;
+
+       if (ext4_should_dioread_nolock(inode)) {
+               /*
+                * Nolock dioread optimization may be dynamically disabled
+                * via ext4_inode_block_unlocked_dio(). Check inode's state
+                * while holding extra i_dio_count ref.
+                */
+               inode_dio_begin(inode);
+               smp_mb();
+               if (unlikely(ext4_test_inode_state(inode,
+                                                   EXT4_STATE_DIOREAD_LOCK)))
+                       inode_dio_end(inode);
+               else
+                       unlocked = 1;
+       }
+       if (IS_DAX(inode)) {
+               ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
+                               NULL, unlocked ? 0 : DIO_LOCKING);
+       } else {
+               ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+                                          iter, offset, ext4_dio_get_block,
+                                          NULL, NULL,
+                                          unlocked ? 0 : DIO_LOCKING);
+       }
+       if (unlocked)
+               inode_dio_end(inode);
         return ret;
  }
  
@@ -3434,10 +3531,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                 return 0;
  
         trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               ret = ext4_ext_direct_IO(iocb, iter, offset);
+       if (iov_iter_rw(iter) == READ)
+               ret = ext4_direct_IO_read(iocb, iter, offset);
         else
-               ret = ext4_ind_direct_IO(iocb, iter, offset);
+               ret = ext4_direct_IO_write(iocb, iter, offset);
         trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
         return ret;
  }
author	Jan Kara <jack@suse.cz>
	Fri, 13 May 2016 04:44:16 +0000 (00:44 -0400)
committer	Theodore Ts'o <tytso@mit.edu>
	Fri, 13 May 2016 04:44:16 +0000 (00:44 -0400)
fs/ext4/ext4.h		patch | blob | blame | history
fs/ext4/indirect.c		patch | blob | blame | history
fs/ext4/inode.c		patch | blob | blame | history