Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 May 2013 15:04:12 +0000 (08:04 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 May 2013 15:04:12 +0000 (08:04 -0700)
Pull ext4 updates from Ted Ts'o:
 "Mostly performance and bug fixes, plus some cleanups.  The one new
  feature this merge window is a new ioctl EXT4_IOC_SWAP_BOOT which
  allows installation of a hidden inode designed for boot loaders."
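
   For reference, the new ioctl takes no argument and is issued on a file
   descriptor for the inode whose blocks should be swapped with the boot
   loader inode.  A minimal userspace sketch (not part of this pull; the
   "newboot" path and the error handling are illustrative only):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	/* From fs/ext4/ext4.h in this pull: _IO('f', 17), no argument. */
	#ifndef EXT4_IOC_SWAP_BOOT
	#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
	#endif

	int main(int argc, char **argv)
	{
		/* Swap this file's blocks with inode #5, the boot loader inode. */
		int fd = open(argc > 1 ? argv[1] : "newboot", O_RDWR);

		if (fd < 0 || ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
			perror("EXT4_IOC_SWAP_BOOT");
			return 1;
		}
		close(fd);
		return 0;
	}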

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
  ext4: fix type-widening bug in inode table readahead code
  ext4: add check for inodes_count overflow in new resize ioctl
  ext4: fix Kconfig documentation for CONFIG_EXT4_DEBUG
  ext4: fix online resizing for ext3-compat file systems
  jbd2: trace when lock_buffer in do_get_write_access takes a long time
  ext4: mark metadata blocks using bh flags
  buffer: add BH_Prio and BH_Meta flags
  ext4: mark all metadata I/O with REQ_META
  ext4: fix readdir error in case inline_data+^dir_index.
  ext4: fix readdir error in the case of inline_data+dir_index
  jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset
  ext4: mext_insert_extents should update extent block checksum
  ext4: move quota initialization out of inode allocation transaction
  ext4: reserve xattr index for Rich ACL support
  jbd2: reduce journal_head size
  ext4: clear buffer_uninit flag when submitting IO
  ext4: use io_end for multiple bios
  ext4: make ext4_bio_write_page() use BH_Async_Write flags
  ext4: Use kstrtoul() instead of parse_strtoul()
  ext4: defragmentation code cleanup
  ...

34 files changed:
Documentation/filesystems/ext4.txt
fs/buffer.c
fs/ext4/Kconfig
fs/ext4/balloc.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/indirect.c
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/migrate.c
fs/ext4/mmp.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/ext4/xattr.h
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
include/linux/buffer_head.h
include/linux/jbd2.h
include/linux/journal-head.h
include/trace/events/ext4.h
include/trace/events/jbd2.h

index 34ea4f1fa6ea7eefd359fb09e1605a6d6948910b..f7cbf574a875271296d93bee53b290ca36c44a66 100644 (file)
@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
  session_write_kbytes         This file is read-only and shows the number of
                               kilobytes of data that have been written to this
                               filesystem since it was mounted.
+
+ reserved_clusters            This is a read-write file containing the number
+                              of reserved clusters in the file system, which
+                              will be used in specific situations to avoid
+                              costly zeroout, unexpected ENOSPC, or possible
+                              data loss. The default is 2% of the clusters or
+                              4096 clusters, whichever is smaller; this can be
+                              changed but can never exceed the number of
+                              clusters in the file system. If there is not
+                              enough space for the reserved space when
+                              mounting the file system, mount will _not_ fail.
 ..............................................................................
 
 Ioctls
@@ -587,6 +598,16 @@ Table of Ext4 specific ioctls
                              bitmaps and inode table, the userspace tool thus
                              just passes the new number of blocks.
 
+EXT4_IOC_SWAP_BOOT           Swap i_blocks and associated attributes
+                             (i_size, i_flags, ...) of the specified
+                             inode with inode EXT4_BOOT_LOADER_INO (#5).
+                             This is typically used to store a boot
+                             loader in a secure part of the filesystem,
+                             where it can't be changed by a normal user
+                             by accident.
+                             The data blocks of the previous boot loader
+                             will be associated with the given inode.
+
 ..............................................................................
 
 References
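
The default mentioned in the reserved_clusters description above (2% of the
clusters, capped at 4096) works out to roughly the following; this is a
sketch of the documented policy only, and the helper name is made up, not
the one the kernel uses:

	/* Documented default for the reserved cluster pool. */
	static unsigned long long default_resv_clusters(unsigned long long clusters)
	{
		unsigned long long two_percent = clusters / 50;	/* 2% */

		return two_percent < 4096 ? two_percent : 4096;
	}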
index 10ef81e10b207aefea70fee5a5a413b23b83a5c5..bc1fe14aaa3e4583aa351298fc9faa4f42d2d6a9 100644 (file)
@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
        /* Take care of bh's that straddle the end of the device */
        guard_bh_eod(rw, bio, bh);
 
+       if (buffer_meta(bh))
+               rw |= REQ_META;
+       if (buffer_prio(bh))
+               rw |= REQ_PRIO;
+
        bio_get(bio);
        submit_bio(rw, bio);
 
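
The idea behind the two new bh flags is that a filesystem tags the buffer
once and every later submit_bh() picks up REQ_META/REQ_PRIO automatically,
instead of threading the request flags through each call site.  A sketch of
a caller (this mirrors what __ext4_handle_dirty_metadata() does further
down; the wrapper name here is hypothetical):

	/* Tag bh as high-priority metadata, then submit it for write. */
	static void submit_meta_bh(struct buffer_head *bh)
	{
		set_buffer_meta(bh);	/* _submit_bh() ORs in REQ_META */
		set_buffer_prio(bh);	/* _submit_bh() ORs in REQ_PRIO */
		submit_bh(WRITE, bh);
	}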
index 987358740cb970dab82917f952f8cc8bfae4361c..efea5d5c44ce4e0cf0910db50020dca4008779c0 100644 (file)
@@ -71,4 +71,5 @@ config EXT4_DEBUG
          Enables run-time debugging support for the ext4 filesystem.
 
          If you select Y here, then you will be able to turn on debugging
-         with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+         with a command such as:
+               echo 1 > /sys/module/ext4/parameters/mballoc_debug
index 92e68b33fffde26bd98c899010589ffaea70f699..d0f13eada0ed5799295a5378642cd3786cf32595 100644 (file)
@@ -29,6 +29,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
  * balloc.c contains the blocks allocation and deallocation routines
  */
 
+/*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+                                  ext4_fsblk_t block)
+{
+       ext4_group_t group;
+
+       if (test_opt2(sb, STD_GROUP_SIZE))
+               group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+                        block) >>
+                       (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+       else
+               ext4_get_group_no_and_offset(sb, block, &group, NULL);
+       return group;
+}
+
 /*
  * Calculate the block group number and offset into the block/cluster
  * allocation bitmap, given a block number
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 }
 
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
-                       ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+                                     ext4_fsblk_t block,
+                                     ext4_group_t block_group)
 {
        ext4_group_t actual_group;
-       ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
-       if (actual_group == block_group)
-               return 1;
-       return 0;
+
+       actual_group = ext4_get_group_number(sb, block);
+       return (actual_group == block_group) ? 1 : 0;
 }
 
 /* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
        trace_ext4_read_block_bitmap_load(sb, block_group);
        bh->b_end_io = ext4_end_bitmap_read;
        get_bh(bh);
-       submit_bh(READ, bh);
+       submit_bh(READ | REQ_META | REQ_PRIO, bh);
        return bh;
 verify:
        ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
                                  s64 nclusters, unsigned int flags)
 {
-       s64 free_clusters, dirty_clusters, root_clusters;
+       s64 free_clusters, dirty_clusters, rsv, resv_clusters;
        struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
        struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
        free_clusters  = percpu_counter_read_positive(fcc);
        dirty_clusters = percpu_counter_read_positive(dcc);
+       resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
        /*
         * r_blocks_count should always be a multiple of the cluster ratio so
         * we are safe to do a plain bit shift only.
         */
-       root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+       rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+             resv_clusters;
 
-       if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+       if (free_clusters - (nclusters + rsv + dirty_clusters) <
                                        EXT4_FREECLUSTERS_WATERMARK) {
                free_clusters  = percpu_counter_sum_positive(fcc);
                dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
        /* Check whether we have space after accounting for current
         * dirty clusters & root reserved clusters.
         */
-       if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+       if (free_clusters >= (rsv + nclusters + dirty_clusters))
                return 1;
 
        /* Hm, nope.  Are (enough) root reserved clusters available? */
        if (uid_eq(sbi->s_resuid, current_fsuid()) ||
            (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
            capable(CAP_SYS_RESOURCE) ||
-               (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+           (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+               if (free_clusters >= (nclusters + dirty_clusters +
+                                     resv_clusters))
+                       return 1;
+       }
+       /* No free blocks. Let's see if we can dip into the reserved pool */
+       if (flags & EXT4_MB_USE_RESERVED) {
                if (free_clusters >= (nclusters + dirty_clusters))
                        return 1;
        }
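
The check above now proceeds in three tiers: ordinary allocations must
leave both the root reserve and the new reserved pool untouched; privileged
callers (resuid/resgid, CAP_SYS_RESOURCE, or EXT4_MB_USE_ROOT_BLOCKS) may
eat into the root reserve but not the reserved pool; and only
EXT4_MB_USE_RESERVED allocations may drain the pool itself.  A condensed
sketch of the same ordering ("root_clusters" and "privileged" are shorthand
here, not variables from the patch):

	s64 avail = free_clusters - dirty_clusters - nclusters;

	if (avail >= root_clusters + resv_clusters)
		return 1;	/* anyone may allocate */
	if (privileged && avail >= resv_clusters)
		return 1;	/* may dip into the root reserve */
	if ((flags & EXT4_MB_USE_RESERVED) && avail >= 0)
		return 1;	/* may consume the reserved pool */
	return 0;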
index d8cd1f0f4661e98e7b84f9c0e10ff81192179f8c..f8d56e4254e05b2866c98e7f6674b71109669cfb 100644 (file)
@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
        if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
                     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
            ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-            ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+            ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+            ext4_has_inline_data(inode)))
                return 1;
 
        return 0;
@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
        int ret = 0;
        int dir_has_error = 0;
 
-       if (ext4_has_inline_data(inode)) {
-               int has_inline_data = 1;
-               ret = ext4_read_inline_dir(filp, dirent, filldir,
-                                          &has_inline_data);
-               if (has_inline_data)
-                       return ret;
-       }
-
        if (is_dx_dir(inode)) {
                err = ext4_dx_readdir(filp, dirent, filldir);
                if (err != ERR_BAD_DX_DIR) {
@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
                ext4_clear_inode_flag(file_inode(filp),
                                      EXT4_INODE_INDEX);
        }
+
+       if (ext4_has_inline_data(inode)) {
+               int has_inline_data = 1;
+               ret = ext4_read_inline_dir(filp, dirent, filldir,
+                                          &has_inline_data);
+               if (has_inline_data)
+                       return ret;
+       }
+
        stored = 0;
        offset = filp->f_pos & (sb->s_blocksize - 1);
 
index 3b83cd6047964ab7218aeb26ab73bce7be472430..0aabb344b02e6779ed410adff60a9c0e3ab4003e 100644 (file)
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_STREAM_ALLOC           0x0800
 /* Use reserved root blocks if needed */
 #define EXT4_MB_USE_ROOT_BLOCKS                0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED           0x2000
 
 struct ext4_allocation_request {
        /* target inode for block we're allocating */
@@ -196,19 +198,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR      0x0002
 #define EXT4_IO_END_DIRECT     0x0004
 
-struct ext4_io_page {
-       struct page     *p_page;
-       atomic_t        p_count;
-};
-
-#define MAX_IO_PAGES 128
-
 /*
  * For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place.  For AIO/DIO, neither field is filled in.
  */
 typedef struct ext4_io_end {
        struct list_head        list;           /* per-file finished IO list */
@@ -218,15 +209,13 @@ typedef struct ext4_io_end {
        ssize_t                 size;           /* size of the extent */
        struct kiocb            *iocb;          /* iocb struct for AIO */
        int                     result;         /* error value for AIO */
-       int                     num_io_pages;   /* for writepages() */
-       struct ext4_io_page     *pages[MAX_IO_PAGES]; /* for writepages() */
+       atomic_t                count;          /* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
        int                     io_op;
        struct bio              *io_bio;
        ext4_io_end_t           *io_end;
-       struct ext4_io_page     *io_page;
        sector_t                io_next_block;
 };
 
@@ -403,7 +392,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL               0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE           0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE                0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE                0x004380FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -557,9 +546,8 @@ enum {
 #define EXT4_GET_BLOCKS_UNINIT_EXT             0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT      (EXT4_GET_BLOCKS_UNINIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
-       /* Caller is from the delayed allocation writeout path,
-          so set the magic i_delalloc_reserve_flag after taking the
-          inode allocation semaphore for */
+       /* Caller is from the delayed allocation writeout path
+        * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE       0x0004
        /* caller is from the direct IO path, request the creation of
        uninitialized extents if not allocated, split the uninitialized
@@ -571,8 +559,9 @@ enum {
        /* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT         (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-       /* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT          0x0020
+       /* Eventual metadata allocation (due to growing extent tree)
+        * should not fail, so try to use reserved blocks for that. */
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
        /* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE           0x0040
        /* Request will not result in inode size update (user for fallocate) */
@@ -616,6 +605,7 @@ enum {
 #define EXT4_IOC_ALLOC_DA_BLKS         _IO('f', 12)
 #define EXT4_IOC_MOVE_EXT              _IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS             _IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT             _IO('f', 17)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -949,7 +939,7 @@ struct ext4_inode_info {
 #define EXT2_FLAGS_TEST_FILESYS                0x0004  /* to test development code */
 
 /*
- * Mount flags
+ * Mount flags set via mount options or defaults
  */
 #define EXT4_MOUNT_GRPID               0x00004 /* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG               0x00008 /* Some debugging messages */
@@ -981,8 +971,16 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD             0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE    0x80000000 /* Initialize uninitialized itables */
 
+/*
+ * Mount flags set automatically (i.e. they cannot be set via a mount
+ * option), based on a per-file-system feature or property, or in special
+ * cases such as distinguishing an explicit mount option from the default.
+ */
 #define EXT4_MOUNT2_EXPLICIT_DELALLOC  0x00000001 /* User explicitly
                                                      specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE     0x00000002 /* We have standard group
+                                                     size of blocksize * 8
+                                                     blocks */
 
 #define clear_opt(sb, opt)             EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt
@@ -1179,6 +1177,7 @@ struct ext4_sb_info {
        unsigned int s_mount_flags;
        unsigned int s_def_mount_opt;
        ext4_fsblk_t s_sb_block;
+       atomic64_t s_resv_clusters;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned short s_mount_state;
@@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
        return ino == EXT4_ROOT_INO ||
                ino == EXT4_USR_QUOTA_INO ||
                ino == EXT4_GRP_QUOTA_INO ||
+               ino == EXT4_BOOT_LOADER_INO ||
                ino == EXT4_JOURNAL_INO ||
                ino == EXT4_RESIZE_INO ||
                (ino >= EXT4_FIRST_INO(sb) &&
@@ -1374,6 +1374,7 @@ enum {
        EXT4_STATE_DIOREAD_LOCK,        /* Disable support for dio read
                                           nolocking */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
+       EXT4_STATE_ORDERED_MODE,        /* data=ordered mode */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)                                \
@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
  */
 #define ERR_BAD_DX_DIR -75000
 
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-                       ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
 /*
  * Timeout and state flag for lazy initialization inode thread.
  */
@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                                  struct buffer_head *bh);
 
 /* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+                                        ext4_fsblk_t blocknr,
+                                        ext4_group_t *blockgrpp,
+                                        ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+                                         ext4_fsblk_t block);
+
 extern void ext4_validate_block_bitmap(struct super_block *sb,
                                       struct ext4_group_desc *desc,
                                       unsigned int block_group,
@@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
                                unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
-extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+                                ext4_lblk_t first, ext4_lblk_t stop);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
 
 /* namei.c */
 extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 extern int ext4_read_inline_dir(struct file *filp,
                                void *dirent, filldir_t filldir,
                                int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+                                  struct inode *dir, ext4_lblk_t block,
+                                  struct dx_hash_info *hinfo,
+                                  __u32 start_hash, __u32 start_minor_hash,
+                                  int *has_inline_data);
 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                        const struct qstr *d_name,
                                        struct ext4_dir_entry_2 **res_dir,
@@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
 extern int ext4_handle_dirty_dirent_node(handle_t *handle,
                                         struct inode *inode,
                                         struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+       [S_IFREG >> S_SHIFT]    = EXT4_FT_REG_FILE,
+       [S_IFDIR >> S_SHIFT]    = EXT4_FT_DIR,
+       [S_IFCHR >> S_SHIFT]    = EXT4_FT_CHRDEV,
+       [S_IFBLK >> S_SHIFT]    = EXT4_FT_BLKDEV,
+       [S_IFIFO >> S_SHIFT]    = EXT4_FT_FIFO,
+       [S_IFSOCK >> S_SHIFT]   = EXT4_FT_SOCK,
+       [S_IFLNK >> S_SHIFT]    = EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+                               struct ext4_dir_entry_2 *de,
+                               umode_t mode) {
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+               de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
                                       int chunk);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
-                               loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+                                ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 
 /* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+                                           struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+                                         struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+                               struct writeback_control *wbc);
 extern void ext4_end_io_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
index 8643ff5bbeb7a9954af0026ce43601d02487856b..51bc821ade90e2eacf9e894f836dcb50f9617b00 100644 (file)
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
                                     0xffff);
 }
 
+#define ext4_ext_dirty(handle, inode, path) \
+               __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+                    struct inode *inode, struct ext4_ext_path *path);
+
 #endif /* _EXT4_EXTENTS */
 
index 7058975e3a5505a9b19d082f4d6ba627919432ff..451eb404533095fc323218fe5ea0f3caa0af8b16 100644 (file)
@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
 {
        journal_t *journal;
 
+       might_sleep();
+
        trace_ext4_journal_start(sb, nblocks, _RET_IP_);
        if (sb->s_flags & MS_RDONLY)
                return ERR_PTR(-EROFS);
@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 {
        int err = 0;
 
+       might_sleep();
+
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_get_write_access(handle, bh);
                if (err)
@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 {
        int err = 0;
 
+       might_sleep();
+
+       set_buffer_meta(bh);
+       set_buffer_prio(bh);
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_dirty_metadata(handle, bh);
                if (err) {
index 4c216b1bf20c8b546d872919342f0c23c9e16622..c8c6885406db16cd7e273a2fc3f7e37ccd27fa6c 100644 (file)
  * block to complete the transaction.
  *
  * For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree plus a data block (for each of these we need a bitmap and
+ * a group summary), the root which is stored in the inode, and the sb
+ */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                               \
        (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-        ? 27U : 8U)
+        ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
  * ext4_journal_callback_del: delete a registered callback
  * @handle: active journal transaction handle on which callback was registered
  * @jce: registered journal callback entry to unregister
+ * Return true if the object was successfully removed
  */
-static inline void ext4_journal_callback_del(handle_t *handle,
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
                                             struct ext4_journal_cb_entry *jce)
 {
+       bool deleted;
        struct ext4_sb_info *sbi =
                        EXT4_SB(handle->h_transaction->t_journal->j_private);
 
        spin_lock(&sbi->s_md_lock);
+       deleted = !list_empty(&jce->jce_list);
        list_del_init(&jce->jce_list);
        spin_unlock(&sbi->s_md_lock);
+       return deleted;
 }
 
 int
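
Returning bool from the renamed helper lets a caller race safely with the
commit path: only the side that actually emptied jce->jce_list may free or
reuse the entry.  A hypothetical caller sketch (not from this series):

	/* Cancel a callback we registered earlier on this handle. */
	static void cancel_my_callback(handle_t *handle,
				       struct ext4_journal_cb_entry *jce)
	{
		if (ext4_journal_callback_try_del(handle, jce))
			kfree(jce);	/* commit path no longer sees it */
	}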
index 9c6d06dcef8bf1f997c0f05bfe443fcde993d72b..107936db244eddd5e7192b657ccef02841d3b617 100644 (file)
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-#define ext4_ext_dirty(handle, inode, path) \
-               __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
-                           handle_t *handle, struct inode *inode,
-                           struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+                    struct inode *inode, struct ext4_ext_path *path)
 {
        int err;
        if (path->p_bh) {
@@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        }
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
+       eh = path[depth].p_hdr;
        if (unlikely(path[depth].p_hdr == NULL)) {
                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                return -EIO;
        }
 
        /* try to insert block into found extent and return */
-       if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
-               && ext4_can_extents_be_merged(inode, ex, newext)) {
-               ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
-                         ext4_ext_is_uninitialized(newext),
-                         ext4_ext_get_actual_len(newext),
-                         le32_to_cpu(ex->ee_block),
-                         ext4_ext_is_uninitialized(ex),
-                         ext4_ext_get_actual_len(ex),
-                         ext4_ext_pblock(ex));
-               err = ext4_ext_get_access(handle, inode, path + depth);
-               if (err)
-                       return err;
+       if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
 
                /*
-                * ext4_can_extents_be_merged should have checked that either
-                * both extents are uninitialized, or both aren't. Thus we
-                * need to check only one of them here.
+                * Try to see whether we should rather test the extent to
+                * the right of ex, or to the left of ex. This is because
+                * ext4_ext_find_extent() can return either the extent on
+                * the left, or the one on the right of the searched
+                * position. This will make merging more effective.
                 */
-               if (ext4_ext_is_uninitialized(ex))
-                       uninitialized = 1;
-               ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+               if (ex < EXT_LAST_EXTENT(eh) &&
+                   (le32_to_cpu(ex->ee_block) +
+                   ext4_ext_get_actual_len(ex) <
+                   le32_to_cpu(newext->ee_block))) {
+                       ex += 1;
+                       goto prepend;
+               } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+                          (le32_to_cpu(newext->ee_block) +
+                          ext4_ext_get_actual_len(newext) <
+                          le32_to_cpu(ex->ee_block)))
+                       ex -= 1;
+
+               /* Try to append newex to the ex */
+               if (ext4_can_extents_be_merged(inode, ex, newext)) {
+                       ext_debug("append [%d]%d block to %u:[%d]%d"
+                                 " (from %llu)\n",
+                                 ext4_ext_is_uninitialized(newext),
+                                 ext4_ext_get_actual_len(newext),
+                                 le32_to_cpu(ex->ee_block),
+                                 ext4_ext_is_uninitialized(ex),
+                                 ext4_ext_get_actual_len(ex),
+                                 ext4_ext_pblock(ex));
+                       err = ext4_ext_get_access(handle, inode,
+                                                 path + depth);
+                       if (err)
+                               return err;
+
+                       /*
+                        * ext4_can_extents_be_merged should have checked
+                        * that either both extents are uninitialized, or
+                        * both aren't. Thus we need to check only one of
+                        * them here.
+                        */
+                       if (ext4_ext_is_uninitialized(ex))
+                               uninitialized = 1;
+                       ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
-               if (uninitialized)
-                       ext4_ext_mark_uninitialized(ex);
-               eh = path[depth].p_hdr;
-               nearex = ex;
-               goto merge;
+                       if (uninitialized)
+                               ext4_ext_mark_uninitialized(ex);
+                       eh = path[depth].p_hdr;
+                       nearex = ex;
+                       goto merge;
+               }
+
+prepend:
+               /* Try to prepend newex to the ex */
+               if (ext4_can_extents_be_merged(inode, newext, ex)) {
+                       ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+                                 " (from %llu)\n",
+                                 le32_to_cpu(newext->ee_block),
+                                 ext4_ext_is_uninitialized(newext),
+                                 ext4_ext_get_actual_len(newext),
+                                 le32_to_cpu(ex->ee_block),
+                                 ext4_ext_is_uninitialized(ex),
+                                 ext4_ext_get_actual_len(ex),
+                                 ext4_ext_pblock(ex));
+                       err = ext4_ext_get_access(handle, inode,
+                                                 path + depth);
+                       if (err)
+                               return err;
+
+                       /*
+                        * ext4_can_extents_be_merged should have checked
+                        * that either both extents are uninitialized, or
+                        * both aren't. Thus we need to check only one of
+                        * them here.
+                        */
+                       if (ext4_ext_is_uninitialized(ex))
+                               uninitialized = 1;
+                       ex->ee_block = newext->ee_block;
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+                       ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+                                       + ext4_ext_get_actual_len(newext));
+                       if (uninitialized)
+                               ext4_ext_mark_uninitialized(ex);
+                       eh = path[depth].p_hdr;
+                       nearex = ex;
+                       goto merge;
+               }
        }
 
        depth = ext_depth(inode);
@@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
         * There is no free space in the found leaf.
         * We're gonna add a new leaf in the tree.
         */
-       if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
-               flags = EXT4_MB_USE_ROOT_BLOCKS;
+       if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+               flags = EXT4_MB_USE_RESERVED;
        err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
        if (err)
                goto cleanup;
@@ -2599,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
        return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-                                ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+                         ext4_lblk_t end)
 {
        struct super_block *sb = inode->i_sb;
        int depth = ext_depth(inode);
@@ -2667,12 +2726,14 @@ again:
 
                        /*
                         * Split the extent in two so that 'end' is the last
-                        * block in the first new extent
+                        * block in the first new extent. Also, we should not
+                        * fail removing space due to ENOSPC, so try to use a
+                        * reserved block if that happens.
                         */
                        err = ext4_split_extent_at(handle, inode, path,
-                                               end + 1, split_flag,
-                                               EXT4_GET_BLOCKS_PRE_IO |
-                                               EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+                                       end + 1, split_flag,
+                                       EXT4_GET_BLOCKS_PRE_IO |
+                                       EXT4_GET_BLOCKS_METADATA_NOFAIL);
 
                        if (err < 0)
                                goto out;
@@ -3147,35 +3208,35 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
-                                          struct ext4_ext_path *path)
+                                          struct ext4_ext_path *path,
+                                          int flags)
 {
        struct ext4_sb_info *sbi;
        struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        struct ext4_extent zero_ex;
-       struct ext4_extent *ex;
+       struct ext4_extent *ex, *abut_ex;
        ext4_lblk_t ee_block, eof_block;
-       unsigned int ee_len, depth;
-       int allocated, max_zeroout = 0;
+       unsigned int ee_len, depth, map_len = map->m_len;
+       int allocated = 0, max_zeroout = 0;
        int err = 0;
        int split_flag = 0;
 
        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
                "block %llu, max_blocks %u\n", inode->i_ino,
-               (unsigned long long)map->m_lblk, map->m_len);
+               (unsigned long long)map->m_lblk, map_len);
 
        sbi = EXT4_SB(inode->i_sb);
        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
-       if (eof_block < map->m_lblk + map->m_len)
-               eof_block = map->m_lblk + map->m_len;
+       if (eof_block < map->m_lblk + map_len)
+               eof_block = map->m_lblk + map_len;
 
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
-       allocated = ee_len - (map->m_lblk - ee_block);
        zero_ex.ee_len = 0;
 
        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
        /*
         * Attempt to transfer newly initialized blocks from the currently
-        * uninitialized extent to its left neighbor. This is much cheaper
+        * uninitialized extent to its neighbor. This is much cheaper
         * than an insertion followed by a merge as those involve costly
-        * memmove() calls. This is the common case in steady state for
-        * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
-        * writes.
+        * memmove() calls. Transferring to the left is the common case in
+        * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+        * followed by append writes.
         *
         * Limitations of the current logic:
-        *  - L1: we only deal with writes at the start of the extent.
-        *    The approach could be extended to writes at the end
-        *    of the extent but this scenario was deemed less common.
-        *  - L2: we do not deal with writes covering the whole extent.
+        *  - L1: we do not deal with writes covering the whole extent.
         *    This would require removing the extent if the transfer
         *    is possible.
-        *  - L3: we only attempt to merge with an extent stored in the
+        *  - L2: we only attempt to merge with an extent stored in the
         *    same extent tree node.
         */
-       if ((map->m_lblk == ee_block) &&        /*L1*/
-               (map->m_len < ee_len) &&        /*L2*/
-               (ex > EXT_FIRST_EXTENT(eh))) {  /*L3*/
-               struct ext4_extent *prev_ex;
+       if ((map->m_lblk == ee_block) &&
+               /* See if we can merge left */
+               (map_len < ee_len) &&           /*L1*/
+               (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
                ext4_lblk_t prev_lblk;
                ext4_fsblk_t prev_pblk, ee_pblk;
-               unsigned int prev_len, write_len;
+               unsigned int prev_len;
 
-               prev_ex = ex - 1;
-               prev_lblk = le32_to_cpu(prev_ex->ee_block);
-               prev_len = ext4_ext_get_actual_len(prev_ex);
-               prev_pblk = ext4_ext_pblock(prev_ex);
+               abut_ex = ex - 1;
+               prev_lblk = le32_to_cpu(abut_ex->ee_block);
+               prev_len = ext4_ext_get_actual_len(abut_ex);
+               prev_pblk = ext4_ext_pblock(abut_ex);
                ee_pblk = ext4_ext_pblock(ex);
-               write_len = map->m_len;
 
                /*
-                * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+                * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                 * upon those conditions:
-                * - C1: prev_ex is initialized,
-                * - C2: prev_ex is logically abutting ex,
-                * - C3: prev_ex is physically abutting ex,
-                * - C4: prev_ex can receive the additional blocks without
+                * - C1: abut_ex is initialized,
+                * - C2: abut_ex is logically abutting ex,
+                * - C3: abut_ex is physically abutting ex,
+                * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
-               if ((!ext4_ext_is_uninitialized(prev_ex)) &&            /*C1*/
+               if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
-                       (prev_len < (EXT_INIT_MAX_LEN - write_len))) {  /*C4*/
+                       (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;
 
                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
-                               map, ex, prev_ex);
+                               map, ex, abut_ex);
 
-                       /* Shift the start of ex by 'write_len' blocks */
-                       ex->ee_block = cpu_to_le32(ee_block + write_len);
-                       ext4_ext_store_pblock(ex, ee_pblk + write_len);
-                       ex->ee_len = cpu_to_le16(ee_len - write_len);
+                       /* Shift the start of ex by 'map_len' blocks */
+                       ex->ee_block = cpu_to_le32(ee_block + map_len);
+                       ext4_ext_store_pblock(ex, ee_pblk + map_len);
+                       ex->ee_len = cpu_to_le16(ee_len - map_len);
                        ext4_ext_mark_uninitialized(ex); /* Restore the flag */
 
-                       /* Extend prev_ex by 'write_len' blocks */
-                       prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+                       /* Extend abut_ex by 'map_len' blocks */
+                       abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
 
-                       /* Mark the block containing both extents as dirty */
-                       ext4_ext_dirty(handle, inode, path + depth);
+                       /* Result: number of initialized blocks past m_lblk */
+                       allocated = map_len;
+               }
+       } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+                  (map_len < ee_len) &&        /*L1*/
+                  ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
+               /* See if we can merge right */
+               ext4_lblk_t next_lblk;
+               ext4_fsblk_t next_pblk, ee_pblk;
+               unsigned int next_len;
+
+               abut_ex = ex + 1;
+               next_lblk = le32_to_cpu(abut_ex->ee_block);
+               next_len = ext4_ext_get_actual_len(abut_ex);
+               next_pblk = ext4_ext_pblock(abut_ex);
+               ee_pblk = ext4_ext_pblock(ex);
 
-                       /* Update path to point to the right extent */
-                       path[depth].p_ext = prev_ex;
+               /*
+                * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+                * upon those conditions:
+                * - C1: abut_ex is initialized,
+                * - C2: abut_ex is logically abutting ex,
+                * - C3: abut_ex is physically abutting ex,
+                * - C4: abut_ex can receive the additional blocks without
+                *   overflowing the (initialized) length limit.
+                */
+               if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
+                   ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
+                   ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
+                   (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
+                       err = ext4_ext_get_access(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       trace_ext4_ext_convert_to_initialized_fastpath(inode,
+                               map, ex, abut_ex);
+
+                       /* Shift the start of abut_ex by 'map_len' blocks */
+                       abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+                       ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+                       ex->ee_len = cpu_to_le16(ee_len - map_len);
+                       ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+                       /* Extend abut_ex by 'map_len' blocks */
+                       abut_ex->ee_len = cpu_to_le16(next_len + map_len);
 
                        /* Result: number of initialized blocks past m_lblk */
-                       allocated = write_len;
-                       goto out;
+                       allocated = map_len;
                }
        }
+       if (allocated) {
+               /* Mark the block containing both extents as dirty */
+               ext4_ext_dirty(handle, inode, path + depth);
+
+               /* Update path to point to the right extent */
+               path[depth].p_ext = abut_ex;
+               goto out;
+       } else
+               allocated = ee_len - (map->m_lblk - ee_block);
 
        WARN_ON(map->m_lblk < ee_block);
        /*
@@ -3330,7 +3435,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        }
 
        allocated = ext4_split_extent(handle, inode, path,
-                                     &split_map, split_flag, 0);
+                                     &split_map, split_flag, flags);
        if (allocated < 0)
                err = allocated;
 
@@ -3650,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                  flags, allocated);
        ext4_ext_show_leaf(inode, path);
 
+       /*
+        * When writing into uninitialized space, we should not fail to
+        * allocate metadata blocks for the new extent block if needed.
+        */
+       flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
        trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
                                                    allocated, newblock);
 
@@ -3713,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        }
 
        /* buffered write, writepage time, convert*/
-       ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+       ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -4257,47 +4368,12 @@ out3:
        return err ? err : allocated;
 }
 
-void ext4_ext_truncate(struct inode *inode)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 {
-       struct address_space *mapping = inode->i_mapping;
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t last_block;
-       handle_t *handle;
-       loff_t page_len;
        int err = 0;
 
-       /*
-        * finish any pending end_io work so we won't run the risk of
-        * converting any truncated blocks to initialized later
-        */
-       ext4_flush_unwritten_io(inode);
-
-       /*
-        * probably first extent we're gonna free will be last in block
-        */
-       err = ext4_writepage_trans_blocks(inode);
-       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);
-       if (IS_ERR(handle))
-               return;
-
-       if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-               page_len = PAGE_CACHE_SIZE -
-                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-               err = ext4_discard_partial_page_buffers(handle,
-                       mapping, inode->i_size, page_len, 0);
-
-               if (err)
-                       goto out_stop;
-       }
-
-       if (ext4_orphan_add(handle, inode))
-               goto out_stop;
-
-       down_write(&EXT4_I(inode)->i_data_sem);
-
-       ext4_discard_preallocations(inode);
-
        /*
         * TODO: optimization is possible here.
         * Probably we need not scan at all,
@@ -4313,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode)
        err = ext4_es_remove_extent(inode, last_block,
                                    EXT_MAX_BLOCKS - last_block);
        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-
-       /* In a multi-transaction truncate, we only make the final
-        * transaction synchronous.
-        */
-       if (IS_SYNC(inode))
-               ext4_handle_sync(handle);
-
-       up_write(&EXT4_I(inode)->i_data_sem);
-
-out_stop:
-       /*
-        * If this was a simple ftruncate() and the file will remain alive,
-        * then we need to clear up the orphan record which we created above.
-        * However, if this was a real unlink then we were called by
-        * ext4_delete_inode(), and we allow that function to clean up the
-        * orphan info for us.
-        */
-       if (inode->i_nlink)
-               ext4_orphan_del(handle, inode);
-
-       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-       ext4_mark_inode_dirty(handle, inode);
-       ext4_journal_stop(handle);
 }
 
 static void ext4_falloc_update_inode(struct inode *inode,
@@ -4623,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode,
        return (error < 0 ? error : 0);
 }
 
-/*
- * ext4_ext_punch_hole
- *
- * Punches a hole of "length" bytes in a file starting
- * at byte "offset"
- *
- * @inode:  The inode of the file to punch a hole in
- * @offset: The starting byte offset of the hole
- * @length: The length of the hole
- *
- * Returns the number of blocks removed or negative on err
- */
-int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
-       struct inode *inode = file_inode(file);
-       struct super_block *sb = inode->i_sb;
-       ext4_lblk_t first_block, stop_block;
-       struct address_space *mapping = inode->i_mapping;
-       handle_t *handle;
-       loff_t first_page, last_page, page_len;
-       loff_t first_page_offset, last_page_offset;
-       int credits, err = 0;
-
-       /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               err = filemap_write_and_wait_range(mapping,
-                       offset, offset + length - 1);
-
-               if (err)
-                       return err;
-       }
-
-       mutex_lock(&inode->i_mutex);
-       /* It's not possible punch hole on append only file */
-       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-               err = -EPERM;
-               goto out_mutex;
-       }
-       if (IS_SWAPFILE(inode)) {
-               err = -ETXTBSY;
-               goto out_mutex;
-       }
-
-       /* No need to punch hole beyond i_size */
-       if (offset >= inode->i_size)
-               goto out_mutex;
-
-       /*
-        * If the hole extends beyond i_size, set the hole
-        * to end after the page that contains i_size
-        */
-       if (offset + length > inode->i_size) {
-               length = inode->i_size +
-                  PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
-                  offset;
-       }
-
-       first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
-       first_page_offset = first_page << PAGE_CACHE_SHIFT;
-       last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
-       /* Now release the pages */
-       if (last_page_offset > first_page_offset) {
-               truncate_pagecache_range(inode, first_page_offset,
-                                        last_page_offset - 1);
-       }
-
-       /* Wait all existing dio workers, newcomers will block on i_mutex */
-       ext4_inode_block_unlocked_dio(inode);
-       err = ext4_flush_unwritten_io(inode);
-       if (err)
-               goto out_dio;
-       inode_dio_wait(inode);
-
-       credits = ext4_writepage_trans_blocks(inode);
-       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
-       if (IS_ERR(handle)) {
-               err = PTR_ERR(handle);
-               goto out_dio;
-       }
-
-
-       /*
-        * Now we need to zero out the non-page-aligned data in the
-        * pages at the start and tail of the hole, and unmap the buffer
-        * heads for the block aligned regions of the page that were
-        * completely zeroed.
-        */
-       if (first_page > last_page) {
-               /*
-                * If the file space being truncated is contained within a page
-                * just zero out and unmap the middle of that page
-                */
-               err = ext4_discard_partial_page_buffers(handle,
-                       mapping, offset, length, 0);
-
-               if (err)
-                       goto out;
-       } else {
-               /*
-                * zero out and unmap the partial page that contains
-                * the start of the hole
-                */
-               page_len  = first_page_offset - offset;
-               if (page_len > 0) {
-                       err = ext4_discard_partial_page_buffers(handle, mapping,
-                                                  offset, page_len, 0);
-                       if (err)
-                               goto out;
-               }
-
-               /*
-                * zero out and unmap the partial page that contains
-                * the end of the hole
-                */
-               page_len = offset + length - last_page_offset;
-               if (page_len > 0) {
-                       err = ext4_discard_partial_page_buffers(handle, mapping,
-                                       last_page_offset, page_len, 0);
-                       if (err)
-                               goto out;
-               }
-       }
-
-       /*
-        * If i_size is contained in the last page, we need to
-        * unmap and zero the partial page after i_size
-        */
-       if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-          inode->i_size % PAGE_CACHE_SIZE != 0) {
-
-               page_len = PAGE_CACHE_SIZE -
-                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-               if (page_len > 0) {
-                       err = ext4_discard_partial_page_buffers(handle,
-                         mapping, inode->i_size, page_len, 0);
-
-                       if (err)
-                               goto out;
-               }
-       }
-
-       first_block = (offset + sb->s_blocksize - 1) >>
-               EXT4_BLOCK_SIZE_BITS(sb);
-       stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
-       /* If there are no blocks to remove, return now */
-       if (first_block >= stop_block)
-               goto out;
-
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-
-       err = ext4_es_remove_extent(inode, first_block,
-                                   stop_block - first_block);
-       err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
-
-       ext4_discard_preallocations(inode);
-
-       if (IS_SYNC(inode))
-               ext4_handle_sync(handle);
-
-       up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
-       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-       ext4_mark_inode_dirty(handle, inode);
-       ext4_journal_stop(handle);
-out_dio:
-       ext4_inode_resume_unlocked_dio(inode);
-out_mutex:
-       mutex_unlock(&inode->i_mutex);
-       return err;
-}
-
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
 {
index 3278e64e57b61ac51a41db3ceebeecd21003985a..e0ba8a408def07583b9cd04a57259e3c56183057 100644 (file)
@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (journal->j_flags & JBD2_BARRIER &&
            !jbd2_trans_will_send_data_barrier(journal, commit_tid))
                needs_barrier = true;
-       jbd2_log_start_commit(journal, commit_tid);
-       ret = jbd2_log_wait_commit(journal, commit_tid);
+       ret = jbd2_complete_transaction(journal, commit_tid);
        if (needs_barrier) {
                err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
                if (!ret)
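
The helper used here, jbd2_complete_transaction(), is added to jbd2 earlier
in this series. A sketch of its expected semantics, assuming the names from
fs/jbd2/journal.c; unlike the removed start+wait pair it does not force a
commit when the tid is already committing or committed:

	int jbd2_complete_transaction(journal_t *journal, tid_t tid)
	{
		int need_to_wait = 1;

		read_lock(&journal->j_state_lock);
		if (journal->j_running_transaction &&
		    journal->j_running_transaction->t_tid == tid) {
			if (journal->j_commit_request != tid) {
				/* transaction not yet started, so request it */
				read_unlock(&journal->j_state_lock);
				jbd2_log_start_commit(journal, tid);
				goto wait_commit;
			}
		} else if (!(journal->j_committing_transaction &&
			     journal->j_committing_transaction->t_tid == tid))
			need_to_wait = 0;
		read_unlock(&journal->j_state_lock);
		if (!need_to_wait)
			return 0;
	wait_commit:
		return jbd2_log_wait_commit(journal, tid);
	}
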
index 6c5bb8d993fe8ebb07ae48dab2697e247c78ea01..00a818d67b54930c74dad44c6f73dcb3f01e6a19 100644 (file)
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        trace_ext4_load_inode_bitmap(sb, block_group);
        bh->b_end_io = ext4_end_bitmap_read;
        get_bh(bh);
-       submit_bh(READ, bh);
+       submit_bh(READ | REQ_META | REQ_PRIO, bh);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh)) {
                put_bh(bh);
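
The extra REQ_META | REQ_PRIO bits pair with the BH_Meta/BH_Prio buffer
flags introduced in fs/buffer.c by this series; a sketch of how the
flag-based path is expected to map to the same request bits (assuming the
_submit_bh() change from that patch):

	/* in _submit_bh(), just before submit_bio() */
	if (buffer_meta(bh))
		rw |= REQ_META;
	if (buffer_prio(bh))
		rw |= REQ_PRIO;

Either way, bitmap reads reach the I/O scheduler tagged as high-priority
metadata.
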
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
        ei = EXT4_I(inode);
        sbi = EXT4_SB(sb);
 
+       /*
+        * Initialize owners and quota early so that we don't have to account
+        * for the quota initialization worst case in the standard inode
+        * creation transaction
+        */
+       if (owner) {
+               inode->i_mode = mode;
+               i_uid_write(inode, owner[0]);
+               i_gid_write(inode, owner[1]);
+       } else if (test_opt(sb, GRPID)) {
+               inode->i_mode = mode;
+               inode->i_uid = current_fsuid();
+               inode->i_gid = dir->i_gid;
+       } else
+               inode_init_owner(inode, dir, mode);
+       dquot_initialize(inode);
+
        if (!goal)
                goal = sbi->s_inode_goal;
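
Doing the owner and quota setup before any handle is started means the
create transaction no longer has to reserve worst-case quota-initialization
credits. Purely as an illustration (the exact credit formulas in the
callers vary; EXT4_MAXQUOTAS_INIT_BLOCKS() is the pre-existing macro):

	/* before: reserve room for journalled quota init in the handle */
	credits = EXT4_DATA_TRANS_BLOCKS(sb) +
		  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4 +
		  EXT4_MAXQUOTAS_INIT_BLOCKS(sb);
	/* after: dquot_initialize() already ran outside the handle */
	credits = EXT4_DATA_TRANS_BLOCKS(sb) +
		  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4;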
 
@@ -697,7 +714,7 @@ got_group:
 
                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
                if (!gdp)
-                       goto fail;
+                       goto out;
 
                /*
                 * Check free inodes count before loading bitmap.
@@ -711,7 +728,7 @@ got_group:
                brelse(inode_bitmap_bh);
                inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
                if (!inode_bitmap_bh)
-                       goto fail;
+                       goto out;
 
 repeat_in_this_group:
                ino = ext4_find_next_zero_bit((unsigned long *)
@@ -733,13 +750,16 @@ repeat_in_this_group:
                                                         handle_type, nblocks);
                        if (IS_ERR(handle)) {
                                err = PTR_ERR(handle);
-                               goto fail;
+                               ext4_std_error(sb, err);
+                               goto out;
                        }
                }
                BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
-               if (err)
-                       goto fail;
+               if (err) {
+                       ext4_std_error(sb, err);
+                       goto out;
+               }
                ext4_lock_group(sb, group);
                ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
                ext4_unlock_group(sb, group);
@@ -755,8 +775,10 @@ repeat_in_this_group:
 got:
        BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
-       if (err)
-               goto fail;
+       if (err) {
+               ext4_std_error(sb, err);
+               goto out;
+       }
 
        /* We may have to initialize the block bitmap if it isn't already */
        if (ext4_has_group_desc_csum(sb) &&
@@ -768,7 +790,8 @@ got:
                err = ext4_journal_get_write_access(handle, block_bitmap_bh);
                if (err) {
                        brelse(block_bitmap_bh);
-                       goto fail;
+                       ext4_std_error(sb, err);
+                       goto out;
                }
 
                BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -787,14 +810,18 @@ got:
                ext4_unlock_group(sb, group);
                brelse(block_bitmap_bh);
 
-               if (err)
-                       goto fail;
+               if (err) {
+                       ext4_std_error(sb, err);
+                       goto out;
+               }
        }
 
        BUFFER_TRACE(group_desc_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, group_desc_bh);
-       if (err)
-               goto fail;
+       if (err) {
+               ext4_std_error(sb, err);
+               goto out;
+       }
 
        /* Update the relevant bg descriptor fields */
        if (ext4_has_group_desc_csum(sb)) {
@@ -840,8 +867,10 @@ got:
 
        BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
-       if (err)
-               goto fail;
+       if (err) {
+               ext4_std_error(sb, err);
+               goto out;
+       }
 
        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
@@ -851,16 +880,6 @@ got:
                flex_group = ext4_flex_group(sbi, group);
                atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
        }
-       if (owner) {
-               inode->i_mode = mode;
-               i_uid_write(inode, owner[0]);
-               i_gid_write(inode, owner[1]);
-       } else if (test_opt(sb, GRPID)) {
-               inode->i_mode = mode;
-               inode->i_uid = current_fsuid();
-               inode->i_gid = dir->i_gid;
-       } else
-               inode_init_owner(inode, dir, mode);
 
        inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
        /* This is the optimal IO size (for stat), not the fs block size */
@@ -889,7 +908,9 @@ got:
                 * twice.
                 */
                err = -EIO;
-               goto fail;
+               ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+                          inode->i_ino);
+               goto out;
        }
        spin_lock(&sbi->s_next_gen_lock);
        inode->i_generation = sbi->s_next_generation++;
@@ -899,7 +920,6 @@ got:
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
                        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
                __u32 csum;
-               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = cpu_to_le32(inode->i_generation);
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
@@ -918,7 +938,6 @@ got:
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
 
        ret = inode;
-       dquot_initialize(inode);
        err = dquot_alloc_inode(inode);
        if (err)
                goto fail_drop;
@@ -952,24 +971,17 @@ got:
 
        ext4_debug("allocating inode %lu\n", inode->i_ino);
        trace_ext4_allocate_inode(inode, dir, mode);
-       goto really_out;
-fail:
-       ext4_std_error(sb, err);
-out:
-       iput(inode);
-       ret = ERR_PTR(err);
-really_out:
        brelse(inode_bitmap_bh);
        return ret;
 
 fail_free_drop:
        dquot_free_inode(inode);
-
 fail_drop:
-       dquot_drop(inode);
-       inode->i_flags |= S_NOQUOTA;
        clear_nlink(inode);
        unlock_new_inode(inode);
+out:
+       dquot_drop(inode);
+       inode->i_flags |= S_NOQUOTA;
        iput(inode);
        brelse(inode_bitmap_bh);
        return ERR_PTR(err);
index a04183127ef049190ad96c0b7052ea4495fd33e9..98be6f6974637a4ebb7e4a8b1d471f8f587cdbe6 100644 (file)
@@ -291,131 +291,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
        return count;
 }
 
-/**
- *     ext4_alloc_blocks: multiple allocate blocks needed for a branch
- *     @handle: handle for this transaction
- *     @inode: inode which needs allocated blocks
- *     @iblock: the logical block to start allocating at
- *     @goal: preferred physical block of allocation
- *     @indirect_blks: the number of blocks that need to be allocated for
- *                     indirect blocks
- *     @blks: number of desired blocks
- *     @new_blocks: on return it will store the new block numbers for
- *     the indirect blocks (if needed) and the first direct block.
- *     @err: on return it will store the error code
- *
- *     This function will return the number of blocks allocated as
- *     requested by the passed-in parameters.
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-                            ext4_lblk_t iblock, ext4_fsblk_t goal,
-                            int indirect_blks, int blks,
-                            ext4_fsblk_t new_blocks[4], int *err)
-{
-       struct ext4_allocation_request ar;
-       int target, i;
-       unsigned long count = 0, blk_allocated = 0;
-       int index = 0;
-       ext4_fsblk_t current_block = 0;
-       int ret = 0;
-
-       /*
-        * Here we try to allocate the requested multiple blocks at once,
-        * on a best-effort basis.
-        * To build a branch, we should allocate blocks for
-        * the indirect blocks (if not allocated yet), and at least
-        * the first direct block of this branch.  That's the
-        * minimum number of blocks we need to allocate (required).
-        */
-       /* first we try to allocate the indirect blocks */
-       target = indirect_blks;
-       while (target > 0) {
-               count = target;
-               /* allocating blocks for indirect blocks and direct blocks */
-               current_block = ext4_new_meta_blocks(handle, inode, goal,
-                                                    0, &count, err);
-               if (*err)
-                       goto failed_out;
-
-               if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
-                       EXT4_ERROR_INODE(inode,
-                                        "current_block %llu + count %lu > %d!",
-                                        current_block, count,
-                                        EXT4_MAX_BLOCK_FILE_PHYS);
-                       *err = -EIO;
-                       goto failed_out;
-               }
-
-               target -= count;
-               /* allocate blocks for indirect blocks */
-               while (index < indirect_blks && count) {
-                       new_blocks[index++] = current_block++;
-                       count--;
-               }
-               if (count > 0) {
-                       /*
-                        * save the new block number
-                        * for the first direct block
-                        */
-                       new_blocks[index] = current_block;
-                       WARN(1, KERN_INFO "%s returned more blocks than "
-                                               "requested\n", __func__);
-                       break;
-               }
-       }
-
-       target = blks - count;
-       blk_allocated = count;
-       if (!target)
-               goto allocated;
-       /* Now allocate data blocks */
-       memset(&ar, 0, sizeof(ar));
-       ar.inode = inode;
-       ar.goal = goal;
-       ar.len = target;
-       ar.logical = iblock;
-       if (S_ISREG(inode->i_mode))
-               /* enable in-core preallocation only for regular files */
-               ar.flags = EXT4_MB_HINT_DATA;
-
-       current_block = ext4_mb_new_blocks(handle, &ar, err);
-       if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
-               EXT4_ERROR_INODE(inode,
-                                "current_block %llu + ar.len %d > %d!",
-                                current_block, ar.len,
-                                EXT4_MAX_BLOCK_FILE_PHYS);
-               *err = -EIO;
-               goto failed_out;
-       }
-
-       if (*err && (target == blks)) {
-               /*
-                * if the allocation failed and we didn't allocate
-                * any blocks before
-                */
-               goto failed_out;
-       }
-       if (!*err) {
-               if (target == blks) {
-                       /*
-                        * save the new block number
-                        * for the first direct block
-                        */
-                       new_blocks[index] = current_block;
-               }
-               blk_allocated += ar.len;
-       }
-allocated:
-       /* total number of blocks allocated for direct blocks */
-       ret = blk_allocated;
-       *err = 0;
-       return ret;
-failed_out:
-       for (i = 0; i < index; i++)
-               ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-       return ret;
-}
-
 /**
  *     ext4_alloc_branch - allocate and set up a chain of blocks.
  *     @handle: handle for this transaction
@@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                             int *blks, ext4_fsblk_t goal,
                             ext4_lblk_t *offsets, Indirect *branch)
 {
-       int blocksize = inode->i_sb->s_blocksize;
-       int i, n = 0;
-       int err = 0;
-       struct buffer_head *bh;
-       int num;
-       ext4_fsblk_t new_blocks[4];
-       ext4_fsblk_t current_block;
-
-       num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
-                               *blks, new_blocks, &err);
-       if (err)
-               return err;
+       struct ext4_allocation_request  ar;
+       struct buffer_head *            bh;
+       ext4_fsblk_t                    b, new_blocks[4];
+       __le32                          *p;
+       int                             i, j, err, len = 1;
 
-       branch[0].key = cpu_to_le32(new_blocks[0]);
        /*
-        * metadata blocks and data blocks are allocated.
+        * Set up for the direct block allocation
         */
-       for (n = 1; n <= indirect_blks;  n++) {
-               /*
-                * Get buffer_head for parent block, zero it out
-                * and set the pointer to new one, then send
-                * parent to disk.
-                */
-               bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+       memset(&ar, 0, sizeof(ar));
+       ar.inode = inode;
+       ar.len = *blks;
+       ar.logical = iblock;
+       if (S_ISREG(inode->i_mode))
+               ar.flags = EXT4_MB_HINT_DATA;
+
+       for (i = 0; i <= indirect_blks; i++) {
+               if (i == indirect_blks) {
+                       ar.goal = goal;
+                       new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+               } else
+                       goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
+                                                       goal, 0, NULL, &err);
+               if (err) {
+                       i--;
+                       goto failed;
+               }
+               branch[i].key = cpu_to_le32(new_blocks[i]);
+               if (i == 0)
+                       continue;
+
+               bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
                if (unlikely(!bh)) {
                        err = -ENOMEM;
                        goto failed;
                }
-
-               branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, bh);
                if (err) {
-                       /* Don't brelse(bh) here; it's done in
-                        * ext4_journal_forget() below */
                        unlock_buffer(bh);
                        goto failed;
                }
 
-               memset(bh->b_data, 0, blocksize);
-               branch[n].p = (__le32 *) bh->b_data + offsets[n];
-               branch[n].key = cpu_to_le32(new_blocks[n]);
-               *branch[n].p = branch[n].key;
-               if (n == indirect_blks) {
-                       current_block = new_blocks[n];
-                       /*
-                        * End of chain, update the last new metablock of
-                        * the chain to point to the new allocated
-                        * data blocks numbers
-                        */
-                       for (i = 1; i < num; i++)
-                               *(branch[n].p + i) = cpu_to_le32(++current_block);
-               }
+               memset(bh->b_data, 0, bh->b_size);
+               p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
+               b = new_blocks[i];
+
+               if (i == indirect_blks)
+                       len = ar.len;
+               for (j = 0; j < len; j++)
+                       *p++ = cpu_to_le32(b++);
+
                BUFFER_TRACE(bh, "marking uptodate");
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
@@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                if (err)
                        goto failed;
        }
-       *blks = num;
-       return err;
+       *blks = ar.len;
+       return 0;
 failed:
-       /* Allocation failed, free what we already allocated */
-       ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
-       for (i = 1; i <= n ; i++) {
-               /*
-                * branch[i].bh is newly allocated, so there is no
-                * need to revoke the block, which is why we don't
-                * need to set EXT4_FREE_BLOCKS_METADATA.
-                */
-               ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
-                                EXT4_FREE_BLOCKS_FORGET);
+       for (; i >= 0; i--) {
+               if (i != indirect_blks && branch[i].bh)
+                       ext4_forget(handle, 1, inode, branch[i].bh,
+                                   branch[i].bh->b_blocknr);
+               ext4_free_blocks(handle, inode, NULL, new_blocks[i],
+                                (i == indirect_blks) ? ar.len : 1, 0);
        }
-       for (i = n+1; i < indirect_blks; i++)
-               ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-
-       ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
-
        return err;
 }
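
For orientation, the chain the rewritten loop builds, sketched with the
indices used above:

	/*
	 * new_blocks[0 .. indirect_blks-1]: metadata blocks, each allocated
	 *	via ext4_new_meta_blocks() with the previous block as goal;
	 * new_blocks[indirect_blks]: first of the ar.len data blocks from
	 *	ext4_mb_new_blocks().
	 * For i > 0, branch[i].bh is the buffer of new_blocks[i-1], and
	 * branch[i].p points at the slot inside it receiving branch[i].key
	 * (plus, at the leaf level, the remaining ar.len - 1 data blocks).
	 */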
 
@@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  * be able to restart the transaction at a convenient checkpoint to make
  * sure we don't overflow the journal.
  *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit.  If
+ * Try to extend this transaction for the purposes of truncation.  If
  * extend fails, we need to propagate the failure up and restart the
  * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
-{
-       handle_t *result;
-
-       result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
-                                   ext4_blocks_for_truncate(inode));
-       if (!IS_ERR(result))
-               return result;
-
-       ext4_std_error(inode->i_sb, PTR_ERR(result));
-       return result;
-}
-
-/*
- * Try to extend this transaction for the purposes of truncation.
  *
  * Returns 0 if we managed to create more room.  If we can't create more
  * room, and the transaction must be restarted we return 1.
@@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
        }
 }
 
-void ext4_ind_truncate(struct inode *inode)
+void ext4_ind_truncate(handle_t *handle, struct inode *inode)
 {
-       handle_t *handle;
        struct ext4_inode_info *ei = EXT4_I(inode);
        __le32 *i_data = ei->i_data;
        int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-       struct address_space *mapping = inode->i_mapping;
        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        __le32 nr = 0;
        int n = 0;
        ext4_lblk_t last_block, max_block;
-       loff_t page_len;
        unsigned blocksize = inode->i_sb->s_blocksize;
-       int err;
-
-       handle = start_transaction(inode);
-       if (IS_ERR(handle))
-               return;         /* AKPM: return what? */
 
        last_block = (inode->i_size + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
        max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
-       if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-               page_len = PAGE_CACHE_SIZE -
-                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-               err = ext4_discard_partial_page_buffers(handle,
-                       mapping, inode->i_size, page_len, 0);
-
-               if (err)
-                       goto out_stop;
-       }
-
        if (last_block != max_block) {
                n = ext4_block_to_path(inode, last_block, offsets, NULL);
                if (n == 0)
-                       goto out_stop;  /* error */
+                       return;
        }
 
-       /*
-        * OK.  This truncate is going to happen.  We add the inode to the
-        * orphan list, so that if this truncate spans multiple transactions,
-        * and we crash, we will resume the truncate when the filesystem
-        * recovers.  It also marks the inode dirty, to catch the new size.
-        *
-        * Implication: the file must always be in a sane, consistent
-        * truncatable state while each transaction commits.
-        */
-       if (ext4_orphan_add(handle, inode))
-               goto out_stop;
-
-       /*
-        * From here we block out all ext4_get_block() callers who want to
-        * modify the block allocation tree.
-        */
-       down_write(&ei->i_data_sem);
-
-       ext4_discard_preallocations(inode);
        ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
 
        /*
@@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode)
                 * It is unnecessary to free any data blocks if last_block is
                 * equal to the indirect block limit.
                 */
-               goto out_unlock;
+               return;
        } else if (n == 1) {            /* direct blocks */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                               i_data + EXT4_NDIR_BLOCKS);
@@ -1491,31 +1301,6 @@ do_indirects:
        case EXT4_TIND_BLOCK:
                ;
        }
-
-out_unlock:
-       up_write(&ei->i_data_sem);
-       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-       ext4_mark_inode_dirty(handle, inode);
-
-       /*
-        * In a multi-transaction truncate, we only make the final transaction
-        * synchronous
-        */
-       if (IS_SYNC(inode))
-               ext4_handle_sync(handle);
-out_stop:
-       /*
-        * If this was a simple ftruncate(), and the file will remain alive
-        * then we need to clear up the orphan record which we created above.
-        * However, if this was a real unlink then we were called by
-        * ext4_delete_inode(), and we allow that function to clean up the
-        * orphan info for us.
-        */
-       if (inode->i_nlink)
-               ext4_orphan_del(handle, inode);
-
-       ext4_journal_stop(handle);
-       trace_ext4_truncate_exit(inode);
 }
 
 static int free_hole_blocks(handle_t *handle, struct inode *inode,
@@ -1569,8 +1354,8 @@ err:
        return ret;
 }
 
-static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
-                                ext4_lblk_t first, ext4_lblk_t stop)
+int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+                         ext4_lblk_t first, ext4_lblk_t stop)
 {
        int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int level, ret = 0;
@@ -1604,157 +1389,3 @@ err:
        return ret;
 }
 
-int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
-       struct inode *inode = file_inode(file);
-       struct super_block *sb = inode->i_sb;
-       ext4_lblk_t first_block, stop_block;
-       struct address_space *mapping = inode->i_mapping;
-       handle_t *handle = NULL;
-       loff_t first_page, last_page, page_len;
-       loff_t first_page_offset, last_page_offset;
-       int err = 0;
-
-       /*
-        * Write out all dirty pages to avoid race conditions,
-        * then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               err = filemap_write_and_wait_range(mapping,
-                       offset, offset + length - 1);
-               if (err)
-                       return err;
-       }
-
-       mutex_lock(&inode->i_mutex);
-       /* It's not possible to punch a hole in an append-only file */
-       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-               err = -EPERM;
-               goto out_mutex;
-       }
-       if (IS_SWAPFILE(inode)) {
-               err = -ETXTBSY;
-               goto out_mutex;
-       }
-
-       /* No need to punch hole beyond i_size */
-       if (offset >= inode->i_size)
-               goto out_mutex;
-
-       /*
-        * If the hole extends beyond i_size, set the hole
-        * to end after the page that contains i_size
-        */
-       if (offset + length > inode->i_size) {
-               length = inode->i_size +
-                   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
-                   offset;
-       }
-
-       first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
-       first_page_offset = first_page << PAGE_CACHE_SHIFT;
-       last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
-       /* Now release the pages */
-       if (last_page_offset > first_page_offset) {
-               truncate_pagecache_range(inode, first_page_offset,
-                                        last_page_offset - 1);
-       }
-
-       /* Wait for all existing dio workers; newcomers will block on i_mutex */
-       inode_dio_wait(inode);
-
-       handle = start_transaction(inode);
-       if (IS_ERR(handle))
-               goto out_mutex;
-
-       /*
-        * Now we need to zero out the non-page-aligned data in the
-        * pages at the start and tail of the hole, and unmap the buffer
-        * heads for the block aligned regions of the page that were
-        * completely zeroed.
-        */
-       if (first_page > last_page) {
-               /*
-                * If the file space being truncated is contained within a page,
-                * just zero out and unmap the middle of that page
-                */
-               err = ext4_discard_partial_page_buffers(handle,
-                       mapping, offset, length, 0);
-               if (err)
-                       goto out;
-       } else {
-               /*
-                * Zero out and unmap the partial page that contains
-                * the start of the hole
-                */
-               page_len = first_page_offset - offset;
-               if (page_len > 0) {
-                       err = ext4_discard_partial_page_buffers(handle, mapping,
-                                                       offset, page_len, 0);
-                       if (err)
-                               goto out;
-               }
-
-               /*
-                * Zero out and unmap the partial page that contains
-                * the end of the hole
-                */
-               page_len = offset + length - last_page_offset;
-               if (page_len > 0) {
-                       err = ext4_discard_partial_page_buffers(handle, mapping,
-                                               last_page_offset, page_len, 0);
-                       if (err)
-                               goto out;
-               }
-       }
-
-       /*
-        * If i_size is contained in the last page, we need to
-        * unmap and zero the partial page after i_size
-        */
-       if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-           inode->i_size % PAGE_CACHE_SIZE != 0) {
-               page_len = PAGE_CACHE_SIZE -
-                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
-               if (page_len > 0) {
-                       err = ext4_discard_partial_page_buffers(handle,
-                               mapping, inode->i_size, page_len, 0);
-                       if (err)
-                               goto out;
-               }
-       }
-
-       first_block = (offset + sb->s_blocksize - 1) >>
-               EXT4_BLOCK_SIZE_BITS(sb);
-       stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
-       if (first_block >= stop_block)
-               goto out;
-
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-
-       err = ext4_es_remove_extent(inode, first_block,
-                                   stop_block - first_block);
-       err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
-
-       ext4_discard_preallocations(inode);
-
-       if (IS_SYNC(inode))
-               ext4_handle_sync(handle);
-
-       up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
-       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-       ext4_mark_inode_dirty(handle, inode);
-       ext4_journal_stop(handle);
-
-out_mutex:
-       mutex_unlock(&inode->i_mutex);
-
-       return err;
-}
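
With ext4_free_hole_blocks() made non-static above, both removed punch-hole
bodies collapse into a single ext4_punch_hole() in inode.c. A sketch of the
dispatch that function is expected to perform (simplified from the refactor
in this series):

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		err = ext4_ext_remove_space(inode, first_block,
					    stop_block - 1);
	else
		err = ext4_free_hole_blocks(handle, inode, first_block,
					    stop_block);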
index c0fd1a123f7d910aca4228e4f3af2e96b35c8a01..3e2bf873e8a8b42b1564c44505e0ec8cba1971e6 100644 (file)
@@ -19,7 +19,8 @@
 
 #define EXT4_XATTR_SYSTEM_DATA "data"
 #define EXT4_MIN_INLINE_DATA_SIZE      ((sizeof(__le32) * EXT4_N_BLOCKS))
-#define EXT4_INLINE_DOTDOT_SIZE        4
+#define EXT4_INLINE_DOTDOT_OFFSET      2
+#define EXT4_INLINE_DOTDOT_SIZE                4
 
 int ext4_get_inline_size(struct inode *inode)
 {
@@ -1289,6 +1290,120 @@ out:
        return ret;
 }
 
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir.  It returns the number of directory entries loaded
+ * into the tree.  If there is an error it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+                           struct inode *dir, ext4_lblk_t block,
+                           struct dx_hash_info *hinfo,
+                           __u32 start_hash, __u32 start_minor_hash,
+                           int *has_inline_data)
+{
+       int err = 0, count = 0;
+       unsigned int parent_ino;
+       int pos;
+       struct ext4_dir_entry_2 *de;
+       struct inode *inode = file_inode(dir_file);
+       int ret, inline_size = 0;
+       struct ext4_iloc iloc;
+       void *dir_buf = NULL;
+       struct ext4_dir_entry_2 fake;
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               return ret;
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               up_read(&EXT4_I(inode)->xattr_sem);
+               *has_inline_data = 0;
+               goto out;
+       }
+
+       inline_size = ext4_get_inline_size(inode);
+       dir_buf = kmalloc(inline_size, GFP_NOFS);
+       if (!dir_buf) {
+               ret = -ENOMEM;
+               up_read(&EXT4_I(inode)->xattr_sem);
+               goto out;
+       }
+
+       ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+       up_read(&EXT4_I(inode)->xattr_sem);
+       if (ret < 0)
+               goto out;
+
+       pos = 0;
+       parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+       while (pos < inline_size) {
+               /*
+                * As an inlined dir doesn't store any information about '.' and
+                * only the inode number of '..' is stored, we have to handle
+                * them differently.
+                */
+               if (pos == 0) {
+                       fake.inode = cpu_to_le32(inode->i_ino);
+                       fake.name_len = 1;
+                       strcpy(fake.name, ".");
+                       fake.rec_len = ext4_rec_len_to_disk(
+                                               EXT4_DIR_REC_LEN(fake.name_len),
+                                               inline_size);
+                       ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+                       de = &fake;
+                       pos = EXT4_INLINE_DOTDOT_OFFSET;
+               } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+                       fake.inode = cpu_to_le32(parent_ino);
+                       fake.name_len = 2;
+                       strcpy(fake.name, "..");
+                       fake.rec_len = ext4_rec_len_to_disk(
+                                               EXT4_DIR_REC_LEN(fake.name_len),
+                                               inline_size);
+                       ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+                       de = &fake;
+                       pos = EXT4_INLINE_DOTDOT_SIZE;
+               } else {
+                       de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+                       pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+                       if (ext4_check_dir_entry(inode, dir_file, de,
+                                        iloc.bh, dir_buf,
+                                        inline_size, pos)) {
+                               ret = count;
+                               goto out;
+                       }
+               }
+
+               ext4fs_dirhash(de->name, de->name_len, hinfo);
+               if ((hinfo->hash < start_hash) ||
+                   ((hinfo->hash == start_hash) &&
+                    (hinfo->minor_hash < start_minor_hash)))
+                       continue;
+               if (de->inode == 0)
+                       continue;
+               err = ext4_htree_store_dirent(dir_file,
+                                  hinfo->hash, hinfo->minor_hash, de);
+               if (err) {
+                       count = err;
+                       goto out;
+               }
+               count++;
+       }
+       ret = count;
+out:
+       kfree(dir_buf);
+       brelse(iloc.bh);
+       return ret;
+}
+
+/*
+ * This function is called when the volume was mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a block-based one,
+ * we just pretend that we are a normal dir and return the
+ * offsets as if '.' and '..' really took up space.
+ *
+ */
 int ext4_read_inline_dir(struct file *filp,
                         void *dirent, filldir_t filldir,
                         int *has_inline_data)
@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp,
        int ret, inline_size = 0;
        struct ext4_iloc iloc;
        void *dir_buf = NULL;
+       int dotdot_offset, dotdot_size, extra_offset, extra_size;
 
        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp,
        sb = inode->i_sb;
        stored = 0;
        parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+       offset = filp->f_pos;
 
-       while (!error && !stored && filp->f_pos < inode->i_size) {
+       /*
+        * dotdot_offset and dotdot_size are the real offset and
+        * size for ".." and "." if the dir were block based, while
+        * their real size inline is only EXT4_INLINE_DOTDOT_SIZE.
+        * So we use extra_offset and extra_size to account for them
+        * during the inline dir iteration.
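+        * (Worked numbers, assuming the standard EXT4_DIR_REC_LEN()
+        * rounding: EXT4_DIR_REC_LEN(1) and EXT4_DIR_REC_LEN(2) are
+        * both 12, so dotdot_offset = 12, dotdot_size = 24 and
+        * extra_offset = 20; an entry stored at inline offset p is
+        * reported at f_pos p + 20.)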
+        */
+       dotdot_offset = EXT4_DIR_REC_LEN(1);
+       dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+       extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+       extra_size = extra_offset + inline_size;
+
+       while (!error && !stored && filp->f_pos < extra_size) {
 revalidate:
                /*
                 * If the version has changed since the last call to
@@ -1340,15 +1469,23 @@ revalidate:
                 * dir to make sure.
                 */
                if (filp->f_version != inode->i_version) {
-                       for (i = 0;
-                            i < inode->i_size && i < offset;) {
+                       for (i = 0; i < extra_size && i < offset;) {
+                               /*
+                                * "." is with offset 0 and
+                                * ".." is dotdot_offset.
+                                */
                                if (!i) {
-                                       /* skip "." and ".." if needed. */
-                                       i += EXT4_INLINE_DOTDOT_SIZE;
+                                       i = dotdot_offset;
+                                       continue;
+                               } else if (i == dotdot_offset) {
+                                       i = dotdot_size;
                                        continue;
                                }
+                               /* For other entries, the real offset in
+                                * the buf has to be tuned accordingly.
+                                */
                                de = (struct ext4_dir_entry_2 *)
-                                       (dir_buf + i);
+                                       (dir_buf + i - extra_offset);
                                /* It's too expensive to do a full
                                 * dirent test each time round this
                                 * loop, but we do have to test at
@@ -1356,43 +1493,47 @@ revalidate:
                                 * failure will be detected in the
                                 * dirent test below. */
                                if (ext4_rec_len_from_disk(de->rec_len,
-                                       inline_size) < EXT4_DIR_REC_LEN(1))
+                                       extra_size) < EXT4_DIR_REC_LEN(1))
                                        break;
                                i += ext4_rec_len_from_disk(de->rec_len,
-                                                           inline_size);
+                                                           extra_size);
                        }
                        offset = i;
                        filp->f_pos = offset;
                        filp->f_version = inode->i_version;
                }
 
-               while (!error && filp->f_pos < inode->i_size) {
+               while (!error && filp->f_pos < extra_size) {
                        if (filp->f_pos == 0) {
                                error = filldir(dirent, ".", 1, 0, inode->i_ino,
                                                DT_DIR);
                                if (error)
                                        break;
                                stored++;
+                               filp->f_pos = dotdot_offset;
+                               continue;
+                       }
 
-                               error = filldir(dirent, "..", 2, 0, parent_ino,
-                                               DT_DIR);
+                       if (filp->f_pos == dotdot_offset) {
+                               error = filldir(dirent, "..", 2,
+                                               dotdot_offset,
+                                               parent_ino, DT_DIR);
                                if (error)
                                        break;
                                stored++;
 
-                               filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+                               filp->f_pos = dotdot_size;
                                continue;
                        }
 
-                       de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
+                       de = (struct ext4_dir_entry_2 *)
+                               (dir_buf + filp->f_pos - extra_offset);
                        if (ext4_check_dir_entry(inode, filp, de,
                                                 iloc.bh, dir_buf,
-                                                inline_size, offset)) {
+                                                extra_size, filp->f_pos)) {
                                ret = stored;
                                goto out;
                        }
-                       offset += ext4_rec_len_from_disk(de->rec_len,
-                                                        inline_size);
                        if (le32_to_cpu(de->inode)) {
                                /* We might block in the next section
                                 * if the data destination is
@@ -1415,9 +1556,8 @@ revalidate:
                                stored++;
                        }
                        filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
-                                                             inline_size);
+                                                             extra_size);
                }
-               offset = 0;
        }
 out:
        kfree(dir_buf);
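
A sketch of how the dx readdir path is expected to use the new helper from
ext4_htree_fill_tree() in namei.c (hedged; the exact caller is in the
companion half of this fix):

	if (ext4_has_inline_data(dir)) {
		int has_inline_data = 1;

		count = htree_inlinedir_to_tree(dir_file, dir, 0, hinfo,
						start_hash, start_minor_hash,
						&has_inline_data);
		if (has_inline_data)
			return count;
		/* dir was converted to blocks meanwhile; fall through */
	}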
index b3a5213bc73eac2082cbfa16bad66bba89c32ed4..793d44b84d7f778fe3981892ad2ea6b76fe86a9f 100644 (file)
@@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
        __u16 csum_hi = 0;
        __u32 csum;
 
-       csum_lo = raw->i_checksum_lo;
+       csum_lo = le16_to_cpu(raw->i_checksum_lo);
        raw->i_checksum_lo = 0;
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
-               csum_hi = raw->i_checksum_hi;
+               csum_hi = le16_to_cpu(raw->i_checksum_hi);
                raw->i_checksum_hi = 0;
        }
 
        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
                           EXT4_INODE_SIZE(inode->i_sb));
 
-       raw->i_checksum_lo = csum_lo;
+       raw->i_checksum_lo = cpu_to_le16(csum_lo);
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
-               raw->i_checksum_hi = csum_hi;
+               raw->i_checksum_hi = cpu_to_le16(csum_hi);
 
        return csum;
 }
@@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode)
                        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
                        tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
 
-                       jbd2_log_start_commit(journal, commit_tid);
-                       jbd2_log_wait_commit(journal, commit_tid);
+                       jbd2_complete_transaction(journal, commit_tid);
                        filemap_write_and_wait(&inode->i_data);
                }
                truncate_inode_pages(&inode->i_data, 0);
@@ -1081,20 +1080,42 @@ retry_journal:
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
+       int ret;
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        set_buffer_uptodate(bh);
-       return ext4_handle_dirty_metadata(handle, NULL, bh);
+       ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+       clear_buffer_meta(bh);
+       clear_buffer_prio(bh);
+       return ret;
 }
 
-static int ext4_generic_write_end(struct file *file,
-                                 struct address_space *mapping,
-                                 loff_t pos, unsigned len, unsigned copied,
-                                 struct page *page, void *fsdata)
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us;
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list.  metadata
+ * buffers are managed internally.
+ */
+static int ext4_write_end(struct file *file,
+                         struct address_space *mapping,
+                         loff_t pos, unsigned len, unsigned copied,
+                         struct page *page, void *fsdata)
 {
-       int i_size_changed = 0;
-       struct inode *inode = mapping->host;
        handle_t *handle = ext4_journal_current_handle();
+       struct inode *inode = mapping->host;
+       int ret = 0, ret2;
+       int i_size_changed = 0;
+
+       trace_ext4_write_end(inode, pos, len, copied);
+       if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+               ret = ext4_jbd2_file_inode(handle, inode);
+               if (ret) {
+                       unlock_page(page);
+                       page_cache_release(page);
+                       goto errout;
+               }
+       }
 
        if (ext4_has_inline_data(inode))
                copied = ext4_write_inline_data_end(inode, pos, len,
@@ -1105,7 +1126,7 @@ static int ext4_generic_write_end(struct file *file,
 
        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold i_mutex.
         *
         * But it's important to update i_size while still holding page lock:
         * page writeout could otherwise come in and zero beyond i_size.
@@ -1115,10 +1136,10 @@ static int ext4_generic_write_end(struct file *file,
                i_size_changed = 1;
        }
 
-       if (pos + copied >  EXT4_I(inode)->i_disksize) {
+       if (pos + copied > EXT4_I(inode)->i_disksize) {
                /* We need to mark inode dirty even if
         * new_i_size is less than inode->i_size
-                * bu greater than i_disksize.(hint delalloc)
+                * but greater than i_disksize. (hint delalloc)
                 */
                ext4_update_i_disksize(inode, (pos + copied));
                i_size_changed = 1;
@@ -1135,87 +1156,15 @@ static int ext4_generic_write_end(struct file *file,
        if (i_size_changed)
                ext4_mark_inode_dirty(handle, inode);
 
-       return copied;
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext4 never places buffers on inode->i_mapping->private_list.  metadata
- * buffers are managed internally.
- */
-static int ext4_ordered_write_end(struct file *file,
-                                 struct address_space *mapping,
-                                 loff_t pos, unsigned len, unsigned copied,
-                                 struct page *page, void *fsdata)
-{
-       handle_t *handle = ext4_journal_current_handle();
-       struct inode *inode = mapping->host;
-       int ret = 0, ret2;
-
-       trace_ext4_ordered_write_end(inode, pos, len, copied);
-       ret = ext4_jbd2_file_inode(handle, inode);
-
-       if (ret == 0) {
-               ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
-                                                       page, fsdata);
-               copied = ret2;
-               if (pos + len > inode->i_size && ext4_can_truncate(inode))
-                       /* if we have allocated more blocks and copied
-                        * less. We will have blocks allocated outside
-                        * inode->i_size. So truncate them
-                        */
-                       ext4_orphan_add(handle, inode);
-               if (ret2 < 0)
-                       ret = ret2;
-       } else {
-               unlock_page(page);
-               page_cache_release(page);
-       }
-
-       ret2 = ext4_journal_stop(handle);
-       if (!ret)
-               ret = ret2;
-
-       if (pos + len > inode->i_size) {
-               ext4_truncate_failed_write(inode);
-               /*
-                * If truncate failed early the inode might still be
-                * on the orphan list; we need to make sure the inode
-                * is removed from the orphan list in that case.
-                */
-               if (inode->i_nlink)
-                       ext4_orphan_del(NULL, inode);
-       }
-
-
-       return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
-                                   struct address_space *mapping,
-                                   loff_t pos, unsigned len, unsigned copied,
-                                   struct page *page, void *fsdata)
-{
-       handle_t *handle = ext4_journal_current_handle();
-       struct inode *inode = mapping->host;
-       int ret = 0, ret2;
-
-       trace_ext4_writeback_write_end(inode, pos, len, copied);
-       ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
-                                                       page, fsdata);
-       copied = ret2;
+       if (copied < 0)
+               ret = copied;
        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less, we will have blocks allocated outside
                 * inode->i_size, so truncate them
                 */
                ext4_orphan_add(handle, inode);
-
-       if (ret2 < 0)
-               ret = ret2;
-
+errout:
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
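
The ordered/writeback distinction survives only as the run-time check at
the top of the merged function; a condensed sketch of the resulting flow,
abridged from the hunk above:

	trace_ext4_write_end(inode, pos, len, copied);
	if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
		/* ordered mode: file data must hit the journal's ordered
		 * list before commit; writeback mode skips this */
		ret = ext4_jbd2_file_inode(handle, inode);
		if (ret)
			goto errout;	/* after unlocking the page */
	}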
@@ -1538,7 +1487,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
        struct ext4_io_submit io_submit;
 
        BUG_ON(mpd->next_page <= mpd->first_page);
-       memset(&io_submit, 0, sizeof(io_submit));
+       ext4_io_submit_init(&io_submit, mpd->wbc);
+       io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+       if (!io_submit.io_end)
+               return -ENOMEM;
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
@@ -1626,6 +1578,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                pagevec_release(&pvec);
        }
        ext4_io_submit(&io_submit);
+       /* Drop io_end reference we got from init */
+       ext4_put_io_end_defer(io_submit.io_end);
        return ret;
 }
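
The init/put pairing above is the new io_end reference counting; a sketch
of the lifecycle, assuming the helpers from the page-io.c patch in this
series:

	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);	/* ref = 1 */
	/* each bio submitted for this io_end takes its own reference */
	bio->bi_private = ext4_get_io_end(io_submit.io_end);
	/* the submitter drops its reference; the final put (often from bio
	 * completion) defers unwritten-extent conversion to a workqueue */
	ext4_put_io_end_defer(io_submit.io_end);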
 
@@ -1670,22 +1624,25 @@ static void ext4_print_free_blocks(struct inode *inode)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct super_block *sb = inode->i_sb;
+       struct ext4_inode_info *ei = EXT4_I(inode);
 
        ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
               EXT4_C2B(EXT4_SB(inode->i_sb),
-                       ext4_count_free_clusters(inode->i_sb)));
+                       ext4_count_free_clusters(sb)));
        ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
        ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
-              (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+              (long long) EXT4_C2B(EXT4_SB(sb),
                percpu_counter_sum(&sbi->s_freeclusters_counter)));
        ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
-              (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+              (long long) EXT4_C2B(EXT4_SB(sb),
                percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
        ext4_msg(sb, KERN_CRIT, "Block reservation details");
        ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
-                EXT4_I(inode)->i_reserved_data_blocks);
+                ei->i_reserved_data_blocks);
        ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
-              EXT4_I(inode)->i_reserved_meta_blocks);
+              ei->i_reserved_meta_blocks);
+       ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+              ei->i_allocated_meta_blocks);
        return;
 }
 
@@ -1740,12 +1697,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
         */
        map.m_lblk = next;
        map.m_len = max_blocks;
-       get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+       /*
+        * We're in the delalloc path and it is possible that we're going
+        * to need more metadata blocks than previously reserved. However
+        * we must not fail, because we're in writeback and there is
+        * nothing we can do about it; failing might result in data loss.
+        * So use reserved blocks to allocate metadata if possible.
+        */
+       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
        if (ext4_should_dioread_nolock(mpd->inode))
                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (mpd->b_state & (1 << BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
        blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
        if (blks < 0) {
                struct super_block *sb = mpd->inode->i_sb;
@@ -2272,9 +2238,16 @@ static int ext4_writepage(struct page *page,
                 */
                return __ext4_journalled_writepage(page, len);
 
-       memset(&io_submit, 0, sizeof(io_submit));
+       ext4_io_submit_init(&io_submit, wbc);
+       io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+       if (!io_submit.io_end) {
+               redirty_page_for_writepage(wbc, page);
+               return -ENOMEM;
+       }
        ret = ext4_bio_write_page(&io_submit, page, len, wbc);
        ext4_io_submit(&io_submit);
+       /* Drop io_end reference we got from init */
+       ext4_put_io_end_defer(io_submit.io_end);
        return ret;
 }
 
@@ -2661,7 +2634,7 @@ out_writepages:
 
 static int ext4_nonda_switch(struct super_block *sb)
 {
-       s64 free_blocks, dirty_blocks;
+       s64 free_clusters, dirty_clusters;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
        /*
@@ -2672,17 +2645,18 @@ static int ext4_nonda_switch(struct super_block *sb)
         * Delalloc need an accurate free block accounting. So switch
         * to non delalloc when we are near to error range.
         */
-       free_blocks  = EXT4_C2B(sbi,
-               percpu_counter_read_positive(&sbi->s_freeclusters_counter));
-       dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+       free_clusters =
+               percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+       dirty_clusters =
+               percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
        /*
         * Start pushing delalloc when 1/2 of free blocks are dirty.
         */
-       if (dirty_blocks && (free_blocks < 2 * dirty_blocks))
+       if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
                try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
 
-       if (2 * free_blocks < 3 * dirty_blocks ||
-               free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
+       if (2 * free_clusters < 3 * dirty_clusters ||
+           free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
                /*
                 * free cluster count is less than 150% of dirty clusters,
                 * or free clusters are less than the watermark
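
Worked numbers for the two thresholds (illustrative only):

	/* e.g. free_clusters = 800, dirty_clusters = 600:
	 *   800 < 2 * 600   -> delalloc writeback is already being pushed;
	 *   2 * 800 < 3 * 600 (1600 < 1800) -> fall back to nodelalloc
	 *   for this write.
	 */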
@@ -2818,18 +2792,9 @@ static int ext4_da_write_end(struct file *file,
        unsigned long start, end;
        int write_mode = (int)(unsigned long)fsdata;
 
-       if (write_mode == FALL_BACK_TO_NONDELALLOC) {
-               switch (ext4_inode_journal_mode(inode)) {
-               case EXT4_INODE_ORDERED_DATA_MODE:
-                       return ext4_ordered_write_end(file, mapping, pos,
-                                       len, copied, page, fsdata);
-               case EXT4_INODE_WRITEBACK_DATA_MODE:
-                       return ext4_writeback_write_end(file, mapping, pos,
-                                       len, copied, page, fsdata);
-               default:
-                       BUG();
-               }
-       }
+       if (write_mode == FALL_BACK_TO_NONDELALLOC)
+               return ext4_write_end(file, mapping, pos,
+                                     len, copied, page, fsdata);
 
        trace_ext4_da_write_end(inode, pos, len, copied);
        start = pos & (PAGE_CACHE_SIZE - 1);
@@ -3113,9 +3078,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
        struct inode *inode = file_inode(iocb->ki_filp);
         ext4_io_end_t *io_end = iocb->private;
 
-       /* if not async direct IO or dio with 0 bytes write, just return */
-       if (!io_end || !size)
-               goto out;
+       /* if not async direct IO, just return */
+       if (!io_end) {
+               inode_dio_done(inode);
+               if (is_async)
+                       aio_complete(iocb, ret, 0);
+               return;
+       }
 
        ext_debug("ext4_end_io_dio(): io_end 0x%p "
                  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3123,25 +3092,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                  size);
 
        iocb->private = NULL;
-
-       /* if not aio dio with unwritten extents, just free io and return */
-       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-               ext4_free_io_end(io_end);
-out:
-               inode_dio_done(inode);
-               if (is_async)
-                       aio_complete(iocb, ret, 0);
-               return;
-       }
-
        io_end->offset = offset;
        io_end->size = size;
        if (is_async) {
                io_end->iocb = iocb;
                io_end->result = ret;
        }
-
-       ext4_add_complete_io(io_end);
+       ext4_put_io_end_defer(io_end);
 }
 
 /*
@@ -3175,6 +3132,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
        get_block_t *get_block_func = NULL;
        int dio_flags = 0;
        loff_t final_size = offset + count;
+       ext4_io_end_t *io_end = NULL;
 
        /* Use the old path for reads and writes beyond i_size. */
        if (rw != WRITE || final_size > inode->i_size)
@@ -3213,13 +3171,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
        iocb->private = NULL;
        ext4_inode_aio_set(inode, NULL);
        if (!is_sync_kiocb(iocb)) {
-               ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+               io_end = ext4_init_io_end(inode, GFP_NOFS);
                if (!io_end) {
                        ret = -ENOMEM;
                        goto retake_lock;
                }
                io_end->flag |= EXT4_IO_END_DIRECT;
-               iocb->private = io_end;
+               /*
+                * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+                */
+               iocb->private = ext4_get_io_end(io_end);
                /*
                 * we save the io structure for current async direct
                 * IO, so that later ext4_map_blocks() could flag the
@@ -3243,26 +3204,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                                   NULL,
                                   dio_flags);
 
-       if (iocb->private)
-               ext4_inode_aio_set(inode, NULL);
        /*
-        * The io_end structure takes a reference to the inode, that
-        * structure needs to be destroyed and the reference to the
-        * inode need to be dropped, when IO is complete, even with 0
-        * byte write, or failed.
-        *
-        * In the successful AIO DIO case, the io_end structure will
-        * be destroyed and the reference to the inode will be dropped
-        * after the end_io call back function is called.
-        *
-        * In the case there is 0 byte write, or error case, since VFS
-        * direct IO won't invoke the end_io call back function, we
-        * need to free the end_io structure here.
+        * Put our reference to io_end. This can free the io_end structure,
+        * e.g. in the sync IO case or in case of error. It can even perform
+        * extent conversion if all the bios we submitted finished before we
+        * got here. Note that in that case iocb->private may already have
+        * been set to NULL here.
         */
-       if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
-               ext4_free_io_end(iocb->private);
-               iocb->private = NULL;
-       } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+       if (io_end) {
+               ext4_inode_aio_set(inode, NULL);
+               ext4_put_io_end(io_end);
+               /*
+                * In case of error or no write, ext4_end_io_dio() was not
+                * called, so we have to put the iocb's reference.
+                */
+               if (ret <= 0 && ret != -EIOCBQUEUED) {
+                       WARN_ON(iocb->private != io_end);
+                       ext4_put_io_end(io_end);
+                       iocb->private = NULL;
+               }
+       }
+       if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
                                                EXT4_STATE_DIO_UNWRITTEN)) {
                int err;
                /*
@@ -3334,27 +3296,12 @@ static int ext4_journalled_set_page_dirty(struct page *page)
        return __set_page_dirty_nobuffers(page);
 }
 
-static const struct address_space_operations ext4_ordered_aops = {
+static const struct address_space_operations ext4_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
        .writepage              = ext4_writepage,
        .write_begin            = ext4_write_begin,
-       .write_end              = ext4_ordered_write_end,
-       .bmap                   = ext4_bmap,
-       .invalidatepage         = ext4_invalidatepage,
-       .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
-       .migratepage            = buffer_migrate_page,
-       .is_partially_uptodate  = block_is_partially_uptodate,
-       .error_remove_page      = generic_error_remove_page,
-};
-
-static const struct address_space_operations ext4_writeback_aops = {
-       .readpage               = ext4_readpage,
-       .readpages              = ext4_readpages,
-       .writepage              = ext4_writepage,
-       .write_begin            = ext4_write_begin,
-       .write_end              = ext4_writeback_write_end,
+       .write_end              = ext4_write_end,
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
@@ -3399,23 +3346,21 @@ void ext4_set_aops(struct inode *inode)
 {
        switch (ext4_inode_journal_mode(inode)) {
        case EXT4_INODE_ORDERED_DATA_MODE:
-               if (test_opt(inode->i_sb, DELALLOC))
-                       inode->i_mapping->a_ops = &ext4_da_aops;
-               else
-                       inode->i_mapping->a_ops = &ext4_ordered_aops;
+               ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
                break;
        case EXT4_INODE_WRITEBACK_DATA_MODE:
-               if (test_opt(inode->i_sb, DELALLOC))
-                       inode->i_mapping->a_ops = &ext4_da_aops;
-               else
-                       inode->i_mapping->a_ops = &ext4_writeback_aops;
+               ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
                break;
        case EXT4_INODE_JOURNAL_DATA_MODE:
                inode->i_mapping->a_ops = &ext4_journalled_aops;
-               break;
+               return;
        default:
                BUG();
        }
+       if (test_opt(inode->i_sb, DELALLOC))
+               inode->i_mapping->a_ops = &ext4_da_aops;
+       else
+               inode->i_mapping->a_ops = &ext4_aops;
 }
 
 
@@ -3646,20 +3591,190 @@ int ext4_can_truncate(struct inode *inode)
 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
        struct inode *inode = file_inode(file);
+       struct super_block *sb = inode->i_sb;
+       ext4_lblk_t first_block, stop_block;
+       struct address_space *mapping = inode->i_mapping;
+       loff_t first_page, last_page, page_len;
+       loff_t first_page_offset, last_page_offset;
+       handle_t *handle;
+       unsigned int credits;
+       int ret = 0;
+
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
 
-       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               return ext4_ind_punch_hole(file, offset, length);
-
-       if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
+       if (EXT4_SB(sb)->s_cluster_ratio > 1) {
                /* TODO: Add support for bigalloc file systems */
                return -EOPNOTSUPP;
        }
 
        trace_ext4_punch_hole(inode, offset, length);
 
-       return ext4_ext_punch_hole(file, offset, length);
+       /*
+        * Write out all dirty pages to avoid race conditions,
+        * then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               ret = filemap_write_and_wait_range(mapping, offset,
+                                                  offset + length - 1);
+               if (ret)
+                       return ret;
+       }
+
+       mutex_lock(&inode->i_mutex);
+       /* It's not possible to punch a hole in an append-only file */
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+               ret = -EPERM;
+               goto out_mutex;
+       }
+       if (IS_SWAPFILE(inode)) {
+               ret = -ETXTBSY;
+               goto out_mutex;
+       }
+
+       /* No need to punch hole beyond i_size */
+       if (offset >= inode->i_size)
+               goto out_mutex;
+
+       /*
+        * If the hole extends beyond i_size, set the hole
+        * to end after the page that contains i_size
+        */
+       if (offset + length > inode->i_size) {
+               length = inode->i_size +
+                  PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+                  offset;
+       }
+
+       first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+       first_page_offset = first_page << PAGE_CACHE_SHIFT;
+       last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+       /* Now release the pages */
+       if (last_page_offset > first_page_offset) {
+               truncate_pagecache_range(inode, first_page_offset,
+                                        last_page_offset - 1);
+       }
+
+       /* Wait for all existing dio workers; newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       ret = ext4_flush_unwritten_io(inode);
+       if (ret)
+               goto out_dio;
+       inode_dio_wait(inode);
+
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               credits = ext4_writepage_trans_blocks(inode);
+       else
+               credits = ext4_blocks_for_truncate(inode);
+       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               ext4_std_error(sb, ret);
+               goto out_dio;
+       }
+
+       /*
+        * Now we need to zero out the non-page-aligned data in the
+        * pages at the start and tail of the hole, and unmap the
+        * buffer heads for the block aligned regions of the page that
+        * were completely zeroed.
+        */
+       if (first_page > last_page) {
+               /*
+                * If the file space being truncated is contained
+                * within a page, just zero out and unmap the middle
+                * of that page.
+                */
+               ret = ext4_discard_partial_page_buffers(handle,
+                       mapping, offset, length, 0);
+
+               if (ret)
+                       goto out_stop;
+       } else {
+               /*
+                * zero out and unmap the partial page that contains
+                * the start of the hole
+                */
+               page_len = first_page_offset - offset;
+               if (page_len > 0) {
+                       ret = ext4_discard_partial_page_buffers(handle, mapping,
+                                               offset, page_len, 0);
+                       if (ret)
+                               goto out_stop;
+               }
+
+               /*
+                * zero out and unmap the partial page that contains
+                * the end of the hole
+                */
+               page_len = offset + length - last_page_offset;
+               if (page_len > 0) {
+                       ret = ext4_discard_partial_page_buffers(handle, mapping,
+                                       last_page_offset, page_len, 0);
+                       if (ret)
+                               goto out_stop;
+               }
+       }
+
+       /*
+        * If i_size is contained in the last page, we need to
+        * unmap and zero the partial page after i_size
+        */
+       if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+          inode->i_size % PAGE_CACHE_SIZE != 0) {
+               page_len = PAGE_CACHE_SIZE -
+                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+               if (page_len > 0) {
+                       ret = ext4_discard_partial_page_buffers(handle,
+                                       mapping, inode->i_size, page_len, 0);
+
+                       if (ret)
+                               goto out_stop;
+               }
+       }
+
+       first_block = (offset + sb->s_blocksize - 1) >>
+               EXT4_BLOCK_SIZE_BITS(sb);
+       stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+       /* If there are no blocks to remove, return now */
+       if (first_block >= stop_block)
+               goto out_stop;
+
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_discard_preallocations(inode);
+
+       ret = ext4_es_remove_extent(inode, first_block,
+                                   stop_block - first_block);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               ret = ext4_ext_remove_space(inode, first_block,
+                                           stop_block - 1);
+       else
+               ret = ext4_free_hole_blocks(handle, inode, first_block,
+                                           stop_block);
+
+       ext4_discard_preallocations(inode);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+out_stop:
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
 }
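
The setup in ext4_punch_hole() rounds the page range and the block range inward, so only whole pages are dropped from the page cache and only whole blocks are freed, while the partial edges are zeroed in place by ext4_discard_partial_page_buffers(). A small sketch of that rounding, assuming 4096-byte pages and hypothetical 1024-byte filesystem blocks:

#include <stdio.h>

#define PAGE_SIZE  4096ULL
#define PAGE_SHIFT 12
#define BLK_SIZE   1024ULL	/* assumed 1 KiB blocks */
#define BLK_SHIFT  10

int main(void)
{
	unsigned long long offset = 3000, length = 10000;

	/* First whole page inside the hole, first page past its end. */
	unsigned long long first_page = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long long last_page  = (offset + length) >> PAGE_SHIFT;

	/* Same inward rounding for blocks: free only fully covered ones. */
	unsigned long long first_block = (offset + BLK_SIZE - 1) >> BLK_SHIFT;
	unsigned long long stop_block  = (offset + length) >> BLK_SHIFT;

	printf("pages  [%llu, %llu)\n", first_page, last_page);   /* [1, 3)  */
	printf("blocks [%llu, %llu)\n", first_block, stop_block); /* [3, 12) */
	return 0;
}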
 
 /*
@@ -3692,6 +3807,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  */
 void ext4_truncate(struct inode *inode)
 {
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned int credits;
+       handle_t *handle;
+       struct address_space *mapping = inode->i_mapping;
+       loff_t page_len;
+
+       /*
+        * There is a possibility that we're either freeing the inode
+        * or it's a completely new inode. In those cases we might not
+        * have i_mutex locked because it's not necessary.
+        */
+       if (!(inode->i_state & (I_NEW|I_FREEING)))
+               WARN_ON(!mutex_is_locked(&inode->i_mutex));
        trace_ext4_truncate_enter(inode);
 
        if (!ext4_can_truncate(inode))
@@ -3710,10 +3838,72 @@ void ext4_truncate(struct inode *inode)
                        return;
        }
 
+       /*
+        * finish any pending end_io work so we won't run the risk of
+        * converting any truncated blocks to initialized later
+        */
+       ext4_flush_unwritten_io(inode);
+
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               credits = ext4_writepage_trans_blocks(inode);
+       else
+               credits = ext4_blocks_for_truncate(inode);
+
+       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+       if (IS_ERR(handle)) {
+               ext4_std_error(inode->i_sb, PTR_ERR(handle));
+               return;
+       }
+
+       if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+               page_len = PAGE_CACHE_SIZE -
+                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+               if (ext4_discard_partial_page_buffers(handle,
+                               mapping, inode->i_size, page_len, 0))
+                       goto out_stop;
+       }
+
+       /*
+        * We add the inode to the orphan list, so that if this
+        * truncate spans multiple transactions, and we crash, we will
+        * resume the truncate when the filesystem recovers.  It also
+        * marks the inode dirty, to catch the new size.
+        *
+        * Implication: the file must always be in a sane, consistent
+        * truncatable state while each transaction commits.
+        */
+       if (ext4_orphan_add(handle, inode))
+               goto out_stop;
+
+       down_write(&EXT4_I(inode)->i_data_sem);
+
+       ext4_discard_preallocations(inode);
+
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               ext4_ext_truncate(inode);
+               ext4_ext_truncate(handle, inode);
        else
-               ext4_ind_truncate(inode);
+               ext4_ind_truncate(handle, inode);
+
+       up_write(&ei->i_data_sem);
+
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+
+out_stop:
+       /*
+        * If this was a simple ftruncate() and the file will remain alive,
+        * then we need to clear up the orphan record which we created above.
+        * However, if this was a real unlink then we were called by
+        * ext4_delete_inode(), and we allow that function to clean up the
+        * orphan info for us.
+        */
+       if (inode->i_nlink)
+               ext4_orphan_del(handle, inode);
+
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+       ext4_journal_stop(handle);
 
        trace_ext4_truncate_exit(inode);
 }
@@ -3821,13 +4011,14 @@ make_io:
                if (EXT4_SB(sb)->s_inode_readahead_blks) {
                        ext4_fsblk_t b, end, table;
                        unsigned num;
+                       __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
 
                        table = ext4_inode_table(sb, gdp);
                        /* s_inode_readahead_blks is always a power of 2 */
-                       b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+                       b = block & ~((ext4_fsblk_t) ra_blks - 1);
                        if (table > b)
                                b = table;
-                       end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+                       end = b + ra_blks;
                        num = EXT4_INODES_PER_GROUP(sb);
                        if (ext4_has_group_desc_csum(sb))
                                num -= ext4_itable_unused_count(sb, gdp);
@@ -4024,8 +4215,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         * NeilBrown 1999oct15
         */
        if (inode->i_nlink == 0) {
-               if (inode->i_mode == 0 ||
-                   !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+               if ((inode->i_mode == 0 ||
+                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+                   ino != EXT4_BOOT_LOADER_INO) {
                        /* this inode is deleted */
                        ret = -ESTALE;
                        goto bad_inode;
@@ -4033,7 +4225,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                /* The only unlinked inodes we let through here have
                 * valid i_mode and are being read by the orphan
                 * recovery code: that's fine, we're about to complete
-                * the process of deleting those. */
+                * the process of deleting those.
+                * OR it is the EXT4_BOOT_LOADER_INO, which is
+                * not initialized on a new filesystem. */
        }
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4153,6 +4347,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+       } else if (ino == EXT4_BOOT_LOADER_INO) {
+               make_bad_inode(inode);
        } else {
                ret = -EIO;
                EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
index 721f4d33e148adf63872544dd8e4cedbbbb249f7..9491ac0590f746b9abe56e484e7646022a0bd1c3 100644 (file)
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
+#include "ext4_extents.h"
 
 #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
 
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a:          pointer to first memory area
+ * @b:          pointer to second memory area
+ * @len:        number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+       unsigned char *ap, *bp;
+       unsigned char tmp;
+
+       ap = (unsigned char *)a;
+       bp = (unsigned char *)b;
+       while (len-- > 0) {
+               tmp = *ap;
+               *ap = *bp;
+               *bp = tmp;
+               ap++;
+               bp++;
+       }
+}
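
Since memswap() is type-blind, it works on any pair of equally sized objects; a trivial usage sketch (values are made up):

	loff_t a = 4096, b = 8192;

	memswap(&a, &b, sizeof(a));	/* now a == 8192, b == 4096 */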
+
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure that calling this function twice
+ * will revert all changes.
+ *
+ * @inode1:     pointer to first inode
+ * @inode2:     pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+       loff_t isize;
+       struct ext4_inode_info *ei1;
+       struct ext4_inode_info *ei2;
+
+       ei1 = EXT4_I(inode1);
+       ei2 = EXT4_I(inode2);
+
+       memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+       memswap(&inode1->i_version, &inode2->i_version,
+                 sizeof(inode1->i_version));
+       memswap(&inode1->i_blocks, &inode2->i_blocks,
+                 sizeof(inode1->i_blocks));
+       memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+       memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+       memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+       memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+       memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+       memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+       memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
+       memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+
+       isize = i_size_read(inode1);
+       i_size_write(inode1, i_size_read(inode2));
+       i_size_write(inode2, isize);
+}
+
+/**
+ * Swap the information between the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb:         the super block of the filesystem
+ * @inode:      the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+                               struct inode *inode)
+{
+       handle_t *handle;
+       int err;
+       struct inode *inode_bl;
+       struct ext4_inode_info *ei;
+       struct ext4_inode_info *ei_bl;
+       struct ext4_sb_info *sbi;
+
+       if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
+               err = -EINVAL;
+               goto swap_boot_out;
+       }
+
+       if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+               err = -EPERM;
+               goto swap_boot_out;
+       }
+
+       sbi = EXT4_SB(sb);
+       ei = EXT4_I(inode);
+
+       inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+       if (IS_ERR(inode_bl)) {
+               err = PTR_ERR(inode_bl);
+               goto swap_boot_out;
+       }
+       ei_bl = EXT4_I(inode_bl);
+
+       filemap_flush(inode->i_mapping);
+       filemap_flush(inode_bl->i_mapping);
+
+       /* Protect orig inodes against a truncate and make sure
+        * that only one swap_inode_boot_loader is running. */
+       ext4_inode_double_lock(inode, inode_bl);
+
+       truncate_inode_pages(&inode->i_data, 0);
+       truncate_inode_pages(&inode_bl->i_data, 0);
+
+       /* Wait for all existing dio workers */
+       ext4_inode_block_unlocked_dio(inode);
+       ext4_inode_block_unlocked_dio(inode_bl);
+       inode_dio_wait(inode);
+       inode_dio_wait(inode_bl);
+
+       handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+       if (IS_ERR(handle)) {
+               err = -EINVAL;
+               goto swap_boot_out;
+       }
+
+       /* Protect extent tree against block allocations via delalloc */
+       ext4_double_down_write_data_sem(inode, inode_bl);
+
+       if (inode_bl->i_nlink == 0) {
+               /* this inode has never been used as a BOOT_LOADER */
+               set_nlink(inode_bl, 1);
+               i_uid_write(inode_bl, 0);
+               i_gid_write(inode_bl, 0);
+               inode_bl->i_flags = 0;
+               ei_bl->i_flags = 0;
+               inode_bl->i_version = 1;
+               i_size_write(inode_bl, 0);
+               inode_bl->i_mode = S_IFREG;
+               if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                             EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+                       ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+                       ext4_ext_tree_init(handle, inode_bl);
+               } else {
+                       memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+               }
+       }
+
+       swap_inode_data(inode, inode_bl);
+
+       inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+       spin_lock(&sbi->s_next_gen_lock);
+       inode->i_generation = sbi->s_next_generation++;
+       inode_bl->i_generation = sbi->s_next_generation++;
+       spin_unlock(&sbi->s_next_gen_lock);
+
+       ext4_discard_preallocations(inode);
+
+       err = ext4_mark_inode_dirty(handle, inode);
+       if (err < 0) {
+               ext4_warning(inode->i_sb,
+                       "couldn't mark inode #%lu dirty (err %d)",
+                       inode->i_ino, err);
+               /* Revert all changes: */
+               swap_inode_data(inode, inode_bl);
+       } else {
+               err = ext4_mark_inode_dirty(handle, inode_bl);
+               if (err < 0) {
+                       ext4_warning(inode_bl->i_sb,
+                               "couldn't mark inode #%lu dirty (err %d)",
+                               inode_bl->i_ino, err);
+                       /* Revert all changes: */
+                       swap_inode_data(inode, inode_bl);
+                       ext4_mark_inode_dirty(handle, inode);
+               }
+       }
+
+       ext4_journal_stop(handle);
+
+       ext4_double_up_write_data_sem(inode, inode_bl);
+
+       ext4_inode_resume_unlocked_dio(inode);
+       ext4_inode_resume_unlocked_dio(inode_bl);
+
+       ext4_inode_double_unlock(inode, inode_bl);
+
+       iput(inode_bl);
+
+swap_boot_out:
+       return err;
+}
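
For reference, the new ioctl can be exercised from userspace with a few lines. The request number below is assumed to match the EXT4_IOC_SWAP_BOOT definition in fs/ext4/ext4.h at the time of this merge; verify it against your headers:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define EXT4_IOC_SWAP_BOOT	_IO('f', 17)	/* assumed; see ext4.h */

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2)
		return 1;
	/* The target inode must be a regular file opened for writing. */
	fd = open(argv[1], O_WRONLY);
	if (fd < 0 || ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
		perror("EXT4_IOC_SWAP_BOOT");
		return 1;
	}
	close(fd);
	return 0;
}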
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        struct inode *inode = file_inode(filp);
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        if (!capable(CAP_SYS_RESOURCE))
                                goto flags_out;
                }
-               if (oldflags & EXT4_EXTENTS_FL) {
-                       /* We don't support clearning extent flags */
-                       if (!(flags & EXT4_EXTENTS_FL)) {
-                               err = -EOPNOTSUPP;
-                               goto flags_out;
-                       }
-               } else if (flags & EXT4_EXTENTS_FL) {
-                       /* migrate the file */
+               if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
                        migrate = 1;
-                       flags &= ~EXT4_EXTENTS_FL;
-               }
 
                if (flags & EXT4_EOFBLOCKS_FL) {
                        /* we don't support adding EOFBLOCKS flag */
@@ -137,8 +320,13 @@ flags_err:
                        err = ext4_change_inode_journal_flag(inode, jflag);
                if (err)
                        goto flags_out;
-               if (migrate)
-                       err = ext4_ext_migrate(inode);
+               if (migrate) {
+                       if (flags & EXT4_EXTENTS_FL)
+                               err = ext4_ext_migrate(inode);
+                       else
+                               err = ext4_ind_migrate(inode);
+               }
+
 flags_out:
                mutex_unlock(&inode->i_mutex);
                mnt_drop_write_file(filp);
@@ -357,9 +545,13 @@ group_add_out:
                return err;
        }
 
+       case EXT4_IOC_SWAP_BOOT:
+               if (!(filp->f_mode & FMODE_WRITE))
+                       return -EBADF;
+               return swap_inode_boot_loader(sb, inode);
+
        case EXT4_IOC_RESIZE_FS: {
                ext4_fsblk_t n_blocks_count;
-               struct super_block *sb = inode->i_sb;
                int err = 0, err2 = 0;
                ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
 
index ee6614bdb63950b7481443f944ca3c87681903be..a11ea4d6164c2593411517473d6543160cec1f6e 100644 (file)
@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
        ext4_clear_bit(bit, addr);
 }
 
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{
+       addr = mb_correct_addr_and_bit(&bit, addr);
+       return ext4_test_and_clear_bit(bit, addr);
+}
+
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
        int fix = 0, ret, tmpmax;
@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
        spin_unlock(&EXT4_SB(sb)->s_bal_lock);
 }
 
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+       int count;
+       int order = 1;
+       void *buddy;
+
+       while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+               ext4_set_bits(buddy, 0, count);
+       }
+       e4b->bd_info->bb_fragments = 0;
+       memset(e4b->bd_info->bb_counters, 0,
+               sizeof(*e4b->bd_info->bb_counters) *
+               (e4b->bd_sb->s_blocksize_bits + 2));
+
+       ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+               e4b->bd_bitmap, e4b->bd_group);
+}
+
 /* The buddy information is attached to the buddy cache inode
  * for convenience. The information regarding each group
  * is loaded via ext4_mb_load_buddy. The information involves
@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
        first_block = page->index * blocks_per_page;
        for (i = 0; i < blocks_per_page; i++) {
-               int group;
-
                group = (first_block + i) >> 1;
                if (group >= ngroups)
                        break;
@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
        struct page *page;
        int ret = 0;
 
+       might_sleep();
        mb_debug(1, "init group %u\n", group);
        this_grp = ext4_get_group_info(sb, group);
        /*
@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct inode *inode = sbi->s_buddy_cache;
 
+       might_sleep();
        mb_debug(1, "load group %u\n", group);
 
        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
        }
 }
 
+/* Clear bits in the given range;
+ * returns the first zero bit found, if any, -1 otherwise.
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+       __u32 *addr;
+       int zero_bit = -1;
+
+       len = cur + len;
+       while (cur < len) {
+               if ((cur & 31) == 0 && (len - cur) >= 32) {
+                       /* fast path: clear whole word at once */
+                       addr = bm + (cur >> 3);
+                       if (*addr != (__u32)(-1) && zero_bit == -1)
+                               zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+                       *addr = 0;
+                       cur += 32;
+                       continue;
+               }
+               if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+                       zero_bit = cur;
+               cur++;
+       }
+
+       return zero_bit;
+}
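
The fast path above clears 32 bits per iteration once the cursor hits a word boundary, recording the first already-zero bit it sees. The same walk as a standalone illustration over a plain word array (not ext4's mb_* helpers, which also handle endianness):

#include <stdint.h>
#include <stdio.h>

/* Clear bits [cur, cur + len) in a word array; return the first bit
 * that was already zero, or -1 if all of them were set. */
static int test_and_clear_bits(uint32_t *bm, int cur, int len)
{
	int zero_bit = -1;

	for (len += cur; cur < len; ) {
		if ((cur & 31) == 0 && len - cur >= 32) {
			/* fast path: clear a whole word at once */
			uint32_t *w = bm + (cur >> 5);

			if (*w != UINT32_MAX && zero_bit == -1)
				zero_bit = cur + __builtin_ctz(~*w);
			*w = 0;
			cur += 32;
			continue;
		}
		if (!((bm[cur >> 5] >> (cur & 31)) & 1) && zero_bit == -1)
			zero_bit = cur;
		bm[cur >> 5] &= ~(UINT32_C(1) << (cur & 31));
		cur++;
	}
	return zero_bit;
}

int main(void)
{
	uint32_t bm[2] = { 0xffffffff, 0xffffffef };	/* bit 36 clear */

	printf("%d\n", test_and_clear_bits(bm, 5, 59));	/* prints 36 */
	return 0;
}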
+
 void ext4_set_bits(void *bm, int cur, int len)
 {
        __u32 *addr;
@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len)
        }
 }
 
+/*
+ * _________________________________________________________________ */
+
+static inline int mb_buddy_adjust_border(int *bit, void *bitmap, int side)
+{
+       if (mb_test_bit(*bit + side, bitmap)) {
+               mb_clear_bit(*bit, bitmap);
+               (*bit) -= side;
+               return 1;
+       } else {
+               (*bit) += side;
+               mb_set_bit(*bit, bitmap);
+               return -1;
+       }
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+       int max;
+       int order = 1;
+       void *buddy = mb_find_buddy(e4b, order, &max);
+
+       while (buddy) {
+               void *buddy2;
+
+               /* Bits in range [first; last] are known to be set since
+                * corresponding blocks were allocated. Bits in range
+                * (first; last) will stay set because they form buddies on
+                * upper layer. We just deal with borders if they don't
+                * align with upper layer and then go up.
+                * Releasing entire group is all about clearing
+                * single bit of highest order buddy.
+                */
+
+               /* Example:
+                * ---------------------------------
+                * |   1   |   1   |   1   |   1   |
+                * ---------------------------------
+                * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+                * ---------------------------------
+                *   0   1   2   3   4   5   6   7
+                *      \_____________________/
+                *
+                * Neither [1] nor [6] is aligned to the layer above.
+                * Left neighbour [0] is free, so mark it busy,
+                * decrease bb_counters and extend the range to
+                * [0; 6].
+                * Right neighbour [7] is busy. It can't be coalesced
+                * with [6], so mark [6] free, increase bb_counters and
+                * shrink the range to [0; 5].
+                * Then shift the range to [0; 2], go up and do the same.
+                */
+
+               if (first & 1)
+                       e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+               if (!(last & 1))
+                       e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+               if (first > last)
+                       break;
+               order++;
+
+               if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+                       mb_clear_bits(buddy, first, last - first + 1);
+                       e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+                       break;
+               }
+               first >>= 1;
+               last >>= 1;
+               buddy = buddy2;
+       }
+}
+
 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
-                         int first, int count)
+                          int first, int count)
 {
-       int block = 0;
-       int max = 0;
-       int order;
-       void *buddy;
-       void *buddy2;
+       int left_is_free = 0;
+       int right_is_free = 0;
+       int block;
+       int last = first + count - 1;
        struct super_block *sb = e4b->bd_sb;
 
-       BUG_ON(first + count > (sb->s_blocksize << 3));
+       BUG_ON(last >= (sb->s_blocksize << 3));
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        mb_check_buddy(e4b);
        mb_free_blocks_double(inode, e4b, first, count);
@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
        if (first < e4b->bd_info->bb_first_free)
                e4b->bd_info->bb_first_free = first;
 
-       /* let's maintain fragments counter */
+       /* Access memory sequentially: check the left neighbour,
+        * clear the range and then check the right neighbour.
+        */
        if (first != 0)
-               block = !mb_test_bit(first - 1, e4b->bd_bitmap);
-       if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
-               max = !mb_test_bit(first + count, e4b->bd_bitmap);
-       if (block && max)
-               e4b->bd_info->bb_fragments--;
-       else if (!block && !max)
-               e4b->bd_info->bb_fragments++;
+               left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+       block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+       if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+               right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
 
-       /* let's maintain buddy itself */
-       while (count-- > 0) {
-               block = first++;
-               order = 0;
+       if (unlikely(block != -1)) {
+               ext4_fsblk_t blocknr;
 
-               if (!mb_test_bit(block, e4b->bd_bitmap)) {
-                       ext4_fsblk_t blocknr;
-
-                       blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
-                       blocknr += EXT4_C2B(EXT4_SB(sb), block);
-                       ext4_grp_locked_error(sb, e4b->bd_group,
-                                             inode ? inode->i_ino : 0,
-                                             blocknr,
-                                             "freeing already freed block "
-                                             "(bit %u)", block);
-               }
-               mb_clear_bit(block, e4b->bd_bitmap);
-               e4b->bd_info->bb_counters[order]++;
-
-               /* start of the buddy */
-               buddy = mb_find_buddy(e4b, order, &max);
-
-               do {
-                       block &= ~1UL;
-                       if (mb_test_bit(block, buddy) ||
-                                       mb_test_bit(block + 1, buddy))
-                               break;
-
-                       /* both the buddies are free, try to coalesce them */
-                       buddy2 = mb_find_buddy(e4b, order + 1, &max);
+               blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+               blocknr += EXT4_C2B(EXT4_SB(sb), block);
+               ext4_grp_locked_error(sb, e4b->bd_group,
+                                     inode ? inode->i_ino : 0,
+                                     blocknr,
+                                     "freeing already freed block "
+                                     "(bit %u)", block);
+               mb_regenerate_buddy(e4b);
+               goto done;
+       }
 
-                       if (!buddy2)
-                               break;
+       /* let's maintain fragments counter */
+       if (left_is_free && right_is_free)
+               e4b->bd_info->bb_fragments--;
+       else if (!left_is_free && !right_is_free)
+               e4b->bd_info->bb_fragments++;
 
-                       if (order > 0) {
-                               /* for special purposes, we don't set
-                                * free bits in bitmap */
-                               mb_set_bit(block, buddy);
-                               mb_set_bit(block + 1, buddy);
-                       }
-                       e4b->bd_info->bb_counters[order]--;
-                       e4b->bd_info->bb_counters[order]--;
+       /* buddy[0] == bd_bitmap is a special case, so handle
+        * it right away and let mb_buddy_mark_free stay free of
+        * zero-order checks.
+        * Check if the neighbours are to be coalesced, and
+        * adjust the bitmap, bb_counters and borders appropriately.
+        */
+       if (first & 1) {
+               first += !left_is_free;
+               e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+       }
+       if (!(last & 1)) {
+               last -= !right_is_free;
+               e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+       }
 
-                       block = block >> 1;
-                       order++;
-                       e4b->bd_info->bb_counters[order]++;
+       if (first <= last)
+               mb_buddy_mark_free(e4b, first >> 1, last >> 1);
 
-                       mb_clear_bit(block, buddy2);
-                       buddy = buddy2;
-               } while (1);
-       }
+done:
        mb_set_largest_free_order(sb, e4b->bd_info);
        mb_check_buddy(e4b);
 }
@@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
        if (pa->pa_type == MB_GROUP_PA)
                grp_blk--;
 
-       ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+       grp = ext4_get_group_number(sb, grp_blk);
 
        /*
         * possible race:
@@ -3807,7 +3918,7 @@ repeat:
 
        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
                BUG_ON(pa->pa_type != MB_INODE_PA);
-               ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+               group = ext4_get_group_number(sb, pa->pa_pstart);
 
                err = ext4_mb_load_buddy(sb, group, &e4b);
                if (err) {
@@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 
        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
 
-               ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+               group = ext4_get_group_number(sb, pa->pa_pstart);
                if (ext4_mb_load_buddy(sb, group, &e4b)) {
                        ext4_error(sb, "Error loading buddy information for %u",
                                        group);
@@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        unsigned int inquota = 0;
        unsigned int reserv_clstrs = 0;
 
+       might_sleep();
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);
 
@@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        node = rb_prev(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_free_data, efd_node);
-               if (can_merge(entry, new_entry)) {
+               if (can_merge(entry, new_entry) &&
+                   ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
                        new_entry->efd_start_cluster = entry->efd_start_cluster;
                        new_entry->efd_count += entry->efd_count;
                        rb_erase(node, &(db->bb_free_root));
-                       ext4_journal_callback_del(handle, &entry->efd_jce);
                        kmem_cache_free(ext4_free_data_cachep, entry);
                }
        }
@@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        node = rb_next(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_free_data, efd_node);
-               if (can_merge(new_entry, entry)) {
+               if (can_merge(new_entry, entry) &&
+                   ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
                        new_entry->efd_count += entry->efd_count;
                        rb_erase(node, &(db->bb_free_root));
-                       ext4_journal_callback_del(handle, &entry->efd_jce);
                        kmem_cache_free(ext4_free_data_cachep, entry);
                }
        }
@@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        int err = 0;
        int ret;
 
+       might_sleep();
        if (bh) {
                if (block)
                        BUG_ON(block != bh->b_blocknr);
index 480acf4a085fa7be60982c8643e20675164ad8c4..49e8bdff9163e830931570f4c2660cea201ab3fd 100644 (file)
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
                        return retval;
        }
        return retval;
-
 }
 
 int ext4_ext_migrate(struct inode *inode)
@@ -606,3 +605,64 @@ out:
 
        return retval;
 }
+
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+       struct ext4_extent_header       *eh;
+       struct ext4_super_block         *es = EXT4_SB(inode->i_sb)->s_es;
+       struct ext4_inode_info          *ei = EXT4_I(inode);
+       struct ext4_extent              *ex;
+       unsigned int                    i, len;
+       ext4_fsblk_t                    blk;
+       handle_t                        *handle;
+       int                             ret;
+
+       if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+           (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+               return -EINVAL;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+               return -EOPNOTSUPP;
+
+       handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ret = ext4_ext_check_inode(inode);
+       if (ret)
+               goto errout;
+
+       eh = ext_inode_hdr(inode);
+       ex  = EXT_FIRST_EXTENT(eh);
+       if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+           eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+               ret = -EOPNOTSUPP;
+               goto errout;
+       }
+       if (eh->eh_entries == 0) {
+               blk = len = 0;
+       } else {
+               len = le16_to_cpu(ex->ee_len);
+               blk = ext4_ext_pblock(ex);
+               if (len > EXT4_NDIR_BLOCKS) {
+                       ret = -EOPNOTSUPP;
+                       goto errout;
+               }
+       }
+
+       ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+       memset(ei->i_data, 0, sizeof(ei->i_data));
+       for (i = 0; i < len; i++)
+               ei->i_data[i] = cpu_to_le32(blk++);
+       ext4_mark_inode_dirty(handle, inode);
+errout:
+       ext4_journal_stop(handle);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       return ret;
+}
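
With ext4_ind_migrate() in place, the migration direction simply follows the extents flag set through the flags ioctl: setting EXT4_EXTENTS_FL triggers ext4_ext_migrate(), clearing it triggers the new indirect migration. A hedged userspace sketch using the generic flag ioctls (the flag value is assumed from ext4's on-disk definitions):

#include <fcntl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS / FS_IOC_SETFLAGS */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define EXT4_EXTENTS_FL 0x00080000	/* assumed; see fs/ext4/ext4.h */

int main(int argc, char **argv)
{
	long flags;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
		return 1;
	flags &= ~EXT4_EXTENTS_FL;	/* request extent -> indirect */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");
	close(fd);
	return 0;
}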
index f9b551561d2cc424fa3b512ba3f0d7c4fc51c41b..214461e42a05c8be0374224c81c2a549135b7014 100644 (file)
@@ -7,7 +7,7 @@
 #include "ext4.h"
 
 /* Checksumming functions */
-static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int offset = offsetof(struct mmp_struct, mmp_checksum);
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
        lock_buffer(bh);
        bh->b_end_io = end_buffer_write_sync;
        get_bh(bh);
-       submit_bh(WRITE_SYNC, bh);
+       submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
        wait_on_buffer(bh);
        sb_end_write(sb);
        if (unlikely(!buffer_uptodate(bh)))
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
                get_bh(*bh);
                lock_buffer(*bh);
                (*bh)->b_end_io = end_buffer_read_sync;
-               submit_bh(READ_SYNC, *bh);
+               submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
                wait_on_buffer(*bh);
                if (!buffer_uptodate(*bh)) {
                        brelse(*bh);
index 33e1c086858b54ad704e80345c92b946ae3c565b..3dcbf364022fe286b1eb94afa16a50790e88bf29 100644 (file)
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ *                                   of i_data_sem
  *
  * Acquire the write lock of i_data_sem for both inodes
  */
-static void
-double_down_write_data_sem(struct inode *first, struct inode *second)
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
 {
        if (first < second) {
                down_write(&EXT4_I(first)->i_data_sem);
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
 }
 
 /**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:                original inode whose lock is released first
  * @donor_inode:       donor inode whose lock is released second
  * Release write lock of i_data_sem of two inodes (orig and donor).
  */
-static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+                             struct inode *donor_inode)
 {
        up_write(&EXT4_I(orig_inode)->i_data_sem);
        up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
                mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
                                                end_ext, eh, range_to_move);
 
-       if (depth) {
-               ret = ext4_handle_dirty_metadata(handle, orig_inode,
-                                                orig_path->p_bh);
-               if (ret)
-                       return ret;
-       } else {
-               ret = ext4_mark_inode_dirty(handle, orig_inode);
-               if (ret < 0)
-                       return ret;
-       }
-
-       return 0;
+       return ext4_ext_dirty(handle, orig_inode, orig_path);
 }
 
 /**
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                donor_off += dext_alen;
                orig_off += dext_alen;
 
+               BUG_ON(replaced_count > count);
                /* Already moved the expected blocks */
                if (replaced_count >= count)
                        break;
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
                page_cache_release(page[0]);
                return -ENOMEM;
        }
-
+       /*
+        * grab_cache_page_write_begin() may not wait on the page's writeback
+        * if the BDI does not demand it. But it is reasonable to be very
+        * conservative here and explicitly wait on the page's writeback.
+        */
+       wait_on_page_writeback(page[0]);
+       wait_on_page_writeback(page[1]);
        if (inode1 > inode2) {
                struct page *tmp;
                tmp = page[0];
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
                if (buffer_uptodate(bh))
                        continue;
                if (!buffer_mapped(bh)) {
-                       int err = 0;
                        err = ext4_get_block(inode, block, bh, 0);
                        if (err) {
                                SetPageError(page);
@@ -976,7 +973,7 @@ again:
         * necessary, just swap data blocks between orig and donor.
         */
        if (uninit) {
-               double_down_write_data_sem(orig_inode, donor_inode);
+               ext4_double_down_write_data_sem(orig_inode, donor_inode);
                /* If any of extents in range became initialized we have to
                 * fallback to data copying */
                uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -990,7 +987,7 @@ again:
                        goto drop_data_sem;
 
                if (!uninit) {
-                       double_up_write_data_sem(orig_inode, donor_inode);
+                       ext4_double_up_write_data_sem(orig_inode, donor_inode);
                        goto data_copy;
                }
                if ((page_has_private(pagep[0]) &&
@@ -1004,7 +1001,7 @@ again:
                                                donor_inode, orig_blk_offset,
                                                block_len_in_page, err);
        drop_data_sem:
-               double_up_write_data_sem(orig_inode, donor_inode);
+               ext4_double_up_write_data_sem(orig_inode, donor_inode);
                goto unlock_pages;
        }
 data_copy:
@@ -1033,7 +1030,7 @@ data_copy:
        }
        /* Perform all necessary steps similar to write_begin()/write_end()
         * but keep in mind that i_size will not change */
-       *err = __block_write_begin(pagep[0], from, from + replaced_size,
+       *err = __block_write_begin(pagep[0], from, replaced_size,
                                   ext4_get_block);
        if (!*err)
                *err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -1065,11 +1062,11 @@ repair_branches:
         * Extents are swapped already, but we are not able to copy data.
         * Try to swap the extents back to their original places.
         */
-       double_down_write_data_sem(orig_inode, donor_inode);
+       ext4_double_down_write_data_sem(orig_inode, donor_inode);
        replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
                                               orig_blk_offset,
                                               block_len_in_page, &err2);
-       double_up_write_data_sem(orig_inode, donor_inode);
+       ext4_double_up_write_data_sem(orig_inode, donor_inode);
        if (replaced_count != block_len_in_page) {
                EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
                                       "Unable to copy data block,"
@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode,
 }
 
 /**
- * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
  *
  * @inode1:    the inode structure
  * @inode2:    the inode structure
  *
  * Lock two inodes' i_mutex
  */
-static void
-mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
        BUG_ON(inode1 == inode2);
        if (inode1 < inode2) {
@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 }
 
 /**
- * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
  *
  * @inode1:     the inode that is released first
  * @inode2:     the inode that is released second
  *
  */
 
-static void
-mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
        mutex_unlock(&inode1->i_mutex);
        mutex_unlock(&inode2->i_mutex);
@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                return -EINVAL;
        }
        /* Protect orig and donor inodes against a truncate */
-       mext_inode_double_lock(orig_inode, donor_inode);
+       ext4_inode_double_lock(orig_inode, donor_inode);
 
        /* Wait for all existing dio workers */
        ext4_inode_block_unlocked_dio(orig_inode);
@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
        inode_dio_wait(donor_inode);
 
        /* Protect extent tree against block allocations via delalloc */
-       double_down_write_data_sem(orig_inode, donor_inode);
+       ext4_double_down_write_data_sem(orig_inode, donor_inode);
        /* Check the filesystem environment whether move_extent can be done */
        ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
                                    donor_start, &len);
@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                 * b. racing with ->readpage, ->write_begin, and ext4_get_block
                 *    in move_extent_per_page
                 */
-               double_up_write_data_sem(orig_inode, donor_inode);
+               ext4_double_up_write_data_sem(orig_inode, donor_inode);
 
                while (orig_page_offset <= seq_end_page) {
 
@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                                block_len_in_page = rest_blocks;
                }
 
-               double_down_write_data_sem(orig_inode, donor_inode);
+               ext4_double_down_write_data_sem(orig_inode, donor_inode);
                if (ret < 0)
                        break;
 
@@ -1538,10 +1535,10 @@ out:
                ext4_ext_drop_refs(holecheck_path);
                kfree(holecheck_path);
        }
-       double_up_write_data_sem(orig_inode, donor_inode);
+       ext4_double_up_write_data_sem(orig_inode, donor_inode);
        ext4_inode_resume_unlocked_dio(orig_inode);
        ext4_inode_resume_unlocked_dio(donor_inode);
-       mext_inode_double_unlock(orig_inode, donor_inode);
+       ext4_inode_double_unlock(orig_inode, donor_inode);
 
        return ret;
 }
index 3825d6aa8336aa0bbcda8f303f299f24dc8bb67e..6653fc35ecb7dda920377a7d45ab8efef2091b2d 100644 (file)
@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
-       __u32 csum, old_csum;
+       __u32 csum;
+       __le32 save_csum;
        int size;
 
        size = count_offset + (count * sizeof(struct dx_entry));
-       old_csum = t->dt_checksum;
+       save_csum = t->dt_checksum;
        t->dt_checksum = 0;
        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
        csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
-       t->dt_checksum = old_csum;
+       t->dt_checksum = save_csum;
 
        return cpu_to_le32(csum);
 }
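
The change above is a type cleanup: the on-disk dt_checksum field is little-endian, so stashing it in a host-order __u32 was misleading (and sparse-unclean), even though the raw bits were restored unmodified. The save/zero/compute/restore idiom itself is a common pattern for in-place checksums; here is a hedged standalone sketch, with a toy digest standing in for the filesystem's real checksum routine:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct tail {
            uint32_t magic;
            uint32_t checksum;      /* stored little-endian on disk */
    };

    /* Toy digest, a stand-in for the real checksum function. */
    static uint32_t digest(uint32_t seed, const void *buf, size_t len)
    {
            const unsigned char *p = buf;

            while (len--)
                    seed = seed * 31 + *p++;
            return seed;
    }

    static uint32_t tail_csum(struct tail *t)
    {
            uint32_t save = t->checksum;    /* raw (LE) bits, kept as-is */
            uint32_t csum;

            t->checksum = 0;        /* field must be zero while hashed */
            csum = digest(~0u, t, sizeof(*t));
            t->checksum = save;     /* restore before returning */
            return csum;
    }

    int main(void)
    {
            struct tail t = { 0xdeadbeef, 0x12345678 };
            uint32_t c = tail_csum(&t);

            printf("csum 0x%08x, field restored to 0x%08x\n", c, t.checksum);
            return 0;
    }
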
@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                        hinfo.hash_version +=
                                EXT4_SB(dir->i_sb)->s_hash_unsigned;
                hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+               if (ext4_has_inline_data(dir)) {
+                       int has_inline_data = 1;
+                       count = htree_inlinedir_to_tree(dir_file, dir, 0,
+                                                       &hinfo, start_hash,
+                                                       start_minor_hash,
+                                                       &has_inline_data);
+                       if (has_inline_data) {
+                               *next_hash = ~0;
+                               return count;
+                       }
+               }
                count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
                                               start_hash, start_minor_hash);
                *next_hash = ~0;
@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
        return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
 }
 
-#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
-       [S_IFREG >> S_SHIFT]    = EXT4_FT_REG_FILE,
-       [S_IFDIR >> S_SHIFT]    = EXT4_FT_DIR,
-       [S_IFCHR >> S_SHIFT]    = EXT4_FT_CHRDEV,
-       [S_IFBLK >> S_SHIFT]    = EXT4_FT_BLKDEV,
-       [S_IFIFO >> S_SHIFT]    = EXT4_FT_FIFO,
-       [S_IFSOCK >> S_SHIFT]   = EXT4_FT_SOCK,
-       [S_IFLNK >> S_SHIFT]    = EXT4_FT_SYMLINK,
-};
-
-static inline void ext4_set_de_type(struct super_block *sb,
-                               struct ext4_dir_entry_2 *de,
-                               umode_t mode) {
-       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
-               de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
 /*
  * Move count entries from end of map between two memory locations.
  * Returns pointer to last entry moved.
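
The table and helper deleted above are not lost: the diff only shows the removal here, and they presumably move to a shared header so the inline-data directory code can reuse them. The trick they encode is worth spelling out: the S_IFMT bits occupy bits 12-15 of the mode, so shifting right by 12 yields a dense table index. A small standalone sketch (type values follow the classic ext2 dirent codes):

    #include <stdio.h>
    #include <sys/stat.h>

    #define S_SHIFT 12

    static const unsigned char type_by_mode[S_IFMT >> S_SHIFT] = {
            [S_IFREG >> S_SHIFT]    = 1,    /* regular file */
            [S_IFDIR >> S_SHIFT]    = 2,    /* directory */
            [S_IFCHR >> S_SHIFT]    = 3,    /* character device */
            [S_IFBLK >> S_SHIFT]    = 4,    /* block device */
            [S_IFIFO >> S_SHIFT]    = 5,    /* fifo */
            [S_IFSOCK >> S_SHIFT]   = 6,    /* socket */
            [S_IFLNK >> S_SHIFT]    = 7,    /* symlink */
    };

    int main(void)
    {
            mode_t mode = S_IFDIR | 0755;

            printf("dirent type %d\n",
                   type_by_mode[(mode & S_IFMT) >> S_SHIFT]);
            return 0;
    }
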
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
        dquot_initialize(dir);
 
        credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-                  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+                  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
        inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
                                            NULL, EXT4_HT_DIR, credits);
@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
        dquot_initialize(dir);
 
        credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-                  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+                  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
        inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
                                            NULL, EXT4_HT_DIR, credits);
@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        dquot_initialize(dir);
 
        credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-                  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+                  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
        inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
                                            &dentry->d_name,
@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir,
                 * quota blocks, sb is already counted in previous macros).
                 */
                credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-                         EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                         EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+                         EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
        }
 retry:
        inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
index 047a6de04a0ac8195d5453de514a8e95526fcce4..5929cd0baa2077ebce9a3b0c51d528de9b89ac33 100644 (file)
 #include "xattr.h"
 #include "acl.h"
 
-static struct kmem_cache *io_page_cachep, *io_end_cachep;
+static struct kmem_cache *io_end_cachep;
 
 int __init ext4_init_pageio(void)
 {
-       io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
-       if (io_page_cachep == NULL)
-               return -ENOMEM;
        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
-       if (io_end_cachep == NULL) {
-               kmem_cache_destroy(io_page_cachep);
+       if (io_end_cachep == NULL)
                return -ENOMEM;
-       }
        return 0;
 }
 
 void ext4_exit_pageio(void)
 {
        kmem_cache_destroy(io_end_cachep);
-       kmem_cache_destroy(io_page_cachep);
 }
 
 /*
@@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode)
                cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
 }
 
-static void put_io_page(struct ext4_io_page *io_page)
+static void ext4_release_io_end(ext4_io_end_t *io_end)
 {
-       if (atomic_dec_and_test(&io_page->p_count)) {
-               end_page_writeback(io_page->p_page);
-               put_page(io_page->p_page);
-               kmem_cache_free(io_page_cachep, io_page);
-       }
+       BUG_ON(!list_empty(&io_end->list));
+       BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+
+       if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+               wake_up_all(ext4_ioend_wq(io_end->inode));
+       if (io_end->flag & EXT4_IO_END_DIRECT)
+               inode_dio_done(io_end->inode);
+       if (io_end->iocb)
+               aio_complete(io_end->iocb, io_end->result, 0);
+       kmem_cache_free(io_end_cachep, io_end);
 }
 
-void ext4_free_io_end(ext4_io_end_t *io)
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 {
-       int i;
-
-       BUG_ON(!io);
-       BUG_ON(!list_empty(&io->list));
-       BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+       struct inode *inode = io_end->inode;
 
-       for (i = 0; i < io->num_io_pages; i++)
-               put_io_page(io->pages[i]);
-       io->num_io_pages = 0;
-       if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
-               wake_up_all(ext4_ioend_wq(io->inode));
-       kmem_cache_free(io_end_cachep, io);
+       io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+       /* Wake up anyone waiting on unwritten extent conversion */
+       if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+               wake_up_all(ext4_ioend_wq(inode));
 }
 
 /* check a range of space and convert unwritten extents to written. */
@@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io)
                         "(inode %lu, offset %llu, size %zd, error %d)",
                         inode->i_ino, offset, size, ret);
        }
-       /* Wake up anyone waiting on unwritten extent conversion */
-       if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
-               wake_up_all(ext4_ioend_wq(inode));
-       if (io->flag & EXT4_IO_END_DIRECT)
-               inode_dio_done(inode);
-       if (io->iocb)
-               aio_complete(io->iocb, io->result, 0);
+       ext4_clear_io_unwritten_flag(io);
+       ext4_release_io_end(io);
        return ret;
 }
 
@@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode)
 }
 
 /* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
        struct ext4_inode_info *ei = EXT4_I(io_end->inode);
        struct workqueue_struct *wq;
@@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
                err = ext4_end_io(io);
                if (unlikely(!ret && err))
                        ret = err;
-               io->flag &= ~EXT4_IO_END_UNWRITTEN;
-               ext4_free_io_end(io);
        }
        return ret;
 }
@@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
                atomic_inc(&EXT4_I(inode)->i_ioend_count);
                io->inode = inode;
                INIT_LIST_HEAD(&io->list);
+               atomic_set(&io->count, 1);
        }
        return io;
 }
 
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
+{
+       if (atomic_dec_and_test(&io_end->count)) {
+               if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+                       ext4_release_io_end(io_end);
+                       return;
+               }
+               ext4_add_complete_io(io_end);
+       }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+       int err = 0;
+
+       if (atomic_dec_and_test(&io_end->count)) {
+               if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+                       err = ext4_convert_unwritten_extents(io_end->inode,
+                                               io_end->offset, io_end->size);
+                       ext4_clear_io_unwritten_flag(io_end);
+               }
+               ext4_release_io_end(io_end);
+       }
+       return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+       atomic_inc(&io_end->count);
+       return io_end;
+}
+
 /*
  * Print a buffer I/O error compatible with the one in fs/buffer.c.  This
  * provides compatibility with dmesg scrapers that look for a specific
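
The three functions added above convert the io_end lifetime to plain reference counting: the allocation holds one reference, ext4_get_io_end() takes one per bio, and whichever ext4_put_io_end() variant drops the count to zero performs the cleanup. A minimal sketch of the pattern with C11 atomics (illustrative, not the kernel object):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj {
            atomic_int count;       /* starts at 1 for the creator */
    };

    static struct obj *obj_new(void)
    {
            struct obj *o = calloc(1, sizeof(*o));

            if (o)
                    atomic_init(&o->count, 1);
            return o;
    }

    static struct obj *obj_get(struct obj *o)
    {
            atomic_fetch_add(&o->count, 1);
            return o;
    }

    static void obj_put(struct obj *o)
    {
            /* fetch_sub returns the old value: 1 means we were last */
            if (atomic_fetch_sub(&o->count, 1) == 1)
                    free(o);
    }

    int main(void)
    {
            struct obj *o = obj_new();

            if (!o)
                    return 1;
            obj_get(o);     /* e.g. one reference per submitted bio */
            obj_put(o);     /* bio completion */
            obj_put(o);     /* submitter's reference: frees the object */
            return 0;
    }
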
@@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error)
        ext4_io_end_t *io_end = bio->bi_private;
        struct inode *inode;
        int i;
+       int blocksize;
        sector_t bi_sector = bio->bi_sector;
 
        BUG_ON(!io_end);
+       inode = io_end->inode;
+       blocksize = 1 << inode->i_blkbits;
        bio->bi_private = NULL;
        bio->bi_end_io = NULL;
        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = 0;
-       bio_put(bio);
-
-       for (i = 0; i < io_end->num_io_pages; i++) {
-               struct page *page = io_end->pages[i]->p_page;
+       for (i = 0; i < bio->bi_vcnt; i++) {
+               struct bio_vec *bvec = &bio->bi_io_vec[i];
+               struct page *page = bvec->bv_page;
                struct buffer_head *bh, *head;
-               loff_t offset;
-               loff_t io_end_offset;
+               unsigned bio_start = bvec->bv_offset;
+               unsigned bio_end = bio_start + bvec->bv_len;
+               unsigned under_io = 0;
+               unsigned long flags;
+
+               if (!page)
+                       continue;
 
                if (error) {
                        SetPageError(page);
                        set_bit(AS_EIO, &page->mapping->flags);
-                       head = page_buffers(page);
-                       BUG_ON(!head);
-
-                       io_end_offset = io_end->offset + io_end->size;
-
-                       offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
-                       bh = head;
-                       do {
-                               if ((offset >= io_end->offset) &&
-                                   (offset+bh->b_size <= io_end_offset))
-                                       buffer_io_error(bh);
-
-                               offset += bh->b_size;
-                               bh = bh->b_this_page;
-                       } while (bh != head);
                }
-
-               put_io_page(io_end->pages[i]);
+               bh = head = page_buffers(page);
+               /*
+                * We check all buffers in the page under BH_Uptodate_Lock
+                * to avoid races with other end_io handlers clearing async_write flags
+                */
+               local_irq_save(flags);
+               bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+               do {
+                       if (bh_offset(bh) < bio_start ||
+                           bh_offset(bh) + blocksize > bio_end) {
+                               if (buffer_async_write(bh))
+                                       under_io++;
+                               continue;
+                       }
+                       clear_buffer_async_write(bh);
+                       if (error)
+                               buffer_io_error(bh);
+               } while ((bh = bh->b_this_page) != head);
+               bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+               local_irq_restore(flags);
+               if (!under_io)
+                       end_page_writeback(page);
        }
-       io_end->num_io_pages = 0;
-       inode = io_end->inode;
+       bio_put(bio);
 
        if (error) {
                io_end->flag |= EXT4_IO_END_ERROR;
@@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
                             bi_sector >> (inode->i_blkbits - 9));
        }
 
-       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-               ext4_free_io_end(io_end);
-               return;
-       }
-
-       ext4_add_complete_io(io_end);
+       ext4_put_io_end_defer(io_end);
 }
 
 void ext4_io_submit(struct ext4_io_submit *io)
@@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io)
                bio_put(io->io_bio);
        }
        io->io_bio = NULL;
-       io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+                        struct writeback_control *wbc)
+{
+       io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
+       io->io_bio = NULL;
        io->io_end = NULL;
 }
 
-static int io_submit_init(struct ext4_io_submit *io,
-                         struct inode *inode,
-                         struct writeback_control *wbc,
-                         struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+                             struct buffer_head *bh)
 {
-       ext4_io_end_t *io_end;
-       struct page *page = bh->b_page;
        int nvecs = bio_get_nr_vecs(bh->b_bdev);
        struct bio *bio;
 
-       io_end = ext4_init_io_end(inode, GFP_NOFS);
-       if (!io_end)
-               return -ENOMEM;
        bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;
-       bio->bi_private = io->io_end = io_end;
        bio->bi_end_io = ext4_end_bio;
-
-       io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+       bio->bi_private = ext4_get_io_end(io->io_end);
+       if (!io->io_end->size)
+               io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
+                                    + bh_offset(bh);
        io->io_bio = bio;
-       io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
        io->io_next_block = bh->b_blocknr;
        return 0;
 }
 
 static int io_submit_add_bh(struct ext4_io_submit *io,
-                           struct ext4_io_page *io_page,
                            struct inode *inode,
-                           struct writeback_control *wbc,
                            struct buffer_head *bh)
 {
        ext4_io_end_t *io_end;
        int ret;
 
-       if (buffer_new(bh)) {
-               clear_buffer_new(bh);
-               unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
-       }
-
        if (io->io_bio && bh->b_blocknr != io->io_next_block) {
 submit_and_retry:
                ext4_io_submit(io);
        }
        if (io->io_bio == NULL) {
-               ret = io_submit_init(io, inode, wbc, bh);
+               ret = io_submit_init_bio(io, bh);
                if (ret)
                        return ret;
        }
-       io_end = io->io_end;
-       if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
-           (io_end->pages[io_end->num_io_pages-1] != io_page))
-               goto submit_and_retry;
-       if (buffer_uninit(bh))
-               ext4_set_io_unwritten_flag(inode, io_end);
-       io->io_end->size += bh->b_size;
-       io->io_next_block++;
        ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
        if (ret != bh->b_size)
                goto submit_and_retry;
-       if ((io_end->num_io_pages == 0) ||
-           (io_end->pages[io_end->num_io_pages-1] != io_page)) {
-               io_end->pages[io_end->num_io_pages++] = io_page;
-               atomic_inc(&io_page->p_count);
-       }
+       io_end = io->io_end;
+       if (test_clear_buffer_uninit(bh))
+               ext4_set_io_unwritten_flag(inode, io_end);
+       io_end->size += bh->b_size;
+       io->io_next_block++;
        return 0;
 }
 
@@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
                        struct writeback_control *wbc)
 {
        struct inode *inode = page->mapping->host;
-       unsigned block_start, block_end, blocksize;
-       struct ext4_io_page *io_page;
+       unsigned block_start, blocksize;
        struct buffer_head *bh, *head;
        int ret = 0;
+       int nr_submitted = 0;
 
        blocksize = 1 << inode->i_blkbits;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(PageWriteback(page));
 
-       io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
-       if (!io_page) {
-               redirty_page_for_writepage(wbc, page);
-               unlock_page(page);
-               return -ENOMEM;
-       }
-       io_page->p_page = page;
-       atomic_set(&io_page->p_count, 1);
-       get_page(page);
        set_page_writeback(page);
        ClearPageError(page);
 
-       for (bh = head = page_buffers(page), block_start = 0;
-            bh != head || !block_start;
-            block_start = block_end, bh = bh->b_this_page) {
-
-               block_end = block_start + blocksize;
+       /*
+        * In the first loop we prepare and mark buffers to submit. We have to
+        * mark all buffers in the page before submitting so that
+        * end_page_writeback() cannot be called from ext4_end_bio() when IO
+        * on the first buffer finishes and we are still working on submitting
+        * the second buffer.
+        */
+       bh = head = page_buffers(page);
+       do {
+               block_start = bh_offset(bh);
                if (block_start >= len) {
                        /*
                         * Comments copied from block_write_full_page_endio:
@@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
                         * mapped, and writes to that region are not written
                         * out to the file."
                         */
-                       zero_user_segment(page, block_start, block_end);
+                       zero_user_segment(page, block_start,
+                                         block_start + blocksize);
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                        continue;
@@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
                                ext4_io_submit(io);
                        continue;
                }
-               ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+               if (buffer_new(bh)) {
+                       clear_buffer_new(bh);
+                       unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+               }
+               set_buffer_async_write(bh);
+       } while ((bh = bh->b_this_page) != head);
+
+       /* Now submit buffers to write */
+       bh = head = page_buffers(page);
+       do {
+               if (!buffer_async_write(bh))
+                       continue;
+               ret = io_submit_add_bh(io, inode, bh);
                if (ret) {
                        /*
                         * We only get here on ENOMEM.  Not much else
@@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
                        redirty_page_for_writepage(wbc, page);
                        break;
                }
+               nr_submitted++;
                clear_buffer_dirty(bh);
+       } while ((bh = bh->b_this_page) != head);
+
+       /* Error stopped previous loop? Clean up buffers... */
+       if (ret) {
+               do {
+                       clear_buffer_async_write(bh);
+                       bh = bh->b_this_page;
+               } while (bh != head);
        }
        unlock_page(page);
-       /*
-        * If the page was truncated before we could do the writeback,
-        * or we had a memory allocation error while trying to write
-        * the first buffer head, we won't have submitted any pages for
-        * I/O.  In that case we need to make sure we've cleared the
-        * PageWriteback bit from the page to prevent the system from
-        * wedging later on.
-        */
-       put_io_page(io_page);
+       /* Nothing submitted - we have to end page writeback */
+       if (!nr_submitted)
+               end_page_writeback(page);
        return ret;
 }
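
The restructured ext4_bio_write_page() above makes the ordering constraint explicit: every buffer that will be written is flagged async_write before the first bio is submitted, because the completion handler ends page writeback as soon as it finds no flagged buffers left. A toy sketch of the same two-pass shape (illustrative; completion is only described in comments):

    #include <stdbool.h>
    #include <stdio.h>

    struct unit {
            bool in_flight;
    };

    /* Pass 1 marks every unit before pass 2 starts any work, so a fast
     * completion of unit 0 cannot conclude "all done" while unit 1 has
     * not even been submitted yet. */
    static void submit_page(struct unit *u, int n)
    {
            int i;

            for (i = 0; i < n; i++)
                    u[i].in_flight = true;          /* pass 1: mark */
            for (i = 0; i < n; i++)
                    printf("submitting unit %d\n", i);      /* pass 2 */
            /* completion elsewhere clears u[i].in_flight and ends the
             * page only when no unit is still in flight */
    }

    int main(void)
    {
            struct unit units[4] = { { false } };

            submit_page(units, 4);
            return 0;
    }
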
index c169477a62c987a1ba2d3362dde6bd8d3cd0f260..b27c96d01965b97a1998b8542d31d9639a0b0161 100644 (file)
@@ -272,7 +272,7 @@ next_group:
                if (start_blk >= last_blk)
                        goto next_group;
                group_data[bb_index].block_bitmap = start_blk++;
-               ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+               group = ext4_get_group_number(sb, start_blk - 1);
                group -= group_data[0].group;
                group_data[group].free_blocks_count--;
                if (flexbg_size > 1)
@@ -284,7 +284,7 @@ next_group:
                if (start_blk >= last_blk)
                        goto next_group;
                group_data[ib_index].inode_bitmap = start_blk++;
-               ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+               group = ext4_get_group_number(sb, start_blk - 1);
                group -= group_data[0].group;
                group_data[group].free_blocks_count--;
                if (flexbg_size > 1)
@@ -296,7 +296,7 @@ next_group:
                if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
                        goto next_group;
                group_data[it_index].inode_table = start_blk;
-               ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+               group = ext4_get_group_number(sb, start_blk - 1);
                group -= group_data[0].group;
                group_data[group].free_blocks_count -=
                                        EXT4_SB(sb)->s_itb_per_group;
@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
                ext4_group_t group;
                int err;
 
-               ext4_get_group_no_and_offset(sb, block, &group, NULL);
+               group = ext4_get_group_number(sb, block);
                start = ext4_group_first_block_no(sb, group);
                group -= flex_gd->groups[0].group;
 
@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb,
 
        /* Update the global fs size fields */
        sbi->s_groups_count += flex_gd->count;
+       sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+                       (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 
        /* Update the reserved block counts only once the new group is
         * active. */
@@ -1879,7 +1881,11 @@ retry:
                /* Nothing to do */
                return 0;
 
-       ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+       n_group = ext4_get_group_number(sb, n_blocks_count - 1);
+       if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+               ext4_warning(sb, "resize would cause inodes_count overflow");
+               return -EINVAL;
+       }
        ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
 
        n_desc_blocks = num_desc_blocks(sb, n_group + 1);
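
Two things happen in the resize path above. First, the new helper ext4_get_group_number() replaces ext4_get_group_no_and_offset() at call sites that never used the offset; presumably it reduces to a single division. Second, the new guard rejects resizes whose resulting inode count would no longer fit in the 32-bit on-disk s_inodes_count. A hedged sketch of both calculations with illustrative parameters:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed shape of the helper: group = (block - first) / per_group. */
    static uint32_t get_group_number(uint64_t block, uint64_t first_data_block,
                                     uint32_t blocks_per_group)
    {
            return (block - first_data_block) / blocks_per_group;
    }

    int main(void)
    {
            uint32_t blocks_per_group = 32768;      /* 4 KiB block size */
            uint32_t inodes_per_group = 8192;
            uint64_t n_blocks_count = 1ULL << 35;   /* huge resize target */
            uint32_t n_group;

            n_group = get_group_number(n_blocks_count - 1, 0,
                                       blocks_per_group);
            if (n_group > 0xFFFFFFFFUL / inodes_per_group)
                    printf("rejected: inodes_count would overflow 32 bits\n");
            else
                    printf("last group %u, inodes_count %llu\n", n_group,
                           (unsigned long long)(n_group + 1) *
                           inodes_per_group);
            return 0;
    }
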
index 5d6d53578124dda01132a6545100a5acb2025f73..dbc7c090c13a782a9976fcbe9a8500e6158f6412 100644 (file)
@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
        struct super_block              *sb = journal->j_private;
        struct ext4_sb_info             *sbi = EXT4_SB(sb);
        int                             error = is_journal_aborted(journal);
-       struct ext4_journal_cb_entry    *jce, *tmp;
+       struct ext4_journal_cb_entry    *jce;
 
+       BUG_ON(txn->t_state == T_FINISHED);
        spin_lock(&sbi->s_md_lock);
-       list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+       while (!list_empty(&txn->t_private_list)) {
+               jce = list_entry(txn->t_private_list.next,
+                                struct ext4_journal_cb_entry, jce_list);
                list_del_init(&jce->jce_list);
                spin_unlock(&sbi->s_md_lock);
                jce->jce_func(sb, jce, error);
@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
        if ((sbi->s_es->s_feature_ro_compat &
             cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
                /* Use new metadata_csum algorithm */
-               __u16 old_csum;
+               __le16 save_csum;
                __u32 csum32;
 
-               old_csum = gdp->bg_checksum;
+               save_csum = gdp->bg_checksum;
                gdp->bg_checksum = 0;
                csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
                                     sizeof(le_group));
                csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
                                     sbi->s_desc_size);
-               gdp->bg_checksum = old_csum;
+               gdp->bg_checksum = save_csum;
 
                crc = csum32 & 0xFFFF;
                goto out;
@@ -2379,17 +2383,15 @@ struct ext4_attr {
        int offset;
 };
 
-static int parse_strtoul(const char *buf,
-               unsigned long max, unsigned long *value)
+static int parse_strtoull(const char *buf,
+               unsigned long long max, unsigned long long *value)
 {
-       char *endp;
-
-       *value = simple_strtoul(skip_spaces(buf), &endp, 0);
-       endp = skip_spaces(endp);
-       if (*endp || *value > max)
-               return -EINVAL;
+       int ret;
 
-       return 0;
+       ret = kstrtoull(skip_spaces(buf), 0, value);
+       if (!ret && *value > max)
+               ret = -EINVAL;
+       return ret;
 }
 
 static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
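
The conversion above retires the hand-rolled simple_strtoul() wrapper: kstrtoul()/kstrtoull() reject trailing garbage and report overflow through their return value, which the old helper only approximated. A userspace analogue of the new parse_strtoull() using strtoull(), assuming the sysfs convention that writes may carry one trailing newline:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int parse_ull(const char *buf, unsigned long long max,
                         unsigned long long *value)
    {
            char *end;

            errno = 0;
            *value = strtoull(buf, &end, 0);
            if (errno || end == buf || (*end && strcmp(end, "\n")))
                    return -EINVAL;         /* garbage or overflow */
            if (*value > max)
                    return -EINVAL;         /* out of range */
            return 0;
    }

    int main(void)
    {
            unsigned long long v;

            printf("%d\n", parse_ull("4096\n", 1 << 20, &v));   /* 0 */
            printf("%d\n", parse_ull("4096kb", 1 << 20, &v));   /* -EINVAL */
            return 0;
    }
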
@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
                                          const char *buf, size_t count)
 {
        unsigned long t;
+       int ret;
 
-       if (parse_strtoul(buf, 0x40000000, &t))
-               return -EINVAL;
+       ret = kstrtoul(skip_spaces(buf), 0, &t);
+       if (ret)
+               return ret;
 
-       if (t && !is_power_of_2(t))
+       if (t && (!is_power_of_2(t) || t > 0x40000000))
                return -EINVAL;
 
        sbi->s_inode_readahead_blks = t;
@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
 {
        unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
        unsigned long t;
+       int ret;
 
-       if (parse_strtoul(buf, 0xffffffff, &t))
-               return -EINVAL;
+       ret = kstrtoul(skip_spaces(buf), 0, &t);
+       if (ret)
+               return ret;
        *ui = t;
        return count;
 }
 
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+                                 struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+               (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+                                  struct ext4_sb_info *sbi,
+                                  const char *buf, size_t count)
+{
+       unsigned long long val;
+       int ret;
+
+       if (parse_strtoull(buf, -1ULL, &val))
+               return -EINVAL;
+       ret = ext4_reserve_clusters(sbi, val);
+
+       return ret ? ret : count;
+}
+
 static ssize_t trigger_test_error(struct ext4_attr *a,
                                  struct ext4_sb_info *sbi,
                                  const char *buf, size_t count)
@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
                 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = {
        ATTR_LIST(delayed_allocation_blocks),
        ATTR_LIST(session_write_kbytes),
        ATTR_LIST(lifetime_write_kbytes),
+       ATTR_LIST(reserved_clusters),
        ATTR_LIST(inode_readahead_blks),
        ATTR_LIST(inode_goal),
        ATTR_LIST(mb_stats),
@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb)
        return 0;
 }
 
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+{
+       ext4_fsblk_t resv_clusters;
+
+       /*
+        * By default we reserve 2% or 4096 clusters, whichever is smaller.
+        * This should cover the situations where we cannot afford to run
+        * out of space, such as punching holes or converting
+        * uninitialized extents in the delalloc path. In most cases such
+        * an allocation requires only 1 or 2 blocks; higher numbers are
+        * very rare.
+        */
+       resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+
+       do_div(resv_clusters, 50);
+       resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+       return resv_clusters;
+}
+
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+       ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+                               sbi->s_cluster_bits;
+
+       if (count >= clusters)
+               return -EINVAL;
+
+       atomic64_set(&sbi->s_resv_clusters, count);
+       return 0;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
        char *orig_data = kstrdup(data, GFP_KERNEL);
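
The default computed above is min(2%, 4096 clusters): do_div(resv_clusters, 50) is the 2%, and the min_t() applies the cap. In practice the cap wins on all but small filesystems; with 4 KiB clusters the 2% figure only drops below 4096 for filesystems under roughly 800 MiB. A small standalone check of the arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t default_resv_clusters(uint64_t clusters)
    {
            uint64_t r = clusters / 50;             /* 2% */

            return r < 4096 ? r : 4096;             /* capped at 4096 */
    }

    int main(void)
    {
            /* 1 TiB of 4 KiB clusters: 2% is ~5.3M, so the cap wins */
            printf("%llu\n",
                   (unsigned long long)default_resv_clusters(1ULL << 28));
            /* 400 MiB of 4 KiB clusters: 2% (2048) is below the cap */
            printf("%llu\n",
                   (unsigned long long)default_resv_clusters(102400));
            return 0;
    }
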
@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
        sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
 
+       /* Do we have standard group size of blocksize * 8 blocks ? */
+       if (sbi->s_blocks_per_group == blocksize << 3)
+               set_opt2(sb, STD_GROUP_SIZE);
+
        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_err_report.function = print_daily_error_info;
        sbi->s_err_report.data = (unsigned long) sb;
 
+       /* Register extent status tree shrinker */
+       ext4_es_register_shrinker(sb);
+
        err = percpu_counter_init(&sbi->s_freeclusters_counter,
                        ext4_count_free_clusters(sb));
        if (!err) {
@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_max_writeback_mb_bump = 128;
        sbi->s_extent_max_zeroout_kb = 32;
 
-       /* Register extent status tree shrinker */
-       ext4_es_register_shrinker(sb);
-
        /*
         * set up enough so that it can read an inode
         */
@@ -3911,6 +3978,13 @@ no_journal:
                         "available");
        }
 
+       err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+       if (err) {
+               ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+                        "reserved pool", ext4_calculate_resv_clusters(sbi));
+               goto failed_mount4a;
+       }
+
        err = ext4_setup_system_zone(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -4010,6 +4084,7 @@ failed_mount_wq:
                sbi->s_journal = NULL;
        }
 failed_mount3:
+       ext4_es_unregister_shrinker(sb);
        del_timer(&sbi->s_err_report);
        if (sbi->s_flex_groups)
                ext4_kvfree(sbi->s_flex_groups);
@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
                goto out_bdev;
        }
        journal->j_private = sb;
-       ll_rw_block(READ, 1, &journal->j_sb_buffer);
+       ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
        wait_on_buffer(journal->j_sb_buffer);
        if (!buffer_uptodate(journal->j_sb_buffer)) {
                ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct super_block *sb = dentry->d_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
-       ext4_fsblk_t overhead = 0;
+       ext4_fsblk_t overhead = 0, resv_blocks;
        u64 fsid;
        s64 bfree;
+       resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
 
        if (!test_opt(sb, MINIX_DF))
                overhead = sbi->s_overhead;
@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
        /* prevent underflow in case little free space is available */
        buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
-       buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
-       if (buf->f_bfree < ext4_r_blocks_count(es))
+       buf->f_bavail = buf->f_bfree -
+                       (ext4_r_blocks_count(es) + resv_blocks);
+       if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
                buf->f_bavail = 0;
        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                return PTR_ERR(qf_inode);
        }
 
+       /* Don't account quota for quota files to avoid recursion */
+       qf_inode->i_flags |= S_NOQUOTA;
        err = dquot_enable(qf_inode, type, format_id, flags);
        iput(qf_inode);
 
index 3a120b277240e70b44fede9fa0b9ffa98b85118b..c081e34f717f6903492acd3c4bc92d26dc888e7e 100644 (file)
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
                                    struct ext4_xattr_header *hdr)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       __u32 csum, old;
+       __u32 csum;
+       __le32 save_csum;
+       __le64 dsk_block_nr = cpu_to_le64(block_nr);
 
-       old = hdr->h_checksum;
+       save_csum = hdr->h_checksum;
        hdr->h_checksum = 0;
-       block_nr = cpu_to_le64(block_nr);
-       csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
-                          sizeof(block_nr));
+       csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+                          sizeof(dsk_block_nr));
        csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
                           EXT4_BLOCK_SIZE(inode->i_sb));
 
-       hdr->h_checksum = old;
+       hdr->h_checksum = save_csum;
        return cpu_to_le32(csum);
 }
 
index aa25deb5c6cde14014cbbc5d8259f6de28980d36..c767dbdd7fc495e5a4fd4a5c581455db73c5c1c3 100644 (file)
@@ -22,6 +22,7 @@
 #define        EXT4_XATTR_INDEX_LUSTRE                 5
 #define EXT4_XATTR_INDEX_SECURITY              6
 #define EXT4_XATTR_INDEX_SYSTEM                        7
+#define EXT4_XATTR_INDEX_RICHACL               8
 
 struct ext4_xattr_header {
        __le32  h_magic;        /* magic number for identification */
index 750c70148effad6151023e70e08a0e17f185d4e8..0f53946f13c15d5ab53a5f46c595d2ae9eb3f652 100644 (file)
@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
-       int i, to_free = 0;
+       int i;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
@@ -1134,7 +1134,7 @@ restart_loop:
        journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);
 
-       commit_transaction->t_state = T_FINISHED;
+       commit_transaction->t_state = T_COMMIT_CALLBACK;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
@@ -1149,38 +1149,44 @@ restart_loop:
                                journal->j_average_commit_time*3) / 4;
        else
                journal->j_average_commit_time = commit_time;
+
        write_unlock(&journal->j_state_lock);
 
-       if (commit_transaction->t_checkpoint_list == NULL &&
-           commit_transaction->t_checkpoint_io_list == NULL) {
-               __jbd2_journal_drop_transaction(journal, commit_transaction);
-               to_free = 1;
+       if (journal->j_checkpoint_transactions == NULL) {
+               journal->j_checkpoint_transactions = commit_transaction;
+               commit_transaction->t_cpnext = commit_transaction;
+               commit_transaction->t_cpprev = commit_transaction;
        } else {
-               if (journal->j_checkpoint_transactions == NULL) {
-                       journal->j_checkpoint_transactions = commit_transaction;
-                       commit_transaction->t_cpnext = commit_transaction;
-                       commit_transaction->t_cpprev = commit_transaction;
-               } else {
-                       commit_transaction->t_cpnext =
-                               journal->j_checkpoint_transactions;
-                       commit_transaction->t_cpprev =
-                               commit_transaction->t_cpnext->t_cpprev;
-                       commit_transaction->t_cpnext->t_cpprev =
-                               commit_transaction;
-                       commit_transaction->t_cpprev->t_cpnext =
+               commit_transaction->t_cpnext =
+                       journal->j_checkpoint_transactions;
+               commit_transaction->t_cpprev =
+                       commit_transaction->t_cpnext->t_cpprev;
+               commit_transaction->t_cpnext->t_cpprev =
+                       commit_transaction;
+               commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
-               }
        }
        spin_unlock(&journal->j_list_lock);
-
+       /* Drop all spin_locks because the commit callback may block.
+        * __journal_remove_checkpoint() cannot destroy the transaction
+        * under us because it is not marked as T_FINISHED yet. */
        if (journal->j_commit_callback)
                journal->j_commit_callback(journal, commit_transaction);
 
        trace_jbd2_end_commit(journal, commit_transaction);
        jbd_debug(1, "JBD2: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);
-       if (to_free)
-               jbd2_journal_free_transaction(commit_transaction);
 
+       write_lock(&journal->j_state_lock);
+       spin_lock(&journal->j_list_lock);
+       commit_transaction->t_state = T_FINISHED;
+       /* Recheck checkpoint lists after j_list_lock was dropped */
+       if (commit_transaction->t_checkpoint_list == NULL &&
+           commit_transaction->t_checkpoint_io_list == NULL) {
+               __jbd2_journal_drop_transaction(journal, commit_transaction);
+               jbd2_journal_free_transaction(commit_transaction);
+       }
+       spin_unlock(&journal->j_list_lock);
+       write_unlock(&journal->j_state_lock);
        wake_up(&journal->j_wait_done_commit);
 }
index 8b220f1ab54f337a0c0abef513e5280c7d2966ce..f6c5ba027f4f613f655f703de222449a45c6c9c3 100644 (file)
@@ -707,6 +707,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
        return err;
 }
 
+/*
+ * When this function returns, the transaction corresponding to tid
+ * will be completed.  If the transaction is currently running, start
+ * committing that transaction before waiting for it to complete.  If
+ * the transaction id is stale, it is by definition already completed,
+ * so just return SUCCESS.
+ */
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+       int     need_to_wait = 1;
+
+       read_lock(&journal->j_state_lock);
+       if (journal->j_running_transaction &&
+           journal->j_running_transaction->t_tid == tid) {
+               if (journal->j_commit_request != tid) {
+                       /* transaction not yet started, so request it */
+                       read_unlock(&journal->j_state_lock);
+                       jbd2_log_start_commit(journal, tid);
+                       goto wait_commit;
+               }
+       } else if (!(journal->j_committing_transaction &&
+                    journal->j_committing_transaction->t_tid == tid))
+               need_to_wait = 0;
+       read_unlock(&journal->j_state_lock);
+       if (!need_to_wait)
+               return 0;
+wait_commit:
+       return jbd2_log_wait_commit(journal, tid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
 /*
  * Log buffer allocation routines:
  */
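
The new export bundles the two-step "kick the commit, then wait for it" dance into one call that is also safe for stale tids. A userspace model of the decision logic added above (struct fields and strings are illustrative; the kernel waits on real journal state instead of returning a label):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int tid_t;

    struct journal {
            bool has_running;
            tid_t running_tid;
            bool has_committing;
            tid_t committing_tid;
            tid_t commit_request;   /* highest tid asked to commit */
    };

    static const char *complete_transaction(struct journal *j, tid_t tid)
    {
            if (j->has_running && j->running_tid == tid) {
                    if (j->commit_request != tid)
                            return "start commit, then wait";
                    return "wait";          /* commit already requested */
            }
            if (j->has_committing && j->committing_tid == tid)
                    return "wait";
            return "already completed";     /* stale tid */
    }

    int main(void)
    {
            struct journal j = { true, 42, false, 0, 41 };

            printf("%s\n", complete_transaction(&j, 42));
            printf("%s\n", complete_transaction(&j, 40));
            return 0;
    }
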
index 325bc019ed8813ea00321594405e86c739dad5fb..10f524c59ea88d48bf4f85f42e6fb2eca0d7a55b 100644 (file)
@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
        handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
        if (!handle)
                return NULL;
-       memset(handle, 0, sizeof(*handle));
        handle->h_buffer_credits = nblocks;
        handle->h_ref = 1;
 
@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
        int error;
        char *frozen_buffer = NULL;
        int need_copy = 0;
+       unsigned long start_lock, time_lock;
 
        if (is_handle_aborted(handle))
                return -EROFS;
@@ -655,9 +655,16 @@ repeat:
 
        /* @@@ Need to check for errors here at some point. */
 
+       start_lock = jiffies;
        lock_buffer(bh);
        jbd_lock_bh_state(bh);
 
+       /* If it takes too long to lock the buffer, trace it */
+       time_lock = jbd2_time_diff(start_lock, jiffies);
+       if (time_lock > HZ/10)
+               trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
+                       jiffies_to_msecs(time_lock));
+
        /* We now hold the buffer lock so it is safe to query the buffer
         * state.  Is the buffer dirty?
         *
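
The new trace above fires when merely acquiring the buffer lock took more than HZ/10 jiffies, i.e. 100 ms regardless of the HZ setting. A userspace analogue of the measurement, using a monotonic clock instead of jiffies:

    #include <stdio.h>
    #include <time.h>

    static long elapsed_ms(struct timespec a, struct timespec b)
    {
            return (b.tv_sec - a.tv_sec) * 1000 +
                   (b.tv_nsec - a.tv_nsec) / 1000000;
    }

    int main(void)
    {
            struct timespec t0, t1;

            clock_gettime(CLOCK_MONOTONIC, &t0);
            /* ... blocking work (the lock acquisition) goes here ... */
            clock_gettime(CLOCK_MONOTONIC, &t1);

            /* report only the slow cases, like the HZ/10 check above */
            if (elapsed_ms(t0, t1) > 100)
                    printf("lock stall: %ld ms\n", elapsed_ms(t0, t1));
            return 0;
    }
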
index 4c16c4a88d477cecff3434f49d4c15e4cb399302..9e52b0626b39461bf21a3b71b32b0e7b2fadaf5c 100644 (file)
@@ -34,6 +34,8 @@ enum bh_state_bits {
        BH_Write_EIO,   /* I/O error on write */
        BH_Unwritten,   /* Buffer is allocated on disk but not written */
        BH_Quiet,       /* Buffer Error Prinks to be quiet */
+       BH_Meta,        /* Buffer contains metadata */
+       BH_Prio,        /* Buffer should be submitted with REQ_PRIO */
 
        BH_PrivateStart,/* not a state bit, but the first bit available
                         * for private allocation by other entities
@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
 BUFFER_FNS(Write_EIO, write_io_error)
 BUFFER_FNS(Unwritten, unwritten)
+BUFFER_FNS(Meta, meta)
+BUFFER_FNS(Prio, prio)
 
 #define bh_offset(bh)          ((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
index 50e5a5e6a71214535b8cce3d823862da4f7b930f..6e051f472edb6db62f7e292b71c3bc5b2c5575f5 100644 (file)
@@ -480,6 +480,7 @@ struct transaction_s
                T_COMMIT,
                T_COMMIT_DFLUSH,
                T_COMMIT_JFLUSH,
+               T_COMMIT_CALLBACK,
                T_FINISHED
        }                       t_state;
 
@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache;
 
 static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
 {
-       return kmem_cache_alloc(jbd2_handle_cache, gfp_flags);
+       return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
 }
 
 static inline void jbd2_free_handle(handle_t *handle)
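
This pairs with the memset() removal in fs/jbd2/transaction.c above: kmem_cache_zalloc() hands back memory that is already zeroed, so the open-coded clear in new_handle() became redundant. The userspace analogue is malloc+memset versus calloc:

    #include <stdlib.h>
    #include <string.h>

    struct handle {
            int credits;
            int ref;
    };

    /* Before: allocate, then clear in a separate step. */
    static struct handle *alloc_handle_old(void)
    {
            struct handle *h = malloc(sizeof(*h));

            if (h)
                    memset(h, 0, sizeof(*h));
            return h;
    }

    /* After: a zeroing allocator does both, like kmem_cache_zalloc(). */
    static struct handle *alloc_handle_new(void)
    {
            return calloc(1, sizeof(struct handle));
    }

    int main(void)
    {
            free(alloc_handle_old());
            free(alloc_handle_new());
            return 0;
    }
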
@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_journal_force_commit_nested(journal_t *journal);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
+int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
 
index c18b46f8aeebbfb4a85f8b750dad108c0124eaf6..13a3da25ff0752a99bccb12b9c105cdbdae13457 100644 (file)
@@ -31,21 +31,14 @@ struct journal_head {
        /*
         * Journalling list for this buffer [jbd_lock_bh_state()]
         */
-       unsigned b_jlist;
+       unsigned b_jlist:4;
 
        /*
         * This flag signals the buffer has been modified by
         * the currently running transaction
         * [jbd_lock_bh_state()]
         */
-       unsigned b_modified;
-
-       /*
-        * This feild tracks the last transaction id in which this buffer
-        * has been cowed
-        * [jbd_lock_bh_state()]
-        */
-       tid_t b_cow_tid;
+       unsigned b_modified:1;
 
        /*
         * Copy of the buffer data frozen for writing to the log.
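
The shrink above packs b_jlist (which only needs a handful of list identifiers) and b_modified into bitfields sharing one storage unit, and drops the unused b_cow_tid field entirely. A small check of what such packing saves on a typical ABI (exact sizes are implementation-defined):

    #include <stdio.h>

    struct jh_before {
            unsigned b_jlist;
            unsigned b_modified;
    };

    struct jh_after {
            unsigned b_jlist:4;     /* 4 bits suffice for the list id */
            unsigned b_modified:1;
    };

    int main(void)
    {
            /* a typical LP64 ABI prints "before 8, after 4" */
            printf("before %zu, after %zu\n",
                   sizeof(struct jh_before), sizeof(struct jh_after));
            return 0;
    }
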
index 4ee4710038592950a9584d49646e8de8a3f9273c..d0e686402df883d89e474a63374cdd1aa4c73ba6 100644 (file)
@@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end,
                  __entry->pos, __entry->len, __entry->copied)
 );
 
-DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
-
-       TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-                unsigned int copied),
-
-       TP_ARGS(inode, pos, len, copied)
-);
-
-DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
+DEFINE_EVENT(ext4__write_end, ext4_write_end,
 
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),
@@ -1956,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks,
                __entry->to             = to;
                __entry->partial        = partial_cluster;
                __entry->ee_pblk        = ext4_ext_pblock(ex);
-               __entry->ee_lblk        = cpu_to_le32(ex->ee_block);
+               __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_len         = ext4_ext_get_actual_len(ex);
        ),
 
@@ -2060,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space,
 
 TRACE_EVENT(ext4_ext_remove_space_done,
        TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
-               ext4_lblk_t partial, unsigned short eh_entries),
+               ext4_lblk_t partial, __le16 eh_entries),
 
        TP_ARGS(inode, start, depth, partial, eh_entries),
 
@@ -2079,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done,
                __entry->start          = start;
                __entry->depth          = depth;
                __entry->partial        = partial;
-               __entry->eh_entries     = eh_entries;
+               __entry->eh_entries     = le16_to_cpu(eh_entries);
        ),
 
        TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
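
Both tracepoint fixes above are about conversion direction: ee_block and eh_entries are on-disk little-endian values, so decoding them into host-order trace fields requires le32_to_cpu()/le16_to_cpu(); the old cpu_to_le32() only appeared to work because both directions are no-ops on little-endian hosts. A tiny illustration of the decode that le32_to_cpu() must perform on a big-endian machine (and compiles away on a little-endian one):

    #include <stdint.h>
    #include <stdio.h>

    /* Assemble an on-disk little-endian 32-bit value byte by byte,
     * which yields the same host-order result on any endianness. */
    static uint32_t le32_decode(const unsigned char *p)
    {
            return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                   ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }

    int main(void)
    {
            const unsigned char disk[4] = { 0x01, 0x02, 0x03, 0x04 };

            printf("0x%08x\n", le32_decode(disk));  /* 0x04030201 everywhere */
            return 0;
    }
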
index 070df49e4a1df18aa6ddc0cb1e8b87b0b62c4130..c1d1f3eb242d2738456af4e1bf1443b057030d6c 100644 (file)
@@ -358,6 +358,27 @@ TRACE_EVENT(jbd2_write_superblock,
                  MINOR(__entry->dev), __entry->write_op)
 );
 
+TRACE_EVENT(jbd2_lock_buffer_stall,
+
+       TP_PROTO(dev_t dev, unsigned long stall_ms),
+
+       TP_ARGS(dev, stall_ms),
+
+       TP_STRUCT__entry(
+               __field(        dev_t, dev      )
+               __field(unsigned long, stall_ms )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = dev;
+               __entry->stall_ms       = stall_ms;
+       ),
+
+       TP_printk("dev %d,%d stall_ms %lu",
+               MAJOR(__entry->dev), MINOR(__entry->dev),
+               __entry->stall_ms)
+);
+
 #endif /* _TRACE_JBD2_H */
 
 /* This part must be outside protection */