ext4: New rec_len encoding for very large blocksizes
authorWei Yongjun <yjwei@cn.fujitsu.com>
Sun, 15 Feb 2009 04:01:36 +0000 (23:01 -0500)
committerTheodore Ts'o <tytso@mit.edu>
Sun, 15 Feb 2009 04:01:36 +0000 (23:01 -0500)
The rec_len field in the directory entry is 16 bits, so to encode
blocksizes larger than 64k becomes problematic.  This patch allows us
to supprot block sizes up to 256k, by using the low 2 bits to extend
the range of rec_len to 2**18-1 (since valid rec_len sizes must be a
multiple of 4).  We use the convention that a rec_len of 0 or 65535
means the filesystem block size, for compatibility with older kernels.

It's unlikely we'll see VM pages of up to 256k, but at some point we
might find that the Linux VM has been enhanced to support filesystem
block sizes > than the VM page size, at which point it might be useful
for some applications to allow very large filesystem block sizes.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/namei.c

index 2df2e40b01af0cc031d4546306079c0d4a6669ac..b64789929a65bc00243e9b10298f056aadaeb306 100644 (file)
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
                         unsigned int offset)
 {
        const char *error_msg = NULL;
-       const int rlen = ext4_rec_len_from_disk(de->rec_len);
+       const int rlen = ext4_rec_len_from_disk(de->rec_len,
+                                               dir->i_sb->s_blocksize);
 
        if (rlen < EXT4_DIR_REC_LEN(1))
                error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
                                 * least that it is non-zero.  A
                                 * failure will be detected in the
                                 * dirent test below. */
-                               if (ext4_rec_len_from_disk(de->rec_len)
-                                               < EXT4_DIR_REC_LEN(1))
+                               if (ext4_rec_len_from_disk(de->rec_len,
+                                       sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
                                        break;
-                               i += ext4_rec_len_from_disk(de->rec_len);
+                               i += ext4_rec_len_from_disk(de->rec_len,
+                                                           sb->s_blocksize);
                        }
                        offset = i;
                        filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
                                ret = stored;
                                goto out;
                        }
-                       offset += ext4_rec_len_from_disk(de->rec_len);
+                       offset += ext4_rec_len_from_disk(de->rec_len,
+                                       sb->s_blocksize);
                        if (le32_to_cpu(de->inode)) {
                                /* We might block in the next section
                                 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
                                        goto revalidate;
                                stored++;
                        }
-                       filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+                       filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+                                               sb->s_blocksize);
                }
                offset = 0;
                brelse(bh);
index 0db01421da3ecd8e5c13bb475e58eafa1ffbf606..d5193b55ca94d081b2487552ca41ea4bdb12602a 100644 (file)
@@ -855,24 +855,6 @@ struct ext4_dir_entry_2 {
                                         ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN               ((1<<16)-1)
 
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-       unsigned len = le16_to_cpu(dlen);
-
-       if (len == EXT4_MAX_REC_LEN || len == 0)
-               return 1 << 16;
-       return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-       if (len == (1 << 16))
-               return cpu_to_le16(EXT4_MAX_REC_LEN);
-       else if (len > (1 << 16))
-               BUG();
-       return cpu_to_le16(len);
-}
-
 /*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
@@ -1097,7 +1079,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
index 04824958cba511259f0889eac2649be22a88ef45..a5ba1a858094409b5d91fd36f3e6a6ad1e5dd290 100644 (file)
@@ -165,7 +165,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
                       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
-               struct dx_map_entry *offsets, int count);
+               struct dx_map_entry *offsets, int count, unsigned blocksize);
 static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block(struct dx_frame *frame,
                                        u32 hash, ext4_lblk_t block);
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
 
+unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+       unsigned len = le16_to_cpu(dlen);
+
+       if (len == EXT4_MAX_REC_LEN || len == 0)
+               return blocksize;
+       return (len & 65532) | ((len & 3) << 16);
+}
+  
+__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+       if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+               BUG();
+       if (len < 65536)
+               return cpu_to_le16(len);
+       if (len == blocksize) {
+               if (blocksize == 65536)
+                       return cpu_to_le16(EXT4_MAX_REC_LEN);
+               else 
+                       return cpu_to_le16(0);
+       }
+       return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+}
+
 /*
  * p is at least 6 bytes before the end of page
  */
 static inline struct ext4_dir_entry_2 *
-ext4_next_entry(struct ext4_dir_entry_2 *p)
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
 {
        return (struct ext4_dir_entry_2 *)((char *)p +
-               ext4_rec_len_from_disk(p->rec_len));
+               ext4_rec_len_from_disk(p->rec_len, blocksize));
 }
 
 /*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
                        space += EXT4_DIR_REC_LEN(de->name_len);
                        names++;
                }
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, size);
        }
        printk("(%i)\n", names);
        return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
        top = (struct ext4_dir_entry_2 *) ((char *) de +
                                           dir->i_sb->s_blocksize -
                                           EXT4_DIR_REC_LEN(0));
-       for (; de < top; de = ext4_next_entry(de)) {
+       for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
                if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
                                        (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                                +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        }
        if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, dir->i_sb->s_blocksize);
                if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
                        goto errout;
                count++;
@@ -732,7 +756,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
                        cond_resched();
                }
                /* XXX: do we need to check rec_len == 0 case? -Chris */
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, blocksize);
        }
        return count;
 }
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
                        return 1;
                }
                /* prevent looping on a bad block */
-               de_len = ext4_rec_len_from_disk(de->rec_len);
+               de_len = ext4_rec_len_from_disk(de->rec_len,
+                                               dir->i_sb->s_blocksize);
                if (de_len <= 0)
                        return -1;
                offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                de = (struct ext4_dir_entry_2 *) bh->b_data;
                top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
                                       EXT4_DIR_REC_LEN(0));
-               for (; de < top; de = ext4_next_entry(de)) {
+               for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
                                  + ((char *) de - bh->b_data);
 
@@ -1109,7 +1134,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
  * Returns pointer to last entry moved.
  */
 static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+               unsigned blocksize)
 {
        unsigned rec_len = 0;
 
@@ -1118,7 +1144,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
                ((struct ext4_dir_entry_2 *) to)->rec_len =
-                               ext4_rec_len_to_disk(rec_len);
+                               ext4_rec_len_to_disk(rec_len, blocksize);
                de->inode = 0;
                map++;
                to += rec_len;
@@ -1137,12 +1163,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
 
        prev = to = de;
        while ((char*)de < base + blocksize) {
-               next = ext4_next_entry(de);
+               next = ext4_next_entry(de, blocksize);
                if (de->inode && de->name_len) {
                        rec_len = EXT4_DIR_REC_LEN(de->name_len);
                        if (de > to)
                                memmove(to, de, rec_len);
-                       to->rec_len = ext4_rec_len_to_disk(rec_len);
+                       to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
                        prev = to;
                        to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
                }
@@ -1215,10 +1241,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                                        hash2, split, count-split));
 
        /* Fancy dance to stay within two buffers */
-       de2 = dx_move_dirents(data1, data2, map + split, count - split);
+       de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
        de = dx_pack_dirents(data1, blocksize);
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
-       de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+                                          blocksize);
+       de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+                                           blocksize);
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
 
@@ -1268,6 +1296,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        const char      *name = dentry->d_name.name;
        int             namelen = dentry->d_name.len;
        unsigned int    offset = 0;
+       unsigned int    blocksize = dir->i_sb->s_blocksize;
        unsigned short  reclen;
        int             nlen, rlen, err;
        char            *top;
@@ -1275,7 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        reclen = EXT4_DIR_REC_LEN(namelen);
        if (!de) {
                de = (struct ext4_dir_entry_2 *)bh->b_data;
-               top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+               top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
                        if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
                                                  bh, offset)) {
@@ -1287,7 +1316,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                                return -EEXIST;
                        }
                        nlen = EXT4_DIR_REC_LEN(de->name_len);
-                       rlen = ext4_rec_len_from_disk(de->rec_len);
+                       rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
                        if ((de->inode? rlen - nlen: rlen) >= reclen)
                                break;
                        de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1335,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 
        /* By now the buffer is marked for journaling */
        nlen = EXT4_DIR_REC_LEN(de->name_len);
-       rlen = ext4_rec_len_from_disk(de->rec_len);
+       rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
        if (de->inode) {
                struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-               de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
-               de->rec_len = ext4_rec_len_to_disk(nlen);
+               de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
+               de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
                de = de1;
        }
        de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1409,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        /* The 0th block becomes the root, move the dirents out */
        fde = &root->dotdot;
        de = (struct ext4_dir_entry_2 *)((char *)fde +
-               ext4_rec_len_from_disk(fde->rec_len));
+               ext4_rec_len_from_disk(fde->rec_len, blocksize));
        if ((char *) de >= (((char *) root) + blocksize)) {
                ext4_error(dir->i_sb, __func__,
                           "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1431,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        memcpy (data1, de, len);
        de = (struct ext4_dir_entry_2 *) data1;
        top = data1 + len;
-       while ((char *)(de2 = ext4_next_entry(de)) < top)
+       while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
                de = de2;
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+                                          blocksize);
        /* Initialize the root; the dot dirents already exist */
        de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+                                          blocksize);
        memset (&root->info, 0, sizeof(root->info));
        root->info.info_length = sizeof(root->info);
        root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1519,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                return retval;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
-       de->rec_len = ext4_rec_len_to_disk(blocksize);
+       de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
        return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
@@ -1551,7 +1582,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        goto cleanup;
                node2 = (struct dx_node *)(bh2->b_data);
                entries2 = node2->entries;
-               node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
+               node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+                                                          sb->s_blocksize);
                node2->fake.inode = 0;
                BUFFER_TRACE(frame->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1671,7 @@ static int ext4_delete_entry(handle_t *handle,
                             struct buffer_head *bh)
 {
        struct ext4_dir_entry_2 *de, *pde;
+       unsigned int blocksize = dir->i_sb->s_blocksize;
        int i;
 
        i = 0;
@@ -1652,8 +1685,11 @@ static int ext4_delete_entry(handle_t *handle,
                        ext4_journal_get_write_access(handle, bh);
                        if (pde)
                                pde->rec_len = ext4_rec_len_to_disk(
-                                       ext4_rec_len_from_disk(pde->rec_len) +
-                                       ext4_rec_len_from_disk(de->rec_len));
+                                       ext4_rec_len_from_disk(pde->rec_len,
+                                                              blocksize) +
+                                       ext4_rec_len_from_disk(de->rec_len,
+                                                              blocksize),
+                                       blocksize);
                        else
                                de->inode = 0;
                        dir->i_version++;
@@ -1661,9 +1697,9 @@ static int ext4_delete_entry(handle_t *handle,
                        ext4_handle_dirty_metadata(handle, dir, bh);
                        return 0;
                }
-               i += ext4_rec_len_from_disk(de->rec_len);
+               i += ext4_rec_len_from_disk(de->rec_len, blocksize);
                pde = de;
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, blocksize);
        }
        return -ENOENT;
 }
@@ -1793,6 +1829,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct inode *inode;
        struct buffer_head *dir_block;
        struct ext4_dir_entry_2 *de;
+       unsigned int blocksize = dir->i_sb->s_blocksize;
        int err, retries = 0;
 
        if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1861,14 @@ retry:
        de = (struct ext4_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
-       de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
+       de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+                                          blocksize);
        strcpy(de->name, ".");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-       de = ext4_next_entry(de);
+       de = ext4_next_entry(de, blocksize);
        de->inode = cpu_to_le32(dir->i_ino);
-       de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
-                                               EXT4_DIR_REC_LEN(1));
+       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+                                          blocksize);
        de->name_len = 2;
        strcpy(de->name, "..");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1923,7 @@ static int empty_dir(struct inode *inode)
                return 1;
        }
        de = (struct ext4_dir_entry_2 *) bh->b_data;
-       de1 = ext4_next_entry(de);
+       de1 = ext4_next_entry(de, sb->s_blocksize);
        if (le32_to_cpu(de->inode) != inode->i_ino ||
                        !le32_to_cpu(de1->inode) ||
                        strcmp(".", de->name) ||
@@ -1896,9 +1934,9 @@ static int empty_dir(struct inode *inode)
                brelse(bh);
                return 1;
        }
-       offset = ext4_rec_len_from_disk(de->rec_len) +
-                ext4_rec_len_from_disk(de1->rec_len);
-       de = ext4_next_entry(de1);
+       offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+                ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+       de = ext4_next_entry(de1, sb->s_blocksize);
        while (offset < inode->i_size) {
                if (!bh ||
                        (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1965,8 @@ static int empty_dir(struct inode *inode)
                        brelse(bh);
                        return 0;
                }
-               offset += ext4_rec_len_from_disk(de->rec_len);
-               de = ext4_next_entry(de);
+               offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+               de = ext4_next_entry(de, sb->s_blocksize);
        }
        brelse(bh);
        return 1;
@@ -2297,8 +2335,8 @@ retry:
        return err;
 }
 
-#define PARENT_INO(buffer) \
-       (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+#define PARENT_INO(buffer, size) \
+       (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
 
 /*
  * Anybody can rename anything with this: the permission checks are left to the
@@ -2358,7 +2396,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
                if (!dir_bh)
                        goto end_rename;
-               if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+               if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+                               old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
                        goto end_rename;
                retval = -EMLINK;
                if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2469,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (dir_bh) {
                BUFFER_TRACE(dir_bh, "get_write_access");
                ext4_journal_get_write_access(handle, dir_bh);
-               PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+               PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+                                               cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
                ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
                ext4_dec_count(handle, old_dir);