ext4: implementation of a new ioctl called EXT4_IOC_SWAP_BOOT
author Dr. Tilmann Bubeck <t.bubeck@reinform.de>
Mon, 8 Apr 2013 16:54:05 +0000 (12:54 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Mon, 8 Apr 2013 16:54:05 +0000 (12:54 -0400)
Add a new ioctl, EXT4_IOC_SWAP_BOOT, which swaps i_data and
associated attributes (like i_blocks, i_size, i_flags, ...) from the
specified inode with inode EXT4_BOOT_LOADER_INO (#5). This is
typically used to store a boot loader in a secure part of the
filesystem, where it can't be changed by a normal user by accident.
The data blocks of the previous boot loader will be associated with
the given inode.

This usercode program is a simple example of the usage:

/*
 * Issue EXT4_IOC_SWAP_BOOT on the file named by the single command line
 * argument.  Exits with status 1 on a usage error or when open() or
 * ioctl() fails, and with status 0 otherwise.
 */
int main(int argc, char *argv[])
{
  const char *path;
  int target;

  if (argc != 2) {
    printf("usage: ext4-swap-boot-inode FILE-TO-SWAP\n");
    exit(1);
  }
  path = argv[1];

  /* The ioctl handler refuses descriptors that are not open for writing. */
  target = open(path, O_WRONLY);
  if (target < 0) {
    perror("open");
    exit(1);
  }

  if (ioctl(target, EXT4_IOC_SWAP_BOOT) < 0) {
    perror("ioctl");
    exit(1);
  }

  close(target);
  exit(0);
}

[ Modified by Theodore Ts'o to fix a number of bugs in the original code.]

Signed-off-by: Dr. Tilmann Bubeck <t.bubeck@reinform.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Documentation/filesystems/ext4.txt
fs/ext4/ext4.h
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/move_extent.c

index 34ea4f1fa6ea7eefd359fb09e1605a6d6948910b..5dd957d8b25b426faf59776f50fffd27e0fb0119 100644 (file)
@@ -587,6 +587,16 @@ Table of Ext4 specific ioctls
                              bitmaps and inode table, the userspace tool thus
                              just passes the new number of blocks.
 
+EXT4_IOC_SWAP_BOOT           Swap i_data and associated attributes
+                             (like i_blocks, i_size, i_flags, ...) from
+                             the specified inode with inode
+                             EXT4_BOOT_LOADER_INO (#5). This is typically
+                             used to store a boot loader in a secure part of
+                             the filesystem, where it can't be changed by a
+                             normal user by accident.
+                             The data blocks of the previous boot loader
+                             will be associated with the given inode.
+
 ..............................................................................
 
 References
index a0637e5057aee0488c9fe0d5c58d72c5b98f684a..d91871570982bc1f9659a63c29c7b085c960fe6c 100644 (file)
@@ -616,6 +616,7 @@ enum {
 #define EXT4_IOC_ALLOC_DA_BLKS         _IO('f', 12)
 #define EXT4_IOC_MOVE_EXT              _IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS             _IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT             _IO('f', 17)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -1341,6 +1342,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
        return ino == EXT4_ROOT_INO ||
                ino == EXT4_USR_QUOTA_INO ||
                ino == EXT4_GRP_QUOTA_INO ||
+               ino == EXT4_BOOT_LOADER_INO ||
                ino == EXT4_JOURNAL_INO ||
                ino == EXT4_RESIZE_INO ||
                (ino >= EXT4_FIRST_INO(sb) &&
@@ -2624,6 +2626,12 @@ extern int ext4_ind_migrate(struct inode *inode);
 
 
 /* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+                                           struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+                                         struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);
index 769c656ea3b12d2180aeed009b1b249364f83d69..a29bfc2142ef3daaf183373d0e679df5fad85544 100644 (file)
@@ -4191,8 +4191,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         * NeilBrown 1999oct15
         */
        if (inode->i_nlink == 0) {
-               if (inode->i_mode == 0 ||
-                   !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+               if ((inode->i_mode == 0 ||
+                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+                   ino != EXT4_BOOT_LOADER_INO) {
                        /* this inode is deleted */
                        ret = -ESTALE;
                        goto bad_inode;
@@ -4200,7 +4201,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                /* The only unlinked inodes we let through here have
                 * valid i_mode and are being read by the orphan
                 * recovery code: that's fine, we're about to complete
-                * the process of deleting those. */
+                * the process of deleting those.
+                * OR it is the EXT4_BOOT_LOADER_INO which is
+                * not initialized on a new filesystem. */
        }
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4320,6 +4323,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+       } else if (ino == EXT4_BOOT_LOADER_INO) {
+               make_bad_inode(inode);
        } else {
                ret = -EIO;
                EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
index a07b7bc0856ae1e36eff4290e0134d4cb6a36517..cbc3acea6bcf8a2637766f920524bfa9c0cdedc5 100644 (file)
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
+#include "ext4_extents.h"
 
 #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
 
+/**
+ * Exchange the contents of two memory areas, one byte at a time.
+ *
+ * @a:          start of the first memory area
+ * @b:          start of the second memory area
+ * @len:        how many bytes to exchange; 0 leaves both areas untouched
+ *
+ * The byte at each offset of @a is traded with the byte at the same
+ * offset of @b, walking the offsets in ascending order.
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+       unsigned char *lhs = a;
+       unsigned char *rhs = b;
+       size_t off;
+
+       for (off = 0; off < len; off++) {
+               unsigned char saved = lhs[off];
+
+               lhs[off] = rhs[off];
+               rhs[off] = saved;
+       }
+}
+
+/**
+ * Exchange i_data and the attributes that belong to it between two inodes.
+ *
+ * The same routine performs the initial swap and, when a later step
+ * fails, the rollback.  Whatever is added here therefore has to stay
+ * symmetric: a second call with the same pair must undo the first.
+ *
+ * @inode1:     pointer to first inode
+ * @inode2:     pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+       struct ext4_inode_info *ei1 = EXT4_I(inode1);
+       struct ext4_inode_info *ei2 = EXT4_I(inode2);
+       loff_t size1;
+
+       /* Fields of the generic inode. */
+       memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+       memswap(&inode1->i_version, &inode2->i_version,
+               sizeof(inode1->i_version));
+       memswap(&inode1->i_blocks, &inode2->i_blocks,
+               sizeof(inode1->i_blocks));
+       memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+       memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+       memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+       /*
+        * Fields of the ext4 in-memory inode.
+        * NOTE(review): i_es_tree is exchanged bytewise like the scalar
+        * fields; confirm a raw byte copy is valid for that structure.
+        */
+       memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+       memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+       memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+       memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
+       memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+
+       /* i_size is exchanged through its accessors, not bytewise. */
+       size1 = i_size_read(inode1);
+       i_size_write(inode1, i_size_read(inode2));
+       i_size_write(inode2, size1);
+}
+
+/**
+ * Swap the information from the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb:         the super block of the filesystem
+ * @inode:      the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ * @inode has to be a regular file with exactly one link, and the caller
+ * has to pass inode_owner_or_capable() as well as hold CAP_SYS_ADMIN.
+ *
+ * Returns the result of the last ext4_mark_inode_dirty() call when both
+ * inodes could be marked dirty.  Otherwise returns:
+ *   -EINVAL  @inode is not a singly linked regular file, or no journal
+ *            handle could be started
+ *   -EPERM   the permission checks failed
+ *   the error reported by ext4_iget() for EXT4_BOOT_LOADER_INO
+ *   the negative ext4_mark_inode_dirty() error; in this case
+ *            swap_inode_data() has been called a second time to revert
+ *            the swap
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+                               struct inode *inode)
+{
+       handle_t *handle;
+       int err;
+       struct inode *inode_bl;
+       struct ext4_inode_info *ei_bl;
+       struct ext4_sb_info *sbi;
+
+       if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
+               err = -EINVAL;
+               goto swap_boot_out;
+       }
+
+       if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+               err = -EPERM;
+               goto swap_boot_out;
+       }
+
+       sbi = EXT4_SB(sb);
+
+       inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+       if (IS_ERR(inode_bl)) {
+               err = PTR_ERR(inode_bl);
+               goto swap_boot_out;
+       }
+       ei_bl = EXT4_I(inode_bl);
+
+       filemap_flush(inode->i_mapping);
+       filemap_flush(inode_bl->i_mapping);
+
+       /* Protect orig inodes against a truncate and make sure,
+        * that only 1 swap_inode_boot_loader is running. */
+       ext4_inode_double_lock(inode, inode_bl);
+
+       truncate_inode_pages(&inode->i_data, 0);
+       truncate_inode_pages(&inode_bl->i_data, 0);
+
+       /* Wait for all existing dio workers */
+       ext4_inode_block_unlocked_dio(inode);
+       ext4_inode_block_unlocked_dio(inode_bl);
+       inode_dio_wait(inode);
+       inode_dio_wait(inode_bl);
+
+       handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+       if (IS_ERR(handle)) {
+               err = -EINVAL;
+               /* Both inodes are locked, unlocked dio is blocked and
+                * inode_bl is referenced by now: undo all of that
+                * instead of returning directly. */
+               goto journal_err_out;
+       }
+
+       /* Protect extent tree against block allocations via delalloc */
+       ext4_double_down_write_data_sem(inode, inode_bl);
+
+       if (inode_bl->i_nlink == 0) {
+               /* this inode has never been used as a BOOT_LOADER */
+               set_nlink(inode_bl, 1);
+               i_uid_write(inode_bl, 0);
+               i_gid_write(inode_bl, 0);
+               inode_bl->i_flags = 0;
+               ei_bl->i_flags = 0;
+               inode_bl->i_version = 1;
+               i_size_write(inode_bl, 0);
+               inode_bl->i_mode = S_IFREG;
+               if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                             EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+                       ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+                       ext4_ext_tree_init(handle, inode_bl);
+               } else
+                       memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+       }
+
+       swap_inode_data(inode, inode_bl);
+
+       inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+       /* Both inodes get a fresh generation number after the swap. */
+       spin_lock(&sbi->s_next_gen_lock);
+       inode->i_generation = sbi->s_next_generation++;
+       inode_bl->i_generation = sbi->s_next_generation++;
+       spin_unlock(&sbi->s_next_gen_lock);
+
+       ext4_discard_preallocations(inode);
+
+       err = ext4_mark_inode_dirty(handle, inode);
+       if (err < 0) {
+               ext4_warning(inode->i_sb,
+                       "couldn't mark inode #%lu dirty (err %d)",
+                       inode->i_ino, err);
+               /* Revert all changes: */
+               swap_inode_data(inode, inode_bl);
+       } else {
+               err = ext4_mark_inode_dirty(handle, inode_bl);
+               if (err < 0) {
+                       ext4_warning(inode_bl->i_sb,
+                               "couldn't mark inode #%lu dirty (err %d)",
+                               inode_bl->i_ino, err);
+                       /* Revert all changes: */
+                       swap_inode_data(inode, inode_bl);
+                       /* inode was already marked dirty with the swapped
+                        * contents, so mark it again after the revert. */
+                       ext4_mark_inode_dirty(handle, inode);
+               }
+       }
+
+       ext4_journal_stop(handle);
+
+       ext4_double_up_write_data_sem(inode, inode_bl);
+
+journal_err_out:
+       ext4_inode_resume_unlocked_dio(inode);
+       ext4_inode_resume_unlocked_dio(inode_bl);
+
+       ext4_inode_double_unlock(inode, inode_bl);
+
+       iput(inode_bl);
+
+swap_boot_out:
+       return err;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        struct inode *inode = file_inode(filp);
@@ -353,6 +545,11 @@ group_add_out:
                return err;
        }
 
+       case EXT4_IOC_SWAP_BOOT:
+               if (!(filp->f_mode & FMODE_WRITE))
+                       return -EBADF;
+               return swap_inode_boot_loader(sb, inode);
+
        case EXT4_IOC_RESIZE_FS: {
                ext4_fsblk_t n_blocks_count;
                struct super_block *sb = inode->i_sb;
index 33e1c086858b54ad704e80345c92b946ae3c565b..a2e696e1633147d32591ee63fe4194560c15916c 100644 (file)
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ *                                   of i_data_sem
  *
  * Acquire write lock of i_data_sem of the two inodes
  */
-static void
-double_down_write_data_sem(struct inode *first, struct inode *second)
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
 {
        if (first < second) {
                down_write(&EXT4_I(first)->i_data_sem);
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
 }
 
 /**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:                original inode structure to be released its lock first
  * @donor_inode:       donor inode structure to be released its lock second
  * Release write lock of i_data_sem of two inodes (orig and donor).
  */
-static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+                             struct inode *donor_inode)
 {
        up_write(&EXT4_I(orig_inode)->i_data_sem);
        up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -976,7 +978,7 @@ again:
         * necessary, just swap data blocks between orig and donor.
         */
        if (uninit) {
-               double_down_write_data_sem(orig_inode, donor_inode);
+               ext4_double_down_write_data_sem(orig_inode, donor_inode);
                /* If any of extents in range became initialized we have to
                 * fallback to data copying */
                uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -990,7 +992,7 @@ again:
                        goto drop_data_sem;
 
                if (!uninit) {
-                       double_up_write_data_sem(orig_inode, donor_inode);
+                       ext4_double_up_write_data_sem(orig_inode, donor_inode);
                        goto data_copy;
                }
                if ((page_has_private(pagep[0]) &&
@@ -1004,7 +1006,7 @@ again:
                                                donor_inode, orig_blk_offset,
                                                block_len_in_page, err);
        drop_data_sem:
-               double_up_write_data_sem(orig_inode, donor_inode);
+               ext4_double_up_write_data_sem(orig_inode, donor_inode);
                goto unlock_pages;
        }
 data_copy:
@@ -1065,11 +1067,11 @@ repair_branches:
         * Extents are swapped already, but we are not able to copy data.
         * Try to swap extents to it's original places
         */
-       double_down_write_data_sem(orig_inode, donor_inode);
+       ext4_double_down_write_data_sem(orig_inode, donor_inode);
        replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
                                               orig_blk_offset,
                                               block_len_in_page, &err2);
-       double_up_write_data_sem(orig_inode, donor_inode);
+       ext4_double_up_write_data_sem(orig_inode, donor_inode);
        if (replaced_count != block_len_in_page) {
                EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
                                       "Unable to copy data block,"
@@ -1209,15 +1211,15 @@ mext_check_arguments(struct inode *orig_inode,
 }
 
 /**
- * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
  *
  * @inode1:    the inode structure
  * @inode2:    the inode structure
  *
  * Lock two inodes' i_mutex
  */
-static void
-mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
        BUG_ON(inode1 == inode2);
        if (inode1 < inode2) {
@@ -1230,15 +1232,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 }
 
 /**
- * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
  *
  * @inode1:     the inode that is released first
  * @inode2:     the inode that is released second
  *
  */
 
-static void
-mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
        mutex_unlock(&inode1->i_mutex);
        mutex_unlock(&inode2->i_mutex);
@@ -1333,7 +1335,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                return -EINVAL;
        }
        /* Protect orig and donor inodes against a truncate */
-       mext_inode_double_lock(orig_inode, donor_inode);
+       ext4_inode_double_lock(orig_inode, donor_inode);
 
        /* Wait for all existing dio workers */
        ext4_inode_block_unlocked_dio(orig_inode);
@@ -1342,7 +1344,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
        inode_dio_wait(donor_inode);
 
        /* Protect extent tree against block allocations via delalloc */
-       double_down_write_data_sem(orig_inode, donor_inode);
+       ext4_double_down_write_data_sem(orig_inode, donor_inode);
        /* Check the filesystem environment whether move_extent can be done */
        ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
                                    donor_start, &len);
@@ -1466,7 +1468,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                 * b. racing with ->readpage, ->write_begin, and ext4_get_block
                 *    in move_extent_per_page
                 */
-               double_up_write_data_sem(orig_inode, donor_inode);
+               ext4_double_up_write_data_sem(orig_inode, donor_inode);
 
                while (orig_page_offset <= seq_end_page) {
 
@@ -1500,7 +1502,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                                block_len_in_page = rest_blocks;
                }
 
-               double_down_write_data_sem(orig_inode, donor_inode);
+               ext4_double_down_write_data_sem(orig_inode, donor_inode);
                if (ret < 0)
                        break;
 
@@ -1538,10 +1540,10 @@ out:
                ext4_ext_drop_refs(holecheck_path);
                kfree(holecheck_path);
        }
-       double_up_write_data_sem(orig_inode, donor_inode);
+       ext4_double_up_write_data_sem(orig_inode, donor_inode);
        ext4_inode_resume_unlocked_dio(orig_inode);
        ext4_inode_resume_unlocked_dio(donor_inode);
-       mext_inode_double_unlock(orig_inode, donor_inode);
+       ext4_inode_double_unlock(orig_inode, donor_inode);
 
        return ret;
 }