Btrfs: verify csums on read
authorChris Mason <chris.mason@oracle.com>
Thu, 29 Mar 2007 19:15:27 +0000 (15:15 -0400)
committerDavid Woodhouse <dwmw2@hera.kernel.org>
Thu, 29 Mar 2007 19:15:27 +0000 (15:15 -0400)
Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/file-item.c
fs/btrfs/super.c

index 48c611948d11969c42642649c4da20fcb79b74f4..1eb333e2e18bcc17c86c136b705459619061f6bd 100644 (file)
@@ -109,14 +109,14 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
                return 1;
        if (k1.objectid < k2->objectid)
                return -1;
-       if (k1.flags > k2->flags)
-               return 1;
-       if (k1.flags < k2->flags)
-               return -1;
        if (k1.offset > k2->offset)
                return 1;
        if (k1.offset < k2->offset)
                return -1;
+       if (k1.flags > k2->flags)
+               return 1;
+       if (k1.flags < k2->flags)
+               return -1;
        return 0;
 }
 
@@ -1165,7 +1165,6 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
                BUG();
        ret = btrfs_search_slot(trans, root, cpu_key, path, data_size, 1);
        if (ret == 0) {
-               btrfs_release_path(root, path);
                return -EEXIST;
        }
        if (ret < 0)
index 6ff87f44c5d8d2ab08ec811b110e5f3fb8830473..df1a025a771cdd2999175de067517b4d7a3fc5fd 100644 (file)
@@ -21,6 +21,9 @@ struct btrfs_transaction;
  */
 #define BTRFS_NAME_LEN 255
 
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
  * block layout.  objectid corresonds to the inode number.  The flags
@@ -37,21 +40,21 @@ struct btrfs_transaction;
  */
 struct btrfs_disk_key {
        __le64 objectid;
-       __le32 flags;
        __le64 offset;
+       __le32 flags;
 } __attribute__ ((__packed__));
 
 struct btrfs_key {
        u64 objectid;
-       u32 flags;
        u64 offset;
+       u32 flags;
 } __attribute__ ((__packed__));
 
 /*
  * every tree block (leaf or node) starts with this header.
  */
 struct btrfs_header {
-       __le32 csum[8];
+       u8 csum[BTRFS_CSUM_SIZE];
        u8 fsid[16]; /* FS specific uuid */
        __le64 blocknr; /* which block this node is supposed to live in */
        __le64 generation;
@@ -75,7 +78,7 @@ struct buffer_head;
  * it currently lacks any block count etc etc
  */
 struct btrfs_super_block {
-       __le32 csum[8];
+       u8 csum[BTRFS_CSUM_SIZE];
        /* the first 3 fields must match struct btrfs_header */
        u8 fsid[16];    /* FS specific uuid */
        __le64 blocknr; /* this block number */
@@ -147,7 +150,7 @@ struct btrfs_extent_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_timespec {
-       __le32 sec;
+       __le64 sec;
        __le32 nsec;
 } __attribute__ ((__packed__));
 
@@ -214,6 +217,10 @@ struct btrfs_file_extent_item {
        __le64 num_blocks;
 } __attribute__ ((__packed__));
 
+struct btrfs_csum_item {
+       u8 csum[BTRFS_CSUM_SIZE];
+} __attribute__ ((__packed__));
+
 struct btrfs_inode_map_item {
        struct btrfs_disk_key key;
 } __attribute__ ((__packed__));
@@ -283,27 +290,32 @@ struct btrfs_root {
  * a (hopefully) huge chunk of disk
  */
 #define BTRFS_EXTENT_DATA_KEY  4
+/*
+ * csum items have the checksums for data in the extents
+ */
+#define BTRFS_CSUM_ITEM_KEY    5
+
 /*
  * root items point to tree roots.  There are typically in the root
  * tree used by the super block to find all the other trees
  */
-#define BTRFS_ROOT_ITEM_KEY    5
+#define BTRFS_ROOT_ITEM_KEY    6
 /*
  * extent items are in the extent map tree.  These record which blocks
  * are used, and how many references there are to each block
  */
-#define BTRFS_EXTENT_ITEM_KEY  6
+#define BTRFS_EXTENT_ITEM_KEY  7
 
 /*
  * the inode map records which inode numbers are in use and where
  * they actually live on disk
  */
-#define BTRFS_INODE_MAP_ITEM_KEY 7
+#define BTRFS_INODE_MAP_ITEM_KEY 8
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY  8
+#define BTRFS_STRING_ITEM_KEY  9
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -407,15 +419,15 @@ static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i,
        i->compat_flags = cpu_to_le16(val);
 }
 
-static inline u32 btrfs_timespec_sec(struct btrfs_inode_timespec *ts)
+static inline u64 btrfs_timespec_sec(struct btrfs_inode_timespec *ts)
 {
-       return le32_to_cpu(ts->sec);
+       return le64_to_cpu(ts->sec);
 }
 
 static inline void btrfs_set_timespec_sec(struct btrfs_inode_timespec *ts,
-                                         u32 val)
+                                         u64 val)
 {
-       ts->sec = cpu_to_le32(val);
+       ts->sec = cpu_to_le64(val);
 }
 
 static inline u32 btrfs_timespec_nsec(struct btrfs_inode_timespec *ts)
@@ -429,8 +441,6 @@ static inline void btrfs_set_timespec_nsec(struct btrfs_inode_timespec *ts,
        ts->nsec = cpu_to_le32(val);
 }
 
-
-
 static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
 {
        return le64_to_cpu(ei->owner);
@@ -905,4 +915,11 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid,
                             u64 blocknr, int mod);
+int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         u64 objectid, u64 offset,
+                         char *data, size_t len);
+int btrfs_csum_verify_file_block(struct btrfs_root *root,
+                                u64 objectid, u64 offset,
+                                char *data, size_t len);
 #endif
index f2c1dd40c087871ba435b49ff74d776edfa819b5..f5db2b93650293c191764049d4ade4cb4e8f6523 100644 (file)
@@ -127,31 +127,46 @@ static int btree_get_block(struct inode *inode, sector_t iblock,
        return 0;
 }
 
-static int csum_tree_block(struct btrfs_root * root, struct buffer_head *bh,
-                           int verify)
+int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
+                   char *result)
 {
-       struct btrfs_node *node = btrfs_buffer_node(bh);
        struct scatterlist sg;
        struct crypto_hash *tfm = root->fs_info->hash_tfm;
        struct hash_desc desc;
        int ret;
-       char result[32];
 
        desc.tfm = tfm;
        desc.flags = 0;
-       sg_init_one(&sg, bh->b_data + 32, bh->b_size - 32);
+       sg_init_one(&sg, data, len);
        spin_lock(&root->fs_info->hash_lock);
-       ret = crypto_hash_digest(&desc, &sg, bh->b_size - 32, result);
+       ret = crypto_hash_digest(&desc, &sg, len, result);
        spin_unlock(&root->fs_info->hash_lock);
        if (ret) {
                printk("sha256 digest failed\n");
        }
+       return ret;
+}
+static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
+                          int verify)
+{
+       char result[BTRFS_CSUM_SIZE];
+       int ret;
+       struct btrfs_node *node;
+       /* the csum covers everything in the block after the csum field */
+       ret = btrfs_csum_data(root, bh->b_data + BTRFS_CSUM_SIZE,
+                             bh->b_size - BTRFS_CSUM_SIZE, result);
+       if (ret)
+               return ret;
        if (verify) {
-               if (memcmp(node->header.csum, result, sizeof(result)))
-                       printk("csum verify failed on %Lu\n", bh->b_blocknr);
-               return -EINVAL;
-       } else
-               memcpy(node->header.csum, result, sizeof(node->header.csum));
+               if (memcmp(bh->b_data, result, BTRFS_CSUM_SIZE)) {
+                       printk("checksum verify failed on %llu\n",
+                              (unsigned long long)bh->b_blocknr);
+                       return 1; /* stored csum does not match the data */
+               }
+       } else {
+               node = btrfs_buffer_node(bh);
+               memcpy(&node->header.csum, result, BTRFS_CSUM_SIZE);
+       }
        return 0;
 }
 
index c2c38bda704d88129e1a3d6d6d4753d7cb36e9cc..f6998e2192ced1b1314db87d0b79b6867086b682 100644 (file)
@@ -39,4 +39,6 @@ void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf);
 int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root);
 struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr);
+int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
+                   char *result);
 #endif
index 5230a44cb19b9eed44ec5e9a82d248454c9e1630..2d2c23ca7cbfd30a872c09de172c3de23fd82db8 100644 (file)
@@ -57,3 +57,63 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
        return ret;
 }
+
+int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         u64 objectid, u64 offset,
+                         char *data, size_t len)
+{
+       int ret;
+       struct btrfs_key file_key;
+       struct btrfs_path path;
+       struct btrfs_csum_item *item;
+
+       btrfs_init_path(&path);
+       file_key.objectid = objectid;
+       file_key.offset = offset;
+       file_key.flags = 0;
+       btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+       ret = btrfs_insert_empty_item(trans, root, &path, &file_key,
+                                     BTRFS_CSUM_SIZE);
+       if (ret != 0 && ret != -EEXIST)
+               goto fail;
+       item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+                             struct btrfs_csum_item);
+       /* on -EEXIST the existing csum item is simply overwritten */
+       ret = btrfs_csum_data(root, data, len, (char *)item->csum);
+       mark_buffer_dirty(path.nodes[0]);
+fail:
+       btrfs_release_path(root, &path);
+       return ret;
+}
+
+int btrfs_csum_verify_file_block(struct btrfs_root *root,
+                                u64 objectid, u64 offset,
+                                char *data, size_t len)
+{
+       int ret;
+       struct btrfs_key file_key;
+       struct btrfs_path path;
+       struct btrfs_csum_item *item;
+       char result[BTRFS_CSUM_SIZE];
+
+       btrfs_init_path(&path);
+       file_key.objectid = objectid;
+       file_key.offset = offset;
+       file_key.flags = 0;
+       btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+       ret = btrfs_search_slot(NULL, root, &file_key, &path, 0, 0);
+       if (ret) /* search failed or no exact csum item: report nonzero */
+               goto fail;
+       item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+                             struct btrfs_csum_item);
+       /* hash the caller's data and compare it with the stored csum */
+       ret = btrfs_csum_data(root, data, len, result);
+       WARN_ON(ret);
+       if (!ret && memcmp(result, item->csum, BTRFS_CSUM_SIZE))
+               ret = 1;
+fail:
+       btrfs_release_path(root, &path);
+       return ret;
+}
+
index 7914b31f5bcd42de681020d23bab4429a85678c2..04428137d75f9e265c24b3abc1184e5bde982890 100644 (file)
@@ -249,15 +249,16 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        struct btrfs_disk_key *found_key;
        struct btrfs_leaf *leaf;
-       struct btrfs_file_extent_item *fi;
-       u64 extent_start;
-       u64 extent_num_blocks;
+       struct btrfs_file_extent_item *fi = NULL;
+       u64 extent_start = 0;
+       u64 extent_num_blocks = 0;
+       int found_extent;
 
        /* FIXME, add redo link to tree so we don't leak on crash */
        key.objectid = inode->i_ino;
        key.offset = (u64)-1;
        key.flags = 0;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+       btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
        while(1) {
                btrfs_init_path(&path);
                ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
@@ -273,25 +274,32 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
                found_key = &leaf->items[path.slots[0]].key;
                if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
                        break;
-               if (btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
+               if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
+                   btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
                        break;
                if (btrfs_disk_key_offset(found_key) < inode->i_size)
                        break;
-               fi = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-                                   path.slots[0],
-                                   struct btrfs_file_extent_item);
-               extent_start = btrfs_file_extent_disk_blocknr(fi);
-               extent_num_blocks = btrfs_file_extent_disk_num_blocks(fi);
-               key.offset = btrfs_disk_key_offset(found_key) - 1;
+               if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
+                       fi = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+                                           path.slots[0],
+                                           struct btrfs_file_extent_item);
+                       extent_start = btrfs_file_extent_disk_blocknr(fi);
+                       extent_num_blocks =
+                               btrfs_file_extent_disk_num_blocks(fi);
+                       inode->i_blocks -=
+                               btrfs_file_extent_num_blocks(fi) >> 9;
+                       found_extent = 1;
+               } else {
+                       found_extent = 0;
+               }
                ret = btrfs_del_item(trans, root, &path);
                BUG_ON(ret);
-               inode->i_blocks -= btrfs_file_extent_num_blocks(fi) >> 9;
                btrfs_release_path(root, &path);
-               ret = btrfs_free_extent(trans, root, extent_start,
-                                       extent_num_blocks, 0);
-               BUG_ON(ret);
-               if (key.offset + 1 == 0)
-                       break;
+               if (found_extent) {
+                       ret = btrfs_free_extent(trans, root, extent_start,
+                                               extent_num_blocks, 0);
+                       BUG_ON(ret);
+               }
        }
        btrfs_release_path(root, &path);
        ret = 0;
@@ -975,10 +983,24 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
        int err = 0;
        int ret;
        int this_write;
+       struct inode *inode = file->f_path.dentry->d_inode;
 
        for (i = 0; i < num_pages; i++) {
                offset = pos & (PAGE_CACHE_SIZE -1);
                this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+               /* FIXME, one block at a time */
+
+               mutex_lock(&root->fs_info->fs_mutex);
+               trans = btrfs_start_transaction(root, 1);
+               btrfs_csum_file_block(trans, root, inode->i_ino,
+                                     pages[i]->index << PAGE_CACHE_SHIFT,
+                                     kmap(pages[i]), PAGE_CACHE_SIZE);
+               kunmap(pages[i]);
+               SetPageChecked(pages[i]);
+               ret = btrfs_end_transaction(trans, root);
+               BUG_ON(ret);
+               mutex_unlock(&root->fs_info->fs_mutex);
+
                ret = nobh_commit_write(file, pages[i], offset,
                                         offset + this_write);
                pos += this_write;
@@ -1022,7 +1044,7 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
                this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
                ret = nobh_prepare_write(pages[i], offset,
                                         offset + this_write,
-                                        btrfs_get_block_lock);
+                                        btrfs_get_block);
                pos += this_write;
                if (ret) {
                        err = ret;
@@ -1051,7 +1073,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
        size_t num_written = 0;
        int err = 0;
        int ret = 0;
-       struct btrfs_trans_handle *trans;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct btrfs_root *root = btrfs_sb(inode->i_sb);
        struct page *pages[1];
@@ -1077,25 +1098,18 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
                size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
                                        PAGE_CACHE_SHIFT;
-               mutex_lock(&root->fs_info->fs_mutex);
-               trans = btrfs_start_transaction(root, 1);
-
-               ret = prepare_pages(trans, root, file, pages, num_pages,
+               ret = prepare_pages(NULL, root, file, pages, num_pages,
                                    pos, write_bytes);
                BUG_ON(ret);
                ret = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, buf);
                BUG_ON(ret);
 
-               mutex_unlock(&root->fs_info->fs_mutex);
-
-               ret = dirty_and_release_pages(trans, root, file, pages,
+               ret = dirty_and_release_pages(NULL, root, file, pages,
                                              num_pages, pos, write_bytes);
                BUG_ON(ret);
                btrfs_drop_pages(pages, num_pages);
 
-               ret = btrfs_end_transaction(trans, root);
-
                buf += write_bytes;
                count -= write_bytes;
                pos += write_bytes;
@@ -1111,6 +1125,118 @@ out:
        return num_written ? num_written : err;
 }
 
+static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
+                       unsigned long offset, unsigned long size)
+{
+       char *kaddr;
+       unsigned long left, count = desc->count;
+
+       if (size > count)
+               size = count;
+
+       if (!PageChecked(page)) { /* csum not verified for this page yet */
+               /* FIXME, do it per block */
+               struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
+               int ret = btrfs_csum_verify_file_block(root,
+                                         page->mapping->host->i_ino,
+                                         (u64)page->index << PAGE_CACHE_SHIFT,
+                                         kmap(page), PAGE_CACHE_SIZE);
+               if (ret) {
+                       printk("failed to verify ino %lu page %lu\n",
+                              page->mapping->host->i_ino,
+                              page->index);
+                       memset(page_address(page), 0, PAGE_CACHE_SIZE);
+               }
+               SetPageChecked(page);
+               kunmap(page);
+       }
+       /*
+        * Faults on the destination of a read are common, so do it before
+        * taking the kmap.
+        */
+       if (!fault_in_pages_writeable(desc->arg.buf, size)) {
+               kaddr = kmap_atomic(page, KM_USER0);
+               left = __copy_to_user_inatomic(desc->arg.buf,
+                                               kaddr + offset, size);
+               kunmap_atomic(kaddr, KM_USER0);
+               if (left == 0)
+                       goto success;
+       }
+
+       /* Do it the slow way */
+       kaddr = kmap(page);
+       left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
+       kunmap(page);
+
+       if (left) {
+               size -= left; /* only count the bytes actually copied */
+               desc->error = -EFAULT;
+       }
+success:
+       desc->count = count - size;
+       desc->written += size;
+       desc->arg.buf += size;
+       return size;
+}
+
+/**
+ * btrfs_file_aio_read - filesystem read routine
+ * @iocb:      kernel I/O control block
+ * @iov:       io vector request
+ * @nr_segs:   number of segments in the iovec
+ * @pos:       current file position
+ */
+static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+                                  unsigned long nr_segs, loff_t pos)
+{
+       struct file *filp = iocb->ki_filp;
+       ssize_t retval;
+       unsigned long seg;
+       size_t count;
+       loff_t *ppos = &iocb->ki_pos;
+
+       count = 0;
+       for (seg = 0; seg < nr_segs; seg++) {
+               const struct iovec *iv = &iov[seg];
+
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               count += iv->iov_len;
+               if (unlikely((ssize_t)(count|iv->iov_len) < 0))
+                       return -EINVAL;
+               if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
+                       continue;
+               if (seg == 0)
+                       return -EFAULT;
+               nr_segs = seg;          /* use only the segments before it */
+               count -= iv->iov_len;   /* This segment is no good */
+               break;
+       }
+       retval = 0;
+       if (count) {
+               for (seg = 0; seg < nr_segs; seg++) {
+                       read_descriptor_t desc;
+
+                       desc.written = 0;
+                       desc.arg.buf = iov[seg].iov_base;
+                       desc.count = iov[seg].iov_len;
+                       if (desc.count == 0)
+                               continue;
+                       desc.error = 0;
+                       do_generic_file_read(filp, ppos, &desc,
+                                            btrfs_read_actor);
+                       retval += desc.written;
+                       if (desc.error) {
+                               /* return the error only if nothing was read */
+                               retval = retval ?: desc.error;
+                               break;
+                       }
+               }
+       }
+       return retval;
+}
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
@@ -1166,7 +1292,7 @@ static struct inode_operations btrfs_file_inode_operations = {
 static struct file_operations btrfs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
-       .aio_read       = generic_file_aio_read,
+       .aio_read       = btrfs_file_aio_read,
        .write          = btrfs_file_write,
        .mmap           = generic_file_mmap,
        .open           = generic_file_open,