btrfs: offline dedupe
author Mark Fasheh <mfasheh@suse.de>
Tue, 6 Aug 2013 18:42:51 +0000 (11:42 -0700)
committer Chris Mason <chris.mason@fusionio.com>
Sun, 1 Sep 2013 12:05:00 +0000 (08:05 -0400)
This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME, which will try to
de-duplicate a list of extents across a range of files.

Internally, the ioctl re-uses code from the clone ioctl. This avoids
rewriting a large chunk of extent handling code.

Userspace passes in an array of file, offset pairs along with a length
argument. The ioctl will then (for each dedupe) do a byte-by-byte comparison
of the user data before deduping the extent. Status and number of bytes
deduped are returned for each operation.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
fs/btrfs/ioctl.c
include/uapi/linux/btrfs.h

index 5b5148a1b0d3c0cec1179ee89086cab82327e9bb..022d8364e0726a169088f6a8e8b4b4326c93b017 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/blkdev.h>
 #include <linux/uuid.h>
 #include <linux/btrfs.h>
+#include <linux/uaccess.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -57,6 +58,9 @@
 #include "send.h"
 #include "dev-replace.h"
 
+/*
+ * Forward declaration: btrfs_clone() is defined further down in this file
+ * but is called by btrfs_extent_same() above its definition.
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+                      u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
 {
@@ -2470,6 +2474,34 @@ out:
        return ret;
 }
 
+/*
+ * Return the page cache page of @inode that covers byte offset @off, with
+ * its contents read in and up to date.
+ *
+ * The page is returned unlocked with a reference held; the caller must drop
+ * that reference with page_cache_release().  Returns NULL if the page could
+ * not be obtained or could not be brought up to date.
+ */
+static struct page *extent_same_get_page(struct inode *inode, u64 off)
+{
+       struct page *page;
+       pgoff_t index;
+       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+       index = off >> PAGE_CACHE_SHIFT;
+
+       page = grab_cache_page(inode->i_mapping, index);
+       if (!page)
+               return NULL;
+
+       if (!PageUptodate(page)) {
+               if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
+                                                0)) {
+                       /*
+                        * Drop the reference from grab_cache_page() so a
+                        * failed read does not leak the page.
+                        * NOTE(review): assumes the read helper leaves the
+                        * page unlocked on failure, as it does on success
+                        * (we re-lock just below) - confirm.
+                        */
+                       page_cache_release(page);
+                       return NULL;
+               }
+               /* Re-take the page lock to check the outcome of the read. */
+               lock_page(page);
+               if (!PageUptodate(page)) {
+                       unlock_page(page);
+                       page_cache_release(page);
+                       return NULL;
+               }
+       }
+       unlock_page(page);
+
+       return page;
+}
+
 static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 {
        /* do any pending delalloc/csum calc on src, one way or
@@ -2490,6 +2522,251 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
        }
 }
 
+/*
+ * Undo btrfs_double_lock(): unlock the extent ranges, then drop i_mutex on
+ * both inodes.
+ *
+ * btrfs_double_lock() takes only one set of locks (inode1/loff1) when both
+ * arguments are the same inode, so the second set is released only when the
+ * inodes differ.
+ */
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+                               struct inode *inode2, u64 loff2, u64 len)
+{
+       unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+       if (inode1 != inode2)
+               unlock_extent(&BTRFS_I(inode2)->io_tree, loff2,
+                             loff2 + len - 1);
+
+       mutex_unlock(&inode1->i_mutex);
+       if (inode1 != inode2)
+               mutex_unlock(&inode2->i_mutex);
+}
+
+/*
+ * Take i_mutex and lock the extent range [loff, loff + len) on two inodes.
+ *
+ * The pair is swapped when inode1 < inode2, so the inode with the higher
+ * address is always locked first.  This gives the same lock order no matter
+ * which way round the caller passes the two inodes.
+ *
+ * If both arguments are the same inode, only one i_mutex and one extent
+ * range (loff1) are taken.
+ */
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+                             struct inode *inode2, u64 loff2, u64 len)
+{
+       /* keep each offset paired with its inode when reordering */
+       if (inode1 < inode2) {
+               swap(inode1, inode2);
+               swap(loff1, loff2);
+       }
+
+       mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+       lock_extent_range(inode1, loff1, len);
+       if (inode1 != inode2) {
+               mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+               lock_extent_range(inode2, loff2, len);
+       }
+}
+
+/*
+ * Compare @len bytes of @src starting at @loff with @len bytes of @dst
+ * starting at @dst_loff, one page at a time.
+ *
+ * Each comparison starts at the beginning of the mapped page, so both
+ * offsets are expected to be page aligned (the ioctl entry point rejects
+ * block sizes smaller than a page before getting here).
+ *
+ * Returns 0 if the ranges are identical, BTRFS_SAME_DATA_DIFFERS if they are
+ * not, or -EINVAL if a page could not be obtained.
+ */
+static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
+                         u64 dst_loff, u64 len)
+{
+       int ret = 0;
+       struct page *src_page, *dst_page;
+       unsigned int cmp_len = PAGE_CACHE_SIZE;
+       void *addr, *dst_addr;
+
+       while (len) {
+               /* the final chunk may be shorter than a full page */
+               if (len < PAGE_CACHE_SIZE)
+                       cmp_len = len;
+
+               src_page = extent_same_get_page(src, loff);
+               if (!src_page)
+                       return -EINVAL;
+               dst_page = extent_same_get_page(dst, dst_loff);
+               if (!dst_page) {
+                       page_cache_release(src_page);
+                       return -EINVAL;
+               }
+               addr = kmap_atomic(src_page);
+               dst_addr = kmap_atomic(dst_page);
+
+               flush_dcache_page(src_page);
+               flush_dcache_page(dst_page);
+
+               if (memcmp(addr, dst_addr, cmp_len))
+                       ret = BTRFS_SAME_DATA_DIFFERS;
+
+               /*
+                * Atomic kmaps are stack based: release them in the reverse
+                * order of the kmap_atomic() calls above.
+                */
+               kunmap_atomic(dst_addr);
+               kunmap_atomic(addr);
+               page_cache_release(src_page);
+               page_cache_release(dst_page);
+
+               if (ret)
+                       break;
+
+               loff += cmp_len;
+               dst_loff += cmp_len;
+               len -= cmp_len;
+       }
+
+       return ret;
+}
+
+/*
+ * Validate that the byte range [off, off + len) of @inode can be handed to
+ * btrfs_clone(): it must not wrap, must not extend past i_size, and both of
+ * its ends must be aligned to the filesystem block size.
+ *
+ * Returns 0 if the range is acceptable, -EINVAL otherwise.
+ */
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+{
+       u64 blocksize = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
+       u64 end = off + len;
+
+       /* reject ranges that wrap around or run past the end of the file */
+       if (end < off || end > inode->i_size)
+               return -EINVAL;
+
+       /* btrfs_clone() requires both ends to be block aligned */
+       if (!IS_ALIGNED(off, blocksize))
+               return -EINVAL;
+       if (!IS_ALIGNED(end, blocksize))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Dedupe @len bytes at @dst_loff in @dst against @len bytes at @loff in
+ * @src.
+ *
+ * With both inodes and both ranges locked, the ranges are validated and
+ * compared byte for byte; only if they are identical is the source range
+ * cloned over the destination range with btrfs_clone().
+ *
+ * Returns 0 on success (including the trivial zero-length case),
+ * BTRFS_SAME_DATA_DIFFERS if the data does not match, or a negative errno.
+ */
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+                            struct inode *dst, u64 dst_loff)
+{
+       int ret;
+
+       /*
+        * btrfs_clone() can't handle extents in the same file
+        * yet. Once that works, we can drop this check and replace it
+        * with a check for the same inode, but overlapping extents.
+        */
+       if (src == dst)
+               return -EINVAL;
+
+       /*
+        * Nothing to do for an empty range.  Bail out before locking: the
+        * lock/unlock helpers compute the range end as off + len - 1, which
+        * for len == 0 lies before the start (and wraps at offset 0).
+        */
+       if (len == 0)
+               return 0;
+
+       btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+       ret = extent_same_check_offsets(src, loff, len);
+       if (ret)
+               goto out_unlock;
+
+       ret = extent_same_check_offsets(dst, dst_loff, len);
+       if (ret)
+               goto out_unlock;
+
+       /* don't make the dst file partly checksummed */
+       if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+           (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
+       /* only share the extent if the contents really are identical */
+       ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+       if (ret == 0)
+               ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out_unlock:
+       btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+       return ret;
+}
+
+/*
+ * Upper bound on the number of bytes a single BTRFS_IOC_FILE_EXTENT_SAME
+ * call will compare and dedupe per destination.
+ */
+#define BTRFS_MAX_DEDUPE_LEN   (16 * 1024 * 1024)
+
+/*
+ * BTRFS_IOC_FILE_EXTENT_SAME handler.
+ *
+ * @file is the source file and must be open for reading.  @argp points to a
+ * userspace struct btrfs_ioctl_same_args followed by dest_count
+ * struct btrfs_ioctl_same_extent_info entries.  For each entry the source
+ * range is deduped against the entry's destination file/offset, and the
+ * entry's status and bytes_deduped fields are written back to userspace.
+ *
+ * A destination must be open for writing unless the caller has
+ * CAP_SYS_ADMIN, must be a regular file, and must live on the same mount and
+ * superblock as the source.  Failures for one destination are reported in
+ * that entry's status and do not stop processing of the remaining entries.
+ *
+ * Returns 0 once all entries have been processed, or a negative errno if
+ * the call as a whole failed (bad source file, fault while accessing
+ * userspace memory, no write access to the mount).
+ */
+static long btrfs_ioctl_file_extent_same(struct file *file,
+                                        void __user *argp)
+{
+       /* userspace pointer: only ever accessed via the uaccess helpers */
+       struct btrfs_ioctl_same_args __user *args = argp;
+       struct btrfs_ioctl_same_args same;
+       struct btrfs_ioctl_same_extent_info info;
+       struct inode *src = file->f_dentry->d_inode;
+       struct file *dst_file = NULL;
+       struct inode *dst;
+       u64 off;
+       u64 len;
+       int i;
+       int ret;
+       u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+       bool is_admin = capable(CAP_SYS_ADMIN);
+
+       if (!(file->f_mode & FMODE_READ))
+               return -EINVAL;
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
+       if (copy_from_user(&same, args, sizeof(same))) {
+               ret = -EFAULT;
+               goto out;
+       }
+
+       off = same.logical_offset;
+       len = same.length;
+
+       /*
+        * Limit the total length we will dedupe for each operation.
+        * This is intended to bound the total time spent in this
+        * ioctl to something sane.
+        */
+       if (len > BTRFS_MAX_DEDUPE_LEN)
+               len = BTRFS_MAX_DEDUPE_LEN;
+
+       if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+               /*
+                * Btrfs does not support blocksize < page_size. As a
+                * result, btrfs_cmp_data() won't correctly handle
+                * this situation without an update.
+                */
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = -EISDIR;
+       if (S_ISDIR(src->i_mode))
+               goto out;
+
+       ret = -EACCES;
+       if (!S_ISREG(src->i_mode))
+               goto out;
+
+       ret = 0;
+       for (i = 0; i < same.dest_count; i++) {
+               /*
+                * copy_from_user() validates the whole entry, which also
+                * covers the two fields written back with the unchecked
+                * __put_user_unaligned() at the bottom of the loop.
+                */
+               if (copy_from_user(&info, &args->info[i], sizeof(info))) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+
+               info.bytes_deduped = 0;
+
+               dst_file = fget(info.fd);
+               if (!dst_file) {
+                       info.status = -EBADF;
+                       goto next;
+               }
+
+               if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+                       info.status = -EINVAL;
+                       goto next;
+               }
+
+               /* both the mount and the superblock must match the source */
+               info.status = -EXDEV;
+               if (file->f_path.mnt != dst_file->f_path.mnt)
+                       goto next;
+
+               dst = dst_file->f_dentry->d_inode;
+               if (src->i_sb != dst->i_sb)
+                       goto next;
+
+               if (S_ISDIR(dst->i_mode)) {
+                       info.status = -EISDIR;
+                       goto next;
+               }
+
+               if (!S_ISREG(dst->i_mode)) {
+                       info.status = -EACCES;
+                       goto next;
+               }
+
+               info.status = btrfs_extent_same(src, off, len, dst,
+                                               info.logical_offset);
+               if (info.status == 0)
+                       info.bytes_deduped += len;
+
+next:
+               if (dst_file)
+                       fput(dst_file);
+
+               /* report the per-destination result back to userspace */
+               if (__put_user_unaligned(info.status, &args->info[i].status) ||
+                   __put_user_unaligned(info.bytes_deduped,
+                                        &args->info[i].bytes_deduped)) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+       }
+
+out:
+       mnt_drop_write_file(file);
+       return ret;
+}
+
 /**
  * btrfs_clone() - clone a range from inode file to another
  *
@@ -4242,6 +4519,8 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_get_fslabel(file, argp);
        case BTRFS_IOC_SET_FSLABEL:
                return btrfs_ioctl_set_fslabel(file, argp);
+       case BTRFS_IOC_FILE_EXTENT_SAME:
+               return btrfs_ioctl_file_extent_same(file, argp);
        }
 
        return -ENOTTY;
index 05aed70627e24392d87cb4bc350e16bf32ceca44..90d7bd9d839c6cede80be69e0c5844aa08d4398b 100644 (file)
@@ -305,6 +305,31 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
 #define BTRFS_DEFRAG_RANGE_START_IO 2
 
+/*
+ * Positive, non-errno value stored in btrfs_ioctl_same_extent_info.status
+ * when the source and destination data differ and nothing was deduped.
+ */
+#define BTRFS_SAME_DATA_DIFFERS        1
+/* For extent-same ioctl: one entry per destination file/offset */
+struct btrfs_ioctl_same_extent_info {
+       __s64 fd;               /* in - destination file */
+       __u64 logical_offset;   /* in - start of extent in destination */
+       __u64 bytes_deduped;    /* out - total # of bytes we were able
+                                * to dedupe from this file */
+       /* status of this dedupe operation:
+        * 0 if dedup succeeds
+        * < 0 for error
+        * == BTRFS_SAME_DATA_DIFFERS if data differs
+        */
+       __s32 status;           /* out - see above description */
+       __u32 reserved;
+};
+
+/*
+ * Argument for BTRFS_IOC_FILE_EXTENT_SAME: a fixed header describing the
+ * source extent, immediately followed by dest_count destination entries.
+ */
+struct btrfs_ioctl_same_args {
+       __u64 logical_offset;   /* in - start of extent in source */
+       __u64 length;           /* in - length of extent */
+       __u16 dest_count;       /* in - total elements in info array */
+       __u16 reserved1;
+       __u32 reserved2;
+       /* in/out - one entry per destination, dest_count of them */
+       struct btrfs_ioctl_same_extent_info info[0];
+};
+
 struct btrfs_ioctl_space_info {
        __u64 flags;
        __u64 total_bytes;
@@ -579,4 +604,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code)
                                      struct btrfs_ioctl_get_dev_stats)
 #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
                                    struct btrfs_ioctl_dev_replace_args)
+/* Dedupe one source extent against a list of destination extents */
+#define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \
+                                        struct btrfs_ioctl_same_args)
+
 #endif /* _UAPI_LINUX_BTRFS_H */