Btrfs: Fix the defragmentation code and the block relocation code for data=ordered
author Chris Mason <chris.mason@oracle.com>
Thu, 24 Jul 2008 15:57:52 +0000 (11:57 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:05 +0000 (11:04 -0400)
Before setting an extent to delalloc, the code needs to wait for
pending ordered extents.

Also, the relocation code needs to wait for ordered IO before scanning
the block group again. This is because the old extents are not removed
until the IO for the new extents is finished.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h

index 8ecac2e77a438e464bc0add50f83b0c0d88d1966..6675e916ebcd8a78eaa1c4c3c54d1502061552bf 100644 (file)
@@ -538,6 +538,13 @@ struct btrfs_fs_info {
        struct list_head dead_roots;
        atomic_t nr_async_submits;
 
+       /*
+        * this is used by the balancing code to wait for all the pending
+        * ordered extents
+        */
+       spinlock_t ordered_extent_lock;
+       struct list_head ordered_extents;
+
        /*
         * there is a pool of worker threads for checksumming during writes
         * and a pool for checksumming after reads.  This is because readers
index 7ce3f83c5dd633bab76b3a535361b28013b134d3..ec01062eb41d23500aa18ba9634d250f5557c99e 100644 (file)
@@ -1252,6 +1252,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->btree_inode->i_nlink = 1;
        fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
+       INIT_LIST_HEAD(&fs_info->ordered_extents);
+       spin_lock_init(&fs_info->ordered_extent_lock);
+
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
 
index febc6295c7a9c8c634277aa1b11c701e689df756..f92b297e7da5f8a96d81df20689e026cb63b0b89 100644 (file)
@@ -2640,6 +2640,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
        struct file_ra_state *ra;
        unsigned long total_read = 0;
        unsigned long ra_pages;
+       struct btrfs_ordered_extent *ordered;
        struct btrfs_trans_handle *trans;
 
        ra = kzalloc(sizeof(*ra), GFP_NOFS);
@@ -2658,9 +2659,9 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
                                       calc_ra(i, last_index, ra_pages));
                }
                total_read++;
-               if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
+again:
+               if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
                        goto truncate_racing;
-
                page = grab_cache_page(inode->i_mapping, i);
                if (!page) {
                        goto out_unlock;
@@ -2674,18 +2675,24 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
                                goto out_unlock;
                        }
                }
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-               ClearPageDirty(page);
-#else
-               cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
                wait_on_page_writeback(page);
-               set_page_extent_mapped(page);
+
                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
                page_end = page_start + PAGE_CACHE_SIZE - 1;
-
                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
+               ordered = btrfs_lookup_ordered_extent(inode, page_start);
+               if (ordered) {
+                       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                       unlock_page(page);
+                       page_cache_release(page);
+                       btrfs_start_ordered_extent(inode, ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+                       goto again;
+               }
+               set_page_extent_mapped(page);
+
+
                set_extent_delalloc(io_tree, page_start,
                                    page_end, GFP_NOFS);
                set_page_dirty(page);
@@ -2694,10 +2701,18 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
                unlock_page(page);
                page_cache_release(page);
        }
-       balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-                                          total_read);
 
 out_unlock:
+       /* we have to start the IO in order to get the ordered extents
+        * instantiated.  This allows the relocation code to wait
+        * for all the ordered extents to hit the disk.
+        *
+        * Otherwise, it would constantly loop over the same extents
+        * because the old ones don't get deleted until the IO is
+        * started
+        */
+       btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
+                              WB_SYNC_NONE);
        kfree(ra);
        trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
        if (trans) {
@@ -3238,6 +3253,8 @@ next:
 
                btrfs_clean_old_snapshots(tree_root);
 
+               btrfs_wait_ordered_extents(tree_root);
+
                trans = btrfs_start_transaction(tree_root, 1);
                btrfs_commit_transaction(trans, tree_root);
                mutex_lock(&root->fs_info->alloc_mutex);
index 83f17a5cbd6a23e67e34c6b1d1da7a7f4a1595e2..a61f2e7e2db57a35dc4e36b8c99a58069cecf3d4 100644 (file)
@@ -213,6 +213,7 @@ int btrfs_defrag_file(struct file *file)
        struct inode *inode = fdentry(file)->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct btrfs_ordered_extent *ordered;
        struct page *page;
        unsigned long last_index;
        unsigned long ra_pages = root->fs_info->bdi.ra_pages;
@@ -234,6 +235,7 @@ int btrfs_defrag_file(struct file *file)
                                       min(last_index, i + ra_pages - 1));
                }
                total_read++;
+again:
                page = grab_cache_page(inode->i_mapping, i);
                if (!page)
                        goto out_unlock;
@@ -247,18 +249,23 @@ int btrfs_defrag_file(struct file *file)
                        }
                }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-               ClearPageDirty(page);
-#else
-               cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
                wait_on_page_writeback(page);
-               set_page_extent_mapped(page);
 
                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
                page_end = page_start + PAGE_CACHE_SIZE - 1;
-
                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+               ordered = btrfs_lookup_ordered_extent(inode, page_start);
+               if (ordered) {
+                       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                       unlock_page(page);
+                       page_cache_release(page);
+                       btrfs_start_ordered_extent(inode, ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+                       goto again;
+               }
+               set_page_extent_mapped(page);
+
                set_extent_delalloc(io_tree, page_start,
                                    page_end, GFP_NOFS);
 
index e42fd233e04cdef068acb94df92303b1aa09dc75..676e4bd65c529cb3332da5a62679a44ed5439bcd 100644 (file)
@@ -167,20 +167,28 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        entry->file_offset = file_offset;
        entry->start = start;
        entry->len = len;
+       entry->inode = inode;
+
        /* one ref for the tree */
        atomic_set(&entry->refs, 1);
        init_waitqueue_head(&entry->wait);
        INIT_LIST_HEAD(&entry->list);
+       INIT_LIST_HEAD(&entry->root_extent_list);
 
        node = tree_insert(&tree->tree, file_offset,
                           &entry->rb_node);
        if (node) {
-               entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-               atomic_inc(&entry->refs);
+               printk("warning dup entry from add_ordered_extent\n");
+               BUG();
        }
        set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
                           entry_end(entry) - 1, GFP_NOFS);
 
+       spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+       list_add_tail(&entry->root_extent_list,
+                     &BTRFS_I(inode)->root->fs_info->ordered_extents);
+       spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
        mutex_unlock(&tree->mutex);
        BUG_ON(node);
        return 0;
@@ -285,11 +293,55 @@ int btrfs_remove_ordered_extent(struct inode *inode,
        rb_erase(node, &tree->tree);
        tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+
+       spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+       list_del_init(&entry->root_extent_list);
+       spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
        mutex_unlock(&tree->mutex);
        wake_up(&entry->wait);
        return 0;
 }
 
+int btrfs_wait_ordered_extents(struct btrfs_root *root)
+{
+       struct list_head splice;
+       struct list_head *cur;
+       struct btrfs_ordered_extent *ordered;
+       struct inode *inode;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&root->fs_info->ordered_extent_lock);
+       list_splice_init(&root->fs_info->ordered_extents, &splice);
+       while(!list_empty(&splice)) {
+               cur = splice.next;
+               ordered = list_entry(cur, struct btrfs_ordered_extent,
+                                    root_extent_list);
+               list_del_init(&ordered->root_extent_list);
+               atomic_inc(&ordered->refs);
+               inode = ordered->inode;
+
+               /*
+                * the inode can't go away until all the pages are gone
+                * and the pages won't go away while there is still
+                * an ordered extent and the ordered extent won't go
+                * away until it is off this list.  So, we can safely
+                * increment i_count here and call iput later
+                *
+                * The entries being walked were moved off
+                * fs_info->ordered_extents onto the local splice list
+                * above, and each one is unlinked from it before it is
+                * processed.
+                *
+                * ordered_extent_lock is dropped before
+                * btrfs_start_ordered_extent() is called and is taken
+                * again before the next pass of the loop.  The extra
+                * references taken on the ordered extent and on the
+                * inode are released in between, with the lock not held.
+                */
+               atomic_inc(&inode->i_count);
+               spin_unlock(&root->fs_info->ordered_extent_lock);
+
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+               iput(inode);
+
+               spin_lock(&root->fs_info->ordered_extent_lock);
+       }
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+       return 0;
+}
+
 /*
  * Used to start IO or wait for a given ordered extent to finish.
  *
index 199cb0b4f1d9a1d7a0de54fdca8ea0d33f9eefa6..5efe6b63c74c28f6e9e5c43308656144d967adbf 100644 (file)
@@ -80,6 +80,9 @@ struct btrfs_ordered_extent {
        /* reference count */
        atomic_t refs;
 
+       /* the inode we belong to */
+       struct inode *inode;
+
        /* list of checksums for insertion when the extent io is done */
        struct list_head list;
 
@@ -88,6 +91,9 @@ struct btrfs_ordered_extent {
 
        /* our friendly rbtree entry */
        struct rb_node rb_node;
+
+       /* a per root list of all the pending ordered extents */
+       struct list_head root_extent_list;
 };
 
 
@@ -137,4 +143,5 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
                                       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
                           loff_t end, int sync_mode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root);
 #endif