writeback: flush inode cgroup wb switches instead of pinning super_block
author Tejun Heo <tj@kernel.org>
Mon, 29 Feb 2016 23:28:53 +0000 (18:28 -0500)
committer Jens Axboe <axboe@fb.com>
Thu, 3 Mar 2016 21:42:50 +0000 (14:42 -0700)
If cgroup writeback is in use, inodes can be scheduled for
asynchronous wb switching.  Before 5ff8eaac1636 ("writeback: keep
superblock pinned during cgroup writeback association switches"), this
could race with umount leading to super_block being destroyed while
inodes are pinned for wb switching.  5ff8eaac1636 fixed it by bumping
s_active while wb switches are in flight; however, this allowed
in-flight wb switches to make umounts asynchronous when the userland
expected synchronous behavior - e.g. fsck immediately following umount may
fail because the device is still busy.

This patch removes the problematic super_block pinning and instead
makes generic_shutdown_super() flush in-flight wb switches.  wb
switches are now executed on a dedicated isw_wq so that they can be
flushed and isw_nr_in_flight keeps track of the number of in-flight wb
switches so that flushing can be avoided in most cases.

v2: Move cgroup_writeback_umount() further below and add MS_ACTIVE
    check in inode_switch_wbs() as Jan and Al suggested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Tahsin Erdogan <tahsin@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A@mail.gmail.com
Fixes: 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches")
Cc: stable@vger.kernel.org #v4.5
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
fs/fs-writeback.c
fs/super.c
include/linux/writeback.h

index 1f76d8950a57fc7bbdb3f8d98ec056fe89970b5c..5c46ed9f3e14087d4fa7bad553eb360b30fb4fd8 100644 (file)
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 #define WB_FRN_HIST_MAX_SLOTS  (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect upto 5 slots */
 
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
        struct inode_switch_wbs_context *isw =
                container_of(work, struct inode_switch_wbs_context, work);
        struct inode *inode = isw->inode;
-       struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        struct bdi_writeback *old_wb = inode->i_wb;
        struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@ skip_switch:
        wb_put(new_wb);
 
        iput(inode);
-       deactivate_super(sb);
        kfree(isw);
+
+       atomic_dec(&isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
 
        /* needs to grab bh-unsafe locks, bounce to work item */
        INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-       schedule_work(&isw->work);
+       queue_work(isw_wq, &isw->work);
 }
 
 /**
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 
        /* while holding I_WB_SWITCH, no one else can update the association */
        spin_lock(&inode->i_lock);
-
-       if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
-           inode_to_wb(inode) == isw->new_wb)
-               goto out_unlock;
-
-       if (!atomic_inc_not_zero(&inode->i_sb->s_active))
-               goto out_unlock;
-
+       if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+           inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+           inode_to_wb(inode) == isw->new_wb) {
+               spin_unlock(&inode->i_lock);
+               goto out_free;
+       }
        inode->i_state |= I_WB_SWITCH;
        spin_unlock(&inode->i_lock);
 
        ihold(inode);
        isw->inode = inode;
 
+       atomic_inc(&isw_nr_in_flight);
+
        /*
         * In addition to synchronizing among switchers, I_WB_SWITCH tells
         * the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
        call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
        return;
 
-out_unlock:
-       spin_unlock(&inode->i_lock);
 out_free:
        if (isw->new_wb)
                wb_put(isw->new_wb);
@@ -847,6 +848,33 @@ restart:
                wb_put(last_wb);
 }
 
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches.  An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished.  As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+       if (atomic_read(&isw_nr_in_flight)) {
+               synchronize_rcu();
+               flush_workqueue(isw_wq);
+       }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+       isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+       if (!isw_wq)
+               return -ENOMEM;
+       return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static struct bdi_writeback *
index 1182af8fd5ff3ede7459305440d50178821c061b..74914b1bae70ff09d451450cb596fb3b5639d969 100644 (file)
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
                sb->s_flags &= ~MS_ACTIVE;
 
                fsnotify_unmount_inodes(sb);
+               cgroup_writeback_umount();
 
                evict_inodes(sb);
 
index b333c945e57117aa3d80f6ec2df47f6513c82260..d0b5ca5d4e080346e8a657c6c6aee540e47776e7 100644 (file)
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
                    size_t bytes);
+void cgroup_writeback_umount(void);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc,
 {
 }
 
+static inline void cgroup_writeback_umount(void)
+{
+}
+
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
 /*