fs: Provide function to unmap metadata for a range of blocks
author		Jan Kara <jack@suse.cz>		Fri, 4 Nov 2016 17:08:11 +0000 (18:08 +0100)
committer	Jens Axboe <axboe@fb.com>	Fri, 4 Nov 2016 20:34:47 +0000 (14:34 -0600)
Provide a function equivalent to unmap_underlying_metadata() for a range
of blocks. We somewhat optimize the function: it uses pagevec lookups
instead of looking up buffer heads one by one, and it pins buffer heads
with the page lock instead of the mapping's private_lock, which improves
scalability.
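
As a sketch of the intended call-site conversion (the caller below is
hypothetical; 'sb', 'first_block' and 'count' are illustrative names, not
part of this patch): a filesystem that has just allocated 'count'
contiguous blocks for file data previously invalidated stale buffer-cache
aliases one block at a time, and can now use a single ranged call:

	sector_t blk;

	/* Before: one buffer-head lookup per block. */
	for (blk = 0; blk < count; blk++)
		unmap_underlying_metadata(sb->s_bdev, first_block + blk);

	/* After: one pagevec-based pass over the whole range. */
	clean_bdev_aliases(sb->s_bdev, first_block, count);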

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
fs/buffer.c
include/linux/buffer_head.h

diff --git a/fs/buffer.c b/fs/buffer.c
index af5776da814af7d678ef98a90d9f2d9e488bb64b..f8beca55240a2ef048a4a6c4021082588ded3079 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -43,6 +43,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/pagevec.h>
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -1636,6 +1637,90 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
 }
 EXPORT_SYMBOL(unmap_underlying_metadata);
 
+/**
+ * clean_bdev_aliases: clean a range of buffers in a block device
+ * @bdev: Block device to clean buffers in
+ * @block: Start of a range of blocks to clean
+ * @len: Number of blocks to clean
+ *
+ * We are taking a range of blocks for data and we don't want writeback of any
+ * buffer-cache aliases from the moment this function returns until something
+ * explicitly marks the buffer dirty (hopefully that will not happen until we
+ * free that block ;-) We don't even need to mark the buffers not-uptodate -
+ * nobody can expect anything from a newly allocated buffer anyway. We used to
+ * use unmap_buffer() for such invalidation, but that was wrong: we definitely
+ * don't want to mark the alias unmapped, for example - that would confuse
+ * anyone who might pick it up with bread() afterwards...
+ *
+ * Also note that bforget() doesn't lock the buffer, so there can be writeout
+ * I/O going on against recently-freed buffers. We don't wait on that I/O in
+ * bforget() - it's more efficient to wait on the I/O only if we really need
+ * to. That happens here.
+ */
+void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
+{
+       struct inode *bd_inode = bdev->bd_inode;
+       struct address_space *bd_mapping = bd_inode->i_mapping;
+       struct pagevec pvec;
+       pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+       pgoff_t end;
+       int i;
+       struct buffer_head *bh;
+       struct buffer_head *head;
+
+       end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
+       pagevec_init(&pvec, 0);
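+       /*
+        * Ask for at most a pagevec's worth of pages per lookup, and never
+        * look past the page that covers the last block of the range.
+        */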
+       while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
+
+                       index = page->index;
+                       if (index > end)
+                               break;
+                       if (!page_has_buffers(page))
+                               continue;
+                       /*
+                        * We use the page lock instead of bd_mapping->private_lock
+                        * to pin buffers here since we can afford to sleep and
+                        * it scales better than a global spinlock.
+                        */
+                       lock_page(page);
+                       /* Recheck when the page is locked which pins bhs */
+                       if (!page_has_buffers(page))
+                               goto unlock_page;
+                       head = page_buffers(page);
+                       bh = head;
+                       do {
+                               if (!buffer_mapped(bh))
+                                       goto next;
+                               if (bh->b_blocknr >= block + len)
+                                       break;
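+                               /*
+                                * Clear any stale dirty bit and wait out
+                                * writeback already in flight against the
+                                * freed block before it can be reused.
+                                */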
+                               clear_buffer_dirty(bh);
+                               wait_on_buffer(bh);
+                               clear_buffer_req(bh);
+next:
+                               bh = bh->b_this_page;
+                       } while (bh != head);
+unlock_page:
+                       unlock_page(page);
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+               index++;
+       }
+}
+EXPORT_SYMBOL(clean_bdev_aliases);
+
 /*
  * Size is a power-of-two in the range 512..PAGE_SIZE,
  * and the case we care about most is PAGE_SIZE.
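
To make the page-index arithmetic in clean_bdev_aliases() concrete, here is
a standalone userspace sketch of the same conversion, assuming 4 KiB pages
(PAGE_SHIFT == 12) and 1 KiB blocks (i_blkbits == 10); the geometry and the
numbers are illustrative only:

	#include <stdio.h>

	int main(void)
	{
		unsigned int page_shift = 12;	/* 4 KiB pages */
		unsigned int blkbits = 10;	/* 1 KiB blocks, 4 per page */
		unsigned long long block = 103, len = 8;

		/* Same shifts as in clean_bdev_aliases(). */
		unsigned long long index = block >> (page_shift - blkbits);
		unsigned long long end = (block + len - 1) >> (page_shift - blkbits);

		/* Blocks 103..110 live in page indices 25..27. */
		printf("pages %llu..%llu\n", index, end);
		return 0;
	}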

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index ebbacd14d4504a192d7c3f7443012edac433485d..9c9c73ce7d4f8f438710e56175f467b91e337803 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -169,6 +169,8 @@ void invalidate_inode_buffers(struct inode *);
 int remove_inode_buffers(struct inode *inode);
 int sync_mapping_buffers(struct address_space *mapping);
 void unmap_underlying_metadata(struct block_device *bdev, sector_t block);
+void clean_bdev_aliases(struct block_device *bdev, sector_t block,
+                       sector_t len);
 
 void mark_buffer_async_write(struct buffer_head *bh);
 void __wait_on_buffer(struct buffer_head *);
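
For call sites that invalidate a single just-mapped buffer (the common
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr) pattern), a thin
wrapper over the new interface could look like the sketch below; the helper
name is an assumption for illustration and is not added by this patch:

	/* Hypothetical single-buffer convenience wrapper. */
	static inline void clean_bdev_bh_alias(struct buffer_head *bh)
	{
		clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
	}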