mm: implement writeback livelock avoidance using page tagging
author Jan Kara <jack@suse.cz>
Tue, 10 Aug 2010 00:19:12 +0000 (17:19 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 10 Aug 2010 03:44:59 +0000 (20:44 -0700)
We try to avoid livelocks of writeback when some process steadily creates dirty
pages in a mapping we are writing out.  For memory-cleaning writeback,
using nr_to_write works reasonably well but we cannot really use it for
data integrity writeback.  This patch tries to solve the problem.

The idea is simple: Tag all pages that should be written back with a
special tag (TOWRITE) in the radix tree.  This can be done rather quickly
and thus livelocks should not happen in practice.  Then we start doing the
hard work of locking pages and sending them to disk only for those pages
that have TOWRITE tag set.

Note: Adding new radix tree tag grows radix tree node from 288 to 296
bytes for 32-bit archs and from 552 to 560 bytes for 64-bit archs.
However, the number of slab/slub items per page remains the same (13 and 7
respectively).

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/fs.h
include/linux/radix-tree.h
mm/page-writeback.c

index e5106e49bd2ccce31cad95a2fe5f30efb5233dae..488efec09d148dc91972962048540adfe261e5ac 100644 (file)
@@ -687,6 +687,7 @@ struct block_device {
  */
 #define PAGECACHE_TAG_DIRTY    0
 #define PAGECACHE_TAG_WRITEBACK        1
+#define PAGECACHE_TAG_TOWRITE  2
 
 int mapping_tagged(struct address_space *mapping, int tag);
 
index a4b00e9cca90148981d5334ad5889043539304da..634b8e674ac578e2916b110a28b992bf2e8dfd22 100644 (file)
@@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 
 /*** radix-tree API starts here ***/
 
-#define RADIX_TREE_MAX_TAGS 2
+#define RADIX_TREE_MAX_TAGS 3
 
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
index 37498ef6154836943f3478304bf7440ab255669a..df8202ebc7b80c0849dfd1486fb0130eed875557 100644 (file)
@@ -804,6 +804,41 @@ void __init page_writeback_init(void)
        prop_descriptor_init(&vm_dirties, shift);
 }
 
+/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+#define WRITEBACK_TAG_BATCH 4096
+void tag_pages_for_writeback(struct address_space *mapping,
+                            pgoff_t start, pgoff_t end)
+{
+       unsigned long tagged;
+
+       do {
+               spin_lock_irq(&mapping->tree_lock);
+               tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+                               &start, end, WRITEBACK_TAG_BATCH,
+                               PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+               spin_unlock_irq(&mapping->tree_lock);
+               WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+               cond_resched();
+       } while (tagged >= WRITEBACK_TAG_BATCH);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -818,6 +853,13 @@ void __init page_writeback_init(void)
  * the call was made get new I/O started against them.  If wbc->sync_mode is
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
+ *
+ * To avoid livelocks (when another process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
  */
 int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +875,7 @@ int write_cache_pages(struct address_space *mapping,
        pgoff_t done_index;
        int cycled;
        int range_whole = 0;
+       int tag;
 
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
@@ -849,29 +892,19 @@ int write_cache_pages(struct address_space *mapping,
                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                        range_whole = 1;
                cycled = 1; /* ignore range_cyclic tests */
-
-               /*
-                * If this is a data integrity sync, cap the writeback to the
-                * current end of file. Any extension to the file that occurs
-                * after this is a new write and we don't need to write those
-                * pages out to fulfil our data integrity requirements. If we
-                * try to write them out, we can get stuck in this scan until
-                * the concurrent writer stops adding dirty pages and extending
-                * EOF.
-                */
-               if (wbc->sync_mode == WB_SYNC_ALL &&
-                   wbc->range_end == LLONG_MAX) {
-                       end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
-               }
        }
-
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
 retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
        done_index = index;
        while (!done && (index <= end)) {
                int i;
 
-               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                             PAGECACHE_TAG_DIRTY,
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
@@ -1327,6 +1360,9 @@ int test_set_page_writeback(struct page *page)
                        radix_tree_tag_clear(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
+               radix_tree_tag_clear(&mapping->page_tree,
+                                    page_index(page),
+                                    PAGECACHE_TAG_TOWRITE);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestSetPageWriteback(page);