Btrfs: use WRITE_SYNC for synchronous writes

author Chris Mason <chris.mason@oracle.com>
Mon, 20 Apr 2009 19:50:09 +0000 (15:50 -0400)
committer Chris Mason <chris.mason@oracle.com>
Mon, 20 Apr 2009 19:53:08 +0000 (15:53 -0400)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 92caa8035f36f9beecc3c21551722a0eefdd911e..fec18b43c2c37eff80ae7d0c932ce7ad2f280251 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2095,10 +2095,10 @@ static int write_dev_supers(struct btrfs_device *device,
                                 device->barriers = 0;
                                 get_bh(bh);
                                 lock_buffer(bh);
-                               ret = submit_bh(WRITE, bh);
+                               ret = submit_bh(WRITE_SYNC, bh);
                         }
                 } else {
-                       ret = submit_bh(WRITE, bh);
+                       ret = submit_bh(WRITE_SYNC, bh);
                 }
  
                 if (!ret && wait) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index eb2bee8b7fbfb19fb37dc26cbe6fe01ec92ea3ef..483b6727aaafdcc6c98f1567a0fae4379ac48750 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
         /* tells writepage not to lock the state bits for this range
          * it still does the unlocking
          */
-       int extent_locked;
+       unsigned int extent_locked:1;
+
+       /* tells the submit_bio code to use a WRITE_SYNC */
+       unsigned int sync_io:1;
  };
  
  int __init extent_io_init(void)
@@ -2136,8 +2139,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
         u64 delalloc_end;
         int page_started;
         int compressed;
+       int write_flags;
         unsigned long nr_written = 0;
  
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               write_flags = WRITE_SYNC_PLUG;
+       else
+               write_flags = WRITE;
+
         WARN_ON(!PageLocked(page));
         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
         if (page->index > end_index ||
@@ -2314,9 +2323,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                                        (unsigned long long)end);
                         }
  
-                       ret = submit_extent_page(WRITE, tree, page, sector,
-                                                iosize, pg_offset, bdev,
-                                                &epd->bio, max_nr,
+                       ret = submit_extent_page(write_flags, tree, page,
+                                                sector, iosize, pg_offset,
+                                                bdev, &epd->bio, max_nr,
                                                  end_bio_extent_writepage,
                                                  0, 0, 0);
                         if (ret)
@@ -2460,15 +2469,23 @@ retry:
         return ret;
  }
  
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
  {
-       struct extent_page_data *epd = data;
         if (epd->bio) {
-               submit_one_bio(WRITE, epd->bio, 0, 0);
+               if (epd->sync_io)
+                       submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+               else
+                       submit_one_bio(WRITE, epd->bio, 0, 0);
                 epd->bio = NULL;
         }
  }
  
+static noinline void flush_write_bio(void *data)
+{
+       struct extent_page_data *epd = data;
+       flush_epd_write_bio(epd);
+}
+
  int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                           get_extent_t *get_extent,
                           struct writeback_control *wbc)
@@ -2480,6 +2497,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                 .tree = tree,
                 .get_extent = get_extent,
                 .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
         };
         struct writeback_control wbc_writepages = {
                 .bdi            = wbc->bdi,
@@ -2490,13 +2508,11 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                 .range_end      = (loff_t)-1,
         };
  
-
         ret = __extent_writepage(page, wbc, &epd);
  
         extent_write_cache_pages(tree, mapping, &wbc_writepages,
                                  __extent_writepage, &epd, flush_write_bio);
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
         return ret;
  }
  
@@ -2515,6 +2531,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                 .tree = tree,
                 .get_extent = get_extent,
                 .extent_locked = 1,
+               .sync_io = mode == WB_SYNC_ALL,
         };
         struct writeback_control wbc_writepages = {
                 .bdi            = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2557,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                 start += PAGE_CACHE_SIZE;
         }
  
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
         return ret;
  }
  
@@ -2556,13 +2572,13 @@ int extent_writepages(struct extent_io_tree *tree,
                 .tree = tree,
                 .get_extent = get_extent,
                 .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
         };
  
         ret = extent_write_cache_pages(tree, mapping, wbc,
                                        __extent_writepage, &epd,
                                        flush_write_bio);
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
         return ret;
  }
  
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c

index 53c87b197d701671afde64fd7f8f00b59fdb8cd2..d6f0806c682ff83fc11b019ac4f26d1f732acbec 100644 (file)
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
         /* start IO across the range first to instantiate any delalloc
          * extents
          */
-       btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+       btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
  
         /* The compression code will leave pages locked but return from
          * writepage without setting the page writeback.  Starting again
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index e0913e4697284673b4c00cd58bf911fc44fe4019..e53835b885945a910c7b5390bfd6331563bbf239 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
         return NULL;
  }
  
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+                       struct bio *head, struct bio *tail)
+{
+
+       struct bio *old_head;
+
+       old_head = pending_bios->head;
+       pending_bios->head = head;
+       if (pending_bios->tail)
+               tail->bi_next = old_head;
+       else
+               pending_bios->tail = tail;
+}
+
  /*
   * we try to collect pending bios for a device so we don't get a large
   * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
         struct bio *pending;
         struct backing_dev_info *bdi;
         struct btrfs_fs_info *fs_info;
+       struct btrfs_pending_bios *pending_bios;
         struct bio *tail;
         struct bio *cur;
         int again = 0;
-       unsigned long num_run = 0;
+       unsigned long num_run;
+       unsigned long num_sync_run;
         unsigned long limit;
         unsigned long last_waited = 0;
  
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
         limit = btrfs_async_submit_limit(fs_info);
         limit = limit * 2 / 3;
  
+       /* we want to make sure that every time we switch from the sync
+        * list to the normal list, we unplug
+        */
+       num_sync_run = 0;
+
  loop:
         spin_lock(&device->io_lock);
+       num_run = 0;
  
  loop_lock:
+
         /* take all the bios off the list at once and process them
          * later on (without the lock held).  But, remember the
          * tail and other pointers so the bios can be properly reinserted
          * into the list if we hit congestion
          */
-       pending = device->pending_bios;
-       tail = device->pending_bio_tail;
+       if (device->pending_sync_bios.head)
+               pending_bios = &device->pending_sync_bios;
+       else
+               pending_bios = &device->pending_bios;
+
+       pending = pending_bios->head;
+       tail = pending_bios->tail;
         WARN_ON(pending && !tail);
-       device->pending_bios = NULL;
-       device->pending_bio_tail = NULL;
  
         /*
          * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
          * device->running_pending is used to synchronize with the
          * schedule_bio code.
          */
-       if (pending) {
-               again = 1;
-               device->running_pending = 1;
-       } else {
+       if (device->pending_sync_bios.head == NULL &&
+           device->pending_bios.head == NULL) {
                 again = 0;
                 device->running_pending = 0;
+       } else {
+               again = 1;
+               device->running_pending = 1;
         }
+
+       pending_bios->head = NULL;
+       pending_bios->tail = NULL;
+
         spin_unlock(&device->io_lock);
  
+       /*
+        * if we're doing the regular priority list, make sure we unplug
+        * for any high prio bios we've sent down
+        */
+       if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
         while (pending) {
+
+               rmb();
+               if (pending_bios != &device->pending_sync_bios &&
+                   device->pending_sync_bios.head &&
+                   num_run > 16) {
+                       cond_resched();
+                       spin_lock(&device->io_lock);
+                       requeue_list(pending_bios, pending, tail);
+                       goto loop_lock;
+               }
+
                 cur = pending;
                 pending = pending->bi_next;
                 cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
                         wake_up(&fs_info->async_submit_wait);
  
                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-               bio_get(cur);
                 submit_bio(cur->bi_rw, cur);
-               bio_put(cur);
                 num_run++;
+               if (bio_sync(cur))
+                       num_sync_run++;
+
+               if (need_resched()) {
+                       if (num_sync_run) {
+                               blk_run_backing_dev(bdi, NULL);
+                               num_sync_run = 0;
+                       }
+                       cond_resched();
+               }
  
                 /*
                  * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
                  */
                 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
                     fs_info->fs_devices->open_devices > 1) {
-                       struct bio *old_head;
                         struct io_context *ioc;
  
                         ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
                                  * against it before looping
                                  */
                                 last_waited = ioc->last_waited;
+                               if (need_resched()) {
+                                       if (num_sync_run) {
+                                               blk_run_backing_dev(bdi, NULL);
+                                               num_sync_run = 0;
+                                       }
+                                       cond_resched();
+                               }
                                 continue;
                         }
                         spin_lock(&device->io_lock);
-
-                       old_head = device->pending_bios;
-                       device->pending_bios = pending;
-                       if (device->pending_bio_tail)
-                               tail->bi_next = old_head;
-                       else
-                               device->pending_bio_tail = tail;
-
+                       requeue_list(pending_bios, pending, tail);
                         device->running_pending = 1;
  
                         spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
                         goto done;
                 }
         }
+
+       if (num_sync_run) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
+       cond_resched();
         if (again)
                 goto loop;
  
         spin_lock(&device->io_lock);
-       if (device->pending_bios)
+       if (device->pending_bios.head || device->pending_sync_bios.head)
                 goto loop_lock;
         spin_unlock(&device->io_lock);
  
@@ -2497,7 +2562,7 @@ again:
                         max_errors = 1;
                 }
         }
-       if (multi_ret && rw == WRITE &&
+       if (multi_ret && (rw & (1 << BIO_RW)) &&
             stripes_allocated < stripes_required) {
                 stripes_allocated = map->num_stripes;
                 free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
                                  int rw, struct bio *bio)
  {
         int should_queue = 1;
+       struct btrfs_pending_bios *pending_bios;
  
         /* don't bother with additional async steps for reads, right now */
         if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
         bio->bi_rw |= rw;
  
         spin_lock(&device->io_lock);
+       if (bio_sync(bio))
+               pending_bios = &device->pending_sync_bios;
+       else
+               pending_bios = &device->pending_bios;
  
-       if (device->pending_bio_tail)
-               device->pending_bio_tail->bi_next = bio;
+       if (pending_bios->tail)
+               pending_bios->tail->bi_next = bio;
  
-       device->pending_bio_tail = bio;
-       if (!device->pending_bios)
-               device->pending_bios = bio;
+       pending_bios->tail = bio;
+       if (!pending_bios->head)
+               pending_bios->head = bio;
         if (device->running_pending)
                 should_queue = 0;
  
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h

index 2185de72ff7dad59154f5a568624be89ab7e8b97..5836327ba5dd261ed078965e88fa9d4e4512e96d 100644 (file)
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
  #include "async-thread.h"
  
  struct buffer_head;
+struct btrfs_pending_bios {
+       struct bio *head;
+       struct bio *tail;
+};
+
  struct btrfs_device {
         struct list_head dev_list;
         struct list_head dev_alloc_list;
         struct btrfs_fs_devices *fs_devices;
         struct btrfs_root *dev_root;
-       struct bio *pending_bios;
-       struct bio *pending_bio_tail;
+
+       /* regular prio bios */
+       struct btrfs_pending_bios pending_bios;
+       /* WRITE_SYNC bios */
+       struct btrfs_pending_bios pending_sync_bios;
+
         int running_pending;
         u64 generation;
author	Chris Mason <chris.mason@oracle.com>
	Mon, 20 Apr 2009 19:50:09 +0000 (15:50 -0400)
committer	Chris Mason <chris.mason@oracle.com>
	Mon, 20 Apr 2009 19:53:08 +0000 (15:53 -0400)
fs/btrfs/disk-io.c		patch \| blob \| blame \| history
fs/btrfs/extent_io.c		patch \| blob \| blame \| history
fs/btrfs/ordered-data.c		patch \| blob \| blame \| history
fs/btrfs/volumes.c		patch \| blob \| blame \| history
fs/btrfs/volumes.h		patch \| blob \| blame \| history