*/
struct io {
unsigned long error_bits;
- unsigned long eopnotsupp_bits;
atomic_t count;
struct task_struct *sleeper;
struct dm_io_client *client;
*---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
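+ /* with barriers gone there is no -EOPNOTSUPP case to track; a failure just sets the region's bit in error_bits */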
- if (error) {
+ if (error)
set_bit(region, &io->error_bits);
- if (error == -EOPNOTSUPP)
- set_bit(region, &io->eopnotsupp_bits);
- }
if (atomic_dec_and_test(&io->count)) {
if (io->sleeper)
sector_t remaining = where->count;
/*
- * where->count may be zero if rw holds a write barrier and we
- * need to send a zero-sized barrier.
+ * where->count may be zero if rw holds a flush and we need to
+ * send a zero-sized flush.
*/
do {
/*
*/
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
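+ /* an empty flush has a zero count but must still be sent to each region */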
- if (where[i].count || (rw & REQ_HARDBARRIER))
+ if (where[i].count || (rw & REQ_FLUSH))
do_region(rw, i, where + i, dp, io);
}
return -EIO;
}
-retry:
io->error_bits = 0;
- io->eopnotsupp_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = current;
io->client = client;
}
set_current_state(TASK_RUNNING);
- if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
- rw &= ~REQ_HARDBARRIER;
- goto retry;
- }
-
if (error_bits)
*error_bits = io->error_bits;
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
- io->eopnotsupp_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = NULL;
io->client = client;
spinlock_t deferred_lock;
/*
- * An error from the barrier request currently being processed.
+ * An error from the flush request currently being processed.
*/
- int barrier_error;
+ int flush_error;
/*
* Protect barrier_error from concurrent endio processing
* in request-based dm.
*/
spinlock_t barrier_error_lock;
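+ /* request-based dm has no flush support yet, so it keeps its own barrier_error */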
+ int barrier_error;
/*
* Processing queue (flush/barriers)
/* sysfs handle */
struct kobject kobj;
- /* zero-length barrier that will be cloned and submitted to targets */
- struct bio barrier_bio;
+ /* zero-length flush that will be cloned and submitted to targets */
+ struct bio flush_bio;
};
/*
/*
* After this is decremented the bio must not be touched if it is
- * a barrier.
+ * a flush.
*/
dm_disk(md)->part0.in_flight[rw] = pending =
atomic_dec_return(&md->pending[rw]);
*/
spin_lock_irqsave(&md->deferred_lock, flags);
if (__noflush_suspending(md)) {
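+ /* a requeued flush is pushed back from process_flush(), not here */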
- if (!(io->bio->bi_rw & REQ_HARDBARRIER))
+ if (!(io->bio->bi_rw & REQ_FLUSH))
bio_list_add_head(&md->deferred,
io->bio);
} else
io_error = io->error;
bio = io->bio;
- if (bio->bi_rw & REQ_HARDBARRIER) {
+ if (bio->bi_rw & REQ_FLUSH) {
/*
- * There can be just one barrier request so we use
+ * There can be just one flush request so we use
* a per-device variable for error reporting.
* Note that you can't touch the bio after end_io_acct
- *
- * We ignore -EOPNOTSUPP for empty flush reported by
- * underlying devices. We assume that if the device
- * doesn't support empty barriers, it doesn't need
- * cache flushing commands.
*/
- if (!md->barrier_error &&
- !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
- md->barrier_error = io_error;
+ if (!md->flush_error)
+ md->flush_error = io_error;
end_io_acct(io);
free_io(md, io);
} else {
}
/*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
*/
static struct bio *split_bvec(struct bio *bio, sector_t sector,
unsigned short idx, unsigned int offset,
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+ clone->bi_rw = bio->bi_rw;
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
- clone->bi_rw &= ~REQ_HARDBARRIER;
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
__issue_target_request(ci, ti, request_nr, len);
}
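+ /* send a zero-length flush clone to each target in the table */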
-static int __clone_and_map_empty_barrier(struct clone_info *ci)
+static int __clone_and_map_flush(struct clone_info *ci)
{
unsigned target_nr = 0;
struct dm_target *ti;
sector_t len = 0, max;
struct dm_target_io *tio;
- if (unlikely(bio_empty_barrier(bio)))
- return __clone_and_map_empty_barrier(ci);
-
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __clone_and_map_discard(ci);
ci.map = dm_get_live_table(md);
if (unlikely(!ci.map)) {
- if (!(bio->bi_rw & REQ_HARDBARRIER))
+ if (!(bio->bi_rw & REQ_FLUSH))
bio_io_error(bio);
else
- if (!md->barrier_error)
- md->barrier_error = -EIO;
+ if (!md->flush_error)
+ md->flush_error = -EIO;
return;
}
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
ci.sector = bio->bi_sector;
- ci.sector_count = bio_sectors(bio);
- if (unlikely(bio_empty_barrier(bio)))
+ if (!(bio->bi_rw & REQ_FLUSH))
+ ci.sector_count = bio_sectors(bio);
+ else {
+ /* all FLUSH bios reaching here should be empty */
+ WARN_ON_ONCE(bio_has_data(bio));
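+ /* a dummy count of 1 so the clone-and-map loop below still runs for the flush */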
ci.sector_count = 1;
+ }
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
- while (ci.sector_count && !error)
- error = __clone_and_map(&ci);
+ while (ci.sector_count && !error) {
+ if (!(bio->bi_rw & REQ_FLUSH))
+ error = __clone_and_map(&ci);
+ else
+ error = __clone_and_map_flush(&ci);
+ }
/* drop the extra reference count */
dec_pending(ci.io, error);
part_stat_unlock();
/*
- * If we're suspended or the thread is processing barriers
+ * If we're suspended or the thread is processing flushes
* we have to queue this io for later.
*/
if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
- unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+ (bio->bi_rw & REQ_FLUSH)) {
up_read(&md->io_lock);
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
md->queue->unplug_fn = dm_unplug_all;
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
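+ /* advertise flush/FUA support to the block layer for bio-based dm */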
+ blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
}
/*
blk_queue_softirq_done(md->queue, dm_softirq_done);
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
- blk_queue_flush(md->queue, REQ_FLUSH);
+ /* no flush support for request-based dm yet */
+ blk_queue_flush(md->queue, 0);
elv_register_queue(md->queue);
return r;
}
-static void dm_flush(struct mapped_device *md)
+static void process_flush(struct mapped_device *md, struct bio *bio)
{
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
- bio_init(&md->barrier_bio);
- md->barrier_bio.bi_bdev = md->bdev;
- md->barrier_bio.bi_rw = WRITE_BARRIER;
- __split_and_process_bio(md, &md->barrier_bio);
+ md->flush_error = 0;
+ /* handle REQ_FLUSH */
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
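+ /* in-flight I/O has drained; now send an empty flush to every target */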
-}
-static void process_barrier(struct mapped_device *md, struct bio *bio)
-{
- md->barrier_error = 0;
+ bio_init(&md->flush_bio);
+ md->flush_bio.bi_bdev = md->bdev;
+ md->flush_bio.bi_rw = WRITE_FLUSH;
+ __split_and_process_bio(md, &md->flush_bio);
- dm_flush(md);
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
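+ /* md->flush_error now holds the result of the preflush */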
- if (!bio_empty_barrier(bio)) {
- __split_and_process_bio(md, bio);
- /*
- * If the request isn't supported, don't waste time with
- * the second flush.
- */
- if (md->barrier_error != -EOPNOTSUPP)
- dm_flush(md);
+ /* if it's an empty flush or the preflush failed, we're done */
+ if (!bio_has_data(bio) || md->flush_error) {
+ if (md->flush_error != DM_ENDIO_REQUEUE)
+ bio_endio(bio, md->flush_error);
+ else {
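+ /* requeue requested: push the flush back to be retried later */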
+ spin_lock_irq(&md->deferred_lock);
+ bio_list_add_head(&md->deferred, bio);
+ spin_unlock_irq(&md->deferred_lock);
+ }
+ return;
}
- if (md->barrier_error != DM_ENDIO_REQUEUE)
- bio_endio(bio, md->barrier_error);
- else {
- spin_lock_irq(&md->deferred_lock);
- bio_list_add_head(&md->deferred, bio);
- spin_unlock_irq(&md->deferred_lock);
- }
+ /* issue data + REQ_FUA */
+ bio->bi_rw &= ~REQ_FLUSH;
+ __split_and_process_bio(md, bio);
}
/*
if (dm_request_based(md))
generic_make_request(c);
else {
- if (c->bi_rw & REQ_HARDBARRIER)
- process_barrier(md, c);
+ if (c->bi_rw & REQ_FLUSH)
+ process_flush(md, c);
else
__split_and_process_bio(md, c);
}