drivers/md/multipath.c

   1 /*
   2  * multipath.c : Multiple Devices driver for Linux
   3  *
   4  * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
   5  *
   6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   7  *
   8  * MULTIPATH management functions.
   9  *
  10  * derived from raid1.c.
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License as published by
  14  * the Free Software Foundation; either version 2, or (at your option)
  15  * any later version.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * (for example /usr/src/linux/COPYING); if not, write to the Free
  19  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20  */
  21
  22 #include <linux/module.h>
  23 #include <linux/slab.h>
  24 #include <linux/spinlock.h>
  25 #include <linux/raid/multipath.h>
  26 #include <linux/buffer_head.h>
  27 #include <asm/atomic.h>
  28
  29 #define MAJOR_NR MD_MAJOR
  30 #define MD_DRIVER
  31 #define MD_PERSONALITY
  32
  33 #define MAX_WORK_PER_DISK 128
  34
  35 #define NR_RESERVED_BUFS        32
  36
  37
  38 static int multipath_map (multipath_conf_t *conf)
  39 {
  40         int i, disks = conf->raid_disks;
  41
  42         /*
  43          * Later we do read balancing on the read side
  44          * now we use the first available disk.
  45          */
  46
  47         rcu_read_lock();
  48         for (i = 0; i < disks; i++) {
  49                 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
  50                 if (rdev && test_bit(In_sync, &rdev->flags)) {
  51                         atomic_inc(&rdev->nr_pending);
  52                         rcu_read_unlock();
  53                         return i;
  54                 }
  55         }
  56         rcu_read_unlock();
  57
  58         printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
  59         return (-1);
  60 }
  61
  62 static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
  63 {
  64         unsigned long flags;
  65         mddev_t *mddev = mp_bh->mddev;
  66         multipath_conf_t *conf = mddev_to_conf(mddev);
  67
  68         spin_lock_irqsave(&conf->device_lock, flags);
  69         list_add(&mp_bh->retry_list, &conf->retry_list);
  70         spin_unlock_irqrestore(&conf->device_lock, flags);
  71         md_wakeup_thread(mddev->thread);
  72 }
  73
  74
  75 /*
  76  * multipath_end_bh_io() is called when we have finished servicing a multipathed
  77  * operation and are ready to return a success/failure code to the buffer
  78  * cache layer.
  79  */
  80 static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
  81 {
  82         struct bio *bio = mp_bh->master_bio;
  83         multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
  84
  85         bio_endio(bio, bio->bi_size, err);
  86         mempool_free(mp_bh, conf->pool);
  87 }
  88
  89 static int multipath_end_request(struct bio *bio, unsigned int bytes_done,
  90                                  int error)
  91 {
  92         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
  93         struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
  94         multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
  95         mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
  96
  97         if (bio->bi_size)
  98                 return 1;
  99
 100         if (uptodate)
 101                 multipath_end_bh_io(mp_bh, 0);
 102         else if (!bio_rw_ahead(bio)) {
 103                 /*
 104                  * oops, IO error:
 105                  */
 106                 char b[BDEVNAME_SIZE];
 107                 md_error (mp_bh->mddev, rdev);
 108                 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
 109                        bdevname(rdev->bdev,b),
 110                        (unsigned long long)bio->bi_sector);
 111                 multipath_reschedule_retry(mp_bh);
 112         } else
 113                 multipath_end_bh_io(mp_bh, error);
 114         rdev_dec_pending(rdev, conf->mddev);
 115         return 0;
 116 }
 117
 118 static void unplug_slaves(mddev_t *mddev)
 119 {
 120         multipath_conf_t *conf = mddev_to_conf(mddev);
 121         int i;
 122
 123         rcu_read_lock();
 124         for (i=0; i<mddev->raid_disks; i++) {
 125                 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
 126                 if (rdev && !test_bit(Faulty, &rdev->flags)
 127                     && atomic_read(&rdev->nr_pending)) {
 128                         request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
 129
 130                         atomic_inc(&rdev->nr_pending);
 131                         rcu_read_unlock();
 132
 133                         if (r_queue->unplug_fn)
 134                                 r_queue->unplug_fn(r_queue);
 135
 136                         rdev_dec_pending(rdev, mddev);
 137                         rcu_read_lock();
 138                 }
 139         }
 140         rcu_read_unlock();
 141 }
 142
 143 static void multipath_unplug(request_queue_t *q)
 144 {
 145         unplug_slaves(q->queuedata);
 146 }
 147
 148
 149 static int multipath_make_request (request_queue_t *q, struct bio * bio)
 150 {
 151         mddev_t *mddev = q->queuedata;
 152         multipath_conf_t *conf = mddev_to_conf(mddev);
 153         struct multipath_bh * mp_bh;
 154         struct multipath_info *multipath;
 155         const int rw = bio_data_dir(bio);
 156
 157         if (unlikely(bio_barrier(bio))) {
 158                 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
 159                 return 0;
 160         }
 161
 162         mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
 163
 164         mp_bh->master_bio = bio;
 165         mp_bh->mddev = mddev;
 166
 167         disk_stat_inc(mddev->gendisk, ios[rw]);
 168         disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
 169
 170         mp_bh->path = multipath_map(conf);
 171         if (mp_bh->path < 0) {
 172                 bio_endio(bio, bio->bi_size, -EIO);
 173                 mempool_free(mp_bh, conf->pool);
 174                 return 0;
 175         }
 176         multipath = conf->multipaths + mp_bh->path;
 177
 178         mp_bh->bio = *bio;
 179         mp_bh->bio.bi_sector += multipath->rdev->data_offset;
 180         mp_bh->bio.bi_bdev = multipath->rdev->bdev;
 181         mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST);
 182         mp_bh->bio.bi_end_io = multipath_end_request;
 183         mp_bh->bio.bi_private = mp_bh;
 184         generic_make_request(&mp_bh->bio);
 185         return 0;
 186 }
 187
 188 static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 189 {
 190         multipath_conf_t *conf = mddev_to_conf(mddev);
 191         int i;
 192
 193         seq_printf (seq, " [%d/%d] [", conf->raid_disks,
 194                                                  conf->working_disks);
 195         for (i = 0; i < conf->raid_disks; i++)
 196                 seq_printf (seq, "%s",
 197                                conf->multipaths[i].rdev &&
 198                                test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
 199         seq_printf (seq, "]");
 200 }
 201
 202 static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
 203                                  sector_t *error_sector)
 204 {
 205         mddev_t *mddev = q->queuedata;
 206         multipath_conf_t *conf = mddev_to_conf(mddev);
 207         int i, ret = 0;
 208
 209         rcu_read_lock();
 210         for (i=0; i<mddev->raid_disks && ret == 0; i++) {
 211                 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
 212                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
 213                         struct block_device *bdev = rdev->bdev;
 214                         request_queue_t *r_queue = bdev_get_queue(bdev);
 215
 216                         if (!r_queue->issue_flush_fn)
 217                                 ret = -EOPNOTSUPP;
 218                         else {
 219                                 atomic_inc(&rdev->nr_pending);
 220                                 rcu_read_unlock();
 221                                 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
 222                                                               error_sector);
 223                                 rdev_dec_pending(rdev, mddev);
 224                                 rcu_read_lock();
 225                         }
 226                 }
 227         }
 228         rcu_read_unlock();
 229         return ret;
 230 }
 231
 232 /*
 233  * Careful, this can execute in IRQ contexts as well!
 234  */
 235 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 236 {
 237         multipath_conf_t *conf = mddev_to_conf(mddev);
 238
 239         if (conf->working_disks <= 1) {
 240                 /*
 241                  * Uh oh, we can do nothing if this is our last path, but
 242                  * first check if this is a queued request for a device
 243                  * which has just failed.
 244                  */
 245                 printk(KERN_ALERT
 246                         "multipath: only one IO path left and IO error.\n");
 247                 /* leave it active... it's all we have */
 248         } else {
 249                 /*
 250                  * Mark disk as unusable
 251                  */
 252                 if (!test_bit(Faulty, &rdev->flags)) {
 253                         char b[BDEVNAME_SIZE];
 254                         clear_bit(In_sync, &rdev->flags);
 255                         set_bit(Faulty, &rdev->flags);
 256                         mddev->sb_dirty = 1;
 257                         conf->working_disks--;
 258                         printk(KERN_ALERT "multipath: IO failure on %s,"
 259                                 " disabling IO path. \n Operation continuing"
 260                                 " on %d IO paths.\n",
 261                                 bdevname (rdev->bdev,b),
 262                                 conf->working_disks);
 263                 }
 264         }
 265 }
 266
 267 static void print_multipath_conf (multipath_conf_t *conf)
 268 {
 269         int i;
 270         struct multipath_info *tmp;
 271
 272         printk("MULTIPATH conf printout:\n");
 273         if (!conf) {
 274                 printk("(conf==NULL)\n");
 275                 return;
 276         }
 277         printk(" --- wd:%d rd:%d\n", conf->working_disks,
 278                          conf->raid_disks);
 279
 280         for (i = 0; i < conf->raid_disks; i++) {
 281                 char b[BDEVNAME_SIZE];
 282                 tmp = conf->multipaths + i;
 283                 if (tmp->rdev)
 284                         printk(" disk%d, o:%d, dev:%s\n",
 285                                 i,!test_bit(Faulty, &tmp->rdev->flags),
 286                                bdevname(tmp->rdev->bdev,b));
 287         }
 288 }
 289
 290
 291 static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 292 {
 293         multipath_conf_t *conf = mddev->private;
 294         struct request_queue *q;
 295         int found = 0;
 296         int path;
 297         struct multipath_info *p;
 298
 299         print_multipath_conf(conf);
 300
 301         for (path=0; path<mddev->raid_disks; path++)
 302                 if ((p=conf->multipaths+path)->rdev == NULL) {
 303                         q = rdev->bdev->bd_disk->queue;
 304                         blk_queue_stack_limits(mddev->queue, q);
 305
 306                 /* as we don't honour merge_bvec_fn, we must never risk
 307                  * violating it, so limit ->max_sector to one PAGE, as
 308                  * a one page request is never in violation.
 309                  * (Note: it is very unlikely that a device with
 310                  * merge_bvec_fn will be involved in multipath.)
 311                  */
 312                         if (q->merge_bvec_fn &&
 313                             mddev->queue->max_sectors > (PAGE_SIZE>>9))
 314                                 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 315
 316                         conf->working_disks++;
 317                         rdev->raid_disk = path;
 318                         set_bit(In_sync, &rdev->flags);
 319                         rcu_assign_pointer(p->rdev, rdev);
 320                         found = 1;
 321                 }
 322
 323         print_multipath_conf(conf);
 324         return found;
 325 }
 326
 327 static int multipath_remove_disk(mddev_t *mddev, int number)
 328 {
 329         multipath_conf_t *conf = mddev->private;
 330         int err = 0;
 331         mdk_rdev_t *rdev;
 332         struct multipath_info *p = conf->multipaths + number;
 333
 334         print_multipath_conf(conf);
 335
 336         rdev = p->rdev;
 337         if (rdev) {
 338                 if (test_bit(In_sync, &rdev->flags) ||
 339                     atomic_read(&rdev->nr_pending)) {
 340                         printk(KERN_ERR "hot-remove-disk, slot %d is identified"                                " but is still operational!\n", number);
 341                         err = -EBUSY;
 342                         goto abort;
 343                 }
 344                 p->rdev = NULL;
 345                 synchronize_rcu();
 346                 if (atomic_read(&rdev->nr_pending)) {
 347                         /* lost the race, try later */
 348                         err = -EBUSY;
 349                         p->rdev = rdev;
 350                 }
 351         }
 352 abort:
 353
 354         print_multipath_conf(conf);
 355         return err;
 356 }
 357
 358
 359
 360 /*
 361  * This is a kernel thread which:
 362  *
 363  *      1.      Retries failed read operations on working multipaths.
 364  *      2.      Updates the raid superblock when problems encounter.
 365  *      3.      Performs writes following reads for array syncronising.
 366  */
 367
 368 static void multipathd (mddev_t *mddev)
 369 {
 370         struct multipath_bh *mp_bh;
 371         struct bio *bio;
 372         unsigned long flags;
 373         multipath_conf_t *conf = mddev_to_conf(mddev);
 374         struct list_head *head = &conf->retry_list;
 375
 376         md_check_recovery(mddev);
 377         for (;;) {
 378                 char b[BDEVNAME_SIZE];
 379                 spin_lock_irqsave(&conf->device_lock, flags);
 380                 if (list_empty(head))
 381                         break;
 382                 mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
 383                 list_del(head->prev);
 384                 spin_unlock_irqrestore(&conf->device_lock, flags);
 385
 386                 bio = &mp_bh->bio;
 387                 bio->bi_sector = mp_bh->master_bio->bi_sector;
 388
 389                 if ((mp_bh->path = multipath_map (conf))<0) {
 390                         printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
 391                                 " error for block %llu\n",
 392                                 bdevname(bio->bi_bdev,b),
 393                                 (unsigned long long)bio->bi_sector);
 394                         multipath_end_bh_io(mp_bh, -EIO);
 395                 } else {
 396                         printk(KERN_ERR "multipath: %s: redirecting sector %llu"
 397                                 " to another IO path\n",
 398                                 bdevname(bio->bi_bdev,b),
 399                                 (unsigned long long)bio->bi_sector);
 400                         *bio = *(mp_bh->master_bio);
 401                         bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
 402                         bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
 403                         bio->bi_rw |= (1 << BIO_RW_FAILFAST);
 404                         bio->bi_end_io = multipath_end_request;
 405                         bio->bi_private = mp_bh;
 406                         generic_make_request(bio);
 407                 }
 408         }
 409         spin_unlock_irqrestore(&conf->device_lock, flags);
 410 }
 411
 412 static int multipath_run (mddev_t *mddev)
 413 {
 414         multipath_conf_t *conf;
 415         int disk_idx;
 416         struct multipath_info *disk;
 417         mdk_rdev_t *rdev;
 418         struct list_head *tmp;
 419
 420         if (mddev->level != LEVEL_MULTIPATH) {
 421                 printk("multipath: %s: raid level not set to multipath IO (%d)\n",
 422                        mdname(mddev), mddev->level);
 423                 goto out;
 424         }
 425         /*
 426          * copy the already verified devices into our private MULTIPATH
 427          * bookkeeping area. [whatever we allocate in multipath_run(),
 428          * should be freed in multipath_stop()]
 429          */
 430
 431         conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
 432         mddev->private = conf;
 433         if (!conf) {
 434                 printk(KERN_ERR
 435                         "multipath: couldn't allocate memory for %s\n",
 436                         mdname(mddev));
 437                 goto out;
 438         }
 439
 440         conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
 441                                    GFP_KERNEL);
 442         if (!conf->multipaths) {
 443                 printk(KERN_ERR
 444                         "multipath: couldn't allocate memory for %s\n",
 445                         mdname(mddev));
 446                 goto out_free_conf;
 447         }
 448
 449         conf->working_disks = 0;
 450         ITERATE_RDEV(mddev,rdev,tmp) {
 451                 disk_idx = rdev->raid_disk;
 452                 if (disk_idx < 0 ||
 453                     disk_idx >= mddev->raid_disks)
 454                         continue;
 455
 456                 disk = conf->multipaths + disk_idx;
 457                 disk->rdev = rdev;
 458
 459                 blk_queue_stack_limits(mddev->queue,
 460                                        rdev->bdev->bd_disk->queue);
 461                 /* as we don't honour merge_bvec_fn, we must never risk
 462                  * violating it, not that we ever expect a device with
 463                  * a merge_bvec_fn to be involved in multipath */
 464                 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
 465                     mddev->queue->max_sectors > (PAGE_SIZE>>9))
 466                         blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 467
 468                 if (!test_bit(Faulty, &rdev->flags))
 469                         conf->working_disks++;
 470         }
 471
 472         conf->raid_disks = mddev->raid_disks;
 473         mddev->sb_dirty = 1;
 474         conf->mddev = mddev;
 475         spin_lock_init(&conf->device_lock);
 476         INIT_LIST_HEAD(&conf->retry_list);
 477
 478         if (!conf->working_disks) {
 479                 printk(KERN_ERR "multipath: no operational IO paths for %s\n",
 480                         mdname(mddev));
 481                 goto out_free_conf;
 482         }
 483         mddev->degraded = conf->raid_disks = conf->working_disks;
 484
 485         conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS,
 486                                                  sizeof(struct multipath_bh));
 487         if (conf->pool == NULL) {
 488                 printk(KERN_ERR
 489                         "multipath: couldn't allocate memory for %s\n",
 490                         mdname(mddev));
 491                 goto out_free_conf;
 492         }
 493
 494         {
 495                 mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath");
 496                 if (!mddev->thread) {
 497                         printk(KERN_ERR "multipath: couldn't allocate thread"
 498                                 " for %s\n", mdname(mddev));
 499                         goto out_free_conf;
 500                 }
 501         }
 502
 503         printk(KERN_INFO
 504                 "multipath: array %s active with %d out of %d IO paths\n",
 505                 mdname(mddev), conf->working_disks, mddev->raid_disks);
 506         /*
 507          * Ok, everything is just fine now
 508          */
 509         mddev->array_size = mddev->size;
 510
 511         mddev->queue->unplug_fn = multipath_unplug;
 512         mddev->queue->issue_flush_fn = multipath_issue_flush;
 513
 514         return 0;
 515
 516 out_free_conf:
 517         if (conf->pool)
 518                 mempool_destroy(conf->pool);
 519         kfree(conf->multipaths);
 520         kfree(conf);
 521         mddev->private = NULL;
 522 out:
 523         return -EIO;
 524 }
 525
 526
 527 static int multipath_stop (mddev_t *mddev)
 528 {
 529         multipath_conf_t *conf = mddev_to_conf(mddev);
 530
 531         md_unregister_thread(mddev->thread);
 532         mddev->thread = NULL;
 533         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 534         mempool_destroy(conf->pool);
 535         kfree(conf->multipaths);
 536         kfree(conf);
 537         mddev->private = NULL;
 538         return 0;
 539 }
 540
 541 static struct mdk_personality multipath_personality =
 542 {
 543         .name           = "multipath",
 544         .level          = LEVEL_MULTIPATH,
 545         .owner          = THIS_MODULE,
 546         .make_request   = multipath_make_request,
 547         .run            = multipath_run,
 548         .stop           = multipath_stop,
 549         .status         = multipath_status,
 550         .error_handler  = multipath_error,
 551         .hot_add_disk   = multipath_add_disk,
 552         .hot_remove_disk= multipath_remove_disk,
 553 };
 554
 555 static int __init multipath_init (void)
 556 {
 557         return register_md_personality (&multipath_personality);
 558 }
 559
 560 static void __exit multipath_exit (void)
 561 {
 562         unregister_md_personality (&multipath_personality);
 563 }
 564
 565 module_init(multipath_init);
 566 module_exit(multipath_exit);
 567 MODULE_LICENSE("GPL");
 568 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
 569 MODULE_ALIAS("md-multipath");
 570 MODULE_ALIAS("md-level--4");