[PATCH] md: Final stages of raid5 expand code
drivers/md/md.c (GitHub/mt8127/android_kernel_alcatel_ttab.git)
/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>
#include <linux/poll.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
        return mddev->sync_speed_min ?
                mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
        return mddev->sync_speed_max ?
                mddev->sync_speed_max : sysctl_speed_limit_max;
}
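
/*
 * Example (illustrative): raise the guaranteed resync rate for a single
 * array via its sysfs knob, or system-wide via the sysctl named above:
 *
 *      echo 50000 > /sys/block/md0/md/sync_speed_min
 *      echo 50000 > /proc/sys/dev/raid/speed_limit_min
 */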

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
        {
                .ctl_name = DEV_RAID,
                .procname = "raid",
                .maxlen   = 0,
                .mode     = 0555,
                .child    = raid_table,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
        {
                .ctl_name = CTL_DEV,
                .procname = "dev",
                .maxlen   = 0,
                .mode     = 0555,
                .child    = raid_dir_table,
        },
        { .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Lets us iterate over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)                                        \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )
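
/*
 * Usage sketch (illustrative): walk every array, printing its name.
 * The macro takes all_mddevs_lock only while stepping the list and
 * holds a reference across the body, so the body may sleep:
 *
 *      mddev_t *mddev;
 *      struct list_head *tmp;
 *      ITERATE_MDDEV(mddev,tmp)
 *              printk("%s\n", mdname(mddev));
 */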


static int md_fail_request (request_queue_t *q, struct bio *bio)
{
        bio_io_error(bio, bio->bi_size);
        return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}

static void mddev_put(mddev_t *mddev)
{
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
                /* that blocks */
                blk_cleanup_queue(mddev->queue);
                /* that also blocks */
                kobject_unregister(&mddev->kobj);
                /* result blows... */
        }
        spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);
        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        kfree(new);
                        return mddev;
                }

        if (new) {
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        init_MUTEX(&new->reconfig_sem);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);

        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
                kfree(new);
                return NULL;
        }
        set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);

        blk_queue_make_request(new->queue, md_fail_request);

        goto retry;
}
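
/*
 * Note on the retry pattern above: the allocation happens with
 * all_mddevs_lock dropped, so after setting 'new' up we jump back to
 * the lookup. If another thread registered the same unit meanwhile,
 * the fresh lookup finds it and kfree()s our unused 'new'; otherwise
 * 'new' is inserted on the second pass.
 */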

static inline int mddev_lock(mddev_t * mddev)
{
        return down_interruptible(&mddev->reconfig_sem);
}

static inline void mddev_lock_uninterruptible(mddev_t * mddev)
{
        down(&mddev->reconfig_sem);
}

static inline int mddev_trylock(mddev_t * mddev)
{
        return down_trylock(&mddev->reconfig_sem);
}

static inline void mddev_unlock(mddev_t * mddev)
{
        up(&mddev->reconfig_sem);

        md_wakeup_thread(mddev->thread);
}

static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->desc_nr == nr)
                        return rdev;
        }
        return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
        }
        return NULL;
}

static struct mdk_personality *find_pers(int level, char *clevel)
{
        struct mdk_personality *pers;
        list_for_each_entry(pers, &pers_list, list) {
                if (level != LEVEL_NONE && pers->level == level)
                        return pers;
                if (strcmp(pers->name, clevel)==0)
                        return pers;
        }
        return NULL;
}

static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
}

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
        sector_t size;

        size = rdev->sb_offset;

        if (chunk_size)
                size &= ~((sector_t)chunk_size/1024 - 1);
        return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page)
                MD_BUG();

        rdev->sb_page = alloc_page(GFP_KERNEL);
        if (!rdev->sb_page) {
                printk(KERN_ALERT "md: out of memory.\n");
                return -EINVAL;
        }

        return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page) {
                put_page(rdev->sb_page);
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_offset = 0;
                rdev->size = 0;
        }
}


static int super_written(struct bio *bio, unsigned int bytes_done, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;
        if (bio->bi_size)
                return 1;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
                md_error(mddev, rdev);

        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
        bio_put(bio);
        return 0;
}

static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
{
        struct bio *bio2 = bio->bi_private;
        mdk_rdev_t *rdev = bio2->bi_private;
        mddev_t *mddev = rdev->mddev;
        if (bio->bi_size)
                return 1;

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
            error == -EOPNOTSUPP) {
                unsigned long flags;
                /* barriers don't appear to be supported :-( */
                set_bit(BarriersNotsupp, &rdev->flags);
                mddev->barriers_work = 0;
                spin_lock_irqsave(&mddev->write_lock, flags);
                bio2->bi_next = mddev->biolist;
                mddev->biolist = bio2;
                spin_unlock_irqrestore(&mddev->write_lock, flags);
                wake_up(&mddev->sb_wait);
                bio_put(bio);
                return 0;
        }
        bio_put(bio2);
        bio->bi_private = rdev;
        return super_written(bio, bytes_done, error);
}
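
/*
 * How the barrier fallback chains together: md_super_write() keeps the
 * original (non-barrier) bio in the clone's bi_private. If the clone
 * fails with -EOPNOTSUPP, the original bio is parked on mddev->biolist
 * and md_super_wait() resubmits it with its own bi_rw, i.e. without
 * BIO_RW_BARRIER. On any other outcome the original is dropped and
 * completion falls through to super_written().
 */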

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
         *
         * As we might need to resubmit the request if BIO_RW_BARRIER
         * causes ENOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        bio->bi_rw = rw;

        atomic_inc(&mddev->pending_writes);
        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                struct bio *rbio;
                rw |= (1<<BIO_RW_BARRIER);
                rbio = bio_clone(bio, GFP_NOIO);
                rbio->bi_private = bio;
                rbio->bi_end_io = super_written_barrier;
                submit_bio(rw, rbio);
        } else
                submit_bio(rw, bio);
}

void md_super_wait(mddev_t *mddev)
{
        /* wait for all superblock writes that were scheduled to complete.
         * if any had to be retried (due to BARRIER problems), retry them
         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
                while (mddev->biolist) {
                        struct bio *bio;
                        spin_lock_irq(&mddev->write_lock);
                        bio = mddev->biolist;
                        mddev->biolist = bio->bi_next ;
                        bio->bi_next = NULL;
                        spin_unlock_irq(&mddev->write_lock);
                        submit_bio(bio->bi_rw, bio);
                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
}

static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
{
        if (bio->bi_size)
                return 1;

        complete((struct completion*)bio->bi_private);
        return 0;
}

int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNC);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);
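
/*
 * Example (illustrative): synchronously read the first 4K of a device
 * into a freshly allocated page; sync_page_io() returns non-zero on
 * success:
 *
 *      struct page *page = alloc_page(GFP_KERNEL);
 *      if (page && !sync_page_io(bdev, 0, PAGE_SIZE, page, READ))
 *              printk(KERN_WARNING "read failed\n");
 */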

static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->sb_page) {
                MD_BUG();
                return -EINVAL;
        }
        if (rdev->sb_loaded)
                return 0;


        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;

fail:
        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
                bdevname(rdev->bdev,b));
        return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
                (sb1->set_uuid1 == sb2->set_uuid1) &&
                (sb1->set_uuid2 == sb2->set_uuid2) &&
                (sb1->set_uuid3 == sb2->set_uuid3))

                return 1;

        return 0;
}


static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        int ret;
        mdp_super_t *tmp1, *tmp2;

        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

        if (!tmp1 || !tmp2) {
                ret = 0;
                printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
                goto abort;
        }

        *tmp1 = *sb1;
        *tmp2 = *sb2;

        /*
         * nr_disks is not constant
         */
        tmp1->nr_disks = 0;
        tmp2->nr_disks = 0;

        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
                ret = 0;
        else
                ret = 1;

abort:
        kfree(tmp1);
        kfree(tmp2);
        return ret;
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
        unsigned int disk_csum, csum;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
        sb->sb_csum = disk_csum;
        return csum;
}

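/*
 * Note: the stored checksum field is zeroed for the duration of the
 * csum_partial() pass (and then restored) so the on-disk value never
 * folds into its own computation. Callers compare via csum_fold() -
 * see super_90_load() - rather than by raw equality.
 */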

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Update the superblock for rdev with data in mddev
 *      This does not write to disc.
 *
 */

struct super_type  {
        char            *name;
        struct module   *owner;
        int             (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
        int             (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};

/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdp_super_t *sb;
        int ret;
        sector_t sb_offset;

        /*
         * Calculate the position of the superblock,
         * it's at the end of the disk.
         *
         * It also happens to be a multiple of 4Kb.
         */
        sb_offset = calc_dev_sboffset(rdev->bdev);
        rdev->sb_offset = sb_offset;

        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;

        ret = -EINVAL;

        bdevname(rdev->bdev, b);
        sb = (mdp_super_t*)page_address(rdev->sb_page);

        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
                       b);
                goto abort;
        }

        if (sb->major_version != 0 ||
            sb->minor_version != 90) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
                goto abort;
        }

        if (sb->raid_disks <= 0)
                goto abort;

        if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
                        b);
                goto abort;
        }

        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;
        rdev->sb_size = MD_SB_BYTES;

        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = sb->this_disk.number;

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
                        goto abort;
                }
                if (!sb_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has same UUID"
                               " but different superblock to %s\n",
                               b, bdevname(refdev->bdev, b2));
                        goto abort;
                }
                ev1 = md_event(sb);
                ev2 = md_event(refsb);
                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        rdev->size = calc_dev_size(rdev, sb->chunk_size);

        if (rdev->size < sb->size && sb->level > 1)
                /* "this cannot possibly happen" ... */
                ret = -EINVAL;

 abort:
        return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
                mddev->persistent = ! sb->not_persistent;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
                mddev->level = sb->level;
                mddev->clevel[0] = 0;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
                mddev->events = md_event(sb);
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
                        if (sb->events_hi == sb->cp_events_hi &&
                                sb->events_lo == sb->cp_events_lo) {
                                mddev->recovery_cp = sb->recovery_cp;
                        } else
                                mddev->recovery_cp = 0;
                }

                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

                mddev->max_disks = MD_SB_DISKS;

                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                    mddev->bitmap_file == NULL) {
                        if (mddev->level != 1 && mddev->level != 4
                            && mddev->level != 5 && mddev->level != 6
                            && mddev->level != 10) {
                                /* FIXME use a better test */
                                printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
                                return -EINVAL;
                        }
                        mddev->bitmap_offset = mddev->default_bitmap_offset;
                }

        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                __u64 ev1 = md_event(sb);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
                __u64 ev1 = md_event(sb);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else /* just a hot-add of a new device, leave raid_disk at -1 */
                return 0;

        if (mddev->level != LEVEL_MULTIPATH) {
                desc = sb->disks + rdev->desc_nr;

                if (desc->state & (1<<MD_DISK_FAULTY))
                        set_bit(Faulty, &rdev->flags);
                else if (desc->state & (1<<MD_DISK_SYNC) &&
                         desc->raid_disk < mddev->raid_disks) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
                }
                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);
        return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_super_t *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int next_spare = mddev->raid_disks;


        /* make rdev->sb match mddev data..
         *
         * 1/ zero out disks
         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
         * 3/ any empty disks < next_spare become removed
         *
         * disks[0] gets initialised to REMOVED because
         * we cannot be sure from other fields if it has
         * been initialised or not.
         */
        int i;
        int active=0, working=0,failed=0,spare=0,nr_disks=0;

        rdev->sb_size = MD_SB_BYTES;

        sb = (mdp_super_t*)page_address(rdev->sb_page);

        memset(sb, 0, sizeof(*sb));

        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
        sb->minor_version = mddev->minor_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
        memcpy(&sb->set_uuid3, mddev->uuid+12,4);

        sb->ctime = mddev->ctime;
        sb->level = mddev->level;
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
        sb->not_persistent = !mddev->persistent;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;

        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
                sb->cp_events_hi = (mddev->events>>32);
                sb->cp_events_lo = (u32)mddev->events;
                if (mddev->recovery_cp == MaxSector)
                        sb->state = (1<< MD_SB_CLEAN);
        } else
                sb->recovery_cp = 0;

        sb->layout = mddev->layout;
        sb->chunk_size = mddev->chunk_size;

        if (mddev->bitmap && mddev->bitmap_file == NULL)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);

        sb->disks[0].state = (1<<MD_DISK_REMOVED);
        ITERATE_RDEV(mddev,rdev2,tmp) {
                mdp_disk_t *d;
                int desc_nr;
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        desc_nr = rdev2->raid_disk;
                else
                        desc_nr = next_spare++;
                rdev2->desc_nr = desc_nr;
                d = &sb->disks[rdev2->desc_nr];
                nr_disks++;
                d->number = rdev2->desc_nr;
                d->major = MAJOR(rdev2->bdev->bd_dev);
                d->minor = MINOR(rdev2->bdev->bd_dev);
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        d->raid_disk = rdev2->raid_disk;
                else
                        d->raid_disk = rdev2->desc_nr; /* compatibility */
                if (test_bit(Faulty, &rdev2->flags))
                        d->state = (1<<MD_DISK_FAULTY);
                else if (test_bit(In_sync, &rdev2->flags)) {
                        d->state = (1<<MD_DISK_ACTIVE);
                        d->state |= (1<<MD_DISK_SYNC);
                        active++;
                        working++;
                } else {
                        d->state = 0;
                        spare++;
                        working++;
                }
                if (test_bit(WriteMostly, &rdev2->flags))
                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
        }
        /* now set the "removed" and "faulty" bits on any missing devices */
        for (i=0 ; i < mddev->raid_disks ; i++) {
                mdp_disk_t *d = &sb->disks[i];
                if (d->state == 0 && d->number == 0) {
                        d->number = i;
                        d->raid_disk = i;
                        d->state = (1<<MD_DISK_REMOVED);
                        d->state |= (1<<MD_DISK_FAULTY);
                        failed++;
                }
        }
        sb->nr_disks = nr_disks;
        sb->active_disks = active;
        sb->working_disks = working;
        sb->failed_disks = failed;
        sb->spare_disks = spare;

        sb->this_disk = sb->disks[rdev->desc_nr];
        sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
        unsigned int disk_csum, csum;
        unsigned long long newcsum;
        int size = 256 + le32_to_cpu(sb->max_dev)*2;
        unsigned int *isuper = (unsigned int*)sb;
        int i;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        newcsum = 0;
        for (i=0; size>=4; size -= 4 )
                newcsum += le32_to_cpu(*isuper++);

        if (size == 2)
                newcsum += le16_to_cpu(*(unsigned short*) isuper);

        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
        sb->sb_csum = disk_csum;
        return cpu_to_le32(csum);
}
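
/*
 * The v1 checksum is a plain sum of little-endian 32-bit words over
 * the 256-byte header plus the 2-byte-per-device role table, folded
 * from 64 to 32 bits at the end; as with calc_sb_csum() above, the
 * sb_csum field itself is zeroed while summing and restored after.
 */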

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        struct mdp_superblock_1 *sb;
        int ret;
        sector_t sb_offset;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        int bmask;

        /*
         * Calculate the position of the superblock.
         * It is always aligned to a 4K boundary and
         * depending on minor_version, it can be:
         * 0: At least 8K, but less than 12K, from end of device
         * 1: At start of device
         * 2: 4K from start of device.
         */
        switch(minor_version) {
        case 0:
                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
                sb_offset -= 8*2;
                sb_offset &= ~(sector_t)(4*2-1);
                /* convert from sectors to K */
                sb_offset /= 2;
                break;
        case 1:
                sb_offset = 0;
                break;
        case 2:
                sb_offset = 4;
                break;
        default:
                return -EINVAL;
        }
        rdev->sb_offset = sb_offset;

        /* superblock is rarely larger than 1K, but it can be larger,
         * and it is safe to read 4k, so we do that
         */
        ret = read_disk_sb(rdev, 4096);
        if (ret) return ret;


        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
            sb->major_version != cpu_to_le32(1) ||
            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
            le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
                return -EINVAL;

        if (calc_sb_1_csum(sb) != sb->sb_csum) {
                printk("md: invalid superblock checksum on %s\n",
                        bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if (le64_to_cpu(sb->data_size) < 10) {
                printk("md: data_size too small on %s\n",
                       bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        rdev->preferred_minor = 0xffff;
        rdev->data_offset = le64_to_cpu(sb->data_offset);
        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
        if (rdev->sb_size & bmask)
                rdev->sb_size = (rdev->sb_size | bmask)+1;

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                struct mdp_superblock_1 *refsb =
                        (struct mdp_superblock_1*)page_address(refdev->sb_page);

                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
                    sb->level != refsb->level ||
                    sb->layout != refsb->layout ||
                    sb->chunksize != refsb->chunksize) {
                        printk(KERN_WARNING "md: %s has strangely different"
                                " superblock to %s\n",
                                bdevname(rdev->bdev,b),
                                bdevname(refdev->bdev,b2));
                        return -EINVAL;
                }
                ev1 = le64_to_cpu(sb->events);
                ev2 = le64_to_cpu(refsb->events);

                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        if (minor_version)
                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
        else
                rdev->size = rdev->sb_offset;
        if (rdev->size < le64_to_cpu(sb->data_size)/2)
                return -EINVAL;
        rdev->size = le64_to_cpu(sb->data_size)/2;
        if (le32_to_cpu(sb->chunksize))
                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);

        if (le32_to_cpu(sb->size) > rdev->size*2)
                return -EINVAL;
        return ret;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
                mddev->persistent = 1;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
                mddev->level = le32_to_cpu(sb->level);
                mddev->clevel[0] = 0;
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = le64_to_cpu(sb->events);
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024 >> 9;

                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);

                mddev->max_disks =  (4096-256)/2;

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
                    mddev->bitmap_file == NULL ) {
                        if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
                            && mddev->level != 10) {
                                printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
                                return -EINVAL;
                        }
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
                }
        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                __u64 ev1 = le64_to_cpu(sb->events);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
                __u64 ev1 = le64_to_cpu(sb->events);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else /* just a hot-add of a new device, leave raid_disk at -1 */
                return 0;

        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
                rdev->desc_nr = le32_to_cpu(sb->dev_number);
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
                        break;
                case 0xfffe: /* faulty */
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
                }
                if (sb->devflags & WriteMostly1)
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);

        return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */

        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        sb->feature_map = 0;
        sb->pad0 = 0;
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));

        sb->utime = cpu_to_le64((__u64)mddev->utime);
        sb->events = cpu_to_le64(mddev->events);
        if (mddev->in_sync)
                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
        else
                sb->resync_offset = cpu_to_le64(0);

        sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);

        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
        sb->size = cpu_to_le64(mddev->size<<1);

        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }

        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;

        sb->max_dev = cpu_to_le32(max_dev);
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);

        ITERATE_RDEV(mddev,rdev2,tmp) {
                i = rdev2->desc_nr;
                if (test_bit(Faulty, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (test_bit(In_sync, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }

        sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
        sb->sb_csum = calc_sb_1_csum(sb);
}


static struct super_type super_types[] = {
        [0] = {
                .name   = "0.90.0",
                .owner  = THIS_MODULE,
                .load_super     = super_90_load,
                .validate_super = super_90_validate,
                .sync_super     = super_90_sync,
        },
        [1] = {
                .name   = "md-1",
                .owner  = THIS_MODULE,
                .load_super     = super_1_load,
                .validate_super = super_1_validate,
                .sync_super     = super_1_sync,
        },
};
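
/*
 * Supporting another on-disk layout means providing the three methods
 * described above (load_super/validate_super/sync_super) and adding an
 * entry to this table; mddev->major_version indexes into it wherever
 * the methods are called.
 */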

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp)
                if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
                        return rdev;

        return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev1,rdev,tmp)
                if (match_dev_unit(mddev2, rdev))
                        return 1;

        return 0;
}

static LIST_HEAD(pending_raid_disks);

static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
        mdk_rdev_t *same_pdev;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        struct kobject *ko;
        char *s;

        if (rdev->mddev) {
                MD_BUG();
                return -EINVAL;
        }
        /* make sure rdev->size exceeds mddev->size */
        if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
                if (mddev->pers)
                        /* Cannot change size, so fail */
                        return -ENOSPC;
                else
                        mddev->size = rdev->size;
        }
        same_pdev = match_dev_unit(mddev, rdev);
        if (same_pdev)
                printk(KERN_WARNING
                        "%s: WARNING: %s appears to be on the same physical"
                        " disk as %s. True protection against single-disk"
                        " failure might be compromised.\n",
                        mdname(mddev), bdevname(rdev->bdev,b),
                        bdevname(same_pdev->bdev,b2));

        /* Verify rdev->desc_nr is unique.
         * If it is -1, assign a free number, else
         * check number is not in use
         */
        if (rdev->desc_nr < 0) {
                int choice = 0;
                if (mddev->pers) choice = mddev->raid_disks;
                while (find_rdev_nr(mddev, choice))
                        choice++;
                rdev->desc_nr = choice;
        } else {
                if (find_rdev_nr(mddev, rdev->desc_nr))
                        return -EBUSY;
        }
        bdevname(rdev->bdev,b);
        if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
                return -ENOMEM;
        while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
                *s = '!';

        list_add(&rdev->same_set, &mddev->disks);
        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", b);

        rdev->kobj.parent = &mddev->kobj;
        kobject_add(&rdev->kobj);

        if (rdev->bdev->bd_part)
                ko = &rdev->bdev->bd_part->kobj;
        else
                ko = &rdev->bdev->bd_disk->kobj;
        sysfs_create_link(&rdev->kobj, ko, "block");
        bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
        return 0;
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->mddev) {
                MD_BUG();
                return;
        }
        bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
        list_del_init(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
        sysfs_remove_link(&rdev->kobj, "block");
        kobject_del(&rdev->kobj);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
        int err = 0;
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];

        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
                return PTR_ERR(bdev);
        }
        err = bd_claim(bdev, rdev);
        if (err) {
                printk(KERN_ERR "md: could not bd_claim %s.\n",
                        bdevname(bdev, b));
                blkdev_put(bdev);
                return err;
        }
        rdev->bdev = bdev;
        return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
        struct block_device *bdev = rdev->bdev;
        rdev->bdev = NULL;
        if (!bdev)
                MD_BUG();
        bd_release(bdev);
        blkdev_put(bdev);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: export_rdev(%s)\n",
                bdevname(rdev->bdev,b));
        if (rdev->mddev)
                MD_BUG();
        free_disk_sb(rdev);
        list_del_init(&rdev->same_set);
#ifndef MODULE
        md_autodetect_dev(rdev->bdev->bd_dev);
#endif
        unlock_rdev(rdev);
        kobject_put(&rdev->kobj);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
        unbind_rdev_from_array(rdev);
        export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (!rdev->mddev) {
                        MD_BUG();
                        continue;
                }
                kick_rdev_from_array(rdev);
        }
        if (!list_empty(&mddev->disks))
                MD_BUG();
        mddev->raid_disks = 0;
        mddev->major_version = 0;
}

static void print_desc(mdp_disk_t *desc)
{
        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
                desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
        int i;

        printk(KERN_INFO
                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
                sb->major_version, sb->minor_version, sb->patch_version,
                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
                sb->ctime);
        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
                sb->md_minor, sb->layout, sb->chunk_size);
        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
                sb->utime, sb->state, sb->active_disks, sb->working_disks,
                sb->failed_disks, sb->spare_disks,
                sb->sb_csum, (unsigned long)sb->events_lo);

        printk(KERN_INFO);
        for (i = 0; i < MD_SB_DISKS; i++) {
                mdp_disk_t *desc;

                desc = sb->disks + i;
                if (desc->number || desc->major || desc->minor ||
                    desc->raid_disk || (desc->state && (desc->state != 4))) {
                        printk("     D %2d: ", i);
                        print_desc(desc);
                }
        }
        printk(KERN_INFO "md:     THIS: ");
        print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
                bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
                rdev->desc_nr);
        if (rdev->sb_loaded) {
                printk(KERN_INFO "md: rdev superblock:\n");
                print_sb((mdp_super_t*)page_address(rdev->sb_page));
        } else
                printk(KERN_INFO "md: no rdev superblock!\n");
}

void md_print_devices(void)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev;
        mddev_t *mddev;
        char b[BDEVNAME_SIZE];

        printk("\n");
        printk("md:     **********************************\n");
        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
        printk("md:     **********************************\n");
        ITERATE_MDDEV(mddev,tmp) {

                if (mddev->bitmap)
                        bitmap_print_sb(mddev->bitmap);
                else
                        printk("%s: ", mdname(mddev));
                ITERATE_RDEV(mddev,rdev,tmp2)
                        printk("<%s>", bdevname(rdev->bdev,b));
                printk("\n");

                ITERATE_RDEV(mddev,rdev,tmp2)
                        print_rdev(rdev);
        }
        printk("md:     **********************************\n");
        printk("\n");
}


static void sync_sbs(mddev_t * mddev)
{
        mdk_rdev_t *rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                super_types[mddev->major_version].
                        sync_super(mddev, rdev);
                rdev->sb_loaded = 1;
        }
}

static void md_update_sb(mddev_t * mddev)
{
        int err;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;

repeat:
        spin_lock_irq(&mddev->write_lock);
        sync_req = mddev->in_sync;
        mddev->utime = get_seconds();
        mddev->events ++;

        if (!mddev->events) {
                /*
                 * oops, this 64-bit counter should never wrap.
                 * Either we are in around ~1 trillion A.C., assuming
                 * 1 reboot per second, or we have a bug:
                 */
                MD_BUG();
                mddev->events --;
        }
        mddev->sb_dirty = 2;
        sync_sbs(mddev);

        /*
         * do not write anything to disk if using
         * nonpersistent superblocks
         */
        if (!mddev->persistent) {
                mddev->sb_dirty = 0;
                spin_unlock_irq(&mddev->write_lock);
                wake_up(&mddev->sb_wait);
                return;
        }
        spin_unlock_irq(&mddev->write_lock);

        dprintk(KERN_INFO
                "md: updating %s RAID superblock on device (in sync %d)\n",
                mdname(mddev),mddev->in_sync);

        err = bitmap_update_sb(mddev->bitmap);
        ITERATE_RDEV(mddev,rdev,tmp) {
                char b[BDEVNAME_SIZE];
                dprintk(KERN_INFO "md: ");
                if (test_bit(Faulty, &rdev->flags))
                        dprintk("(skipping faulty ");

                dprintk("%s ", bdevname(rdev->bdev,b));
                if (!test_bit(Faulty, &rdev->flags)) {
                        md_super_write(mddev,rdev,
                                       rdev->sb_offset<<1, rdev->sb_size,
                                       rdev->sb_page);
                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                                bdevname(rdev->bdev,b),
                                (unsigned long long)rdev->sb_offset);

                } else
                        dprintk(")\n");
                if (mddev->level == LEVEL_MULTIPATH)
                        /* only need to write one superblock... */
                        break;
        }
        md_super_wait(mddev);
        /* if there was a failure, sb_dirty was set to 1, and we re-write super */

        spin_lock_irq(&mddev->write_lock);
        if (mddev->in_sync != sync_req || mddev->sb_dirty == 1) {
                /* have to write it out again */
                spin_unlock_irq(&mddev->write_lock);
                goto repeat;
        }
        mddev->sb_dirty = 0;
        spin_unlock_irq(&mddev->write_lock);
        wake_up(&mddev->sb_wait);

}

/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
        /* See if cmd, written into a sysfs file, matches
         * str.  They must either be the same, or cmd can
         * have a trailing newline
         */
        while (*cmd && *str && *cmd == *str) {
                cmd++;
                str++;
        }
        if (*cmd == '\n')
                cmd++;
        if (*str || *cmd)
                return 0;
        return 1;
}
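
/*
 * Example behaviour (illustrative):
 *
 *      cmd_match("faulty\n", "faulty")  -> 1   (trailing newline OK)
 *      cmd_match("faulty",   "faulty")  -> 1
 *      cmd_match("fault",    "faulty")  -> 0   (a prefix is not a match)
 */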

struct rdev_sysfs_entry {
        struct attribute attr;
        ssize_t (*show)(mdk_rdev_t *, char *);
        ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
};

static ssize_t
state_show(mdk_rdev_t *rdev, char *page)
{
        char *sep = "";
        int len=0;

        if (test_bit(Faulty, &rdev->flags)) {
                len+= sprintf(page+len, "%sfaulty",sep);
                sep = ",";
        }
        if (test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sin_sync",sep);
                sep = ",";
        }
        if (!test_bit(Faulty, &rdev->flags) &&
            !test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sspare", sep);
                sep = ",";
        }
        return len+sprintf(page+len, "\n");
}

static struct rdev_sysfs_entry
rdev_state = __ATTR_RO(state);

static ssize_t
super_show(mdk_rdev_t *rdev, char *page)
{
        if (rdev->sb_loaded && rdev->sb_size) {
                memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
                return rdev->sb_size;
        } else
                return 0;
}
static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);

static ssize_t
errors_show(mdk_rdev_t *rdev, char *page)
{
        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
}

static ssize_t
errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
        char *e;
        unsigned long n = simple_strtoul(buf, &e, 10);
        if (*buf && (*e == 0 || *e == '\n')) {
                atomic_set(&rdev->corrected_errors, n);
                return len;
        }
        return -EINVAL;
}
static struct rdev_sysfs_entry rdev_errors =
__ATTR(errors, 0644, errors_show, errors_store);

static ssize_t
slot_show(mdk_rdev_t *rdev, char *page)
{
        if (rdev->raid_disk < 0)
                return sprintf(page, "none\n");
        else
                return sprintf(page, "%d\n", rdev->raid_disk);
}

static ssize_t
slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
        char *e;
        int slot = simple_strtoul(buf, &e, 10);
        if (strncmp(buf, "none", 4)==0)
                slot = -1;
        else if (e==buf || (*e && *e!= '\n'))
                return -EINVAL;
        if (rdev->mddev->pers)
                /* Cannot set slot in active array (yet) */
                return -EBUSY;
        if (slot >= rdev->mddev->raid_disks)
                return -ENOSPC;
        rdev->raid_disk = slot;
        /* assume it is working */
        rdev->flags = 0;
        set_bit(In_sync, &rdev->flags);
        return len;
}


static struct rdev_sysfs_entry rdev_slot =
__ATTR(slot, 0644, slot_show, slot_store);

static ssize_t
offset_show(mdk_rdev_t *rdev, char *page)
{
        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
}

static ssize_t
offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
        char *e;
        unsigned long long offset = simple_strtoull(buf, &e, 10);
        if (e==buf || (*e && *e != '\n'))
                return -EINVAL;
        if (rdev->mddev->pers)
                return -EBUSY;
        rdev->data_offset = offset;
        return len;
}

static struct rdev_sysfs_entry rdev_offset =
__ATTR(offset, 0644, offset_show, offset_store);

static ssize_t
rdev_size_show(mdk_rdev_t *rdev, char *page)
{
        return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
}

static ssize_t
rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
        char *e;
        unsigned long long size = simple_strtoull(buf, &e, 10);
        if (e==buf || (*e && *e != '\n'))
                return -EINVAL;
        if (rdev->mddev->pers)
                return -EBUSY;
        rdev->size = size;
        if (size < rdev->mddev->size || rdev->mddev->size == 0)
                rdev->mddev->size = size;
        return len;
}

static struct rdev_sysfs_entry rdev_size =
__ATTR(size, 0644, rdev_size_show, rdev_size_store);

static struct attribute *rdev_default_attrs[] = {
        &rdev_state.attr,
        &rdev_super.attr,
        &rdev_errors.attr,
        &rdev_slot.attr,
        &rdev_offset.attr,
        &rdev_size.attr,
        NULL,
};
static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);

        if (!entry->show)
                return -EIO;
        return entry->show(rdev, page);
}

static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
                const char *page, size_t length)
{
        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);

        if (!entry->store)
                return -EIO;
        return entry->store(rdev, page, length);
}

static void rdev_free(struct kobject *ko)
{
        mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
        kfree(rdev);
}
static struct sysfs_ops rdev_sysfs_ops = {
        .show           = rdev_attr_show,
        .store          = rdev_attr_store,
};
static struct kobj_type rdev_ktype = {
        .release        = rdev_free,
        .sysfs_ops      = &rdev_sysfs_ops,
        .default_attrs  = rdev_default_attrs,
};
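
/*
 * Dispatch path for the files above (e.g. /sys/block/mdX/md/dev-YYY/state,
 * since the rdev kobjects are named "dev-%s" and parented to the mddev
 * kobject): sysfs calls rdev_attr_show/rdev_attr_store via rdev_sysfs_ops,
 * which recover the rdev from the embedded kobject with container_of()
 * and hand off to the per-attribute show/store methods.
 */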

/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
{
        char b[BDEVNAME_SIZE];
        int err;
        mdk_rdev_t *rdev;
        sector_t size;

        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
        if (!rdev) {
                printk(KERN_ERR "md: could not alloc mem for new device!\n");
                return ERR_PTR(-ENOMEM);
        }

        if ((err = alloc_disk_sb(rdev)))
                goto abort_free;

        err = lock_rdev(rdev, newdev);
        if (err)
                goto abort_free;

        rdev->kobj.parent = NULL;
        rdev->kobj.ktype = &rdev_ktype;
        kobject_init(&rdev->kobj);

        rdev->desc_nr = -1;
        rdev->flags = 0;
        rdev->data_offset = 0;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);
        atomic_set(&rdev->corrected_errors, 0);

        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        if (!size) {
                printk(KERN_WARNING
                        "md: %s has zero or unknown size, marking faulty!\n",
                        bdevname(rdev->bdev,b));
                err = -EINVAL;
                goto abort_free;
        }

        if (super_format >= 0) {
                err = super_types[super_format].
                        load_super(rdev, NULL, super_minor);
                if (err == -EINVAL) {
                        printk(KERN_WARNING
                                "md: %s has invalid sb, not importing!\n",
                                bdevname(rdev->bdev,b));
                        goto abort_free;
                }
                if (err < 0) {
                        printk(KERN_WARNING
                                "md: could not read %s's sb, not importing!\n",
                                bdevname(rdev->bdev,b));
                        goto abort_free;
                }
        }
        INIT_LIST_HEAD(&rdev->same_set);

        return rdev;

abort_free:
        if (rdev->sb_page) {
                if (rdev->bdev)
                        unlock_rdev(rdev);
                free_disk_sb(rdev);
        }
        kfree(rdev);
        return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */


static void analyze_sbs(mddev_t * mddev)
{
        int i;
        struct list_head *tmp;
        mdk_rdev_t *rdev, *freshest;
        char b[BDEVNAME_SIZE];

        freshest = NULL;
        ITERATE_RDEV(mddev,rdev,tmp)
                switch (super_types[mddev->major_version].
                        load_super(rdev, freshest, mddev->minor_version)) {
                case 1:
                        freshest = rdev;
                        break;
                case 0:
                        break;
                default:
                        printk( KERN_ERR \
                                "md: fatal superblock inconsistency in %s"
                                " -- removing from array\n",
                                bdevname(rdev->bdev,b));
                        kick_rdev_from_array(rdev);
                }


        super_types[mddev->major_version].
                validate_super(mddev, freshest);

        i = 0;
        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev != freshest)
                        if (super_types[mddev->major_version].
                            validate_super(mddev, rdev)) {
                                printk(KERN_WARNING "md: kicking non-fresh %s"
                                        " from array!\n",
                                        bdevname(rdev->bdev,b));
                                kick_rdev_from_array(rdev);
                                continue;
                        }
                if (mddev->level == LEVEL_MULTIPATH) {
                        rdev->desc_nr = i++;
                        rdev->raid_disk = rdev->desc_nr;
                        set_bit(In_sync, &rdev->flags);
                }
        }



        if (mddev->recovery_cp != MaxSector &&
            mddev->level >= 1)
                printk(KERN_ERR "md: %s: raid array is not clean"
                       " -- starting background reconstruction\n",
                       mdname(mddev));

}
1927
eae1701f 1928static ssize_t
96de1e66 1929level_show(mddev_t *mddev, char *page)
eae1701f 1930{
2604b703 1931 struct mdk_personality *p = mddev->pers;
d9d166c2 1932 if (p)
eae1701f 1933 return sprintf(page, "%s\n", p->name);
d9d166c2
N
1934 else if (mddev->clevel[0])
1935 return sprintf(page, "%s\n", mddev->clevel);
1936 else if (mddev->level != LEVEL_NONE)
1937 return sprintf(page, "%d\n", mddev->level);
1938 else
1939 return 0;
eae1701f
N
1940}
1941
d9d166c2
N
1942static ssize_t
1943level_store(mddev_t *mddev, const char *buf, size_t len)
1944{
1945 int rv = len;
1946 if (mddev->pers)
1947 return -EBUSY;
1948 if (len == 0)
1949 return 0;
1950 if (len >= sizeof(mddev->clevel))
1951 return -ENOSPC;
1952 strncpy(mddev->clevel, buf, len);
1953 if (mddev->clevel[len-1] == '\n')
1954 len--;
1955 mddev->clevel[len] = 0;
1956 mddev->level = LEVEL_NONE;
1957 return rv;
1958}
1959
1960static struct md_sysfs_entry md_level =
1961__ATTR(level, 0644, level_show, level_store);
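
/*
 * Illustrative sysfs usage (a sketch; the md0 path is hypothetical).
 * The level can only be written while the array is inactive, e.g.
 * before assembly:
 *
 *	echo raid5 > /sys/block/md0/md/level
 *	cat /sys/block/md0/md/level
 */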
eae1701f
N
1962
1963static ssize_t
96de1e66 1964raid_disks_show(mddev_t *mddev, char *page)
eae1701f 1965{
bb636547
N
1966 if (mddev->raid_disks == 0)
1967 return 0;
eae1701f
N
1968 return sprintf(page, "%d\n", mddev->raid_disks);
1969}
1970
da943b99
N
1971static int update_raid_disks(mddev_t *mddev, int raid_disks);
1972
1973static ssize_t
1974raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
1975{
1976 /* can only set raid_disks if array is not yet active */
1977 char *e;
1978 int rv = 0;
1979 unsigned long n = simple_strtoul(buf, &e, 10);
1980
1981 if (!*buf || (*e && *e != '\n'))
1982 return -EINVAL;
1983
1984 if (mddev->pers)
1985 rv = update_raid_disks(mddev, n);
1986 else
1987 mddev->raid_disks = n;
1988 return rv ? rv : len;
1989}
1990static struct md_sysfs_entry md_raid_disks =
1991__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
eae1701f 1992
3b34380a
N
1993static ssize_t
1994chunk_size_show(mddev_t *mddev, char *page)
1995{
1996 return sprintf(page, "%d\n", mddev->chunk_size);
1997}
1998
1999static ssize_t
2000chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2001{
2002 /* can only set chunk_size if array is not yet active */
2003 char *e;
2004 unsigned long n = simple_strtoul(buf, &e, 10);
2005
2006 if (mddev->pers)
2007 return -EBUSY;
2008 if (!*buf || (*e && *e != '\n'))
2009 return -EINVAL;
2010
2011 mddev->chunk_size = n;
2012 return len;
2013}
2014static struct md_sysfs_entry md_chunk_size =
2015__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
2016
6d7ff738
N
2017static ssize_t
2018null_show(mddev_t *mddev, char *page)
2019{
2020 return -EINVAL;
2021}
2022
2023static ssize_t
2024new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2025{
2026 /* buf must be %d:%d, optionally followed by '\n', giving major and minor numbers */
2027 /* The new device is added to the array.
2028 * If the array has a persistent superblock, we read the
2029 * superblock to initialise info and check validity.
2030 * Otherwise, the only checking done is that in bind_rdev_to_array,
2031 * which mainly checks size.
2032 */
2033 char *e;
2034 int major = simple_strtoul(buf, &e, 10);
2035 int minor;
2036 dev_t dev;
2037 mdk_rdev_t *rdev;
2038 int err;
2039
2040 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2041 return -EINVAL;
2042 minor = simple_strtoul(e+1, &e, 10);
2043 if (*e && *e != '\n')
2044 return -EINVAL;
2045 dev = MKDEV(major, minor);
2046 if (major != MAJOR(dev) ||
2047 minor != MINOR(dev))
2048 return -EOVERFLOW;
2049
2050
2051 if (mddev->persistent) {
2052 rdev = md_import_device(dev, mddev->major_version,
2053 mddev->minor_version);
2054 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2055 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2056 mdk_rdev_t, same_set);
2057 err = super_types[mddev->major_version]
2058 .load_super(rdev, rdev0, mddev->minor_version);
2059 if (err < 0)
2060 goto out;
2061 }
2062 } else
2063 rdev = md_import_device(dev, -1, -1);
2064
2065 if (IS_ERR(rdev))
2066 return PTR_ERR(rdev);
2067 err = bind_rdev_to_array(rdev, mddev);
2068 out:
2069 if (err)
2070 export_rdev(rdev);
2071 return err ? err : len;
2072}
2073
2074static struct md_sysfs_entry md_new_device =
2075__ATTR(new_dev, 0200, null_show, new_dev_store);
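
/*
 * Illustrative usage (sketch; device numbers are hypothetical): the
 * attribute takes the "major:minor" pair of the component device, so
 * with /dev/sdb at 8:16 one would write
 *
 *	echo 8:16 > /sys/block/md0/md/new_dev
 *
 * to bind that device to the (not yet active) array.
 */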
3b34380a 2076
a35b0d69
N
2077static ssize_t
2078size_show(mddev_t *mddev, char *page)
2079{
2080 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2081}
2082
2083static int update_size(mddev_t *mddev, unsigned long size);
2084
2085static ssize_t
2086size_store(mddev_t *mddev, const char *buf, size_t len)
2087{
2088 /* If the array is inactive, we can reduce the component size, but
2089 * not increase it (except from 0).
2090 * If the array is active, we can try an on-line resize.
2091 */
2092 char *e;
2093 int err = 0;
2094 unsigned long long size = simple_strtoull(buf, &e, 10);
2095 if (!*buf || *buf == '\n' ||
2096 (*e && *e != '\n'))
2097 return -EINVAL;
2098
2099 if (mddev->pers) {
2100 err = update_size(mddev, size);
2101 md_update_sb(mddev);
2102 } else {
2103 if (mddev->size == 0 ||
2104 mddev->size > size)
2105 mddev->size = size;
2106 else
2107 err = -ENOSPC;
2108 }
2109 return err ? err : len;
2110}
2111
2112static struct md_sysfs_entry md_size =
2113__ATTR(component_size, 0644, size_show, size_store);
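
/*
 * Illustrative usage (sketch; the value is hypothetical): the size is
 * expressed in KiB per component device, so capping each member at
 * 1 GiB would be
 *
 *	echo 1048576 > /sys/block/md0/md/component_size
 *
 * On an active array this attempts an on-line resize via update_size().
 */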
2114
8bb93aac
N
2115
2116/* Metadata version.
2117 * This is either 'none' for arrays with externally managed metadata,
2118 * or N.M for internally known formats.
2119 */
2120static ssize_t
2121metadata_show(mddev_t *mddev, char *page)
2122{
2123 if (mddev->persistent)
2124 return sprintf(page, "%d.%d\n",
2125 mddev->major_version, mddev->minor_version);
2126 else
2127 return sprintf(page, "none\n");
2128}
2129
2130static ssize_t
2131metadata_store(mddev_t *mddev, const char *buf, size_t len)
2132{
2133 int major, minor;
2134 char *e;
2135 if (!list_empty(&mddev->disks))
2136 return -EBUSY;
2137
2138 if (cmd_match(buf, "none")) {
2139 mddev->persistent = 0;
2140 mddev->major_version = 0;
2141 mddev->minor_version = 90;
2142 return len;
2143 }
2144 major = simple_strtoul(buf, &e, 10);
2145 if (e==buf || *e != '.')
2146 return -EINVAL;
2147 buf = e+1;
2148 minor = simple_strtoul(buf, &e, 10);
2149 if (e==buf || *e != '\n')
2150 return -EINVAL;
2151 if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
2152 super_types[major].name == NULL)
2153 return -ENOENT;
2154 mddev->major_version = major;
2155 mddev->minor_version = minor;
2156 mddev->persistent = 1;
2157 return len;
2158}
2159
2160static struct md_sysfs_entry md_metadata =
2161__ATTR(metadata_version, 0644, metadata_show, metadata_store);
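
/*
 * Illustrative usage (sketch): only legal while no devices are attached
 * to the array, e.g.
 *
 *	echo 0.90 > /sys/block/md0/md/metadata_version
 *	echo none > /sys/block/md0/md/metadata_version
 */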
2162
24dd469d 2163static ssize_t
7eec314d 2164action_show(mddev_t *mddev, char *page)
24dd469d 2165{
7eec314d 2166 char *type = "idle";
31399d9e
N
2167 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2168 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
ccfcc3c1
N
2169 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2170 type = "reshape";
2171 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
24dd469d
N
2172 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2173 type = "resync";
2174 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2175 type = "check";
2176 else
2177 type = "repair";
2178 } else
2179 type = "recover";
2180 }
2181 return sprintf(page, "%s\n", type);
2182}
2183
2184static ssize_t
7eec314d 2185action_store(mddev_t *mddev, const char *page, size_t len)
24dd469d 2186{
7eec314d
N
2187 if (!mddev->pers || !mddev->pers->sync_request)
2188 return -EINVAL;
2189
bce74dac 2190 if (cmd_match(page, "idle")) {
7eec314d
N
2191 if (mddev->sync_thread) {
2192 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2193 md_unregister_thread(mddev->sync_thread);
2194 mddev->sync_thread = NULL;
2195 mddev->recovery = 0;
2196 }
03c902e1
N
2197 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2198 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
24dd469d 2199 return -EBUSY;
03c902e1 2200 else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
7eec314d
N
2201 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2202 else {
bce74dac 2203 if (cmd_match(page, "check"))
7eec314d 2204 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
bce74dac 2205 else if (!cmd_match(page, "repair"))
7eec314d
N
2206 return -EINVAL;
2207 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2208 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7eec314d 2209 }
03c902e1 2210 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
24dd469d
N
2211 md_wakeup_thread(mddev->thread);
2212 return len;
2213}
2214
9d88883e 2215static ssize_t
96de1e66 2216mismatch_cnt_show(mddev_t *mddev, char *page)
9d88883e
N
2217{
2218 return sprintf(page, "%llu\n",
2219 (unsigned long long) mddev->resync_mismatches);
2220}
2221
96de1e66 2222static struct md_sysfs_entry
7eec314d 2223md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
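
/*
 * Illustrative usage (sketch): a typical scrub cycle on a redundant
 * array would be
 *
 *	echo check > /sys/block/md0/md/sync_action
 *	cat /sys/block/md0/md/mismatch_cnt
 *	echo idle > /sys/block/md0/md/sync_action
 */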
24dd469d 2224
96de1e66
N
2225
2226static struct md_sysfs_entry
2227md_mismatches = __ATTR_RO(mismatch_cnt);
9d88883e 2228
88202a0c
N
2229static ssize_t
2230sync_min_show(mddev_t *mddev, char *page)
2231{
2232 return sprintf(page, "%d (%s)\n", speed_min(mddev),
2233 mddev->sync_speed_min ? "local": "system");
2234}
2235
2236static ssize_t
2237sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2238{
2239 int min;
2240 char *e;
2241 if (strncmp(buf, "system", 6)==0) {
2242 mddev->sync_speed_min = 0;
2243 return len;
2244 }
2245 min = simple_strtoul(buf, &e, 10);
2246 if (buf == e || (*e && *e != '\n') || min <= 0)
2247 return -EINVAL;
2248 mddev->sync_speed_min = min;
2249 return len;
2250}
2251
2252static struct md_sysfs_entry md_sync_min =
2253__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2254
2255static ssize_t
2256sync_max_show(mddev_t *mddev, char *page)
2257{
2258 return sprintf(page, "%d (%s)\n", speed_max(mddev),
2259 mddev->sync_speed_max ? "local": "system");
2260}
2261
2262static ssize_t
2263sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2264{
2265 int max;
2266 char *e;
2267 if (strncmp(buf, "system", 6)==0) {
2268 mddev->sync_speed_max = 0;
2269 return len;
2270 }
2271 max = simple_strtoul(buf, &e, 10);
2272 if (buf == e || (*e && *e != '\n') || max <= 0)
2273 return -EINVAL;
2274 mddev->sync_speed_max = max;
2275 return len;
2276}
2277
2278static struct md_sysfs_entry md_sync_max =
2279__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
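
/*
 * Illustrative usage (sketch; the rate is hypothetical): per-array
 * overrides of the global limits, with "system" reverting to the
 * sysctl defaults:
 *
 *	echo 50000 > /sys/block/md0/md/sync_speed_max
 *	echo system > /sys/block/md0/md/sync_speed_min
 */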
2280
2281
2282static ssize_t
2283sync_speed_show(mddev_t *mddev, char *page)
2284{
2285 unsigned long resync, dt, db;
2286 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2287 dt = ((jiffies - mddev->resync_mark) / HZ);
2288 if (!dt) dt++;
2289 db = resync - (mddev->resync_mark_cnt);
2290 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2291}
2292
2293static struct md_sysfs_entry
2294md_sync_speed = __ATTR_RO(sync_speed);
2295
2296static ssize_t
2297sync_completed_show(mddev_t *mddev, char *page)
2298{
2299 unsigned long max_blocks, resync;
2300
2301 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2302 max_blocks = mddev->resync_max_sectors;
2303 else
2304 max_blocks = mddev->size << 1;
2305
2306 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2307 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2308}
2309
2310static struct md_sysfs_entry
2311md_sync_completed = __ATTR_RO(sync_completed);
2312
eae1701f
N
2313static struct attribute *md_default_attrs[] = {
2314 &md_level.attr,
2315 &md_raid_disks.attr,
3b34380a 2316 &md_chunk_size.attr,
a35b0d69 2317 &md_size.attr,
8bb93aac 2318 &md_metadata.attr,
6d7ff738 2319 &md_new_device.attr,
411036fa
N
2320 NULL,
2321};
2322
2323static struct attribute *md_redundancy_attrs[] = {
24dd469d 2324 &md_scan_mode.attr,
9d88883e 2325 &md_mismatches.attr,
88202a0c
N
2326 &md_sync_min.attr,
2327 &md_sync_max.attr,
2328 &md_sync_speed.attr,
2329 &md_sync_completed.attr,
eae1701f
N
2330 NULL,
2331};
411036fa
N
2332static struct attribute_group md_redundancy_group = {
2333 .name = NULL,
2334 .attrs = md_redundancy_attrs,
2335};
2336
eae1701f
N
2337
2338static ssize_t
2339md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2340{
2341 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2342 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
96de1e66 2343 ssize_t rv;
eae1701f
N
2344
2345 if (!entry->show)
2346 return -EIO;
96de1e66
N
2347 mddev_lock(mddev);
2348 rv = entry->show(mddev, page);
2349 mddev_unlock(mddev);
2350 return rv;
eae1701f
N
2351}
2352
2353static ssize_t
2354md_attr_store(struct kobject *kobj, struct attribute *attr,
2355 const char *page, size_t length)
2356{
2357 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2358 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
96de1e66 2359 ssize_t rv;
eae1701f
N
2360
2361 if (!entry->store)
2362 return -EIO;
96de1e66
N
2363 mddev_lock(mddev);
2364 rv = entry->store(mddev, page, length);
2365 mddev_unlock(mddev);
2366 return rv;
eae1701f
N
2367}
2368
2369static void md_free(struct kobject *ko)
2370{
2371 mddev_t *mddev = container_of(ko, mddev_t, kobj);
2372 kfree(mddev);
2373}
2374
2375static struct sysfs_ops md_sysfs_ops = {
2376 .show = md_attr_show,
2377 .store = md_attr_store,
2378};
2379static struct kobj_type md_ktype = {
2380 .release = md_free,
2381 .sysfs_ops = &md_sysfs_ops,
2382 .default_attrs = md_default_attrs,
2383};
2384
1da177e4
LT
2385int mdp_major = 0;
2386
2387static struct kobject *md_probe(dev_t dev, int *part, void *data)
2388{
2389 static DECLARE_MUTEX(disks_sem);
2390 mddev_t *mddev = mddev_find(dev);
2391 struct gendisk *disk;
2392 int partitioned = (MAJOR(dev) != MD_MAJOR);
2393 int shift = partitioned ? MdpMinorShift : 0;
2394 int unit = MINOR(dev) >> shift;
2395
2396 if (!mddev)
2397 return NULL;
2398
2399 down(&disks_sem);
2400 if (mddev->gendisk) {
2401 up(&disks_sem);
2402 mddev_put(mddev);
2403 return NULL;
2404 }
2405 disk = alloc_disk(1 << shift);
2406 if (!disk) {
2407 up(&disks_sem);
2408 mddev_put(mddev);
2409 return NULL;
2410 }
2411 disk->major = MAJOR(dev);
2412 disk->first_minor = unit << shift;
2413 if (partitioned) {
2414 sprintf(disk->disk_name, "md_d%d", unit);
2415 sprintf(disk->devfs_name, "md/d%d", unit);
2416 } else {
2417 sprintf(disk->disk_name, "md%d", unit);
2418 sprintf(disk->devfs_name, "md/%d", unit);
2419 }
2420 disk->fops = &md_fops;
2421 disk->private_data = mddev;
2422 disk->queue = mddev->queue;
2423 add_disk(disk);
2424 mddev->gendisk = disk;
2425 up(&disks_sem);
9c791977 2426 mddev->kobj.parent = &disk->kobj;
eae1701f
N
2427 mddev->kobj.k_name = NULL;
2428 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
2429 mddev->kobj.ktype = &md_ktype;
2430 kobject_register(&mddev->kobj);
1da177e4
LT
2431 return NULL;
2432}
2433
2434void md_wakeup_thread(mdk_thread_t *thread);
2435
2436static void md_safemode_timeout(unsigned long data)
2437{
2438 mddev_t *mddev = (mddev_t *) data;
2439
2440 mddev->safemode = 1;
2441 md_wakeup_thread(mddev->thread);
2442}
2443
6ff8d8ec 2444static int start_dirty_degraded;
1da177e4
LT
2445
2446static int do_md_run(mddev_t * mddev)
2447{
2604b703 2448 int err;
1da177e4
LT
2449 int chunk_size;
2450 struct list_head *tmp;
2451 mdk_rdev_t *rdev;
2452 struct gendisk *disk;
2604b703 2453 struct mdk_personality *pers;
1da177e4
LT
2454 char b[BDEVNAME_SIZE];
2455
a757e64c
N
2456 if (list_empty(&mddev->disks))
2457 /* cannot run an array with no devices.. */
1da177e4 2458 return -EINVAL;
1da177e4
LT
2459
2460 if (mddev->pers)
2461 return -EBUSY;
2462
2463 /*
2464 * Analyze all RAID superblock(s)
2465 */
a757e64c
N
2466 if (!mddev->raid_disks)
2467 analyze_sbs(mddev);
1da177e4
LT
2468
2469 chunk_size = mddev->chunk_size;
2604b703
N
2470
2471 if (chunk_size) {
1da177e4
LT
2472 if (chunk_size > MAX_CHUNK_SIZE) {
2473 printk(KERN_ERR "too big chunk_size: %d > %d\n",
2474 chunk_size, MAX_CHUNK_SIZE);
2475 return -EINVAL;
2476 }
2477 /*
2478 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
2479 */
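 /* ffz(~n) is the bit index of the lowest set bit of n, so the
 * equality below holds only when chunk_size is a power of 2 */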
2480 if ( (1 << ffz(~chunk_size)) != chunk_size) {
a757e64c 2481 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
1da177e4
LT
2482 return -EINVAL;
2483 }
2484 if (chunk_size < PAGE_SIZE) {
2485 printk(KERN_ERR "too small chunk_size: %d < %ld\n",
2486 chunk_size, PAGE_SIZE);
2487 return -EINVAL;
2488 }
2489
2490 /* devices must have minimum size of one chunk */
2491 ITERATE_RDEV(mddev,rdev,tmp) {
b2d444d7 2492 if (test_bit(Faulty, &rdev->flags))
1da177e4
LT
2493 continue;
2494 if (rdev->size < chunk_size / 1024) {
2495 printk(KERN_WARNING
2496 "md: Dev %s smaller than chunk_size:"
2497 " %lluk < %dk\n",
2498 bdevname(rdev->bdev,b),
2499 (unsigned long long)rdev->size,
2500 chunk_size / 1024);
2501 return -EINVAL;
2502 }
2503 }
2504 }
2505
1da177e4 2506#ifdef CONFIG_KMOD
d9d166c2
N
2507 if (mddev->level != LEVEL_NONE)
2508 request_module("md-level-%d", mddev->level);
2509 else if (mddev->clevel[0])
2510 request_module("md-%s", mddev->clevel);
1da177e4
LT
2511#endif
2512
2513 /*
2514 * Drop all container device buffers, from now on
2515 * the only valid external interface is through the md
2516 * device.
2517 * Also find largest hardsector size
2518 */
2519 ITERATE_RDEV(mddev,rdev,tmp) {
b2d444d7 2520 if (test_bit(Faulty, &rdev->flags))
1da177e4
LT
2521 continue;
2522 sync_blockdev(rdev->bdev);
2523 invalidate_bdev(rdev->bdev, 0);
2524 }
2525
2526 md_probe(mddev->unit, NULL, NULL);
2527 disk = mddev->gendisk;
2528 if (!disk)
2529 return -ENOMEM;
2530
2531 spin_lock(&pers_lock);
d9d166c2 2532 pers = find_pers(mddev->level, mddev->clevel);
2604b703 2533 if (!pers || !try_module_get(pers->owner)) {
1da177e4 2534 spin_unlock(&pers_lock);
d9d166c2
N
2535 if (mddev->level != LEVEL_NONE)
2536 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
2537 mddev->level);
2538 else
2539 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
2540 mddev->clevel);
1da177e4
LT
2541 return -EINVAL;
2542 }
2604b703 2543 mddev->pers = pers;
1da177e4 2544 spin_unlock(&pers_lock);
d9d166c2
N
2545 mddev->level = pers->level;
2546 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
1da177e4 2547
657390d2 2548 mddev->recovery = 0;
1da177e4 2549 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
a9701a30 2550 mddev->barriers_work = 1;
6ff8d8ec 2551 mddev->ok_start_degraded = start_dirty_degraded;
1da177e4 2552
f91de92e
N
2553 if (start_readonly)
2554 mddev->ro = 2; /* read-only, but switch on first write */
2555
b15c2e57
N
2556 err = mddev->pers->run(mddev);
2557 if (!err && mddev->pers->sync_request) {
2558 err = bitmap_create(mddev);
2559 if (err) {
2560 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
2561 mdname(mddev), err);
2562 mddev->pers->stop(mddev);
2563 }
2564 }
1da177e4
LT
2565 if (err) {
2566 printk(KERN_ERR "md: pers->run() failed ...\n");
2567 module_put(mddev->pers->owner);
2568 mddev->pers = NULL;
32a7627c
N
2569 bitmap_destroy(mddev);
2570 return err;
1da177e4 2571 }
411036fa
N
2572 if (mddev->pers->sync_request)
2573 sysfs_create_group(&mddev->kobj, &md_redundancy_group);
fd9d49ca
N
2574 else if (mddev->ro == 2) /* auto-readonly not meaningful */
2575 mddev->ro = 0;
2576
1da177e4
LT
2577 atomic_set(&mddev->writes_pending,0);
2578 mddev->safemode = 0;
2579 mddev->safemode_timer.function = md_safemode_timeout;
2580 mddev->safemode_timer.data = (unsigned long) mddev;
2581 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
2582 mddev->in_sync = 1;
86e6ffdd
N
2583
2584 ITERATE_RDEV(mddev,rdev,tmp)
2585 if (rdev->raid_disk >= 0) {
2586 char nm[20];
2587 sprintf(nm, "rd%d", rdev->raid_disk);
2588 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
2589 }
1da177e4
LT
2590
2591 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
005eca5e 2592 md_wakeup_thread(mddev->thread);
1da177e4
LT
2593
2594 if (mddev->sb_dirty)
2595 md_update_sb(mddev);
2596
2597 set_capacity(disk, mddev->array_size<<1);
2598
2599 /* If we call blk_queue_make_request here, it will
2600 * re-initialise max_sectors etc which may have been
2601 * refined inside ->run(). So just set the bits we need to set.
2602 * Most initialisation happened when we called
2603 * blk_queue_make_request(..., md_fail_request)
2604 * earlier.
2605 */
2606 mddev->queue->queuedata = mddev;
2607 mddev->queue->make_request_fn = mddev->pers->make_request;
2608
2609 mddev->changed = 1;
d7603b7e 2610 md_new_event(mddev);
1da177e4
LT
2611 return 0;
2612}
2613
2614static int restart_array(mddev_t *mddev)
2615{
2616 struct gendisk *disk = mddev->gendisk;
2617 int err;
2618
2619 /*
2620 * Complain if it has no devices
2621 */
2622 err = -ENXIO;
2623 if (list_empty(&mddev->disks))
2624 goto out;
2625
2626 if (mddev->pers) {
2627 err = -EBUSY;
2628 if (!mddev->ro)
2629 goto out;
2630
2631 mddev->safemode = 0;
2632 mddev->ro = 0;
2633 set_disk_ro(disk, 0);
2634
2635 printk(KERN_INFO "md: %s switched to read-write mode.\n",
2636 mdname(mddev));
2637 /*
2638 * Kick recovery or resync if necessary
2639 */
2640 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2641 md_wakeup_thread(mddev->thread);
2642 err = 0;
2643 } else {
2644 printk(KERN_ERR "md: %s has no personality assigned.\n",
2645 mdname(mddev));
2646 err = -EINVAL;
2647 }
2648
2649out:
2650 return err;
2651}
2652
2653static int do_md_stop(mddev_t * mddev, int ro)
2654{
2655 int err = 0;
2656 struct gendisk *disk = mddev->gendisk;
2657
2658 if (mddev->pers) {
2659 if (atomic_read(&mddev->active)>2) {
2660 printk("md: %s still in use.\n",mdname(mddev));
2661 return -EBUSY;
2662 }
2663
2664 if (mddev->sync_thread) {
2665 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2666 md_unregister_thread(mddev->sync_thread);
2667 mddev->sync_thread = NULL;
2668 }
2669
2670 del_timer_sync(&mddev->safemode_timer);
2671
2672 invalidate_partition(disk, 0);
2673
2674 if (ro) {
2675 err = -ENXIO;
f91de92e 2676 if (mddev->ro==1)
1da177e4
LT
2677 goto out;
2678 mddev->ro = 1;
2679 } else {
6b8b3e8a 2680 bitmap_flush(mddev);
a9701a30 2681 md_super_wait(mddev);
1da177e4
LT
2682 if (mddev->ro)
2683 set_disk_ro(disk, 0);
2684 blk_queue_make_request(mddev->queue, md_fail_request);
2685 mddev->pers->stop(mddev);
411036fa
N
2686 if (mddev->pers->sync_request)
2687 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
2688
1da177e4
LT
2689 module_put(mddev->pers->owner);
2690 mddev->pers = NULL;
2691 if (mddev->ro)
2692 mddev->ro = 0;
2693 }
2694 if (!mddev->in_sync) {
2695 /* mark array as shutdown cleanly */
2696 mddev->in_sync = 1;
2697 md_update_sb(mddev);
2698 }
2699 if (ro)
2700 set_disk_ro(disk, 1);
2701 }
32a7627c 2702
1da177e4
LT
2703 /*
2704 * Free resources if final stop
2705 */
2706 if (!ro) {
86e6ffdd
N
2707 mdk_rdev_t *rdev;
2708 struct list_head *tmp;
1da177e4
LT
2709 struct gendisk *disk;
2710 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
2711
978f946b
N
2712 bitmap_destroy(mddev);
2713 if (mddev->bitmap_file) {
2714 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
2715 fput(mddev->bitmap_file);
2716 mddev->bitmap_file = NULL;
2717 }
2718 mddev->bitmap_offset = 0;
2719
86e6ffdd
N
2720 ITERATE_RDEV(mddev,rdev,tmp)
2721 if (rdev->raid_disk >= 0) {
2722 char nm[20];
2723 sprintf(nm, "rd%d", rdev->raid_disk);
2724 sysfs_remove_link(&mddev->kobj, nm);
2725 }
2726
1da177e4
LT
2727 export_array(mddev);
2728
2729 mddev->array_size = 0;
2730 disk = mddev->gendisk;
2731 if (disk)
2732 set_capacity(disk, 0);
2733 mddev->changed = 1;
2734 } else
2735 printk(KERN_INFO "md: %s switched to read-only mode.\n",
2736 mdname(mddev));
2737 err = 0;
d7603b7e 2738 md_new_event(mddev);
1da177e4
LT
2739out:
2740 return err;
2741}
2742
2743static void autorun_array(mddev_t *mddev)
2744{
2745 mdk_rdev_t *rdev;
2746 struct list_head *tmp;
2747 int err;
2748
a757e64c 2749 if (list_empty(&mddev->disks))
1da177e4 2750 return;
1da177e4
LT
2751
2752 printk(KERN_INFO "md: running: ");
2753
2754 ITERATE_RDEV(mddev,rdev,tmp) {
2755 char b[BDEVNAME_SIZE];
2756 printk("<%s>", bdevname(rdev->bdev,b));
2757 }
2758 printk("\n");
2759
2760 err = do_md_run (mddev);
2761 if (err) {
2762 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
2763 do_md_stop (mddev, 0);
2764 }
2765}
2766
2767/*
2768 * let's try to run arrays based on all disks that have arrived
2769 * so far. (These are in pending_raid_disks.)
2770 *
2771 * the method: pick the first pending disk, collect all disks with
2772 * the same UUID, remove all from the pending list and put them into
2773 * the 'same_array' list. Then order this list based on superblock
2774 * update time (freshest comes first), kick out 'old' disks and
2775 * compare superblocks. If everything's fine then run it.
2776 *
2777 * If "unit" is allocated, then bump its reference count
2778 */
2779static void autorun_devices(int part)
2780{
1da177e4
LT
2781 struct list_head *tmp;
2782 mdk_rdev_t *rdev0, *rdev;
2783 mddev_t *mddev;
2784 char b[BDEVNAME_SIZE];
2785
2786 printk(KERN_INFO "md: autorun ...\n");
2787 while (!list_empty(&pending_raid_disks)) {
2788 dev_t dev;
ad01c9e3 2789 LIST_HEAD(candidates);
1da177e4
LT
2790 rdev0 = list_entry(pending_raid_disks.next,
2791 mdk_rdev_t, same_set);
2792
2793 printk(KERN_INFO "md: considering %s ...\n",
2794 bdevname(rdev0->bdev,b));
2795 INIT_LIST_HEAD(&candidates);
2796 ITERATE_RDEV_PENDING(rdev,tmp)
2797 if (super_90_load(rdev, rdev0, 0) >= 0) {
2798 printk(KERN_INFO "md: adding %s ...\n",
2799 bdevname(rdev->bdev,b));
2800 list_move(&rdev->same_set, &candidates);
2801 }
2802 /*
2803 * now we have a set of devices, with all of them having
2804 * mostly sane superblocks. It's time to allocate the
2805 * mddev.
2806 */
2807 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
2808 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
2809 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
2810 break;
2811 }
2812 if (part)
2813 dev = MKDEV(mdp_major,
2814 rdev0->preferred_minor << MdpMinorShift);
2815 else
2816 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
2817
2818 md_probe(dev, NULL, NULL);
2819 mddev = mddev_find(dev);
2820 if (!mddev) {
2821 printk(KERN_ERR
2822 "md: cannot allocate memory for md drive.\n");
2823 break;
2824 }
2825 if (mddev_lock(mddev))
2826 printk(KERN_WARNING "md: %s locked, cannot run\n",
2827 mdname(mddev));
2828 else if (mddev->raid_disks || mddev->major_version
2829 || !list_empty(&mddev->disks)) {
2830 printk(KERN_WARNING
2831 "md: %s already running, cannot run %s\n",
2832 mdname(mddev), bdevname(rdev0->bdev,b));
2833 mddev_unlock(mddev);
2834 } else {
2835 printk(KERN_INFO "md: created %s\n", mdname(mddev));
2836 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
2837 list_del_init(&rdev->same_set);
2838 if (bind_rdev_to_array(rdev, mddev))
2839 export_rdev(rdev);
2840 }
2841 autorun_array(mddev);
2842 mddev_unlock(mddev);
2843 }
2844 /* on success, candidates will be empty, on error
2845 * it won't...
2846 */
2847 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
2848 export_rdev(rdev);
2849 mddev_put(mddev);
2850 }
2851 printk(KERN_INFO "md: ... autorun DONE.\n");
2852}
2853
2854/*
2855 * import RAID devices based on one partition
2856 * if possible, the array gets run as well.
2857 */
2858
2859static int autostart_array(dev_t startdev)
2860{
2861 char b[BDEVNAME_SIZE];
2862 int err = -EINVAL, i;
2863 mdp_super_t *sb = NULL;
2864 mdk_rdev_t *start_rdev = NULL, *rdev;
2865
2866 start_rdev = md_import_device(startdev, 0, 0);
2867 if (IS_ERR(start_rdev))
2868 return err;
2869
2870
2871 /* NOTE: this can only work for 0.90.0 superblocks */
2872 sb = (mdp_super_t*)page_address(start_rdev->sb_page);
2873 if (sb->major_version != 0 ||
2874 sb->minor_version != 90 ) {
2875 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
2876 export_rdev(start_rdev);
2877 return err;
2878 }
2879
b2d444d7 2880 if (test_bit(Faulty, &start_rdev->flags)) {
1da177e4
LT
2881 printk(KERN_WARNING
2882 "md: can not autostart based on faulty %s!\n",
2883 bdevname(start_rdev->bdev,b));
2884 export_rdev(start_rdev);
2885 return err;
2886 }
2887 list_add(&start_rdev->same_set, &pending_raid_disks);
2888
2889 for (i = 0; i < MD_SB_DISKS; i++) {
2890 mdp_disk_t *desc = sb->disks + i;
2891 dev_t dev = MKDEV(desc->major, desc->minor);
2892
2893 if (!dev)
2894 continue;
2895 if (dev == startdev)
2896 continue;
2897 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
2898 continue;
2899 rdev = md_import_device(dev, 0, 0);
2900 if (IS_ERR(rdev))
2901 continue;
2902
2903 list_add(&rdev->same_set, &pending_raid_disks);
2904 }
2905
2906 /*
2907 * possibly return error codes one day; for now we always report success
2908 */
2909 autorun_devices(0);
2910 return 0;
2911
2912}
2913
2914
2915static int get_version(void __user * arg)
2916{
2917 mdu_version_t ver;
2918
2919 ver.major = MD_MAJOR_VERSION;
2920 ver.minor = MD_MINOR_VERSION;
2921 ver.patchlevel = MD_PATCHLEVEL_VERSION;
2922
2923 if (copy_to_user(arg, &ver, sizeof(ver)))
2924 return -EFAULT;
2925
2926 return 0;
2927}
2928
2929static int get_array_info(mddev_t * mddev, void __user * arg)
2930{
2931 mdu_array_info_t info;
2932 int nr,working,active,failed,spare;
2933 mdk_rdev_t *rdev;
2934 struct list_head *tmp;
2935
2936 nr=working=active=failed=spare=0;
2937 ITERATE_RDEV(mddev,rdev,tmp) {
2938 nr++;
b2d444d7 2939 if (test_bit(Faulty, &rdev->flags))
1da177e4
LT
2940 failed++;
2941 else {
2942 working++;
b2d444d7 2943 if (test_bit(In_sync, &rdev->flags))
1da177e4
LT
2944 active++;
2945 else
2946 spare++;
2947 }
2948 }
2949
2950 info.major_version = mddev->major_version;
2951 info.minor_version = mddev->minor_version;
2952 info.patch_version = MD_PATCHLEVEL_VERSION;
2953 info.ctime = mddev->ctime;
2954 info.level = mddev->level;
2955 info.size = mddev->size;
284ae7ca
N
2956 if (info.size != mddev->size) /* overflow */
2957 info.size = -1;
1da177e4
LT
2958 info.nr_disks = nr;
2959 info.raid_disks = mddev->raid_disks;
2960 info.md_minor = mddev->md_minor;
2961 info.not_persistent= !mddev->persistent;
2962
2963 info.utime = mddev->utime;
2964 info.state = 0;
2965 if (mddev->in_sync)
2966 info.state = (1<<MD_SB_CLEAN);
36fa3063
N
2967 if (mddev->bitmap && mddev->bitmap_offset)
2968 info.state = (1<<MD_SB_BITMAP_PRESENT);
1da177e4
LT
2969 info.active_disks = active;
2970 info.working_disks = working;
2971 info.failed_disks = failed;
2972 info.spare_disks = spare;
2973
2974 info.layout = mddev->layout;
2975 info.chunk_size = mddev->chunk_size;
2976
2977 if (copy_to_user(arg, &info, sizeof(info)))
2978 return -EFAULT;
2979
2980 return 0;
2981}
2982
87162a28 2983static int get_bitmap_file(mddev_t * mddev, void __user * arg)
32a7627c
N
2984{
2985 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
2986 char *ptr, *buf = NULL;
2987 int err = -ENOMEM;
2988
2989 file = kmalloc(sizeof(*file), GFP_KERNEL);
2990 if (!file)
2991 goto out;
2992
2993 /* bitmap disabled, zero the first byte and copy out */
2994 if (!mddev->bitmap || !mddev->bitmap->file) {
2995 file->pathname[0] = '\0';
2996 goto copy_out;
2997 }
2998
2999 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3000 if (!buf)
3001 goto out;
3002
3003 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3004 if (!ptr)
3005 goto out;
3006
3007 strcpy(file->pathname, ptr);
3008
3009copy_out:
3010 err = 0;
3011 if (copy_to_user(arg, file, sizeof(*file)))
3012 err = -EFAULT;
3013out:
3014 kfree(buf);
3015 kfree(file);
3016 return err;
3017}
3018
1da177e4
LT
3019static int get_disk_info(mddev_t * mddev, void __user * arg)
3020{
3021 mdu_disk_info_t info;
3022 unsigned int nr;
3023 mdk_rdev_t *rdev;
3024
3025 if (copy_from_user(&info, arg, sizeof(info)))
3026 return -EFAULT;
3027
3028 nr = info.number;
3029
3030 rdev = find_rdev_nr(mddev, nr);
3031 if (rdev) {
3032 info.major = MAJOR(rdev->bdev->bd_dev);
3033 info.minor = MINOR(rdev->bdev->bd_dev);
3034 info.raid_disk = rdev->raid_disk;
3035 info.state = 0;
b2d444d7 3036 if (test_bit(Faulty, &rdev->flags))
1da177e4 3037 info.state |= (1<<MD_DISK_FAULTY);
b2d444d7 3038 else if (test_bit(In_sync, &rdev->flags)) {
1da177e4
LT
3039 info.state |= (1<<MD_DISK_ACTIVE);
3040 info.state |= (1<<MD_DISK_SYNC);
3041 }
8ddf9efe
N
3042 if (test_bit(WriteMostly, &rdev->flags))
3043 info.state |= (1<<MD_DISK_WRITEMOSTLY);
1da177e4
LT
3044 } else {
3045 info.major = info.minor = 0;
3046 info.raid_disk = -1;
3047 info.state = (1<<MD_DISK_REMOVED);
3048 }
3049
3050 if (copy_to_user(arg, &info, sizeof(info)))
3051 return -EFAULT;
3052
3053 return 0;
3054}
3055
3056static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3057{
3058 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3059 mdk_rdev_t *rdev;
3060 dev_t dev = MKDEV(info->major,info->minor);
3061
3062 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3063 return -EOVERFLOW;
3064
3065 if (!mddev->raid_disks) {
3066 int err;
3067 /* expecting a device which has a superblock */
3068 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3069 if (IS_ERR(rdev)) {
3070 printk(KERN_WARNING
3071 "md: md_import_device returned %ld\n",
3072 PTR_ERR(rdev));
3073 return PTR_ERR(rdev);
3074 }
3075 if (!list_empty(&mddev->disks)) {
3076 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3077 mdk_rdev_t, same_set);
3078 int err = super_types[mddev->major_version]
3079 .load_super(rdev, rdev0, mddev->minor_version);
3080 if (err < 0) {
3081 printk(KERN_WARNING
3082 "md: %s has different UUID to %s\n",
3083 bdevname(rdev->bdev,b),
3084 bdevname(rdev0->bdev,b2));
3085 export_rdev(rdev);
3086 return -EINVAL;
3087 }
3088 }
3089 err = bind_rdev_to_array(rdev, mddev);
3090 if (err)
3091 export_rdev(rdev);
3092 return err;
3093 }
3094
3095 /*
3096 * add_new_disk can be used once the array is assembled
3097 * to add "hot spares". They must already have a superblock
3098 * written
3099 */
3100 if (mddev->pers) {
3101 int err;
3102 if (!mddev->pers->hot_add_disk) {
3103 printk(KERN_WARNING
3104 "%s: personality does not support diskops!\n",
3105 mdname(mddev));
3106 return -EINVAL;
3107 }
7b1e35f6
N
3108 if (mddev->persistent)
3109 rdev = md_import_device(dev, mddev->major_version,
3110 mddev->minor_version);
3111 else
3112 rdev = md_import_device(dev, -1, -1);
1da177e4
LT
3113 if (IS_ERR(rdev)) {
3114 printk(KERN_WARNING
3115 "md: md_import_device returned %ld\n",
3116 PTR_ERR(rdev));
3117 return PTR_ERR(rdev);
3118 }
41158c7e
N
3119 /* set save_raid_disk if appropriate */
3120 if (!mddev->persistent) {
3121 if (info->state & (1<<MD_DISK_SYNC) &&
3122 info->raid_disk < mddev->raid_disks)
3123 rdev->raid_disk = info->raid_disk;
3124 else
3125 rdev->raid_disk = -1;
3126 } else
3127 super_types[mddev->major_version].
3128 validate_super(mddev, rdev);
3129 rdev->saved_raid_disk = rdev->raid_disk;
3130
b2d444d7 3131 clear_bit(In_sync, &rdev->flags); /* just to be sure */
8ddf9efe
N
3132 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3133 set_bit(WriteMostly, &rdev->flags);
3134
1da177e4
LT
3135 rdev->raid_disk = -1;
3136 err = bind_rdev_to_array(rdev, mddev);
3137 if (err)
3138 export_rdev(rdev);
c361777f
N
3139
3140 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
005eca5e 3141 md_wakeup_thread(mddev->thread);
1da177e4
LT
3142 return err;
3143 }
3144
3145 /* otherwise, add_new_disk is only allowed
3146 * for major_version==0 superblocks
3147 */
3148 if (mddev->major_version != 0) {
3149 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
3150 mdname(mddev));
3151 return -EINVAL;
3152 }
3153
3154 if (!(info->state & (1<<MD_DISK_FAULTY))) {
3155 int err;
3156 rdev = md_import_device (dev, -1, 0);
3157 if (IS_ERR(rdev)) {
3158 printk(KERN_WARNING
3159 "md: error, md_import_device() returned %ld\n",
3160 PTR_ERR(rdev));
3161 return PTR_ERR(rdev);
3162 }
3163 rdev->desc_nr = info->number;
3164 if (info->raid_disk < mddev->raid_disks)
3165 rdev->raid_disk = info->raid_disk;
3166 else
3167 rdev->raid_disk = -1;
3168
b2d444d7
N
3169 rdev->flags = 0;
3170
1da177e4 3171 if (rdev->raid_disk < mddev->raid_disks)
b2d444d7
N
3172 if (info->state & (1<<MD_DISK_SYNC))
3173 set_bit(In_sync, &rdev->flags);
1da177e4 3174
8ddf9efe
N
3175 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3176 set_bit(WriteMostly, &rdev->flags);
3177
1da177e4
LT
3178 if (!mddev->persistent) {
3179 printk(KERN_INFO "md: nonpersistent superblock ...\n");
3180 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3181 } else
3182 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3183 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
3184
2bf071bf
N
3185 err = bind_rdev_to_array(rdev, mddev);
3186 if (err) {
3187 export_rdev(rdev);
3188 return err;
3189 }
1da177e4
LT
3190 }
3191
3192 return 0;
3193}
3194
3195static int hot_remove_disk(mddev_t * mddev, dev_t dev)
3196{
3197 char b[BDEVNAME_SIZE];
3198 mdk_rdev_t *rdev;
3199
3200 if (!mddev->pers)
3201 return -ENODEV;
3202
3203 rdev = find_rdev(mddev, dev);
3204 if (!rdev)
3205 return -ENXIO;
3206
3207 if (rdev->raid_disk >= 0)
3208 goto busy;
3209
3210 kick_rdev_from_array(rdev);
3211 md_update_sb(mddev);
d7603b7e 3212 md_new_event(mddev);
1da177e4
LT
3213
3214 return 0;
3215busy:
3216 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
3217 bdevname(rdev->bdev,b), mdname(mddev));
3218 return -EBUSY;
3219}
3220
3221static int hot_add_disk(mddev_t * mddev, dev_t dev)
3222{
3223 char b[BDEVNAME_SIZE];
3224 int err;
3225 unsigned int size;
3226 mdk_rdev_t *rdev;
3227
3228 if (!mddev->pers)
3229 return -ENODEV;
3230
3231 if (mddev->major_version != 0) {
3232 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
3233 " version-0 superblocks.\n",
3234 mdname(mddev));
3235 return -EINVAL;
3236 }
3237 if (!mddev->pers->hot_add_disk) {
3238 printk(KERN_WARNING
3239 "%s: personality does not support diskops!\n",
3240 mdname(mddev));
3241 return -EINVAL;
3242 }
3243
3244 rdev = md_import_device (dev, -1, 0);
3245 if (IS_ERR(rdev)) {
3246 printk(KERN_WARNING
3247 "md: error, md_import_device() returned %ld\n",
3248 PTR_ERR(rdev));
3249 return -EINVAL;
3250 }
3251
3252 if (mddev->persistent)
3253 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3254 else
3255 rdev->sb_offset =
3256 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3257
3258 size = calc_dev_size(rdev, mddev->chunk_size);
3259 rdev->size = size;
3260
b2d444d7 3261 if (test_bit(Faulty, &rdev->flags)) {
1da177e4
LT
3262 printk(KERN_WARNING
3263 "md: can not hot-add faulty %s disk to %s!\n",
3264 bdevname(rdev->bdev,b), mdname(mddev));
3265 err = -EINVAL;
3266 goto abort_export;
3267 }
b2d444d7 3268 clear_bit(In_sync, &rdev->flags);
1da177e4 3269 rdev->desc_nr = -1;
2bf071bf
N
3270 err = bind_rdev_to_array(rdev, mddev);
3271 if (err)
3272 goto abort_export;
1da177e4
LT
3273
3274 /*
3275 * The rest should better be atomic, we can have disk failures
3276 * noticed in interrupt contexts ...
3277 */
3278
3279 if (rdev->desc_nr == mddev->max_disks) {
3280 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
3281 mdname(mddev));
3282 err = -EBUSY;
3283 goto abort_unbind_export;
3284 }
3285
3286 rdev->raid_disk = -1;
3287
3288 md_update_sb(mddev);
3289
3290 /*
3291 * Kick recovery, maybe this spare has to be added to the
3292 * array immediately.
3293 */
3294 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3295 md_wakeup_thread(mddev->thread);
d7603b7e 3296 md_new_event(mddev);
1da177e4
LT
3297 return 0;
3298
3299abort_unbind_export:
3300 unbind_rdev_from_array(rdev);
3301
3302abort_export:
3303 export_rdev(rdev);
3304 return err;
3305}
3306
32a7627c
N
3307/* similar to deny_write_access, but accounts for our holding a reference
3308 * to the file ourselves */
3309static int deny_bitmap_write_access(struct file * file)
3310{
3311 struct inode *inode = file->f_mapping->host;
3312
3313 spin_lock(&inode->i_lock);
3314 if (atomic_read(&inode->i_writecount) > 1) {
3315 spin_unlock(&inode->i_lock);
3316 return -ETXTBSY;
3317 }
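 /* a negative i_writecount denies further write access, just as
 * deny_write_access() does for files being executed */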
3318 atomic_set(&inode->i_writecount, -1);
3319 spin_unlock(&inode->i_lock);
3320
3321 return 0;
3322}
3323
3324static int set_bitmap_file(mddev_t *mddev, int fd)
3325{
3326 int err;
3327
36fa3063
N
3328 if (mddev->pers) {
3329 if (!mddev->pers->quiesce)
3330 return -EBUSY;
3331 if (mddev->recovery || mddev->sync_thread)
3332 return -EBUSY;
3333 /* we should be able to change the bitmap.. */
3334 }
32a7627c 3335
32a7627c 3336
36fa3063
N
3337 if (fd >= 0) {
3338 if (mddev->bitmap)
3339 return -EEXIST; /* cannot add when bitmap is present */
3340 mddev->bitmap_file = fget(fd);
32a7627c 3341
36fa3063
N
3342 if (mddev->bitmap_file == NULL) {
3343 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
3344 mdname(mddev));
3345 return -EBADF;
3346 }
3347
3348 err = deny_bitmap_write_access(mddev->bitmap_file);
3349 if (err) {
3350 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
3351 mdname(mddev));
3352 fput(mddev->bitmap_file);
3353 mddev->bitmap_file = NULL;
3354 return err;
3355 }
a654b9d8 3356 mddev->bitmap_offset = 0; /* file overrides offset */
36fa3063
N
3357 } else if (mddev->bitmap == NULL)
3358 return -ENOENT; /* cannot remove what isn't there */
3359 err = 0;
3360 if (mddev->pers) {
3361 mddev->pers->quiesce(mddev, 1);
3362 if (fd >= 0)
3363 err = bitmap_create(mddev);
3364 if (fd < 0 || err)
3365 bitmap_destroy(mddev);
3366 mddev->pers->quiesce(mddev, 0);
3367 } else if (fd < 0) {
3368 if (mddev->bitmap_file)
3369 fput(mddev->bitmap_file);
3370 mddev->bitmap_file = NULL;
3371 }
3372
32a7627c
N
3373 return err;
3374}
3375
1da177e4
LT
3376/*
3377 * set_array_info is used in two different ways.
3378 * The original usage is when creating a new array.
3379 * In this usage, raid_disks is > 0 and, together with
3380 * level, size, not_persistent, layout and chunksize, it determines
3381 * the shape of the array.
3382 * This will always create an array with a type-0.90.0 superblock.
3383 * The newer usage is when assembling an array.
3384 * In this case raid_disks will be 0, and the major_version field is
3385 * used to determine which style of super-blocks are to be found on the devices.
3386 * The minor and patch _version numbers are also kept in case the
3387 * super_block handler wishes to interpret them.
3388 */
3389static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
3390{
3391
3392 if (info->raid_disks == 0) {
3393 /* just setting version number for superblock loading */
3394 if (info->major_version < 0 ||
3395 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
3396 super_types[info->major_version].name == NULL) {
3397 /* maybe try to auto-load a module? */
3398 printk(KERN_INFO
3399 "md: superblock version %d not known\n",
3400 info->major_version);
3401 return -EINVAL;
3402 }
3403 mddev->major_version = info->major_version;
3404 mddev->minor_version = info->minor_version;
3405 mddev->patch_version = info->patch_version;
3406 return 0;
3407 }
3408 mddev->major_version = MD_MAJOR_VERSION;
3409 mddev->minor_version = MD_MINOR_VERSION;
3410 mddev->patch_version = MD_PATCHLEVEL_VERSION;
3411 mddev->ctime = get_seconds();
3412
3413 mddev->level = info->level;
17115e03 3414 mddev->clevel[0] = 0;
1da177e4
LT
3415 mddev->size = info->size;
3416 mddev->raid_disks = info->raid_disks;
3417 /* don't set md_minor, it is determined by which /dev/md* was
3418 * opened
3419 */
3420 if (info->state & (1<<MD_SB_CLEAN))
3421 mddev->recovery_cp = MaxSector;
3422 else
3423 mddev->recovery_cp = 0;
3424 mddev->persistent = ! info->not_persistent;
3425
3426 mddev->layout = info->layout;
3427 mddev->chunk_size = info->chunk_size;
3428
3429 mddev->max_disks = MD_SB_DISKS;
3430
3431 mddev->sb_dirty = 1;
3432
b2a2703c
N
3433 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
3434 mddev->bitmap_offset = 0;
3435
1da177e4
LT
3436 /*
3437 * Generate a 128 bit UUID
3438 */
3439 get_random_bytes(mddev->uuid, 16);
3440
3441 return 0;
3442}
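
/*
 * Illustrative userspace sketch of the two usages described above (not
 * kernel code; the fd and values are hypothetical):
 *
 *	mdu_array_info_t info = { 0 };
 *	info.level = 5; info.size = 0; info.raid_disks = 3;
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);	// create, 0.90.0 superblock
 *
 *	info.raid_disks = 0; info.major_version = 1;
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);	// assemble from superblocks
 */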
3443
a35b0d69
N
3444static int update_size(mddev_t *mddev, unsigned long size)
3445{
3446 mdk_rdev_t * rdev;
3447 int rv;
3448 struct list_head *tmp;
3449
3450 if (mddev->pers->resize == NULL)
3451 return -EINVAL;
3452 /* The "size" is the amount of each device that is used.
3453 * This can only make sense for arrays with redundancy;
3454 * linear and raid0 always use whatever space is available.
3455 * We can only consider changing the size if no resync
3456 * or reconstruction is happening, and if the new size
3457 * is acceptable. It must fit before the sb_offset or,
3458 * if that is <data_offset, it must fit before the
3459 * size of each device.
3460 * If size is zero, we find the largest size that fits.
3461 */
3462 if (mddev->sync_thread)
3463 return -EBUSY;
3464 ITERATE_RDEV(mddev,rdev,tmp) {
3465 sector_t avail;
3466 int fit = (size == 0);
3467 if (rdev->sb_offset > rdev->data_offset)
3468 avail = (rdev->sb_offset*2) - rdev->data_offset;
3469 else
3470 avail = get_capacity(rdev->bdev->bd_disk)
3471 - rdev->data_offset;
3472 if (fit && (size == 0 || size > avail/2))
3473 size = avail/2;
3474 if (avail < ((sector_t)size << 1))
3475 return -ENOSPC;
3476 }
3477 rv = mddev->pers->resize(mddev, (sector_t)size *2);
3478 if (!rv) {
3479 struct block_device *bdev;
3480
3481 bdev = bdget_disk(mddev->gendisk, 0);
3482 if (bdev) {
1b1dcc1b 3483 mutex_lock(&bdev->bd_inode->i_mutex);
6d89332b 3484 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
1b1dcc1b 3485 mutex_unlock(&bdev->bd_inode->i_mutex);
a35b0d69
N
3486 bdput(bdev);
3487 }
3488 }
3489 return rv;
3490}
3491
da943b99
N
3492static int update_raid_disks(mddev_t *mddev, int raid_disks)
3493{
3494 int rv;
3495 /* change the number of raid disks */
3496 if (mddev->pers->reshape == NULL)
3497 return -EINVAL;
3498 if (raid_disks <= 0 ||
3499 raid_disks >= mddev->max_disks)
3500 return -EINVAL;
3501 if (mddev->sync_thread)
3502 return -EBUSY;
3503 rv = mddev->pers->reshape(mddev, raid_disks);
da943b99
N
3504 return rv;
3505}
3506
3507
1da177e4
LT
3508/*
3509 * update_array_info is used to change the configuration of an
3510 * on-line array.
3511 * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size
3512 * fields in the info are checked against the array.
3513 * Any differences that cannot be handled will cause an error.
3514 * Normally, only one change can be managed at a time.
3515 */
3516static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
3517{
3518 int rv = 0;
3519 int cnt = 0;
36fa3063
N
3520 int state = 0;
3521
3522 /* calculate expected state, ignoring low bits */
3523 if (mddev->bitmap && mddev->bitmap_offset)
3524 state |= (1 << MD_SB_BITMAP_PRESENT);
1da177e4
LT
3525
3526 if (mddev->major_version != info->major_version ||
3527 mddev->minor_version != info->minor_version ||
3528/* mddev->patch_version != info->patch_version || */
3529 mddev->ctime != info->ctime ||
3530 mddev->level != info->level ||
3531/* mddev->layout != info->layout || */
3532 !mddev->persistent != info->not_persistent||
36fa3063
N
3533 mddev->chunk_size != info->chunk_size ||
3534 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
3535 ((state^info->state) & 0xfffffe00)
3536 )
1da177e4
LT
3537 return -EINVAL;
3538 /* Check there is only one change */
284ae7ca 3539 if (info->size >= 0 && mddev->size != info->size) cnt++;
1da177e4
LT
3540 if (mddev->raid_disks != info->raid_disks) cnt++;
3541 if (mddev->layout != info->layout) cnt++;
36fa3063 3542 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
1da177e4
LT
3543 if (cnt == 0) return 0;
3544 if (cnt > 1) return -EINVAL;
3545
3546 if (mddev->layout != info->layout) {
3547 /* Change layout
3548 * we don't need to do anything at the md level, the
3549 * personality will take care of it all.
3550 */
3551 if (mddev->pers->reconfig == NULL)
3552 return -EINVAL;
3553 else
3554 return mddev->pers->reconfig(mddev, info->layout, -1);
3555 }
284ae7ca 3556 if (info->size >= 0 && mddev->size != info->size)
a35b0d69
N
3557 rv = update_size(mddev, info->size);
3558
da943b99
N
3559 if (mddev->raid_disks != info->raid_disks)
3560 rv = update_raid_disks(mddev, info->raid_disks);
3561
36fa3063
N
3562 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
3563 if (mddev->pers->quiesce == NULL)
3564 return -EINVAL;
3565 if (mddev->recovery || mddev->sync_thread)
3566 return -EBUSY;
3567 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
3568 /* add the bitmap */
3569 if (mddev->bitmap)
3570 return -EEXIST;
3571 if (mddev->default_bitmap_offset == 0)
3572 return -EINVAL;
3573 mddev->bitmap_offset = mddev->default_bitmap_offset;
3574 mddev->pers->quiesce(mddev, 1);
3575 rv = bitmap_create(mddev);
3576 if (rv)
3577 bitmap_destroy(mddev);
3578 mddev->pers->quiesce(mddev, 0);
3579 } else {
3580 /* remove the bitmap */
3581 if (!mddev->bitmap)
3582 return -ENOENT;
3583 if (mddev->bitmap->file)
3584 return -EINVAL;
3585 mddev->pers->quiesce(mddev, 1);
3586 bitmap_destroy(mddev);
3587 mddev->pers->quiesce(mddev, 0);
3588 mddev->bitmap_offset = 0;
3589 }
3590 }
1da177e4
LT
3591 md_update_sb(mddev);
3592 return rv;
3593}
3594
3595static int set_disk_faulty(mddev_t *mddev, dev_t dev)
3596{
3597 mdk_rdev_t *rdev;
3598
3599 if (mddev->pers == NULL)
3600 return -ENODEV;
3601
3602 rdev = find_rdev(mddev, dev);
3603 if (!rdev)
3604 return -ENODEV;
3605
3606 md_error(mddev, rdev);
3607 return 0;
3608}
3609
a885c8c4
CH
3610static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
3611{
3612 mddev_t *mddev = bdev->bd_disk->private_data;
3613
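 /* fake CHS geometry: 2 heads * 4 sectors = 8 sectors per cylinder,
 * hence the capacity / 8 cylinder count below */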
3614 geo->heads = 2;
3615 geo->sectors = 4;
3616 geo->cylinders = get_capacity(mddev->gendisk) / 8;
3617 return 0;
3618}
3619
1da177e4
LT
3620static int md_ioctl(struct inode *inode, struct file *file,
3621 unsigned int cmd, unsigned long arg)
3622{
3623 int err = 0;
3624 void __user *argp = (void __user *)arg;
1da177e4
LT
3625 mddev_t *mddev = NULL;
3626
3627 if (!capable(CAP_SYS_ADMIN))
3628 return -EACCES;
3629
3630 /*
3631 * Commands dealing with the RAID driver but not any
3632 * particular array:
3633 */
3634 switch (cmd)
3635 {
3636 case RAID_VERSION:
3637 err = get_version(argp);
3638 goto done;
3639
3640 case PRINT_RAID_DEBUG:
3641 err = 0;
3642 md_print_devices();
3643 goto done;
3644
3645#ifndef MODULE
3646 case RAID_AUTORUN:
3647 err = 0;
3648 autostart_arrays(arg);
3649 goto done;
3650#endif
3651 default:;
3652 }
3653
3654 /*
3655 * Commands creating/starting a new array:
3656 */
3657
3658 mddev = inode->i_bdev->bd_disk->private_data;
3659
3660 if (!mddev) {
3661 BUG();
3662 goto abort;
3663 }
3664
3665
3666 if (cmd == START_ARRAY) {
3667 /* START_ARRAY doesn't need to lock the array as autostart_array
3668 * does the locking, and it could even be a different array
3669 */
3670 static int cnt = 3;
3671 if (cnt > 0 ) {
3672 printk(KERN_WARNING
3673 "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
e8a00334 3674 "This will not be supported beyond July 2006\n",
1da177e4
LT
3675 current->comm, current->pid);
3676 cnt--;
3677 }
3678 err = autostart_array(new_decode_dev(arg));
3679 if (err) {
3680 printk(KERN_WARNING "md: autostart failed!\n");
3681 goto abort;
3682 }
3683 goto done;
3684 }
3685
3686 err = mddev_lock(mddev);
3687 if (err) {
3688 printk(KERN_INFO
3689 "md: ioctl lock interrupted, reason %d, cmd %d\n",
3690 err, cmd);
3691 goto abort;
3692 }
3693
3694 switch (cmd)
3695 {
3696 case SET_ARRAY_INFO:
3697 {
3698 mdu_array_info_t info;
3699 if (!arg)
3700 memset(&info, 0, sizeof(info));
3701 else if (copy_from_user(&info, argp, sizeof(info))) {
3702 err = -EFAULT;
3703 goto abort_unlock;
3704 }
3705 if (mddev->pers) {
3706 err = update_array_info(mddev, &info);
3707 if (err) {
3708 printk(KERN_WARNING "md: couldn't update"
3709 " array info. %d\n", err);
3710 goto abort_unlock;
3711 }
3712 goto done_unlock;
3713 }
3714 if (!list_empty(&mddev->disks)) {
3715 printk(KERN_WARNING
3716 "md: array %s already has disks!\n",
3717 mdname(mddev));
3718 err = -EBUSY;
3719 goto abort_unlock;
3720 }
3721 if (mddev->raid_disks) {
3722 printk(KERN_WARNING
3723 "md: array %s already initialised!\n",
3724 mdname(mddev));
3725 err = -EBUSY;
3726 goto abort_unlock;
3727 }
3728 err = set_array_info(mddev, &info);
3729 if (err) {
3730 printk(KERN_WARNING "md: couldn't set"
3731 " array info. %d\n", err);
3732 goto abort_unlock;
3733 }
3734 }
3735 goto done_unlock;
3736
3737 default:;
3738 }
3739
3740 /*
3741 * Commands querying/configuring an existing array:
3742 */
32a7627c
N
3743 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
3744 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
3745 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
3746 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
1da177e4
LT
3747 err = -ENODEV;
3748 goto abort_unlock;
3749 }
3750
3751 /*
3752 * Commands even a read-only array can execute:
3753 */
3754 switch (cmd)
3755 {
3756 case GET_ARRAY_INFO:
3757 err = get_array_info(mddev, argp);
3758 goto done_unlock;
3759
32a7627c 3760 case GET_BITMAP_FILE:
87162a28 3761 err = get_bitmap_file(mddev, argp);
32a7627c
N
3762 goto done_unlock;
3763
1da177e4
LT
3764 case GET_DISK_INFO:
3765 err = get_disk_info(mddev, argp);
3766 goto done_unlock;
3767
3768 case RESTART_ARRAY_RW:
3769 err = restart_array(mddev);
3770 goto done_unlock;
3771
3772 case STOP_ARRAY:
3773 err = do_md_stop (mddev, 0);
3774 goto done_unlock;
3775
3776 case STOP_ARRAY_RO:
3777 err = do_md_stop (mddev, 1);
3778 goto done_unlock;
3779
3780 /*
3781 * We have a problem here: there is no easy way to give a CHS
3782 * virtual geometry. We currently pretend that we have 2 heads,
3783 * 4 sectors (with a BIG number of cylinders...). This drives
3784 * dosfs just mad... ;-)
3785 */
1da177e4
LT
3786 }
3787
3788 /*
3789 * The remaining ioctls are changing the state of the
f91de92e
N
3790 * superblock, so we do not allow them on read-only arrays.
3791 * However non-MD ioctls (e.g. get-size) will still come through
3792 * here and hit the 'default' below, so only disallow
3793 * 'md' ioctls, and switch to rw mode if started auto-readonly.
1da177e4 3794 */
f91de92e
N
3795 if (_IOC_TYPE(cmd) == MD_MAJOR &&
3796 mddev->ro && mddev->pers) {
3797 if (mddev->ro == 2) {
3798 mddev->ro = 0;
3799 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3800 md_wakeup_thread(mddev->thread);
3801
3802 } else {
3803 err = -EROFS;
3804 goto abort_unlock;
3805 }
1da177e4
LT
3806 }
3807
3808 switch (cmd)
3809 {
3810 case ADD_NEW_DISK:
3811 {
3812 mdu_disk_info_t info;
3813 if (copy_from_user(&info, argp, sizeof(info)))
3814 err = -EFAULT;
3815 else
3816 err = add_new_disk(mddev, &info);
3817 goto done_unlock;
3818 }
3819
3820 case HOT_REMOVE_DISK:
3821 err = hot_remove_disk(mddev, new_decode_dev(arg));
3822 goto done_unlock;
3823
3824 case HOT_ADD_DISK:
3825 err = hot_add_disk(mddev, new_decode_dev(arg));
3826 goto done_unlock;
3827
3828 case SET_DISK_FAULTY:
3829 err = set_disk_faulty(mddev, new_decode_dev(arg));
3830 goto done_unlock;
3831
3832 case RUN_ARRAY:
3833 err = do_md_run (mddev);
3834 goto done_unlock;
3835
32a7627c
N
3836 case SET_BITMAP_FILE:
3837 err = set_bitmap_file(mddev, (int)arg);
3838 goto done_unlock;
3839
1da177e4
LT
3840 default:
3841 if (_IOC_TYPE(cmd) == MD_MAJOR)
3842 printk(KERN_WARNING "md: %s(pid %d) used"
3843 " obsolete MD ioctl, upgrade your"
3844 " software to use new ictls.\n",
3845 current->comm, current->pid);
3846 err = -EINVAL;
3847 goto abort_unlock;
3848 }
3849
3850done_unlock:
3851abort_unlock:
3852 mddev_unlock(mddev);
3853
3854 return err;
3855done:
3856 if (err)
3857 MD_BUG();
3858abort:
3859 return err;
3860}
3861
3862static int md_open(struct inode *inode, struct file *file)
3863{
3864 /*
3865 * Succeed if we can lock the mddev, which confirms that
3866 * it isn't being stopped right now.
3867 */
3868 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
3869 int err;
3870
3871 if ((err = mddev_lock(mddev)))
3872 goto out;
3873
3874 err = 0;
3875 mddev_get(mddev);
3876 mddev_unlock(mddev);
3877
3878 check_disk_change(inode->i_bdev);
3879 out:
3880 return err;
3881}
3882
3883static int md_release(struct inode *inode, struct file * file)
3884{
3885 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
3886
3887 if (!mddev)
3888 BUG();
3889 mddev_put(mddev);
3890
3891 return 0;
3892}
3893
3894static int md_media_changed(struct gendisk *disk)
3895{
3896 mddev_t *mddev = disk->private_data;
3897
3898 return mddev->changed;
3899}
3900
3901static int md_revalidate(struct gendisk *disk)
3902{
3903 mddev_t *mddev = disk->private_data;
3904
3905 mddev->changed = 0;
3906 return 0;
3907}
3908static struct block_device_operations md_fops =
3909{
3910 .owner = THIS_MODULE,
3911 .open = md_open,
3912 .release = md_release,
3913 .ioctl = md_ioctl,
a885c8c4 3914 .getgeo = md_getgeo,
1da177e4
LT
3915 .media_changed = md_media_changed,
3916 .revalidate_disk= md_revalidate,
3917};
3918
75c96f85 3919static int md_thread(void * arg)
1da177e4
LT
3920{
3921 mdk_thread_t *thread = arg;
3922
1da177e4
LT
3923 /*
3924 * md_thread is a 'system-thread'; its priority should be very
3925 * high. We avoid resource deadlocks individually in each
3926 * raid personality. (RAID5 does preallocation) We also use RR and
3927 * the very same RT priority as kswapd, thus we will never get
3928 * into a priority inversion deadlock.
3929 *
3930 * we definitely have to have equal or higher priority than
3931 * bdflush, otherwise bdflush will deadlock if there are too
3932 * many dirty RAID5 blocks.
3933 */
1da177e4 3934
6985c43f 3935 allow_signal(SIGKILL);
a6fb0934 3936 while (!kthread_should_stop()) {
1da177e4 3937
93588e22
N
3938 /* We need to wait INTERRUPTIBLE so that
3939 * we don't add to the load-average.
3940 * That means we need to be sure no signals are
3941 * pending
3942 */
3943 if (signal_pending(current))
3944 flush_signals(current);
3945
3946 wait_event_interruptible_timeout
3947 (thread->wqueue,
3948 test_bit(THREAD_WAKEUP, &thread->flags)
3949 || kthread_should_stop(),
3950 thread->timeout);
3951 try_to_freeze();
3952
3953 clear_bit(THREAD_WAKEUP, &thread->flags);
3954
3955 thread->run(thread->mddev);
3956 }
3957
3958 return 0;
3959}
3960
3961void md_wakeup_thread(mdk_thread_t *thread)
3962{
3963 if (thread) {
3964 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
3965 set_bit(THREAD_WAKEUP, &thread->flags);
3966 wake_up(&thread->wqueue);
3967 }
3968}
3969
3970mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
3971 const char *name)
3972{
3973 mdk_thread_t *thread;
3974
3975 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
3976 if (!thread)
3977 return NULL;
3978
3979 init_waitqueue_head(&thread->wqueue);
3980
3981 thread->run = run;
3982 thread->mddev = mddev;
3983 thread->timeout = MAX_SCHEDULE_TIMEOUT;
3984 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
3985 if (IS_ERR(thread->tsk)) {
3986 kfree(thread);
3987 return NULL;
3988 }
3989 return thread;
3990}
3991
3992void md_unregister_thread(mdk_thread_t *thread)
3993{
3994 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
3995
3996 kthread_stop(thread->tsk);
3997 kfree(thread);
3998}
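
/* Illustrative sketch only (kept out of the build): how a personality
 * would typically drive the thread API above.  example_daemon and
 * example_run are hypothetical names, not part of md itself.
 */
#if 0
static void example_daemon(mddev_t *mddev)
{
	/* handle whatever work THREAD_WAKEUP signalled */
}

static int example_run(mddev_t *mddev)
{
	mddev->thread = md_register_thread(example_daemon, mddev, "%s_example");
	if (!mddev->thread)
		return -ENOMEM;
	md_wakeup_thread(mddev->thread);	/* kick the first pass */
	/* ... and on shutdown: */
	md_unregister_thread(mddev->thread);
	return 0;
}
#endif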
3999
4000void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4001{
4002 if (!mddev) {
4003 MD_BUG();
4004 return;
4005 }
4006
4007 if (!rdev || test_bit(Faulty, &rdev->flags))
4008 return;
4009/*
4010 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4011 mdname(mddev),
4012 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
4013 __builtin_return_address(0),__builtin_return_address(1),
4014 __builtin_return_address(2),__builtin_return_address(3));
4015*/
4016 if (!mddev->pers->error_handler)
4017 return;
4018 mddev->pers->error_handler(mddev,rdev);
4019 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4020 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4021 md_wakeup_thread(mddev->thread);
4022 md_new_event(mddev);
4023}
4024
4025/* seq_file implementation /proc/mdstat */
4026
4027static void status_unused(struct seq_file *seq)
4028{
4029 int i = 0;
4030 mdk_rdev_t *rdev;
4031 struct list_head *tmp;
4032
4033 seq_printf(seq, "unused devices: ");
4034
4035 ITERATE_RDEV_PENDING(rdev,tmp) {
4036 char b[BDEVNAME_SIZE];
4037 i++;
4038 seq_printf(seq, "%s ",
4039 bdevname(rdev->bdev,b));
4040 }
4041 if (!i)
4042 seq_printf(seq, "<none>");
4043
4044 seq_printf(seq, "\n");
4045}
4046
4047
4048static void status_resync(struct seq_file *seq, mddev_t * mddev)
4049{
4050 sector_t max_blocks, resync, res;
4051 unsigned long dt, db, rt;
4052 int scale;
4053 unsigned int per_milli;
4054
4055 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4056
4057 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4058 max_blocks = mddev->resync_max_sectors >> 1;
4059 else
4060 max_blocks = mddev->size;
4061
4062 /*
4063 * Should not happen.
4064 */
4065 if (!max_blocks) {
4066 MD_BUG();
4067 return;
4068 }
4069 /* Pick 'scale' such that (resync>>scale)*1000 will fit
4070 * in a sector_t, and (max_blocks>>scale) will fit in a
4071 * u32, as those are the requirements for sector_div.
4072 * Thus 'scale' must be at least 10
4073 */
4074 scale = 10;
4075 if (sizeof(sector_t) > sizeof(unsigned long)) {
4076 while ( max_blocks/2 > (1ULL<<(scale+32)))
4077 scale++;
4078 }
4079 res = (resync>>scale)*1000;
4080 sector_div(res, (u32)((max_blocks>>scale)+1));
4081
4082 per_milli = res;
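 /* Worked example (editorial): on a 64-bit sector_t with
 * max_blocks = 2^40, the loop above leaves scale at 10, since
 * max_blocks/2 = 2^39 does not exceed 1ULL<<42.  With resync = 2^39
 * (half way), res = ((2^39>>10)*1000) / ((2^40>>10)+1) ~= 500, i.e.
 * per_milli = 500 and the bar below prints 10 of its 20 positions
 * as '='.
 */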
4083 {
4084 int i, x = per_milli/50, y = 20-x;
4085 seq_printf(seq, "[");
4086 for (i = 0; i < x; i++)
4087 seq_printf(seq, "=");
4088 seq_printf(seq, ">");
4089 for (i = 0; i < y; i++)
4090 seq_printf(seq, ".");
4091 seq_printf(seq, "] ");
4092 }
4093 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4094 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4095 "reshape" :
4096 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4097 "resync" : "recovery")),
4098 per_milli/10, per_milli % 10,
4099 (unsigned long long) resync,
4100 (unsigned long long) max_blocks);
4101
4102 /*
4103 * We do not want to overflow, so the order of operands and
4104 * the * 100 / 100 trick are important. We do a +1 to be
4105 * safe against division by zero. We only estimate anyway.
4106 *
4107 * dt: time from mark until now
4108 * db: blocks written from mark until now
4109 * rt: remaining time
4110 */
4111 dt = ((jiffies - mddev->resync_mark) / HZ);
4112 if (!dt) dt++;
4113 db = resync - (mddev->resync_mark_cnt/2);
4114 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
4115
4116 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4117
4118 seq_printf(seq, " speed=%ldK/sec", db/dt);
4119}
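
/* Worked example (editorial) for the estimate above: with dt = 100s
 * since the mark, db = 20000 1K-blocks written in that window and
 * 1000000 blocks still to go, rt = (100 * (1000000/201))/100 = 4975s,
 * printed as "finish=82.9min", and speed = db/dt = 200K/sec.
 */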
4120
4121static void *md_seq_start(struct seq_file *seq, loff_t *pos)
4122{
4123 struct list_head *tmp;
4124 loff_t l = *pos;
4125 mddev_t *mddev;
4126
4127 if (l >= 0x10000)
4128 return NULL;
4129 if (!l--)
4130 /* header */
4131 return (void*)1;
4132
4133 spin_lock(&all_mddevs_lock);
4134 list_for_each(tmp,&all_mddevs)
4135 if (!l--) {
4136 mddev = list_entry(tmp, mddev_t, all_mddevs);
4137 mddev_get(mddev);
4138 spin_unlock(&all_mddevs_lock);
4139 return mddev;
4140 }
4141 spin_unlock(&all_mddevs_lock);
4142 if (!l--)
4143 return (void*)2;/* tail */
4144 return NULL;
4145}
4146
4147static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4148{
4149 struct list_head *tmp;
4150 mddev_t *next_mddev, *mddev = v;
4151
4152 ++*pos;
4153 if (v == (void*)2)
4154 return NULL;
4155
4156 spin_lock(&all_mddevs_lock);
4157 if (v == (void*)1)
4158 tmp = all_mddevs.next;
4159 else
4160 tmp = mddev->all_mddevs.next;
4161 if (tmp != &all_mddevs)
4162 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
4163 else {
4164 next_mddev = (void*)2;
4165 *pos = 0x10000;
4166 }
4167 spin_unlock(&all_mddevs_lock);
4168
4169 if (v != (void*)1)
4170 mddev_put(mddev);
4171 return next_mddev;
4172
4173}
4174
4175static void md_seq_stop(struct seq_file *seq, void *v)
4176{
4177 mddev_t *mddev = v;
4178
4179 if (mddev && v != (void*)1 && v != (void*)2)
4180 mddev_put(mddev);
4181}
4182
4183struct mdstat_info {
4184 int event;
4185};
4186
4187static int md_seq_show(struct seq_file *seq, void *v)
4188{
4189 mddev_t *mddev = v;
4190 sector_t size;
4191 struct list_head *tmp2;
4192 mdk_rdev_t *rdev;
4193 struct mdstat_info *mi = seq->private;
4194 struct bitmap *bitmap;
4195
4196 if (v == (void*)1) {
4197 struct mdk_personality *pers;
4198 seq_printf(seq, "Personalities : ");
4199 spin_lock(&pers_lock);
4200 list_for_each_entry(pers, &pers_list, list)
4201 seq_printf(seq, "[%s] ", pers->name);
4202
4203 spin_unlock(&pers_lock);
4204 seq_printf(seq, "\n");
4205 mi->event = atomic_read(&md_event_count);
4206 return 0;
4207 }
4208 if (v == (void*)2) {
4209 status_unused(seq);
4210 return 0;
4211 }
4212
4213 if (mddev_lock(mddev)!=0)
4214 return -EINTR;
4215 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
4216 seq_printf(seq, "%s : %sactive", mdname(mddev),
4217 mddev->pers ? "" : "in");
4218 if (mddev->pers) {
4219 if (mddev->ro==1)
4220 seq_printf(seq, " (read-only)");
4221 if (mddev->ro==2)
4222 seq_printf(seq, "(auto-read-only)");
4223 seq_printf(seq, " %s", mddev->pers->name);
4224 }
4225
4226 size = 0;
4227 ITERATE_RDEV(mddev,rdev,tmp2) {
4228 char b[BDEVNAME_SIZE];
4229 seq_printf(seq, " %s[%d]",
4230 bdevname(rdev->bdev,b), rdev->desc_nr);
4231 if (test_bit(WriteMostly, &rdev->flags))
4232 seq_printf(seq, "(W)");
4233 if (test_bit(Faulty, &rdev->flags)) {
4234 seq_printf(seq, "(F)");
4235 continue;
4236 } else if (rdev->raid_disk < 0)
4237 seq_printf(seq, "(S)"); /* spare */
4238 size += rdev->size;
4239 }
4240
4241 if (!list_empty(&mddev->disks)) {
4242 if (mddev->pers)
4243 seq_printf(seq, "\n %llu blocks",
4244 (unsigned long long)mddev->array_size);
4245 else
4246 seq_printf(seq, "\n %llu blocks",
4247 (unsigned long long)size);
4248 }
4249 if (mddev->persistent) {
4250 if (mddev->major_version != 0 ||
4251 mddev->minor_version != 90) {
4252 seq_printf(seq," super %d.%d",
4253 mddev->major_version,
4254 mddev->minor_version);
4255 }
4256 } else
4257 seq_printf(seq, " super non-persistent");
4258
4259 if (mddev->pers) {
4260 mddev->pers->status (seq, mddev);
4261 seq_printf(seq, "\n ");
4262 if (mddev->pers->sync_request) {
4263 if (mddev->curr_resync > 2) {
4264 status_resync (seq, mddev);
4265 seq_printf(seq, "\n ");
4266 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
4267 seq_printf(seq, "\tresync=DELAYED\n ");
4268 else if (mddev->recovery_cp < MaxSector)
4269 seq_printf(seq, "\tresync=PENDING\n ");
4270 }
4271 } else
4272 seq_printf(seq, "\n ");
4273
4274 if ((bitmap = mddev->bitmap)) {
4275 unsigned long chunk_kb;
4276 unsigned long flags;
4277 spin_lock_irqsave(&bitmap->lock, flags);
4278 chunk_kb = bitmap->chunksize >> 10;
4279 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
4280 "%lu%s chunk",
4281 bitmap->pages - bitmap->missing_pages,
4282 bitmap->pages,
4283 (bitmap->pages - bitmap->missing_pages)
4284 << (PAGE_SHIFT - 10),
4285 chunk_kb ? chunk_kb : bitmap->chunksize,
4286 chunk_kb ? "KB" : "B");
4287 if (bitmap->file) {
4288 seq_printf(seq, ", file: ");
4289 seq_path(seq, bitmap->file->f_vfsmnt,
4290 bitmap->file->f_dentry," \t\n");
4291 }
4292
4293 seq_printf(seq, "\n");
4294 spin_unlock_irqrestore(&bitmap->lock, flags);
4295 }
4296
4297 seq_printf(seq, "\n");
4298 }
4299 mddev_unlock(mddev);
4300
4301 return 0;
4302}
4303
4304static struct seq_operations md_seq_ops = {
4305 .start = md_seq_start,
4306 .next = md_seq_next,
4307 .stop = md_seq_stop,
4308 .show = md_seq_show,
4309};
4310
4311static int md_seq_open(struct inode *inode, struct file *file)
4312{
4313 int error;
4314 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
4315 if (mi == NULL)
4316 return -ENOMEM;
4317
4318 error = seq_open(file, &md_seq_ops);
4319 if (error)
4320 kfree(mi);
4321 else {
4322 struct seq_file *p = file->private_data;
4323 p->private = mi;
4324 mi->event = atomic_read(&md_event_count);
4325 }
4326 return error;
4327}
4328
4329static int md_seq_release(struct inode *inode, struct file *file)
4330{
4331 struct seq_file *m = file->private_data;
4332 struct mdstat_info *mi = m->private;
4333 m->private = NULL;
4334 kfree(mi);
4335 return seq_release(inode, file);
4336}
4337
4338static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4339{
4340 struct seq_file *m = filp->private_data;
4341 struct mdstat_info *mi = m->private;
4342 int mask;
4343
4344 poll_wait(filp, &md_event_waiters, wait);
4345
4346 /* always allow read */
4347 mask = POLLIN | POLLRDNORM;
4348
4349 if (mi->event != atomic_read(&md_event_count))
4350 mask |= POLLERR | POLLPRI;
4351 return mask;
4352}
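
#if 0
/* Illustrative user-space counterpart (editorial sketch, not built
 * with the kernel): md_new_event() bumps md_event_count, so a monitor
 * can sleep in poll() on /proc/mdstat and re-read it when POLLPRI is
 * reported.
 */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/proc/mdstat", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };

	while (fd >= 0) {
		while (read(fd, buf, sizeof(buf)) > 0)
			;			/* consume current snapshot */
		poll(&pfd, 1, -1);		/* wakes on array events */
		lseek(fd, 0, SEEK_SET);		/* rewind for the re-read */
	}
	return 0;
}
#endif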
4353
4354static struct file_operations md_seq_fops = {
4355 .open = md_seq_open,
4356 .read = seq_read,
4357 .llseek = seq_lseek,
4358 .release = md_seq_release,
4359 .poll = mdstat_poll,
4360};
4361
4362int register_md_personality(struct mdk_personality *p)
4363{
4364 spin_lock(&pers_lock);
4365 list_add_tail(&p->list, &pers_list);
4366 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
4367 spin_unlock(&pers_lock);
4368 return 0;
4369}
4370
4371int unregister_md_personality(struct mdk_personality *p)
4372{
4373 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
4374 spin_lock(&pers_lock);
4375 list_del_init(&p->list);
4376 spin_unlock(&pers_lock);
4377 return 0;
4378}
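
/* Illustrative sketch only: the registration pattern a personality
 * module follows.  "example" and level -1 are hypothetical, and the
 * field set assumes the mdk_personality layout of this kernel; a real
 * personality also fills in make_request, run, stop, status, etc.
 */
#if 0
static struct mdk_personality example_personality = {
	.name	= "example",
	.level	= -1,
	.owner	= THIS_MODULE,
};

static int __init example_init(void)
{
	return register_md_personality(&example_personality);
}

static void __exit example_exit(void)
{
	unregister_md_personality(&example_personality);
}

module_init(example_init);
module_exit(example_exit);
#endif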
4379
4380static int is_mddev_idle(mddev_t *mddev)
4381{
4382 mdk_rdev_t * rdev;
4383 struct list_head *tmp;
4384 int idle;
4385 unsigned long curr_events;
4386
4387 idle = 1;
4388 ITERATE_RDEV(mddev,rdev,tmp) {
4389 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
4390 curr_events = disk_stat_read(disk, sectors[0]) +
4391 disk_stat_read(disk, sectors[1]) -
4392 atomic_read(&disk->sync_io);
4393 /* The difference between curr_events and last_events
4394 * will be affected by any new non-sync IO (making
4395 * curr_events bigger) and any difference in the amount of
4396 * in-flight sync IO (making curr_events bigger or smaller)
4397 * The amount in-flight is currently limited to
4398 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
4399 * which is at most 4096 sectors.
4400 * These numbers are fairly fragile and should be made
4401 * more robust, probably by enforcing the
4402 * 'window size' that md_do_sync sort-of uses.
4403 *
4404 * Note: the following is an unsigned comparison.
4405 */
4406 if ((curr_events - rdev->last_events + 4096) > 8192) {
4407 rdev->last_events = curr_events;
4408 idle = 0;
4409 }
4410 }
4411 return idle;
4412}
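
/* Editorial example of the window above: the unsigned test treats any
 * difference within +/-4096 sectors as "still idle".  If 5000 sectors
 * of fresh application IO hit a member disk since the last check,
 * 5000 + 4096 > 8192 and resync throttles back; pure resync IO stays
 * within the slack and leaves the array counted as idle.
 */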
4413
4414void md_done_sync(mddev_t *mddev, int blocks, int ok)
4415{
4416 /* another "blocks" (512byte) blocks have been synced */
4417 atomic_sub(blocks, &mddev->recovery_active);
4418 wake_up(&mddev->recovery_wait);
4419 if (!ok) {
4420 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
4421 md_wakeup_thread(mddev->thread);
4422 /* stop recovery, signal do_sync ... */
4423 }
4424}
4425
4426
4427/* md_write_start(mddev, bi)
4428 * If we need to update some array metadata (e.g. 'active' flag
4429 * in superblock) before writing, schedule a superblock update
4430 * and wait for it to complete.
4431 */
4432void md_write_start(mddev_t *mddev, struct bio *bi)
4433{
4434 if (bio_data_dir(bi) != WRITE)
4435 return;
4436
4437 BUG_ON(mddev->ro == 1);
4438 if (mddev->ro == 2) {
4439 /* need to switch to read/write */
4440 mddev->ro = 0;
4441 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4442 md_wakeup_thread(mddev->thread);
4443 }
4444 atomic_inc(&mddev->writes_pending);
4445 if (mddev->in_sync) {
4446 spin_lock_irq(&mddev->write_lock);
4447 if (mddev->in_sync) {
4448 mddev->in_sync = 0;
4449 mddev->sb_dirty = 1;
4450 md_wakeup_thread(mddev->thread);
4451 }
4452 spin_unlock_irq(&mddev->write_lock);
4453 }
4454 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
4455}
4456
4457void md_write_end(mddev_t *mddev)
4458{
4459 if (atomic_dec_and_test(&mddev->writes_pending)) {
4460 if (mddev->safemode == 2)
4461 md_wakeup_thread(mddev->thread);
4462 else
4463 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
4464 }
4465}
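
#if 0
/* Illustrative call pattern only (hypothetical personality code):
 * bracket array writes with md_write_start()/md_write_end() so the
 * superblock 'active' state and the safemode timer stay correct.
 */
static int example_make_request(request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;

	md_write_start(mddev, bio);	/* may sleep to mark array active */
	/* ... remap and submit the bio to the member devices ... */
	/* in the completion path, once the write is done: */
	md_write_end(mddev);
	return 0;
}
#endif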
4466
4467static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
4468
4469#define SYNC_MARKS 10
4470#define SYNC_MARK_STEP (3*HZ)
4471void md_do_sync(mddev_t *mddev)
4472{
4473 mddev_t *mddev2;
4474 unsigned int currspeed = 0,
4475 window;
4476 sector_t max_sectors, j, io_sectors;
4477 unsigned long mark[SYNC_MARKS];
4478 sector_t mark_cnt[SYNC_MARKS];
4479 int last_mark,m;
4480 struct list_head *tmp;
4481 sector_t last_check;
4482 int skipped = 0;
4483
4484 /* just in case the thread restarts... */
4485 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
4486 return;
4487
4488 /* we overload curr_resync somewhat here.
4489 * 0 == not engaged in resync at all
4490 * 2 == checking that there is no conflict with another sync
4491 * 1 == like 2, but have yielded to allow conflicting resync to
4492 * commence
4493 * other == active in resync - this many blocks
4494 *
4495 * Before starting a resync we must have set curr_resync to
4496 * 2, and then checked that every "conflicting" array has curr_resync
4497 * less than ours. When we find one that is the same or higher
4498 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
4499 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
4500 * This will mean we have to start checking from the beginning again.
4501 *
4502 */
4503
4504 do {
4505 mddev->curr_resync = 2;
4506
4507 try_again:
4508 if (kthread_should_stop()) {
4509 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4510 goto skip;
4511 }
4512 ITERATE_MDDEV(mddev2,tmp) {
4513 if (mddev2 == mddev)
4514 continue;
4515 if (mddev2->curr_resync &&
4516 match_mddev_units(mddev,mddev2)) {
4517 DEFINE_WAIT(wq);
4518 if (mddev < mddev2 && mddev->curr_resync == 2) {
4519 /* arbitrarily yield */
4520 mddev->curr_resync = 1;
4521 wake_up(&resync_wait);
4522 }
4523 if (mddev > mddev2 && mddev->curr_resync == 1)
4524 /* no need to wait here, we can wait the next
4525 * time 'round when curr_resync == 2
4526 */
4527 continue;
4528 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
4529 if (!kthread_should_stop() &&
4530 mddev2->curr_resync >= mddev->curr_resync) {
4531 printk(KERN_INFO "md: delaying resync of %s"
4532 " until %s has finished resync (they"
4533 " share one or more physical units)\n",
4534 mdname(mddev), mdname(mddev2));
4535 mddev_put(mddev2);
4536 schedule();
4537 finish_wait(&resync_wait, &wq);
4538 goto try_again;
4539 }
4540 finish_wait(&resync_wait, &wq);
4541 }
4542 }
4543 } while (mddev->curr_resync < 2);
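
 /* Editorial example of the yield protocol: if arrays A and B share a
 * disk and both reach curr_resync == 2, the one at the lower address
 * (say A) drops to 1, wakes resync_wait, then sleeps because B (at 2)
 * is still >= its own value; B in turn sees A at 1 < 2, skips waiting
 * and proceeds with its resync.  The address-based asymmetry is what
 * prevents the two from blocking each other forever.
 */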
4544
4545 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4546 /* resync follows the size requested by the personality,
4547 * which defaults to the physical size, but can be a virtual size
4548 */
4549 max_sectors = mddev->resync_max_sectors;
4550 mddev->resync_mismatches = 0;
4551 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4552 max_sectors = mddev->size << 1;
4553 else
4554 /* recovery follows the physical size of devices */
4555 max_sectors = mddev->size << 1;
4556
4557 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
4558 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
4559 " %d KB/sec/disc.\n", speed_min(mddev));
4560 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
4561 "(but not more than %d KB/sec) for reconstruction.\n",
4562 speed_max(mddev));
4563
4564 is_mddev_idle(mddev); /* this also initializes IO event counters */
4565 /* we don't use the checkpoint if there's a bitmap */
4566 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
4567 && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4568 j = mddev->recovery_cp;
4569 else
4570 j = 0;
4571 io_sectors = 0;
4572 for (m = 0; m < SYNC_MARKS; m++) {
4573 mark[m] = jiffies;
4574 mark_cnt[m] = io_sectors;
4575 }
4576 last_mark = 0;
4577 mddev->resync_mark = mark[last_mark];
4578 mddev->resync_mark_cnt = mark_cnt[last_mark];
4579
4580 /*
4581 * Tune reconstruction:
4582 */
4583 window = 32*(PAGE_SIZE/512);
4584 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
4585 window/2,(unsigned long long) max_sectors/2);
4586
4587 atomic_set(&mddev->recovery_active, 0);
4588 init_waitqueue_head(&mddev->recovery_wait);
4589 last_check = 0;
4590
4591 if (j>2) {
4592 printk(KERN_INFO
4593 "md: resuming recovery of %s from checkpoint.\n",
4594 mdname(mddev));
4595 mddev->curr_resync = j;
4596 }
4597
4598 while (j < max_sectors) {
4599 sector_t sectors;
4600
4601 skipped = 0;
4602 sectors = mddev->pers->sync_request(mddev, j, &skipped,
4603 currspeed < speed_min(mddev));
4604 if (sectors == 0) {
4605 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
4606 goto out;
4607 }
4608
4609 if (!skipped) { /* actual IO requested */
4610 io_sectors += sectors;
4611 atomic_add(sectors, &mddev->recovery_active);
4612 }
4613
4614 j += sectors;
4615 if (j>1) mddev->curr_resync = j;
4616 if (last_check == 0)
4617 /* this is the earliest that the rebuild will be
4618 * visible in /proc/mdstat
4619 */
4620 md_new_event(mddev);
4621
4622 if (last_check + window > io_sectors || j == max_sectors)
4623 continue;
4624
4625 last_check = io_sectors;
4626
4627 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
4628 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
4629 break;
4630
4631 repeat:
4632 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
4633 /* step marks */
4634 int next = (last_mark+1) % SYNC_MARKS;
4635
4636 mddev->resync_mark = mark[next];
4637 mddev->resync_mark_cnt = mark_cnt[next];
4638 mark[next] = jiffies;
4639 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
4640 last_mark = next;
4641 }
4642
4643
4644 if (kthread_should_stop()) {
4645 /*
4646 * got a signal, exit.
4647 */
4648 printk(KERN_INFO
4649 "md: md_do_sync() got signal ... exiting\n");
4650 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4651 goto out;
4652 }
4653
4654 /*
4655 * this loop exits only when we are slower than
4656 * the 'hard' speed limit, or the system was IO-idle for
4657 * a jiffy.
4658 * the system might be non-idle CPU-wise, but we only care
4659 * about not overloading the IO subsystem. (things like an
4660 * e2fsck being done on the RAID array should execute fast)
4661 */
4662 mddev->queue->unplug_fn(mddev->queue);
4663 cond_resched();
4664
4665 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
4666 /((jiffies-mddev->resync_mark)/HZ +1) +1;
4667
4668 if (currspeed > speed_min(mddev)) {
4669 if ((currspeed > speed_max(mddev)) ||
4670 !is_mddev_idle(mddev)) {
4671 msleep(500);
4672 goto repeat;
4673 }
4674 }
4675 }
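 /* Editorial example of the throttle above: with resync_mark 30s old
 * and 36000 sectors synced since the mark, currspeed = 18000/31 + 1
 * ~= 581 KB/sec.  Below speed_min (default 1000) the sync runs
 * unthrottled; once currspeed climbs past speed_min, exceeding
 * speed_max (or a busy array) means a 500ms nap before re-checking.
 */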
4676 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
4677 /*
4678 * this also signals 'finished resyncing' to md_stop
4679 */
4680 out:
4681 mddev->queue->unplug_fn(mddev->queue);
4682
4683 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
4684
4685 /* tell personality that we are finished */
4686 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
4687
4688 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
4689 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
4690 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
4691 mddev->curr_resync > 2 &&
4692 mddev->curr_resync >= mddev->recovery_cp) {
4693 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4694 printk(KERN_INFO
4695 "md: checkpointing recovery of %s.\n",
4696 mdname(mddev));
4697 mddev->recovery_cp = mddev->curr_resync;
4698 } else
4699 mddev->recovery_cp = MaxSector;
4700 }
4701
4702 skip:
4703 mddev->curr_resync = 0;
4704 wake_up(&resync_wait);
4705 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
4706 md_wakeup_thread(mddev->thread);
4707}
4708EXPORT_SYMBOL_GPL(md_do_sync);
4709
4710
4711/*
4712 * This routine is regularly called by all per-raid-array threads to
4713 * deal with generic issues like resync and super-block update.
4714 * Raid personalities that don't have a thread (linear/raid0) do not
4715 * need this as they never do any recovery or update the superblock.
4716 *
4717 * It does not do any resync itself, but rather "forks" off other threads
4718 * to do that as needed.
4719 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
4720 * "->recovery" and create a thread at ->sync_thread.
4721 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
4722 * and wakes up this thread, which will reap the thread and finish up.
4723 * This thread also removes any faulty devices (with nr_pending == 0).
4724 *
4725 * The overall approach is:
4726 * 1/ if the superblock needs updating, update it.
4727 * 2/ If a recovery thread is running, don't do anything else.
4728 * 3/ If recovery has finished, clean up, possibly marking spares active.
4729 * 4/ If there are any faulty devices, remove them.
4730 * 5/ If array is degraded, try to add spare devices
4731 * 6/ If array has spares or is not in-sync, start a resync thread.
4732 */
4733void md_check_recovery(mddev_t *mddev)
4734{
4735 mdk_rdev_t *rdev;
4736 struct list_head *rtmp;
4737
4738
4739 if (mddev->bitmap)
4740 bitmap_daemon_work(mddev->bitmap);
4741
4742 if (mddev->ro)
4743 return;
4744
4745 if (signal_pending(current)) {
4746 if (mddev->pers->sync_request) {
4747 printk(KERN_INFO "md: %s in immediate safe mode\n",
4748 mdname(mddev));
4749 mddev->safemode = 2;
4750 }
4751 flush_signals(current);
4752 }
4753
4754 if ( ! (
4755 mddev->sb_dirty ||
4756 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
4757 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
4758 (mddev->safemode == 1) ||
4759 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
4760 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
4761 ))
4762 return;
4763
4764 if (mddev_trylock(mddev)==0) {
4765 int spares =0;
4766
4767 spin_lock_irq(&mddev->write_lock);
4768 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
4769 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
4770 mddev->in_sync = 1;
4771 mddev->sb_dirty = 1;
4772 }
4773 if (mddev->safemode == 1)
4774 mddev->safemode = 0;
4775 spin_unlock_irq(&mddev->write_lock);
4776
4777 if (mddev->sb_dirty)
4778 md_update_sb(mddev);
4779
4780
4781 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4782 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
4783 /* resync/recovery still happening */
4784 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4785 goto unlock;
4786 }
4787 if (mddev->sync_thread) {
4788 /* resync has finished, collect result */
4789 md_unregister_thread(mddev->sync_thread);
4790 mddev->sync_thread = NULL;
4791 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
4792 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4793 /* success...*/
4794 /* activate any spares */
4795 mddev->pers->spare_active(mddev);
4796 }
4797 md_update_sb(mddev);
4798
4799 /* if array is no-longer degraded, then any saved_raid_disk
4800 * information must be scrapped
4801 */
4802 if (!mddev->degraded)
4803 ITERATE_RDEV(mddev,rdev,rtmp)
4804 rdev->saved_raid_disk = -1;
4805
4806 mddev->recovery = 0;
4807 /* flag recovery needed just to double check */
4808 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4809 md_new_event(mddev);
4810 goto unlock;
4811 }
4812 /* Clear some bits that don't mean anything, but
4813 * might be left set
4814 */
4815 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4816 clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
4817 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
4818 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4819
4820 /* no recovery is running.
4821 * remove any failed drives, then
4822 * add spares if possible.
4823 * Spares are also removed and re-added, to allow
4824 * the personality to fail the re-add.
4825 */
4826 ITERATE_RDEV(mddev,rdev,rtmp)
4827 if (rdev->raid_disk >= 0 &&
4828 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
4829 atomic_read(&rdev->nr_pending)==0) {
4830 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
4831 char nm[20];
4832 sprintf(nm,"rd%d", rdev->raid_disk);
4833 sysfs_remove_link(&mddev->kobj, nm);
4834 rdev->raid_disk = -1;
4835 }
4836 }
4837
4838 if (mddev->degraded) {
4839 ITERATE_RDEV(mddev,rdev,rtmp)
4840 if (rdev->raid_disk < 0
4841 && !test_bit(Faulty, &rdev->flags)) {
4842 if (mddev->pers->hot_add_disk(mddev,rdev)) {
4843 char nm[20];
4844 sprintf(nm, "rd%d", rdev->raid_disk);
4845 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
4846 spares++;
4847 md_new_event(mddev);
4848 } else
4849 break;
4850 }
4851 }
4852
4853 if (spares) {
4854 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4855 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4856 } else if (mddev->recovery_cp < MaxSector) {
4857 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4858 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4859 /* nothing to be done ... */
4860 goto unlock;
4861
4862 if (mddev->pers->sync_request) {
4863 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4864 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
4865 /* We are adding a device or devices to an array
4866 * which has the bitmap stored on all devices.
4867 * So make sure all bitmap pages get written
4868 */
4869 bitmap_write_all(mddev->bitmap);
4870 }
4871 mddev->sync_thread = md_register_thread(md_do_sync,
4872 mddev,
4873 "%s_resync");
4874 if (!mddev->sync_thread) {
4875 printk(KERN_ERR "%s: could not start resync"
4876 " thread...\n",
4877 mdname(mddev));
4878 /* leave the spares where they are, it shouldn't hurt */
4879 mddev->recovery = 0;
4880 } else
4881 md_wakeup_thread(mddev->sync_thread);
4882 md_new_event(mddev);
4883 }
4884 unlock:
4885 mddev_unlock(mddev);
4886 }
4887}
4888
4889static int md_notify_reboot(struct notifier_block *this,
4890 unsigned long code, void *x)
4891{
4892 struct list_head *tmp;
4893 mddev_t *mddev;
4894
4895 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
4896
4897 printk(KERN_INFO "md: stopping all md devices.\n");
4898
4899 ITERATE_MDDEV(mddev,tmp)
4900 if (mddev_trylock(mddev)==0)
4901 do_md_stop (mddev, 1);
4902 /*
4903 * certain more exotic SCSI devices are known to be
4904 * volatile wrt too early system reboots. While the
4905 * right place to handle this issue is the given
4906 * driver, we do want to have a safe RAID driver ...
4907 */
4908 mdelay(1000*1);
4909 }
4910 return NOTIFY_DONE;
4911}
4912
4913static struct notifier_block md_notifier = {
4914 .notifier_call = md_notify_reboot,
4915 .next = NULL,
4916 .priority = INT_MAX, /* before any real devices */
4917};
4918
4919static void md_geninit(void)
4920{
4921 struct proc_dir_entry *p;
4922
4923 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
4924
4925 p = create_proc_entry("mdstat", S_IRUGO, NULL);
4926 if (p)
4927 p->proc_fops = &md_seq_fops;
4928}
4929
4930static int __init md_init(void)
4931{
4932 int minor;
4933
4934 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
4935 " MD_SB_DISKS=%d\n",
4936 MD_MAJOR_VERSION, MD_MINOR_VERSION,
4937 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
4938 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
4939 BITMAP_MINOR);
4940
4941 if (register_blkdev(MAJOR_NR, "md"))
4942 return -1;
4943 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
4944 unregister_blkdev(MAJOR_NR, "md");
4945 return -1;
4946 }
4947 devfs_mk_dir("md");
4948 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
4949 md_probe, NULL, NULL);
4950 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
4951 md_probe, NULL, NULL);
4952
4953 for (minor=0; minor < MAX_MD_DEVS; ++minor)
4954 devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
4955 S_IFBLK|S_IRUSR|S_IWUSR,
4956 "md/%d", minor);
4957
4958 for (minor=0; minor < MAX_MD_DEVS; ++minor)
4959 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
4960 S_IFBLK|S_IRUSR|S_IWUSR,
4961 "md/mdp%d", minor);
4962
4963
4964 register_reboot_notifier(&md_notifier);
4965 raid_table_header = register_sysctl_table(raid_root_table, 1);
4966
4967 md_geninit();
4968 return (0);
4969}
4970
4971
4972#ifndef MODULE
4973
4974/*
4975 * Searches all registered partitions for autorun RAID arrays
4976 * at boot time.
4977 */
4978static dev_t detected_devices[128];
4979static int dev_cnt;
4980
4981void md_autodetect_dev(dev_t dev)
4982{
4983 if (dev_cnt >= 0 && dev_cnt < 127)
4984 detected_devices[dev_cnt++] = dev;
4985}
4986
4987
4988static void autostart_arrays(int part)
4989{
4990 mdk_rdev_t *rdev;
4991 int i;
4992
4993 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
4994
4995 for (i = 0; i < dev_cnt; i++) {
4996 dev_t dev = detected_devices[i];
4997
4998 rdev = md_import_device(dev,0, 0);
4999 if (IS_ERR(rdev))
5000 continue;
5001
5002 if (test_bit(Faulty, &rdev->flags)) {
5003 MD_BUG();
5004 continue;
5005 }
5006 list_add(&rdev->same_set, &pending_raid_disks);
5007 }
5008 dev_cnt = 0;
5009
5010 autorun_devices(part);
5011}
5012
5013#endif
5014
5015static __exit void md_exit(void)
5016{
5017 mddev_t *mddev;
5018 struct list_head *tmp;
5019 int i;
5020 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
5021 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
5022 for (i=0; i < MAX_MD_DEVS; i++)
5023 devfs_remove("md/%d", i);
5024 for (i=0; i < MAX_MD_DEVS; i++)
5025 devfs_remove("md/d%d", i);
5026
5027 devfs_remove("md");
5028
5029 unregister_blkdev(MAJOR_NR,"md");
5030 unregister_blkdev(mdp_major, "mdp");
5031 unregister_reboot_notifier(&md_notifier);
5032 unregister_sysctl_table(raid_table_header);
5033 remove_proc_entry("mdstat", NULL);
5034 ITERATE_MDDEV(mddev,tmp) {
5035 struct gendisk *disk = mddev->gendisk;
5036 if (!disk)
5037 continue;
5038 export_array(mddev);
5039 del_gendisk(disk);
5040 put_disk(disk);
5041 mddev->gendisk = NULL;
5042 mddev_put(mddev);
5043 }
5044}
5045
5046module_init(md_init)
5047module_exit(md_exit)
5048
5049static int get_ro(char *buffer, struct kernel_param *kp)
5050{
5051 return sprintf(buffer, "%d", start_readonly);
5052}
5053static int set_ro(const char *val, struct kernel_param *kp)
5054{
5055 char *e;
5056 int num = simple_strtoul(val, &e, 10);
5057 if (*val && (*e == '\0' || *e == '\n')) {
5058 start_readonly = num;
5059 return 0;
5060 }
5061 return -EINVAL;
5062}
5063
5064module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
5065module_param(start_dirty_degraded, int, 0644);
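
/* Usage note (editorial; exact paths depend on how the module is
 * named and whether md is built in): both knobs above are ordinary
 * module parameters, e.g.
 *
 *	modprobe md-mod start_ro=1 start_dirty_degraded=1
 *	echo 0 > /sys/module/md_mod/parameters/start_ro
 */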
5066
5067
5068EXPORT_SYMBOL(register_md_personality);
5069EXPORT_SYMBOL(unregister_md_personality);
5070EXPORT_SYMBOL(md_error);
5071EXPORT_SYMBOL(md_done_sync);
5072EXPORT_SYMBOL(md_write_start);
5073EXPORT_SYMBOL(md_write_end);
5074EXPORT_SYMBOL(md_register_thread);
5075EXPORT_SYMBOL(md_unregister_thread);
5076EXPORT_SYMBOL(md_wakeup_thread);
5077EXPORT_SYMBOL(md_print_devices);
5078EXPORT_SYMBOL(md_check_recovery);
5079MODULE_LICENSE("GPL");
5080MODULE_ALIAS("md");
5081MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);