1 /*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5 completely rewritten, based on the MD driver code from Marc Zyngier
6
7 Changes:
8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
19
20 Neil Brown <neilb@cse.unsw.edu.au>.
21
22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option)
28 any later version.
29
30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/module.h>
36 #include <linux/config.h>
37 #include <linux/kthread.h>
38 #include <linux/linkage.h>
39 #include <linux/raid/md.h>
40 #include <linux/raid/bitmap.h>
41 #include <linux/sysctl.h>
42 #include <linux/devfs_fs_kernel.h>
43 #include <linux/buffer_head.h> /* for invalidate_bdev */
44 #include <linux/suspend.h>
45 #include <linux/poll.h>
46 #include <linux/mutex.h>
47
48 #include <linux/init.h>
49
50 #include <linux/file.h>
51
52 #ifdef CONFIG_KMOD
53 #include <linux/kmod.h>
54 #endif
55
56 #include <asm/unaligned.h>
57
/* MAJOR_NR is an alias for MD_MAJOR; MD_DRIVER is a bare marker define. */
#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/*
 * 63 partitions with the alternate major number (mdp).
 * mddev_find() shifts the minor of a non-MD_MAJOR device right by this
 * amount to obtain the array number.
 */
#define MdpMinorShift 6

/*
 * dprintk() forwards to printk() only when DEBUG is non-zero; with
 * DEBUG 0 the && short-circuits and nothing is printed.
 */
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))
66
67
#ifndef MODULE
/* Forward declaration, only present when md is not built as a module. */
static void autostart_arrays (int part);
#endif

/* List of registered personalities; walked by find_pers(). */
static LIST_HEAD(pers_list);
/* NOTE(review): presumably guards pers_list -- confirm at the call sites. */
static DEFINE_SPINLOCK(pers_lock);
74
75 /*
76 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
77 * is 1000 KB/sec, so the extra system load does not show up that much.
78 * Increase it if you want to have more _guaranteed_ speed. Note that
79 * the RAID driver will use the maximum available bandwidth if the IO
80 * subsystem is idle. There is also an 'absolute maximum' reconstruction
81 * speed limit - in case reconstruction slows down your system despite
82 * idle IO detection.
83 *
84 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
85 * or /sys/block/mdX/md/sync_speed_{min,max}
86 */
87
/*
 * System-wide defaults returned by speed_min()/speed_max() when the
 * array itself has no non-zero sync_speed_min/sync_speed_max.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
90 static inline int speed_min(mddev_t *mddev)
91 {
92 return mddev->sync_speed_min ?
93 mddev->sync_speed_min : sysctl_speed_limit_min;
94 }
95
96 static inline int speed_max(mddev_t *mddev)
97 {
98 return mddev->sync_speed_max ?
99 mddev->sync_speed_max : sysctl_speed_limit_max;
100 }
101
/* NOTE(review): presumably holds the sysctl registration handle; the assignment is not visible here. */
static struct ctl_table_header *raid_table_header;

/*
 * Leaf sysctl entries: speed_limit_min and speed_limit_max.
 * Each exposes one of the int defaults above, mode 0644, handled by
 * proc_dointvec.  The table ends with a zero ctl_name entry.
 */
static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};
123
/* "raid" directory entry (mode 0555) whose children are raid_table. */
static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};
134
/*
 * "dev" directory entry (mode 0555) whose child is the "raid" directory,
 * giving the dev/raid/speed_limit_{min,max} hierarchy.
 */
static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};
145
/* Declared here without an initializer; NOTE(review): presumably filled in further down the file. */
static struct block_device_operations md_fops;

/* Zero by default.  NOTE(review): consumers are not visible in this part of the file. */
static int start_readonly;
149
150 /*
151 * We have a system wide 'event count' that is incremented
152 * on any 'interesting' event, and readers of /proc/mdstat
153 * can use 'poll' or 'select' to find out when the event
154 * count increases.
155 *
156 * Events are:
157 * start array, stop array, error, add device, remove device,
158 * start build, activate spare
159 */
/* Wait queue woken, and counter bumped, on every md event. */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;

/*
 * Record an event on @mddev: increment the global event count, wake
 * everything sleeping on md_event_waiters, and issue a sysfs
 * notification for the array's "sync_action" attribute.
 */
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
	sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);
169
/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 *
 * Only bumps the event count and wakes the waiters; @mddev is
 * accepted but not used by this function.
 */
void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
178
179 /*
 * Allows iteration over all existing md arrays.
181 * all_mddevs_lock protects this list.
182 */
/* Every mddev_t is linked onto this list through its all_mddevs member. */
static LIST_HEAD(all_mddevs);
/* Held by mddev_find(), mddev_put() and ITERATE_MDDEV while touching the list. */
static DEFINE_SPINLOCK(all_mddevs_lock);
185
186
187 /*
188 * iterates through all used mddevs in the system.
189 * We take care to grab the all_mddevs_lock whenever navigating
190 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop still owns
 * a reference to the current mddev and must mddev_put it.
193 */
/*
 * ITERATE_MDDEV(mddev, tmp) - for-loop header over all_mddevs.
 *
 *  init:      take all_mddevs_lock, point tmp at the first entry and
 *             clear mddev.
 *  condition: (lock held) take a reference on the entry at tmp if it is
 *             not the list head, drop the lock, put the reference on the
 *             previous mddev, set mddev to the entry at tmp, and continue
 *             only while tmp is not the list head.
 *  step:      retake the lock and advance tmp.
 *
 * The loop body therefore runs without the lock but with a reference
 * on the current mddev.
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock); 				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)
208
209
/*
 * make_request function that mddev_find() installs on a freshly
 * allocated queue: every bio is completed with an I/O error for its
 * full size.  Always returns 0.
 */
static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	bio_io_error(bio, bio->bi_size);
	return 0;
}
215
/* Take a reference on @mddev (increments ->active) and return it. */
static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}
221
222 static void mddev_put(mddev_t *mddev)
223 {
224 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
225 return;
226 if (!mddev->raid_disks && list_empty(&mddev->disks)) {
227 list_del(&mddev->all_mddevs);
228 spin_unlock(&all_mddevs_lock);
229 blk_cleanup_queue(mddev->queue);
230 kobject_unregister(&mddev->kobj);
231 } else
232 spin_unlock(&all_mddevs_lock);
233 }
234
235 static mddev_t * mddev_find(dev_t unit)
236 {
237 mddev_t *mddev, *new = NULL;
238
239 retry:
240 spin_lock(&all_mddevs_lock);
241 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
242 if (mddev->unit == unit) {
243 mddev_get(mddev);
244 spin_unlock(&all_mddevs_lock);
245 kfree(new);
246 return mddev;
247 }
248
249 if (new) {
250 list_add(&new->all_mddevs, &all_mddevs);
251 spin_unlock(&all_mddevs_lock);
252 return new;
253 }
254 spin_unlock(&all_mddevs_lock);
255
256 new = kzalloc(sizeof(*new), GFP_KERNEL);
257 if (!new)
258 return NULL;
259
260 new->unit = unit;
261 if (MAJOR(unit) == MD_MAJOR)
262 new->md_minor = MINOR(unit);
263 else
264 new->md_minor = MINOR(unit) >> MdpMinorShift;
265
266 mutex_init(&new->reconfig_mutex);
267 INIT_LIST_HEAD(&new->disks);
268 INIT_LIST_HEAD(&new->all_mddevs);
269 init_timer(&new->safemode_timer);
270 atomic_set(&new->active, 1);
271 spin_lock_init(&new->write_lock);
272 init_waitqueue_head(&new->sb_wait);
273
274 new->queue = blk_alloc_queue(GFP_KERNEL);
275 if (!new->queue) {
276 kfree(new);
277 return NULL;
278 }
279 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
280
281 blk_queue_make_request(new->queue, md_fail_request);
282
283 goto retry;
284 }
285
/*
 * Acquire the array's reconfig_mutex, interruptibly.
 * Returns whatever mutex_lock_interruptible() returns.
 */
static inline int mddev_lock(mddev_t * mddev)
{
	return mutex_lock_interruptible(&mddev->reconfig_mutex);
}
290
/*
 * Try to acquire the array's reconfig_mutex without sleeping.
 * Returns the result of mutex_trylock().
 */
static inline int mddev_trylock(mddev_t * mddev)
{
	return mutex_trylock(&mddev->reconfig_mutex);
}
295
/*
 * Release the array's reconfig_mutex, then wake the array's thread
 * (mddev->thread) via md_wakeup_thread().
 */
static inline void mddev_unlock(mddev_t * mddev)
{
	mutex_unlock(&mddev->reconfig_mutex);

	md_wakeup_thread(mddev->thread);
}
302
303 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
304 {
305 mdk_rdev_t * rdev;
306 struct list_head *tmp;
307
308 ITERATE_RDEV(mddev,rdev,tmp) {
309 if (rdev->desc_nr == nr)
310 return rdev;
311 }
312 return NULL;
313 }
314
315 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
316 {
317 struct list_head *tmp;
318 mdk_rdev_t *rdev;
319
320 ITERATE_RDEV(mddev,rdev,tmp) {
321 if (rdev->bdev->bd_dev == dev)
322 return rdev;
323 }
324 return NULL;
325 }
326
327 static struct mdk_personality *find_pers(int level, char *clevel)
328 {
329 struct mdk_personality *pers;
330 list_for_each_entry(pers, &pers_list, list) {
331 if (level != LEVEL_NONE && pers->level == level)
332 return pers;
333 if (strcmp(pers->name, clevel)==0)
334 return pers;
335 }
336 return NULL;
337 }
338
339 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
340 {
341 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
342 return MD_NEW_SIZE_BLOCKS(size);
343 }
344
345 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
346 {
347 sector_t size;
348
349 size = rdev->sb_offset;
350
351 if (chunk_size)
352 size &= ~((sector_t)chunk_size/1024 - 1);
353 return size;
354 }
355
356 static int alloc_disk_sb(mdk_rdev_t * rdev)
357 {
358 if (rdev->sb_page)
359 MD_BUG();
360
361 rdev->sb_page = alloc_page(GFP_KERNEL);
362 if (!rdev->sb_page) {
363 printk(KERN_ALERT "md: out of memory.\n");
364 return -EINVAL;
365 }
366
367 return 0;
368 }
369
370 static void free_disk_sb(mdk_rdev_t * rdev)
371 {
372 if (rdev->sb_page) {
373 put_page(rdev->sb_page);
374 rdev->sb_loaded = 0;
375 rdev->sb_page = NULL;
376 rdev->sb_offset = 0;
377 rdev->size = 0;
378 }
379 }
380
381
/*
 * bi_end_io handler for superblock writes issued by md_super_write().
 * bio->bi_private carries the rdev the write was sent to.
 *
 * Returns 1 without doing anything while bio->bi_size is still non-zero.
 * Otherwise: reports a failed write through md_error(), drops one
 * pending_writes count (waking sb_wait when it reaches zero), releases
 * the bio and returns 0.
 */
static int super_written(struct bio *bio, unsigned int bytes_done, int error)
{
	mdk_rdev_t *rdev = bio->bi_private;
	mddev_t *mddev = rdev->mddev;
	if (bio->bi_size)
		return 1;

	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
		md_error(mddev, rdev);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
	return 0;
}
397
/*
 * bi_end_io handler for the barrier clone submitted by md_super_write().
 * bio->bi_private is the original (non-barrier) bio, whose own
 * bi_private is the rdev.
 *
 * If the barrier write failed with -EOPNOTSUPP, the device is flagged
 * BarriersNotsupp, the original bio is queued on mddev->biolist for
 * md_super_wait() to resubmit, and the clone is dropped.  pending_writes
 * is left untouched in that case because the write is still outstanding.
 *
 * In every other case the spare original bio is dropped and completion
 * is finished by super_written() on the clone.
 */
static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
{
	struct bio *bio2 = bio->bi_private;
	mdk_rdev_t *rdev = bio2->bi_private;
	mddev_t *mddev = rdev->mddev;
	if (bio->bi_size)
		return 1;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
	    error == -EOPNOTSUPP) {
		unsigned long flags;
		/* barriers don't appear to be supported :-( */
		set_bit(BarriersNotsupp, &rdev->flags);
		mddev->barriers_work = 0;
		/* push the original bio onto the retry list */
		spin_lock_irqsave(&mddev->write_lock, flags);
		bio2->bi_next = mddev->biolist;
		mddev->biolist = bio2;
		spin_unlock_irqrestore(&mddev->write_lock, flags);
		wake_up(&mddev->sb_wait);
		bio_put(bio);
		return 0;
	}
	bio_put(bio2);
	/* super_written() expects bi_private to be the rdev */
	bio->bi_private = rdev;
	return super_written(bio, bytes_done, error);
}
424
/*
 * Start an asynchronous superblock write; pair with md_super_wait().
 *
 * @mddev:  array whose pending_writes counter tracks the write
 * @rdev:   member device to write to
 * @sector: target sector on rdev->bdev
 * @size:   number of bytes of @page to write
 * @page:   page holding the data
 */
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 *
	 * As we might need to resubmit the request if BIO_RW_BARRIER
	 * causes EOPNOTSUPP, we allocate a spare bio...
	 */
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

	bio->bi_bdev = rdev->bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;
	bio->bi_rw = rw;

	atomic_inc(&mddev->pending_writes);
	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
		/*
		 * Submit a clone with the barrier flag; the plain bio is
		 * kept as the fallback and reached through bi_private.
		 */
		struct bio *rbio;
		rw |= (1<<BIO_RW_BARRIER);
		rbio = bio_clone(bio, GFP_NOIO);
		rbio->bi_private = bio;
		rbio->bi_end_io = super_written_barrier;
		submit_bio(rw, rbio);
	} else
		submit_bio(rw, bio);
}
458
/*
 * Sleep (uninterruptibly) on mddev->sb_wait until pending_writes drops
 * to zero.  Each time round the loop, any bios that
 * super_written_barrier() parked on mddev->biolist are popped under
 * write_lock and resubmitted with their stored bi_rw flags.
 */
void md_super_wait(mddev_t *mddev)
{
	/* wait for all superblock writes that were scheduled to complete.
	 * if any had to be retried (due to BARRIER problems), retry them
	 */
	DEFINE_WAIT(wq);
	for(;;) {
		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&mddev->pending_writes)==0)
			break;
		while (mddev->biolist) {
			struct bio *bio;
			spin_lock_irq(&mddev->write_lock);
			bio = mddev->biolist;
			mddev->biolist = bio->bi_next ;
			bio->bi_next = NULL;
			spin_unlock_irq(&mddev->write_lock);
			submit_bio(bio->bi_rw, bio);
		}
		schedule();
	}
	finish_wait(&mddev->sb_wait, &wq);
}
482
/*
 * bi_end_io handler used by sync_page_io(): once bio->bi_size has
 * reached zero, signal the struct completion stored in bi_private.
 * Returns 1 while bi_size is still non-zero, 0 otherwise.
 */
static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)
		return 1;

	complete((struct completion*)bio->bi_private);
	return 0;
}
491
/*
 * Synchronously transfer @size bytes between @page and @sector of @bdev.
 *
 * @rw is the request direction/flags; BIO_RW_SYNC is always added.
 * The caller blocks until the bio completes.
 *
 * Returns non-zero if the bio finished with BIO_UPTODATE set, 0 if not.
 */
int sync_page_io(struct block_device *bdev, sector_t sector, int size,
		   struct page *page, int rw)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	struct completion event;
	int ret;

	rw |= (1 << BIO_RW_SYNC);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;
	submit_bio(rw, bio);
	wait_for_completion(&event);

	/* sample the status before dropping our reference to the bio */
	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);
515
516 static int read_disk_sb(mdk_rdev_t * rdev, int size)
517 {
518 char b[BDEVNAME_SIZE];
519 if (!rdev->sb_page) {
520 MD_BUG();
521 return -EINVAL;
522 }
523 if (rdev->sb_loaded)
524 return 0;
525
526
527 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
528 goto fail;
529 rdev->sb_loaded = 1;
530 return 0;
531
532 fail:
533 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
534 bdevname(rdev->bdev,b));
535 return -EINVAL;
536 }
537
538 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
539 {
540 if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
541 (sb1->set_uuid1 == sb2->set_uuid1) &&
542 (sb1->set_uuid2 == sb2->set_uuid2) &&
543 (sb1->set_uuid3 == sb2->set_uuid3))
544
545 return 1;
546
547 return 0;
548 }
549
550
551 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
552 {
553 int ret;
554 mdp_super_t *tmp1, *tmp2;
555
556 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
557 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
558
559 if (!tmp1 || !tmp2) {
560 ret = 0;
561 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
562 goto abort;
563 }
564
565 *tmp1 = *sb1;
566 *tmp2 = *sb2;
567
568 /*
569 * nr_disks is not constant
570 */
571 tmp1->nr_disks = 0;
572 tmp2->nr_disks = 0;
573
574 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
575 ret = 0;
576 else
577 ret = 1;
578
579 abort:
580 kfree(tmp1);
581 kfree(tmp2);
582 return ret;
583 }
584
585 static unsigned int calc_sb_csum(mdp_super_t * sb)
586 {
587 unsigned int disk_csum, csum;
588
589 disk_csum = sb->sb_csum;
590 sb->sb_csum = 0;
591 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
592 sb->sb_csum = disk_csum;
593 return csum;
594 }
595
596
597 /*
598 * Handle superblock details.
599 * We want to be able to handle multiple superblock formats
600 * so we have a common interface to them all, and an array of
601 * different handlers.
602 * We rely on user-space to write the initial superblock, and support
603 * reading and updating of superblocks.
604 * Interface methods are:
605 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
606 * loads and validates a superblock on dev.
607 * if refdev != NULL, compare superblocks on both devices
608 * Return:
609 * 0 - dev has a superblock that is compatible with refdev
610 * 1 - dev has a superblock that is compatible and newer than refdev
611 * so dev should be used as the refdev in future
612 * -EINVAL superblock incompatible or invalid
613 * -othererror e.g. -EIO
614 *
615 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
616 * Verify that dev is acceptable into mddev.
617 * The first time, mddev->raid_disks will be 0, and data from
618 * dev should be merged in. Subsequent calls check that dev
619 * is new enough. Return 0 or -EINVAL
620 *
621 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
622 * Update the superblock for rdev with data in mddev
623 * This does not write to disc.
624 *
625 */
626
/* Per-format superblock handler; the methods are described in the comment above. */
struct super_type  {
	char 		*name;		/* name of the superblock format */
	struct module	*owner;		/* module providing the handler */
	/* load and validate the superblock on rdev, optionally against refdev */
	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
	/* check that rdev is acceptable into mddev */
	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
	/* update rdev's in-memory superblock from mddev (no disk write) */
	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};
634
635 /*
636 * load_super for 0.90.0
637 */
638 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
639 {
640 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
641 mdp_super_t *sb;
642 int ret;
643 sector_t sb_offset;
644
645 /*
646 * Calculate the position of the superblock,
647 * it's at the end of the disk.
648 *
649 * It also happens to be a multiple of 4Kb.
650 */
651 sb_offset = calc_dev_sboffset(rdev->bdev);
652 rdev->sb_offset = sb_offset;
653
654 ret = read_disk_sb(rdev, MD_SB_BYTES);
655 if (ret) return ret;
656
657 ret = -EINVAL;
658
659 bdevname(rdev->bdev, b);
660 sb = (mdp_super_t*)page_address(rdev->sb_page);
661
662 if (sb->md_magic != MD_SB_MAGIC) {
663 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
664 b);
665 goto abort;
666 }
667
668 if (sb->major_version != 0 ||
669 sb->minor_version < 90 ||
670 sb->minor_version > 91) {
671 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
672 sb->major_version, sb->minor_version,
673 b);
674 goto abort;
675 }
676
677 if (sb->raid_disks <= 0)
678 goto abort;
679
680 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
681 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
682 b);
683 goto abort;
684 }
685
686 rdev->preferred_minor = sb->md_minor;
687 rdev->data_offset = 0;
688 rdev->sb_size = MD_SB_BYTES;
689
690 if (sb->level == LEVEL_MULTIPATH)
691 rdev->desc_nr = -1;
692 else
693 rdev->desc_nr = sb->this_disk.number;
694
695 if (refdev == 0)
696 ret = 1;
697 else {
698 __u64 ev1, ev2;
699 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
700 if (!uuid_equal(refsb, sb)) {
701 printk(KERN_WARNING "md: %s has different UUID to %s\n",
702 b, bdevname(refdev->bdev,b2));
703 goto abort;
704 }
705 if (!sb_equal(refsb, sb)) {
706 printk(KERN_WARNING "md: %s has same UUID"
707 " but different superblock to %s\n",
708 b, bdevname(refdev->bdev, b2));
709 goto abort;
710 }
711 ev1 = md_event(sb);
712 ev2 = md_event(refsb);
713 if (ev1 > ev2)
714 ret = 1;
715 else
716 ret = 0;
717 }
718 rdev->size = calc_dev_size(rdev, sb->chunk_size);
719
720 if (rdev->size < sb->size && sb->level > 1)
721 /* "this cannot possibly happen" ... */
722 ret = -EINVAL;
723
724 abort:
725 return ret;
726 }
727
/*
 * validate_super for 0.90.0
 *
 * Decides whether @rdev (whose superblock is already loaded) may join
 * @mddev, and sets rdev->raid_disk and rdev->flags accordingly.
 *
 * Four cases, selected in order:
 *  - mddev->raid_disks == 0: first device; copy the array geometry and
 *    state from the superblock into @mddev.
 *  - mddev->pers == NULL: array is being assembled; reject devices whose
 *    event count is more than one behind mddev->events.
 *  - mddev->bitmap set: a device older than bitmap->events_cleared is
 *    accepted but left with raid_disk == -1.
 *  - otherwise: plain hot-add, accepted with raid_disk == -1.
 *
 * Returns 0 on acceptance, -EINVAL on rejection.
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);

	rdev->raid_disk = -1;
	rdev->flags = 0;
	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->persistent = ! sb->not_persistent;
		mddev->chunk_size = sb->chunk_size;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->size = sb->size;
		mddev->events = md_event(sb);
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

		/* reshape fields are only taken from minor version 91 and up */
		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk = sb->new_chunk;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk = mddev->chunk_size;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			/* the checkpoint is only used if it was taken at the current event count */
			if (sb->events_hi == sb->cp_events_hi &&
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_file == NULL) {
			if (mddev->level != 1 && mddev->level != 4
			    && mddev->level != 5 && mddev->level != 6
			    && mddev->level != 10) {
				/* FIXME use a better test */
				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = mddev->default_bitmap_offset;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling */
		__u64 ev1 = md_event(sb);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		__u64 ev1 = md_event(sb);
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else /* just a hot-add of a new device, leave raid_disk at -1 */
		return 0;

	if (mddev->level != LEVEL_MULTIPATH) {
		/* look up this device's own descriptor; desc_nr was set by load_super */
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) &&
			 desc->raid_disk < mddev->raid_disks) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}
830
/*
 * sync_super for 0.90.0
 *
 * Rebuilds the in-memory 0.90 superblock held in rdev->sb_page from the
 * current state of @mddev and its member devices, then recomputes the
 * checksum.  Nothing is written to disk here.
 *
 * Side effect: desc_nr is reassigned for every member of @mddev, and
 * mddev->minor_version is updated to 90 or 91.
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_super_t *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int next_spare = mddev->raid_disks;


	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size  = mddev->size;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = !mddev->persistent;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	/* minor version 91 is used only while a reshape position is recorded */
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		/* record the resync checkpoint together with the event count it belongs to */
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_size;

	if (mddev->bitmap && mddev->bitmap_file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	ITERATE_RDEV(mddev,rdev2,tmp) {
		mdp_disk_t *d;
		int desc_nr;
		/* in-sync, non-faulty members keep their raid slot; all others get spare slots */
		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
		    && !test_bit(Faulty, &rdev2->flags))
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
		    && !test_bit(Faulty, &rdev2->flags))
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags)) {
			d->state = (1<<MD_DISK_ACTIVE);
			d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	/* this_disk is a copy of the descriptor of the device being synced */
	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}
963
964 /*
965 * version 1 superblock
966 */
967
968 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
969 {
970 unsigned int disk_csum, csum;
971 unsigned long long newcsum;
972 int size = 256 + le32_to_cpu(sb->max_dev)*2;
973 unsigned int *isuper = (unsigned int*)sb;
974 int i;
975
976 disk_csum = sb->sb_csum;
977 sb->sb_csum = 0;
978 newcsum = 0;
979 for (i=0; size>=4; size -= 4 )
980 newcsum += le32_to_cpu(*isuper++);
981
982 if (size == 2)
983 newcsum += le16_to_cpu(*(unsigned short*) isuper);
984
985 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
986 sb->sb_csum = disk_csum;
987 return cpu_to_le32(csum);
988 }
989
990 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
991 {
992 struct mdp_superblock_1 *sb;
993 int ret;
994 sector_t sb_offset;
995 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
996 int bmask;
997
998 /*
999 * Calculate the position of the superblock.
1000 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
1002 * 0: At least 8K, but less than 12K, from end of device
1003 * 1: At start of device
1004 * 2: 4K from start of device.
1005 */
1006 switch(minor_version) {
1007 case 0:
1008 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1009 sb_offset -= 8*2;
1010 sb_offset &= ~(sector_t)(4*2-1);
1011 /* convert from sectors to K */
1012 sb_offset /= 2;
1013 break;
1014 case 1:
1015 sb_offset = 0;
1016 break;
1017 case 2:
1018 sb_offset = 4;
1019 break;
1020 default:
1021 return -EINVAL;
1022 }
1023 rdev->sb_offset = sb_offset;
1024
1025 /* superblock is rarely larger than 1K, but it can be larger,
1026 * and it is safe to read 4k, so we do that
1027 */
1028 ret = read_disk_sb(rdev, 4096);
1029 if (ret) return ret;
1030
1031
1032 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1033
1034 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1035 sb->major_version != cpu_to_le32(1) ||
1036 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1037 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1038 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1039 return -EINVAL;
1040
1041 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1042 printk("md: invalid superblock checksum on %s\n",
1043 bdevname(rdev->bdev,b));
1044 return -EINVAL;
1045 }
1046 if (le64_to_cpu(sb->data_size) < 10) {
1047 printk("md: data_size too small on %s\n",
1048 bdevname(rdev->bdev,b));
1049 return -EINVAL;
1050 }
1051 rdev->preferred_minor = 0xffff;
1052 rdev->data_offset = le64_to_cpu(sb->data_offset);
1053 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1054
1055 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1056 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1057 if (rdev->sb_size & bmask)
1058 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1059
1060 if (refdev == 0)
1061 ret = 1;
1062 else {
1063 __u64 ev1, ev2;
1064 struct mdp_superblock_1 *refsb =
1065 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1066
1067 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1068 sb->level != refsb->level ||
1069 sb->layout != refsb->layout ||
1070 sb->chunksize != refsb->chunksize) {
1071 printk(KERN_WARNING "md: %s has strangely different"
1072 " superblock to %s\n",
1073 bdevname(rdev->bdev,b),