~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux-2.6.17/drivers/md/md.c

Version: ~ [ 2.6.16 ] ~ [ 2.6.17 ] ~
Architecture: ~ [ ia64 ] ~ [ i386 ] ~ [ arm ] ~ [ ppc ] ~ [ sparc64 ] ~

  1 /*
  2    md.c : Multiple Devices driver for Linux
  3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
  4 
  5      completely rewritten, based on the MD driver code from Marc Zyngier
  6 
  7    Changes:
  8 
  9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
 10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
 11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
 12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
 13    - kmod support by: Cyrus Durgin
 14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
 15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
 16 
 17    - lots of fixes and improvements to the RAID1/RAID5 and generic
 18      RAID code (such as request based resynchronization):
 19 
 20      Neil Brown <neilb@cse.unsw.edu.au>.
 21 
 22    - persistent bitmap code
 23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
 24 
 25    This program is free software; you can redistribute it and/or modify
 26    it under the terms of the GNU General Public License as published by
 27    the Free Software Foundation; either version 2, or (at your option)
 28    any later version.
 29 
 30    You should have received a copy of the GNU General Public License
 31    (for example /usr/src/linux/COPYING); if not, write to the Free
 32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 33 */
 34 
 35 #include <linux/module.h>
 36 #include <linux/config.h>
 37 #include <linux/kthread.h>
 38 #include <linux/linkage.h>
 39 #include <linux/raid/md.h>
 40 #include <linux/raid/bitmap.h>
 41 #include <linux/sysctl.h>
 42 #include <linux/devfs_fs_kernel.h>
 43 #include <linux/buffer_head.h> /* for invalidate_bdev */
 44 #include <linux/suspend.h>
 45 #include <linux/poll.h>
 46 #include <linux/mutex.h>
 47 
 48 #include <linux/init.h>
 49 
 50 #include <linux/file.h>
 51 
 52 #ifdef CONFIG_KMOD
 53 #include <linux/kmod.h>
 54 #endif
 55 
 56 #include <asm/unaligned.h>
 57 
 58 #define MAJOR_NR MD_MAJOR	/* NOTE(review): not referenced in the visible code; presumably consumed by an included header - confirm */
 59 #define MD_DRIVER	/* NOTE(review): marker macro, not tested in the visible code - confirm who reads it */
 60 
 61 /* 63 partitions with the alternate major number (mdp); mddev_find() shifts the minor right by this amount to get md_minor for units whose major is not MD_MAJOR */
 62 #define MdpMinorShift 6
 63 
 64 #define DEBUG 0	/* set non-zero to make dprintk() actually print */
 65 #define dprintk(x...) ((void)(DEBUG && printk(x)))	/* printk() that is short-circuited away while DEBUG is 0 */
 66 
 67 
 68 #ifndef MODULE
 69 static void autostart_arrays (int part);	/* forward declaration, only present when not built as a module */
 70 #endif
 71 
 72 static LIST_HEAD(pers_list);	/* list of struct mdk_personality entries (linked via ->list), searched by find_pers() */
 73 static DEFINE_SPINLOCK(pers_lock);	/* NOTE(review): presumably protects pers_list - confirm against its users */
 74 
 75 /*
 76  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 77  * is 1000 KB/sec, so the extra system load does not show up that much.
 78  * Increase it if you want to have more _guaranteed_ speed. Note that
 79  * the RAID driver will use the maximum available bandwidth if the IO
 80  * subsystem is idle. There is also an 'absolute maximum' reconstruction
 81  * speed limit - in case reconstruction slows down your system despite
 82  * idle IO detection.
 83  *
 84  * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 85  * or via /sys/block/mdX/md/sync_speed_{min,max}.
 86  */
 87 
 88 static int sysctl_speed_limit_min = 1000;
 89 static int sysctl_speed_limit_max = 200000;
 90 static inline int speed_min(mddev_t *mddev)
 91 {
 92         return mddev->sync_speed_min ?
 93                 mddev->sync_speed_min : sysctl_speed_limit_min;
 94 }
 95 
 96 static inline int speed_max(mddev_t *mddev)
 97 {
 98         return mddev->sync_speed_max ?
 99                 mddev->sync_speed_max : sysctl_speed_limit_max;
100 }
101 
102 static struct ctl_table_header *raid_table_header;
103 
104 static ctl_table raid_table[] = {
105         {
106                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
107                 .procname       = "speed_limit_min",
108                 .data           = &sysctl_speed_limit_min,
109                 .maxlen         = sizeof(int),
110                 .mode           = 0644,
111                 .proc_handler   = &proc_dointvec,
112         },
113         {
114                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
115                 .procname       = "speed_limit_max",
116                 .data           = &sysctl_speed_limit_max,
117                 .maxlen         = sizeof(int),
118                 .mode           = 0644,
119                 .proc_handler   = &proc_dointvec,
120         },
121         { .ctl_name = 0 }
122 };
123 
124 static ctl_table raid_dir_table[] = {
125         {
126                 .ctl_name       = DEV_RAID,
127                 .procname       = "raid",
128                 .maxlen         = 0,
129                 .mode           = 0555,
130                 .child          = raid_table,
131         },
132         { .ctl_name = 0 }
133 };
134 
135 static ctl_table raid_root_table[] = {
136         {
137                 .ctl_name       = CTL_DEV,
138                 .procname       = "dev",
139                 .maxlen         = 0,
140                 .mode           = 0555,
141                 .child          = raid_dir_table,
142         },
143         { .ctl_name = 0 }
144 };
145 
146 static struct block_device_operations md_fops;	/* tentative definition; NOTE(review): the initialised definition is expected further down the file - not visible here */
147 
148 static int start_readonly;	/* NOTE(review): not used in the visible code; presumably makes arrays start read-only - confirm */
149 
150 /*
151  * We have a system wide 'event count' that is incremented
152  * on any 'interesting' event, and readers of /proc/mdstat
153  * can use 'poll' or 'select' to find out when the event
154  * count increases.
155  *
156  * Events are:
157  *  start array, stop array, error, add device, remove device,
158  *  start build, activate spare
159  */
160 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
161 static atomic_t md_event_count;
162 void md_new_event(mddev_t *mddev)
163 {
164         atomic_inc(&md_event_count);
165         wake_up(&md_event_waiters);
166         sysfs_notify(&mddev->kobj, NULL, "sync_action");
167 }
168 EXPORT_SYMBOL_GPL(md_new_event);
169 
170 /* Alternate version that can be called from interrupts
171  * when calling sysfs_notify isn't needed.
172  */
173 void md_new_event_inintr(mddev_t *mddev)
174 {
175         atomic_inc(&md_event_count);
176         wake_up(&md_event_waiters);
177 }
178 
179 /*
180  * List of all existing md arrays, so that they can be iterated over.
181  * all_mddevs_lock protects this list.
182  */
183 static LIST_HEAD(all_mddevs);
184 static DEFINE_SPINLOCK(all_mddevs_lock);
185 
186 
187 /*
188  * Iterates through all used mddevs in the system.
189  * all_mddevs_lock is taken whenever the list is navigated, and a
190  * reference is always held on the current mddev while unlocked.
191  * Any code which breaks out of this loop still owns a reference
192  * to the current mddev and must mddev_put it.
193  */
194 #define ITERATE_MDDEV(mddev,tmp)                                        \
195                                                                         \
196         for (({ spin_lock(&all_mddevs_lock);                            \
197                 tmp = all_mddevs.next;                                  \
198                 mddev = NULL;});                                        \
199              ({ if (tmp != &all_mddevs)                                 \
200                         mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
201                 spin_unlock(&all_mddevs_lock);                          \
202                 if (mddev) mddev_put(mddev);                            \
203                 mddev = list_entry(tmp, mddev_t, all_mddevs);           \
204                 tmp != &all_mddevs;});                                  \
205              ({ spin_lock(&all_mddevs_lock);                            \
206                 tmp = tmp->next;})                                      \
207                 )
208 
209 
210 static int md_fail_request (request_queue_t *q, struct bio *bio)
211 {
212         bio_io_error(bio, bio->bi_size);
213         return 0;
214 }
215 
216 static inline mddev_t *mddev_get(mddev_t *mddev)
217 {
218         atomic_inc(&mddev->active);
219         return mddev;
220 }
221 
222 static void mddev_put(mddev_t *mddev)
223 {
224         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
225                 return;
226         if (!mddev->raid_disks && list_empty(&mddev->disks)) {
227                 list_del(&mddev->all_mddevs);
228                 spin_unlock(&all_mddevs_lock);
229                 blk_cleanup_queue(mddev->queue);
230                 kobject_unregister(&mddev->kobj);
231         } else
232                 spin_unlock(&all_mddevs_lock);
233 }
234 
235 static mddev_t * mddev_find(dev_t unit)
236 {
237         mddev_t *mddev, *new = NULL;
238 
239  retry:
240         spin_lock(&all_mddevs_lock);
241         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
242                 if (mddev->unit == unit) {
243                         mddev_get(mddev);
244                         spin_unlock(&all_mddevs_lock);
245                         kfree(new);
246                         return mddev;
247                 }
248 
249         if (new) {
250                 list_add(&new->all_mddevs, &all_mddevs);
251                 spin_unlock(&all_mddevs_lock);
252                 return new;
253         }
254         spin_unlock(&all_mddevs_lock);
255 
256         new = kzalloc(sizeof(*new), GFP_KERNEL);
257         if (!new)
258                 return NULL;
259 
260         new->unit = unit;
261         if (MAJOR(unit) == MD_MAJOR)
262                 new->md_minor = MINOR(unit);
263         else
264                 new->md_minor = MINOR(unit) >> MdpMinorShift;
265 
266         mutex_init(&new->reconfig_mutex);
267         INIT_LIST_HEAD(&new->disks);
268         INIT_LIST_HEAD(&new->all_mddevs);
269         init_timer(&new->safemode_timer);
270         atomic_set(&new->active, 1);
271         spin_lock_init(&new->write_lock);
272         init_waitqueue_head(&new->sb_wait);
273 
274         new->queue = blk_alloc_queue(GFP_KERNEL);
275         if (!new->queue) {
276                 kfree(new);
277                 return NULL;
278         }
279         set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
280 
281         blk_queue_make_request(new->queue, md_fail_request);
282 
283         goto retry;
284 }
285 
286 static inline int mddev_lock(mddev_t * mddev)
287 {
288         return mutex_lock_interruptible(&mddev->reconfig_mutex);
289 }
290 
291 static inline int mddev_trylock(mddev_t * mddev)
292 {
293         return mutex_trylock(&mddev->reconfig_mutex);
294 }
295 
296 static inline void mddev_unlock(mddev_t * mddev)
297 {
298         mutex_unlock(&mddev->reconfig_mutex);
299 
300         md_wakeup_thread(mddev->thread);
301 }
302 
303 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
304 {
305         mdk_rdev_t * rdev;
306         struct list_head *tmp;
307 
308         ITERATE_RDEV(mddev,rdev,tmp) {
309                 if (rdev->desc_nr == nr)
310                         return rdev;
311         }
312         return NULL;
313 }
314 
315 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
316 {
317         struct list_head *tmp;
318         mdk_rdev_t *rdev;
319 
320         ITERATE_RDEV(mddev,rdev,tmp) {
321                 if (rdev->bdev->bd_dev == dev)
322                         return rdev;
323         }
324         return NULL;
325 }
326 
327 static struct mdk_personality *find_pers(int level, char *clevel)
328 {
329         struct mdk_personality *pers;
330         list_for_each_entry(pers, &pers_list, list) {
331                 if (level != LEVEL_NONE && pers->level == level)
332                         return pers;
333                 if (strcmp(pers->name, clevel)==0)
334                         return pers;
335         }
336         return NULL;
337 }
338 
339 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
340 {
341         sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
342         return MD_NEW_SIZE_BLOCKS(size);
343 }
344 
345 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
346 {
347         sector_t size;
348 
349         size = rdev->sb_offset;
350 
351         if (chunk_size)
352                 size &= ~((sector_t)chunk_size/1024 - 1);
353         return size;
354 }
355 
356 static int alloc_disk_sb(mdk_rdev_t * rdev)
357 {
358         if (rdev->sb_page)
359                 MD_BUG();
360 
361         rdev->sb_page = alloc_page(GFP_KERNEL);
362         if (!rdev->sb_page) {
363                 printk(KERN_ALERT "md: out of memory.\n");
364                 return -EINVAL;
365         }
366 
367         return 0;
368 }
369 
370 static void free_disk_sb(mdk_rdev_t * rdev)
371 {
372         if (rdev->sb_page) {
373                 put_page(rdev->sb_page);
374                 rdev->sb_loaded = 0;
375                 rdev->sb_page = NULL;
376                 rdev->sb_offset = 0;
377                 rdev->size = 0;
378         }
379 }
380 
381 
382 static int super_written(struct bio *bio, unsigned int bytes_done, int error)
383 {
384         mdk_rdev_t *rdev = bio->bi_private;
385         mddev_t *mddev = rdev->mddev;
386         if (bio->bi_size)
387                 return 1;
388 
389         if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
390                 md_error(mddev, rdev);
391 
392         if (atomic_dec_and_test(&mddev->pending_writes))
393                 wake_up(&mddev->sb_wait);
394         bio_put(bio);
395         return 0;
396 }
397 
398 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
399 {
400         struct bio *bio2 = bio->bi_private;
401         mdk_rdev_t *rdev = bio2->bi_private;
402         mddev_t *mddev = rdev->mddev;
403         if (bio->bi_size)
404                 return 1;
405 
406         if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
407             error == -EOPNOTSUPP) {
408                 unsigned long flags;
409                 /* barriers don't appear to be supported :-( */
410                 set_bit(BarriersNotsupp, &rdev->flags);
411                 mddev->barriers_work = 0;
412                 spin_lock_irqsave(&mddev->write_lock, flags);
413                 bio2->bi_next = mddev->biolist;
414                 mddev->biolist = bio2;
415                 spin_unlock_irqrestore(&mddev->write_lock, flags);
416                 wake_up(&mddev->sb_wait);
417                 bio_put(bio);
418                 return 0;
419         }
420         bio_put(bio2);
421         bio->bi_private = rdev;
422         return super_written(bio, bytes_done, error);
423 }
424 
425 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
426                    sector_t sector, int size, struct page *page)
427 {
428         /* write first size bytes of page to sector of rdev
429          * Increment mddev->pending_writes before returning
430          * and decrement it on completion, waking up sb_wait
431          * if zero is reached.
432          * If an error occurred, call md_error
433          *
434          * As we might need to resubmit the request if BIO_RW_BARRIER
435          * causes ENOTSUPP, we allocate a spare bio...
436          */
437         struct bio *bio = bio_alloc(GFP_NOIO, 1);
438         int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
439 
440         bio->bi_bdev = rdev->bdev;
441         bio->bi_sector = sector;
442         bio_add_page(bio, page, size, 0);
443         bio->bi_private = rdev;
444         bio->bi_end_io = super_written;
445         bio->bi_rw = rw;
446 
447         atomic_inc(&mddev->pending_writes);
448         if (!test_bit(BarriersNotsupp, &rdev->flags)) {
449                 struct bio *rbio;
450                 rw |= (1<<BIO_RW_BARRIER);
451                 rbio = bio_clone(bio, GFP_NOIO);
452                 rbio->bi_private = bio;
453                 rbio->bi_end_io = super_written_barrier;
454                 submit_bio(rw, rbio);
455         } else
456                 submit_bio(rw, bio);
457 }
458 
459 void md_super_wait(mddev_t *mddev)
460 {
461         /* wait for all superblock writes that were scheduled to complete.
462          * if any had to be retried (due to BARRIER problems), retry them
463          */
464         DEFINE_WAIT(wq);
465         for(;;) {
466                 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
467                 if (atomic_read(&mddev->pending_writes)==0)
468                         break;
469                 while (mddev->biolist) {
470                         struct bio *bio;
471                         spin_lock_irq(&mddev->write_lock);
472                         bio = mddev->biolist;
473                         mddev->biolist = bio->bi_next ;
474                         bio->bi_next = NULL;
475                         spin_unlock_irq(&mddev->write_lock);
476                         submit_bio(bio->bi_rw, bio);
477                 }
478                 schedule();
479         }
480         finish_wait(&mddev->sb_wait, &wq);
481 }
482 
483 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
484 {
485         if (bio->bi_size)
486                 return 1;
487 
488         complete((struct completion*)bio->bi_private);
489         return 0;
490 }
491 
492 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
493                    struct page *page, int rw)
494 {
495         struct bio *bio = bio_alloc(GFP_NOIO, 1);
496         struct completion event;
497         int ret;
498 
499         rw |= (1 << BIO_RW_SYNC);
500 
501         bio->bi_bdev = bdev;
502         bio->bi_sector = sector;
503         bio_add_page(bio, page, size, 0);
504         init_completion(&event);
505         bio->bi_private = &event;
506         bio->bi_end_io = bi_complete;
507         submit_bio(rw, bio);
508         wait_for_completion(&event);
509 
510         ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
511         bio_put(bio);
512         return ret;
513 }
514 EXPORT_SYMBOL_GPL(sync_page_io);
515 
516 static int read_disk_sb(mdk_rdev_t * rdev, int size)
517 {
518         char b[BDEVNAME_SIZE];
519         if (!rdev->sb_page) {
520                 MD_BUG();
521                 return -EINVAL;
522         }
523         if (rdev->sb_loaded)
524                 return 0;
525 
526 
527         if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
528                 goto fail;
529         rdev->sb_loaded = 1;
530         return 0;
531 
532 fail:
533         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
534                 bdevname(rdev->bdev,b));
535         return -EINVAL;
536 }
537 
538 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
539 {
540         if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
541                 (sb1->set_uuid1 == sb2->set_uuid1) &&
542                 (sb1->set_uuid2 == sb2->set_uuid2) &&
543                 (sb1->set_uuid3 == sb2->set_uuid3))
544 
545                 return 1;
546 
547         return 0;
548 }
549 
550 
551 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
552 {
553         int ret;
554         mdp_super_t *tmp1, *tmp2;
555 
556         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
557         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
558 
559         if (!tmp1 || !tmp2) {
560                 ret = 0;
561                 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
562                 goto abort;
563         }
564 
565         *tmp1 = *sb1;
566         *tmp2 = *sb2;
567 
568         /*
569          * nr_disks is not constant
570          */
571         tmp1->nr_disks = 0;
572         tmp2->nr_disks = 0;
573 
574         if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
575                 ret = 0;
576         else
577                 ret = 1;
578 
579 abort:
580         kfree(tmp1);
581         kfree(tmp2);
582         return ret;
583 }
584 
585 static unsigned int calc_sb_csum(mdp_super_t * sb)
586 {
587         unsigned int disk_csum, csum;
588 
589         disk_csum = sb->sb_csum;
590         sb->sb_csum = 0;
591         csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
592         sb->sb_csum = disk_csum;
593         return csum;
594 }
595 
596 
597 /*
598  * Handle superblock details.
599  * We want to be able to handle multiple superblock formats
600  * so we have a common interface to them all, and an array of
601  * different handlers.
602  * We rely on user-space to write the initial superblock, and support
603  * reading and updating of superblocks.
604  * Interface methods are:
605  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
606  *      loads and validates a superblock on dev.
607  *      if refdev != NULL, compare superblocks on both devices
608  *    Return:
609  *      0 - dev has a superblock that is compatible with refdev
610  *      1 - dev has a superblock that is compatible and newer than refdev
611  *          so dev should be used as the refdev in future
612  *     -EINVAL superblock incompatible or invalid
613  *     -othererror e.g. -EIO
614  *
615  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
616  *      Verify that dev is acceptable into mddev.
617  *       The first time, mddev->raid_disks will be 0, and data from
618  *       dev should be merged in.  Subsequent calls check that dev
619  *       is new enough.  Return 0 or -EINVAL
620  *
621  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
622  *     Update the superblock for rdev with data in mddev
623  *     This does not write to disc.
624  *
625  */
626 
627 struct super_type  {
628         char            *name;
629         struct module   *owner;
630         int             (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
631         int             (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
632         void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
633 };
634 
635 /*
636  * load_super for 0.90.0 
637  */
638 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
639 {
640         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
641         mdp_super_t *sb;
642         int ret;
643         sector_t sb_offset;
644 
645         /*
646          * Calculate the position of the superblock,
647          * it's at the end of the disk.
648          *
649          * It also happens to be a multiple of 4Kb.
650          */
651         sb_offset = calc_dev_sboffset(rdev->bdev);
652         rdev->sb_offset = sb_offset;
653 
654         ret = read_disk_sb(rdev, MD_SB_BYTES);
655         if (ret) return ret;
656 
657         ret = -EINVAL;
658 
659         bdevname(rdev->bdev, b);
660         sb = (mdp_super_t*)page_address(rdev->sb_page);
661 
662         if (sb->md_magic != MD_SB_MAGIC) {
663                 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
664                        b);
665                 goto abort;
666         }
667 
668         if (sb->major_version != 0 ||
669             sb->minor_version < 90 ||
670             sb->minor_version > 91) {
671                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
672                         sb->major_version, sb->minor_version,
673                         b);
674                 goto abort;
675         }
676 
677         if (sb->raid_disks <= 0)
678                 goto abort;
679 
680         if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
681                 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
682                         b);
683                 goto abort;
684         }
685 
686         rdev->preferred_minor = sb->md_minor;
687         rdev->data_offset = 0;
688         rdev->sb_size = MD_SB_BYTES;
689 
690         if (sb->level == LEVEL_MULTIPATH)
691                 rdev->desc_nr = -1;
692         else
693                 rdev->desc_nr = sb->this_disk.number;
694 
695         if (refdev == 0)
696                 ret = 1;
697         else {
698                 __u64 ev1, ev2;
699                 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
700                 if (!uuid_equal(refsb, sb)) {
701                         printk(KERN_WARNING "md: %s has different UUID to %s\n",
702                                 b, bdevname(refdev->bdev,b2));
703                         goto abort;
704                 }
705                 if (!sb_equal(refsb, sb)) {
706                         printk(KERN_WARNING "md: %s has same UUID"
707                                " but different superblock to %s\n",
708                                b, bdevname(refdev->bdev, b2));
709                         goto abort;
710                 }
711                 ev1 = md_event(sb);
712                 ev2 = md_event(refsb);
713                 if (ev1 > ev2)
714                         ret = 1;
715                 else 
716                         ret = 0;
717         }
718         rdev->size = calc_dev_size(rdev, sb->chunk_size);
719 
720         if (rdev->size < sb->size && sb->level > 1)
721                 /* "this cannot possibly happen" ... */
722                 ret = -EINVAL;
723 
724  abort:
725         return ret;
726 }
727 
728 /*
729  * validate_super for 0.90.0
730  */
731 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
732 {
733         mdp_disk_t *desc;
734         mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
735 
736         rdev->raid_disk = -1;
737         rdev->flags = 0;
738         if (mddev->raid_disks == 0) {
739                 mddev->major_version = 0;
740                 mddev->minor_version = sb->minor_version;
741                 mddev->patch_version = sb->patch_version;
742                 mddev->persistent = ! sb->not_persistent;
743                 mddev->chunk_size = sb->chunk_size;
744                 mddev->ctime = sb->ctime;
745                 mddev->utime = sb->utime;
746                 mddev->level = sb->level;
747                 mddev->clevel[0] = 0;
748                 mddev->layout = sb->layout;
749                 mddev->raid_disks = sb->raid_disks;
750                 mddev->size = sb->size;
751                 mddev->events = md_event(sb);
752                 mddev->bitmap_offset = 0;
753                 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
754 
755                 if (mddev->minor_version >= 91) {
756                         mddev->reshape_position = sb->reshape_position;
757                         mddev->delta_disks = sb->delta_disks;
758                         mddev->new_level = sb->new_level;
759                         mddev->new_layout = sb->new_layout;
760                         mddev->new_chunk = sb->new_chunk;
761                 } else {
762                         mddev->reshape_position = MaxSector;
763                         mddev->delta_disks = 0;
764                         mddev->new_level = mddev->level;
765                         mddev->new_layout = mddev->layout;
766                         mddev->new_chunk = mddev->chunk_size;
767                 }
768 
769                 if (sb->state & (1<<MD_SB_CLEAN))
770                         mddev->recovery_cp = MaxSector;
771                 else {
772                         if (sb->events_hi == sb->cp_events_hi && 
773                                 sb->events_lo == sb->cp_events_lo) {
774                                 mddev->recovery_cp = sb->recovery_cp;
775                         } else
776                                 mddev->recovery_cp = 0;
777                 }
778 
779                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
780                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
781                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
782                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
783 
784                 mddev->max_disks = MD_SB_DISKS;
785 
786                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
787                     mddev->bitmap_file == NULL) {
788                         if (mddev->level != 1 && mddev->level != 4
789                             && mddev->level != 5 && mddev->level != 6
790                             && mddev->level != 10) {
791                                 /* FIXME use a better test */
792                                 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
793                                 return -EINVAL;
794                         }
795                         mddev->bitmap_offset = mddev->default_bitmap_offset;
796                 }
797 
798         } else if (mddev->pers == NULL) {
799                 /* Insist on good event counter while assembling */
800                 __u64 ev1 = md_event(sb);
801                 ++ev1;
802                 if (ev1 < mddev->events) 
803                         return -EINVAL;
804         } else if (mddev->bitmap) {
805                 /* if adding to array with a bitmap, then we can accept an
806                  * older device ... but not too old.
807                  */
808                 __u64 ev1 = md_event(sb);
809                 if (ev1 < mddev->bitmap->events_cleared)
810                         return 0;
811         } else /* just a hot-add of a new device, leave raid_disk at -1 */
812                 return 0;
813 
814         if (mddev->level != LEVEL_MULTIPATH) {
815                 desc = sb->disks + rdev->desc_nr;
816 
817                 if (desc->state & (1<<MD_DISK_FAULTY))
818                         set_bit(Faulty, &rdev->flags);
819                 else if (desc->state & (1<<MD_DISK_SYNC) &&
820                          desc->raid_disk < mddev->raid_disks) {
821                         set_bit(In_sync, &rdev->flags);
822                         rdev->raid_disk = desc->raid_disk;
823                 }
824                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
825                         set_bit(WriteMostly, &rdev->flags);
826         } else /* MULTIPATH are always insync */
827                 set_bit(In_sync, &rdev->flags);
828         return 0;
829 }
830 
831 /*
832  * sync_super for 0.90.0
833  */
834 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
835 {
836         mdp_super_t *sb;
837         struct list_head *tmp;
838         mdk_rdev_t *rdev2;
839         int next_spare = mddev->raid_disks;
840 
841 
842         /* make rdev->sb match mddev data..
843          *
844          * 1/ zero out disks
845          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
846          * 3/ any empty disks < next_spare become removed
847          *
848          * disks[0] gets initialised to REMOVED because
849          * we cannot be sure from other fields if it has
850          * been initialised or not.
851          */
852         int i;
853         int active=0, working=0,failed=0,spare=0,nr_disks=0;
854 
855         rdev->sb_size = MD_SB_BYTES;
856 
857         sb = (mdp_super_t*)page_address(rdev->sb_page);
858 
859         memset(sb, 0, sizeof(*sb));
860 
861         sb->md_magic = MD_SB_MAGIC;
862         sb->major_version = mddev->major_version;
863         sb->patch_version = mddev->patch_version;
864         sb->gvalid_words  = 0; /* ignored */
865         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
866         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
867         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
868         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
869 
870         sb->ctime = mddev->ctime;
871         sb->level = mddev->level;
872         sb->size  = mddev->size;
873         sb->raid_disks = mddev->raid_disks;
874         sb->md_minor = mddev->md_minor;
875         sb->not_persistent = !mddev->persistent;
876         sb->utime = mddev->utime;
877         sb->state = 0;
878         sb->events_hi = (mddev->events>>32);
879         sb->events_lo = (u32)mddev->events;
880 
881         if (mddev->reshape_position == MaxSector)
882                 sb->minor_version = 90;
883         else {
884                 sb->minor_version = 91;
885                 sb->reshape_position = mddev->reshape_position;
886                 sb->new_level = mddev->new_level;
887                 sb->delta_disks = mddev->delta_disks;
888                 sb->new_layout = mddev->new_layout;
889                 sb->new_chunk = mddev->new_chunk;
890         }
891         mddev->minor_version = sb->minor_version;
892         if (mddev->in_sync)
893         {
894                 sb->recovery_cp = mddev->recovery_cp;
895                 sb->cp_events_hi = (mddev->events>>32);
896                 sb->cp_events_lo = (u32)mddev->events;
897                 if (mddev->recovery_cp == MaxSector)
898                         sb->state = (1<< MD_SB_CLEAN);
899         } else
900                 sb->recovery_cp = 0;
901 
902         sb->layout = mddev->layout;
903         sb->chunk_size = mddev->chunk_size;
904 
905         if (mddev->bitmap && mddev->bitmap_file == NULL)
906                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
907 
908         sb->disks[0].state = (1<<MD_DISK_REMOVED);
909         ITERATE_RDEV(mddev,rdev2,tmp) {
910                 mdp_disk_t *d;
911                 int desc_nr;
912                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
913                     && !test_bit(Faulty, &rdev2->flags))
914                         desc_nr = rdev2->raid_disk;
915                 else
916                         desc_nr = next_spare++;
917                 rdev2->desc_nr = desc_nr;
918                 d = &sb->disks[rdev2->desc_nr];
919                 nr_disks++;
920                 d->number = rdev2->desc_nr;
921                 d->major = MAJOR(rdev2->bdev->bd_dev);
922                 d->minor = MINOR(rdev2->bdev->bd_dev);
923                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
924                     && !test_bit(Faulty, &rdev2->flags))
925                         d->raid_disk = rdev2->raid_disk;
926                 else
927                         d->raid_disk = rdev2->desc_nr; /* compatibility */
928                 if (test_bit(Faulty, &rdev2->flags))
929                         d->state = (1<<MD_DISK_FAULTY);
930                 else if (test_bit(In_sync, &rdev2->flags)) {
931                         d->state = (1<<MD_DISK_ACTIVE);
932                         d->state |= (1<<MD_DISK_SYNC);
933                         active++;
934                         working++;
935                 } else {
936                         d->state = 0;
937                         spare++;
938                         working++;
939                 }
940                 if (test_bit(WriteMostly, &rdev2->flags))
941                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
942         }
943         /* now set the "removed" and "faulty" bits on any missing devices */
944         for (i=0 ; i < mddev->raid_disks ; i++) {
945                 mdp_disk_t *d = &sb->disks[i];
946                 if (d->state == 0 && d->number == 0) {
947                         d->number = i;
948                         d->raid_disk = i;
949                         d->state = (1<<MD_DISK_REMOVED);
950                         d->state |= (1<<MD_DISK_FAULTY);
951                         failed++;
952                 }
953         }
954         sb->nr_disks = nr_disks;
955         sb->active_disks = active;
956         sb->working_disks = working;
957         sb->failed_disks = failed;
958         sb->spare_disks = spare;
959 
960         sb->this_disk = sb->disks[rdev->desc_nr];
961         sb->sb_csum = calc_sb_csum(sb);
962 }
963 
964 /*
965  * version 1 superblock
966  */
967 
968 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
969 {
970         unsigned int disk_csum, csum;
971         unsigned long long newcsum;
972         int size = 256 + le32_to_cpu(sb->max_dev)*2;
973         unsigned int *isuper = (unsigned int*)sb;
974         int i;
975 
976         disk_csum = sb->sb_csum;
977         sb->sb_csum = 0;
978         newcsum = 0;
979         for (i=0; size>=4; size -= 4 )
980                 newcsum += le32_to_cpu(*isuper++);
981 
982         if (size == 2)
983                 newcsum += le16_to_cpu(*(unsigned short*) isuper);
984 
985         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
986         sb->sb_csum = disk_csum;
987         return cpu_to_le32(csum);
988 }
989 
990 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
991 {
992         struct mdp_superblock_1 *sb;
993         int ret;
994         sector_t sb_offset;
995         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
996         int bmask;
997 
998         /*
999          * Calculate the position of the superblock.
1000          * It is always aligned to a 4K boundary and
1001          * depending on minor_version, it can be:
1002          * 0: At least 8K, but less than 12K, from end of device
1003          * 1: At start of device
1004          * 2: 4K from start of device.
1005          */
1006         switch(minor_version) {
1007         case 0:
1008                 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1009                 sb_offset -= 8*2;
1010                 sb_offset &= ~(sector_t)(4*2-1);
1011                 /* convert from sectors to K */
1012                 sb_offset /= 2;
1013                 break;
1014         case 1:
1015                 sb_offset = 0;
1016                 break;
1017         case 2:
1018                 sb_offset = 4;
1019                 break;
1020         default:
1021                 return -EINVAL;
1022         }
1023         rdev->sb_offset = sb_offset;
1024 
1025         /* superblock is rarely larger than 1K, but it can be larger,
1026          * and it is safe to read 4k, so we do that
1027          */
1028         ret = read_disk_sb(rdev, 4096);
1029         if (ret) return ret;
1030 
1031 
1032         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1033 
1034         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1035             sb->major_version != cpu_to_le32(1) ||
1036             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1037             le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1038             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1039                 return -EINVAL;
1040 
1041         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1042                 printk("md: invalid superblock checksum on %s\n",
1043                         bdevname(rdev->bdev,b));
1044                 return -EINVAL;
1045         }
1046         if (le64_to_cpu(sb->data_size) < 10) {
1047                 printk("md: data_size too small on %s\n",
1048                        bdevname(rdev->bdev,b));
1049                 return -EINVAL;
1050         }
1051         rdev->preferred_minor = 0xffff;
1052         rdev->data_offset = le64_to_cpu(sb->data_offset);
1053         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1054 
1055         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1056         bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1057         if (rdev->sb_size & bmask)
1058                 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1059 
1060         if (refdev == 0)
1061                 ret = 1;
1062         else {
1063                 __u64 ev1, ev2;
1064                 struct mdp_superblock_1 *refsb = 
1065                         (struct mdp_superblock_1*)page_address(refdev->sb_page);
1066 
1067                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1068                     sb->level != refsb->level ||
1069                     sb->layout != refsb->layout ||
1070                     sb->chunksize != refsb->chunksize) {
1071                         printk(KERN_WARNING "md: %s has strangely different"
1072                                 " superblock to %s\n",
1073                                 bdevname(rdev->bdev,b),