~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux-2.6.17/drivers/md/raid5.c

Version: ~ [ 2.6.16 ] ~ [ 2.6.17 ] ~
Architecture: ~ [ ia64 ] ~ [ i386 ] ~ [ arm ] ~ [ ppc ] ~ [ sparc64 ] ~

  1 /*
  2  * raid5.c : Multiple Devices driver for Linux
  3  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  4  *         Copyright (C) 1999, 2000 Ingo Molnar
  5  *
  6  * RAID-5 management functions.
  7  *
  8  * This program is free software; you can redistribute it and/or modify
  9  * it under the terms of the GNU General Public License as published by
 10  * the Free Software Foundation; either version 2, or (at your option)
 11  * any later version.
 12  *
 13  * You should have received a copy of the GNU General Public License
 14  * (for example /usr/src/linux/COPYING); if not, write to the Free
 15  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 16  */
 17 
 18 
 19 #include <linux/config.h>
 20 #include <linux/module.h>
 21 #include <linux/slab.h>
 22 #include <linux/raid/raid5.h>
 23 #include <linux/highmem.h>
 24 #include <linux/bitops.h>
 25 #include <linux/kthread.h>
 26 #include <asm/atomic.h>
 27 
 28 #include <linux/raid/bitmap.h>
 29 
 30 /*
 31  * Stripe cache
 32  */
 33 
 34 #define NR_STRIPES              256
 35 #define STRIPE_SIZE             PAGE_SIZE
 36 #define STRIPE_SHIFT            (PAGE_SHIFT - 9)
 37 #define STRIPE_SECTORS          (STRIPE_SIZE>>9)
 38 #define IO_THRESHOLD            1
 39 #define NR_HASH                 (PAGE_SIZE / sizeof(struct hlist_head))
 40 #define HASH_MASK               (NR_HASH - 1)
 41 
 42 #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
 43 
 44 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
 45  * order without overlap.  There may be several bio's per stripe+device, and
 46  * a bio could span several devices.
 47  * When walking this list for a particular stripe+device, we must never proceed
 48  * beyond a bio that extends past this device, as the next bio might no longer
 49  * be valid.
 50  * This macro is used to determine the 'next' bio in the list, given the sector
 51  * of the current stripe+device
 52  */
 53 #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
 54 /*
 55  * The following can be used to debug the driver
 56  */
 57 #define RAID5_DEBUG     0
 58 #define RAID5_PARANOIA  1
 59 #if RAID5_PARANOIA && defined(CONFIG_SMP)
 60 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
 61 #else
 62 # define CHECK_DEVLOCK()
 63 #endif
 64 
 65 #define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x)))
 66 #if RAID5_DEBUG
 67 #define inline
 68 #define __inline__
 69 #endif
 70 
 71 static void print_raid5_conf (raid5_conf_t *conf);
 72 
 73 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 74 {
 75         if (atomic_dec_and_test(&sh->count)) {
 76                 BUG_ON(!list_empty(&sh->lru));
 77                 BUG_ON(atomic_read(&conf->active_stripes)==0);
 78                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
 79                         if (test_bit(STRIPE_DELAYED, &sh->state))
 80                                 list_add_tail(&sh->lru, &conf->delayed_list);
 81                         else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 82                                  conf->seq_write == sh->bm_seq)
 83                                 list_add_tail(&sh->lru, &conf->bitmap_list);
 84                         else {
 85                                 clear_bit(STRIPE_BIT_DELAY, &sh->state);
 86                                 list_add_tail(&sh->lru, &conf->handle_list);
 87                         }
 88                         md_wakeup_thread(conf->mddev->thread);
 89                 } else {
 90                         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 91                                 atomic_dec(&conf->preread_active_stripes);
 92                                 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
 93                                         md_wakeup_thread(conf->mddev->thread);
 94                         }
 95                         atomic_dec(&conf->active_stripes);
 96                         if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 97                                 list_add_tail(&sh->lru, &conf->inactive_list);
 98                                 wake_up(&conf->wait_for_stripe);
 99                         }
100                 }
101         }
102 }
103 static void release_stripe(struct stripe_head *sh)
104 {
105         raid5_conf_t *conf = sh->raid_conf;
106         unsigned long flags;
107         
108         spin_lock_irqsave(&conf->device_lock, flags);
109         __release_stripe(conf, sh);
110         spin_unlock_irqrestore(&conf->device_lock, flags);
111 }
112 
113 static inline void remove_hash(struct stripe_head *sh)
114 {
115         PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
116 
117         hlist_del_init(&sh->hash);
118 }
119 
120 static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
121 {
122         struct hlist_head *hp = stripe_hash(conf, sh->sector);
123 
124         PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
125 
126         CHECK_DEVLOCK();
127         hlist_add_head(&sh->hash, hp);
128 }
129 
130 
131 /* find an idle stripe, make sure it is unhashed, and return it. */
132 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
133 {
134         struct stripe_head *sh = NULL;
135         struct list_head *first;
136 
137         CHECK_DEVLOCK();
138         if (list_empty(&conf->inactive_list))
139                 goto out;
140         first = conf->inactive_list.next;
141         sh = list_entry(first, struct stripe_head, lru);
142         list_del_init(first);
143         remove_hash(sh);
144         atomic_inc(&conf->active_stripes);
145 out:
146         return sh;
147 }
148 
149 static void shrink_buffers(struct stripe_head *sh, int num)
150 {
151         struct page *p;
152         int i;
153 
154         for (i=0; i<num ; i++) {
155                 p = sh->dev[i].page;
156                 if (!p)
157                         continue;
158                 sh->dev[i].page = NULL;
159                 put_page(p);
160         }
161 }
162 
163 static int grow_buffers(struct stripe_head *sh, int num)
164 {
165         int i;
166 
167         for (i=0; i<num; i++) {
168                 struct page *page;
169 
170                 if (!(page = alloc_page(GFP_KERNEL))) {
171                         return 1;
172                 }
173                 sh->dev[i].page = page;
174         }
175         return 0;
176 }
177 
178 static void raid5_build_block (struct stripe_head *sh, int i);
179 
180 static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
181 {
182         raid5_conf_t *conf = sh->raid_conf;
183         int i;
184 
185         BUG_ON(atomic_read(&sh->count) != 0);
186         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
187         
188         CHECK_DEVLOCK();
189         PRINTK("init_stripe called, stripe %llu\n", 
190                 (unsigned long long)sh->sector);
191 
192         remove_hash(sh);
193         
194         sh->sector = sector;
195         sh->pd_idx = pd_idx;
196         sh->state = 0;
197 
198         sh->disks = disks;
199 
200         for (i = sh->disks; i--; ) {
201                 struct r5dev *dev = &sh->dev[i];
202 
203                 if (dev->toread || dev->towrite || dev->written ||
204                     test_bit(R5_LOCKED, &dev->flags)) {
205                         printk("sector=%llx i=%d %p %p %p %d\n",
206                                (unsigned long long)sh->sector, i, dev->toread,
207                                dev->towrite, dev->written,
208                                test_bit(R5_LOCKED, &dev->flags));
209                         BUG();
210                 }
211                 dev->flags = 0;
212                 raid5_build_block(sh, i);
213         }
214         insert_hash(conf, sh);
215 }
216 
217 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
218 {
219         struct stripe_head *sh;
220         struct hlist_node *hn;
221 
222         CHECK_DEVLOCK();
223         PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
224         hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
225                 if (sh->sector == sector && sh->disks == disks)
226                         return sh;
227         PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
228         return NULL;
229 }
230 
231 static void unplug_slaves(mddev_t *mddev);
232 static void raid5_unplug_device(request_queue_t *q);
233 
234 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
235                                              int pd_idx, int noblock)
236 {
237         struct stripe_head *sh;
238 
239         PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
240 
241         spin_lock_irq(&conf->device_lock);
242 
243         do {
244                 wait_event_lock_irq(conf->wait_for_stripe,
245                                     conf->quiesce == 0,
246                                     conf->device_lock, /* nothing */);
247                 sh = __find_stripe(conf, sector, disks);
248                 if (!sh) {
249                         if (!conf->inactive_blocked)
250                                 sh = get_free_stripe(conf);
251                         if (noblock && sh == NULL)
252                                 break;
253                         if (!sh) {
254                                 conf->inactive_blocked = 1;
255                                 wait_event_lock_irq(conf->wait_for_stripe,
256                                                     !list_empty(&conf->inactive_list) &&
257                                                     (atomic_read(&conf->active_stripes)
258                                                      < (conf->max_nr_stripes *3/4)
259                                                      || !conf->inactive_blocked),
260                                                     conf->device_lock,
261                                                     unplug_slaves(conf->mddev)
262                                         );
263                                 conf->inactive_blocked = 0;
264                         } else
265                                 init_stripe(sh, sector, pd_idx, disks);
266                 } else {
267                         if (atomic_read(&sh->count)) {
268                           BUG_ON(!list_empty(&sh->lru));
269                         } else {
270                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
271                                         atomic_inc(&conf->active_stripes);
272                                 if (!list_empty(&sh->lru))
273                                         list_del_init(&sh->lru);
274                         }
275                 }
276         } while (sh == NULL);
277 
278         if (sh)
279                 atomic_inc(&sh->count);
280 
281         spin_unlock_irq(&conf->device_lock);
282         return sh;
283 }
284 
285 static int grow_one_stripe(raid5_conf_t *conf)
286 {
287         struct stripe_head *sh;
288         sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
289         if (!sh)
290                 return 0;
291         memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
292         sh->raid_conf = conf;
293         spin_lock_init(&sh->lock);
294 
295         if (grow_buffers(sh, conf->raid_disks)) {
296                 shrink_buffers(sh, conf->raid_disks);
297                 kmem_cache_free(conf->slab_cache, sh);
298                 return 0;
299         }
300         sh->disks = conf->raid_disks;
301         /* we just created an active stripe so... */
302         atomic_set(&sh->count, 1);
303         atomic_inc(&conf->active_stripes);
304         INIT_LIST_HEAD(&sh->lru);
305         release_stripe(sh);
306         return 1;
307 }
308 
309 static int grow_stripes(raid5_conf_t *conf, int num)
310 {
311         kmem_cache_t *sc;
312         int devs = conf->raid_disks;
313 
314         sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
315         sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
316         conf->active_name = 0;
317         sc = kmem_cache_create(conf->cache_name[conf->active_name],
318                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
319                                0, 0, NULL, NULL);
320         if (!sc)
321                 return 1;
322         conf->slab_cache = sc;
323         conf->pool_size = devs;
324         while (num--) {
325                 if (!grow_one_stripe(conf))
326                         return 1;
327         }
328         return 0;
329 }
330 
#ifdef CONFIG_MD_RAID5_RESHAPE
/*
 * Make all the stripes able to hold 'newsize' devices.
 * New slots in each stripe get 'page' set to a new page.
 *
 * This happens in stages:
 * 1/ create a new kmem_cache and allocate the required number of
 *    stripe_heads.
 * 2/ gather all the old stripe_heads and transfer the pages across
 *    to the new stripe_heads.  This will have the side effect of
 *    freezing the array as once all stripe_heads have been collected,
 *    no IO will be possible.  Old stripe heads are freed once their
 *    pages have been transferred over, and the old kmem_cache is
 *    freed when all stripes are done.
 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
 *    we simply record a failure status - no need to clean anything up.
 * 4/ allocate new pages for the new slots in the new stripe_heads.
 *    If this fails, we don't bother trying to shrink the
 *    stripe_heads down again, we just leave them as they are.
 *    As each stripe_head is processed the new one is released into
 *    active service.
 *
 * Once step 2 is started, we cannot afford to wait for a write,
 * so we use GFP_NOIO allocations.
 *
 * Returns 0 on success (or when newsize does not exceed the current
 * pool size), -ENOMEM on allocation failure.
 *
 * NOTE(review): when step 3 or 4 fails, -ENOMEM is returned but
 * conf->pool_size is still raised to newsize, so a retry with the same
 * newsize would return 0 without redoing those steps - confirm that
 * callers cope with this.
 */
static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	struct stripe_head *old_sh, *new_sh;
	LIST_HEAD(newstripes);
	struct disk_info *new_disks;
	kmem_cache_t *new_cache;
	int ret = 0;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	/* Step 1: new cache (under the currently unused name) + stripe_heads */
	new_cache = kmem_cache_create(conf->cache_name[1-conf->active_name],
				      sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
				      0, 0, NULL, NULL);
	if (!new_cache)
		return -ENOMEM;

	/* 'i' counts the stripe_heads that still have to be allocated */
	i = conf->max_nr_stripes;
	while (i != 0) {
		new_sh = kmem_cache_alloc(new_cache, GFP_KERNEL);
		if (!new_sh)
			break;

		memset(new_sh, 0, sizeof(*new_sh) + (newsize-1)*sizeof(struct r5dev));

		new_sh->raid_conf = conf;
		spin_lock_init(&new_sh->lock);

		list_add(&new_sh->lru, &newstripes);
		i--;
	}
	if (i != 0) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			new_sh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&new_sh->lru);
			kmem_cache_free(new_cache, new_sh);
		}
		kmem_cache_destroy(new_cache);
		return -ENOMEM;
	}

	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(new_sh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock,
				    unplug_slaves(conf->mddev)
			);
		old_sh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);

		atomic_set(&new_sh->count, 1);
		/* hand the existing pages over; slots beyond them stay empty */
		for (i = 0; i < conf->pool_size; i++)
			new_sh->dev[i].page = old_sh->dev[i].page;
		for ( ; i < newsize; i++)
			new_sh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, old_sh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks.
	 */
	new_disks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (!new_disks) {
		ret = -ENOMEM;
	} else {
		for (i = 0; i < conf->raid_disks; i++)
			new_disks[i] = conf->disks[i];
		kfree(conf->disks);
		conf->disks = new_disks;
	}

	/* Step 4, return new stripes to service */
	while (!list_empty(&newstripes)) {
		new_sh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&new_sh->lru);

		for (i = conf->raid_disks; i < newsize; i++) {
			struct page *page;

			if (new_sh->dev[i].page != NULL)
				continue;
			page = alloc_page(GFP_NOIO);
			new_sh->dev[i].page = page;
			if (!page)
				ret = -ENOMEM;
		}
		release_stripe(new_sh);
	}
	/* critical section pass, GFP_NOIO no longer needed */

	conf->slab_cache = new_cache;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return ret;
}
#endif
453 
454 static int drop_one_stripe(raid5_conf_t *conf)
455 {
456         struct stripe_head *sh;
457 
458         spin_lock_irq(&conf->device_lock);
459         sh = get_free_stripe(conf);
460         spin_unlock_irq(&conf->device_lock);
461         if (!sh)
462                 return 0;
463         BUG_ON(atomic_read(&sh->count));
464         shrink_buffers(sh, conf->pool_size);
465         kmem_cache_free(conf->slab_cache, sh);
466         atomic_dec(&conf->active_stripes);
467         return 1;
468 }
469 
470 static void shrink_stripes(raid5_conf_t *conf)
471 {
472         while (drop_one_stripe(conf))
473                 ;
474 
475         if (conf->slab_cache)
476                 kmem_cache_destroy(conf->slab_cache);
477         conf->slab_cache = NULL;
478 }
479 
480 static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
481                                    int error)
482 {
483         struct stripe_head *sh = bi->bi_private;
484         raid5_conf_t *conf = sh->raid_conf;
485         int disks = sh->disks, i;
486         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
487 
488         if (bi->bi_size)
489                 return 1;
490 
491         for (i=0 ; i<disks; i++)
492                 if (bi == &sh->dev[i].req)
493                         break;
494 
495         PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", 
496                 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 
497                 uptodate);
498         if (i == disks) {
499                 BUG();
500                 return 0;
501         }
502 
503         if (uptodate) {
504 #if 0
505                 struct bio *bio;
506                 unsigned long flags;
507                 spin_lock_irqsave(&conf->device_lock, flags);
508                 /* we can return a buffer if we bypassed the cache or
509                  * if the top buffer is not in highmem.  If there are
510                  * multiple buffers, leave the extra work to
511                  * handle_stripe
512                  */
513                 buffer = sh->bh_read[i];
514                 if (buffer &&
515                     (!PageHighMem(buffer->b_page)
516                      || buffer->b_page == bh->b_page )
517                         ) {
518                         sh->bh_read[i] = buffer->b_reqnext;
519                         buffer->b_reqnext = NULL;
520                 } else
521                         buffer = NULL;
522                 spin_unlock_irqrestore(&conf->device_lock, flags);
523                 if (sh->bh_page[i]==bh->b_page)
524                         set_buffer_uptodate(bh);
525                 if (buffer) {
526                         if (buffer->b_page != bh->b_page)
527                                 memcpy(buffer->b_data, bh->b_data, bh->b_size);
528                         buffer->b_end_io(buffer, 1);
529                 }
530 #else
531                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
532 #endif
533                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
534                         printk(KERN_INFO "raid5: read error corrected!!\n");
535                         clear_bit(R5_ReadError, &sh->dev[i].flags);
536                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
537                 }
538                 if (atomic_read(&conf->disks[i].rdev->read_errors))
539                         atomic_set(&conf->disks[i].rdev->read_errors, 0);
540         } else {
541                 int retry = 0;
542                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
543                 atomic_inc(&conf->disks[i].rdev->read_errors);
544                 if (conf->mddev->degraded)
545                         printk(KERN_WARNING "raid5: read error not correctable.\n");
546                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
547                         /* Oh, no!!! */
548                         printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
549                 else if (atomic_read(&conf->disks[i].rdev->read_errors)
550                          > conf->max_nr_stripes)
551                         printk(KERN_WARNING
552                                "raid5: Too many read errors, failing device.\n");
553                 else
554                         retry = 1;
555                 if (retry)
556                         set_bit(R5_ReadError, &sh->dev[i].flags);
557                 else {
558                         clear_bit(R5_ReadError, &sh->dev[i].flags);
559                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
560                         md_error(conf->mddev, conf->disks[i].rdev);
561                 }
562         }
563         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
564 #if 0
565         /* must restore b_page before unlocking buffer... */
566         if (sh->bh_page[i] != bh->b_page) {
567                 bh->b_page = sh->bh_page[i];
568                 bh->b_data = page_address(bh->b_page);
569                 clear_buffer_uptodate(bh);
570         }
571 #endif
572         clear_bit(R5_LOCKED, &sh->dev[i].flags);
573         set_bit(STRIPE_HANDLE, &sh->state);
574         release_stripe(sh);
575         return 0;
576 }
577 
578 static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
579                                     int error)
580 {
581         struct stripe_head *sh = bi->bi_private;
582         raid5_conf_t *conf = sh->raid_conf;
583         int disks = sh->disks, i;
584         unsigned long flags;
585         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
586 
587         if (bi->bi_size)
588                 return 1;
589 
590         for (i=0 ; i<disks; i++)
591                 if (bi == &sh->dev[i].req)
592                         break;
593 
594         PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", 
595                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
596                 uptodate);
597         if (i == disks) {
598                 BUG();
599                 return 0;
600         }
601 
602         spin_lock_irqsave(&conf->device_lock, flags);
603         if (!uptodate)
604                 md_error(conf->mddev, conf->disks[i].rdev);
605 
606         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
607         
608         clear_bit(R5_LOCKED, &sh->dev[i].flags);
609         set_bit(STRIPE_HANDLE, &sh->state);
610         __release_stripe(conf, sh);
611         spin_unlock_irqrestore(&conf->device_lock, flags);
612         return 0;
613 }
614 
615 
616 static sector_t compute_blocknr(struct stripe_head *sh, int i);
617         
618 static void raid5_build_block (struct stripe_head *sh, int i)
619 {
620         struct r5dev *dev = &sh->dev[i];
621 
622         bio_init(&dev->req);
623         dev->req.bi_io_vec = &dev->vec;
624         dev->req.bi_vcnt++;
625         dev->req.bi_max_vecs++;
626         dev->vec.bv_page = dev->page;
627         dev->vec.bv_len = STRIPE_SIZE;
628         dev->vec.bv_offset = 0;
629 
630         dev->req.bi_sector = sh->sector;
631         dev->req.bi_private = sh;
632 
633         dev->flags = 0;
634         if (i != sh->pd_idx)
635                 dev->sector = compute_blocknr(sh, i);
636 }
637 
638 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
639 {
640         char b[BDEVNAME_SIZE];
641         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
642         PRINTK("raid5: error called\n");
643 
644         if (!test_bit(Faulty, &rdev->flags)) {
645                 mddev->sb_dirty = 1;
646                 if (test_bit(In_sync, &rdev->flags)) {
647                         conf->working_disks--;
648                         mddev->degraded++;
649                         conf->failed_disks++;
650                         clear_bit(In_sync, &rdev->flags);
651                         /*
652                          * if recovery was running, make sure it aborts.
653                          */
654                         set_bit(MD_RECOVERY_ERR, &mddev->recovery);
655                 }
656                 set_bit(Faulty, &rdev->flags);
657                 printk (KERN_ALERT
658                         "raid5: Disk failure on %s, disabling device."
659                         " Operation continuing on %d devices\n",
660                         bdevname(rdev->bdev,b), conf->working_disks);
661         }
662 }       
663 
664 /*
665  * Input: a 'big' sector number,
666  * Output: index of the data and parity disk, and the sector # in them.
667  */
668 static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
669                         unsigned int data_disks, unsigned int * dd_idx,
670                         unsigned int * pd_idx, raid5_conf_t *conf)
671 {
672         long stripe;
673         unsigned long chunk_number;
674         unsigned int chunk_offset;
675         sector_t new_sector;
676         int sectors_per_chunk = conf->chunk_size >> 9;
677 
678         /* First compute the information on this sector */
679 
680         /*
681          * Compute the chunk number and the sector offset inside the chunk
682          */
683         chunk_offset = sector_div(r_sector, sectors_per_chunk);
684         chunk_number = r_sector;
685         BUG_ON(r_sector != chunk_number);
686 
687         /*
688          * Compute the stripe number
689          */
690         stripe = chunk_number / data_disks;
691 
692         /*
693          * Compute the data disk and parity disk indexes inside the stripe
694          */
695         *dd_idx = chunk_number % data_disks;
696 
697         /*
698          * Select the parity disk based on the user selected algorithm.
699          */
700         if (conf->level == 4)
701                 *pd_idx = data_disks;
702         else switch (conf->algorithm) {
703                 case ALGORITHM_LEFT_ASYMMETRIC:
704                         *pd_idx = data_disks - stripe % raid_disks;
705                         if (*dd_idx >= *pd_idx)
706                                 (*dd_idx)++;
707                         break;
708                 case ALGORITHM_RIGHT_ASYMMETRIC:
709                         *pd_idx = stripe % raid_disks;
710                         if (*dd_idx >= *pd_idx)
711                                 (*dd_idx)++;
712                         break;
713                 case ALGORITHM_LEFT_SYMMETRIC:
714                         *pd_idx = data_disks - stripe % raid_disks;
715                         *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
716                         break;
717                 case ALGORITHM_RIGHT_SYMMETRIC:
718                         *pd_idx = stripe % raid_disks;
719                         *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
720                         break;
721                 default:
722                         printk(KERN_ERR "raid5: unsupported algorithm %d\n",
723                                 conf->algorithm);
724         }
725 
726         /*
727          * Finally, compute the new sector number
728          */
729         new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
730         return new_sector;
731 }
732 
733 
734 static sector_t compute_blocknr(struct stripe_head *sh, int i)
735 {
736         raid5_conf_t *conf = sh->raid_conf;
737         int raid_disks = sh->disks, data_disks = raid_disks - 1;
738         sector_t new_sector = sh->sector, check;
739         int sectors_per_chunk = conf->chunk_size >> 9;
740         sector_t stripe;
741         int chunk_offset;
742         int chunk_number, dummy1, dummy2, dd_idx = i;
743         sector_t r_sector;
744 
745         chunk_offset = sector_div(new_sector, sectors_per_chunk);
746         stripe = new_sector;
747         BUG_ON(new_sector != stripe);
748 
749         
750         switch (conf->algorithm) {
751                 case ALGORITHM_LEFT_ASYMMETRIC:
752                 case ALGORITHM_RIGHT_ASYMMETRIC:
753                         if (i > sh->pd_idx)
754                                 i--;
755                         break;
756                 case ALGORITHM_LEFT_SYMMETRIC:
757                 case ALGORITHM_RIGHT_SYMMETRIC:
758                         if (i < sh->pd_idx)
759                                 i += raid_disks;
760                         i -= (sh->pd_idx + 1);
761                         break;
762                 default:
763                         printk(KERN_ERR "raid5: unsupported algorithm %d\n",
764                                 conf->algorithm);
765         }
766 
767         chunk_number = stripe * data_disks + i;
768         r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
769 
770         check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
771         if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
772                 printk(KERN_ERR "compute_blocknr: map not correct\n");
773                 return 0;
774         }
775         return r_sector;
776 }
777 
778 
779 
780 /*
781  * Copy data between a page in the stripe cache, and a bio.
782  * There are no alignment or size guarantees between the page or the
783  * bio except that there is some overlap.
784  * All iovecs in the bio must be considered.
785  */
786 static void copy_data(int frombio, struct bio *bio,
787                      struct page *page,
788                      sector_t sector)
789 {
790         char *pa = page_address(page);
791         struct bio_vec *bvl;
792         int i;
793         int page_offset;
794 
795         if (bio->bi_sector >= sector)
796                 page_offset = (signed)(bio->bi_sector - sector) * 512;
797         else
798                 page_offset = (signed)(sector - bio->bi_sector) * -512;
799         bio_for_each_segment(bvl, bio, i) {
800                 int len = bio_iovec_idx(bio,i)->bv_len;
801                 int clen;
802                 int b_offset = 0;
803 
804                 if (page_offset < 0) {
805                         b_offset = -page_offset;
806                         page_offset += b_offset;
807                         len -= b_offset;
808                 }
809 
810                 if (len > 0 && page_offset + len > STRIPE_SIZE)
811                         clen = STRIPE_SIZE - page_offset;
812                 else clen = len;
813                         
814                 if (clen > 0) {
815                         char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
816                         if (frombio)
817                                 memcpy(pa+page_offset, ba+b_offset, clen);
818                         else
819                                 memcpy(ba+b_offset, pa+page_offset, clen);
820                         __bio_kunmap_atomic(ba, KM_USER0);
821                 }
822                 if (clen < len) /* hit end of page */
823                         break;
824                 page_offset +=  len;
825         }
826 }
827 
/*
 * Flush helper for code that collects block pointers before calling
 * xor_block(): once 'count' has reached MAX_XOR_BLOCKS, xor_block() is run
 * over the collected entries and 'count' is reset to 1, so ptr[0] is kept
 * and slots 1.. can be filled again.
 *
 * The macro expands in place and relies on variables named 'count' and
 * 'ptr' existing in the calling scope.
 */
#define check_xor()                                                     \
        do {                                                            \
                if (count == MAX_XOR_BLOCKS) {                          \
                        xor_block(count, STRIPE_SIZE, ptr);             \
                        count = 1;                                      \
                }                                                       \
        } while (0)
834 
835 
836 static void compute_block(struct stripe_head *sh, int dd_idx)
837 {
838         int i, count, disks = sh->disks;
839         void *ptr[MAX_XOR_BLOCKS], *p;
840 
841         PRINTK("compute_block, stripe %llu, idx %d\n", 
842                 (unsigned long long)sh->sector, dd_idx);
843 
844         ptr[0] = page_address(sh->dev[dd_idx].page);
845         memset(ptr[0], 0, STRIPE_SIZE);
846         count = 1;
847         for (i = disks ; i--; ) {
848                 if (i == dd_idx)
849                         continue;
850                 p = page_address(sh->dev[i].page);
851                 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
852                         ptr[count++] = p;
853                 else
854                         printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
855                                 " not present\n", dd_idx,
856                                 (unsigned long long)sh->sector, i);
857 
858                 check_xor();
859         }
860         if (count != 1)
861                 xor_block(count, STRIPE_SIZE, ptr);
862         set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
863 }
864 
865 static void compute_parity(struct stripe_head *sh, int method)
866 {
867         raid5_conf_t *conf = sh->raid_conf;
868         int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
869         void *ptr[MAX_XOR_BLOCKS];
870         struct bio *chosen;
871 
872         PRINTK("compute_parity, stripe %llu, method %d\n",
873                 (unsigned long long)sh->sector, method);
874 
875         count = 1;
876         ptr[0] = page_address(sh->dev[pd_idx].page);
877         switch(method) {
878         case READ_MODIFY_WRITE:
879                 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
880                 for (i=disks ; i-- ;) {
881                         if (i==pd_idx)
882                                 continue;
883                         if (sh->dev[i].towrite &&
884                             test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
885                                 ptr[count++] = page_address(sh->dev[i].page);
886                                 chosen = sh->dev[i].towrite;
887                                 sh->dev[i].towrite = NULL;
888 
889                                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
890                                         wake_up(&conf->wait_for_overlap);
891 
892                                 BUG_ON(sh->dev[i].written);
893                                 sh->dev[i].written = chosen;
894                                 check_xor();
895                         }
896                 }
897                 break;
898         case RECONSTRUCT_WRITE:
899                 memset(ptr[0], 0, STRIPE_SIZE);
900                 for (i= disks; i-- ;)
901                         if (i!=pd_idx && sh->dev[i].towrite) {
902                                 chosen = sh->dev[i].towrite;
903                                 sh->dev[i].towrite = NULL;
904 
905                                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
906                                         wake_up(&conf->wait_for_overlap);
907 
908                                 BUG_ON(sh->dev[i].written);
909                                 sh->dev[i].written = chosen;
910                         }
911                 break;
912         case CHECK_PARITY:
913                 break;
914         }
915         if (count>1) {
916                 xor_block(count, STRIPE_SIZE, ptr);
917                 count = 1;
918         }
919         
920         for (i = disks; i--;)
921                 if (sh->dev[i].written) {
922                         sector_t sector = sh->dev[i].sector;
923                         struct bio *wbi = sh->dev[i].written;
924                         while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
925                                 copy_data(1, wbi, sh->dev[i].page, sector);
926                                 wbi = r5_next_bio(wbi, sector);
927                         }
928 
929                         set_bit(R5_LOCKED, &sh->dev[