1 /*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 *
6 * RAID-5 management functions.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
11 * any later version.
12 *
13 * You should have received a copy of the GNU General Public License
14 * (for example /usr/src/linux/COPYING); if not, write to the Free
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16 */
17
18
19 #include <linux/config.h>
20 #include <linux/module.h>
21 #include <linux/slab.h>
22 #include <linux/raid/raid5.h>
23 #include <linux/highmem.h>
24 #include <linux/bitops.h>
25 #include <linux/kthread.h>
26 #include <asm/atomic.h>
27
28 #include <linux/raid/bitmap.h>
29
/*
 * Stripe cache
 *
 * A stripe unit is one page.  Sizes below are expressed in 512-byte
 * sectors, hence the shifts by 9.
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE >> 9)
#define IO_THRESHOLD		1
/* Number of hlist heads that fit in a single page. */
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

/* Address of the hash bucket for the stripe starting at 'sector'. */
#define stripe_hash(conf, sector) \
	(&((conf)->stripe_hashtbl[((sector) >> STRIPE_SHIFT) & HASH_MASK]))
43
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device: it yields bio->bi_next while the bio ends
 * before sect + STRIPE_SECTORS, and NULL otherwise.
 *
 * Both arguments are parenthesized in the expansion so that compound
 * expressions may be passed safely.  'bio' is evaluated more than once.
 */
#define r5_next_bio(bio, sect) \
	((((bio)->bi_sector + ((bio)->bi_size >> 9)) < ((sect) + STRIPE_SECTORS)) \
		? (bio)->bi_next : NULL)
/*
 * The following can be used to debug the driver
 *
 * RAID5_DEBUG    - non-zero makes PRINTK() emit its message and defines
 *                  'inline' / '__inline__' away to nothing.
 * RAID5_PARANOIA - non-zero (on SMP builds) makes CHECK_DEVLOCK() assert
 *                  that conf->device_lock is held; the macro relies on a
 *                  variable named 'conf' being in scope at the call site.
 */
#define RAID5_DEBUG	0
#define RAID5_PARANOIA	1

#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK()	assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

/* The arguments are still compiled when RAID5_DEBUG is 0, just never run. */
#define PRINTK(...)	((void)(RAID5_DEBUG && printk(__VA_ARGS__)))

#if RAID5_DEBUG
# define inline
# define __inline__
#endif
70
71 static void print_raid5_conf (raid5_conf_t *conf);
72
73 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
74 {
75 if (atomic_dec_and_test(&sh->count)) {
76 BUG_ON(!list_empty(&sh->lru));
77 BUG_ON(atomic_read(&conf->active_stripes)==0);
78 if (test_bit(STRIPE_HANDLE, &sh->state)) {
79 if (test_bit(STRIPE_DELAYED, &sh->state))
80 list_add_tail(&sh->lru, &conf->delayed_list);
81 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
82 conf->seq_write == sh->bm_seq)
83 list_add_tail(&sh->lru, &conf->bitmap_list);
84 else {
85 clear_bit(STRIPE_BIT_DELAY, &sh->state);
86 list_add_tail(&sh->lru, &conf->handle_list);
87 }
88 md_wakeup_thread(conf->mddev->thread);
89 } else {
90 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
91 atomic_dec(&conf->preread_active_stripes);
92 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
93 md_wakeup_thread(conf->mddev->thread);
94 }
95 atomic_dec(&conf->active_stripes);
96 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
97 list_add_tail(&sh->lru, &conf->inactive_list);
98 wake_up(&conf->wait_for_stripe);
99 }
100 }
101 }
102 }
103 static void release_stripe(struct stripe_head *sh)
104 {
105 raid5_conf_t *conf = sh->raid_conf;
106 unsigned long flags;
107
108 spin_lock_irqsave(&conf->device_lock, flags);
109 __release_stripe(conf, sh);
110 spin_unlock_irqrestore(&conf->device_lock, flags);
111 }
112
113 static inline void remove_hash(struct stripe_head *sh)
114 {
115 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
116
117 hlist_del_init(&sh->hash);
118 }
119
120 static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
121 {
122 struct hlist_head *hp = stripe_hash(conf, sh->sector);
123
124 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
125
126 CHECK_DEVLOCK();
127 hlist_add_head(&sh->hash, hp);
128 }
129
130
131 /* find an idle stripe, make sure it is unhashed, and return it. */
132 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
133 {
134 struct stripe_head *sh = NULL;
135 struct list_head *first;
136
137 CHECK_DEVLOCK();
138 if (list_empty(&conf->inactive_list))
139 goto out;
140 first = conf->inactive_list.next;
141 sh = list_entry(first, struct stripe_head, lru);
142 list_del_init(first);
143 remove_hash(sh);
144 atomic_inc(&conf->active_stripes);
145 out:
146 return sh;
147 }
148
149 static void shrink_buffers(struct stripe_head *sh, int num)
150 {
151 struct page *p;
152 int i;
153
154 for (i=0; i<num ; i++) {
155 p = sh->dev[i].page;
156 if (!p)
157 continue;
158 sh->dev[i].page = NULL;
159 put_page(p);
160 }
161 }
162
163 static int grow_buffers(struct stripe_head *sh, int num)
164 {
165 int i;
166
167 for (i=0; i<num; i++) {
168 struct page *page;
169
170 if (!(page = alloc_page(GFP_KERNEL))) {
171 return 1;
172 }
173 sh->dev[i].page = page;
174 }
175 return 0;
176 }
177
178 static void raid5_build_block (struct stripe_head *sh, int i);
179
180 static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
181 {
182 raid5_conf_t *conf = sh->raid_conf;
183 int i;
184
185 BUG_ON(atomic_read(&sh->count) != 0);
186 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
187
188 CHECK_DEVLOCK();
189 PRINTK("init_stripe called, stripe %llu\n",
190 (unsigned long long)sh->sector);
191
192 remove_hash(sh);
193
194 sh->sector = sector;
195 sh->pd_idx = pd_idx;
196 sh->state = 0;
197
198 sh->disks = disks;
199
200 for (i = sh->disks; i--; ) {
201 struct r5dev *dev = &sh->dev[i];
202
203 if (dev->toread || dev->towrite || dev->written ||
204 test_bit(R5_LOCKED, &dev->flags)) {
205 printk("sector=%llx i=%d %p %p %p %d\n",
206 (unsigned long long)sh->sector, i, dev->toread,
207 dev->towrite, dev->written,
208 test_bit(R5_LOCKED, &dev->flags));
209 BUG();
210 }
211 dev->flags = 0;
212 raid5_build_block(sh, i);
213 }
214 insert_hash(conf, sh);
215 }
216
217 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
218 {
219 struct stripe_head *sh;
220 struct hlist_node *hn;
221
222 CHECK_DEVLOCK();
223 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
224 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
225 if (sh->sector == sector && sh->disks == disks)
226 return sh;
227 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
228 return NULL;
229 }
230
231 static void unplug_slaves(mddev_t *mddev);
232 static void raid5_unplug_device(request_queue_t *q);
233
234 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
235 int pd_idx, int noblock)
236 {
237 struct stripe_head *sh;
238
239 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
240
241 spin_lock_irq(&conf->device_lock);
242
243 do {
244 wait_event_lock_irq(conf->wait_for_stripe,
245 conf->quiesce == 0,
246 conf->device_lock, /* nothing */);
247 sh = __find_stripe(conf, sector, disks);
248 if (!sh) {
249 if (!conf->inactive_blocked)
250 sh = get_free_stripe(conf);
251 if (noblock && sh == NULL)
252 break;
253 if (!sh) {
254 conf->inactive_blocked = 1;
255 wait_event_lock_irq(conf->wait_for_stripe,
256 !list_empty(&conf->inactive_list) &&
257 (atomic_read(&conf->active_stripes)
258 < (conf->max_nr_stripes *3/4)
259 || !conf->inactive_blocked),
260 conf->device_lock,
261 unplug_slaves(conf->mddev)
262 );
263 conf->inactive_blocked = 0;
264 } else
265 init_stripe(sh, sector, pd_idx, disks);
266 } else {
267 if (atomic_read(&sh->count)) {
268 BUG_ON(!list_empty(&sh->lru));
269 } else {
270 if (!test_bit(STRIPE_HANDLE, &sh->state))
271 atomic_inc(&conf->active_stripes);
272 if (!list_empty(&sh->lru))
273 list_del_init(&sh->lru);
274 }
275 }
276 } while (sh == NULL);
277
278 if (sh)
279 atomic_inc(&sh->count);
280
281 spin_unlock_irq(&conf->device_lock);
282 return sh;
283 }
284
285 static int grow_one_stripe(raid5_conf_t *conf)
286 {
287 struct stripe_head *sh;
288 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
289 if (!sh)
290 return 0;
291 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
292 sh->raid_conf = conf;
293 spin_lock_init(&sh->lock);
294
295 if (grow_buffers(sh, conf->raid_disks)) {
296 shrink_buffers(sh, conf->raid_disks);
297 kmem_cache_free(conf->slab_cache, sh);
298 return 0;
299 }
300 sh->disks = conf->raid_disks;
301 /* we just created an active stripe so... */
302 atomic_set(&sh->count, 1);
303 atomic_inc(&conf->active_stripes);
304 INIT_LIST_HEAD(&sh->lru);
305 release_stripe(sh);
306 return 1;
307 }
308
309 static int grow_stripes(raid5_conf_t *conf, int num)
310 {
311 kmem_cache_t *sc;
312 int devs = conf->raid_disks;
313
314 sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
315 sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
316 conf->active_name = 0;
317 sc = kmem_cache_create(conf->cache_name[conf->active_name],
318 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
319 0, 0, NULL, NULL);
320 if (!sc)
321 return 1;
322 conf->slab_cache = sc;
323 conf->pool_size = devs;
324 while (num--) {
325 if (!grow_one_stripe(conf))
326 return 1;
327 }
328 return 0;
329 }
330
#ifdef CONFIG_MD_RAID5_RESHAPE
static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step 2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 *
	 * Returns 0 on success (or when no growth is needed) and -ENOMEM if
	 * any allocation failed.
	 */
	struct stripe_head *old_sh, *new_sh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	kmem_cache_t *new_cache;
	int remaining;
	int slot;
	int err = 0;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	/* Step 1 */
	new_cache = kmem_cache_create(conf->cache_name[1-conf->active_name],
			sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			0, 0, NULL, NULL);
	if (!new_cache)
		return -ENOMEM;

	for (remaining = conf->max_nr_stripes; remaining; remaining--) {
		new_sh = kmem_cache_alloc(new_cache, GFP_KERNEL);
		if (!new_sh)
			break;

		memset(new_sh, 0, sizeof(*new_sh) + (newsize-1)*sizeof(struct r5dev));

		new_sh->raid_conf = conf;
		spin_lock_init(&new_sh->lock);

		list_add(&new_sh->lru, &newstripes);
	}
	if (remaining) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			new_sh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&new_sh->lru);
			kmem_cache_free(new_cache, new_sh);
		}
		kmem_cache_destroy(new_cache);
		return -ENOMEM;
	}

	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(new_sh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock,
				    unplug_slaves(conf->mddev)
			);
		old_sh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);

		atomic_set(&new_sh->count, 1);
		/* Hand the existing pages over; extra slots start empty. */
		for (slot = 0; slot < conf->pool_size; slot++)
			new_sh->dev[slot].page = old_sh->dev[slot].page;
		for ( ; slot < newsize; slot++)
			new_sh->dev[slot].page = NULL;
		kmem_cache_free(conf->slab_cache, old_sh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks.
	 */
	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (!ndisks) {
		err = -ENOMEM;
	} else {
		for (slot = 0; slot < conf->raid_disks; slot++)
			ndisks[slot] = conf->disks[slot];
		kfree(conf->disks);
		conf->disks = ndisks;
	}

	/* Step 4, return new stripes to service */
	while (!list_empty(&newstripes)) {
		new_sh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&new_sh->lru);
		for (slot = conf->raid_disks; slot < newsize; slot++) {
			struct page *p;

			if (new_sh->dev[slot].page != NULL)
				continue;
			p = alloc_page(GFP_NOIO);
			new_sh->dev[slot].page = p;
			if (!p)
				err = -ENOMEM;
		}
		release_stripe(new_sh);
	}
	/* critical section passed, GFP_NOIO no longer needed */

	conf->slab_cache = new_cache;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return err;
}
#endif
453
454 static int drop_one_stripe(raid5_conf_t *conf)
455 {
456 struct stripe_head *sh;
457
458 spin_lock_irq(&conf->device_lock);
459 sh = get_free_stripe(conf);
460 spin_unlock_irq(&conf->device_lock);
461 if (!sh)
462 return 0;
463 BUG_ON(atomic_read(&sh->count));
464 shrink_buffers(sh, conf->pool_size);
465 kmem_cache_free(conf->slab_cache, sh);
466 atomic_dec(&conf->active_stripes);
467 return 1;
468 }
469
470 static void shrink_stripes(raid5_conf_t *conf)
471 {
472 while (drop_one_stripe(conf))
473 ;
474
475 if (conf->slab_cache)
476 kmem_cache_destroy(conf->slab_cache);
477 conf->slab_cache = NULL;
478 }
479
480 static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
481 int error)
482 {
483 struct stripe_head *sh = bi->bi_private;
484 raid5_conf_t *conf = sh->raid_conf;
485 int disks = sh->disks, i;
486 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
487
488 if (bi->bi_size)
489 return 1;
490
491 for (i=0 ; i<disks; i++)
492 if (bi == &sh->dev[i].req)
493 break;
494
495 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
496 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
497 uptodate);
498 if (i == disks) {
499 BUG();
500 return 0;
501 }
502
503 if (uptodate) {
504 #if 0
505 struct bio *bio;
506 unsigned long flags;
507 spin_lock_irqsave(&conf->device_lock, flags);
508 /* we can return a buffer if we bypassed the cache or
509 * if the top buffer is not in highmem. If there are
510 * multiple buffers, leave the extra work to
511 * handle_stripe
512 */
513 buffer = sh->bh_read[i];
514 if (buffer &&
515 (!PageHighMem(buffer->b_page)
516 || buffer->b_page == bh->b_page )
517 ) {
518 sh->bh_read[i] = buffer->b_reqnext;
519 buffer->b_reqnext = NULL;
520 } else
521 buffer = NULL;
522 spin_unlock_irqrestore(&conf->device_lock, flags);
523 if (sh->bh_page[i]==bh->b_page)
524 set_buffer_uptodate(bh);
525 if (buffer) {
526 if (buffer->b_page != bh->b_page)
527 memcpy(buffer->b_data, bh->b_data, bh->b_size);
528 buffer->b_end_io(buffer, 1);
529 }
530 #else
531 set_bit(R5_UPTODATE, &sh->dev[i].flags);
532 #endif
533 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
534 printk(KERN_INFO "raid5: read error corrected!!\n");
535 clear_bit(R5_ReadError, &sh->dev[i].flags);
536 clear_bit(R5_ReWrite, &sh->dev[i].flags);
537 }
538 if (atomic_read(&conf->disks[i].rdev->read_errors))
539 atomic_set(&conf->disks[i].rdev->read_errors, 0);
540 } else {
541 int retry = 0;
542 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
543 atomic_inc(&conf->disks[i].rdev->read_errors);
544 if (conf->mddev->degraded)
545 printk(KERN_WARNING "raid5: read error not correctable.\n");
546 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
547 /* Oh, no!!! */
548 printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
549 else if (atomic_read(&conf->disks[i].rdev->read_errors)
550 > conf->max_nr_stripes)
551 printk(KERN_WARNING
552 "raid5: Too many read errors, failing device.\n");
553 else
554 retry = 1;
555 if (retry)
556 set_bit(R5_ReadError, &sh->dev[i].flags);
557 else {
558 clear_bit(R5_ReadError, &sh->dev[i].flags);
559 clear_bit(R5_ReWrite, &sh->dev[i].flags);
560 md_error(conf->mddev, conf->disks[i].rdev);
561 }
562 }
563 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
564 #if 0
565 /* must restore b_page before unlocking buffer... */
566 if (sh->bh_page[i] != bh->b_page) {
567 bh->b_page = sh->bh_page[i];
568 bh->b_data = page_address(bh->b_page);
569 clear_buffer_uptodate(bh);
570 }
571 #endif
572 clear_bit(R5_LOCKED, &sh->dev[i].flags);
573 set_bit(STRIPE_HANDLE, &sh->state);
574 release_stripe(sh);
575 return 0;
576 }
577
578 static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
579 int error)
580 {
581 struct stripe_head *sh = bi->bi_private;
582 raid5_conf_t *conf = sh->raid_conf;
583 int disks = sh->disks, i;
584 unsigned long flags;
585 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
586
587 if (bi->bi_size)
588 return 1;
589
590 for (i=0 ; i<disks; i++)
591 if (bi == &sh->dev[i].req)
592 break;
593
594 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
595 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
596 uptodate);
597 if (i == disks) {
598 BUG();
599 return 0;
600 }
601
602 spin_lock_irqsave(&conf->device_lock, flags);
603 if (!uptodate)
604 md_error(conf->mddev, conf->disks[i].rdev);
605
606 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
607
608 clear_bit(R5_LOCKED, &sh->dev[i].flags);
609 set_bit(STRIPE_HANDLE, &sh->state);
610 __release_stripe(conf, sh);
611 spin_unlock_irqrestore(&conf->device_lock, flags);
612 return 0;
613 }
614
615
616 static sector_t compute_blocknr(struct stripe_head *sh, int i);
617
618 static void raid5_build_block (struct stripe_head *sh, int i)
619 {
620 struct r5dev *dev = &sh->dev[i];
621
622 bio_init(&dev->req);
623 dev->req.bi_io_vec = &dev->vec;
624 dev->req.bi_vcnt++;
625 dev->req.bi_max_vecs++;
626 dev->vec.bv_page = dev->page;
627 dev->vec.bv_len = STRIPE_SIZE;
628 dev->vec.bv_offset = 0;
629
630 dev->req.bi_sector = sh->sector;
631 dev->req.bi_private = sh;
632
633 dev->flags = 0;
634 if (i != sh->pd_idx)
635 dev->sector = compute_blocknr(sh, i);
636 }
637
638 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
639 {
640 char b[BDEVNAME_SIZE];
641 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
642 PRINTK("raid5: error called\n");
643
644 if (!test_bit(Faulty, &rdev->flags)) {
645 mddev->sb_dirty = 1;
646 if (test_bit(In_sync, &rdev->flags)) {
647 conf->working_disks--;
648 mddev->degraded++;
649 conf->failed_disks++;
650 clear_bit(In_sync, &rdev->flags);
651 /*
652 * if recovery was running, make sure it aborts.
653 */
654 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
655 }
656 set_bit(Faulty, &rdev->flags);
657 printk (KERN_ALERT
658 "raid5: Disk failure on %s, disabling device."
659 " Operation continuing on %d devices\n",
660 bdevname(rdev->bdev,b), conf->working_disks);
661 }
662 }
663
664 /*
665 * Input: a 'big' sector number,
666 * Output: index of the data and parity disk, and the sector # in them.
667 */
668 static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
669 unsigned int data_disks, unsigned int * dd_idx,
670 unsigned int * pd_idx, raid5_conf_t *conf)
671 {
672 long stripe;
673 unsigned long chunk_number;
674 unsigned int chunk_offset;
675 sector_t new_sector;
676 int sectors_per_chunk = conf->chunk_size >> 9;
677
678 /* First compute the information on this sector */
679
680 /*
681 * Compute the chunk number and the sector offset inside the chunk
682 */
683 chunk_offset = sector_div(r_sector, sectors_per_chunk);
684 chunk_number = r_sector;
685 BUG_ON(r_sector != chunk_number);
686
687 /*
688 * Compute the stripe number
689 */
690 stripe = chunk_number / data_disks;
691
692 /*
693 * Compute the data disk and parity disk indexes inside the stripe
694 */
695 *dd_idx = chunk_number % data_disks;
696
697 /*
698 * Select the parity disk based on the user selected algorithm.
699 */
700 if (conf->level == 4)
701 *pd_idx = data_disks;
702 else switch (conf->algorithm) {
703 case ALGORITHM_LEFT_ASYMMETRIC:
704 *pd_idx = data_disks - stripe % raid_disks;
705 if (*dd_idx >= *pd_idx)
706 (*dd_idx)++;
707 break;
708 case ALGORITHM_RIGHT_ASYMMETRIC:
709 *pd_idx = stripe % raid_disks;
710 if (*dd_idx >= *pd_idx)
711 (*dd_idx)++;
712 break;
713 case ALGORITHM_LEFT_SYMMETRIC:
714 *pd_idx = data_disks - stripe % raid_disks;
715 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
716 break;
717 case ALGORITHM_RIGHT_SYMMETRIC:
718 *pd_idx = stripe % raid_disks;
719 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
720 break;
721 default:
722 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
723 conf->algorithm);
724 }
725
726 /*
727 * Finally, compute the new sector number
728 */
729 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
730 return new_sector;
731 }
732
733
734 static sector_t compute_blocknr(struct stripe_head *sh, int i)
735 {
736 raid5_conf_t *conf = sh->raid_conf;
737 int raid_disks = sh->disks, data_disks = raid_disks - 1;
738 sector_t new_sector = sh->sector, check;
739 int sectors_per_chunk = conf->chunk_size >> 9;
740 sector_t stripe;
741 int chunk_offset;
742 int chunk_number, dummy1, dummy2, dd_idx = i;
743 sector_t r_sector;
744
745 chunk_offset = sector_div(new_sector, sectors_per_chunk);
746 stripe = new_sector;
747 BUG_ON(new_sector != stripe);
748
749
750 switch (conf->algorithm) {
751 case ALGORITHM_LEFT_ASYMMETRIC:
752 case ALGORITHM_RIGHT_ASYMMETRIC:
753 if (i > sh->pd_idx)
754 i--;
755 break;
756 case ALGORITHM_LEFT_SYMMETRIC:
757 case ALGORITHM_RIGHT_SYMMETRIC:
758 if (i < sh->pd_idx)
759 i += raid_disks;
760 i -= (sh->pd_idx + 1);
761 break;
762 default:
763 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
764 conf->algorithm);
765 }
766
767 chunk_number = stripe * data_disks + i;
768 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
769
770 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
771 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
772 printk(KERN_ERR "compute_blocknr: map not correct\n");
773 return 0;
774 }
775 return r_sector;
776 }
777
778
779
780 /*
781 * Copy data between a page in the stripe cache, and a bio.
782 * There are no alignment or size guarantees between the page or the
783 * bio except that there is some overlap.
784 * All iovecs in the bio must be considered.
785 */
786 static void copy_data(int frombio, struct bio *bio,
787 struct page *page,
788 sector_t sector)
789 {
790 char *pa = page_address(page);
791 struct bio_vec *bvl;
792 int i;
793 int page_offset;
794
795 if (bio->bi_sector >= sector)
796 page_offset = (signed)(bio->bi_sector - sector) * 512;
797 else
798 page_offset = (signed)(sector - bio->bi_sector) * -512;
799 bio_for_each_segment(bvl, bio, i) {
800 int len = bio_iovec_idx(bio,i)->bv_len;
801 int clen;
802 int b_offset = 0;
803
804 if (page_offset < 0) {
805 b_offset = -page_offset;
806 page_offset += b_offset;
807 len -= b_offset;
808 }
809
810 if (len > 0 && page_offset + len > STRIPE_SIZE)
811 clen = STRIPE_SIZE - page_offset;
812 else clen = len;
813
814 if (clen > 0) {
815 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
816 if (frombio)
817 memcpy(pa+page_offset, ba+b_offset, clen);
818 else
819 memcpy(ba+b_offset, pa+page_offset, clen);
820 __bio_kunmap_atomic(ba, KM_USER0);
821 }
822 if (clen < len) /* hit end of page */
823 break;
824 page_offset += len;
825 }
826 }
827
/*
 * Helper for the XOR loops below: once 'count' operands have filled ptr[],
 * XOR them together and start over with count = 1.
 * Uses the caller's local variables 'count' and 'ptr'.
 */
#define check_xor()						\
	do {							\
		if (count == MAX_XOR_BLOCKS) {			\
			xor_block(count, STRIPE_SIZE, ptr);	\
			count = 1;				\
		}						\
	} while (0)
834
835
836 static void compute_block(struct stripe_head *sh, int dd_idx)
837 {
838 int i, count, disks = sh->disks;
839 void *ptr[MAX_XOR_BLOCKS], *p;
840
841 PRINTK("compute_block, stripe %llu, idx %d\n",
842 (unsigned long long)sh->sector, dd_idx);
843
844 ptr[0] = page_address(sh->dev[dd_idx].page);
845 memset(ptr[0], 0, STRIPE_SIZE);
846 count = 1;
847 for (i = disks ; i--; ) {
848 if (i == dd_idx)
849 continue;
850 p = page_address(sh->dev[i].page);
851 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
852 ptr[count++] = p;
853 else
854 printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
855 " not present\n", dd_idx,
856 (unsigned long long)sh->sector, i);
857
858 check_xor();
859 }
860 if (count != 1)
861 xor_block(count, STRIPE_SIZE, ptr);
862 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
863 }
864
865 static void compute_parity(struct stripe_head *sh, int method)
866 {
867 raid5_conf_t *conf = sh->raid_conf;
868 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
869 void *ptr[MAX_XOR_BLOCKS];
870 struct bio *chosen;
871
872 PRINTK("compute_parity, stripe %llu, method %d\n",
873 (unsigned long long)sh->sector, method);
874
875 count = 1;
876 ptr[0] = page_address(sh->dev[pd_idx].page);
877 switch(method) {
878 case READ_MODIFY_WRITE:
879 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
880 for (i=disks ; i-- ;) {
881 if (i==pd_idx)
882 continue;
883 if (sh->dev[i].towrite &&
884 test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
885 ptr[count++] = page_address(sh->dev[i].page);
886 chosen = sh->dev[i].towrite;
887 sh->dev[i].towrite = NULL;
888
889 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
890 wake_up(&conf->wait_for_overlap);
891
892 BUG_ON(sh->dev[i].written);
893 sh->dev[i].written = chosen;
894 check_xor();
895 }
896 }
897 break;
898 case RECONSTRUCT_WRITE:
899 memset(ptr[0], 0, STRIPE_SIZE);
900 for (i= disks; i-- ;)
901 if (i!=pd_idx && sh->dev[i].towrite) {
902 chosen = sh->dev[i].towrite;
903 sh->dev[i].towrite = NULL;
904
905 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
906 wake_up(&conf->wait_for_overlap);
907
908 BUG_ON(sh->dev[i].written);
909 sh->dev[i].written = chosen;
910 }
911 break;
912 case CHECK_PARITY:
913 break;
914 }
915 if (count>1) {
916 xor_block(count, STRIPE_SIZE, ptr);
917 count = 1;
918 }
919
920 for (i = disks; i--;)
921 if (sh->dev[i].written) {
922 sector_t sector = sh->dev[i].sector;
923 struct bio *wbi = sh->dev[i].written;
924 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
925 copy_data(1, wbi, sh->dev[i].page, sector);
926 wbi = r5_next_bio(wbi, sector);
927 }
928
929 set_bit(R5_LOCKED, &sh->dev[