~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux-2.6.17/drivers/md/raid1.c

Version: ~ [ 2.6.16 ] ~ [ 2.6.17 ] ~
Architecture: ~ [ ia64 ] ~ [ i386 ] ~ [ arm ] ~ [ ppc ] ~ [ sparc64 ] ~

  1 /*
  2  * raid1.c : Multiple Devices driver for Linux
  3  *
  4  * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
  5  *
  6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  7  *
  8  * RAID-1 management functions.
  9  *
 10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 11  *
 12  * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
 13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 14  *
 15  * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
 16  * bitmapped intelligence in resync:
 17  *
 18  *      - bitmap marked during normal i/o
 19  *      - bitmap used to skip nondirty blocks during sync
 20  *
 21  * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
 22  * - persistent bitmap code
 23  *
 24  * This program is free software; you can redistribute it and/or modify
 25  * it under the terms of the GNU General Public License as published by
 26  * the Free Software Foundation; either version 2, or (at your option)
 27  * any later version.
 28  *
 29  * You should have received a copy of the GNU General Public License
 30  * (for example /usr/src/linux/COPYING); if not, write to the Free
 31  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 32  */
 33 
 34 #include "dm-bio-list.h"
 35 #include <linux/raid/raid1.h>
 36 #include <linux/raid/bitmap.h>
 37 
 38 #define DEBUG 0        /* set non-zero to route PRINTK() to printk() */
 39 #if DEBUG
 40 #define PRINTK(x...) printk(x)
 41 #else
 42 #define PRINTK(x...)   /* expands to nothing: debug output is compiled out */
 43 #endif
 44 
 45 /*
 46  * Number of guaranteed r1bios in case of extreme VM load:
 47  */
 48 #define NR_RAID1_BIOS 256
 49 
 50 
    /*
     * Forward declarations: these three helpers are called by functions
     * that appear earlier in the file than their definitions.
     */
 51 static void unplug_slaves(mddev_t *mddev);
 52 
 53 static void allow_barrier(conf_t *conf);
 54 static void lower_barrier(conf_t *conf);
 55 
 56 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 57 {
 58         struct pool_info *pi = data;
 59         r1bio_t *r1_bio;
 60         int size = offsetof(r1bio_t, bios[pi->raid_disks]);
 61 
 62         /* allocate a r1bio with room for raid_disks entries in the bios array */
 63         r1_bio = kzalloc(size, gfp_flags);
 64         if (!r1_bio)
 65                 unplug_slaves(pi->mddev);
 66 
 67         return r1_bio;
 68 }
 69 
 70 static void r1bio_pool_free(void *r1_bio, void *data)
 71 {
 72         kfree(r1_bio);
 73 }
 74 
 75 #define RESYNC_BLOCK_SIZE (64*1024)   /* bytes covered by one resync buffer */
 76 //#define RESYNC_BLOCK_SIZE PAGE_SIZE
 77 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)   /* the same size in 512-byte sectors */
 78 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)   /* rounded up to whole pages */
 79 #define RESYNC_WINDOW (2048*1024)   /* NOTE(review): not used in the visible part of this file; units unconfirmed */
 80 
 81 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 82 {
 83         struct pool_info *pi = data;
 84         struct page *page;
 85         r1bio_t *r1_bio;
 86         struct bio *bio;
 87         int i, j;
 88 
 89         r1_bio = r1bio_pool_alloc(gfp_flags, pi);
 90         if (!r1_bio) {
 91                 unplug_slaves(pi->mddev);
 92                 return NULL;
 93         }
 94 
 95         /*
 96          * Allocate bios : 1 for reading, n-1 for writing
 97          */
 98         for (j = pi->raid_disks ; j-- ; ) {
 99                 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
100                 if (!bio)
101                         goto out_free_bio;
102                 r1_bio->bios[j] = bio;
103         }
104         /*
105          * Allocate RESYNC_PAGES data pages and attach them to
106          * the first bio.
107          * If this is a user-requested check/repair, allocate
108          * RESYNC_PAGES for each bio.
109          */
110         if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
111                 j = pi->raid_disks;
112         else
113                 j = 1;
114         while(j--) {
115                 bio = r1_bio->bios[j];
116                 for (i = 0; i < RESYNC_PAGES; i++) {
117                         page = alloc_page(gfp_flags);
118                         if (unlikely(!page))
119                                 goto out_free_pages;
120 
121                         bio->bi_io_vec[i].bv_page = page;
122                 }
123         }
124         /* If not user-requests, copy the page pointers to all bios */
125         if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
126                 for (i=0; i<RESYNC_PAGES ; i++)
127                         for (j=1; j<pi->raid_disks; j++)
128                                 r1_bio->bios[j]->bi_io_vec[i].bv_page =
129                                         r1_bio->bios[0]->bi_io_vec[i].bv_page;
130         }
131 
132         r1_bio->master_bio = NULL;
133 
134         return r1_bio;
135 
136 out_free_pages:
137         for (i=0; i < RESYNC_PAGES ; i++)
138                 for (j=0 ; j < pi->raid_disks; j++)
139                         safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
140         j = -1;
141 out_free_bio:
142         while ( ++j < pi->raid_disks )
143                 bio_put(r1_bio->bios[j]);
144         r1bio_pool_free(r1_bio, data);
145         return NULL;
146 }
147 
148 static void r1buf_pool_free(void *__r1_bio, void *data)
149 {
150         struct pool_info *pi = data;
151         int i,j;
152         r1bio_t *r1bio = __r1_bio;
153 
154         for (i = 0; i < RESYNC_PAGES; i++)
155                 for (j = pi->raid_disks; j-- ;) {
156                         if (j == 0 ||
157                             r1bio->bios[j]->bi_io_vec[i].bv_page !=
158                             r1bio->bios[0]->bi_io_vec[i].bv_page)
159                                 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
160                 }
161         for (i=0 ; i < pi->raid_disks; i++)
162                 bio_put(r1bio->bios[i]);
163 
164         r1bio_pool_free(r1bio, data);
165 }
166 
167 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
168 {
169         int i;
170 
171         for (i = 0; i < conf->raid_disks; i++) {
172                 struct bio **bio = r1_bio->bios + i;
173                 if (*bio && *bio != IO_BLOCKED)
174                         bio_put(*bio);
175                 *bio = NULL;
176         }
177 }
178 
179 static void free_r1bio(r1bio_t *r1_bio)
180 {
181         conf_t *conf = mddev_to_conf(r1_bio->mddev);
182 
183         /*
184          * Wake up any possible resync thread that waits for the device
185          * to go idle.
186          */
187         allow_barrier(conf);
188 
189         put_all_bios(conf, r1_bio);
190         mempool_free(r1_bio, conf->r1bio_pool);
191 }
192 
193 static void put_buf(r1bio_t *r1_bio)
194 {
195         conf_t *conf = mddev_to_conf(r1_bio->mddev);
196         int i;
197 
198         for (i=0; i<conf->raid_disks; i++) {
199                 struct bio *bio = r1_bio->bios[i];
200                 if (bio->bi_end_io)
201                         rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
202         }
203 
204         mempool_free(r1_bio, conf->r1buf_pool);
205 
206         lower_barrier(conf);
207 }
208 
209 static void reschedule_retry(r1bio_t *r1_bio)
210 {
211         unsigned long flags;
212         mddev_t *mddev = r1_bio->mddev;
213         conf_t *conf = mddev_to_conf(mddev);
214 
215         spin_lock_irqsave(&conf->device_lock, flags);
216         list_add(&r1_bio->retry_list, &conf->retry_list);
217         conf->nr_queued ++;
218         spin_unlock_irqrestore(&conf->device_lock, flags);
219 
220         wake_up(&conf->wait_barrier);
221         md_wakeup_thread(mddev->thread);
222 }
223 
224 /*
225  * raid_end_bio_io() is called when we have finished servicing a mirrored
226  * operation and are ready to return a success/failure code to the buffer
227  * cache layer.
228  */
229 static void raid_end_bio_io(r1bio_t *r1_bio)
230 {
231         struct bio *bio = r1_bio->master_bio;
232 
233         /* if nobody has done the final endio yet, do it now */
234         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
235                 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
236                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
237                         (unsigned long long) bio->bi_sector,
238                         (unsigned long long) bio->bi_sector +
239                                 (bio->bi_size >> 9) - 1);
240 
241                 bio_endio(bio, bio->bi_size,
242                         test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
243         }
244         free_r1bio(r1_bio);
245 }
246 
247 /*
248  * Update disk head position estimator based on IRQ completion info.
249  */
250 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
251 {
252         conf_t *conf = mddev_to_conf(r1_bio->mddev);
253 
254         conf->mirrors[disk].head_position =
255                 r1_bio->sector + (r1_bio->sectors);
256 }
257 
258 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
259 {
260         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
261         r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
262         int mirror;
263         conf_t *conf = mddev_to_conf(r1_bio->mddev);
264 
265         if (bio->bi_size)
266                 return 1;
267         
268         mirror = r1_bio->read_disk;
269         /*
270          * this branch is our 'one mirror IO has finished' event handler:
271          */
272         update_head_pos(mirror, r1_bio);
273 
274         if (uptodate || conf->working_disks <= 1) {
275                 /*
276                  * Set R1BIO_Uptodate in our master bio, so that
277                  * we will return a good error code for to the higher
278                  * levels even if IO on some other mirrored buffer fails.
279                  *
280                  * The 'master' represents the composite IO operation to
281                  * user-side. So if something waits for IO, then it will
282                  * wait for the 'master' bio.
283                  */
284                 if (uptodate)
285                         set_bit(R1BIO_Uptodate, &r1_bio->state);
286 
287                 raid_end_bio_io(r1_bio);
288         } else {
289                 /*
290                  * oops, read error:
291                  */
292                 char b[BDEVNAME_SIZE];
293                 if (printk_ratelimit())
294                         printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
295                                bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
296                 reschedule_retry(r1_bio);
297         }
298 
299         rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
300         return 0;
301 }
302 
303 static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
304 {
            /*
             * Completion handler for one per-mirror write bio.
             *
             * Returns 1 without doing anything while bio->bi_size is still
             * non-zero, and 0 once this completion has been processed.
             * When the last outstanding mirror write completes (the
             * 'remaining' counter reaches zero) the request is either
             * queued for a barrier retry or finished: behind-I/O page
             * copies are released, the bitmap is updated and the master
             * bio is ended.
             */
305         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
306         r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
307         int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
308         conf_t *conf = mddev_to_conf(r1_bio->mddev);
            /* bio to release at 'out'; stays NULL on the barrier-retry path */
309         struct bio *to_put = NULL;
310 
311         if (bio->bi_size)
312                 return 1;
313 
            /* find which mirror slot this bio was issued for */
314         for (mirror = 0; mirror < conf->raid_disks; mirror++)
315                 if (r1_bio->bios[mirror] == bio)
316                         break;
317 
318         if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
                    /* the device rejected a barrier write: remember that and flag a retry */
319                 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
320                 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
321                 r1_bio->mddev->barriers_work = 0;
322                 /* Don't rdev_dec_pending in this branch - keep it for the retry */
323         } else {
324                 /*
325                  * this branch is our 'one mirror IO has finished' event handler:
326                  */
327                 r1_bio->bios[mirror] = NULL;
                    /*
                     * The bio_put() is deferred to 'out' because
                     * bio->bi_io_vec is still read further down when the
                     * behind-I/O pages are freed.
                     */
328                 to_put = bio;
329                 if (!uptodate) {
330                         md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
331                         /* an I/O failed, we can't clear the bitmap */
332                         set_bit(R1BIO_Degraded, &r1_bio->state);
333                 } else
334                         /*
335                          * Set R1BIO_Uptodate in our master bio, so that
336                          * we will return a good error code for to the higher
337                          * levels even if IO on some other mirrored buffer fails.
338                          *
339                          * The 'master' represents the composite IO operation to
340                          * user-side. So if something waits for IO, then it will
341                          * wait for the 'master' bio.
342                          */
343                         set_bit(R1BIO_Uptodate, &r1_bio->state);
344 
345                 update_head_pos(mirror, r1_bio);
346 
347                 if (behind) {
348                         if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
349                                 atomic_dec(&r1_bio->behind_remaining);
350 
351                         /* In behind mode, we ACK the master bio once the I/O has safely
352                          * reached all non-writemostly disks. Setting the Returned bit
353                          * ensures that this gets done only once -- we don't ever want to
354                          * return -EIO here, instead we'll wait */
355 
                            /* 'remaining' still counts this bio (decremented below), hence the -1 */
356                         if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
357                             test_bit(R1BIO_Uptodate, &r1_bio->state)) {
358                                 /* Maybe we can return now */
359                                 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
360                                         struct bio *mbio = r1_bio->master_bio;
361                                         PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
362                                                (unsigned long long) mbio->bi_sector,
363                                                (unsigned long long) mbio->bi_sector +
364                                                (mbio->bi_size >> 9) - 1);
365                                         bio_endio(mbio, mbio->bi_size, 0);
366                                 }
367                         }
368                 }
369                 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
370         }
371         /*
372          *
373          * Let's see if all mirrored write operations have finished
374          * already.
375          */
376         if (atomic_dec_and_test(&r1_bio->remaining)) {
377                 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
                            /* hand the request to the md thread via the retry list */
378                         reschedule_retry(r1_bio);
379                         goto out;
380                 }
381                 /* it really is the end of this request */
382                 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
383                         /* free extra copy of the data pages */
384                         int i = bio->bi_vcnt;
385                         while (i--)
386                                 safe_put_page(bio->bi_io_vec[i].bv_page);
387                 }
388                 /* clear the bitmap if all writes complete successfully */
389                 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
390                                 r1_bio->sectors,
391                                 !test_bit(R1BIO_Degraded, &r1_bio->state),
392                                 behind);
393                 md_write_end(r1_bio->mddev);
394                 raid_end_bio_io(r1_bio);
395         }
396  out:
397         if (to_put)
398                 bio_put(to_put);
399 
400         return 0;
401 }
402 
403 
404 /*
405  * This routine returns the disk from which the requested read should
406  * be done. There is a per-array 'next expected sequential IO' sector
407  * number - if this matches on the next IO then we use the last disk.
408  * There is also a per-disk 'last know head position' sector that is
409  * maintained from IRQ contexts, both the normal and the resync IO
410  * completion handlers update this position correctly. If there is no
411  * perfect sequential match then we pick the disk whose head is closest.
412  *
413  * If there are 2 mirrors in the same 2 devices, performance degrades
414  * because position is mirror, not device based.
415  *
416  * The rdev for the device selected will have nr_pending incremented.
417  */
418 static int read_balance(conf_t *conf, r1bio_t *r1_bio)
419 {
420         const unsigned long this_sector = r1_bio->sector;
421         int new_disk = conf->last_used, disk = new_disk;
422         int wonly_disk = -1;
423         const int sectors = r1_bio->sectors;
424         sector_t new_distance, current_distance;
425         mdk_rdev_t *rdev;
426 
427         rcu_read_lock();
428         /*
429          * Check if we can balance. We can balance on the whole
430          * device if no resync is going on, or below the resync window.
431          * We take the first readable disk when above the resync window.
432          */
433  retry:
434         if (conf->mddev->recovery_cp < MaxSector &&
435             (this_sector + sectors >= conf->next_resync)) {
436                 /* Choose the first operation device, for consistancy */
437                 new_disk = 0;
438 
439                 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
440                      r1_bio->bios[new_disk] == IO_BLOCKED ||
441                      !rdev || !test_bit(In_sync, &rdev->flags)
442                              || test_bit(WriteMostly, &rdev->flags);
443                      rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
444 
445                         if (rdev && test_bit(In_sync, &rdev->flags) &&
446                                 r1_bio->bios[new_disk] != IO_BLOCKED)
447                                 wonly_disk = new_disk;
448 
449                         if (new_disk == conf->raid_disks - 1) {
450                                 new_disk = wonly_disk;
451                                 break;
452                         }
453                 }
454                 goto rb_out;
455         }
456 
457 
458         /* make sure the disk is operational */
459         for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
460              r1_bio->bios[new_disk] == IO_BLOCKED ||
461              !rdev || !test_bit(In_sync, &rdev->flags) ||
462                      test_bit(WriteMostly, &rdev->flags);
463              rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
464 
465                 if (rdev && test_bit(In_sync, &rdev->flags) &&
466                     r1_bio->bios[new_disk] != IO_BLOCKED)
467                         wonly_disk = new_disk;
468 
469                 if (new_disk <= 0)
470                         new_disk = conf->raid_disks;
471                 new_disk--;
472                 if (new_disk == disk) {
473                         new_disk = wonly_disk;
474                         break;
475                 }
476         }
477 
478         if (new_disk < 0)
479                 goto rb_out;
480 
481         disk = new_disk;
482         /* now disk == new_disk == starting point for search */
483 
484         /*
485          * Don't change to another disk for sequential reads:
486          */
487         if (conf->next_seq_sect == this_sector)
488                 goto rb_out;
489         if (this_sector == conf->mirrors[new_disk].head_position)
490                 goto rb_out;
491 
492         current_distance = abs(this_sector - conf->mirrors[disk].head_position);
493 
494         /* Find the disk whose head is closest */
495 
496         do {
497                 if (disk <= 0)
498                         disk = conf->raid_disks;
499                 disk--;
500 
501                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
502 
503                 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
504                     !test_bit(In_sync, &rdev->flags) ||
505                     test_bit(WriteMostly, &rdev->flags))
506                         continue;
507 
508                 if (!atomic_read(&rdev->nr_pending)) {
509                         new_disk = disk;
510                         break;
511                 }
512                 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
513                 if (new_distance < current_distance) {
514                         current_distance = new_distance;
515                         new_disk = disk;
516                 }
517         } while (disk != conf->last_used);
518 
519  rb_out:
520 
521 
522         if (new_disk >= 0) {
523                 rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
524                 if (!rdev)
525                         goto retry;
526                 atomic_inc(&rdev->nr_pending);
527                 if (!test_bit(In_sync, &rdev->flags)) {
528                         /* cannot risk returning a device that failed
529                          * before we inc'ed nr_pending
530                          */
531                         rdev_dec_pending(rdev, conf->mddev);
532                         goto retry;
533                 }
534                 conf->next_seq_sect = this_sector + sectors;
535                 conf->last_used = new_disk;
536         }
537         rcu_read_unlock();
538 
539         return new_disk;
540 }
541 
542 static void unplug_slaves(mddev_t *mddev)
543 {
544         conf_t *conf = mddev_to_conf(mddev);
545         int i;
546 
547         rcu_read_lock();
548         for (i=0; i<mddev->raid_disks; i++) {
549                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
550                 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
551                         request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
552 
553                         atomic_inc(&rdev->nr_pending);
554                         rcu_read_unlock();
555 
556                         if (r_queue->unplug_fn)
557                                 r_queue->unplug_fn(r_queue);
558 
559                         rdev_dec_pending(rdev, mddev);
560                         rcu_read_lock();
561                 }
562         }
563         rcu_read_unlock();
564 }
565 
566 static void raid1_unplug(request_queue_t *q)
567 {
568         mddev_t *mddev = q->queuedata;
569 
570         unplug_slaves(mddev);
571         md_wakeup_thread(mddev->thread);
572 }
573 
574 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
575                              sector_t *error_sector)
576 {
577         mddev_t *mddev = q->queuedata;
578         conf_t *conf = mddev_to_conf(mddev);
579         int i, ret = 0;
580 
581         rcu_read_lock();
582         for (i=0; i<mddev->raid_disks && ret == 0; i++) {
583                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
584                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
585                         struct block_device *bdev = rdev->bdev;
586                         request_queue_t *r_queue = bdev_get_queue(bdev);
587 
588                         if (!r_queue->issue_flush_fn)
589                                 ret = -EOPNOTSUPP;
590                         else {
591                                 atomic_inc(&rdev->nr_pending);
592                                 rcu_read_unlock();
593                                 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
594                                                               error_sector);
595                                 rdev_dec_pending(rdev, mddev);
596                                 rcu_read_lock();
597                         }
598                 }
599         }
600         rcu_read_unlock();
601         return ret;
602 }
603 
604 /* Barriers....
605  * Sometimes we need to suspend IO while we do something else,
606  * either some resync/recovery, or reconfigure the array.
607  * To do this we raise a 'barrier'.
608  * The 'barrier' is a counter that can be raised multiple times
609  * to count how many activities are happening which preclude
610  * normal IO.
611  * We can only raise the barrier if there is no pending IO.
612  * i.e. if nr_pending == 0.
613  * We choose only to raise the barrier if no-one is waiting for the
614  * barrier to go down.  This means that as soon as an IO request
615  * is ready, no other operations which require a barrier will start
616  * until the IO request has had a chance.
617  *
618  * So: regular IO calls 'wait_barrier'.  When that returns there
619  *    is no background IO happening.  It must arrange to call
620  *    allow_barrier when it has finished its IO.
621  * Background IO must call raise_barrier.  Once that returns
622  *    there is no normal IO happening.  It must arrange to call
623  *    lower_barrier when the particular background IO completes.
624  */
625 #define RESYNC_DEPTH 32
626 
627 static void raise_barrier(conf_t *conf)
628 {
629         spin_lock_irq(&conf->resync_lock);
630 
631         /* Wait until no block IO is waiting */
632         wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
633                             conf->resync_lock,
634                             raid1_unplug(conf->mddev->queue));
635 
636         /* block any new IO from starting */
637         conf->barrier++;
638 
639         /* No wait for all pending IO to complete */
640         wait_event_lock_irq(conf->wait_barrier,
641                             !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
642                             conf->resync_lock,
643                             raid1_unplug(conf->mddev->queue));
644 
645         spin_unlock_irq(&conf->resync_lock);
646 }
647 
648 static void lower_barrier(conf_t *conf)
649 {
650         unsigned long flags;
651         spin_lock_irqsave(&conf->resync_lock, flags);
652         conf->barrier--;
653         spin_unlock_irqrestore(&conf->resync_lock, flags);
654         wake_up(&conf->wait_barrier);
655 }
656 
657 static void wait_barrier(conf_t *conf)
658 {
659         spin_lock_irq(&conf->resync_lock);
660         if (conf->barrier) {
661                 conf->nr_waiting++;
662                 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
663                                     conf->resync_lock,
664                                     raid1_unplug(conf->mddev->queue));
665                 conf->nr_waiting--;
666         }
667         conf->nr_pending++;
668         spin_unlock_irq(&conf->resync_lock);
669 }
670 
671 static void allow_barrier(conf_t *conf)
672 {
673         unsigned long flags;
674         spin_lock_irqsave(&conf->resync_lock, flags);
675         conf->nr_pending--;
676         spin_unlock_irqrestore(&conf->resync_lock, flags);
677         wake_up(&conf->wait_barrier);
678 }
679 
680 static void freeze_array(conf_t *conf)
681 {
682         /* stop syncio and normal IO and wait for everything to
683          * go quite.
684          * We increment barrier and nr_waiting, and then
685          * wait until barrier+nr_pending match nr_queued+2
686          */
687         spin_lock_irq(&conf->resync_lock);
688         conf->barrier++;
689         conf->nr_waiting++;
690         wait_event_lock_irq(conf->wait_barrier,
691                             conf->barrier+conf->nr_pending == conf->nr_queued+2,
692                             conf->resync_lock,
693                             raid1_unplug(conf->mddev->queue));
694         spin_unlock_irq(&conf->resync_lock);
695 }
696 static void unfreeze_array(conf_t *conf)
697 {
698         /* reverse the effect of the freeze */
699         spin_lock_irq(&conf->resync_lock);
700         conf->barrier--;
701         conf->nr_waiting--;
702         wake_up(&conf->wait_barrier);
703         spin_unlock_irq(&conf->resync_lock);
704 }
705 
706 
707 /* duplicate the data pages for behind I/O */
708 static struct page **alloc_behind_pages(struct bio *bio)
709 {
710         int i;
711         struct bio_vec *bvec;
712         struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
713                                         GFP_NOIO);
714         if (unlikely(!pages))
715                 goto do_sync_io;
716 
717         bio_for_each_segment(bvec, bio, i) {
718                 pages[i] = alloc_page(GFP_NOIO);
719                 if (unlikely(!pages[i]))
720                         goto do_sync_io;
721                 memcpy(kmap(pages[i]) + bvec->bv_offset,
722                         kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
723                 kunmap(pages[i]);
724                 kunmap(bvec->bv_page);
725         }
726 
727         return pages;
728 
729 do_sync_io:
730         if (pages)
731                 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
732                         put_page(pages[i]);
733         kfree(pages);
734         PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
735         return NULL;
736 }
737 
738 static int make_request(request_queue_t *q, struct bio * bio)
739 {
740         mddev_t *mddev = q->queuedata;
741         conf_t *conf = mddev_to_conf(mddev);
742         mirror_info_t *mirror;
743         r1bio_t *r1_bio;
744         struct bio *read_bio;
745         int i, targets = 0, disks;
746         mdk_rdev_t *rdev;
747         struct bitmap *bitmap = mddev->bitmap;
748         unsigned long flags;
749         struct bio_list bl;
750         struct page **behind_pages = NULL;
751         const int rw = bio_data_dir(bio);
752         int do_barriers;
753 
754         /*
755          * Register the new request and wait if the reconstruction
756          * thread has put up a bar for new requests.
757          * Continue immediately if no resync is active currently.
758          * We test barriers_work *after* md_write_start as md_write_start
759          * may cause the first superblock write, and that will check out
760          * if barriers work.
761          */
762 
763         md_write_start(mddev, bio); /* wait on superblock update early */
764 
765         if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
766                 if (rw == WRITE)
767                         md_write_end(mddev);
768                 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
769                 return 0;
770         }
771 
772         wait_barrier(conf);
773 
774         disk_stat_inc(mddev->gendisk, ios[rw]);
775         disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
776 
777         /*
778          * make_request() can abort the operation when READA is being
779          * used and no empty request is available.
780          *
781          */
782         r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
783 
784         r1_bio->master_bio = bio;
785         r1_bio->sectors = bio->bi_size >> 9;
786         r1_bio->state = 0;
787         r1_bio->mddev = mddev;
788         r1_bio->sector = bio->bi_sector;
789 
790         if (rw == READ) {
791                 /*
792                  * read balancing logic:
793                  */
794                 int rdisk = read_balance(conf, r1_bio);
795 
796                 if (rdisk < 0) {
797                         /* couldn't find anywhere to read from */
798                         raid_end_bio_io(r1_bio);
799                         return 0;
800                 }
801                 mirror = conf->mirrors + rdisk;
802 
803                 r1_bio->read_disk = rdisk;
804 
805                 read_bio = bio_clone(bio, GFP_NOIO);
806 
807                 r1_bio->bios[rdisk] = read_bio;
808 
809                 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
810                 read_bio->bi_bdev = mirror->rdev->bdev;
811                 read_bio->bi_end_io = raid1_end_read_request;
812                 read_bio->bi_rw = READ;
813                 read_bio->bi_private = r1_bio;
814 
815                 generic_make_request(read_bio);
816                 return 0;
817         }
818 
819         /*
820          * WRITE:
821          */
822         /* first select target devices under spinlock and
823          * inc refcount on their rdev.  Record them by setting
824          * bios[x] to bio
825          */
826         disks = conf->raid_disks;
827 #if 0
828         { static int first=1;
829         if (first) printk("First Write sector %llu disks %d\n",
830                           (unsigned long long)r1_bio->sector, disks);
831         first = 0;
832         }
833 #endif
834         rcu_read_lock();
835         for (i = 0;  i < disks; i++) {
836                 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
837                     !test_bit(Faulty, &rdev->flags)) {
838                         atomic_inc(&rdev->nr_pending);
839                         if (test_bit(Faulty, &rdev->flags)) {
840                                 rdev_dec_pending(rdev, mddev);
841                                 r1_bio->bios[i] = NULL;
842                         } else
843                                 r1_bio->bios[i] = bio;
844                         targets++;
845                 } else
846                         r1_bio->bios[i] = NULL;
847         }
848         rcu_read_unlock();
849 
850         BUG_ON(targets == 0); /* we never fail the last device */
851 
852         if (targets < conf->raid_disks) {
853                 /* array is degraded, we will not clear the bitmap
854                  * on I/O completion (see raid1_end_write_request) */
855                 set_bit(R1BIO_Degraded, &r1_bio->state);
856         }
857 
858         /* do behind I/O ? */
859         if (bitmap &&
860             atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
861             (behind_pages = alloc_behind_pages(bio)) != NULL)
862                 set_bit(R1BIO_BehindIO, &r1_bio->state);
863 
864         atomic_set(&r1_bio->remaining, 0);
865         atomic_set(&r1_bio->behind_remaining, 0);
866 
867         do_barriers = bio_barrier(bio);
868         if (do_barriers)
869                 set_bit(R1BIO_Barrier, &r1_bio->state);
870 
871         bio_list_init(&bl);
872         for (i = 0; i < disks; i++) {
873                 struct bio *mbio;
874                 if (!r1_bio->bios[i])
875                         continue;
876 
877                 mbio = bio_clone(bio, GFP_NOIO);
878                 r1_bio->bios[i] = mbio;
879 
880                 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
881                 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
882                 mbio->bi_end_io = raid1_end_write_request;
883                 mbio->bi_rw = WRITE | do_barriers;
884                 mbio->bi_private = r1_bio;
885 
886                 if (behind_pages) {
887                         struct bio_vec *bvec;
888                         int j;
889 
890                         /* Yes, I really want the '__' version so that
891                          * we clear any unused pointer in the io_vec, rather
892                          * than leave them unchanged.  This is important
893                          * because when we come to free the pages, we won't
894                          * know the originial bi_idx, so we just free
895                          * them all
896                          */
897                         __bio_for_each_segment(bvec, mbio, j, 0)
898                                 bvec->bv_page = behind_pages[j];
899                         if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
900                                 atomic_inc(&r1_bio->behind_remaining);
901                 }
902 
903                 atomic_inc(&r1_bio->remaining);
904 
905                 bio_list_add(&bl, mbio);
906         }
907         kfree(behind_pages); /* the behind pages are attached to the bios now */
908 
909         bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
910                                 test_bit(R1BIO_BehindIO, &r1_bio->state));
911         spin_lock_irqsave(&conf->device_lock, flags);
912         bio_list_merge(&conf->pending_bio_list, &bl);
913         bio_list_init(&bl);
914 
915         blk_plug_device(mddev->queue);
916         spin_unlock_irqrestore(&conf->device_lock, flags);
917 
918 #if 0
919         while ((bio = bio_list_pop(&bl)) != NULL)
920                 generic_make_request(bio);
921 #endif
922 
923         return 0;
924 }
925 
926 static void status(struct seq_file *seq, mddev_t *mddev)
927 {
928         conf_t *conf = mddev_to_conf(mddev);
929         int i;
930 
931         seq_printf(seq, " [%d/%d] [", conf->raid_disks,
932                                                 conf->working_disks);
933         for (i = 0; i < conf->raid_disks; i++)
934                 seq_printf(seq, "%s",
935                               conf->mirrors[i].rdev &&
936                               test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
937         seq_printf(seq, "]");
938 }
939 
940 
941 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
942 {
943         char b[BDEVNAME_SIZE];
944         conf_t *conf = mddev_to_conf(mddev);
945 
946         /*
947          * If it is not operational, then we have already marked it as dead
948          * else if it is the last working disks, ignore the error, let the
949          * next level up know.
950          * else mark the drive as failed
951          */
952         if (test_bit(In_sync, &rdev->flags)
953             && conf->working_disks == 1)
954                 /*
955                  * Don't fail the drive, act as though we were just a
956                  * normal single drive
957                  */
958                 return;
959         if (test_bit(In_sync, &rdev->flags)) {
960                 mddev->degraded++;
961                 conf->working_disks--;
962                 /*
963                  * if recovery is running, make sure it aborts.
964                  */
965                 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
966         }
967         clear_bit(In_sync, &rdev->flags);
968         set_bit(Faulty, &rdev->flags);
969         mddev->sb_dirty = 1;
970         printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
971                 "       Operation continuing on %d devices\n",
972                 bdevname(rdev->bdev,b), conf->working_disks);
973 }
974 
975 static void print_conf(conf_t *conf)
976 {
977         int i;
978         mirror_info_t *tmp;
979 
980         printk("RAID1 conf printout:\n");
981         if (!conf) {
982                 printk("(!conf)\n");
983                 return;
984         }
985         printk(" --- wd:%d rd:%d\n", conf->working_disks,
986                 conf->raid_disks);
987 
988         for (i = 0; i < conf->raid_disks; i++) {
989                 char b[BDEVNAME_SIZE];
990                 tmp = conf->mirrors + i;
991                 if (tmp->rdev)
992                         printk(" disk %d, wo:%d, o:%d, dev:%s\n",
993                                 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags),
994                                 bdevname(tmp->rdev->bdev,b));
995         }
996 }
997 
998 static void close_sync(conf_t *conf)
999 {
1000         wait_barrier(conf);