[PATCH] md: allow md to update multiple superblocks in parallel.
drivers/md/md.c
/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static mdk_personality_t *pers[MAX_PERSONALITY];
static DEFINE_SPINLOCK(pers_lock);

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */

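/*
 * Usage sketch (illustrative only, not part of the driver): the two
 * limits below are exported through the sysctl tables that follow, so
 * they can be tuned from user space at run time, e.g.
 *
 *      echo 5000   > /proc/sys/dev/raid/speed_limit_min
 *      echo 100000 > /proc/sys/dev/raid/speed_limit_max
 *
 * The example values here are arbitrary; units are KB/sec.
 */
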
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
        {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
        {
                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { .ctl_name = 0 }
};

static struct block_device_operations md_fops;

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)                                        \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )

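/*
 * Usage sketch for ITERATE_MDDEV (illustrative only; the function below
 * is hypothetical).  The macro performs the lock/refcount dance from
 * the comment above, so the body runs with a reference held but with
 * all_mddevs_lock dropped:
 *
 *      static void count_arrays_example(void)
 *      {
 *              mddev_t *mddev;
 *              struct list_head *tmp;
 *              int n = 0;
 *
 *              ITERATE_MDDEV(mddev,tmp)
 *                      n++;
 *              printk(KERN_INFO "md: %d arrays\n", n);
 *      }
 *
 * A body that breaks out early still owns a reference and must call
 * mddev_put(mddev) itself.
 */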

static int md_fail_request (request_queue_t *q, struct bio *bio)
{
        bio_io_error(bio, bio->bi_size);
        return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}

static void mddev_put(mddev_t *mddev)
{
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
                blk_put_queue(mddev->queue);
                kfree(mddev);
        }
        spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);
        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        if (new)
                                kfree(new);
                        return mddev;
                }

        if (new) {
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        memset(new, 0, sizeof(*new));

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        init_MUTEX(&new->reconfig_sem);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);

        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
                kfree(new);
                return NULL;
        }

        blk_queue_make_request(new->queue, md_fail_request);

        goto retry;
}

static inline int mddev_lock(mddev_t * mddev)
{
        return down_interruptible(&mddev->reconfig_sem);
}

static inline void mddev_lock_uninterruptible(mddev_t * mddev)
{
        down(&mddev->reconfig_sem);
}

static inline int mddev_trylock(mddev_t * mddev)
{
        return down_trylock(&mddev->reconfig_sem);
}

static inline void mddev_unlock(mddev_t * mddev)
{
        up(&mddev->reconfig_sem);

        if (mddev->thread)
                md_wakeup_thread(mddev->thread);
}

mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->desc_nr == nr)
                        return rdev;
        }
        return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
        }
        return NULL;
}

inline static sector_t calc_dev_sboffset(struct block_device *bdev)
{
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
}

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
        sector_t size;

        size = rdev->sb_offset;

        if (chunk_size)
                size &= ~((sector_t)chunk_size/1024 - 1);
        return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page)
                MD_BUG();

        rdev->sb_page = alloc_page(GFP_KERNEL);
        if (!rdev->sb_page) {
                printk(KERN_ALERT "md: out of memory.\n");
                return -EINVAL;
        }

        return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page) {
                page_cache_release(rdev->sb_page);
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_offset = 0;
                rdev->size = 0;
        }
}

static int super_written(struct bio *bio, unsigned int bytes_done, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        if (bio->bi_size)
                return 1;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
                md_error(rdev->mddev, rdev);

        if (atomic_dec_and_test(&rdev->mddev->pending_writes))
                wake_up(&rdev->mddev->sb_wait);
        return 0;
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        atomic_inc(&mddev->pending_writes);
        submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
}

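/*
 * Caller-side sketch (illustrative): md_super_write() is fire-and-forget,
 * which is what allows superblocks on all member disks to be written in
 * parallel.  A caller queues one write per device and then waits once
 * for pending_writes to drain, exactly as md_update_sb() does below:
 *
 *      ITERATE_RDEV(mddev,rdev,tmp)
 *              md_super_write(mddev, rdev,
 *                             rdev->sb_offset<<1, MD_SB_BYTES,
 *                             rdev->sb_page);
 *      wait_event(mddev->sb_wait,
 *                 atomic_read(&mddev->pending_writes)==0);
 */
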
static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
{
        if (bio->bi_size)
                return 1;

        complete((struct completion*)bio->bi_private);
        return 0;
}

int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNC);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}

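/*
 * Usage sketch (illustrative): sync_page_io() blocks until its single
 * bio completes and returns 1 on success, 0 on failure.  read_disk_sb()
 * below is the canonical caller; reading one 0.90 superblock looks like:
 *
 *      if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1,
 *                        MD_SB_BYTES, rdev->sb_page, READ))
 *              return -EINVAL;
 */
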
static int read_disk_sb(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->sb_page) {
                MD_BUG();
                return -EINVAL;
        }
        if (rdev->sb_loaded)
                return 0;


        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;

fail:
        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
                bdevname(rdev->bdev,b));
        return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
                (sb1->set_uuid1 == sb2->set_uuid1) &&
                (sb1->set_uuid2 == sb2->set_uuid2) &&
                (sb1->set_uuid3 == sb2->set_uuid3))

                return 1;

        return 0;
}


static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        int ret;
        mdp_super_t *tmp1, *tmp2;

        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

        if (!tmp1 || !tmp2) {
                ret = 0;
                printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
                goto abort;
        }

        *tmp1 = *sb1;
        *tmp2 = *sb2;

        /*
         * nr_disks is not constant
         */
        tmp1->nr_disks = 0;
        tmp2->nr_disks = 0;

        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
                ret = 0;
        else
                ret = 1;

abort:
        if (tmp1)
                kfree(tmp1);
        if (tmp2)
                kfree(tmp2);

        return ret;
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
        unsigned int disk_csum, csum;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
        sb->sb_csum = disk_csum;
        return csum;
}


/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
        char            *name;
        struct module   *owner;
        int             (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
        int             (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};

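/*
 * Dispatch sketch (illustrative): callers never invoke a handler
 * directly; they index super_types[] by the array's major version, as
 * analyze_sbs() and sync_sbs() do further down, e.g.
 *
 *      err = super_types[mddev->major_version].
 *              load_super(rdev, freshest, mddev->minor_version);
 *      super_types[mddev->major_version].sync_super(mddev, rdev);
 */
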
/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdp_super_t *sb;
        int ret;
        sector_t sb_offset;

        /*
         * Calculate the position of the superblock,
         * it's at the end of the disk.
         *
         * It also happens to be a multiple of 4Kb.
         */
        sb_offset = calc_dev_sboffset(rdev->bdev);
        rdev->sb_offset = sb_offset;

        ret = read_disk_sb(rdev);
        if (ret) return ret;

        ret = -EINVAL;

        bdevname(rdev->bdev, b);
        sb = (mdp_super_t*)page_address(rdev->sb_page);

        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
                       b);
                goto abort;
        }

        if (sb->major_version != 0 ||
            sb->minor_version != 90) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
                goto abort;
        }

        if (sb->raid_disks <= 0)
                goto abort;

        if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
                        b);
                goto abort;
        }

        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;

        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = sb->this_disk.number;

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
                        goto abort;
                }
                if (!sb_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has same UUID"
                               " but different superblock to %s\n",
                               b, bdevname(refdev->bdev, b2));
                        goto abort;
                }
                ev1 = md_event(sb);
                ev2 = md_event(refsb);
                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        rdev->size = calc_dev_size(rdev, sb->chunk_size);

 abort:
        return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);

        rdev->raid_disk = -1;
        rdev->in_sync = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
                mddev->persistent = ! sb->not_persistent;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
                mddev->level = sb->level;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
                mddev->events = md_event(sb);

                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
                        if (sb->events_hi == sb->cp_events_hi &&
                                sb->events_lo == sb->cp_events_lo) {
                                mddev->recovery_cp = sb->recovery_cp;
                        } else
                                mddev->recovery_cp = 0;
                }

                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

                mddev->max_disks = MD_SB_DISKS;

                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                    mddev->bitmap_file == NULL) {
                        if (mddev->level != 1) {
                                /* FIXME use a better test */
                                printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
                                return -EINVAL;
                        }
                        mddev->bitmap_offset = (MD_SB_BYTES >> 9);
                }

        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                __u64 ev1 = md_event(sb);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
                __u64 ev1 = md_event(sb);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else /* just a hot-add of a new device, leave raid_disk at -1 */
                return 0;

        if (mddev->level != LEVEL_MULTIPATH) {
                rdev->faulty = 0;
                desc = sb->disks + rdev->desc_nr;

                if (desc->state & (1<<MD_DISK_FAULTY))
                        rdev->faulty = 1;
                else if (desc->state & (1<<MD_DISK_SYNC) &&
                         desc->raid_disk < mddev->raid_disks) {
                        rdev->in_sync = 1;
                        rdev->raid_disk = desc->raid_disk;
                }
        } else /* MULTIPATH are always insync */
                rdev->in_sync = 1;
        return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_super_t *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int next_spare = mddev->raid_disks;

        /* make rdev->sb match mddev data..
         *
         * 1/ zero out disks
         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
         * 3/ any empty disks < next_spare become removed
         *
         * disks[0] gets initialised to REMOVED because
         * we cannot be sure from other fields if it has
         * been initialised or not.
         */
        int i;
        int active=0, working=0,failed=0,spare=0,nr_disks=0;

        sb = (mdp_super_t*)page_address(rdev->sb_page);

        memset(sb, 0, sizeof(*sb));

        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
        sb->minor_version = mddev->minor_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
        memcpy(&sb->set_uuid3, mddev->uuid+12,4);

        sb->ctime = mddev->ctime;
        sb->level = mddev->level;
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
        sb->not_persistent = !mddev->persistent;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;

        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
                sb->cp_events_hi = (mddev->events>>32);
                sb->cp_events_lo = (u32)mddev->events;
                if (mddev->recovery_cp == MaxSector)
                        sb->state = (1<< MD_SB_CLEAN);
        } else
                sb->recovery_cp = 0;

        sb->layout = mddev->layout;
        sb->chunk_size = mddev->chunk_size;

        if (mddev->bitmap && mddev->bitmap_file == NULL)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);

        sb->disks[0].state = (1<<MD_DISK_REMOVED);
        ITERATE_RDEV(mddev,rdev2,tmp) {
                mdp_disk_t *d;
                if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
                        rdev2->desc_nr = rdev2->raid_disk;
                else
                        rdev2->desc_nr = next_spare++;
                d = &sb->disks[rdev2->desc_nr];
                nr_disks++;
                d->number = rdev2->desc_nr;
                d->major = MAJOR(rdev2->bdev->bd_dev);
                d->minor = MINOR(rdev2->bdev->bd_dev);
                if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
                        d->raid_disk = rdev2->raid_disk;
                else
                        d->raid_disk = rdev2->desc_nr; /* compatibility */
                if (rdev2->faulty) {
                        d->state = (1<<MD_DISK_FAULTY);
                        failed++;
                } else if (rdev2->in_sync) {
                        d->state = (1<<MD_DISK_ACTIVE);
                        d->state |= (1<<MD_DISK_SYNC);
                        active++;
                        working++;
                } else {
                        d->state = 0;
                        spare++;
                        working++;
                }
        }

        /* now set the "removed" and "faulty" bits on any missing devices */
        for (i=0 ; i < mddev->raid_disks ; i++) {
                mdp_disk_t *d = &sb->disks[i];
                if (d->state == 0 && d->number == 0) {
                        d->number = i;
                        d->raid_disk = i;
                        d->state = (1<<MD_DISK_REMOVED);
                        d->state |= (1<<MD_DISK_FAULTY);
                        failed++;
                }
        }
        sb->nr_disks = nr_disks;
        sb->active_disks = active;
        sb->working_disks = working;
        sb->failed_disks = failed;
        sb->spare_disks = spare;

        sb->this_disk = sb->disks[rdev->desc_nr];
        sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
        unsigned int disk_csum, csum;
        unsigned long long newcsum;
        int size = 256 + le32_to_cpu(sb->max_dev)*2;
        unsigned int *isuper = (unsigned int*)sb;
        int i;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        newcsum = 0;
        for (i=0; size>=4; size -= 4 )
                newcsum += le32_to_cpu(*isuper++);

        if (size == 2)
                newcsum += le16_to_cpu(*(unsigned short*) isuper);

        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
        sb->sb_csum = disk_csum;
        return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        struct mdp_superblock_1 *sb;
        int ret;
        sector_t sb_offset;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];

        /*
         * Calculate the position of the superblock.
         * It is always aligned to a 4K boundary and
         * depending on minor_version, it can be:
         * 0: At least 8K, but less than 12K, from end of device
         * 1: At start of device
         * 2: 4K from start of device.
         */
        switch(minor_version) {
        case 0:
                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
                sb_offset -= 8*2;
                sb_offset &= ~(4*2-1);
                /* convert from sectors to K */
                sb_offset /= 2;
                break;
        case 1:
                sb_offset = 0;
                break;
        case 2:
                sb_offset = 4;
                break;
        default:
                return -EINVAL;
        }
        rdev->sb_offset = sb_offset;

        ret = read_disk_sb(rdev);
        if (ret) return ret;


        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
            sb->major_version != cpu_to_le32(1) ||
            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
            le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
            sb->feature_map != 0)
                return -EINVAL;

        if (calc_sb_1_csum(sb) != sb->sb_csum) {
                printk("md: invalid superblock checksum on %s\n",
                        bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if (le64_to_cpu(sb->data_size) < 10) {
                printk("md: data_size too small on %s\n",
                       bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        rdev->preferred_minor = 0xffff;
        rdev->data_offset = le64_to_cpu(sb->data_offset);

        if (refdev == 0)
                return 1;
        else {
                __u64 ev1, ev2;
                struct mdp_superblock_1 *refsb =
                        (struct mdp_superblock_1*)page_address(refdev->sb_page);

                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
                    sb->level != refsb->level ||
                    sb->layout != refsb->layout ||
                    sb->chunksize != refsb->chunksize) {
                        printk(KERN_WARNING "md: %s has strangely different"
                                " superblock to %s\n",
                                bdevname(rdev->bdev,b),
                                bdevname(refdev->bdev,b2));
                        return -EINVAL;
                }
                ev1 = le64_to_cpu(sb->events);
                ev2 = le64_to_cpu(refsb->events);

                if (ev1 > ev2)
                        return 1;
        }
        if (minor_version)
                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
        else
                rdev->size = rdev->sb_offset;
        if (rdev->size < le64_to_cpu(sb->data_size)/2)
                return -EINVAL;
        rdev->size = le64_to_cpu(sb->data_size)/2;
        if (le32_to_cpu(sb->chunksize))
                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
        return 0;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        rdev->raid_disk = -1;
        rdev->in_sync = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
                mddev->persistent = 1;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
                mddev->level = le32_to_cpu(sb->level);
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = le64_to_cpu(sb->events);

                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);

                mddev->max_disks =  (4096-256)/2;

                if ((le32_to_cpu(sb->feature_map) & 1) &&
                    mddev->bitmap_file == NULL ) {
                        if (mddev->level != 1) {
                                printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
                                return -EINVAL;
                        }
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
                }
        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                __u64 ev1 = le64_to_cpu(sb->events);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
                __u64 ev1 = le64_to_cpu(sb->events);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else /* just a hot-add of a new device, leave raid_disk at -1 */
                return 0;

        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
                rdev->desc_nr = le32_to_cpu(sb->dev_number);
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
                        rdev->faulty = 0;
                        break;
                case 0xfffe: /* faulty */
                        rdev->faulty = 1;
                        break;
                default:
                        rdev->in_sync = 1;
                        rdev->faulty = 0;
                        rdev->raid_disk = role;
                        break;
                }
        } else /* MULTIPATH are always insync */
                rdev->in_sync = 1;

        return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */

        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        sb->feature_map = 0;
        sb->pad0 = 0;
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));

        sb->utime = cpu_to_le64((__u64)mddev->utime);
        sb->events = cpu_to_le64(mddev->events);
        if (mddev->in_sync)
                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
        else
                sb->resync_offset = cpu_to_le64(0);

        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(1);
        }

        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;

        sb->max_dev = cpu_to_le32(max_dev);
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);

        ITERATE_RDEV(mddev,rdev2,tmp) {
                i = rdev2->desc_nr;
                if (rdev2->faulty)
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (rdev2->in_sync)
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }

        sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
        sb->sb_csum = calc_sb_1_csum(sb);
}


static struct super_type super_types[] = {
        [0] = {
                .name   = "0.90.0",
                .owner  = THIS_MODULE,
                .load_super     = super_90_load,
                .validate_super = super_90_validate,
                .sync_super     = super_90_sync,
        },
        [1] = {
                .name   = "md-1",
                .owner  = THIS_MODULE,
                .load_super     = super_1_load,
                .validate_super = super_1_validate,
                .sync_super     = super_1_sync,
        },
};

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp)
                if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
                        return rdev;

        return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev1,rdev,tmp)
                if (match_dev_unit(mddev2, rdev))
                        return 1;

        return 0;
}

static LIST_HEAD(pending_raid_disks);

static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
        mdk_rdev_t *same_pdev;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];

        if (rdev->mddev) {
                MD_BUG();
                return -EINVAL;
        }
        same_pdev = match_dev_unit(mddev, rdev);
        if (same_pdev)
                printk(KERN_WARNING
                        "%s: WARNING: %s appears to be on the same physical"
                        " disk as %s. True\n     protection against single-disk"
                        " failure might be compromised.\n",
                        mdname(mddev), bdevname(rdev->bdev,b),
                        bdevname(same_pdev->bdev,b2));

        /* Verify rdev->desc_nr is unique.
         * If it is -1, assign a free number, else
         * check number is not in use
         */
        if (rdev->desc_nr < 0) {
                int choice = 0;
                if (mddev->pers) choice = mddev->raid_disks;
                while (find_rdev_nr(mddev, choice))
                        choice++;
                rdev->desc_nr = choice;
        } else {
                if (find_rdev_nr(mddev, rdev->desc_nr))
                        return -EBUSY;
        }

        list_add(&rdev->same_set, &mddev->disks);
        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
        return 0;
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->mddev) {
                MD_BUG();
                return;
        }
        list_del_init(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
        int err = 0;
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];

        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
                return PTR_ERR(bdev);
        }
        err = bd_claim(bdev, rdev);
        if (err) {
                printk(KERN_ERR "md: could not bd_claim %s.\n",
                        bdevname(bdev, b));
                blkdev_put(bdev);
                return err;
        }
        rdev->bdev = bdev;
        return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
        struct block_device *bdev = rdev->bdev;
        rdev->bdev = NULL;
        if (!bdev)
                MD_BUG();
        bd_release(bdev);
        blkdev_put(bdev);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: export_rdev(%s)\n",
                bdevname(rdev->bdev,b));
        if (rdev->mddev)
                MD_BUG();
        free_disk_sb(rdev);
        list_del_init(&rdev->same_set);
#ifndef MODULE
        md_autodetect_dev(rdev->bdev->bd_dev);
#endif
        unlock_rdev(rdev);
        kfree(rdev);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
        unbind_rdev_from_array(rdev);
        export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (!rdev->mddev) {
                        MD_BUG();
                        continue;
                }
                kick_rdev_from_array(rdev);
        }
        if (!list_empty(&mddev->disks))
                MD_BUG();
        mddev->raid_disks = 0;
        mddev->major_version = 0;
}

static void print_desc(mdp_disk_t *desc)
{
        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
                desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
        int i;

        printk(KERN_INFO
                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
                sb->major_version, sb->minor_version, sb->patch_version,
                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
                sb->ctime);
        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
                sb->md_minor, sb->layout, sb->chunk_size);
        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
                sb->utime, sb->state, sb->active_disks, sb->working_disks,
                sb->failed_disks, sb->spare_disks,
                sb->sb_csum, (unsigned long)sb->events_lo);

        printk(KERN_INFO);
        for (i = 0; i < MD_SB_DISKS; i++) {
                mdp_disk_t *desc;

                desc = sb->disks + i;
                if (desc->number || desc->major || desc->minor ||
                    desc->raid_disk || (desc->state && (desc->state != 4))) {
                        printk("     D %2d: ", i);
                        print_desc(desc);
                }
        }
        printk(KERN_INFO "md:     THIS: ");
        print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
                bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
                rdev->faulty, rdev->in_sync, rdev->desc_nr);
        if (rdev->sb_loaded) {
                printk(KERN_INFO "md: rdev superblock:\n");
                print_sb((mdp_super_t*)page_address(rdev->sb_page));
        } else
                printk(KERN_INFO "md: no rdev superblock!\n");
}

void md_print_devices(void)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev;
        mddev_t *mddev;
        char b[BDEVNAME_SIZE];

        printk("\n");
        printk("md:     **********************************\n");
        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
        printk("md:     **********************************\n");
        ITERATE_MDDEV(mddev,tmp) {

                if (mddev->bitmap)
                        bitmap_print_sb(mddev->bitmap);
                else
                        printk("%s: ", mdname(mddev));
                ITERATE_RDEV(mddev,rdev,tmp2)
                        printk("<%s>", bdevname(rdev->bdev,b));
                printk("\n");

                ITERATE_RDEV(mddev,rdev,tmp2)
                        print_rdev(rdev);
        }
        printk("md:     **********************************\n");
        printk("\n");
}


static void sync_sbs(mddev_t * mddev)
{
        mdk_rdev_t *rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                super_types[mddev->major_version].
                        sync_super(mddev, rdev);
                rdev->sb_loaded = 1;
        }
}

static void md_update_sb(mddev_t * mddev)
{
        int err;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;

repeat:
        spin_lock(&mddev->write_lock);
        sync_req = mddev->in_sync;
        mddev->utime = get_seconds();
        mddev->events ++;

        if (!mddev->events) {
                /*
                 * oops, this 64-bit counter should never wrap.
                 * Either we are in around ~1 trillion A.C., assuming
                 * 1 reboot per second, or we have a bug:
                 */
                MD_BUG();
                mddev->events --;
        }
        mddev->sb_dirty = 2;
        sync_sbs(mddev);

        /*
         * do not write anything to disk if using
         * nonpersistent superblocks
         */
        if (!mddev->persistent) {
                mddev->sb_dirty = 0;
                spin_unlock(&mddev->write_lock);
                wake_up(&mddev->sb_wait);
                return;
        }
        spin_unlock(&mddev->write_lock);

        dprintk(KERN_INFO
                "md: updating %s RAID superblock on device (in sync %d)\n",
                mdname(mddev),mddev->in_sync);

        err = bitmap_update_sb(mddev->bitmap);
        ITERATE_RDEV(mddev,rdev,tmp) {
                char b[BDEVNAME_SIZE];
                dprintk(KERN_INFO "md: ");
                if (rdev->faulty)
                        dprintk("(skipping faulty ");

                dprintk("%s ", bdevname(rdev->bdev,b));
                if (!rdev->faulty) {
                        md_super_write(mddev,rdev,
                                       rdev->sb_offset<<1, MD_SB_BYTES,
                                       rdev->sb_page);
                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                                bdevname(rdev->bdev,b),
                                (unsigned long long)rdev->sb_offset);

                } else
                        dprintk(")\n");
                if (mddev->level == LEVEL_MULTIPATH)
                        /* only need to write one superblock... */
                        break;
        }
        wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
        /* if there was a failure, sb_dirty was set to 1, and we re-write super */

        spin_lock(&mddev->write_lock);
        if (mddev->in_sync != sync_req || mddev->sb_dirty == 1) {
                /* have to write it out again */
                spin_unlock(&mddev->write_lock);
                goto repeat;
        }
        mddev->sb_dirty = 0;
        spin_unlock(&mddev->write_lock);
        wake_up(&mddev->sb_wait);

}

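/*
 * Note on the parallel update above: each md_super_write() call bumps
 * mddev->pending_writes and submits its bio without blocking, so the
 * superblock writes to all member disks are in flight at once.
 * super_written() drops the count as each bio completes, and the single
 * wait_event() returns once the last write has finished.
 */
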
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
{
        char b[BDEVNAME_SIZE];
        int err;
        mdk_rdev_t *rdev;
        sector_t size;

        rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
        if (!rdev) {
                printk(KERN_ERR "md: could not alloc mem for new device!\n");
                return ERR_PTR(-ENOMEM);
        }
        memset(rdev, 0, sizeof(*rdev));

        if ((err = alloc_disk_sb(rdev)))
                goto abort_free;

        err = lock_rdev(rdev, newdev);
        if (err)
                goto abort_free;

        rdev->desc_nr = -1;
        rdev->faulty = 0;
        rdev->in_sync = 0;
        rdev->data_offset = 0;
        atomic_set(&rdev->nr_pending, 0);

        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        if (!size) {
                printk(KERN_WARNING
                        "md: %s has zero or unknown size, marking faulty!\n",
                        bdevname(rdev->bdev,b));
                err = -EINVAL;
                goto abort_free;
        }

        if (super_format >= 0) {
                err = super_types[super_format].
                        load_super(rdev, NULL, super_minor);
                if (err == -EINVAL) {
                        printk(KERN_WARNING
                                "md: %s has invalid sb, not importing!\n",
                                bdevname(rdev->bdev,b));
                        goto abort_free;
                }
                if (err < 0) {
                        printk(KERN_WARNING
                                "md: could not read %s's sb, not importing!\n",
                                bdevname(rdev->bdev,b));
                        goto abort_free;
                }
        }
        INIT_LIST_HEAD(&rdev->same_set);

        return rdev;

abort_free:
        if (rdev->sb_page) {
                if (rdev->bdev)
                        unlock_rdev(rdev);
                free_disk_sb(rdev);
        }
        kfree(rdev);
        return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */


static void analyze_sbs(mddev_t * mddev)
{
        int i;
        struct list_head *tmp;
        mdk_rdev_t *rdev, *freshest;
        char b[BDEVNAME_SIZE];

        freshest = NULL;
        ITERATE_RDEV(mddev,rdev,tmp)
                switch (super_types[mddev->major_version].
                        load_super(rdev, freshest, mddev->minor_version)) {
                case 1:
                        freshest = rdev;
                        break;
                case 0:
                        break;
                default:
                        printk(KERN_ERR
                                "md: fatal superblock inconsistency in %s"
                                " -- removing from array\n",
                                bdevname(rdev->bdev,b));
                        kick_rdev_from_array(rdev);
                }


        super_types[mddev->major_version].
                validate_super(mddev, freshest);

        i = 0;
        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev != freshest)
                        if (super_types[mddev->major_version].
                            validate_super(mddev, rdev)) {
                                printk(KERN_WARNING "md: kicking non-fresh %s"
                                        " from array!\n",
                                        bdevname(rdev->bdev,b));
                                kick_rdev_from_array(rdev);
                                continue;
                        }
                if (mddev->level == LEVEL_MULTIPATH) {
                        rdev->desc_nr = i++;
                        rdev->raid_disk = rdev->desc_nr;
                        rdev->in_sync = 1;
                }
        }



        if (mddev->recovery_cp != MaxSector &&
            mddev->level >= 1)
                printk(KERN_ERR "md: %s: raid array is not clean"
                       " -- starting background reconstruction\n",
                       mdname(mddev));

}
1531
1532 int mdp_major = 0;
1533
1534 static struct kobject *md_probe(dev_t dev, int *part, void *data)
1535 {
1536         static DECLARE_MUTEX(disks_sem);
1537         mddev_t *mddev = mddev_find(dev);
1538         struct gendisk *disk;
1539         int partitioned = (MAJOR(dev) != MD_MAJOR);
1540         int shift = partitioned ? MdpMinorShift : 0;
1541         int unit = MINOR(dev) >> shift;
1542
1543         if (!mddev)
1544                 return NULL;
1545
1546         down(&disks_sem);
1547         if (mddev->gendisk) {
1548                 up(&disks_sem);
1549                 mddev_put(mddev);
1550                 return NULL;
1551         }
1552         disk = alloc_disk(1 << shift);
1553         if (!disk) {
1554                 up(&disks_sem);
1555                 mddev_put(mddev);
1556                 return NULL;
1557         }
1558         disk->major = MAJOR(dev);
1559         disk->first_minor = unit << shift;
1560         if (partitioned) {
1561                 sprintf(disk->disk_name, "md_d%d", unit);
1562                 sprintf(disk->devfs_name, "md/d%d", unit);
1563         } else {
1564                 sprintf(disk->disk_name, "md%d", unit);
1565                 sprintf(disk->devfs_name, "md/%d", unit);
1566         }
1567         disk->fops = &md_fops;
1568         disk->private_data = mddev;
1569         disk->queue = mddev->queue;
1570         add_disk(disk);
1571         mddev->gendisk = disk;
1572         up(&disks_sem);
1573         return NULL;
1574 }
1575
1576 void md_wakeup_thread(mdk_thread_t *thread);
1577
1578 static void md_safemode_timeout(unsigned long data)
1579 {
1580         mddev_t *mddev = (mddev_t *) data;
1581
1582         mddev->safemode = 1;
1583         md_wakeup_thread(mddev->thread);
1584 }
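/*
 * Rough sketch of the safemode mechanism: every write restarts
 * safemode_timer, so if no write arrives for safemode_delay ticks this
 * timeout fires and the per-array thread is expected to notice
 * ->safemode, mark the array in_sync and write the superblocks out as
 * 'clean', keeping the window for an unclean-shutdown resync small.
 */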
1585
1586
1587 static int do_md_run(mddev_t * mddev)
1588 {
1589         int pnum, err;
1590         int chunk_size;
1591         struct list_head *tmp;
1592         mdk_rdev_t *rdev;
1593         struct gendisk *disk;
1594         char b[BDEVNAME_SIZE];
1595
1596         if (list_empty(&mddev->disks))
1597                 /* cannot run an array with no devices.. */
1598                 return -EINVAL;
1599
1600         if (mddev->pers)
1601                 return -EBUSY;
1602
1603         /*
1604          * Analyze all RAID superblock(s)
1605          */
1606         if (!mddev->raid_disks)
1607                 analyze_sbs(mddev);
1608
1609         chunk_size = mddev->chunk_size;
1610         pnum = level_to_pers(mddev->level);
1611
1612         if ((pnum != MULTIPATH) && (pnum != RAID1)) {
1613                 if (!chunk_size) {
1614                         /*
1615                          * 'default chunksize' in the old md code used to
1616                          * be PAGE_SIZE, baaad.
1617                          * We abort here to be on the safe side. We don't
1618                          * want to continue the bad practice.
1619                          */
1620                         printk(KERN_ERR 
1621                                 "no chunksize specified, see 'man raidtab'\n");
1622                         return -EINVAL;
1623                 }
1624                 if (chunk_size > MAX_CHUNK_SIZE) {
1625                         printk(KERN_ERR "too big chunk_size: %d > %d\n",
1626                                 chunk_size, MAX_CHUNK_SIZE);
1627                         return -EINVAL;
1628                 }
1629                 /*
1630                  * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
1631                  */
1632                 if ( (1 << ffz(~chunk_size)) != chunk_size) {
1633                         printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
1634                         return -EINVAL;
1635                 }
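                /*
                 * (ffz(~x) is the index of the lowest set bit of x, so
                 * (1 << ffz(~x)) == x exactly when x is a power of two;
                 * e.g. x = 0x10000 passes since ffz(~x) == 16, while
                 * x = 0x11000 fails since ffz(~x) == 12.)
                 */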
1636                 if (chunk_size < PAGE_SIZE) {
1637                         printk(KERN_ERR "too small chunk_size: %d < %ld\n",
1638                                 chunk_size, PAGE_SIZE);
1639                         return -EINVAL;
1640                 }
1641
1642                 /* devices must have minimum size of one chunk */
1643                 ITERATE_RDEV(mddev,rdev,tmp) {
1644                         if (rdev->faulty)
1645                                 continue;
1646                         if (rdev->size < chunk_size / 1024) {
1647                                 printk(KERN_WARNING
1648                                         "md: Dev %s smaller than chunk_size:"
1649                                         " %lluk < %dk\n",
1650                                         bdevname(rdev->bdev,b),
1651                                         (unsigned long long)rdev->size,
1652                                         chunk_size / 1024);
1653                                 return -EINVAL;
1654                         }
1655                 }
1656         }
1657
1658 #ifdef CONFIG_KMOD
1659         if (!pers[pnum])
1660                 request_module("md-personality-%d", pnum);
1663 #endif
1664
1665         /*
1666          * Drop all container device buffers, from now on
1667          * the only valid external interface is through the md
1668          * device.
1669          * Also find largest hardsector size
1670          */
1671         ITERATE_RDEV(mddev,rdev,tmp) {
1672                 if (rdev->faulty)
1673                         continue;
1674                 sync_blockdev(rdev->bdev);
1675                 invalidate_bdev(rdev->bdev, 0);
1676         }
1677
1678         md_probe(mddev->unit, NULL, NULL);
1679         disk = mddev->gendisk;
1680         if (!disk)
1681                 return -ENOMEM;
1682
1683         spin_lock(&pers_lock);
1684         if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
1685                 spin_unlock(&pers_lock);
1686                 printk(KERN_WARNING "md: personality %d is not loaded!\n",
1687                        pnum);
1688                 return -EINVAL;
1689         }
1690
1691         mddev->pers = pers[pnum];
1692         spin_unlock(&pers_lock);
1693
1694         mddev->resync_max_sectors = mddev->size << 1; /* may be overridden by personality */
1695
1696         /* before we start the array running, initialise the bitmap */
1697         err = bitmap_create(mddev);
1698         if (err)
1699                 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
1700                         mdname(mddev), err);
1701         else
1702                 err = mddev->pers->run(mddev);
1703         if (err) {
1704                 printk(KERN_ERR "md: pers->run() failed ...\n");
1705                 module_put(mddev->pers->owner);
1706                 mddev->pers = NULL;
1707                 bitmap_destroy(mddev);
1708                 return err;
1709         }
1710         atomic_set(&mddev->writes_pending,0);
1711         mddev->safemode = 0;
1712         mddev->safemode_timer.function = md_safemode_timeout;
1713         mddev->safemode_timer.data = (unsigned long) mddev;
1714         mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
1715         mddev->in_sync = 1;
1716         
1717         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1718         
1719         if (mddev->sb_dirty)
1720                 md_update_sb(mddev);
1721
1722         set_capacity(disk, mddev->array_size<<1);
1723
1724         /* If we call blk_queue_make_request here, it will
1725          * re-initialise max_sectors etc which may have been
1726          * refined inside -> run.  So just set the bits we need to set.
1727          * Most initialisation happened when we called
1728          * blk_queue_make_request(..., md_fail_request)
1729          * earlier.
1730          */
1731         mddev->queue->queuedata = mddev;
1732         mddev->queue->make_request_fn = mddev->pers->make_request;
1733
1734         mddev->changed = 1;
1735         return 0;
1736 }
1737
1738 static int restart_array(mddev_t *mddev)
1739 {
1740         struct gendisk *disk = mddev->gendisk;
1741         int err;
1742
1743         /*
1744          * Complain if it has no devices
1745          */
1746         err = -ENXIO;
1747         if (list_empty(&mddev->disks))
1748                 goto out;
1749
1750         if (mddev->pers) {
1751                 err = -EBUSY;
1752                 if (!mddev->ro)
1753                         goto out;
1754
1755                 mddev->safemode = 0;
1756                 mddev->ro = 0;
1757                 set_disk_ro(disk, 0);
1758
1759                 printk(KERN_INFO "md: %s switched to read-write mode.\n",
1760                         mdname(mddev));
1761                 /*
1762                  * Kick recovery or resync if necessary
1763                  */
1764                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1765                 md_wakeup_thread(mddev->thread);
1766                 err = 0;
1767         } else {
1768                 printk(KERN_ERR "md: %s has no personality assigned.\n",
1769                         mdname(mddev));
1770                 err = -EINVAL;
1771         }
1772
1773 out:
1774         return err;
1775 }
1776
1777 static int do_md_stop(mddev_t * mddev, int ro)
1778 {
1779         int err = 0;
1780         struct gendisk *disk = mddev->gendisk;
1781
1782         if (mddev->pers) {
1783                 if (atomic_read(&mddev->active)>2) {
1784                         printk("md: %s still in use.\n",mdname(mddev));
1785                         return -EBUSY;
1786                 }
1787
1788                 if (mddev->sync_thread) {
1789                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1790                         md_unregister_thread(mddev->sync_thread);
1791                         mddev->sync_thread = NULL;
1792                 }
1793
1794                 del_timer_sync(&mddev->safemode_timer);
1795
1796                 invalidate_partition(disk, 0);
1797
1798                 if (ro) {
1799                         err  = -ENXIO;
1800                         if (mddev->ro)
1801                                 goto out;
1802                         mddev->ro = 1;
1803                 } else {
1804                         if (mddev->ro)
1805                                 set_disk_ro(disk, 0);
1806                         blk_queue_make_request(mddev->queue, md_fail_request);
1807                         mddev->pers->stop(mddev);
1808                         module_put(mddev->pers->owner);
1809                         mddev->pers = NULL;
1810                         if (mddev->ro)
1811                                 mddev->ro = 0;
1812                 }
1813                 if (!mddev->in_sync) {
1814                         /* mark array as shutdown cleanly */
1815                         mddev->in_sync = 1;
1816                         md_update_sb(mddev);
1817                 }
1818                 if (ro)
1819                         set_disk_ro(disk, 1);
1820         }
1821
1822         bitmap_destroy(mddev);
1823         if (mddev->bitmap_file) {
1824                 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
1825                 fput(mddev->bitmap_file);
1826                 mddev->bitmap_file = NULL;
1827         }
1828
1829         /*
1830          * Free resources if final stop
1831          */
1832         if (!ro) {
1833                 struct gendisk *disk;
1834                 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
1835
1836                 export_array(mddev);
1837
1838                 mddev->array_size = 0;
1839                 disk = mddev->gendisk;
1840                 if (disk)
1841                         set_capacity(disk, 0);
1842                 mddev->changed = 1;
1843         } else
1844                 printk(KERN_INFO "md: %s switched to read-only mode.\n",
1845                         mdname(mddev));
1846         err = 0;
1847 out:
1848         return err;
1849 }
1850
1851 static void autorun_array(mddev_t *mddev)
1852 {
1853         mdk_rdev_t *rdev;
1854         struct list_head *tmp;
1855         int err;
1856
1857         if (list_empty(&mddev->disks))
1858                 return;
1859
1860         printk(KERN_INFO "md: running: ");
1861
1862         ITERATE_RDEV(mddev,rdev,tmp) {
1863                 char b[BDEVNAME_SIZE];
1864                 printk("<%s>", bdevname(rdev->bdev,b));
1865         }
1866         printk("\n");
1867
1868         err = do_md_run (mddev);
1869         if (err) {
1870                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
1871                 do_md_stop (mddev, 0);
1872         }
1873 }
1874
1875 /*
1876  * let's try to run arrays based on all disks that have arrived
1877  * until now. (those are in pending_raid_disks)
1878  *
1879  * the method: pick the first pending disk, collect all disks with
1880  * the same UUID, remove all from the pending list and put them into
1881  * the 'same_array' list. Then order this list based on superblock
1882  * update time (freshest comes first), kick out 'old' disks and
1883  * compare superblocks. If everything's fine then run it.
1884  *
1885  * If "unit" is allocated, then bump its reference count
1886  */
1887 static void autorun_devices(int part)
1888 {
1889         struct list_head candidates;
1890         struct list_head *tmp;
1891         mdk_rdev_t *rdev0, *rdev;
1892         mddev_t *mddev;
1893         char b[BDEVNAME_SIZE];
1894
1895         printk(KERN_INFO "md: autorun ...\n");
1896         while (!list_empty(&pending_raid_disks)) {
1897                 dev_t dev;
1898                 rdev0 = list_entry(pending_raid_disks.next,
1899                                          mdk_rdev_t, same_set);
1900
1901                 printk(KERN_INFO "md: considering %s ...\n",
1902                         bdevname(rdev0->bdev,b));
1903                 INIT_LIST_HEAD(&candidates);
1904                 ITERATE_RDEV_PENDING(rdev,tmp)
1905                         if (super_90_load(rdev, rdev0, 0) >= 0) {
1906                                 printk(KERN_INFO "md:  adding %s ...\n",
1907                                         bdevname(rdev->bdev,b));
1908                                 list_move(&rdev->same_set, &candidates);
1909                         }
1910                 /*
1911                  * now we have a set of devices, with all of them having
1912                  * mostly sane superblocks. It's time to allocate the
1913                  * mddev.
1914                  */
1915                 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
1916                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
1917                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
1918                         break;
1919                 }
1920                 if (part)
1921                         dev = MKDEV(mdp_major,
1922                                     rdev0->preferred_minor << MdpMinorShift);
1923                 else
1924                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
1925
1926                 md_probe(dev, NULL, NULL);
1927                 mddev = mddev_find(dev);
1928                 if (!mddev) {
1929                         printk(KERN_ERR 
1930                                 "md: cannot allocate memory for md drive.\n");
1931                         break;
1932                 }
1933                 if (mddev_lock(mddev)) 
1934                         printk(KERN_WARNING "md: %s locked, cannot run\n",
1935                                mdname(mddev));
1936                 else if (mddev->raid_disks || mddev->major_version
1937                          || !list_empty(&mddev->disks)) {
1938                         printk(KERN_WARNING 
1939                                 "md: %s already running, cannot run %s\n",
1940                                 mdname(mddev), bdevname(rdev0->bdev,b));
1941                         mddev_unlock(mddev);
1942                 } else {
1943                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
1944                         ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
1945                                 list_del_init(&rdev->same_set);
1946                                 if (bind_rdev_to_array(rdev, mddev))
1947                                         export_rdev(rdev);
1948                         }
1949                         autorun_array(mddev);
1950                         mddev_unlock(mddev);
1951                 }
1952                 /* on success, candidates will be empty; on error
1953                  * it won't be...
1954                  */
1955                 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
1956                         export_rdev(rdev);
1957                 mddev_put(mddev);
1958         }
1959         printk(KERN_INFO "md: ... autorun DONE.\n");
1960 }
1961
1962 /*
1963  * import RAID devices based on one partition;
1964  * if possible, the array gets run as well.
1965  */
1966
1967 static int autostart_array(dev_t startdev)
1968 {
1969         char b[BDEVNAME_SIZE];
1970         int err = -EINVAL, i;
1971         mdp_super_t *sb = NULL;
1972         mdk_rdev_t *start_rdev = NULL, *rdev;
1973
1974         start_rdev = md_import_device(startdev, 0, 0);
1975         if (IS_ERR(start_rdev))
1976                 return err;
1977
1978
1979         /* NOTE: this can only work for 0.90.0 superblocks */
1980         sb = (mdp_super_t*)page_address(start_rdev->sb_page);
1981         if (sb->major_version != 0 ||
1982             sb->minor_version != 90 ) {
1983                 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
1984                 export_rdev(start_rdev);
1985                 return err;
1986         }
1987
1988         if (start_rdev->faulty) {
1989                 printk(KERN_WARNING 
1990                         "md: cannot autostart based on faulty %s!\n",
1991                         bdevname(start_rdev->bdev,b));
1992                 export_rdev(start_rdev);
1993                 return err;
1994         }
1995         list_add(&start_rdev->same_set, &pending_raid_disks);
1996
1997         for (i = 0; i < MD_SB_DISKS; i++) {
1998                 mdp_disk_t *desc = sb->disks + i;
1999                 dev_t dev = MKDEV(desc->major, desc->minor);
2000
2001                 if (!dev)
2002                         continue;
2003                 if (dev == startdev)
2004                         continue;
2005                 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
2006                         continue;
2007                 rdev = md_import_device(dev, 0, 0);
2008                 if (IS_ERR(rdev))
2009                         continue;
2010
2011                 list_add(&rdev->same_set, &pending_raid_disks);
2012         }
2013
2014         /*
2015          * possibly return error codes from autorun_devices() here
2016          */
2017         autorun_devices(0);
2018         return 0;
2019
2020 }
2021
2022
2023 static int get_version(void __user * arg)
2024 {
2025         mdu_version_t ver;
2026
2027         ver.major = MD_MAJOR_VERSION;
2028         ver.minor = MD_MINOR_VERSION;
2029         ver.patchlevel = MD_PATCHLEVEL_VERSION;
2030
2031         if (copy_to_user(arg, &ver, sizeof(ver)))
2032                 return -EFAULT;
2033
2034         return 0;
2035 }
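/*
 * Illustrative user-space counterpart (not part of this file), using
 * the mdu_version_t layout from <linux/raid/md_u.h>:
 *
 *	mdu_version_t ver;
 *	int fd = open("/dev/md0", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, RAID_VERSION, &ver) == 0)
 *		printf("md driver %d.%d.%d\n",
 *		       ver.major, ver.minor, ver.patchlevel);
 */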
2036
2037 static int get_array_info(mddev_t * mddev, void __user * arg)
2038 {
2039         mdu_array_info_t info;
2040         int nr,working,active,failed,spare;
2041         mdk_rdev_t *rdev;
2042         struct list_head *tmp;
2043
2044         nr=working=active=failed=spare=0;
2045         ITERATE_RDEV(mddev,rdev,tmp) {
2046                 nr++;
2047                 if (rdev->faulty)
2048                         failed++;
2049                 else {
2050                         working++;
2051                         if (rdev->in_sync)
2052                                 active++;       
2053                         else
2054                                 spare++;
2055                 }
2056         }
2057
2058         info.major_version = mddev->major_version;
2059         info.minor_version = mddev->minor_version;
2060         info.patch_version = MD_PATCHLEVEL_VERSION;
2061         info.ctime         = mddev->ctime;
2062         info.level         = mddev->level;
2063         info.size          = mddev->size;
2064         info.nr_disks      = nr;
2065         info.raid_disks    = mddev->raid_disks;
2066         info.md_minor      = mddev->md_minor;
2067         info.not_persistent= !mddev->persistent;
2068
2069         info.utime         = mddev->utime;
2070         info.state         = 0;
2071         if (mddev->in_sync)
2072                 info.state = (1<<MD_SB_CLEAN);
2073         info.active_disks  = active;
2074         info.working_disks = working;
2075         info.failed_disks  = failed;
2076         info.spare_disks   = spare;
2077
2078         info.layout        = mddev->layout;
2079         info.chunk_size    = mddev->chunk_size;
2080
2081         if (copy_to_user(arg, &info, sizeof(info)))
2082                 return -EFAULT;
2083
2084         return 0;
2085 }
2086
2087 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
2088 {
2089         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
2090         char *ptr, *buf = NULL;
2091         int err = -ENOMEM;
2092
2093         file = kmalloc(sizeof(*file), GFP_KERNEL);
2094         if (!file)
2095                 goto out;
2096
2097         /* bitmap disabled, zero the first byte and copy out */
2098         if (!mddev->bitmap || !mddev->bitmap->file) {
2099                 file->pathname[0] = '\0';
2100                 goto copy_out;
2101         }
2102
2103         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
2104         if (!buf)
2105                 goto out;
2106
2107         ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
2108         if (!ptr)
2109                 goto out;
2110
2111         strcpy(file->pathname, ptr);
2112
2113 copy_out:
2114         err = 0;
2115         if (copy_to_user(arg, file, sizeof(*file)))
2116                 err = -EFAULT;
2117 out:
2118         kfree(buf);
2119         kfree(file);
2120         return err;
2121 }
2122
2123 static int get_disk_info(mddev_t * mddev, void __user * arg)
2124 {
2125         mdu_disk_info_t info;
2126         unsigned int nr;
2127         mdk_rdev_t *rdev;
2128
2129         if (copy_from_user(&info, arg, sizeof(info)))
2130                 return -EFAULT;
2131
2132         nr = info.number;
2133
2134         rdev = find_rdev_nr(mddev, nr);
2135         if (rdev) {
2136                 info.major = MAJOR(rdev->bdev->bd_dev);
2137                 info.minor = MINOR(rdev->bdev->bd_dev);
2138                 info.raid_disk = rdev->raid_disk;
2139                 info.state = 0;
2140                 if (rdev->faulty)
2141                         info.state |= (1<<MD_DISK_FAULTY);
2142                 else if (rdev->in_sync) {
2143                         info.state |= (1<<MD_DISK_ACTIVE);
2144                         info.state |= (1<<MD_DISK_SYNC);
2145                 }
2146         } else {
2147                 info.major = info.minor = 0;
2148                 info.raid_disk = -1;
2149                 info.state = (1<<MD_DISK_REMOVED);
2150         }
2151
2152         if (copy_to_user(arg, &info, sizeof(info)))
2153                 return -EFAULT;
2154
2155         return 0;
2156 }
2157
2158 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2159 {
2160         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
2161         mdk_rdev_t *rdev;
2162         dev_t dev = MKDEV(info->major,info->minor);
2163
2164         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
2165                 return -EOVERFLOW;
2166
2167         if (!mddev->raid_disks) {
2168                 int err;
2169                 /* expecting a device which has a superblock */
2170                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
2171                 if (IS_ERR(rdev)) {
2172                         printk(KERN_WARNING 
2173                                 "md: md_import_device returned %ld\n",
2174                                 PTR_ERR(rdev));
2175                         return PTR_ERR(rdev);
2176                 }
2177                 if (!list_empty(&mddev->disks)) {
2178                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2179                                                         mdk_rdev_t, same_set);
2180                         int err = super_types[mddev->major_version]
2181                                 .load_super(rdev, rdev0, mddev->minor_version);
2182                         if (err < 0) {
2183                                 printk(KERN_WARNING 
2184                                         "md: %s has different UUID to %s\n",
2185                                         bdevname(rdev->bdev,b), 
2186                                         bdevname(rdev0->bdev,b2));
2187                                 export_rdev(rdev);
2188                                 return -EINVAL;
2189                         }
2190                 }
2191                 err = bind_rdev_to_array(rdev, mddev);
2192                 if (err)
2193                         export_rdev(rdev);
2194                 return err;
2195         }
2196
2197         /*
2198          * add_new_disk can be used once the array is assembled
2199          * to add "hot spares".  They must already have a superblock
2200          * written
2201          */
2202         if (mddev->pers) {
2203                 int err;
2204                 if (!mddev->pers->hot_add_disk) {
2205                         printk(KERN_WARNING 
2206                                 "%s: personality does not support diskops!\n",
2207                                mdname(mddev));
2208                         return -EINVAL;
2209                 }
2210                 rdev = md_import_device(dev, mddev->major_version,
2211                                         mddev->minor_version);
2212                 if (IS_ERR(rdev)) {
2213                         printk(KERN_WARNING 
2214                                 "md: md_import_device returned %ld\n",
2215                                 PTR_ERR(rdev));
2216                         return PTR_ERR(rdev);
2217                 }
2218                 /* set saved_raid_disk if appropriate */
2219                 if (!mddev->persistent) {
2220                         if (info->state & (1<<MD_DISK_SYNC)  &&
2221                             info->raid_disk < mddev->raid_disks)
2222                                 rdev->raid_disk = info->raid_disk;
2223                         else
2224                                 rdev->raid_disk = -1;
2225                 } else
2226                         super_types[mddev->major_version].
2227                                 validate_super(mddev, rdev);
2228                 rdev->saved_raid_disk = rdev->raid_disk;
2229
2230                 rdev->in_sync = 0; /* just to be sure */
2231                 rdev->raid_disk = -1;
2232                 err = bind_rdev_to_array(rdev, mddev);
2233                 if (err)
2234                         export_rdev(rdev);
2235
2236                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2237                 if (mddev->thread)
2238                         md_wakeup_thread(mddev->thread);
2239                 return err;
2240         }
2241
2242         /* otherwise, add_new_disk is only allowed
2243          * for major_version==0 superblocks
2244          */
2245         if (mddev->major_version != 0) {
2246                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
2247                        mdname(mddev));
2248                 return -EINVAL;
2249         }
2250
2251         if (!(info->state & (1<<MD_DISK_FAULTY))) {
2252                 int err;
2253                 rdev = md_import_device (dev, -1, 0);
2254                 if (IS_ERR(rdev)) {
2255                         printk(KERN_WARNING 
2256                                 "md: error, md_import_device() returned %ld\n",
2257                                 PTR_ERR(rdev));
2258                         return PTR_ERR(rdev);
2259                 }
2260                 rdev->desc_nr = info->number;
2261                 if (info->raid_disk < mddev->raid_disks)
2262                         rdev->raid_disk = info->raid_disk;
2263                 else
2264                         rdev->raid_disk = -1;
2265
2266                 rdev->faulty = 0;
2267                 if (rdev->raid_disk < mddev->raid_disks)
2268                         rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
2269                 else
2270                         rdev->in_sync = 0;
2271
2272                 err = bind_rdev_to_array(rdev, mddev);
2273                 if (err) {
2274                         export_rdev(rdev);
2275                         return err;
2276                 }
2277
2278                 if (!mddev->persistent) {
2279                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
2280                         rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2281                 } else 
2282                         rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2283                 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
2284
2285                 if (!mddev->size || (mddev->size > rdev->size))
2286                         mddev->size = rdev->size;
2287         }
2288
2289         return 0;
2290 }
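/*
 * For reference, the assemble-time sequence driven from user space is
 * roughly: SET_ARRAY_INFO with raid_disks == 0 (selecting the
 * superblock version), then ADD_NEW_DISK once per component device,
 * then RUN_ARRAY.
 */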
2291
2292 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
2293 {
2294         char b[BDEVNAME_SIZE];
2295         mdk_rdev_t *rdev;
2296
2297         if (!mddev->pers)
2298                 return -ENODEV;
2299
2300         rdev = find_rdev(mddev, dev);
2301         if (!rdev)
2302                 return -ENXIO;
2303
2304         if (rdev->raid_disk >= 0)
2305                 goto busy;
2306
2307         kick_rdev_from_array(rdev);
2308         md_update_sb(mddev);
2309
2310         return 0;
2311 busy:
2312         printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
2313                 bdevname(rdev->bdev,b), mdname(mddev));
2314         return -EBUSY;
2315 }
2316
2317 static int hot_add_disk(mddev_t * mddev, dev_t dev)
2318 {
2319         char b[BDEVNAME_SIZE];
2320         int err;
2321         unsigned int size;
2322         mdk_rdev_t *rdev;
2323
2324         if (!mddev->pers)
2325                 return -ENODEV;
2326
2327         if (mddev->major_version != 0) {
2328                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
2329                         " version-0 superblocks.\n",
2330                         mdname(mddev));
2331                 return -EINVAL;
2332         }
2333         if (!mddev->pers->hot_add_disk) {
2334                 printk(KERN_WARNING 
2335                         "%s: personality does not support diskops!\n",
2336                         mdname(mddev));
2337                 return -EINVAL;
2338         }
2339
2340         rdev = md_import_device (dev, -1, 0);
2341         if (IS_ERR(rdev)) {
2342                 printk(KERN_WARNING 
2343                         "md: error, md_import_device() returned %ld\n",
2344                         PTR_ERR(rdev));
2345                 return -EINVAL;
2346         }
2347
2348         if (mddev->persistent)
2349                 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2350         else
2351                 rdev->sb_offset =
2352                         rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2353
2354         size = calc_dev_size(rdev, mddev->chunk_size);
2355         rdev->size = size;
2356
2357         if (size < mddev->size) {
2358                 printk(KERN_WARNING 
2359                         "%s: disk size %llu blocks < array size %llu\n",
2360                         mdname(mddev), (unsigned long long)size,
2361                         (unsigned long long)mddev->size);
2362                 err = -ENOSPC;
2363                 goto abort_export;
2364         }
2365
2366         if (rdev->faulty) {
2367                 printk(KERN_WARNING 
2368                         "md: cannot hot-add faulty %s disk to %s!\n",
2369                         bdevname(rdev->bdev,b), mdname(mddev));
2370                 err = -EINVAL;
2371                 goto abort_export;
2372         }
2373         rdev->in_sync = 0;
2374         rdev->desc_nr = -1;
2375         bind_rdev_to_array(rdev, mddev);
2376
2377         /*
2378          * The rest had better be atomic; disk failures can be
2379          * noticed in interrupt context ...
2380          */
2381
2382         if (rdev->desc_nr == mddev->max_disks) {
2383                 printk(KERN_WARNING "%s: cannot hot-add to full array!\n",
2384                         mdname(mddev));
2385                 err = -EBUSY;
2386                 goto abort_unbind_export;
2387         }
2388
2389         rdev->raid_disk = -1;
2390
2391         md_update_sb(mddev);
2392
2393         /*
2394          * Kick recovery, maybe this spare has to be added to the
2395          * array immediately.
2396          */
2397         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2398         md_wakeup_thread(mddev->thread);
2399
2400         return 0;
2401
2402 abort_unbind_export:
2403         unbind_rdev_from_array(rdev);
2404
2405 abort_export:
2406         export_rdev(rdev);
2407         return err;
2408 }
2409
2410 /* similar to deny_write_access, but accounts for our holding a reference
2411  * to the file ourselves */
2412 static int deny_bitmap_write_access(struct file * file)
2413 {
2414         struct inode *inode = file->f_mapping->host;
2415
2416         spin_lock(&inode->i_lock);
2417         if (atomic_read(&inode->i_writecount) > 1) {
2418                 spin_unlock(&inode->i_lock);
2419                 return -ETXTBSY;
2420         }
2421         atomic_set(&inode->i_writecount, -1);
2422         spin_unlock(&inode->i_lock);
2423
2424         return 0;
2425 }
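/*
 * (For reference: i_writecount > 0 means the file has writers, and
 * i_writecount < 0 means writing is denied, as for files being
 * executed.  We hold one reference to the file ourselves, hence the
 * check against 1 rather than 0 before parking the count at -1.)
 */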
2426
2427 static int set_bitmap_file(mddev_t *mddev, int fd)
2428 {
2429         int err;
2430
2431         if (mddev->pers)
2432                 return -EBUSY;
2433
2434         mddev->bitmap_file = fget(fd);
2435
2436         if (mddev->bitmap_file == NULL) {
2437                 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
2438                         mdname(mddev));
2439                 return -EBADF;
2440         }
2441
2442         err = deny_bitmap_write_access(mddev->bitmap_file);
2443         if (err) {
2444                 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
2445                         mdname(mddev));
2446                 fput(mddev->bitmap_file);
2447                 mddev->bitmap_file = NULL;
2448         } else
2449                 mddev->bitmap_offset = 0; /* file overrides offset */
2450         return err;
2451 }
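/*
 * User-space sketch (illustrative; the pathname is made up): the ioctl
 * argument is a plain file descriptor for the bitmap file:
 *
 *	int bfd = open("/var/lib/md0-bitmap", O_RDWR);
 *	if (bfd >= 0)
 *		ioctl(md_fd, SET_BITMAP_FILE, bfd);
 */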
2452
2453 /*
2454  * set_array_info is used in two different ways.
2455  * The original usage is when creating a new array.
2456  * In this usage, raid_disks is > 0 and it together with
2457  *  level, size, not_persistent, layout and chunksize determines the
2458  *  shape of the array.
2459  *  This will always create an array with a type-0.90.0 superblock.
2460  * The newer usage is when assembling an array.
2461  *  In this case raid_disks will be 0, and the major_version field is
2462  *  used to determine which style super-blocks are to be found on the devices.
2463  *  The minor and patch _version numbers are also kept in case the
2464  *  super_block handler wishes to interpret them.
2465  */
2466 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2467 {
2468
2469         if (info->raid_disks == 0) {
2470                 /* just setting version number for superblock loading */
2471                 if (info->major_version < 0 ||
2472                     info->major_version >= ARRAY_SIZE(super_types) ||
2473                     super_types[info->major_version].name == NULL) {
2474                         /* maybe try to auto-load a module? */
2475                         printk(KERN_INFO 
2476                                 "md: superblock version %d not known\n",
2477                                 info->major_version);
2478                         return -EINVAL;
2479                 }
2480                 mddev->major_version = info->major_version;
2481                 mddev->minor_version = info->minor_version;
2482                 mddev->patch_version = info->patch_version;
2483                 return 0;
2484         }
2485         mddev->major_version = MD_MAJOR_VERSION;
2486         mddev->minor_version = MD_MINOR_VERSION;
2487         mddev->patch_version = MD_PATCHLEVEL_VERSION;
2488         mddev->ctime         = get_seconds();
2489
2490         mddev->level         = info->level;
2491         mddev->size          = info->size;
2492         mddev->raid_disks    = info->raid_disks;
2493         /* don't set md_minor, it is determined by which /dev/md* was
2494          * opened
2495          */
2496         if (info->state & (1<<MD_SB_CLEAN))
2497                 mddev->recovery_cp = MaxSector;
2498         else
2499                 mddev->recovery_cp = 0;
2500         mddev->persistent    = ! info->not_persistent;
2501
2502         mddev->layout        = info->layout;
2503         mddev->chunk_size    = info->chunk_size;
2504
2505         mddev->max_disks     = MD_SB_DISKS;
2506
2507         mddev->sb_dirty      = 1;
2508
2509         /*
2510          * Generate a 128 bit UUID
2511          */
2512         get_random_bytes(mddev->uuid, 16);
2513
2514         return 0;
2515 }
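/*
 * Creation-usage sketch from user space (illustrative; mdadm issues
 * the equivalent).  raid_disks > 0 selects creation, so this describes
 * a two-disk RAID1 with a 0.90.0 superblock; the component devices
 * then follow via ADD_NEW_DISK and the array starts with RUN_ARRAY.
 * size == 0 is filled in from the smallest device in add_new_disk():
 *
 *	mdu_array_info_t info;
 *	memset(&info, 0, sizeof(info));
 *	info.level = 1;
 *	info.raid_disks = 2;
 *	info.size = 0;
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 */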
2516
2517 /*
2518  * update_array_info is used to change the configuration of an
2519  * on-line array.
2520  * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
2521  * fields in the info are checked against the array.
2522  * Any differences that cannot be handled will cause an error.
2523  * Normally, only one change can be managed at a time.
2524  */
2525 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2526 {
2527         int rv = 0;
2528         int cnt = 0;
2529
2530         if (mddev->major_version != info->major_version ||
2531             mddev->minor_version != info->minor_version ||
2532 /*          mddev->patch_version != info->patch_version || */
2533             mddev->ctime         != info->ctime         ||
2534             mddev->level         != info->level         ||
2535 /*          mddev->layout        != info->layout        || */
2536             !mddev->persistent   != info->not_persistent||
2537             mddev->chunk_size    != info->chunk_size    )
2538                 return -EINVAL;
2539         /* Check there is only one change */
2540         if (mddev->size != info->size) cnt++;
2541         if (mddev->raid_disks != info->raid_disks) cnt++;
2542         if (mddev->layout != info->layout) cnt++;
2543         if (cnt == 0) return 0;
2544         if (cnt > 1) return -EINVAL;
2545
2546         if (mddev->layout != info->layout) {
2547                 /* Change layout
2548                  * we don't need to do anything at the md level, the
2549                  * personality will take care of it all.
2550                  */
2551                 if (mddev->pers->reconfig == NULL)
2552                         return -EINVAL;
2553                 else
2554                         return mddev->pers->reconfig(mddev, info->layout, -1);
2555         }
2556         if (mddev->size != info->size) {
2557                 mdk_rdev_t * rdev;
2558                 struct list_head *tmp;
2559                 if (mddev->pers->resize == NULL)
2560                         return -EINVAL;
2561                 /* The "size" is the amount of each device that is used.
2562                  * This can only make sense for arrays with redundancy.
2563                  * linear and raid0 always use whatever space is available.
2564                  * We can only consider changing the size if no resync
2565                  * or reconstruction is happening, and if the new size
2566                  * is acceptable. It must fit before the sb_offset or,
2567                  * if that is <data_offset, it must fit before the
2568                  * size of each device.
2569                  * If size is zero, we find the largest size that fits.
2570                  */
2571                 if (mddev->sync_thread)
2572                         return -EBUSY;
2573                 ITERATE_RDEV(mddev,rdev,tmp) {
2574                         sector_t avail;
2575                         int fit = (info->size == 0);
2576                         if (rdev->sb_offset > rdev->data_offset)
2577                                 avail = (rdev->sb_offset*2) - rdev->data_offset;
2578                         else
2579                                 avail = get_capacity(rdev->bdev->bd_disk)
2580                                         - rdev->data_offset;
2581                         if (fit && (info->size == 0 || info->size > avail/2))
2582                                 info->size = avail/2;
2583                         if (avail < ((sector_t)info->size << 1))
2584                                 return -ENOSPC;
2585                 }
2586                 rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
2587                 if (!rv) {
2588                         struct block_device *bdev;
2589
2590                         bdev = bdget_disk(mddev->gendisk, 0);
2591                         if (bdev) {
2592                                 down(&bdev->bd_inode->i_sem);
2593                                 i_size_write(bdev->bd_inode, mddev->array_size << 10);
2594                                 up(&bdev->bd_inode->i_sem);
2595                                 bdput(bdev);
2596                         }
2597                 }
2598         }
2599         if (mddev->raid_disks    != info->raid_disks) {
2600                 /* change the number of raid disks */
2601                 if (mddev->pers->reshape == NULL)
2602                         return -EINVAL;
2603                 if (info->raid_disks <= 0 ||
2604                     info->raid_disks >= mddev->max_disks)
2605                         return -EINVAL;
2606                 if (mddev->sync_thread)
2607                         return -EBUSY;
2608                 rv = mddev->pers->reshape(mddev, info->raid_disks);
2609                 if (!rv) {
2610                         struct block_device *bdev;
2611
2612                         bdev = bdget_disk(mddev->gendisk, 0);
2613                         if (bdev) {
2614                                 down(&bdev->bd_inode->i_sem);
2615                                 i_size_write(bdev->bd_inode, mddev->array_size << 10);
2616                                 up(&bdev->bd_inode->i_sem);
2617                                 bdput(bdev);
2618                         }
2619                 }
2620         }
2621         md_update_sb(mddev);
2622         return rv;
2623 }
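/*
 * Grow-usage sketch (illustrative): read the current shape back,
 * change exactly one of size/raid_disks/layout, and write it again;
 * size == 0 asks for the largest size that fits:
 *
 *	mdu_array_info_t info;
 *	ioctl(fd, GET_ARRAY_INFO, &info);
 *	info.size = 0;
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 */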
2624
2625 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
2626 {
2627         mdk_rdev_t *rdev;
2628
2629         if (mddev->pers == NULL)
2630                 return -ENODEV;
2631
2632         rdev = find_rdev(mddev, dev);
2633         if (!rdev)
2634                 return -ENODEV;
2635
2636         md_error(mddev, rdev);
2637         return 0;
2638 }
2639
2640 static int md_ioctl(struct inode *inode, struct file *file,
2641                         unsigned int cmd, unsigned long arg)
2642 {
2643         int err = 0;
2644         void __user *argp = (void __user *)arg;
2645         struct hd_geometry __user *loc = argp;
2646         mddev_t *mddev = NULL;
2647
2648         if (!capable(CAP_SYS_ADMIN))
2649                 return -EACCES;
2650
2651         /*
2652          * Commands dealing with the RAID driver but not any
2653          * particular array:
2654          */
2655         switch (cmd)
2656         {
2657                 case RAID_VERSION:
2658                         err = get_version(argp);
2659                         goto done;
2660
2661                 case PRINT_RAID_DEBUG:
2662                         err = 0;
2663                         md_print_devices();
2664                         goto done;
2665
2666 #ifndef MODULE
2667                 case RAID_AUTORUN:
2668                         err = 0;
2669                         autostart_arrays(arg);
2670                         goto done;
2671 #endif
2672                 default:;
2673         }
2674
2675         /*
2676          * Commands creating/starting a new array:
2677          */
2678
2679         mddev = inode->i_bdev->bd_disk->private_data;
2680
2681         if (!mddev) {
2682                 BUG();
2683                 goto abort;
2684         }
2685
2686
2687         if (cmd == START_ARRAY) {
2688                 /* START_ARRAY doesn't need to lock the array as autostart_array
2689                  * does the locking, and it could even be a different array
2690                  */
2691                 static int cnt = 3;
2692                 if (cnt > 0) {
2693                         printk(KERN_WARNING
2694                                "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
2695                                "This will not be supported beyond 2.6\n",
2696                                current->comm, current->pid);
2697                         cnt--;
2698                 }
2699                 err = autostart_array(new_decode_dev(arg));
2700                 if (err) {
2701                         printk(KERN_WARNING "md: autostart failed!\n");
2702                         goto abort;
2703                 }
2704                 goto done;
2705         }
2706
2707         err = mddev_lock(mddev);
2708         if (err) {
2709                 printk(KERN_INFO 
2710                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
2711                         err, cmd);
2712                 goto abort;
2713         }
2714
2715         switch (cmd)
2716         {
2717                 case SET_ARRAY_INFO:
2718                         {
2719                                 mdu_array_info_t info;
2720                                 if (!arg)
2721                                         memset(&info, 0, sizeof(info));
2722                                 else if (copy_from_user(&info, argp, sizeof(info))) {
2723                                         err = -EFAULT;
2724                                         goto abort_unlock;
2725                                 }
2726                                 if (mddev->pers) {
2727                                         err = update_array_info(mddev, &info);
2728                                         if (err) {
2729                                                 printk(KERN_WARNING "md: couldn't update"
2730                                                        " array info. %d\n", err);
2731                                                 goto abort_unlock;
2732                                         }
2733                                         goto done_unlock;
2734                                 }
2735                                 if (!list_empty(&mddev->disks)) {
2736                                         printk(KERN_WARNING
2737                                                "md: array %s already has disks!\n",
2738                                                mdname(mddev));
2739                                         err = -EBUSY;
2740                                         goto abort_unlock;
2741                                 }
2742                                 if (mddev->raid_disks) {
2743                                         printk(KERN_WARNING
2744                                                "md: array %s already initialised!\n",
2745                                                mdname(mddev));
2746                                         err = -EBUSY;
2747                                         goto abort_unlock;
2748                                 }
2749                                 err = set_array_info(mddev, &info);
2750                                 if (err) {
2751                                         printk(KERN_WARNING "md: couldn't set"
2752                                                " array info. %d\n", err);
2753                                         goto abort_unlock;
2754                                 }
2755                         }
2756                         goto done_unlock;
2757
2758                 default:;
2759         }
2760
2761         /*
2762          * Commands querying/configuring an existing array:
2763          */
2764         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
2765          * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
2766         if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
2767                         && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
2768                 err = -ENODEV;
2769                 goto abort_unlock;
2770         }
2771
2772         /*
2773          * Commands even a read-only array can execute:
2774          */
2775         switch (cmd)
2776         {
2777                 case GET_ARRAY_INFO:
2778                         err = get_array_info(mddev, argp);
2779                         goto done_unlock;
2780
2781                 case GET_BITMAP_FILE:
2782                         err = get_bitmap_file(mddev, argp);
2783                         goto done_unlock;
2784
2785                 case GET_DISK_INFO:
2786                         err = get_disk_info(mddev, argp);
2787                         goto done_unlock;
2788
2789                 case RESTART_ARRAY_RW:
2790                         err = restart_array(mddev);
2791                         goto done_unlock;
2792
2793                 case STOP_ARRAY:
2794                         err = do_md_stop (mddev, 0);
2795                         goto done_unlock;
2796
2797                 case STOP_ARRAY_RO:
2798                         err = do_md_stop (mddev, 1);
2799                         goto done_unlock;
2800
2801         /*
2802          * We have a problem here: there is no easy way to give a CHS
2803          * virtual geometry. We currently pretend to have a 2-head,
2804          * 4-sector geometry (with a BIG number of cylinders...). This drives
2805          * dosfs just mad... ;-)
2806          */
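        /*
         * (With 2 heads x 4 sectors a "cylinder" is 8 sectors, hence
         * the get_capacity()/8 below; the result is truncated into the
         * short cylinders field for big arrays.)
         */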
2807                 case HDIO_GETGEO:
2808                         if (!loc) {
2809                                 err = -EINVAL;
2810                                 goto abort_unlock;
2811                         }
2812                         err = put_user (2, (char __user *) &loc->heads);
2813                         if (err)
2814                                 goto abort_unlock;
2815                         err = put_user (4, (char __user *) &loc->sectors);
2816                         if (err)
2817                                 goto abort_unlock;
2818                         err = put_user(get_capacity(mddev->gendisk)/8,
2819                                         (short __user *) &loc->cylinders);
2820                         if (err)
2821                                 goto abort_unlock;
2822                         err = put_user (get_start_sect(inode->i_bdev),
2823                                                 (long __user *) &loc->start);
2824                         goto done_unlock;
2825         }
2826
2827         /*
2828          * The remaining ioctls are changing the state of the
2829          * superblock, so we do not allow read-only arrays
2830          * here:
2831          */
2832         if (mddev->ro) {
2833                 err = -EROFS;
2834                 goto abort_unlock;
2835         }
2836
2837         switch (cmd)
2838         {
2839                 case ADD_NEW_DISK:
2840                 {
2841                         mdu_disk_info_t info;
2842                         if (copy_from_user(&info, argp, sizeof(info)))
2843                                 err = -EFAULT;
2844                         else
2845                                 err = add_new_disk(mddev, &info);
2846                         goto done_unlock;
2847                 }
2848
2849                 case HOT_REMOVE_DISK:
2850                         err = hot_remove_disk(mddev, new_decode_dev(arg));
2851                         goto done_unlock;
2852
2853                 case HOT_ADD_DISK:
2854                         err = hot_add_disk(mddev, new_decode_dev(arg));
2855                         goto done_unlock;
2856
2857                 case SET_DISK_FAULTY:
2858                         err = set_disk_faulty(mddev, new_decode_dev(arg));
2859                         goto done_unlock;
2860
2861                 case RUN_ARRAY:
2862                         err = do_md_run (mddev);
2863                         goto done_unlock;
2864
2865                 case SET_BITMAP_FILE:
2866                         err = set_bitmap_file(mddev, (int)arg);
2867                         goto done_unlock;
2868
2869                 default:
2870                         if (_IOC_TYPE(cmd) == MD_MAJOR)
2871                                 printk(KERN_WARNING "md: %s(pid %d) used"
2872                                         " obsolete MD ioctl, upgrade your"
2873                                         " software to use new ioctls.\n",
2874                                         current->comm, current->pid);
2875                         err = -EINVAL;
2876                         goto abort_unlock;
2877         }
2878
2879 done_unlock:
2880 abort_unlock:
2881         mddev_unlock(mddev);
2882
2883         return err;
2884 done:
2885         if (err)
2886                 MD_BUG();
2887 abort:
2888         return err;
2889 }
2890
2891 static int md_open(struct inode *inode, struct file *file)
2892 {
2893         /*
2894          * Succeed if we can lock the mddev, which confirms that
2895          * it isn't being stopped right now.
2896          */
2897         mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2898         int err;
2899
2900         if ((err = mddev_lock(mddev)))
2901                 goto out;
2902
2903         err = 0;
2904         mddev_get(mddev);
2905         mddev_unlock(mddev);
2906
2907         check_disk_change(inode->i_bdev);
2908  out:
2909         return err;
2910 }
2911
2912 static int md_release(struct inode *inode, struct file * file)
2913 {
2914         mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2915
2916         if (!mddev)
2917                 BUG();
2918         mddev_put(mddev);
2919
2920         return 0;
2921 }
2922
2923 static int md_media_changed(struct gendisk *disk)
2924 {
2925         mddev_t *mddev = disk->private_data;
2926
2927         return mddev->changed;
2928 }
2929
2930 static int md_revalidate(struct gendisk *disk)
2931 {
2932         mddev_t *mddev = disk->private_data;
2933
2934         mddev->changed = 0;
2935         return 0;
2936 }
2937 static struct block_device_operations md_fops =
2938 {
2939         .owner          = THIS_MODULE,
2940         .open           = md_open,
2941         .release        = md_release,
2942         .ioctl          = md_ioctl,
2943         .media_changed  = md_media_changed,
2944         .revalidate_disk= md_revalidate,
2945 };
2946
2947 static int md_thread(void * arg)
2948 {
2949         mdk_thread_t *thread = arg;
2950
2951         lock_kernel();
2952
2953         /*
2954          * Detach thread
2955          */
2956
2957         daemonize(thread->name, mdname(thread->mddev));
2958
2959         current->exit_signal = SIGCHLD;
2960         allow_signal(SIGKILL);
2961         thread->tsk = current;
2962
2963         /*
2964          * md_thread is a 'system-thread'; its priority should be very
2965          * high. We avoid resource deadlocks individually in each
2966          * raid personality. (RAID5 does preallocation) We also use RR and
2967          * the very same RT priority as kswapd, thus we will never get
2968          * into a priority inversion deadlock.
2969          *
2970          * we definitely have to have equal or higher priority than
2971          * bdflush, otherwise bdflush will deadlock if there are too
2972          * many dirty RAID5 blocks.
2973          */
2974         unlock_kernel();
2975
2976         complete(thread->event);
2977         while (thread->run) {
2978                 void (*run)(mddev_t *);
2979
2980                 wait_event_interruptible_timeout(thread->wqueue,
2981                                                  test_bit(THREAD_WAKEUP, &thread->flags),
2982                                                  thread->timeout);
2983                 if (current->flags & PF_FREEZE)
2984                         refrigerator(PF_FREEZE);
2985
2986                 clear_bit(THREAD_WAKEUP, &thread->flags);
2987
2988                 run = thread->run;
2989                 if (run)
2990                         run(thread->mddev);
2991
2992                 if (signal_pending(current))
2993                         flush_signals(current);
2994         }
2995         complete(thread->event);
2996         return 0;
2997 }
2998
2999 void md_wakeup_thread(mdk_thread_t *thread)
3000 {
3001         if (thread) {
3002                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
3003                 set_bit(THREAD_WAKEUP, &thread->flags);
3004                 wake_up(&thread->wqueue);
3005         }
3006 }
3007
3008 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
3009                                  const char *name)
3010 {
3011         mdk_thread_t *thread;
3012         int ret;
3013         struct completion event;
3014
3015         thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
3017         if (!thread)
3018                 return NULL;
3019
3020         memset(thread, 0, sizeof(mdk_thread_t));
3021         init_waitqueue_head(&thread->wqueue);
3022
3023         init_completion(&event);
3024         thread->event = &event;
3025         thread->run = run;
3026         thread->mddev = mddev;
3027         thread->name = name;
3028         thread->timeout = MAX_SCHEDULE_TIMEOUT;
3029         ret = kernel_thread(md_thread, thread, 0);
3030         if (ret < 0) {
3031                 kfree(thread);
3032                 return NULL;
3033         }
3034         wait_for_completion(&event);
3035         return thread;
3036 }
3037
3038 void md_unregister_thread(mdk_thread_t *thread)
3039 {
3040         struct completion event;
3041
3042         init_completion(&event);
3043
3044         thread->event = &event;
3045
3046         /* As soon as ->run is set to NULL, the task could disappear,
3047          * so we need to hold tasklist_lock until we have sent the signal
3048          */
3049         dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
3050         read_lock(&tasklist_lock);
3051         thread->run = NULL;
3052         send_sig(SIGKILL, thread->tsk, 1);
3053         read_unlock(&tasklist_lock);
3054         wait_for_completion(&event);
3055         kfree(thread);
3056 }
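
/*
 * Illustrative sketch only (not part of this driver): a hypothetical
 * personality "myraid" could manage its daemon thread with the API
 * above; all myraid_* names are invented and error paths are elided.
 *
 *	static void myraidd(mddev_t *mddev)
 *	{
 *		// entered whenever THREAD_WAKEUP is set or ->timeout expires
 *		md_check_recovery(mddev);
 *	}
 *
 *	// in the personality's run() method:
 *	mddev->thread = md_register_thread(myraidd, mddev, "%s_myraid");
 *	if (!mddev->thread)
 *		return -ENOMEM;
 *
 *	// after queueing work for the daemon:
 *	md_wakeup_thread(mddev->thread);
 *
 *	// in the personality's stop() method:
 *	md_unregister_thread(mddev->thread);
 *	mddev->thread = NULL;
 */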
3057
3058 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
3059 {
3060         if (!mddev) {
3061                 MD_BUG();
3062                 return;
3063         }
3064
3065         if (!rdev || rdev->faulty)
3066                 return;
3067 /*
3068         dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
3069                 mdname(mddev),
3070                 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
3071                 __builtin_return_address(0),__builtin_return_address(1),
3072                 __builtin_return_address(2),__builtin_return_address(3));
3073 */
3074         if (!mddev->pers->error_handler)
3075                 return;
3076         mddev->pers->error_handler(mddev,rdev);
3077         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3078         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3079         md_wakeup_thread(mddev->thread);
3080 }
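
/*
 * Sketch (not from this file): a personality typically calls md_error()
 * from a bio completion handler when a member device fails; the names
 * below are invented for illustration.
 *
 *	static int myraid_end_write_request(struct bio *bio,
 *					    unsigned int bytes_done, int error)
 *	{
 *		...
 *		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 *			md_error(mddev, rdev);	// mark device faulty, kick recovery
 *		...
 *	}
 */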
3081
3082 /* seq_file implementation /proc/mdstat */
3083
3084 static void status_unused(struct seq_file *seq)
3085 {
3086         int i = 0;
3087         mdk_rdev_t *rdev;
3088         struct list_head *tmp;
3089
3090         seq_printf(seq, "unused devices: ");
3091
3092         ITERATE_RDEV_PENDING(rdev,tmp) {
3093                 char b[BDEVNAME_SIZE];
3094                 i++;
3095                 seq_printf(seq, "%s ",
3096                               bdevname(rdev->bdev,b));
3097         }
3098         if (!i)
3099                 seq_printf(seq, "<none>");
3100
3101         seq_printf(seq, "\n");
3102 }
3103
3104
3105 static void status_resync(struct seq_file *seq, mddev_t * mddev)
3106 {
3107         unsigned long max_blocks, resync, res, dt, db, rt;
3108
3109         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
3110
3111         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3112                 max_blocks = mddev->resync_max_sectors >> 1;
3113         else
3114                 max_blocks = mddev->size;
3115
3116         /*
3117          * Should not happen.
3118          */
3119         if (!max_blocks) {
3120                 MD_BUG();
3121                 return;
3122         }
3123         res = (resync/1024)*1000/(max_blocks/1024 + 1);
3124         {
3125                 int i, x = res/50, y = 20-x;
3126                 seq_printf(seq, "[");
3127                 for (i = 0; i < x; i++)
3128                         seq_printf(seq, "=");
3129                 seq_printf(seq, ">");
3130                 for (i = 0; i < y; i++)
3131                         seq_printf(seq, ".");
3132                 seq_printf(seq, "] ");
3133         }
3134         seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
3135                       (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
3136                        "resync" : "recovery"),
3137                       res/10, res % 10, resync, max_blocks);
3138
3139         /*
3140          * We do not want to overflow, so the order of operands and
3141          * the * 100 / 100 trick are important. We do a +1 to be
3142          * safe against division by zero. We only estimate anyway.
3143          *
3144          * dt: time from mark until now
3145          * db: blocks written from mark until now
3146          * rt: remaining time
3147          */
3148         dt = ((jiffies - mddev->resync_mark) / HZ);
3149         if (!dt) dt++;
3150         db = resync - (mddev->resync_mark_cnt/2);
3151         rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3152
3153         seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3154
3155         seq_printf(seq, " speed=%ldK/sec", db/dt);
3156 }
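
/*
 * For reference, the fragment emitted above renders in /proc/mdstat
 * roughly as follows (numbers invented):
 *
 *	[=====>...............]  resync = 27.1% (283264/1044160) finish=3.8min speed=12032K/sec
 */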
3157
3158 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
3159 {
3160         struct list_head *tmp;
3161         loff_t l = *pos;
3162         mddev_t *mddev;
3163
3164         if (l >= 0x10000)
3165                 return NULL;
3166         if (!l--)
3167                 /* header */
3168                 return (void*)1;
3169
3170         spin_lock(&all_mddevs_lock);
3171         list_for_each(tmp,&all_mddevs)
3172                 if (!l--) {
3173                         mddev = list_entry(tmp, mddev_t, all_mddevs);
3174                         mddev_get(mddev);
3175                         spin_unlock(&all_mddevs_lock);
3176                         return mddev;
3177                 }
3178         spin_unlock(&all_mddevs_lock);
3179         if (!l--)
3180                 return (void*)2;/* tail */
3181         return NULL;
3182 }
3183
3184 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3185 {
3186         struct list_head *tmp;
3187         mddev_t *next_mddev, *mddev = v;
3188         
3189         ++*pos;
3190         if (v == (void*)2)
3191                 return NULL;
3192
3193         spin_lock(&all_mddevs_lock);
3194         if (v == (void*)1)
3195                 tmp = all_mddevs.next;
3196         else
3197                 tmp = mddev->all_mddevs.next;
3198         if (tmp != &all_mddevs)
3199                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
3200         else {
3201                 next_mddev = (void*)2;
3202                 *pos = 0x10000;
3203         }               
3204         spin_unlock(&all_mddevs_lock);
3205
3206         if (v != (void*)1)
3207                 mddev_put(mddev);
3208         return next_mddev;
3209
3210 }
3211
3212 static void md_seq_stop(struct seq_file *seq, void *v)
3213 {
3214         mddev_t *mddev = v;
3215
3216         if (mddev && v != (void*)1 && v != (void*)2)
3217                 mddev_put(mddev);
3218 }
3219
3220 static int md_seq_show(struct seq_file *seq, void *v)
3221 {
3222         mddev_t *mddev = v;
3223         sector_t size;
3224         struct list_head *tmp2;
3225         mdk_rdev_t *rdev;
3226         int i;
3227         struct bitmap *bitmap;
3228
3229         if (v == (void*)1) {
3230                 seq_printf(seq, "Personalities : ");
3231                 spin_lock(&pers_lock);
3232                 for (i = 0; i < MAX_PERSONALITY; i++)
3233                         if (pers[i])
3234                                 seq_printf(seq, "[%s] ", pers[i]->name);
3235
3236                 spin_unlock(&pers_lock);
3237                 seq_printf(seq, "\n");
3238                 return 0;
3239         }
3240         if (v == (void*)2) {
3241                 status_unused(seq);
3242                 return 0;
3243         }
3244
3245         if (mddev_lock(mddev)!=0) 
3246                 return -EINTR;
3247         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
3248                 seq_printf(seq, "%s : %sactive", mdname(mddev),
3249                                                 mddev->pers ? "" : "in");
3250                 if (mddev->pers) {
3251                         if (mddev->ro)
3252                                 seq_printf(seq, " (read-only)");
3253                         seq_printf(seq, " %s", mddev->pers->name);
3254                 }
3255
3256                 size = 0;
3257                 ITERATE_RDEV(mddev,rdev,tmp2) {
3258                         char b[BDEVNAME_SIZE];
3259                         seq_printf(seq, " %s[%d]",
3260                                 bdevname(rdev->bdev,b), rdev->desc_nr);
3261                         if (rdev->faulty) {
3262                                 seq_printf(seq, "(F)");
3263                                 continue;
3264                         }
3265                         size += rdev->size;
3266                 }
3267
3268                 if (!list_empty(&mddev->disks)) {
3269                         if (mddev->pers)
3270                                 seq_printf(seq, "\n      %llu blocks",
3271                                         (unsigned long long)mddev->array_size);
3272                         else
3273                                 seq_printf(seq, "\n      %llu blocks",
3274                                         (unsigned long long)size);
3275                 }
3276
3277                 if (mddev->pers) {
3278                         mddev->pers->status (seq, mddev);
3279                         seq_printf(seq, "\n      ");
3280                         if (mddev->curr_resync > 2) {
3281                                 status_resync (seq, mddev);
3282                                 seq_printf(seq, "\n      ");
3283                         } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3284                                 seq_printf(seq, "       resync=DELAYED\n      ");
3285                 } else
3286                         seq_printf(seq, "\n       ");
3287
3288                 if ((bitmap = mddev->bitmap)) {
3289                         unsigned long chunk_kb;
3290                         unsigned long flags;
3291                         spin_lock_irqsave(&bitmap->lock, flags);
3292                         chunk_kb = bitmap->chunksize >> 10;
3293                         seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
3294                                 "%lu%s chunk",
3295                                 bitmap->pages - bitmap->missing_pages,
3296                                 bitmap->pages,
3297                                 (bitmap->pages - bitmap->missing_pages)
3298                                         << (PAGE_SHIFT - 10),
3299                                 chunk_kb ? chunk_kb : bitmap->chunksize,
3300                                 chunk_kb ? "KB" : "B");
3301                         if (bitmap->file) {
3302                                 seq_printf(seq, ", file: ");
3303                                 seq_path(seq, bitmap->file->f_vfsmnt,
3304                                          bitmap->file->f_dentry," \t\n");
3305                         }
3306
3307                         seq_printf(seq, "\n");
3308                         spin_unlock_irqrestore(&bitmap->lock, flags);
3309                 }
3310
3311                 seq_printf(seq, "\n");
3312         }
3313         mddev_unlock(mddev);
3314         
3315         return 0;
3316 }
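
/*
 * Taken together, the seq_file callbacks above produce /proc/mdstat
 * output along these lines (an invented single-array example):
 *
 *	Personalities : [raid1]
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1044160 blocks [2/2] [UU]
 *	      bitmap: 3/128 pages [12KB], 256KB chunk
 *
 *	unused devices: <none>
 *
 * The "[2/2] [UU]" detail comes from the personality's ->status method,
 * not from this file.
 */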
3317
3318 static struct seq_operations md_seq_ops = {
3319         .start  = md_seq_start,
3320         .next   = md_seq_next,
3321         .stop   = md_seq_stop,
3322         .show   = md_seq_show,
3323 };
3324
3325 static int md_seq_open(struct inode *inode, struct file *file)
3326 {
3327         return seq_open(file, &md_seq_ops);
3331 }
3332
3333 static struct file_operations md_seq_fops = {
3334         .open           = md_seq_open,
3335         .read           = seq_read,
3336         .llseek         = seq_lseek,
3337         .release        = seq_release,
3338 };
3339
3340 int register_md_personality(int pnum, mdk_personality_t *p)
3341 {
3342         if (pnum >= MAX_PERSONALITY) {
3343                 printk(KERN_ERR
3344                        "md: tried to install personality %s as nr %d, but max is %lu\n",
3345                        p->name, pnum, MAX_PERSONALITY-1);
3346                 return -EINVAL;
3347         }
3348
3349         spin_lock(&pers_lock);
3350         if (pers[pnum]) {
3351                 spin_unlock(&pers_lock);
3352                 return -EBUSY;
3353         }
3354
3355         pers[pnum] = p;
3356         printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3357         spin_unlock(&pers_lock);
3358         return 0;
3359 }
3360
3361 int unregister_md_personality(int pnum)
3362 {
3363         if (pnum >= MAX_PERSONALITY)
3364                 return -EINVAL;
3365
3366         printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3367         spin_lock(&pers_lock);
3368         pers[pnum] = NULL;
3369         spin_unlock(&pers_lock);
3370         return 0;
3371 }
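
/*
 * Typical usage from a personality module (sketched; the real code lives
 * in the personality itself, e.g. raid1.c):
 *
 *	static int __init raid_init(void)
 *	{
 *		return register_md_personality(RAID1, &raid1_personality);
 *	}
 *
 *	static void raid_exit(void)
 *	{
 *		unregister_md_personality(RAID1);
 *	}
 */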
3372
3373 static int is_mddev_idle(mddev_t *mddev)
3374 {
3375         mdk_rdev_t * rdev;
3376         struct list_head *tmp;
3377         int idle;
3378         unsigned long curr_events;
3379
3380         idle = 1;
3381         ITERATE_RDEV(mddev,rdev,tmp) {
3382                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
3383                 curr_events = disk_stat_read(disk, read_sectors) + 
3384                                 disk_stat_read(disk, write_sectors) - 
3385                                 atomic_read(&disk->sync_io);
3386                 /* Allow some slack between the value of curr_events and last_events,
3387                  * as there are some uninteresting races.  Note: the following is an
3388                  * unsigned comparison; only a difference within [-32, 32] counts as
3389                  * idle (a "negative" difference wraps to a large unsigned value). */
3390                 if ((curr_events - rdev->last_events + 32) > 64) {
3391                         rdev->last_events = curr_events;
3392                         idle = 0;
3393                 }
3394         }
3395         return idle;
3396 }
3397
3398 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3399 {
3400         /* another "blocks" (512byte) blocks have been synced */
3401         atomic_sub(blocks, &mddev->recovery_active);
3402         wake_up(&mddev->recovery_wait);
3403         if (!ok) {
3404                 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3405                 md_wakeup_thread(mddev->thread);
3406                 // stop recovery, signal do_sync ....
3407         }
3408 }
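
/*
 * Sketch: a personality pairs each chunk of sync I/O it issues with an
 * md_done_sync() call from its completion path (names invented, details
 * elided):
 *
 *	static int myraid_end_sync_request(struct bio *bio,
 *					   unsigned int bytes_done, int error)
 *	{
 *		int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 *		...
 *		md_done_sync(mddev, nr_sectors, uptodate);	// nr_sectors just synced
 *		...
 *	}
 */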
3409
3410
3411 /* md_write_start(mddev, bi)
3412  * If we need to update some array metadata (e.g. 'active' flag
3413  * in superblock) before writing, schedule a superblock update
3414  * and wait for it to complete.
3415  */
3416 void md_write_start(mddev_t *mddev, struct bio *bi)
3417 {
3418         DEFINE_WAIT(w);
3419         if (bio_data_dir(bi) != WRITE)
3420                 return;
3421
3422         atomic_inc(&mddev->writes_pending);
3423         if (mddev->in_sync) {
3424                 spin_lock(&mddev->write_lock);
3425                 if (mddev->in_sync) {
3426                         mddev->in_sync = 0;
3427                         mddev->sb_dirty = 1;
3428                         md_wakeup_thread(mddev->thread);
3429                 }
3430                 spin_unlock(&mddev->write_lock);
3431         }
3432         wait_event(mddev->sb_wait, mddev->sb_dirty==0);
3433 }
3434
3435 void md_write_end(mddev_t *mddev)
3436 {
3437         if (atomic_dec_and_test(&mddev->writes_pending)) {
3438                 if (mddev->safemode == 2)
3439                         md_wakeup_thread(mddev->thread);
3440                 else
3441                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
3442         }
3443 }
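
/*
 * Sketch of the expected pairing in a personality's write path
 * (illustrative only; error handling elided):
 *
 *	static int myraid_make_request(request_queue_t *q, struct bio *bio)
 *	{
 *		mddev_t *mddev = q->queuedata;
 *
 *		md_write_start(mddev, bio);	// may block while the sb is marked dirty
 *		...				// map and submit the bio
 *		return 0;
 *	}
 *
 * with md_write_end() called once the write completes, so the array can
 * drop back to 'in_sync' after safemode_delay.
 */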
3444
3445 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3446
3447 #define SYNC_MARKS      10
3448 #define SYNC_MARK_STEP  (3*HZ)
3449 static void md_do_sync(mddev_t *mddev)
3450 {
3451         mddev_t *mddev2;
3452         unsigned int currspeed = 0,
3453                  window;
3454         sector_t max_sectors,j, io_sectors;
3455         unsigned long mark[SYNC_MARKS];
3456         sector_t mark_cnt[SYNC_MARKS];
3457         int last_mark,m;
3458         struct list_head *tmp;
3459         sector_t last_check;
3460         int skipped = 0;
3461
3462         /* just in case the thread restarts... */
3463         if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
3464                 return;
3465
3466         /* we overload curr_resync somewhat here.
3467          * 0 == not engaged in resync at all
3468          * 2 == checking that there is no conflict with another sync
3469          * 1 == like 2, but have yielded to allow a conflicting resync to
3470          *              commence
3471          * other == active in resync - this many blocks
3472          *
3473          * Before starting a resync we must have set curr_resync to
3474          * 2, and then checked that every "conflicting" array has curr_resync
3475          * less than ours.  When we find one that is the same or higher
3476          * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
3477          * to 1 if we choose to yield (arbitrarily, based on the address of the mddev structure).
3478          * This will mean we have to start checking from the beginning again.
3479          *
3480          */
3481
3482         do {
3483                 mddev->curr_resync = 2;
3484
3485         try_again:
3486                 if (signal_pending(current)) {
3487                         flush_signals(current);
3488                         goto skip;
3489                 }
3490                 ITERATE_MDDEV(mddev2,tmp) {
3491                         printk(".");
3492                         if (mddev2 == mddev)
3493                                 continue;
3494                         if (mddev2->curr_resync && 
3495                             match_mddev_units(mddev,mddev2)) {
3496                                 DEFINE_WAIT(wq);
3497                                 if (mddev < mddev2 && mddev->curr_resync == 2) {
3498                                         /* arbitrarily yield */
3499                                         mddev->curr_resync = 1;
3500                                         wake_up(&resync_wait);
3501                                 }
3502                                 if (mddev > mddev2 && mddev->curr_resync == 1)
3503                                         /* no need to wait here, we can wait the next
3504                                          * time 'round when curr_resync == 2
3505                                          */
3506                                         continue;
3507                                 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
3508                                 if (!signal_pending(current)
3509                                     && mddev2->curr_resync >= mddev->curr_resync) {
3510                                         printk(KERN_INFO "md: delaying resync of %s"
3511                                                " until %s has finished resync (they"
3512                                                " share one or more physical units)\n",
3513                                                mdname(mddev), mdname(mddev2));
3514                                         mddev_put(mddev2);
3515                                         schedule();
3516                                         finish_wait(&resync_wait, &wq);
3517                                         goto try_again;
3518                                 }
3519                                 finish_wait(&resync_wait, &wq);
3520                         }
3521                 }
3522         } while (mddev->curr_resync < 2);
3523
3524         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3525                 /* resync follows the size requested by the personality,
3526                  * which defaults to physical size, but can be virtual size
3527                  */
3528                 max_sectors = mddev->resync_max_sectors;
3529         else
3530                 /* recovery follows the physical size of devices */
3531                 max_sectors = mddev->size << 1;
3532
3533         printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
3534         printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
3535                 " %d KB/sec/disc.\n", sysctl_speed_limit_min);
3536         printk(KERN_INFO "md: using maximum available idle IO bandwith "
3537                "(but not more than %d KB/sec) for reconstruction.\n",
3538                sysctl_speed_limit_max);
3539
3540         is_mddev_idle(mddev); /* this also initializes IO event counters */
3541         /* we don't use the checkpoint if there's a bitmap */
3542         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
3543                 j = mddev->recovery_cp;
3544         else
3545                 j = 0;
3546         io_sectors = 0;
3547         for (m = 0; m < SYNC_MARKS; m++) {
3548                 mark[m] = jiffies;
3549                 mark_cnt[m] = io_sectors;
3550         }
3551         last_mark = 0;
3552         mddev->resync_mark = mark[last_mark];
3553         mddev->resync_mark_cnt = mark_cnt[last_mark];
3554
3555         /*
3556          * Tune reconstruction:
3557          */
3558         window = 32*(PAGE_SIZE/512);
3559         printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
3560                 window/2,(unsigned long long) max_sectors/2);
3561
3562         atomic_set(&mddev->recovery_active, 0);
3563         init_waitqueue_head(&mddev->recovery_wait);
3564         last_check = 0;
3565
3566         if (j>2) {
3567                 printk(KERN_INFO 
3568                         "md: resuming recovery of %s from checkpoint.\n",
3569                         mdname(mddev));
3570                 mddev->curr_resync = j;
3571         }
3572
3573         while (j < max_sectors) {
3574                 sector_t sectors;
3575
3576                 skipped = 0;
3577                 sectors = mddev->pers->sync_request(mddev, j, &skipped,
3578                                             currspeed < sysctl_speed_limit_min);
3579                 if (sectors == 0) {
3580                         set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3581                         goto out;
3582                 }
3583
3584                 if (!skipped) { /* actual IO requested */
3585                         io_sectors += sectors;
3586                         atomic_add(sectors, &mddev->recovery_active);
3587                 }
3588
3589                 j += sectors;
3590                 if (j>1) mddev->curr_resync = j;
3591
3592
3593                 if (last_check + window > io_sectors || j == max_sectors)
3594                         continue;
3595
3596                 last_check = io_sectors;
3597
3598                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
3599                     test_bit(MD_RECOVERY_ERR, &mddev->recovery))
3600                         break;
3601
3602         repeat:
3603                 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
3604                         /* step marks */
3605                         int next = (last_mark+1) % SYNC_MARKS;
3606
3607                         mddev->resync_mark = mark[next];
3608                         mddev->resync_mark_cnt = mark_cnt[next];
3609                         mark[next] = jiffies;
3610                         mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
3611                         last_mark = next;
3612                 }
3613
3614
3615                 if (signal_pending(current)) {
3616                         /*
3617                          * got a signal, exit.
3618                          */
3619                         printk(KERN_INFO 
3620                                 "md: md_do_sync() got signal ... exiting\n");
3621                         flush_signals(current);
3622                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3623                         goto out;
3624                 }
3625
3626                 /*
3627                  * this loop exits only when we are slower than
3628                  * the 'hard' speed limit, or the system was IO-idle for
3629                  * a jiffy.
3630                  * the system might be non-idle CPU-wise, but we only care
3631                  * about not overloading the IO subsystem. (things like an
3632                  * e2fsck being done on the RAID array should execute fast)
3633                  */
3634                 mddev->queue->unplug_fn(mddev->queue);
3635                 cond_resched();
3636
3637                 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
3638                         /((jiffies-mddev->resync_mark)/HZ +1) +1;
3639
3640                 if (currspeed > sysctl_speed_limit_min) {
3641                         if ((currspeed > sysctl_speed_limit_max) ||
3642                                         !is_mddev_idle(mddev)) {
3643                                 msleep_interruptible(250);
3644                                 goto repeat;
3645                         }
3646                 }
3647         }
3648         printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
3649         /*
3650          * this also signals 'finished resyncing' to md_stop
3651          */
3652  out:
3653         mddev->queue->unplug_fn(mddev->queue);
3654
3655         wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
3656
3657         /* tell personality that we are finished */
3658         mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
3659
3660         if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3661             mddev->curr_resync > 2 &&
3662             mddev->curr_resync >= mddev->recovery_cp) {
3663                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3664                         printk(KERN_INFO 
3665                                 "md: checkpointing recovery of %s.\n",
3666                                 mdname(mddev));
3667                         mddev->recovery_cp = mddev->curr_resync;
3668                 } else
3669                         mddev->recovery_cp = MaxSector;
3670         }
3671
3672  skip:
3673         mddev->curr_resync = 0;
3674         wake_up(&resync_wait);
3675         set_bit(MD_RECOVERY_DONE, &mddev->recovery);
3676         md_wakeup_thread(mddev->thread);
3677 }
3678
3679
3680 /*
3681  * This routine is regularly called by all per-raid-array threads to
3682  * deal with generic issues like resync and super-block update.
3683  * Raid personalities that don't have a thread (linear/raid0) do not
3684  * need this as they never do any recovery or update the superblock.
3685  *
3686  * It does not do any resync itself, but rather "forks" off other threads
3687  * to do that as needed.
3688  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
3689  * "->recovery" and create a thread at ->sync_thread.
3690  * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
3691  * and wakes up this thread, which will reap it and finish up.
3692  * This thread also removes any faulty devices (with nr_pending == 0).
3693  *
3694  * The overall approach is:
3695  *  1/ if the superblock needs updating, update it.
3696  *  2/ If a recovery thread is running, don't do anything else.
3697  *  3/ If recovery has finished, clean up, possibly marking spares active.
3698  *  4/ If there are any faulty devices, remove them.
3699  *  5/ If the array is degraded, try to add spare devices
3700  *  6/ If array has spares or is not in-sync, start a resync thread.
3701  */
3702 void md_check_recovery(mddev_t *mddev)
3703 {
3704         mdk_rdev_t *rdev;
3705         struct list_head *rtmp;
3706
3707
3708         if (mddev->bitmap)
3709                 bitmap_daemon_work(mddev->bitmap);
3710
3711         if (mddev->ro)
3712                 return;
3713
3714         if (signal_pending(current)) {
3715                 if (mddev->pers->sync_request) {
3716                         printk(KERN_INFO "md: %s in immediate safe mode\n",
3717                                mdname(mddev));
3718                         mddev->safemode = 2;
3719                 }
3720                 flush_signals(current);
3721         }
3722
3723         if ( ! (
3724                 mddev->sb_dirty ||
3725                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
3726                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
3727                 (mddev->safemode == 1) ||
3728                 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
3729                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
3730                 ))
3731                 return;
3732
3733         if (mddev_trylock(mddev)==0) {
3734                 int spares =0;
3735
3736                 spin_lock(&mddev->write_lock);
3737                 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3738                     !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3739                         mddev->in_sync = 1;
3740                         mddev->sb_dirty = 1;
3741                 }
3742                 if (mddev->safemode == 1)
3743                         mddev->safemode = 0;
3744                 spin_unlock(&mddev->write_lock);
3745
3746                 if (mddev->sb_dirty)
3747                         md_update_sb(mddev);
3748
3749
3750                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
3751                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
3752                         /* resync/recovery still happening */
3753                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3754                         goto unlock;
3755                 }
3756                 if (mddev->sync_thread) {
3757                         /* resync has finished, collect result */
3758                         md_unregister_thread(mddev->sync_thread);
3759                         mddev->sync_thread = NULL;
3760                         if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3761                             !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3762                                 /* success...*/
3763                                 /* activate any spares */
3764                                 mddev->pers->spare_active(mddev);
3765                         }
3766                         md_update_sb(mddev);
3767
3768                         /* if the array is no longer degraded, then any saved_raid_disk
3769                          * information must be scrapped
3770                          */
3771                         if (!mddev->degraded)
3772                                 ITERATE_RDEV(mddev,rdev,rtmp)
3773                                         rdev->saved_raid_disk = -1;
3774
3775                         mddev->recovery = 0;
3776                         /* flag recovery needed just to double check */
3777                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3778                         goto unlock;
3779                 }
3780                 if (mddev->recovery)
3781                         /* probably just the RECOVERY_NEEDED flag */
3782                         mddev->recovery = 0;
3783
3784                 /* no recovery is running.
3785                  * remove any failed drives, then
3786                  * add spares if possible.
3787                  * Spares are also removed and re-added, to allow
3788                  * the personality to fail the re-add.
3789                  */
3790                 ITERATE_RDEV(mddev,rdev,rtmp)
3791                         if (rdev->raid_disk >= 0 &&
3792                             (rdev->faulty || ! rdev->in_sync) &&
3793                             atomic_read(&rdev->nr_pending)==0) {
3794                                 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
3795                                         rdev->raid_disk = -1;
3796                         }
3797
3798                 if (mddev->degraded) {
3799                         ITERATE_RDEV(mddev,rdev,rtmp)
3800                                 if (rdev->raid_disk < 0
3801                                     && !rdev->faulty) {
3802                                         if (mddev->pers->hot_add_disk(mddev,rdev))
3803                                                 spares++;
3804                                         else
3805                                                 break;
3806                                 }
3807                 }
3808
3809                 if (!spares && (mddev->recovery_cp == MaxSector )) {
3810                         /* nothing we can do ... */
3811                         goto unlock;
3812                 }
3813                 if (mddev->pers->sync_request) {
3814                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3815                         if (!spares)
3816                                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3817                         if (spares && mddev->bitmap && ! mddev->bitmap->file) {
3818                                 /* We are adding a device or devices to an array
3819                                  * which has the bitmap stored on all devices.
3820                                  * So make sure all bitmap pages get written
3821                                  */
3822                                 bitmap_write_all(mddev->bitmap);
3823                         }
3824                         mddev->sync_thread = md_register_thread(md_do_sync,
3825                                                                 mddev,
3826                                                                 "%s_resync");
3827                         if (!mddev->sync_thread) {
3828                                 printk(KERN_ERR "%s: could not start resync"
3829                                         " thread...\n", 
3830                                         mdname(mddev));
3831                                 /* leave the spares where they are, it shouldn't hurt */
3832                                 mddev->recovery = 0;
3833                         } else {
3834                                 md_wakeup_thread(mddev->sync_thread);
3835                         }
3836                 }
3837         unlock:
3838                 mddev_unlock(mddev);
3839         }
3840 }
3841
3842 static int md_notify_reboot(struct notifier_block *this,
3843                             unsigned long code, void *x)
3844 {
3845         struct list_head *tmp;
3846         mddev_t *mddev;
3847
3848         if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
3849
3850                 printk(KERN_INFO "md: stopping all md devices.\n");
3851
3852                 ITERATE_MDDEV(mddev,tmp)
3853                         if (mddev_trylock(mddev)==0)
3854                                 do_md_stop (mddev, 1);
3855                 /*
3856                  * certain more exotic SCSI devices are known to be
3857                  * volatile wrt too early system reboots. While the
3858                  * right place to handle this issue is the given
3859                  * driver, we do want to have a safe RAID driver ...
3860                  */
3861                 mdelay(1000*1);
3862         }
3863         return NOTIFY_DONE;
3864 }
3865
3866 static struct notifier_block md_notifier = {
3867         .notifier_call  = md_notify_reboot,
3868         .next           = NULL,
3869         .priority       = INT_MAX, /* before any real devices */
3870 };
3871
3872 static void md_geninit(void)
3873 {
3874         struct proc_dir_entry *p;
3875
3876         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3877
3878         p = create_proc_entry("mdstat", S_IRUGO, NULL);
3879         if (p)
3880                 p->proc_fops = &md_seq_fops;
3881 }
3882
3883 static int __init md_init(void)
3884 {
3885         int minor;
3886
3887         printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
3888                         " MD_SB_DISKS=%d\n",
3889                         MD_MAJOR_VERSION, MD_MINOR_VERSION,
3890                         MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3891         printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
3892                         BITMAP_MINOR);
3893
3894         if (register_blkdev(MAJOR_NR, "md"))
3895                 return -1;
3896         if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
3897                 unregister_blkdev(MAJOR_NR, "md");
3898                 return -1;
3899         }
3900         devfs_mk_dir("md");
3901         blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
3902                                 md_probe, NULL, NULL);
3903         blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
3904                             md_probe, NULL, NULL);
3905
3906         for (minor=0; minor < MAX_MD_DEVS; ++minor)
3907                 devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
3908                                 S_IFBLK|S_IRUSR|S_IWUSR,
3909                                 "md/%d", minor);
3910
3911         for (minor=0; minor < MAX_MD_DEVS; ++minor)
3912                 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
3913                               S_IFBLK|S_IRUSR|S_IWUSR,
3914                               "md/mdp%d", minor);
3915
3916
3917         register_reboot_notifier(&md_notifier);
3918         raid_table_header = register_sysctl_table(raid_root_table, 1);
3919
3920         md_geninit();
3921         return 0;
3922 }
3923
3924
3925 #ifndef MODULE
3926
3927 /*
3928  * Searches all registered partitions for autorun RAID arrays
3929  * at boot time.
3930  */
3931 static dev_t detected_devices[128];
3932 static int dev_cnt;
3933
3934 void md_autodetect_dev(dev_t dev)
3935 {
3936         if (dev_cnt >= 0 && dev_cnt < 127)
3937                 detected_devices[dev_cnt++] = dev;
3938 }
3939
3940
3941 static void autostart_arrays(int part)
3942 {
3943         mdk_rdev_t *rdev;
3944         int i;
3945
3946         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
3947
3948         for (i = 0; i < dev_cnt; i++) {
3949                 dev_t dev = detected_devices[i];
3950
3951                 rdev = md_import_device(dev,0, 0);
3952                 if (IS_ERR(rdev))
3953                         continue;
3954
3955                 if (rdev->faulty) {
3956                         MD_BUG();
3957                         continue;
3958                 }
3959                 list_add(&rdev->same_set, &pending_raid_disks);
3960         }
3961         dev_cnt = 0;
3962
3963         autorun_devices(part);
3964 }
3965
3966 #endif
3967
3968 static __exit void md_exit(void)
3969 {
3970         mddev_t *mddev;
3971         struct list_head *tmp;
3972         int i;
3973         blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
3974         blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
3975         for (i=0; i < MAX_MD_DEVS; i++)
3976                 devfs_remove("md/%d", i);
3977         for (i=0; i < MAX_MD_DEVS; i++)
3978                 devfs_remove("md/d%d", i);
3979
3980         devfs_remove("md");
3981
3982         unregister_blkdev(MAJOR_NR,"md");
3983         unregister_blkdev(mdp_major, "mdp");
3984         unregister_reboot_notifier(&md_notifier);
3985         unregister_sysctl_table(raid_table_header);
3986         remove_proc_entry("mdstat", NULL);
3987         ITERATE_MDDEV(mddev,tmp) {
3988                 struct gendisk *disk = mddev->gendisk;
3989                 if (!disk)
3990                         continue;
3991                 export_array(mddev);
3992                 del_gendisk(disk);
3993                 put_disk(disk);
3994                 mddev->gendisk = NULL;
3995                 mddev_put(mddev);
3996         }
3997 }
3998
3999 module_init(md_init)
4000 module_exit(md_exit)
4001
4002 EXPORT_SYMBOL(register_md_personality);
4003 EXPORT_SYMBOL(unregister_md_personality);
4004 EXPORT_SYMBOL(md_error);
4005 EXPORT_SYMBOL(md_done_sync);
4006 EXPORT_SYMBOL(md_write_start);
4007 EXPORT_SYMBOL(md_write_end);
4008 EXPORT_SYMBOL(md_register_thread);
4009 EXPORT_SYMBOL(md_unregister_thread);
4010 EXPORT_SYMBOL(md_wakeup_thread);
4011 EXPORT_SYMBOL(md_print_devices);
4012 EXPORT_SYMBOL(md_check_recovery);
4013 MODULE_LICENSE("GPL");