md: enable suspend/resume of md devices.
[safe/jmp/linux-2.6] / drivers / md / md.c
1 /*
2    md.c : Multiple Devices driver for Linux
3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/buffer_head.h> /* for invalidate_bdev */
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/hdreg.h>
43 #include <linux/proc_fs.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/file.h>
47 #include <linux/delay.h>
48 #include <linux/raid/md_p.h>
49 #include <linux/raid/md_u.h>
50 #include "md.h"
51 #include "bitmap.h"
52
/*
 * Debug tracing switch.  DEBUG is a compile-time constant; because of the
 * short-circuit '&&', dprintk() only reaches printk() when DEBUG is non-zero.
 */
#define DEBUG 0
#define dprintk(args...) ((void)(DEBUG && printk(args)))
55
56
57 #ifndef MODULE
58 static void autostart_arrays(int part);
59 #endif
60
/*
 * List of personalities; find_pers() walks it.
 * NOTE(review): pers_lock presumably guards pers_list - confirm against the
 * code that adds and removes entries.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

/* Forward declaration: MD_BUG() below calls this. */
static void md_print_devices(void);

/* Wait queue head; its users are outside this part of the file. */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
67
/*
 * Report an internal inconsistency: print the source file and line of the
 * caller, then call md_print_devices().  Any arguments are accepted but
 * ignored.
 *
 * The body is wrapped in do { } while (0) so that "MD_BUG();" is exactly one
 * statement and can be used in an unbraced if/else.
 */
#define MD_BUG(x...)                                                    \
        do {                                                            \
                printk("md: bug in file %s, line %d\n",                 \
                       __FILE__, __LINE__);                             \
                md_print_devices();                                     \
        } while (0)
69
70 /*
71  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72  * is 1000 KB/sec, so the extra system load does not show up that much.
73  * Increase it if you want to have more _guaranteed_ speed. Note that
74  * the RAID driver will use the maximum available bandwidth if the IO
75  * subsystem is idle. There is also an 'absolute maximum' reconstruction
76  * speed limit - in case reconstruction slows down your system despite
77  * idle IO detection.
78  *
79  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
80  * or /sys/block/mdX/md/sync_speed_{min,max}
81  */
82
83 static int sysctl_speed_limit_min = 1000;
84 static int sysctl_speed_limit_max = 200000;
85 static inline int speed_min(mddev_t *mddev)
86 {
87         return mddev->sync_speed_min ?
88                 mddev->sync_speed_min : sysctl_speed_limit_min;
89 }
90
91 static inline int speed_max(mddev_t *mddev)
92 {
93         return mddev->sync_speed_max ?
94                 mddev->sync_speed_max : sysctl_speed_limit_max;
95 }
96
97 static struct ctl_table_header *raid_table_header;
98
99 static ctl_table raid_table[] = {
100         {
101                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
102                 .procname       = "speed_limit_min",
103                 .data           = &sysctl_speed_limit_min,
104                 .maxlen         = sizeof(int),
105                 .mode           = S_IRUGO|S_IWUSR,
106                 .proc_handler   = &proc_dointvec,
107         },
108         {
109                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
110                 .procname       = "speed_limit_max",
111                 .data           = &sysctl_speed_limit_max,
112                 .maxlen         = sizeof(int),
113                 .mode           = S_IRUGO|S_IWUSR,
114                 .proc_handler   = &proc_dointvec,
115         },
116         { .ctl_name = 0 }
117 };
118
119 static ctl_table raid_dir_table[] = {
120         {
121                 .ctl_name       = DEV_RAID,
122                 .procname       = "raid",
123                 .maxlen         = 0,
124                 .mode           = S_IRUGO|S_IXUGO,
125                 .child          = raid_table,
126         },
127         { .ctl_name = 0 }
128 };
129
130 static ctl_table raid_root_table[] = {
131         {
132                 .ctl_name       = CTL_DEV,
133                 .procname       = "dev",
134                 .maxlen         = 0,
135                 .mode           = 0555,
136                 .child          = raid_dir_table,
137         },
138         { .ctl_name = 0 }
139 };
140
/* Tentative definition; the initialised definition is not in this part of the file. */
static struct block_device_operations md_fops;

/* NOTE(review): presumably makes arrays start read-only - confirm at its users. */
static int start_readonly;
144
145 /*
146  * We have a system wide 'event count' that is incremented
147  * on any 'interesting' event, and readers of /proc/mdstat
148  * can use 'poll' or 'select' to find out when the event
149  * count increases.
150  *
151  * Events are:
152  *  start array, stop array, error, add device, remove device,
153  *  start build, activate spare
154  */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;

/*
 * Record an 'interesting' event: bump the system-wide event count and wake
 * everything sleeping on md_event_waiters.  @mddev is not used by the body.
 */
void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
163
164 /* Alternate version that can be called from interrupts
165  * when calling sysfs_notify isn't needed.
166  */
static void md_new_event_inintr(mddev_t *mddev)
{
        /* Bump the event count, then wake waiters; @mddev is unused. */
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
172
/*
 * List of all existing md arrays, used to iterate over them.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
179
180
/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop will own
 * a reference to the current mddev and must mddev_put it.
 *
 * @mddev: mddev_t * loop cursor
 * @tmp:   struct list_head * scratch variable holding the list position
 *
 * The loop body runs with all_mddevs_lock released; the reference on the
 * previous mddev is dropped only after one has been taken on the next.
 */
#define for_each_mddev(mddev,tmp)                                       \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )
202
203
204 /* Rather than calling directly into the personality make_request function,
205  * IO requests come here first so that we can check if the device is
206  * being suspended pending a reconfiguration.
207  * We hold a refcount over the call to ->make_request.  By the time that
208  * call has finished, the bio has been linked into some internal structure
209  * and so is visible to ->quiesce(), so we don't need the refcount any more.
210  */
/*
 * Returns 0 after failing the bio when there is no array or no personality,
 * otherwise whatever the personality's ->make_request returns.
 */
static int md_make_request(struct request_queue *q, struct bio *bio)
{
        mddev_t *mddev = q->queuedata;
        int rv;
        if (mddev == NULL || mddev->pers == NULL) {
                bio_io_error(bio);
                return 0;
        }
        rcu_read_lock();
        if (mddev->suspended) {
                /* Sleep on sb_wait until the array is no longer suspended.
                 * The RCU read lock is dropped around schedule() and
                 * re-taken before ->suspended is looked at again.
                 */
                DEFINE_WAIT(__wait);
                for (;;) {
                        prepare_to_wait(&mddev->sb_wait, &__wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (!mddev->suspended)
                                break;
                        rcu_read_unlock();
                        schedule();
                        rcu_read_lock();
                }
                finish_wait(&mddev->sb_wait, &__wait);
        }
        /* active_io is raised inside the RCU read-side section;
         * mddev_suspend() sets ->suspended, calls synchronize_rcu() and
         * only then waits for active_io to reach zero.
         */
        atomic_inc(&mddev->active_io);
        rcu_read_unlock();
        rv = mddev->pers->make_request(q, bio);
        /* Last request out while a suspend is pending: wake sb_wait,
         * which mddev_suspend() sleeps on.
         */
        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
                wake_up(&mddev->sb_wait);

        return rv;
}
241
/*
 * Suspend @mddev: stop new requests from entering the personality, wait for
 * those already inside ->make_request to leave, quiesce the personality and
 * unregister its thread.  It is a BUG to call this on an array that is
 * already suspended.
 */
static void mddev_suspend(mddev_t *mddev)
{
        BUG_ON(mddev->suspended);
        mddev->suspended = 1;
        /* md_make_request() tests ->suspended and bumps active_io under
         * rcu_read_lock(), so once this returns every request that missed
         * the flag has already been counted in active_io.
         */
        synchronize_rcu();
        wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
        mddev->pers->quiesce(mddev, 1);
        md_unregister_thread(mddev->thread);
        mddev->thread = NULL;
        /* we now know that no code is executing in the personality module,
         * except possibly the tail end of a ->bi_end_io function, but that
         * is certain to complete before the module has a chance to get
         * unloaded
         */
}
257
/*
 * Undo mddev_suspend(): clear the suspended flag, wake the requests that
 * md_make_request() parked on sb_wait, and call ->quiesce(mddev, 0).
 * The thread unregistered by mddev_suspend() is not re-created here.
 */
static void mddev_resume(mddev_t *mddev)
{
        mddev->suspended = 0;
        wake_up(&mddev->sb_wait);
        mddev->pers->quiesce(mddev, 0);
}
264
265
/* Take a reference on @mddev (increments ->active) and return it. */
static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}
271
/*
 * Work handler scheduled by mddev_put(): removes the mddev's kobject and
 * drops a reference on it.  @ws is the del_work member embedded in the mddev.
 */
static void mddev_delayed_delete(struct work_struct *ws)
{
        mddev_t *mddev = container_of(ws, mddev_t, del_work);
        kobject_del(&mddev->kobj);
        kobject_put(&mddev->kobj);
}
278
279 static void mddev_put(mddev_t *mddev)
280 {
281         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
282                 return;
283         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
284             !mddev->hold_active) {
285                 list_del(&mddev->all_mddevs);
286                 if (mddev->gendisk) {
287                         /* we did a probe so need to clean up.
288                          * Call schedule_work inside the spinlock
289                          * so that flush_scheduled_work() after
290                          * mddev_find will succeed in waiting for the
291                          * work to be done.
292                          */
293                         INIT_WORK(&mddev->del_work, mddev_delayed_delete);
294                         schedule_work(&mddev->del_work);
295                 } else
296                         kfree(mddev);
297         }
298         spin_unlock(&all_mddevs_lock);
299 }
300
/*
 * Look up, or create, the mddev for device number @unit.
 *
 * @unit != 0: return the existing mddev with that unit, with an extra
 *             reference, or insert a freshly allocated one
 *             (hold_active = UNTIL_IOCTL).
 * @unit == 0: allocate a new mddev and give it an unused minor under
 *             MD_MAJOR (hold_active = UNTIL_STOP).
 *
 * Returns NULL if the allocation fails or, for @unit == 0, if every minor
 * is in use.
 *
 * The first pass runs with new == NULL.  If nothing suitable is found the
 * lock is dropped, a new mddev is allocated and initialised, and the search
 * is repeated from 'retry' so that the list is re-checked under the lock
 * before the new entry is added.
 */
static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);

        if (unit) {
                list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                        if (mddev->unit == unit) {
                                mddev_get(mddev);
                                spin_unlock(&all_mddevs_lock);
                                /* discard the allocation made on the
                                 * previous pass, if any (may be NULL) */
                                kfree(new);
                                return mddev;
                        }

                if (new) {
                        list_add(&new->all_mddevs, &all_mddevs);
                        spin_unlock(&all_mddevs_lock);
                        new->hold_active = UNTIL_IOCTL;
                        return new;
                }
        } else if (new) {
                /* find an unused unit number */
                static int next_minor = 512;
                int start = next_minor;
                int is_free = 0;
                int dev = 0;
                while (!is_free) {
                        dev = MKDEV(MD_MAJOR, next_minor);
                        next_minor++;
                        /* wrap around past the largest minor */
                        if (next_minor > MINORMASK)
                                next_minor = 0;
                        if (next_minor == start) {
                                /* Oh dear, all in use. */
                                spin_unlock(&all_mddevs_lock);
                                kfree(new);
                                return NULL;
                        }
                                
                        is_free = 1;
                        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                                if (mddev->unit == dev) {
                                        is_free = 0;
                                        break;
                                }
                }
                new->unit = dev;
                new->md_minor = MINOR(dev);
                new->hold_active = UNTIL_STOP;
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        /* Allocate with the spinlock released, then retry the lookup. */
        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        mutex_init(&new->reconfig_mutex);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        /* the caller receives the initial reference */
        atomic_set(&new->active, 1);
        atomic_set(&new->openers, 0);
        atomic_set(&new->active_io, 0);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        init_waitqueue_head(&new->recovery_wait);
        new->reshape_position = MaxSector;
        new->resync_min = 0;
        new->resync_max = MaxSector;
        new->level = LEVEL_NONE;

        goto retry;
}
384
/*
 * Take @mddev's reconfig_mutex, interruptibly.  Returns the result of
 * mutex_lock_interruptible(): 0 on success, non-zero if interrupted.
 */
static inline int mddev_lock(mddev_t * mddev)
{
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
}
389
/*
 * Try to take @mddev's reconfig_mutex without sleeping.  Returns the result
 * of mutex_trylock(): non-zero if the mutex was acquired.
 */
static inline int mddev_trylock(mddev_t * mddev)
{
        return mutex_trylock(&mddev->reconfig_mutex);
}
394
/* Release @mddev's reconfig_mutex, then wake the array's thread. */
static inline void mddev_unlock(mddev_t * mddev)
{
        mutex_unlock(&mddev->reconfig_mutex);

        md_wakeup_thread(mddev->thread);
}
401
402 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
403 {
404         mdk_rdev_t *rdev;
405
406         list_for_each_entry(rdev, &mddev->disks, same_set)
407                 if (rdev->desc_nr == nr)
408                         return rdev;
409
410         return NULL;
411 }
412
413 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
414 {
415         mdk_rdev_t *rdev;
416
417         list_for_each_entry(rdev, &mddev->disks, same_set)
418                 if (rdev->bdev->bd_dev == dev)
419                         return rdev;
420
421         return NULL;
422 }
423
424 static struct mdk_personality *find_pers(int level, char *clevel)
425 {
426         struct mdk_personality *pers;
427         list_for_each_entry(pers, &pers_list, list) {
428                 if (level != LEVEL_NONE && pers->level == level)
429                         return pers;
430                 if (strcmp(pers->name, clevel)==0)
431                         return pers;
432         }
433         return NULL;
434 }
435
436 /* return the offset of the super block in 512byte sectors */
437 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
438 {
439         sector_t num_sectors = bdev->bd_inode->i_size / 512;
440         return MD_NEW_SIZE_SECTORS(num_sectors);
441 }
442
443 static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
444 {
445         sector_t num_sectors = rdev->sb_start;
446
447         if (chunk_size)
448                 num_sectors &= ~((sector_t)chunk_size/512 - 1);
449         return num_sectors;
450 }
451
452 static int alloc_disk_sb(mdk_rdev_t * rdev)
453 {
454         if (rdev->sb_page)
455                 MD_BUG();
456
457         rdev->sb_page = alloc_page(GFP_KERNEL);
458         if (!rdev->sb_page) {
459                 printk(KERN_ALERT "md: out of memory.\n");
460                 return -ENOMEM;
461         }
462
463         return 0;
464 }
465
466 static void free_disk_sb(mdk_rdev_t * rdev)
467 {
468         if (rdev->sb_page) {
469                 put_page(rdev->sb_page);
470                 rdev->sb_loaded = 0;
471                 rdev->sb_page = NULL;
472                 rdev->sb_start = 0;
473                 rdev->sectors = 0;
474         }
475 }
476
477
/*
 * Completion handler for a superblock write issued by md_super_write().
 * bio->bi_private is the rdev that was written to.
 */
static void super_written(struct bio *bio, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                printk("md: super_written gets error=%d, uptodate=%d\n",
                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
                /* an error reported together with BIO_UPTODATE set is
                 * unexpected */
                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
                md_error(mddev, rdev);
        }

        /* last outstanding write: wake md_super_wait() */
        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
        bio_put(bio);
}
494
/*
 * Completion handler for the barrier clone submitted by md_super_write().
 * bio->bi_private is the spare (non-barrier) bio, whose own bi_private is
 * the rdev.
 */
static void super_written_barrier(struct bio *bio, int error)
{
        struct bio *bio2 = bio->bi_private;
        mdk_rdev_t *rdev = bio2->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
            error == -EOPNOTSUPP) {
                unsigned long flags;
                /* barriers don't appear to be supported :-( */
                set_bit(BarriersNotsupp, &rdev->flags);
                mddev->barriers_work = 0;
                /* queue the spare bio on mddev->biolist; md_super_wait()
                 * resubmits it.  pending_writes is not decremented here.
                 */
                spin_lock_irqsave(&mddev->write_lock, flags);
                bio2->bi_next = mddev->biolist;
                mddev->biolist = bio2;
                spin_unlock_irqrestore(&mddev->write_lock, flags);
                wake_up(&mddev->sb_wait);
                bio_put(bio);
        } else {
                /* any other outcome: the spare is not needed; finish as
                 * an ordinary superblock write */
                bio_put(bio2);
                bio->bi_private = rdev;
                super_written(bio, error);
        }
}
519
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
         *
         * As we might need to resubmit the request if BIO_RW_BARRIER
         * fails with -EOPNOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        bio->bi_rw = rw;

        atomic_inc(&mddev->pending_writes);
        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                /* submit a barrier clone; 'bio' is kept back as the spare
                 * and is reachable from the clone through bi_private */
                struct bio *rbio;
                rw |= (1<<BIO_RW_BARRIER);
                rbio = bio_clone(bio, GFP_NOIO);
                rbio->bi_private = bio;
                rbio->bi_end_io = super_written_barrier;
                submit_bio(rw, rbio);
        } else
                submit_bio(rw, bio);
}
553
void md_super_wait(mddev_t *mddev)
{
        /* wait for all superblock writes that were scheduled to complete.
         * if any had to be retried (due to BARRIER problems), retry them
         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
                /* resubmit the bios that super_written_barrier() queued */
                while (mddev->biolist) {
                        struct bio *bio;
                        /* pop one bio under write_lock, submit it unlocked */
                        spin_lock_irq(&mddev->write_lock);
                        bio = mddev->biolist;
                        mddev->biolist = bio->bi_next ;
                        bio->bi_next = NULL;
                        spin_unlock_irq(&mddev->write_lock);
                        submit_bio(bio->bi_rw, bio);
                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
}
577
578 static void bi_complete(struct bio *bio, int error)
579 {
580         complete((struct completion*)bio->bi_private);
581 }
582
/*
 * Synchronously transfer @size bytes between @page and @sector of @bdev.
 * @rw selects the operation; BIO_RW_SYNCIO and BIO_RW_UNPLUG are always
 * added.  Blocks until the bio completes.
 *
 * Returns non-zero if the bio finished with BIO_UPTODATE set, 0 otherwise.
 */
int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        /* bi_complete() signals 'event' when the I/O has finished */
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);
606
607 static int read_disk_sb(mdk_rdev_t * rdev, int size)
608 {
609         char b[BDEVNAME_SIZE];
610         if (!rdev->sb_page) {
611                 MD_BUG();
612                 return -EINVAL;
613         }
614         if (rdev->sb_loaded)
615                 return 0;
616
617
618         if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
619                 goto fail;
620         rdev->sb_loaded = 1;
621         return 0;
622
623 fail:
624         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
625                 bdevname(rdev->bdev,b));
626         return -EINVAL;
627 }
628
629 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
630 {
631         return  sb1->set_uuid0 == sb2->set_uuid0 &&
632                 sb1->set_uuid1 == sb2->set_uuid1 &&
633                 sb1->set_uuid2 == sb2->set_uuid2 &&
634                 sb1->set_uuid3 == sb2->set_uuid3;
635 }
636
637 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
638 {
639         int ret;
640         mdp_super_t *tmp1, *tmp2;
641
642         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
643         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
644
645         if (!tmp1 || !tmp2) {
646                 ret = 0;
647                 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
648                 goto abort;
649         }
650
651         *tmp1 = *sb1;
652         *tmp2 = *sb2;
653
654         /*
655          * nr_disks is not constant
656          */
657         tmp1->nr_disks = 0;
658         tmp2->nr_disks = 0;
659
660         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
661 abort:
662         kfree(tmp1);
663         kfree(tmp2);
664         return ret;
665 }
666
667
668 static u32 md_csum_fold(u32 csum)
669 {
670         csum = (csum & 0xffff) + (csum >> 16);
671         return (csum & 0xffff) + (csum >> 16);
672 }
673
/*
 * Compute the checksum of a 0.90 superblock.
 *
 * The superblock is summed as MD_SB_BYTES/4 32-bit words with sb_csum
 * treated as zero; the 64-bit total is reduced to 32 bits by adding its
 * high half to its low half.  sb->sb_csum is put back before returning
 * (folded to 16 bits on Alpha, see below).
 */
static unsigned int calc_sb_csum(mdp_super_t * sb)
{
        u64 newcsum = 0;
        u32 *sb32 = (u32*)sb;
        int i;
        unsigned int disk_csum, csum;

        /* save the stored checksum and zero the field while summing */
        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;

        for (i = 0; i < MD_SB_BYTES/4 ; i++)
                newcsum += sb32[i];
        csum = (newcsum & 0xffffffff) + (newcsum>>32);


#ifdef CONFIG_ALPHA
        /* This used to use csum_partial, which was wrong for several
         * reasons including that different results are returned on
         * different architectures.  It isn't critical that we get exactly
         * the same return value as before (we always csum_fold before
         * testing, and that removes any differences).  However as we
         * know that csum_partial always returned a 16bit value on
         * alphas, do a fold to maximise conformity to previous behaviour.
         */
        /* NOTE(review): the fold is applied to the restored sb_csum field,
         * not to the returned csum - confirm this is what is intended.
         */
        sb->sb_csum = md_csum_fold(disk_csum);
#else
        sb->sb_csum = disk_csum;
#endif
        return csum;
}
704
705
706 /*
707  * Handle superblock details.
708  * We want to be able to handle multiple superblock formats
709  * so we have a common interface to them all, and an array of
710  * different handlers.
711  * We rely on user-space to write the initial superblock, and support
712  * reading and updating of superblocks.
713  * Interface methods are:
714  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
715  *      loads and validates a superblock on dev.
716  *      if refdev != NULL, compare superblocks on both devices
717  *    Return:
718  *      0 - dev has a superblock that is compatible with refdev
719  *      1 - dev has a superblock that is compatible and newer than refdev
720  *          so dev should be used as the refdev in future
721  *     -EINVAL superblock incompatible or invalid
722  *     -othererror e.g. -EIO
723  *
724  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
725  *      Verify that dev is acceptable into mddev.
726  *       The first time, mddev->raid_disks will be 0, and data from
727  *       dev should be merged in.  Subsequent calls check that dev
728  *       is new enough.  Return 0 or -EINVAL
729  *
730  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
731  *     Update the superblock for rdev with data in mddev
732  *     This does not write to disc.
733  *
734  */
735
/* One superblock format handler. */
struct super_type  {
        char                *name;
        struct module       *owner;
        /* load and validate the superblock on rdev, optionally comparing
         * it with refdev; returns 0, 1 or a negative errno */
        int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
                                          int minor_version);
        /* verify that rdev is acceptable into mddev; 0 or -EINVAL */
        int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        /* update rdev's in-memory superblock from mddev; no disk write */
        void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        /* NOTE(review): presumably resizes the area of rdev in use and
         * returns the resulting size - confirm at the implementations */
        unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
                                                sector_t num_sectors);
};
746
747 /*
748  * load_super for 0.90.0 
749  */
/*
 * Load and sanity-check the 0.90 superblock of @rdev and, when @refdev is
 * given, compare it with @refdev's superblock.  @minor_version is not used.
 *
 * Returns 1 if there is no @refdev or @rdev's event count is higher than
 * @refdev's, 0 if @rdev is compatible but not newer, the error from
 * read_disk_sb() if the read fails, and -EINVAL for any failed check.
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdp_super_t *sb;
        int ret;

        /*
         * Calculate the position of the superblock (512byte sectors),
         * it's at the end of the disk.
         *
         * It also happens to be a multiple of 4Kb.
         */
        rdev->sb_start = calc_dev_sboffset(rdev->bdev);

        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;

        /* every 'goto abort' below returns -EINVAL unless ret is reset */
        ret = -EINVAL;

        bdevname(rdev->bdev, b);
        sb = (mdp_super_t*)page_address(rdev->sb_page);

        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
                       b);
                goto abort;
        }

        /* only versions 0.90 and 0.91 are accepted */
        if (sb->major_version != 0 ||
            sb->minor_version < 90 ||
            sb->minor_version > 91) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
                goto abort;
        }

        if (sb->raid_disks <= 0)
                goto abort;

        /* both sides are folded to 16 bits before being compared */
        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
                        b);
                goto abort;
        }

        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;
        rdev->sb_size = MD_SB_BYTES;

        if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
                if (sb->level != 1 && sb->level != 4
                    && sb->level != 5 && sb->level != 6
                    && sb->level != 10) {
                        /* FIXME use a better test */
                        printk(KERN_WARNING
                               "md: bitmaps not supported for this level.\n");
                        goto abort;
                }
        }

        /* multipath members get desc_nr -1 rather than a slot number */
        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = sb->this_disk.number;

        if (!refdev) {
                ret = 1;
        } else {
                __u64 ev1, ev2;
                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
                        goto abort;
                }
                if (!sb_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has same UUID"
                               " but different superblock to %s\n",
                               b, bdevname(refdev->bdev, b2));
                        goto abort;
                }
                /* the device with the higher event count is the newer one */
                ev1 = md_event(sb);
                ev2 = md_event(refsb);
                if (ev1 > ev2)
                        ret = 1;
                else 
                        ret = 0;
        }
        rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);

        /* NOTE(review): sb->size * 2 presumably converts KB to sectors -
         * confirm against the mdp_super_t definition */
        if (rdev->sectors < sb->size * 2 && sb->level > 1)
                /* "this cannot possibly happen" ... */
                ret = -EINVAL;

 abort:
        return ret;
}
848
849 /*
850  * validate_super for 0.90.0
851  */
852 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
853 {
854         mdp_disk_t *desc;
855         mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
856         __u64 ev1 = md_event(sb);
857
858         rdev->raid_disk = -1;
859         clear_bit(Faulty, &rdev->flags);
860         clear_bit(In_sync, &rdev->flags);
861         clear_bit(WriteMostly, &rdev->flags);
862         clear_bit(BarriersNotsupp, &rdev->flags);
863
864         if (mddev->raid_disks == 0) {
865                 mddev->major_version = 0;
866                 mddev->minor_version = sb->minor_version;
867                 mddev->patch_version = sb->patch_version;
868                 mddev->external = 0;
869                 mddev->chunk_size = sb->chunk_size;
870                 mddev->ctime = sb->ctime;
871                 mddev->utime = sb->utime;
872                 mddev->level = sb->level;
873                 mddev->clevel[0] = 0;
874                 mddev->layout = sb->layout;
875                 mddev->raid_disks = sb->raid_disks;
876                 mddev->dev_sectors = sb->size * 2;
877                 mddev->events = ev1;
878                 mddev->bitmap_offset = 0;
879                 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
880
881                 if (mddev->minor_version >= 91) {
882                         mddev->reshape_position = sb->reshape_position;
883                         mddev->delta_disks = sb->delta_disks;
884                         mddev->new_level = sb->new_level;
885                         mddev->new_layout = sb->new_layout;
886                         mddev->new_chunk = sb->new_chunk;
887                 } else {
888                         mddev->reshape_position = MaxSector;
889                         mddev->delta_disks = 0;
890                         mddev->new_level = mddev->level;
891                         mddev->new_layout = mddev->layout;
892                         mddev->new_chunk = mddev->chunk_size;
893                 }
894
895                 if (sb->state & (1<<MD_SB_CLEAN))
896                         mddev->recovery_cp = MaxSector;
897                 else {
898                         if (sb->events_hi == sb->cp_events_hi && 
899                                 sb->events_lo == sb->cp_events_lo) {
900                                 mddev->recovery_cp = sb->recovery_cp;
901                         } else
902                                 mddev->recovery_cp = 0;
903                 }
904
905                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
906                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
907                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
908                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
909
910                 mddev->max_disks = MD_SB_DISKS;
911
912                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
913                     mddev->bitmap_file == NULL)
914                         mddev->bitmap_offset = mddev->default_bitmap_offset;
915
916         } else if (mddev->pers == NULL) {
917                 /* Insist on good event counter while assembling */
918                 ++ev1;
919                 if (ev1 < mddev->events) 
920                         return -EINVAL;
921         } else if (mddev->bitmap) {
922                 /* if adding to array with a bitmap, then we can accept an
923                  * older device ... but not too old.
924                  */
925                 if (ev1 < mddev->bitmap->events_cleared)
926                         return 0;
927         } else {
928                 if (ev1 < mddev->events)
929                         /* just a hot-add of a new device, leave raid_disk at -1 */
930                         return 0;
931         }
932
933         if (mddev->level != LEVEL_MULTIPATH) {
934                 desc = sb->disks + rdev->desc_nr;
935
936                 if (desc->state & (1<<MD_DISK_FAULTY))
937                         set_bit(Faulty, &rdev->flags);
938                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
939                             desc->raid_disk < mddev->raid_disks */) {
940                         set_bit(In_sync, &rdev->flags);
941                         rdev->raid_disk = desc->raid_disk;
942                 }
943                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
944                         set_bit(WriteMostly, &rdev->flags);
945         } else /* MULTIPATH are always insync */
946                 set_bit(In_sync, &rdev->flags);
947         return 0;
948 }
949
950 /*
951  * sync_super for 0.90.0
952  */
953 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
954 {
955         mdp_super_t *sb;
956         mdk_rdev_t *rdev2;
957         int next_spare = mddev->raid_disks;
958
959
960         /* make rdev->sb match mddev data..
961          *
962          * 1/ zero out disks
963          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
964          * 3/ any empty disks < next_spare become removed
965          *
966          * disks[0] gets initialised to REMOVED because
967          * we cannot be sure from other fields if it has
968          * been initialised or not.
969          */
970         int i;
971         int active=0, working=0,failed=0,spare=0,nr_disks=0;
972
973         rdev->sb_size = MD_SB_BYTES;
974
975         sb = (mdp_super_t*)page_address(rdev->sb_page);
976
977         memset(sb, 0, sizeof(*sb));
978
979         sb->md_magic = MD_SB_MAGIC;
980         sb->major_version = mddev->major_version;
981         sb->patch_version = mddev->patch_version;
982         sb->gvalid_words  = 0; /* ignored */
983         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
984         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
985         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
986         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
987
988         sb->ctime = mddev->ctime;
989         sb->level = mddev->level;
990         sb->size = mddev->dev_sectors / 2;
991         sb->raid_disks = mddev->raid_disks;
992         sb->md_minor = mddev->md_minor;
993         sb->not_persistent = 0;
994         sb->utime = mddev->utime;
995         sb->state = 0;
996         sb->events_hi = (mddev->events>>32);
997         sb->events_lo = (u32)mddev->events;
998
999         if (mddev->reshape_position == MaxSector)
1000                 sb->minor_version = 90;
1001         else {
1002                 sb->minor_version = 91;
1003                 sb->reshape_position = mddev->reshape_position;
1004                 sb->new_level = mddev->new_level;
1005                 sb->delta_disks = mddev->delta_disks;
1006                 sb->new_layout = mddev->new_layout;
1007                 sb->new_chunk = mddev->new_chunk;
1008         }
1009         mddev->minor_version = sb->minor_version;
1010         if (mddev->in_sync)
1011         {
1012                 sb->recovery_cp = mddev->recovery_cp;
1013                 sb->cp_events_hi = (mddev->events>>32);
1014                 sb->cp_events_lo = (u32)mddev->events;
1015                 if (mddev->recovery_cp == MaxSector)
1016                         sb->state = (1<< MD_SB_CLEAN);
1017         } else
1018                 sb->recovery_cp = 0;
1019
1020         sb->layout = mddev->layout;
1021         sb->chunk_size = mddev->chunk_size;
1022
1023         if (mddev->bitmap && mddev->bitmap_file == NULL)
1024                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1025
1026         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1027         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1028                 mdp_disk_t *d;
1029                 int desc_nr;
1030                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1031                     && !test_bit(Faulty, &rdev2->flags))
1032                         desc_nr = rdev2->raid_disk;
1033                 else
1034                         desc_nr = next_spare++;
1035                 rdev2->desc_nr = desc_nr;
1036                 d = &sb->disks[rdev2->desc_nr];
1037                 nr_disks++;
1038                 d->number = rdev2->desc_nr;
1039                 d->major = MAJOR(rdev2->bdev->bd_dev);
1040                 d->minor = MINOR(rdev2->bdev->bd_dev);
1041                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1042                     && !test_bit(Faulty, &rdev2->flags))
1043                         d->raid_disk = rdev2->raid_disk;
1044                 else
1045                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1046                 if (test_bit(Faulty, &rdev2->flags))
1047                         d->state = (1<<MD_DISK_FAULTY);
1048                 else if (test_bit(In_sync, &rdev2->flags)) {
1049                         d->state = (1<<MD_DISK_ACTIVE);
1050                         d->state |= (1<<MD_DISK_SYNC);
1051                         active++;
1052                         working++;
1053                 } else {
1054                         d->state = 0;
1055                         spare++;
1056                         working++;
1057                 }
1058                 if (test_bit(WriteMostly, &rdev2->flags))
1059                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1060         }
1061         /* now set the "removed" and "faulty" bits on any missing devices */
1062         for (i=0 ; i < mddev->raid_disks ; i++) {
1063                 mdp_disk_t *d = &sb->disks[i];
1064                 if (d->state == 0 && d->number == 0) {
1065                         d->number = i;
1066                         d->raid_disk = i;
1067                         d->state = (1<<MD_DISK_REMOVED);
1068                         d->state |= (1<<MD_DISK_FAULTY);
1069                         failed++;
1070                 }
1071         }
1072         sb->nr_disks = nr_disks;
1073         sb->active_disks = active;
1074         sb->working_disks = working;
1075         sb->failed_disks = failed;
1076         sb->spare_disks = spare;
1077
1078         sb->this_disk = sb->disks[rdev->desc_nr];
1079         sb->sb_csum = calc_sb_csum(sb);
1080 }
1081
1082 /*
1083  * rdev_size_change for 0.90.0
1084  */
1085 static unsigned long long
1086 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1087 {
1088         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1089                 return 0; /* component must fit device */
1090         if (rdev->mddev->bitmap_offset)
1091                 return 0; /* can't move bitmap */
1092         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1093         if (!num_sectors || num_sectors > rdev->sb_start)
1094                 num_sectors = rdev->sb_start;
1095         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1096                        rdev->sb_page);
1097         md_super_wait(rdev->mddev);
1098         return num_sectors / 2; /* kB for sysfs */
1099 }
1100
1101
1102 /*
1103  * version 1 superblock
1104  */
1105
1106 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1107 {
1108         __le32 disk_csum;
1109         u32 csum;
1110         unsigned long long newcsum;
1111         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1112         __le32 *isuper = (__le32*)sb;
1113         int i;
1114
1115         disk_csum = sb->sb_csum;
1116         sb->sb_csum = 0;
1117         newcsum = 0;
1118         for (i=0; size>=4; size -= 4 )
1119                 newcsum += le32_to_cpu(*isuper++);
1120
1121         if (size == 2)
1122                 newcsum += le16_to_cpu(*(__le16*) isuper);
1123
1124         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1125         sb->sb_csum = disk_csum;
1126         return cpu_to_le32(csum);
1127 }
1128
1129 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1130 {
1131         struct mdp_superblock_1 *sb;
1132         int ret;
1133         sector_t sb_start;
1134         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1135         int bmask;
1136
1137         /*
1138          * Calculate the position of the superblock in 512byte sectors.
1139          * It is always aligned to a 4K boundary and
1140          * depeding on minor_version, it can be:
1141          * 0: At least 8K, but less than 12K, from end of device
1142          * 1: At start of device
1143          * 2: 4K from start of device.
1144          */
1145         switch(minor_version) {
1146         case 0:
1147                 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1148                 sb_start -= 8*2;
1149                 sb_start &= ~(sector_t)(4*2-1);
1150                 break;
1151         case 1:
1152                 sb_start = 0;
1153                 break;
1154         case 2:
1155                 sb_start = 8;
1156                 break;
1157         default:
1158                 return -EINVAL;
1159         }
1160         rdev->sb_start = sb_start;
1161
1162         /* superblock is rarely larger than 1K, but it can be larger,
1163          * and it is safe to read 4k, so we do that
1164          */
1165         ret = read_disk_sb(rdev, 4096);
1166         if (ret) return ret;
1167
1168
1169         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1170
1171         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1172             sb->major_version != cpu_to_le32(1) ||
1173             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1174             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1175             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1176                 return -EINVAL;
1177
1178         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1179                 printk("md: invalid superblock checksum on %s\n",
1180                         bdevname(rdev->bdev,b));
1181                 return -EINVAL;
1182         }
1183         if (le64_to_cpu(sb->data_size) < 10) {
1184                 printk("md: data_size too small on %s\n",
1185                        bdevname(rdev->bdev,b));
1186                 return -EINVAL;
1187         }
1188         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
1189                 if (sb->level != cpu_to_le32(1) &&
1190                     sb->level != cpu_to_le32(4) &&
1191                     sb->level != cpu_to_le32(5) &&
1192                     sb->level != cpu_to_le32(6) &&
1193                     sb->level != cpu_to_le32(10)) {
1194                         printk(KERN_WARNING
1195                                "md: bitmaps not supported for this level.\n");
1196                         return -EINVAL;
1197                 }
1198         }
1199
1200         rdev->preferred_minor = 0xffff;
1201         rdev->data_offset = le64_to_cpu(sb->data_offset);
1202         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1203
1204         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1205         bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1206         if (rdev->sb_size & bmask)
1207                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1208
1209         if (minor_version
1210             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1211                 return -EINVAL;
1212
1213         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1214                 rdev->desc_nr = -1;
1215         else
1216                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1217
1218         if (!refdev) {
1219                 ret = 1;
1220         } else {
1221                 __u64 ev1, ev2;
1222                 struct mdp_superblock_1 *refsb = 
1223                         (struct mdp_superblock_1*)page_address(refdev->sb_page);
1224
1225                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1226                     sb->level != refsb->level ||
1227                     sb->layout != refsb->layout ||
1228                     sb->chunksize != refsb->chunksize) {
1229                         printk(KERN_WARNING "md: %s has strangely different"
1230                                 " superblock to %s\n",
1231                                 bdevname(rdev->bdev,b),
1232                                 bdevname(refdev->bdev,b2));
1233                         return -EINVAL;
1234                 }
1235                 ev1 = le64_to_cpu(sb->events);
1236                 ev2 = le64_to_cpu(refsb->events);
1237
1238                 if (ev1 > ev2)
1239                         ret = 1;
1240                 else
1241                         ret = 0;
1242         }
1243         if (minor_version)
1244                 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1245                         le64_to_cpu(sb->data_offset);
1246         else
1247                 rdev->sectors = rdev->sb_start;
1248         if (rdev->sectors < le64_to_cpu(sb->data_size))
1249                 return -EINVAL;
1250         rdev->sectors = le64_to_cpu(sb->data_size);
1251         if (le32_to_cpu(sb->chunksize))
1252                 rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
1253
1254         if (le64_to_cpu(sb->size) > rdev->sectors)
1255                 return -EINVAL;
1256         return ret;
1257 }
1258
1259 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1260 {
1261         struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1262         __u64 ev1 = le64_to_cpu(sb->events);
1263
1264         rdev->raid_disk = -1;
1265         clear_bit(Faulty, &rdev->flags);
1266         clear_bit(In_sync, &rdev->flags);
1267         clear_bit(WriteMostly, &rdev->flags);
1268         clear_bit(BarriersNotsupp, &rdev->flags);
1269
1270         if (mddev->raid_disks == 0) {
1271                 mddev->major_version = 1;
1272                 mddev->patch_version = 0;
1273                 mddev->external = 0;
1274                 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1275                 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1276                 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1277                 mddev->level = le32_to_cpu(sb->level);
1278                 mddev->clevel[0] = 0;
1279                 mddev->layout = le32_to_cpu(sb->layout);
1280                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1281                 mddev->dev_sectors = le64_to_cpu(sb->size);
1282                 mddev->events = ev1;
1283                 mddev->bitmap_offset = 0;
1284                 mddev->default_bitmap_offset = 1024 >> 9;
1285                 
1286                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1287                 memcpy(mddev->uuid, sb->set_uuid, 16);
1288
1289                 mddev->max_disks =  (4096-256)/2;
1290
1291                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1292                     mddev->bitmap_file == NULL )
1293                         mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1294
1295                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1296                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1297                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1298                         mddev->new_level = le32_to_cpu(sb->new_level);
1299                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1300                         mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1301                 } else {
1302                         mddev->reshape_position = MaxSector;
1303                         mddev->delta_disks = 0;
1304                         mddev->new_level = mddev->level;
1305                         mddev->new_layout = mddev->layout;
1306                         mddev->new_chunk = mddev->chunk_size;
1307                 }
1308
1309         } else if (mddev->pers == NULL) {
1310                 /* Insist of good event counter while assembling */
1311                 ++ev1;
1312                 if (ev1 < mddev->events)
1313                         return -EINVAL;
1314         } else if (mddev->bitmap) {
1315                 /* If adding to array with a bitmap, then we can accept an
1316                  * older device, but not too old.
1317                  */
1318                 if (ev1 < mddev->bitmap->events_cleared)
1319                         return 0;
1320         } else {
1321                 if (ev1 < mddev->events)
1322                         /* just a hot-add of a new device, leave raid_disk at -1 */
1323                         return 0;
1324         }
1325         if (mddev->level != LEVEL_MULTIPATH) {
1326                 int role;
1327                 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1328                 switch(role) {
1329                 case 0xffff: /* spare */
1330                         break;
1331                 case 0xfffe: /* faulty */
1332                         set_bit(Faulty, &rdev->flags);
1333                         break;
1334                 default:
1335                         if ((le32_to_cpu(sb->feature_map) &
1336                              MD_FEATURE_RECOVERY_OFFSET))
1337                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1338                         else
1339                                 set_bit(In_sync, &rdev->flags);
1340                         rdev->raid_disk = role;
1341                         break;
1342                 }
1343                 if (sb->devflags & WriteMostly1)
1344                         set_bit(WriteMostly, &rdev->flags);
1345         } else /* MULTIPATH are always insync */
1346                 set_bit(In_sync, &rdev->flags);
1347
1348         return 0;
1349 }
1350
1351 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1352 {
1353         struct mdp_superblock_1 *sb;
1354         mdk_rdev_t *rdev2;
1355         int max_dev, i;
1356         /* make rdev->sb match mddev and rdev data. */
1357
1358         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1359
1360         sb->feature_map = 0;
1361         sb->pad0 = 0;
1362         sb->recovery_offset = cpu_to_le64(0);
1363         memset(sb->pad1, 0, sizeof(sb->pad1));
1364         memset(sb->pad2, 0, sizeof(sb->pad2));
1365         memset(sb->pad3, 0, sizeof(sb->pad3));
1366
1367         sb->utime = cpu_to_le64((__u64)mddev->utime);
1368         sb->events = cpu_to_le64(mddev->events);
1369         if (mddev->in_sync)
1370                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1371         else
1372                 sb->resync_offset = cpu_to_le64(0);
1373
1374         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1375
1376         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1377         sb->size = cpu_to_le64(mddev->dev_sectors);
1378
1379         if (mddev->bitmap && mddev->bitmap_file == NULL) {
1380                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1381                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1382         }
1383
1384         if (rdev->raid_disk >= 0 &&
1385             !test_bit(In_sync, &rdev->flags)) {
1386                 if (mddev->curr_resync_completed > rdev->recovery_offset)
1387                         rdev->recovery_offset = mddev->curr_resync_completed;
1388                 if (rdev->recovery_offset > 0) {
1389                         sb->feature_map |=
1390                                 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1391                         sb->recovery_offset =
1392                                 cpu_to_le64(rdev->recovery_offset);
1393                 }
1394         }
1395
1396         if (mddev->reshape_position != MaxSector) {
1397                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1398                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1399                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1400                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1401                 sb->new_level = cpu_to_le32(mddev->new_level);
1402                 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1403         }
1404
1405         max_dev = 0;
1406         list_for_each_entry(rdev2, &mddev->disks, same_set)
1407                 if (rdev2->desc_nr+1 > max_dev)
1408                         max_dev = rdev2->desc_nr+1;
1409
1410         if (max_dev > le32_to_cpu(sb->max_dev))
1411                 sb->max_dev = cpu_to_le32(max_dev);
1412         for (i=0; i<max_dev;i++)
1413                 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1414         
1415         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1416                 i = rdev2->desc_nr;
1417                 if (test_bit(Faulty, &rdev2->flags))
1418                         sb->dev_roles[i] = cpu_to_le16(0xfffe);
1419                 else if (test_bit(In_sync, &rdev2->flags))
1420                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1421                 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1422                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1423                 else
1424                         sb->dev_roles[i] = cpu_to_le16(0xffff);
1425         }
1426
1427         sb->sb_csum = calc_sb_1_csum(sb);
1428 }
1429
1430 static unsigned long long
1431 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1432 {
1433         struct mdp_superblock_1 *sb;
1434         sector_t max_sectors;
1435         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1436                 return 0; /* component must fit device */
1437         if (rdev->sb_start < rdev->data_offset) {
1438                 /* minor versions 1 and 2; superblock before data */
1439                 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1440                 max_sectors -= rdev->data_offset;
1441                 if (!num_sectors || num_sectors > max_sectors)
1442                         num_sectors = max_sectors;
1443         } else if (rdev->mddev->bitmap_offset) {
1444                 /* minor version 0 with bitmap we can't move */
1445                 return 0;
1446         } else {
1447                 /* minor version 0; superblock after data */
1448                 sector_t sb_start;
1449                 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1450                 sb_start &= ~(sector_t)(4*2 - 1);
1451                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1452                 if (!num_sectors || num_sectors > max_sectors)
1453                         num_sectors = max_sectors;
1454                 rdev->sb_start = sb_start;
1455         }
1456         sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1457         sb->data_size = cpu_to_le64(num_sectors);
1458         sb->super_offset = rdev->sb_start;
1459         sb->sb_csum = calc_sb_1_csum(sb);
1460         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1461                        rdev->sb_page);
1462         md_super_wait(rdev->mddev);
1463         return num_sectors / 2; /* kB for sysfs */
1464 }
1465
1466 static struct super_type super_types[] = {
1467         [0] = {
1468                 .name   = "0.90.0",
1469                 .owner  = THIS_MODULE,
1470                 .load_super         = super_90_load,
1471                 .validate_super     = super_90_validate,
1472                 .sync_super         = super_90_sync,
1473                 .rdev_size_change   = super_90_rdev_size_change,
1474         },
1475         [1] = {
1476                 .name   = "md-1",
1477                 .owner  = THIS_MODULE,
1478                 .load_super         = super_1_load,
1479                 .validate_super     = super_1_validate,
1480                 .sync_super         = super_1_sync,
1481                 .rdev_size_change   = super_1_rdev_size_change,
1482         },
1483 };
1484
1485 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1486 {
1487         mdk_rdev_t *rdev, *rdev2;
1488
1489         rcu_read_lock();
1490         rdev_for_each_rcu(rdev, mddev1)
1491                 rdev_for_each_rcu(rdev2, mddev2)
1492                         if (rdev->bdev->bd_contains ==
1493                             rdev2->bdev->bd_contains) {
1494                                 rcu_read_unlock();
1495                                 return 1;
1496                         }
1497         rcu_read_unlock();
1498         return 0;
1499 }
1500
/*
 * File-local list head for "pending" raid disks.
 * NOTE(review): every user of this list lies outside this part of the
 * file; confirm what gets queued here and by whom.
 */
static LIST_HEAD(pending_raid_disks);
1502
1503 static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
1504 {
1505         struct mdk_personality *pers = mddev->pers;
1506         struct gendisk *disk = mddev->gendisk;
1507         struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1508         struct blk_integrity *bi_mddev = blk_get_integrity(disk);
1509
1510         /* Data integrity passthrough not supported on RAID 4, 5 and 6 */
1511         if (pers && pers->level >= 4 && pers->level <= 6)
1512                 return;
1513
1514         /* If rdev is integrity capable, register profile for mddev */
1515         if (!bi_mddev && bi_rdev) {
1516                 if (blk_integrity_register(disk, bi_rdev))
1517                         printk(KERN_ERR "%s: %s Could not register integrity!\n",
1518                                __func__, disk->disk_name);
1519                 else
1520                         printk(KERN_NOTICE "Enabling data integrity on %s\n",
1521                                disk->disk_name);
1522                 return;
1523         }
1524
1525         /* Check that mddev and rdev have matching profiles */
1526         if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
1527                 printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
1528                        disk->disk_name, rdev->bdev->bd_disk->disk_name);
1529                 printk(KERN_NOTICE "Disabling data integrity on %s\n",
1530                        disk->disk_name);
1531                 blk_integrity_unregister(disk);
1532         }
1533 }
1534
1535 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1536 {
1537         char b[BDEVNAME_SIZE];
1538         struct kobject *ko;
1539         char *s;
1540         int err;
1541
1542         if (rdev->mddev) {
1543                 MD_BUG();
1544                 return -EINVAL;
1545         }
1546
1547         /* prevent duplicates */
1548         if (find_rdev(mddev, rdev->bdev->bd_dev))
1549                 return -EEXIST;
1550
1551         /* make sure rdev->sectors exceeds mddev->dev_sectors */
1552         if (rdev->sectors && (mddev->dev_sectors == 0 ||
1553                         rdev->sectors < mddev->dev_sectors)) {
1554                 if (mddev->pers) {
1555                         /* Cannot change size, so fail
1556                          * If mddev->level <= 0, then we don't care
1557                          * about aligning sizes (e.g. linear)
1558                          */
1559                         if (mddev->level > 0)
1560                                 return -ENOSPC;
1561                 } else
1562                         mddev->dev_sectors = rdev->sectors;
1563         }
1564
1565         /* Verify rdev->desc_nr is unique.
1566          * If it is -1, assign a free number, else
1567          * check number is not in use
1568          */
1569         if (rdev->desc_nr < 0) {
1570                 int choice = 0;
1571                 if (mddev->pers) choice = mddev->raid_disks;
1572                 while (find_rdev_nr(mddev, choice))
1573                         choice++;
1574                 rdev->desc_nr = choice;
1575         } else {
1576                 if (find_rdev_nr(mddev, rdev->desc_nr))
1577                         return -EBUSY;
1578         }
1579         if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1580                 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1581                        mdname(mddev), mddev->max_disks);
1582                 return -EBUSY;
1583         }
1584         bdevname(rdev->bdev,b);
1585         while ( (s=strchr(b, '/')) != NULL)
1586                 *s = '!';
1587
1588         rdev->mddev = mddev;
1589         printk(KERN_INFO "md: bind<%s>\n", b);
1590
1591         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1592                 goto fail;
1593
1594         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1595         if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1596                 kobject_del(&rdev->kobj);
1597                 goto fail;
1598         }
1599         rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1600
1601         list_add_rcu(&rdev->same_set, &mddev->disks);
1602         bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1603
1604         /* May as well allow recovery to be retried once */
1605         mddev->recovery_disabled = 0;
1606
1607         md_integrity_check(rdev, mddev);
1608         return 0;
1609
1610  fail:
1611         printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1612                b, mdname(mddev));
1613         return err;
1614 }
1615
1616 static void md_delayed_delete(struct work_struct *ws)
1617 {
1618         mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1619         kobject_del(&rdev->kobj);
1620         kobject_put(&rdev->kobj);
1621 }
1622
1623 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1624 {
1625         char b[BDEVNAME_SIZE];
1626         if (!rdev->mddev) {
1627                 MD_BUG();
1628                 return;
1629         }
1630         bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1631         list_del_rcu(&rdev->same_set);
1632         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1633         rdev->mddev = NULL;
1634         sysfs_remove_link(&rdev->kobj, "block");
1635         sysfs_put(rdev->sysfs_state);
1636         rdev->sysfs_state = NULL;
1637         /* We need to delay this, otherwise we can deadlock when
1638          * writing to 'remove' to "dev/state".  We also need
1639          * to delay it due to rcu usage.
1640          */
1641         synchronize_rcu();
1642         INIT_WORK(&rdev->del_work, md_delayed_delete);
1643         kobject_get(&rdev->kobj);
1644         schedule_work(&rdev->del_work);
1645 }
1646
1647 /*
1648  * prevent the device from being mounted, repartitioned or
1649  * otherwise reused by a RAID array (or any other kernel
1650  * subsystem), by bd_claiming the device.
1651  */
1652 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1653 {
1654         int err = 0;
1655         struct block_device *bdev;
1656         char b[BDEVNAME_SIZE];
1657
1658         bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1659         if (IS_ERR(bdev)) {
1660                 printk(KERN_ERR "md: could not open %s.\n",
1661                         __bdevname(dev, b));
1662                 return PTR_ERR(bdev);
1663         }
1664         err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1665         if (err) {
1666                 printk(KERN_ERR "md: could not bd_claim %s.\n",
1667                         bdevname(bdev, b));
1668                 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1669                 return err;
1670         }
1671         if (!shared)
1672                 set_bit(AllReserved, &rdev->flags);
1673         rdev->bdev = bdev;
1674         return err;
1675 }
1676
1677 static void unlock_rdev(mdk_rdev_t *rdev)
1678 {
1679         struct block_device *bdev = rdev->bdev;
1680         rdev->bdev = NULL;
1681         if (!bdev)
1682                 MD_BUG();
1683         bd_release(bdev);
1684         blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1685 }
1686
1687 void md_autodetect_dev(dev_t dev);
1688
1689 static void export_rdev(mdk_rdev_t * rdev)
1690 {
1691         char b[BDEVNAME_SIZE];
1692         printk(KERN_INFO "md: export_rdev(%s)\n",
1693                 bdevname(rdev->bdev,b));
1694         if (rdev->mddev)
1695                 MD_BUG();
1696         free_disk_sb(rdev);
1697 #ifndef MODULE
1698         if (test_bit(AutoDetected, &rdev->flags))
1699                 md_autodetect_dev(rdev->bdev->bd_dev);
1700 #endif
1701         unlock_rdev(rdev);
1702         kobject_put(&rdev->kobj);
1703 }
1704
/*
 * Remove @rdev from its array and then release it: unbind first,
 * export second.
 */
static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
        unbind_rdev_from_array(rdev);
        export_rdev(rdev);
}
1710
1711 static void export_array(mddev_t *mddev)
1712 {
1713         mdk_rdev_t *rdev, *tmp;
1714
1715         rdev_for_each(rdev, tmp, mddev) {
1716                 if (!rdev->mddev) {
1717                         MD_BUG();
1718                         continue;
1719                 }
1720                 kick_rdev_from_array(rdev);
1721         }
1722         if (!list_empty(&mddev->disks))
1723                 MD_BUG();
1724         mddev->raid_disks = 0;
1725         mddev->major_version = 0;
1726 }
1727
/*
 * Print one disk descriptor: number, (major,minor), raid_disk and
 * state.  No log level is given here; callers print a prefix first.
 */
static void print_desc(mdp_disk_t *desc)
{
        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
                desc->major,desc->minor,desc->raid_disk,desc->state);
}
1733
1734 static void print_sb_90(mdp_super_t *sb)
1735 {
1736         int i;
1737
1738         printk(KERN_INFO 
1739                 "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1740                 sb->major_version, sb->minor_version, sb->patch_version,
1741                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1742                 sb->ctime);
1743         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1744                 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1745                 sb->md_minor, sb->layout, sb->chunk_size);
1746         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1747                 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1748                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1749                 sb->failed_disks, sb->spare_disks,
1750                 sb->sb_csum, (unsigned long)sb->events_lo);
1751
1752         printk(KERN_INFO);
1753         for (i = 0; i < MD_SB_DISKS; i++) {
1754                 mdp_disk_t *desc;
1755
1756                 desc = sb->disks + i;
1757                 if (desc->number || desc->major || desc->minor ||
1758                     desc->raid_disk || (desc->state && (desc->state != 4))) {
1759                         printk("     D %2d: ", i);
1760                         print_desc(desc);
1761                 }
1762         }
1763         printk(KERN_INFO "md:     THIS: ");
1764         print_desc(&sb->this_disk);
1765 }
1766
1767 static void print_sb_1(struct mdp_superblock_1 *sb)
1768 {
1769         __u8 *uuid;
1770
1771         uuid = sb->set_uuid;
1772         printk(KERN_INFO "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1773                         ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1774                KERN_INFO "md:    Name: \"%s\" CT:%llu\n",
1775                 le32_to_cpu(sb->major_version),
1776                 le32_to_cpu(sb->feature_map),
1777                 uuid[0], uuid[1], uuid[2], uuid[3],
1778                 uuid[4], uuid[5], uuid[6], uuid[7],
1779                 uuid[8], uuid[9], uuid[10], uuid[11],
1780                 uuid[12], uuid[13], uuid[14], uuid[15],
1781                 sb->set_name,
1782                 (unsigned long long)le64_to_cpu(sb->ctime)
1783                        & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1784
1785         uuid = sb->device_uuid;
1786         printk(KERN_INFO "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1787                         " RO:%llu\n"
1788                KERN_INFO "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1789                         ":%02x%02x%02x%02x%02x%02x\n"
1790                KERN_INFO "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1791                KERN_INFO "md:         (MaxDev:%u) \n",
1792                 le32_to_cpu(sb->level),
1793                 (unsigned long long)le64_to_cpu(sb->size),
1794                 le32_to_cpu(sb->raid_disks),
1795                 le32_to_cpu(sb->layout),
1796                 le32_to_cpu(sb->chunksize),
1797                 (unsigned long long)le64_to_cpu(sb->data_offset),
1798                 (unsigned long long)le64_to_cpu(sb->data_size),
1799                 (unsigned long long)le64_to_cpu(sb->super_offset),
1800                 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1801                 le32_to_cpu(sb->dev_number),
1802                 uuid[0], uuid[1], uuid[2], uuid[3],
1803                 uuid[4], uuid[5], uuid[6], uuid[7],
1804                 uuid[8], uuid[9], uuid[10], uuid[11],
1805                 uuid[12], uuid[13], uuid[14], uuid[15],
1806                 sb->devflags,
1807                 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1808                 (unsigned long long)le64_to_cpu(sb->events),
1809                 (unsigned long long)le64_to_cpu(sb->resync_offset),
1810                 le32_to_cpu(sb->sb_csum),
1811                 le32_to_cpu(sb->max_dev)
1812                 );
1813 }
1814
1815 static void print_rdev(mdk_rdev_t *rdev, int major_version)
1816 {
1817         char b[BDEVNAME_SIZE];
1818         printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1819                 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1820                 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1821                 rdev->desc_nr);
1822         if (rdev->sb_loaded) {
1823                 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1824                 switch (major_version) {
1825                 case 0:
1826                         print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1827                         break;
1828                 case 1:
1829                         print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1830                         break;
1831                 }
1832         } else
1833                 printk(KERN_INFO "md: no rdev superblock!\n");
1834 }
1835
1836 static void md_print_devices(void)
1837 {
1838         struct list_head *tmp;
1839         mdk_rdev_t *rdev;
1840         mddev_t *mddev;
1841         char b[BDEVNAME_SIZE];
1842
1843         printk("\n");
1844         printk("md:     **********************************\n");
1845         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1846         printk("md:     **********************************\n");
1847         for_each_mddev(mddev, tmp) {
1848
1849                 if (mddev->bitmap)
1850                         bitmap_print_sb(mddev->bitmap);
1851                 else
1852                         printk("%s: ", mdname(mddev));
1853                 list_for_each_entry(rdev, &mddev->disks, same_set)
1854                         printk("<%s>", bdevname(rdev->bdev,b));
1855                 printk("\n");
1856
1857                 list_for_each_entry(rdev, &mddev->disks, same_set)
1858                         print_rdev(rdev, mddev->major_version);
1859         }
1860         printk("md:     **********************************\n");
1861         printk("\n");
1862 }
1863
1864
1865 static void sync_sbs(mddev_t * mddev, int nospares)
1866 {
1867         /* Update each superblock (in-memory image), but
1868          * if we are allowed to, skip spares which already
1869          * have the right event counter, or have one earlier
1870          * (which would mean they aren't being marked as dirty
1871          * with the rest of the array)
1872          */
1873         mdk_rdev_t *rdev;
1874
1875         list_for_each_entry(rdev, &mddev->disks, same_set) {
1876                 if (rdev->sb_events == mddev->events ||
1877                     (nospares &&
1878                      rdev->raid_disk < 0 &&
1879                      (rdev->sb_events&1)==0 &&
1880                      rdev->sb_events+1 == mddev->events)) {
1881                         /* Don't update this superblock */
1882                         rdev->sb_loaded = 2;
1883                 } else {
1884                         super_types[mddev->major_version].
1885                                 sync_super(mddev, rdev);
1886                         rdev->sb_loaded = 1;
1887                 }
1888         }
1889 }
1890
1891 static void md_update_sb(mddev_t * mddev, int force_change)
1892 {
1893         mdk_rdev_t *rdev;
1894         int sync_req;
1895         int nospares = 0;
1896
1897         if (mddev->external)
1898                 return;
1899 repeat:
1900         spin_lock_irq(&mddev->write_lock);
1901
1902         set_bit(MD_CHANGE_PENDING, &mddev->flags);
1903         if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1904                 force_change = 1;
1905         if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1906                 /* just a clean<-> dirty transition, possibly leave spares alone,
1907                  * though if events isn't the right even/odd, we will have to do
1908                  * spares after all
1909                  */
1910                 nospares = 1;
1911         if (force_change)
1912                 nospares = 0;
1913         if (mddev->degraded)
1914                 /* If the array is degraded, then skipping spares is both
1915                  * dangerous and fairly pointless.
1916                  * Dangerous because a device that was removed from the array
1917                  * might have a event_count that still looks up-to-date,
1918                  * so it can be re-added without a resync.
1919                  * Pointless because if there are any spares to skip,
1920                  * then a recovery will happen and soon that array won't
1921                  * be degraded any more and the spare can go back to sleep then.
1922                  */
1923                 nospares = 0;
1924
1925         sync_req = mddev->in_sync;
1926         mddev->utime = get_seconds();
1927
1928         /* If this is just a dirty<->clean transition, and the array is clean
1929          * and 'events' is odd, we can roll back to the previous clean state */
1930         if (nospares
1931             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1932             && (mddev->events & 1)
1933             && mddev->events != 1)
1934                 mddev->events--;
1935         else {
1936                 /* otherwise we have to go forward and ... */
1937                 mddev->events ++;
1938                 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1939                         /* .. if the array isn't clean, insist on an odd 'events' */
1940                         if ((mddev->events&1)==0) {
1941                                 mddev->events++;
1942                                 nospares = 0;
1943                         }
1944                 } else {
1945                         /* otherwise insist on an even 'events' (for clean states) */
1946                         if ((mddev->events&1)) {
1947                                 mddev->events++;
1948                                 nospares = 0;
1949                         }
1950                 }
1951         }
1952
1953         if (!mddev->events) {
1954                 /*
1955                  * oops, this 64-bit counter should never wrap.
1956                  * Either we are in around ~1 trillion A.C., assuming
1957                  * 1 reboot per second, or we have a bug:
1958                  */
1959                 MD_BUG();
1960                 mddev->events --;
1961         }
1962
1963         /*
1964          * do not write anything to disk if using
1965          * nonpersistent superblocks
1966          */
1967         if (!mddev->persistent) {
1968                 if (!mddev->external)
1969                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1970
1971                 spin_unlock_irq(&mddev->write_lock);
1972                 wake_up(&mddev->sb_wait);
1973                 return;
1974         }
1975         sync_sbs(mddev, nospares);
1976         spin_unlock_irq(&mddev->write_lock);
1977
1978         dprintk(KERN_INFO 
1979                 "md: updating %s RAID superblock on device (in sync %d)\n",
1980                 mdname(mddev),mddev->in_sync);
1981
1982         bitmap_update_sb(mddev->bitmap);
1983         list_for_each_entry(rdev, &mddev->disks, same_set) {
1984                 char b[BDEVNAME_SIZE];
1985                 dprintk(KERN_INFO "md: ");
1986                 if (rdev->sb_loaded != 1)
1987                         continue; /* no noise on spare devices */
1988                 if (test_bit(Faulty, &rdev->flags))
1989                         dprintk("(skipping faulty ");
1990
1991                 dprintk("%s ", bdevname(rdev->bdev,b));
1992                 if (!test_bit(Faulty, &rdev->flags)) {
1993                         md_super_write(mddev,rdev,
1994                                        rdev->sb_start, rdev->sb_size,
1995                                        rdev->sb_page);
1996                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1997                                 bdevname(rdev->bdev,b),
1998                                 (unsigned long long)rdev->sb_start);
1999                         rdev->sb_events = mddev->events;
2000
2001                 } else
2002                         dprintk(")\n");
2003                 if (mddev->level == LEVEL_MULTIPATH)
2004                         /* only need to write one superblock... */
2005                         break;
2006         }
2007         md_super_wait(mddev);
2008         /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2009
2010         spin_lock_irq(&mddev->write_lock);
2011         if (mddev->in_sync != sync_req ||
2012             test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2013                 /* have to write it out again */
2014                 spin_unlock_irq(&mddev->write_lock);
2015                 goto repeat;
2016         }
2017         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2018         spin_unlock_irq(&mddev->write_lock);
2019         wake_up(&mddev->sb_wait);
2020
2021 }
2022
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 *
 * Returns 1 if cmd (the text written into a sysfs file) is the same
 * as str, or is str followed by exactly one newline; 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
        /* skip the common prefix */
        for (; *cmd != '\0' && *cmd == *str; cmd++, str++)
                ;

        /* tolerate a single trailing newline on the written word */
        if (*cmd == '\n')
                cmd++;

        /* both strings must now be exhausted */
        return *cmd == '\0' && *str == '\0';
}
2042
2043 struct rdev_sysfs_entry {
2044         struct attribute attr;
2045         ssize_t (*show)(mdk_rdev_t *, char *);
2046         ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2047 };
2048
2049 static ssize_t
2050 state_show(mdk_rdev_t *rdev, char *page)
2051 {
2052         char *sep = "";
2053         size_t len = 0;
2054
2055         if (test_bit(Faulty, &rdev->flags)) {
2056                 len+= sprintf(page+len, "%sfaulty",sep);
2057                 sep = ",";
2058         }
2059         if (test_bit(In_sync, &rdev->flags)) {
2060                 len += sprintf(page+len, "%sin_sync",sep);
2061                 sep = ",";
2062         }
2063         if (test_bit(WriteMostly, &rdev->flags)) {
2064                 len += sprintf(page+len, "%swrite_mostly",sep);
2065                 sep = ",";
2066         }
2067         if (test_bit(Blocked, &rdev->flags)) {
2068                 len += sprintf(page+len, "%sblocked", sep);
2069                 sep = ",";
2070         }
2071         if (!test_bit(Faulty, &rdev->flags) &&
2072             !test_bit(In_sync, &rdev->flags)) {
2073                 len += sprintf(page+len, "%sspare", sep);
2074                 sep = ",";
2075         }
2076         return len+sprintf(page+len, "\n");
2077 }
2078
2079 static ssize_t
2080 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2081 {
2082         /* can write
2083          *  faulty  - simulates and error
2084          *  remove  - disconnects the device
2085          *  writemostly - sets write_mostly
2086          *  -writemostly - clears write_mostly
2087          *  blocked - sets the Blocked flag
2088          *  -blocked - clears the Blocked flag
2089          */
2090         int err = -EINVAL;
2091         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2092                 md_error(rdev->mddev, rdev);
2093                 err = 0;
2094         } else if (cmd_match(buf, "remove")) {
2095                 if (rdev->raid_disk >= 0)
2096                         err = -EBUSY;
2097                 else {
2098                         mddev_t *mddev = rdev->mddev;
2099                         kick_rdev_from_array(rdev);
2100                         if (mddev->pers)
2101                                 md_update_sb(mddev, 1);
2102                         md_new_event(mddev);
2103                         err = 0;
2104                 }
2105         } else if (cmd_match(buf, "writemostly")) {
2106                 set_bit(WriteMostly, &rdev->flags);
2107                 err = 0;
2108         } else if (cmd_match(buf, "-writemostly")) {
2109                 clear_bit(WriteMostly, &rdev->flags);
2110                 err = 0;
2111         } else if (cmd_match(buf, "blocked")) {
2112                 set_bit(Blocked, &rdev->flags);
2113                 err = 0;
2114         } else if (cmd_match(buf, "-blocked")) {
2115                 clear_bit(Blocked, &rdev->flags);
2116                 wake_up(&rdev->blocked_wait);
2117                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2118                 md_wakeup_thread(rdev->mddev->thread);
2119
2120                 err = 0;
2121         }
2122         if (!err && rdev->sysfs_state)
2123                 sysfs_notify_dirent(rdev->sysfs_state);
2124         return err ? err : len;
2125 }
2126 static struct rdev_sysfs_entry rdev_state =
2127 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2128
2129 static ssize_t
2130 errors_show(mdk_rdev_t *rdev, char *page)
2131 {
2132         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2133 }
2134
2135 static ssize_t
2136 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2137 {
2138         char *e;
2139         unsigned long n = simple_strtoul(buf, &e, 10);
2140         if (*buf && (*e == 0 || *e == '\n')) {
2141                 atomic_set(&rdev->corrected_errors, n);
2142                 return len;
2143         }
2144         return -EINVAL;
2145 }
2146 static struct rdev_sysfs_entry rdev_errors =
2147 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2148
2149 static ssize_t
2150 slot_show(mdk_rdev_t *rdev, char *page)
2151 {
2152         if (rdev->raid_disk < 0)
2153                 return sprintf(page, "none\n");
2154         else
2155                 return sprintf(page, "%d\n", rdev->raid_disk);
2156 }
2157
2158 static ssize_t
2159 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2160 {
2161         char *e;
2162         int err;
2163         char nm[20];
2164         int slot = simple_strtoul(buf, &e, 10);
2165         if (strncmp(buf, "none", 4)==0)
2166                 slot = -1;
2167         else if (e==buf || (*e && *e!= '\n'))
2168                 return -EINVAL;
2169         if (rdev->mddev->pers && slot == -1) {
2170                 /* Setting 'slot' on an active array requires also
2171                  * updating the 'rd%d' link, and communicating
2172                  * with the personality with ->hot_*_disk.
2173                  * For now we only support removing
2174                  * failed/spare devices.  This normally happens automatically,
2175                  * but not when the metadata is externally managed.
2176                  */
2177                 if (rdev->raid_disk == -1)
2178                         return -EEXIST;
2179                 /* personality does all needed checks */
2180                 if (rdev->mddev->pers->hot_add_disk == NULL)
2181                         return -EINVAL;
2182                 err = rdev->mddev->pers->
2183                         hot_remove_disk(rdev->mddev, rdev->raid_disk);
2184                 if (err)
2185                         return err;
2186                 sprintf(nm, "rd%d", rdev->raid_disk);
2187                 sysfs_remove_link(&rdev->mddev->kobj, nm);
2188                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2189                 md_wakeup_thread(rdev->mddev->thread);
2190         } else if (rdev->mddev->pers) {
2191                 mdk_rdev_t *rdev2;
2192                 /* Activating a spare .. or possibly reactivating
2193                  * if we every get bitmaps working here.
2194                  */
2195
2196                 if (rdev->raid_disk != -1)
2197                         return -EBUSY;
2198
2199                 if (rdev->mddev->pers->hot_add_disk == NULL)
2200                         return -EINVAL;
2201
2202                 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2203                         if (rdev2->raid_disk == slot)
2204                                 return -EEXIST;
2205
2206                 rdev->raid_disk = slot;
2207                 if (test_bit(In_sync, &rdev->flags))
2208                         rdev->saved_raid_disk = slot;
2209                 else
2210                         rdev->saved_raid_disk = -1;
2211                 err = rdev->mddev->pers->
2212                         hot_add_disk(rdev->mddev, rdev);
2213                 if (err) {
2214                         rdev->raid_disk = -1;
2215                         return err;
2216                 } else
2217                         sysfs_notify_dirent(rdev->sysfs_state);
2218                 sprintf(nm, "rd%d", rdev->raid_disk);
2219                 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2220                         printk(KERN_WARNING
2221                                "md: cannot register "
2222                                "%s for %s\n",
2223                                nm, mdname(rdev->mddev));
2224
2225                 /* don't wakeup anyone, leave that to userspace. */
2226         } else {
2227                 if (slot >= rdev->mddev->raid_disks)
2228                         return -ENOSPC;
2229                 rdev->raid_disk = slot;
2230                 /* assume it is working */
2231                 clear_bit(Faulty, &rdev->flags);
2232                 clear_bit(WriteMostly, &rdev->flags);
2233                 set_bit(In_sync, &rdev->flags);
2234                 sysfs_notify_dirent(rdev->sysfs_state);
2235         }
2236         return len;
2237 }
2238
2239
2240 static struct rdev_sysfs_entry rdev_slot =
2241 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2242
2243 static ssize_t
2244 offset_show(mdk_rdev_t *rdev, char *page)
2245 {
2246         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2247 }
2248
2249 static ssize_t
2250 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2251 {
2252         char *e;
2253         unsigned long long offset = simple_strtoull(buf, &e, 10);
2254         if (e==buf || (*e && *e != '\n'))
2255                 return -EINVAL;
2256         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2257                 return -EBUSY;
2258         if (rdev->sectors && rdev->mddev->external)
2259                 /* Must set offset before size, so overlap checks
2260                  * can be sane */
2261                 return -EBUSY;
2262         rdev->data_offset = offset;
2263         return len;
2264 }
2265
2266 static struct rdev_sysfs_entry rdev_offset =
2267 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2268
2269 static ssize_t
2270 rdev_size_show(mdk_rdev_t *rdev, char *page)
2271 {
2272         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2273 }
2274
2275 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2276 {
2277         /* check if two start/length pairs overlap */
2278         if (s1+l1 <= s2)
2279                 return 0;
2280         if (s2+l2 <= s1)
2281                 return 0;
2282         return 1;
2283 }
2284
2285 static ssize_t
2286 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2287 {
2288         mddev_t *my_mddev = rdev->mddev;
2289         sector_t oldsectors = rdev->sectors;
2290         unsigned long long sectors;
2291
2292         if (strict_strtoull(buf, 10, &sectors) < 0)
2293                 return -EINVAL;
2294         sectors *= 2;
2295         if (my_mddev->pers && rdev->raid_disk >= 0) {
2296                 if (my_mddev->persistent) {
2297                         sectors = super_types[my_mddev->major_version].
2298                                 rdev_size_change(rdev, sectors);
2299                         if (!sectors)
2300                                 return -EBUSY;
2301                 } else if (!sectors)
2302                         sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2303                                 rdev->data_offset;
2304         }
2305         if (sectors < my_mddev->dev_sectors)
2306                 return -EINVAL; /* component must fit device */
2307
2308         rdev->sectors = sectors;
2309         if (sectors > oldsectors && my_mddev->external) {
2310                 /* need to check that all other rdevs with the same ->bdev
2311                  * do not overlap.  We need to unlock the mddev to avoid
2312                  * a deadlock.  We have already changed rdev->sectors, and if
2313                  * we have to change it back, we will have the lock again.
2314                  */
2315                 mddev_t *mddev;
2316                 int overlap = 0;
2317                 struct list_head *tmp;
2318
2319                 mddev_unlock(my_mddev);
2320                 for_each_mddev(mddev, tmp) {
2321                         mdk_rdev_t *rdev2;
2322
2323                         mddev_lock(mddev);
2324                         list_for_each_entry(rdev2, &mddev->disks, same_set)
2325                                 if (test_bit(AllReserved, &rdev2->flags) ||
2326                                     (rdev->bdev == rdev2->bdev &&
2327                                      rdev != rdev2 &&
2328                                      overlaps(rdev->data_offset, rdev->sectors,
2329                                               rdev2->data_offset,
2330                                               rdev2->sectors))) {
2331                                         overlap = 1;
2332                                         break;
2333                                 }
2334                         mddev_unlock(mddev);
2335                         if (overlap) {
2336                                 mddev_put(mddev);
2337                                 break;
2338                         }
2339                 }
2340                 mddev_lock(my_mddev);
2341                 if (overlap) {
2342                         /* Someone else could have slipped in a size
2343                          * change here, but doing so is just silly.
2344                          * We put oldsectors back because we *know* it is
2345                          * safe, and trust userspace not to race with
2346                          * itself
2347                          */
2348                         rdev->sectors = oldsectors;
2349                         return -EBUSY;
2350                 }
2351         }
2352         return len;
2353 }
2354
2355 static struct rdev_sysfs_entry rdev_size =
2356 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2357
2358 static struct attribute *rdev_default_attrs[] = {
2359         &rdev_state.attr,
2360         &rdev_errors.attr,
2361         &rdev_slot.attr,
2362         &rdev_offset.attr,
2363         &rdev_size.attr,
2364         NULL,
2365 };
2366 static ssize_t
2367 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2368 {
2369         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2370         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2371         mddev_t *mddev = rdev->mddev;
2372         ssize_t rv;
2373
2374         if (!entry->show)
2375                 return -EIO;
2376
2377         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2378         if (!rv) {
2379                 if (rdev->mddev == NULL)
2380                         rv = -EBUSY;
2381                 else
2382                         rv = entry->show(rdev, page);
2383                 mddev_unlock(mddev);
2384         }
2385         return rv;
2386 }
2387
2388 static ssize_t
2389 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2390               const char *page, size_t length)
2391 {
2392         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2393         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2394         ssize_t rv;
2395         mddev_t *mddev = rdev->mddev;
2396
2397         if (!entry->store)
2398                 return -EIO;
2399         if (!capable(CAP_SYS_ADMIN))
2400                 return -EACCES;
2401         rv = mddev ? mddev_lock(mddev): -EBUSY;
2402         if (!rv) {
2403                 if (rdev->mddev == NULL)
2404                         rv = -EBUSY;
2405                 else
2406                         rv = entry->store(rdev, page, length);
2407                 mddev_unlock(mddev);
2408         }
2409         return rv;
2410 }
2411
2412 static void rdev_free(struct kobject *ko)
2413 {
2414         mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2415         kfree(rdev);
2416 }
2417 static struct sysfs_ops rdev_sysfs_ops = {
2418         .show           = rdev_attr_show,
2419         .store          = rdev_attr_store,
2420 };
2421 static struct kobj_type rdev_ktype = {
2422         .release        = rdev_free,
2423         .sysfs_ops      = &rdev_sysfs_ops,
2424         .default_attrs  = rdev_default_attrs,
2425 };
2426
2427 /*
2428  * Import a device. If 'super_format' >= 0, then sanity check the superblock
2429  *
2430  * mark the device faulty if:
2431  *
2432  *   - the device is nonexistent (zero size)
2433  *   - the device has no valid superblock
2434  *
2435  * a faulty rdev _never_ has rdev->sb set.
2436  */
2437 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2438 {
2439         char b[BDEVNAME_SIZE];
2440         int err;
2441         mdk_rdev_t *rdev;
2442         sector_t size;
2443
2444         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2445         if (!rdev) {
2446                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2447                 return ERR_PTR(-ENOMEM);
2448         }
2449
2450         if ((err = alloc_disk_sb(rdev)))
2451                 goto abort_free;
2452
2453         err = lock_rdev(rdev, newdev, super_format == -2);
2454         if (err)
2455                 goto abort_free;
2456
2457         kobject_init(&rdev->kobj, &rdev_ktype);
2458
2459         rdev->desc_nr = -1;
2460         rdev->saved_raid_disk = -1;
2461         rdev->raid_disk = -1;
2462         rdev->flags = 0;
2463         rdev->data_offset = 0;
2464         rdev->sb_events = 0;
2465         atomic_set(&rdev->nr_pending, 0);
2466         atomic_set(&rdev->read_errors, 0);
2467         atomic_set(&rdev->corrected_errors, 0);
2468
2469         size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2470         if (!size) {
2471                 printk(KERN_WARNING 
2472                         "md: %s has zero or unknown size, marking faulty!\n",
2473                         bdevname(rdev->bdev,b));
2474                 err = -EINVAL;
2475                 goto abort_free;
2476         }
2477
2478         if (super_format >= 0) {
2479                 err = super_types[super_format].
2480                         load_super(rdev, NULL, super_minor);
2481                 if (err == -EINVAL) {
2482                         printk(KERN_WARNING
2483                                 "md: %s does not have a valid v%d.%d "
2484                                "superblock, not importing!\n",
2485                                 bdevname(rdev->bdev,b),
2486                                super_format, super_minor);
2487                         goto abort_free;
2488                 }
2489                 if (err < 0) {
2490                         printk(KERN_WARNING 
2491                                 "md: could not read %s's sb, not importing!\n",
2492                                 bdevname(rdev->bdev,b));
2493                         goto abort_free;
2494                 }
2495         }
2496
2497         INIT_LIST_HEAD(&rdev->same_set);
2498         init_waitqueue_head(&rdev->blocked_wait);
2499
2500         return rdev;
2501
2502 abort_free:
2503         if (rdev->sb_page) {
2504                 if (rdev->bdev)
2505                         unlock_rdev(rdev);
2506                 free_disk_sb(rdev);
2507         }
2508         kfree(rdev);
2509         return ERR_PTR(err);
2510 }
2511
2512 /*
2513  * Check a full RAID array for plausibility
2514  */
2515
2516
2517 static void analyze_sbs(mddev_t * mddev)
2518 {
2519         int i;
2520         mdk_rdev_t *rdev, *freshest, *tmp;
2521         char b[BDEVNAME_SIZE];
2522
2523         freshest = NULL;
2524         rdev_for_each(rdev, tmp, mddev)
2525                 switch (super_types[mddev->major_version].
2526                         load_super(rdev, freshest, mddev->minor_version)) {
2527                 case 1:
2528                         freshest = rdev;
2529                         break;
2530                 case 0:
2531                         break;
2532                 default:
2533                         printk( KERN_ERR \
2534                                 "md: fatal superblock inconsistency in %s"
2535                                 " -- removing from array\n", 
2536                                 bdevname(rdev->bdev,b));
2537                         kick_rdev_from_array(rdev);
2538                 }
2539
2540
2541         super_types[mddev->major_version].
2542                 validate_super(mddev, freshest);
2543
2544         i = 0;
2545         rdev_for_each(rdev, tmp, mddev) {
2546                 if (rdev->desc_nr >= mddev->max_disks ||
2547                     i > mddev->max_disks) {
2548                         printk(KERN_WARNING
2549                                "md: %s: %s: only %d devices permitted\n",
2550                                mdname(mddev), bdevname(rdev->bdev, b),
2551                                mddev->max_disks);
2552                         kick_rdev_from_array(rdev);
2553                         continue;
2554                 }
2555                 if (rdev != freshest)
2556                         if (super_types[mddev->major_version].
2557                             validate_super(mddev, rdev)) {
2558                                 printk(KERN_WARNING "md: kicking non-fresh %s"
2559                                         " from array!\n",
2560                                         bdevname(rdev->bdev,b));
2561                                 kick_rdev_from_array(rdev);
2562                                 continue;
2563                         }
2564                 if (mddev->level == LEVEL_MULTIPATH) {
2565                         rdev->desc_nr = i++;
2566                         rdev->raid_disk = rdev->desc_nr;
2567                         set_bit(In_sync, &rdev->flags);
2568                 } else if (rdev->raid_disk >= mddev->raid_disks) {
2569                         rdev->raid_disk = -1;
2570                         clear_bit(In_sync, &rdev->flags);
2571                 }
2572         }
2573
2574
2575
2576         if (mddev->recovery_cp != MaxSector &&
2577             mddev->level >= 1)
2578                 printk(KERN_ERR "md: %s: raid array is not clean"
2579                        " -- starting background reconstruction\n",
2580                        mdname(mddev));
2581
2582 }
2583
2584 static void md_safemode_timeout(unsigned long data);
2585
2586 static ssize_t
2587 safe_delay_show(mddev_t *mddev, char *page)
2588 {
2589         int msec = (mddev->safemode_delay*1000)/HZ;
2590         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2591 }
2592 static ssize_t
2593 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2594 {
2595         int scale=1;
2596         int dot=0;
2597         int i;
2598         unsigned long msec;
2599         char buf[30];
2600
2601         /* remove a period, and count digits after it */
2602         if (len >= sizeof(buf))
2603                 return -EINVAL;
2604         strlcpy(buf, cbuf, sizeof(buf));
2605         for (i=0; i<len; i++) {
2606                 if (dot) {
2607                         if (isdigit(buf[i])) {
2608                                 buf[i-1] = buf[i];
2609                                 scale *= 10;
2610                         }
2611                         buf[i] = 0;
2612                 } else if (buf[i] == '.') {
2613                         dot=1;
2614                         buf[i] = 0;
2615                 }
2616         }
2617         if (strict_strtoul(buf, 10, &msec) < 0)
2618                 return -EINVAL;
2619         msec = (msec * 1000) / scale;
2620         if (msec == 0)
2621                 mddev->safemode_delay = 0;
2622         else {
2623                 unsigned long old_delay = mddev->safemode_delay;
2624                 mddev->safemode_delay = (msec*HZ)/1000;
2625                 if (mddev->safemode_delay == 0)
2626                         mddev->safemode_delay = 1;
2627                 if (mddev->safemode_delay < old_delay)
2628                         md_safemode_timeout((unsigned long)mddev);
2629         }
2630         return len;
2631 }
2632 static struct md_sysfs_entry md_safe_delay =
2633 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2634
2635 static ssize_t
2636 level_show(mddev_t *mddev, char *page)
2637 {
2638         struct mdk_personality *p = mddev->pers;
2639         if (p)
2640                 return sprintf(page, "%s\n", p->name);
2641         else if (mddev->clevel[0])
2642                 return sprintf(page, "%s\n", mddev->clevel);
2643         else if (mddev->level != LEVEL_NONE)
2644                 return sprintf(page, "%d\n", mddev->level);
2645         else
2646                 return 0;
2647 }
2648
2649 static ssize_t
2650 level_store(mddev_t *mddev, const char *buf, size_t len)
2651 {
2652         ssize_t rv = len;
2653         if (mddev->pers)
2654                 return -EBUSY;
2655         if (len == 0)
2656                 return 0;
2657         if (len >= sizeof(mddev->clevel))
2658                 return -ENOSPC;
2659         strncpy(mddev->clevel, buf, len);
2660         if (mddev->clevel[len-1] == '\n')
2661                 len--;
2662         mddev->clevel[len] = 0;
2663         mddev->level = LEVEL_NONE;
2664         return rv;
2665 }
2666
2667 static struct md_sysfs_entry md_level =
2668 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2669
2670
2671 static ssize_t
2672 layout_show(mddev_t *mddev, char *page)
2673 {
2674         /* just a number, not meaningful for all levels */
2675         if (mddev->reshape_position != MaxSector &&
2676             mddev->layout != mddev->new_layout)
2677                 return sprintf(page, "%d (%d)\n",
2678                                mddev->new_layout, mddev->layout);
2679         return sprintf(page, "%d\n", mddev->layout);
2680 }
2681
2682 static ssize_t
2683 layout_store(mddev_t *mddev, const char *buf, size_t len)
2684 {
2685         char *e;
2686         unsigned long n = simple_strtoul(buf, &e, 10);
2687
2688         if (!*buf || (*e && *e != '\n'))
2689                 return -EINVAL;
2690
2691         if (mddev->pers)
2692                 return -EBUSY;
2693
2694         mddev->new_layout = n;
2695         if (mddev->reshape_position == MaxSector)
2696                 mddev->layout = n;
2697         return len;
2698 }
2699 static struct md_sysfs_entry md_layout =
2700 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2701
2702
2703 static ssize_t
2704 raid_disks_show(mddev_t *mddev, char *page)
2705 {
2706         if (mddev->raid_disks == 0)
2707                 return 0;
2708         if (mddev->reshape_position != MaxSector &&
2709             mddev->delta_disks != 0)
2710                 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2711                                mddev->raid_disks - mddev->delta_disks);
2712         return sprintf(page, "%d\n", mddev->raid_disks);
2713 }
2714
2715 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2716
2717 static ssize_t
2718 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2719 {
2720         char *e;
2721         int rv = 0;
2722         unsigned long n = simple_strtoul(buf, &e, 10);
2723
2724         if (!*buf || (*e && *e != '\n'))
2725                 return -EINVAL;
2726
2727         if (mddev->pers)
2728                 rv = update_raid_disks(mddev, n);
2729         else if (mddev->reshape_position != MaxSector) {
2730                 int olddisks = mddev->raid_disks - mddev->delta_disks;
2731                 mddev->delta_disks = n - olddisks;
2732                 mddev->raid_disks = n;
2733         } else
2734                 mddev->raid_disks = n;
2735         return rv ? rv : len;
2736 }
2737 static struct md_sysfs_entry md_raid_disks =
2738 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2739
2740 static ssize_t
2741 chunk_size_show(mddev_t *mddev, char *page)
2742 {
2743         if (mddev->reshape_position != MaxSector &&
2744             mddev->chunk_size != mddev->new_chunk)
2745                 return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2746                                mddev->chunk_size);
2747         return sprintf(page, "%d\n", mddev->chunk_size);
2748 }
2749
2750 static ssize_t
2751 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2752 {
2753         /* can only set chunk_size if array is not yet active */
2754         char *e;
2755         unsigned long n = simple_strtoul(buf, &e, 10);
2756
2757         if (!*buf || (*e && *e != '\n'))
2758                 return -EINVAL;
2759
2760         if (mddev->pers)
2761                 return -EBUSY;
2762
2763         mddev->new_chunk = n;
2764         if (mddev->reshape_position == MaxSector)
2765                 mddev->chunk_size = n;
2766         return len;
2767 }
2768 static struct md_sysfs_entry md_chunk_size =
2769 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2770
2771 static ssize_t
2772 resync_start_show(mddev_t *mddev, char *page)
2773 {
2774         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2775 }
2776
2777 static ssize_t
2778 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2779 {
2780         char *e;
2781         unsigned long long n = simple_strtoull(buf, &e, 10);
2782
2783         if (mddev->pers)
2784                 return -EBUSY;
2785         if (!*buf || (*e && *e != '\n'))
2786                 return -EINVAL;
2787
2788         mddev->recovery_cp = n;
2789         return len;
2790 }
2791 static struct md_sysfs_entry md_resync_start =
2792 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2793
/*
 * The array state can be one of:
 *
 * clear
 *     No devices, no size, no level.
 *     Writing it is equivalent to the STOP_ARRAY ioctl.
 * inactive
 *     May have some settings, but the array is not active;
 *     all IO results in error.
 *     Writing it stops the array without tearing it down.
 * suspended (not supported yet)
 *     All IO requests will block.  The array can be reconfigured.
 *     Writing this, if accepted, will block until the array is quiescent.
 * readonly
 *     No resync can happen, no superblocks get written,
 *     write requests fail.
 * read-auto
 *     Like readonly, but behaves like 'clean' on a write request.
 *
 * clean - no pending writes, but otherwise active.
 *     When written to an inactive array, starts without resync.
 *     If a write request arrives then
 *       if metadata is known, mark 'dirty' and switch to 'active';
 *       if not known, block and switch to write-pending.
 *     If written to an active array that has pending writes, then fails.
 * active
 *     Fully active: IO and resync can be happening.
 *     When written to an inactive array, starts with resync.
 *
 * write-pending
 *     Clean, but writes are blocked waiting for 'active' to be written.
 *
 * active-idle
 *     Like active, but no writes have been seen for a while (100msec).
 *
 * The enum values index array_states[]; bad_word is the index of the
 * terminating NULL.
 */
enum array_state {
	clear,
	inactive,
	suspended,
	readonly,
	read_auto,
	clean,
	active,
	write_pending,
	active_idle,
	bad_word
};

static char *array_states[] = {
	"clear",
	"inactive",
	"suspended",
	"readonly",
	"read-auto",
	"clean",
	"active",
	"write-pending",
	"active-idle",
	NULL
};
2835
/*
 * Return the index of the first entry of the NULL-terminated 'list'
 * that cmd_match() accepts for 'word', or the index of the terminating
 * NULL if there is none.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;
	return idx;
}
2844
2845 static ssize_t
2846 array_state_show(mddev_t *mddev, char *page)
2847 {
2848         enum array_state st = inactive;
2849
2850         if (mddev->pers)
2851                 switch(mddev->ro) {
2852                 case 1:
2853                         st = readonly;
2854                         break;
2855                 case 2:
2856                         st = read_auto;
2857                         break;
2858                 case 0:
2859                         if (mddev->in_sync)
2860                                 st = clean;
2861                         else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
2862                                 st = write_pending;
2863                         else if (mddev->safemode)
2864                                 st = active_idle;
2865                         else
2866                                 st = active;
2867                 }
2868         else {
2869                 if (list_empty(&mddev->disks) &&
2870                     mddev->raid_disks == 0 &&
2871                     mddev->dev_sectors == 0)
2872                         st = clear;
2873                 else
2874                         st = inactive;
2875         }
2876         return sprintf(page, "%s\n", array_states[st]);
2877 }
2878
2879 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
2880 static int do_md_run(mddev_t * mddev);
2881 static int restart_array(mddev_t *mddev);
2882
2883 static ssize_t
2884 array_state_store(mddev_t *mddev, const char *buf, size_t len)
2885 {
2886         int err = -EINVAL;
2887         enum array_state st = match_word(buf, array_states);
2888         switch(st) {
2889         case bad_word:
2890                 break;
2891         case clear:
2892                 /* stopping an active array */
2893                 if (atomic_read(&mddev->openers) > 0)
2894                         return -EBUSY;
2895                 err = do_md_stop(mddev, 0, 0);
2896                 break;
2897         case inactive:
2898                 /* stopping an active array */
2899                 if (mddev->pers) {
2900                         if (atomic_read(&mddev->openers) > 0)
2901                                 return -EBUSY;
2902                         err = do_md_stop(mddev, 2, 0);
2903                 } else
2904                         err = 0; /* already inactive */
2905                 break;
2906         case suspended:
2907                 break; /* not supported yet */
2908         case readonly:
2909                 if (mddev->pers)
2910                         err = do_md_stop(mddev, 1, 0);
2911                 else {
2912                         mddev->ro = 1;
2913                         set_disk_ro(mddev->gendisk, 1);
2914                         err = do_md_run(mddev);
2915                 }
2916                 break;
2917         case read_auto:
2918                 if (mddev->pers) {
2919                         if (mddev->ro == 0)
2920                                 err = do_md_stop(mddev, 1, 0);
2921                         else if (mddev->ro == 1)
2922                                 err = restart_array(mddev);
2923                         if (err == 0) {
2924                                 mddev->ro = 2;
2925                                 set_disk_ro(mddev->gendisk, 0);
2926                         }
2927                 } else {
2928                         mddev->ro = 2;
2929                         err = do_md_run(mddev);
2930                 }
2931                 break;
2932         case clean:
2933                 if (mddev->pers) {
2934                         restart_array(mddev);
2935                         spin_lock_irq(&mddev->write_lock);
2936                         if (atomic_read(&mddev->writes_pending) == 0) {
2937                                 if (mddev->in_sync == 0) {
2938                                         mddev->in_sync = 1;
2939                                         if (mddev->safemode == 1)
2940                                                 mddev->safemode = 0;
2941                                         if (mddev->persistent)
2942                                                 set_bit(MD_CHANGE_CLEAN,
2943                                                         &mddev->flags);
2944                                 }
2945                                 err = 0;
2946                         } else
2947                                 err = -EBUSY;
2948                         spin_unlock_irq(&mddev->write_lock);
2949                 } else {
2950                         mddev->ro = 0;
2951                         mddev->recovery_cp = MaxSector;
2952                         err = do_md_run(mddev);
2953                 }
2954                 break;
2955         case active:
2956                 if (mddev->pers) {
2957                         restart_array(mddev);
2958                         if (mddev->external)
2959                                 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2960                         wake_up(&mddev->sb_wait);
2961                         err = 0;
2962                 } else {
2963                         mddev->ro = 0;
2964                         set_disk_ro(mddev->gendisk, 0);
2965                         err = do_md_run(mddev);
2966                 }
2967                 break;
2968         case write_pending:
2969         case active_idle:
2970                 /* these cannot be set */
2971                 break;
2972         }
2973         if (err)
2974                 return err;
2975         else {
2976                 sysfs_notify_dirent(mddev->sysfs_state);
2977                 return len;
2978         }
2979 }
2980 static struct md_sysfs_entry md_array_state =
2981 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2982
2983 static ssize_t
2984 null_show(mddev_t *mddev, char *page)
2985 {
2986         return -EINVAL;
2987 }
2988
2989 static ssize_t
2990 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2991 {
2992         /* buf must be %d:%d\n? giving major and minor numbers */
2993         /* The new device is added to the array.
2994          * If the array has a persistent superblock, we read the
2995          * superblock to initialise info and check validity.
2996          * Otherwise, only checking done is that in bind_rdev_to_array,
2997          * which mainly checks size.
2998          */
2999         char *e;
3000         int major = simple_strtoul(buf, &e, 10);
3001         int minor;
3002         dev_t dev;
3003         mdk_rdev_t *rdev;
3004         int err;
3005
3006         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3007                 return -EINVAL;
3008         minor = simple_strtoul(e+1, &e, 10);
3009         if (*e && *e != '\n')
3010                 return -EINVAL;
3011         dev = MKDEV(major, minor);
3012         if (major != MAJOR(dev) ||
3013             minor != MINOR(dev))
3014                 return -EOVERFLOW;
3015
3016
3017         if (mddev->persistent) {
3018                 rdev = md_import_device(dev, mddev->major_version,
3019                                         mddev->minor_version);
3020                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3021                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3022                                                        mdk_rdev_t, same_set);
3023                         err = super_types[mddev->major_version]
3024                                 .load_super(rdev, rdev0, mddev->minor_version);
3025                         if (err < 0)
3026                                 goto out;
3027                 }
3028         } else if (mddev->external)
3029                 rdev = md_import_device(dev, -2, -1);
3030         else
3031                 rdev = md_import_device(dev, -1, -1);
3032
3033         if (IS_ERR(rdev))
3034                 return PTR_ERR(rdev);
3035         err = bind_rdev_to_array(rdev, mddev);
3036  out:
3037         if (err)
3038                 export_rdev(rdev);
3039         return err ? err : len;
3040 }
3041
3042 static struct md_sysfs_entry md_new_device =
3043 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3044
3045 static ssize_t
3046 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3047 {
3048         char *end;
3049         unsigned long chunk, end_chunk;
3050
3051         if (!mddev->bitmap)
3052                 goto out;
3053         /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3054         while (*buf) {
3055                 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3056                 if (buf == end) break;
3057                 if (*end == '-') { /* range */
3058                         buf = end + 1;
3059                         end_chunk = simple_strtoul(buf, &end, 0);
3060                         if (buf == end) break;
3061                 }
3062                 if (*end && !isspace(*end)) break;
3063                 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3064                 buf = end;
3065                 while (isspace(*buf)) buf++;
3066         }
3067         bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3068 out:
3069         return len;
3070 }
3071
3072 static struct md_sysfs_entry md_bitmap =
3073 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3074
3075 static ssize_t
3076 size_show(mddev_t *mddev, char *page)
3077 {
3078         return sprintf(page, "%llu\n",
3079                 (unsigned long long)mddev->dev_sectors / 2);
3080 }
3081
3082 static int update_size(mddev_t *mddev, sector_t num_sectors);
3083
3084 static ssize_t
3085 size_store(mddev_t *mddev, const char *buf, size_t len)
3086 {
3087         /* If array is inactive, we can reduce the component size, but
3088          * not increase it (except from 0).
3089          * If array is active, we can try an on-line resize
3090          */
3091         unsigned long long sectors;
3092         int err = strict_strtoull(buf, 10, &sectors);
3093
3094         if (err < 0)
3095                 return err;
3096         sectors *= 2;
3097         if (mddev->pers) {
3098                 err = update_size(mddev, sectors);
3099                 md_update_sb(mddev, 1);
3100         } else {
3101                 if (mddev->dev_sectors == 0 ||
3102                     mddev->dev_sectors > sectors)
3103                         mddev->dev_sectors = sectors;
3104                 else
3105                         err = -ENOSPC;
3106         }
3107         return err ? err : len;
3108 }
3109
3110 static struct md_sysfs_entry md_size =
3111 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3112
3113
3114 /* Metdata version.
3115  * This is one of
3116  *   'none' for arrays with no metadata (good luck...)
3117  *   'external' for arrays with externally managed metadata,
3118  * or N.M for internally known formats
3119  */
3120 static ssize_t
3121 metadata_show(mddev_t *mddev, char *page)
3122 {
3123         if (mddev->persistent)
3124                 return sprintf(page, "%d.%d\n",
3125                                mddev->major_version, mddev->minor_version);
3126         else if (mddev->external)
3127                 return sprintf(page, "external:%s\n", mddev->metadata_type);
3128         else
3129                 return sprintf(page, "none\n");
3130 }
3131
3132 static ssize_t
3133 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3134 {
3135         int major, minor;
3136         char *e;
3137         /* Changing the details of 'external' metadata is
3138          * always permitted.  Otherwise there must be
3139          * no devices attached to the array.
3140          */
3141         if (mddev->external && strncmp(buf, "external:", 9) == 0)
3142                 ;
3143         else if (!list_empty(&mddev->disks))
3144                 return -EBUSY;
3145
3146         if (cmd_match(buf, "none")) {
3147                 mddev->persistent = 0;
3148                 mddev->external = 0;
3149                 mddev->major_version = 0;
3150                 mddev->minor_version = 90;
3151                 return len;
3152         }
3153         if (strncmp(buf, "external:", 9) == 0) {
3154                 size_t namelen = len-9;
3155                 if (namelen >= sizeof(mddev->metadata_type))
3156                         namelen = sizeof(mddev->metadata_type)-1;
3157                 strncpy(mddev->metadata_type, buf+9, namelen);
3158                 mddev->metadata_type[namelen] = 0;
3159                 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3160                         mddev->metadata_type[--namelen] = 0;
3161                 mddev->persistent = 0;
3162                 mddev->external = 1;
3163                 mddev->major_version = 0;
3164                 mddev->minor_version = 90;
3165                 return len;
3166         }
3167         major = simple_strtoul(buf, &e, 10);
3168         if (e==buf || *e != '.')
3169                 return -EINVAL;
3170         buf = e+1;
3171         minor = simple_strtoul(buf, &e, 10);
3172         if (e==buf || (*e && *e != '\n') )
3173                 return -EINVAL;
3174         if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3175                 return -ENOENT;
3176         mddev->major_version = major;
3177         mddev->minor_version = minor;
3178         mddev->persistent = 1;
3179         mddev->external = 0;
3180         return len;
3181 }
3182
3183 static struct md_sysfs_entry md_metadata =
3184 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3185
3186 static ssize_t
3187 action_show(mddev_t *mddev, char *page)
3188 {
3189         char *type = "idle";
3190         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3191             (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3192                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3193                         type = "reshape";
3194                 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3195                         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3196                                 type = "resync";
3197                         else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3198                                 type = "check";
3199                         else
3200                                 type = "repair";
3201                 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3202                         type = "recover";
3203         }
3204         return sprintf(page, "%s\n", type);
3205 }
3206
3207 static ssize_t
3208 action_store(mddev_t *mddev, const char *page, size_t len)
3209 {
3210         if (!mddev->pers || !mddev->pers->sync_request)
3211                 return -EINVAL;
3212
3213         if (cmd_match(page, "idle")) {
3214                 if (mddev->sync_thread) {
3215                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3216                         md_unregister_thread(mddev->sync_thread);
3217                         mddev->sync_thread = NULL;
3218                         mddev->recovery = 0;
3219                 }
3220         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3221                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3222                 return -EBUSY;
3223         else if (cmd_match(page, "resync"))
3224                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3225         else if (cmd_match(page, "recover")) {
3226                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3227                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3228         } else if (cmd_match(page, "reshape")) {
3229                 int err;
3230                 if (mddev->pers->start_reshape == NULL)
3231                         return -EINVAL;
3232                 err = mddev->pers->start_reshape(mddev);
3233                 if (err)
3234                         return err;
3235                 sysfs_notify(&mddev->kobj, NULL, "degraded");
3236         } else {
3237                 if (cmd_match(page, "check"))
3238                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3239                 else if (!cmd_match(page, "repair"))
3240                         return -EINVAL;
3241                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3242                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3243         }
3244         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3245         md_wakeup_thread(mddev->thread);
3246         sysfs_notify_dirent(mddev->sysfs_action);
3247         return len;
3248 }
3249
3250 static ssize_t
3251 mismatch_cnt_show(mddev_t *mddev, char *page)
3252 {
3253         return sprintf(page, "%llu\n",
3254                        (unsigned long long) mddev->resync_mismatches);
3255 }
3256
3257 static struct md_sysfs_entry md_scan_mode =
3258 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3259
3260
3261 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3262
3263 static ssize_t
3264 sync_min_show(mddev_t *mddev, char *page)
3265 {
3266         return sprintf(page, "%d (%s)\n", speed_min(mddev),
3267                        mddev->sync_speed_min ? "local": "system");
3268 }
3269
3270 static ssize_t
3271 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3272 {
3273         int min;
3274         char *e;
3275         if (strncmp(buf, "system", 6)==0) {
3276                 mddev->sync_speed_min = 0;
3277                 return len;
3278         }
3279         min = simple_strtoul(buf, &e, 10);
3280         if (buf == e || (*e && *e != '\n') || min <= 0)
3281                 return -EINVAL;
3282         mddev->sync_speed_min = min;
3283         return len;
3284 }
3285
3286 static struct md_sysfs_entry md_sync_min =
3287 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3288
3289 static ssize_t
3290 sync_max_show(mddev_t *mddev, char *page)
3291 {
3292         return sprintf(page, "%d (%s)\n", speed_max(mddev),
3293                        mddev->sync_speed_max ? "local": "system");
3294 }
3295
3296 static ssize_t
3297 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3298 {
3299         int max;
3300         char *e;
3301         if (strncmp(buf, "system", 6)==0) {
3302                 mddev->sync_speed_max = 0;
3303                 return len;
3304         }
3305         max = simple_strtoul(buf, &e, 10);
3306         if (buf == e || (*e && *e != '\n') || max <= 0)
3307                 return -EINVAL;
3308         mddev->sync_speed_max = max;
3309         return len;
3310 }
3311
3312 static struct md_sysfs_entry md_sync_max =
3313 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
3314
3315 static ssize_t
3316 degraded_show(mddev_t *mddev, char *page)
3317 {
3318         return sprintf(page, "%d\n", mddev->degraded);
3319 }
3320 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3321
3322 static ssize_t
3323 sync_force_parallel_show(mddev_t *mddev, char *page)
3324 {
3325         return sprintf(page, "%d\n", mddev->parallel_resync);
3326 }
3327
3328 static ssize_t
3329 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3330 {
3331         long n;
3332
3333         if (strict_strtol(buf, 10, &n))
3334                 return -EINVAL;
3335
3336         if (n != 0 && n != 1)
3337                 return -EINVAL;
3338
3339         mddev->parallel_resync = n;
3340
3341         if (mddev->sync_thread)
3342                 wake_up(&resync_wait);
3343
3344         return len;
3345 }
3346
3347 /* force parallel resync, even with shared block devices */
3348 static struct md_sysfs_entry md_sync_force_parallel =
3349 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3350        sync_force_parallel_show, sync_force_parallel_store);
3351
3352 static ssize_t
3353 sync_speed_show(mddev_t *mddev, char *page)
3354 {
3355         unsigned long resync, dt, db;
3356         resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3357         dt = (jiffies - mddev->resync_mark) / HZ;
3358         if (!dt) dt++;
3359         db = resync - mddev->resync_mark_cnt;
3360         return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3361 }
3362
3363 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3364
3365 static ssize_t
3366 sync_completed_show(mddev_t *mddev, char *page)
3367 {
3368         unsigned long max_sectors, resync;
3369
3370         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3371                 max_sectors = mddev->resync_max_sectors;
3372         else
3373                 max_sectors = mddev->dev_sectors;
3374
3375         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
3376         return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3377 }
3378
3379 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3380
3381 static ssize_t
3382 min_sync_show(mddev_t *mddev, char *page)
3383 {
3384         return sprintf(page, "%llu\n",
3385                        (unsigned long long)mddev->resync_min);
3386 }
3387 static ssize_t
3388 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3389 {
3390         unsigned long long min;
3391         if (strict_strtoull(buf, 10, &min))
3392                 return -EINVAL;
3393         if (min > mddev->resync_max)
3394                 return -EINVAL;
3395         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3396                 return -EBUSY;
3397
3398         /* Must be a multiple of chunk_size */
3399         if (mddev->chunk_size) {
3400                 if (min & (sector_t)((mddev->chunk_size>>9)-1))
3401                         return -EINVAL;
3402         }
3403         mddev->resync_min = min;
3404
3405         return len;
3406 }
3407
3408 static struct md_sysfs_entry md_min_sync =
3409 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3410
3411 static ssize_t
3412 max_sync_show(mddev_t *mddev, char *page)
3413 {
3414         if (mddev->resync_max == MaxSector)
3415                 return sprintf(page, "max\n");
3416         else
3417                 return sprintf(page, "%llu\n",
3418                                (unsigned long long)mddev->resync_max);
3419 }
3420 static ssize_t
3421 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3422 {
3423         if (strncmp(buf, "max", 3) == 0)
3424                 mddev->resync_max = MaxSector;
3425         else {
3426                 unsigned long long max;
3427                 if (strict_strtoull(buf, 10, &max))
3428                         return -EINVAL;
3429                 if (max < mddev->resync_min)
3430                         return -EINVAL;
3431                 if (max < mddev->resync_max &&
3432                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3433                         return -EBUSY;
3434
3435                 /* Must be a multiple of chunk_size */
3436                 if (mddev->chunk_size) {
3437                         if (max & (sector_t)((mddev->chunk_size>>9)-1))
3438                                 return -EINVAL;
3439                 }
3440                 mddev->resync_max = max;
3441         }
3442         wake_up(&mddev->recovery_wait);
3443         return len;
3444 }
3445
3446 static struct md_sysfs_entry md_max_sync =
3447 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3448
3449 static ssize_t
3450 suspend_lo_show(mddev_t *mddev, char *page)
3451 {
3452         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3453 }
3454
3455 static ssize_t
3456 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3457 {
3458         char *e;
3459         unsigned long long new = simple_strtoull(buf, &e, 10);
3460
3461         if (mddev->pers->quiesce == NULL)
3462                 return -EINVAL;
3463         if (buf == e || (*e && *e != '\n'))
3464                 return -EINVAL;
3465         if (new >= mddev->suspend_hi ||
3466             (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3467                 mddev->suspend_lo = new;
3468                 mddev->pers->quiesce(mddev, 2);
3469                 return len;
3470         } else
3471                 return -EINVAL;
3472 }
3473 static struct md_sysfs_entry md_suspend_lo =
3474 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3475
3476
3477 static ssize_t
3478 suspend_hi_show(mddev_t *mddev, char *page)
3479 {
3480         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3481 }
3482
3483 static ssize_t
3484 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3485 {
3486         char *e;
3487         unsigned long long new = simple_strtoull(buf, &e, 10);
3488
3489         if (mddev->pers->quiesce == NULL)
3490                 return -EINVAL;
3491         if (buf == e || (*e && *e != '\n'))
3492                 return -EINVAL;
3493         if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3494             (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3495                 mddev->suspend_hi = new;
3496                 mddev->pers->quiesce(mddev, 1);
3497                 mddev->pers->quiesce(mddev, 0);
3498                 return len;
3499         } else
3500                 return -EINVAL;
3501 }
3502 static struct md_sysfs_entry md_suspend_hi =
3503 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3504
3505 static ssize_t
3506 reshape_position_show(mddev_t *mddev, char *page)
3507 {
3508         if (mddev->reshape_position != MaxSector)
3509                 return sprintf(page, "%llu\n",
3510                                (unsigned long long)mddev->reshape_position);
3511         strcpy(page, "none\n");
3512         return 5;
3513 }
3514
3515 static ssize_t
3516 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3517 {
3518         char *e;
3519         unsigned long long new = simple_strtoull(buf, &e, 10);
3520         if (mddev->pers)
3521                 return -EBUSY;
3522         if (buf == e || (*e && *e != '\n'))
3523                 return -EINVAL;
3524         mddev->reshape_position = new;
3525         mddev->delta_disks = 0;
3526         mddev->new_level = mddev->level;
3527         mddev->new_layout = mddev->layout;
3528         mddev->new_chunk = mddev->chunk_size;
3529         return len;
3530 }
3531
3532 static struct md_sysfs_entry md_reshape_position =
3533 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3534        reshape_position_store);
3535
3536
3537 static struct attribute *md_default_attrs[] = {
3538         &md_level.attr,
3539         &md_layout.attr,
3540         &md_raid_disks.attr,
3541         &md_chunk_size.attr,
3542         &md_size.attr,
3543         &md_resync_start.attr,
3544         &md_metadata.attr,
3545         &md_new_device.attr,
3546         &md_safe_delay.attr,
3547         &md_array_state.attr,
3548         &md_reshape_position.attr,
3549         NULL,
3550 };
3551
3552 static struct attribute *md_redundancy_attrs[] = {
3553         &md_scan_mode.attr,
3554         &md_mismatches.attr,
3555         &md_sync_min.attr,
3556         &md_sync_max.attr,
3557         &md_sync_speed.attr,
3558         &md_sync_force_parallel.attr,
3559         &md_sync_completed.attr,
3560         &md_min_sync.attr,
3561         &md_max_sync.attr,
3562         &md_suspend_lo.attr,
3563         &md_suspend_hi.attr,
3564         &md_bitmap.attr,
3565         &md_degraded.attr,
3566         NULL,
3567 };
3568 static struct attribute_group md_redundancy_group = {
3569         .name = NULL,
3570         .attrs = md_redundancy_attrs,
3571 };
3572
3573
3574 static ssize_t
3575 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3576 {
3577         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3578         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3579         ssize_t rv;
3580
3581         if (!entry->show)
3582                 return -EIO;
3583         rv = mddev_lock(mddev);
3584         if (!rv) {
3585                 rv = entry->show(mddev, page);
3586                 mddev_unlock(mddev);
3587         }
3588         return rv;
3589 }
3590
3591 static ssize_t
3592 md_attr_store(struct kobject *kobj, struct attribute *attr,
3593               const char *page, size_t length)
3594 {
3595         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3596         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3597         ssize_t rv;
3598
3599         if (!entry->store)
3600                 return -EIO;
3601         if (!capable(CAP_SYS_ADMIN))
3602                 return -EACCES;
3603         rv = mddev_lock(mddev);
3604         if (mddev->hold_active == UNTIL_IOCTL)
3605                 mddev->hold_active = 0;
3606         if (!rv) {
3607                 rv = entry->store(mddev, page, length);
3608                 mddev_unlock(mddev);
3609         }
3610         return rv;
3611 }
3612
3613 static void md_free(struct kobject *ko)
3614 {
3615         mddev_t *mddev = container_of(ko, mddev_t, kobj);
3616
3617         if (mddev->sysfs_state)
3618                 sysfs_put(mddev->sysfs_state);
3619
3620         if (mddev->gendisk) {
3621                 del_gendisk(mddev->gendisk);
3622                 put_disk(mddev->gendisk);
3623         }
3624         if (mddev->queue)
3625                 blk_cleanup_queue(mddev->queue);
3626
3627         kfree(mddev);
3628 }
3629
3630 static struct sysfs_ops md_sysfs_ops = {
3631         .show   = md_attr_show,
3632         .store  = md_attr_store,
3633 };
3634 static struct kobj_type md_ktype = {
3635         .release        = md_free,
3636         .sysfs_ops      = &md_sysfs_ops,
3637         .default_attrs  = md_default_attrs,
3638 };
3639
3640 int mdp_major = 0;
3641
3642 static int md_alloc(dev_t dev, char *name)
3643 {
3644         static DEFINE_MUTEX(disks_mutex);
3645         mddev_t *mddev = mddev_find(dev);
3646         struct gendisk *disk;
3647         int partitioned;
3648         int shift;
3649         int unit;
3650         int error;
3651
3652         if (!mddev)
3653                 return -ENODEV;
3654
3655         partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3656         shift = partitioned ? MdpMinorShift : 0;
3657         unit = MINOR(mddev->unit) >> shift;
3658
3659         /* wait for any previous instance if this device
3660          * to be completed removed (mddev_delayed_delete).
3661          */
3662         flush_scheduled_work();
3663
3664         mutex_lock(&disks_mutex);
3665         if (mddev->gendisk) {
3666                 mutex_unlock(&disks_mutex);
3667                 mddev_put(mddev);
3668                 return -EEXIST;
3669         }
3670
3671         if (name) {
3672                 /* Need to ensure that 'name' is not a duplicate.
3673                  */
3674                 mddev_t *mddev2;
3675                 spin_lock(&all_mddevs_lock);
3676
3677                 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3678                         if (mddev2->gendisk &&
3679                             strcmp(mddev2->gendisk->disk_name, name) == 0) {
3680                                 spin_unlock(&all_mddevs_lock);
3681                                 return -EEXIST;
3682                         }
3683                 spin_unlock(&all_mddevs_lock);
3684         }
3685
3686         mddev->queue = blk_alloc_queue(GFP_KERNEL);
3687         if (!mddev->queue) {
3688                 mutex_unlock(&disks_mutex);
3689                 mddev_put(mddev);
3690                 return -ENOMEM;
3691         }
3692         mddev->queue->queuedata = mddev;
3693
3694         /* Can be unlocked because the queue is new: no concurrency */
3695         queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3696
3697         blk_queue_make_request(mddev->queue, md_make_request);
3698
3699         disk = alloc_disk(1 << shift);
3700         if (!disk) {
3701                 mutex_unlock(&disks_mutex);
3702                 blk_cleanup_queue(mddev->queue);
3703                 mddev->queue = NULL;
3704                 mddev_put(mddev);
3705                 return -ENOMEM;
3706         }
3707         disk->major = MAJOR(mddev->unit);
3708         disk->first_minor = unit << shift;
3709         if (name)
3710                 strcpy(disk->disk_name, name);
3711         else if (partitioned)
3712                 sprintf(disk->disk_name, "md_d%d", unit);
3713         else
3714                 sprintf(disk->disk_name, "md%d", unit);
3715         disk->fops = &md_fops;
3716         disk->private_data = mddev;
3717         disk->queue = mddev->queue;
3718         /* Allow extended partitions.  This makes the
3719          * 'mdp' device redundant, but we can't really
3720          * remove it now.
3721          */
3722         disk->flags |= GENHD_FL_EXT_DEVT;
3723         add_disk(disk);
3724         mddev->gendisk = disk;
3725         error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3726                                      &disk_to_dev(disk)->kobj, "%s", "md");
3727         mutex_unlock(&disks_mutex);
3728         if (error)
3729                 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3730                        disk->disk_name);
3731         else {
3732                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3733                 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3734         }
3735         mddev_put(mddev);
3736         return 0;
3737 }
3738
3739 static struct kobject *md_probe(dev_t dev, int *part, void *data)
3740 {
3741         md_alloc(dev, NULL);
3742         return NULL;
3743 }
3744
3745 static int add_named_array(const char *val, struct kernel_param *kp)
3746 {
3747         /* val must be "md_*" where * is not all digits.
3748          * We allocate an array with a large free minor number, and
3749          * set the name to val.  val must not already be an active name.
3750          */
3751         int len = strlen(val);
3752         char buf[DISK_NAME_LEN];
3753
3754         while (len && val[len-1] == '\n')
3755                 len--;
3756         if (len >= DISK_NAME_LEN)
3757                 return -E2BIG;
3758         strlcpy(buf, val, len+1);
3759         if (strncmp(buf, "md_", 3) != 0)
3760                 return -EINVAL;
3761         return md_alloc(0, buf);
3762 }
3763
3764 static void md_safemode_timeout(unsigned long data)
3765 {
3766         mddev_t *mddev = (mddev_t *) data;
3767
3768         if (!atomic_read(&mddev->writes_pending)) {
3769                 mddev->safemode = 1;
3770                 if (mddev->external)
3771                         sysfs_notify_dirent(mddev->sysfs_state);
3772         }
3773         md_wakeup_thread(mddev->thread);
3774 }
3775
3776 static int start_dirty_degraded;
3777
3778 static int do_md_run(mddev_t * mddev)
3779 {
3780         int err;
3781         int chunk_size;
3782         mdk_rdev_t *rdev;
3783         struct gendisk *disk;
3784         struct mdk_personality *pers;
3785         char b[BDEVNAME_SIZE];
3786
3787         if (list_empty(&mddev->disks))
3788                 /* cannot run an array with no devices.. */
3789                 return -EINVAL;
3790
3791         if (mddev->pers)
3792                 return -EBUSY;
3793
3794         /*
3795          * Analyze all RAID superblock(s)
3796          */
3797         if (!mddev->raid_disks) {
3798                 if (!mddev->persistent)
3799                         return -EINVAL;
3800                 analyze_sbs(mddev);
3801         }
3802
3803         chunk_size = mddev->chunk_size;
3804
3805         if (chunk_size) {
3806                 if (chunk_size > MAX_CHUNK_SIZE) {
3807                         printk(KERN_ERR "too big chunk_size: %d > %d\n",
3808                                 chunk_size, MAX_CHUNK_SIZE);
3809                         return -EINVAL;
3810                 }
3811                 /*
3812                  * chunk-size has to be a power of 2
3813                  */
3814                 if ( (1 << ffz(~chunk_size)) != chunk_size) {
3815                         printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
3816                         return -EINVAL;
3817                 }
3818
3819                 /* devices must have minimum size of one chunk */
3820                 list_for_each_entry(rdev, &mddev->disks, same_set) {
3821                         if (test_bit(Faulty, &rdev->flags))
3822                                 continue;
3823                         if (rdev->sectors < chunk_size / 512) {
3824                                 printk(KERN_WARNING
3825                                         "md: Dev %s smaller than chunk_size:"
3826                                         " %llu < %d\n",
3827                                         bdevname(rdev->bdev,b),
3828                                         (unsigned long long)rdev->sectors,
3829                                         chunk_size / 512);
3830                                 return -EINVAL;
3831                         }
3832                 }
3833         }
3834
3835         if (mddev->level != LEVEL_NONE)
3836                 request_module("md-level-%d", mddev->level);
3837         else if (mddev->clevel[0])
3838                 request_module("md-%s", mddev->clevel);
3839
3840         /*
3841          * Drop all container device buffers, from now on
3842          * the only valid external interface is through the md
3843          * device.
3844          */
3845         list_for_each_entry(rdev, &mddev->disks, same_set) {
3846                 if (test_bit(Faulty, &rdev->flags))
3847                         continue;
3848                 sync_blockdev(rdev->bdev);
3849                 invalidate_bdev(rdev->bdev);
3850
3851                 /* perform some consistency tests on the device.
3852                  * We don't want the data to overlap the metadata,
3853                  * Internal Bitmap issues have been handled elsewhere.
3854                  */
3855                 if (rdev->data_offset < rdev->sb_start) {
3856                         if (mddev->dev_sectors &&
3857                             rdev->data_offset + mddev->dev_sectors
3858                             > rdev->sb_start) {
3859                                 printk("md: %s: data overlaps metadata\n",
3860                                        mdname(mddev));
3861                                 return -EINVAL;
3862                         }
3863                 } else {
3864                         if (rdev->sb_start + rdev->sb_size/512
3865                             > rdev->data_offset) {
3866                                 printk("md: %s: metadata overlaps data\n",
3867                                        mdname(mddev));
3868                                 return -EINVAL;
3869                         }
3870                 }
3871                 sysfs_notify_dirent(rdev->sysfs_state);
3872         }
3873
3874         md_probe(mddev->unit, NULL, NULL);
3875         disk = mddev->gendisk;
3876         if (!disk)
3877                 return -ENOMEM;
3878
3879         spin_lock(&pers_lock);
3880         pers = find_pers(mddev->level, mddev->clevel);
3881         if (!pers || !try_module_get(pers->owner)) {
3882                 spin_unlock(&pers_lock);
3883                 if (mddev->level != LEVEL_NONE)
3884                         printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3885                                mddev->level);
3886                 else
3887                         printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3888                                mddev->clevel);
3889                 return -EINVAL;
3890         }
3891         mddev->pers = pers;
3892         spin_unlock(&pers_lock);
3893         if (mddev->level != pers->level) {
3894                 mddev->level = pers->level;
3895                 mddev->new_level = pers->level;
3896         }
3897         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3898
3899         if (pers->level >= 4 && pers->level <= 6)
3900                 /* Cannot support integrity (yet) */
3901                 blk_integrity_unregister(mddev->gendisk);
3902
3903         if (mddev->reshape_position != MaxSector &&
3904             pers->start_reshape == NULL) {
3905                 /* This personality cannot handle reshaping... */
3906                 mddev->pers = NULL;
3907                 module_put(pers->owner);
3908                 return -EINVAL;
3909         }
3910
3911         if (pers->sync_request) {
3912                 /* Warn if this is a potentially silly
3913                  * configuration.
3914                  */
3915                 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3916                 mdk_rdev_t *rdev2;
3917                 int warned = 0;
3918
3919                 list_for_each_entry(rdev, &mddev->disks, same_set)
3920                         list_for_each_entry(rdev2, &mddev->disks, same_set) {
3921                                 if (rdev < rdev2 &&
3922                                     rdev->bdev->bd_contains ==
3923                                     rdev2->bdev->bd_contains) {
3924                                         printk(KERN_WARNING
3925                                                "%s: WARNING: %s appears to be"
3926                                                " on the same physical disk as"
3927                                                " %s.\n",
3928                                                mdname(mddev),
3929                                                bdevname(rdev->bdev,b),
3930                                                bdevname(rdev2->bdev,b2));
3931                                         warned = 1;
3932                                 }
3933                         }
3934
3935                 if (warned)
3936                         printk(KERN_WARNING
3937                                "True protection against single-disk"
3938                                " failure might be compromised.\n");
3939         }
3940
3941         mddev->recovery = 0;
3942         /* may be over-ridden by personality */
3943         mddev->resync_max_sectors = mddev->dev_sectors;
3944
3945         mddev->barriers_work = 1;
3946         mddev->ok_start_degraded = start_dirty_degraded;
3947
3948         if (start_readonly)
3949                 mddev->ro = 2; /* read-only, but switch on first write */
3950
3951         err = mddev->pers->run(mddev);
3952         if (err)
3953                 printk(KERN_ERR "md: pers->run() failed ...\n");
3954         else if (mddev->pers->sync_request) {
3955                 err = bitmap_create(mddev);
3956                 if (err) {
3957                         printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3958                                mdname(mddev), err);
3959                         mddev->pers->stop(mddev);
3960                 }
3961         }
3962         if (err) {
3963                 module_put(mddev->pers->owner);
3964                 mddev->pers = NULL;
3965                 bitmap_destroy(mddev);
3966                 return err;
3967         }
3968         if (mddev->pers->sync_request) {
3969                 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3970                         printk(KERN_WARNING
3971                                "md: cannot register extra attributes for %s\n",
3972                                mdname(mddev));
3973                 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3974         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
3975                 mddev->ro = 0;
3976
3977         atomic_set(&mddev->writes_pending,0);
3978         mddev->safemode = 0;
3979         mddev->safemode_timer.function = md_safemode_timeout;
3980         mddev->safemode_timer.data = (unsigned long) mddev;
3981         mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3982         mddev->in_sync = 1;
3983
3984         list_for_each_entry(rdev, &mddev->disks, same_set)
3985                 if (rdev->raid_disk >= 0) {
3986                         char nm[20];
3987                         sprintf(nm, "rd%d", rdev->raid_disk);
3988                         if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
3989                                 printk("md: cannot register %s for %s\n",
3990                                        nm, mdname(mddev));
3991                 }
3992         
3993         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3994         
3995         if (mddev->flags)
3996                 md_update_sb(mddev, 0);
3997
3998         set_capacity(disk, mddev->array_sectors);
3999
4000         /* If there is a partially-recovered drive we need to
4001          * start recovery here.  If we leave it to md_check_recovery,
4002          * it will remove the drives and not do the right thing
4003          */
4004         if (mddev->degraded && !mddev->sync_thread) {
4005                 int spares = 0;
4006                 list_for_each_entry(rdev, &mddev->disks, same_set)
4007                         if (rdev->raid_disk >= 0 &&
4008                             !test_bit(In_sync, &rdev->flags) &&
4009                             !test_bit(Faulty, &rdev->flags))
4010                                 /* complete an interrupted recovery */
4011                                 spares++;
4012                 if (spares && mddev->pers->sync_request) {
4013                         mddev->recovery = 0;
4014                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4015                         mddev->sync_thread = md_register_thread(md_do_sync,
4016                                                                 mddev,
4017                                                                 "%s_resync");
4018                         if (!mddev->sync_thread) {
4019                                 printk(KERN_ERR "%s: could not start resync"
4020                                        " thread...\n",
4021                                        mdname(mddev));
4022                                 /* leave the spares where they are, it shouldn't hurt */
4023                                 mddev->recovery = 0;
4024                         }
4025                 }
4026         }
4027         md_wakeup_thread(mddev->thread);
4028         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4029
4030         mddev->changed = 1;
4031         md_new_event(mddev);
4032         sysfs_notify_dirent(mddev->sysfs_state);
4033         if (mddev->sysfs_action)
4034                 sysfs_notify_dirent(mddev->sysfs_action);
4035         sysfs_notify(&mddev->kobj, NULL, "degraded");
4036         kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4037         return 0;
4038 }
4039
4040 static int restart_array(mddev_t *mddev)
4041 {
4042         struct gendisk *disk = mddev->gendisk;
4043
4044         /* Complain if it has no devices */
4045         if (list_empty(&mddev->disks))
4046                 return -ENXIO;
4047         if (!mddev->pers)
4048                 return -EINVAL;
4049         if (!mddev->ro)
4050                 return -EBUSY;
4051         mddev->safemode = 0;
4052         mddev->ro = 0;
4053         set_disk_ro(disk, 0);
4054         printk(KERN_INFO "md: %s switched to read-write mode.\n",
4055                 mdname(mddev));
4056         /* Kick recovery or resync if necessary */
4057         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4058         md_wakeup_thread(mddev->thread);
4059         md_wakeup_thread(mddev->sync_thread);
4060         sysfs_notify_dirent(mddev->sysfs_state);
4061         return 0;
4062 }
4063
4064 /* similar to deny_write_access, but accounts for our holding a reference
4065  * to the file ourselves */
4066 static int deny_bitmap_write_access(struct file * file)
4067 {
4068         struct inode *inode = file->f_mapping->host;
4069
4070         spin_lock(&inode->i_lock);
4071         if (atomic_read(&inode->i_writecount) > 1) {
4072                 spin_unlock(&inode->i_lock);
4073                 return -ETXTBSY;
4074         }
4075         atomic_set(&inode->i_writecount, -1);
4076         spin_unlock(&inode->i_lock);
4077
4078         return 0;
4079 }
4080
4081 static void restore_bitmap_write_access(struct file *file)
4082 {
4083         struct inode *inode = file->f_mapping->host;
4084
4085         spin_lock(&inode->i_lock);
4086         atomic_set(&inode->i_writecount, 1);
4087         spin_unlock(&inode->i_lock);
4088 }
4089
/* mode:
 *   0 - completely stop and dis-assemble array
 *   1 - switch to readonly
 *   2 - stop but do not disassemble array
 *
 * is_open is the number of openers that is tolerated; if mddev->openers
 * exceeds it the array is considered in use and -EBUSY is returned.
 *
 * Returns 0 on success, -EBUSY as described above, or -ENXIO when
 * mode 1 is requested and mddev->ro is already 1.
 */
static int do_md_stop(mddev_t * mddev, int mode, int is_open)
{
        int err = 0;
        struct gendisk *disk = mddev->gendisk;

        if (atomic_read(&mddev->openers) > is_open) {
                printk("md: %s still in use.\n",mdname(mddev));
                return -EBUSY;
        }

        if (mddev->pers) {

                /* Interrupt and reap any running sync thread; FROZEN stays
                 * set until the end of this branch.
                 */
                if (mddev->sync_thread) {
                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        md_unregister_thread(mddev->sync_thread);
                        mddev->sync_thread = NULL;
                }

                del_timer_sync(&mddev->safemode_timer);

                switch(mode) {
                case 1: /* readonly */
                        err  = -ENXIO;
                        if (mddev->ro==1)
                                goto out;
                        mddev->ro = 1;
                        break;
                case 0: /* disassemble */
                case 2: /* stop */
                        bitmap_flush(mddev);
                        md_super_wait(mddev);
                        if (mddev->ro)
                                set_disk_ro(disk, 0);

                        /* Stop the personality and detach its queue hooks */
                        mddev->pers->stop(mddev);
                        mddev->queue->merge_bvec_fn = NULL;
                        mddev->queue->unplug_fn = NULL;
                        mddev->queue->backing_dev_info.congested_fn = NULL;
                        if (mddev->pers->sync_request) {
                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
                                if (mddev->sysfs_action)
                                        sysfs_put(mddev->sysfs_action);
                                mddev->sysfs_action = NULL;
                        }
                        module_put(mddev->pers->owner);
                        mddev->pers = NULL;
                        /* tell userspace to handle 'inactive' */
                        sysfs_notify_dirent(mddev->sysfs_state);

                        set_capacity(disk, 0);
                        mddev->changed = 1;

                        if (mddev->ro)
                                mddev->ro = 0;
                }
                if (!mddev->in_sync || mddev->flags) {
                        /* mark array as shutdown cleanly */
                        mddev->in_sync = 1;
                        md_update_sb(mddev, 1);
                }
                if (mode == 1)
                        set_disk_ro(disk, 1);
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
        }

        /*
         * Free resources if final stop
         */
        if (mode == 0) {
                mdk_rdev_t *rdev;

                printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));

                /* Drop the bitmap and, if one was in use, the bitmap file */
                bitmap_destroy(mddev);
                if (mddev->bitmap_file) {
                        restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
                        mddev->bitmap_file = NULL;
                }
                mddev->bitmap_offset = 0;

                /* Remove the "rd<N>" sysfs links of devices that hold a slot */
                list_for_each_entry(rdev, &mddev->disks, same_set)
                        if (rdev->raid_disk >= 0) {
                                char nm[20];
                                sprintf(nm, "rd%d", rdev->raid_disk);
                                sysfs_remove_link(&mddev->kobj, nm);
                        }

                /* make sure all md_delayed_delete calls have finished */
                flush_scheduled_work();

                export_array(mddev);

                /* Reset the per-array state fields */
                mddev->array_sectors = 0;
                mddev->dev_sectors = 0;
                mddev->raid_disks = 0;
                mddev->recovery_cp = 0;
                mddev->resync_min = 0;
                mddev->resync_max = MaxSector;
                mddev->reshape_position = MaxSector;
                mddev->external = 0;
                mddev->persistent = 0;
                mddev->level = LEVEL_NONE;
                mddev->clevel[0] = 0;
                mddev->flags = 0;
                mddev->ro = 0;
                mddev->metadata_type[0] = 0;
                mddev->chunk_size = 0;
                mddev->ctime = mddev->utime = 0;
                mddev->layout = 0;
                mddev->max_disks = 0;
                mddev->events = 0;
                mddev->delta_disks = 0;
                mddev->new_level = LEVEL_NONE;
                mddev->new_layout = 0;
                mddev->new_chunk = 0;
                mddev->curr_resync = 0;
                mddev->resync_mismatches = 0;
                mddev->suspend_lo = mddev->suspend_hi = 0;
                mddev->sync_speed_min = mddev->sync_speed_max = 0;
                mddev->recovery = 0;
                mddev->in_sync = 0;
                mddev->changed = 0;
                mddev->degraded = 0;
                mddev->barriers_work = 0;
                mddev->safemode = 0;
                kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
                if (mddev->hold_active == UNTIL_STOP)
                        mddev->hold_active = 0;

        } else if (mddev->pers)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
                        mdname(mddev));
        /* Reached for every mode that did not bail out above */
        err = 0;
        blk_integrity_unregister(disk);
        md_new_event(mddev);
        sysfs_notify_dirent(mddev->sysfs_state);
out:
        return err;
}
4236
4237 #ifndef MODULE
4238 static void autorun_array(mddev_t *mddev)
4239 {
4240         mdk_rdev_t *rdev;
4241         int err;
4242
4243         if (list_empty(&mddev->disks))
4244                 return;
4245
4246         printk(KERN_INFO "md: running: ");
4247
4248         list_for_each_entry(rdev, &mddev->disks, same_set) {
4249                 char b[BDEVNAME_SIZE];
4250                 printk("<%s>", bdevname(rdev->bdev,b));
4251         }
4252         printk("\n");
4253
4254         err = do_md_run(mddev);
4255         if (err) {
4256                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4257                 do_md_stop(mddev, 0, 0);
4258         }
4259 }
4260
4261 /*
4262  * lets try to run arrays based on all disks that have arrived
4263  * until now. (those are in pending_raid_disks)
4264  *
4265  * the method: pick the first pending disk, collect all disks with
4266  * the same UUID, remove all from the pending list and put them into
4267  * the 'same_array' list. Then order this list based on superblock
4268  * update time (freshest comes first), kick out 'old' disks and
4269  * compare superblocks. If everything's fine then run it.
4270  *
4271  * If "unit" is allocated, then bump its reference count
4272  */
4273 static void autorun_devices(int part)
4274 {
4275         mdk_rdev_t *rdev0, *rdev, *tmp;
4276         mddev_t *mddev;
4277         char b[BDEVNAME_SIZE];
4278
4279         printk(KERN_INFO "md: autorun ...\n");
4280         while (!list_empty(&pending_raid_disks)) {
4281                 int unit;
4282                 dev_t dev;
4283                 LIST_HEAD(candidates);
4284                 rdev0 = list_entry(pending_raid_disks.next,
4285                                          mdk_rdev_t, same_set);
4286
4287                 printk(KERN_INFO "md: considering %s ...\n",
4288                         bdevname(rdev0->bdev,b));
4289                 INIT_LIST_HEAD(&candidates);
4290                 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4291                         if (super_90_load(rdev, rdev0, 0) >= 0) {
4292                                 printk(KERN_INFO "md:  adding %s ...\n",
4293                                         bdevname(rdev->bdev,b));
4294                                 list_move(&rdev->same_set, &candidates);
4295                         }
4296                 /*
4297                  * now we have a set of devices, with all of them having
4298                  * mostly sane superblocks. It's time to allocate the
4299                  * mddev.
4300                  */
4301                 if (part) {
4302                         dev = MKDEV(mdp_major,
4303                                     rdev0->preferred_minor << MdpMinorShift);
4304                         unit = MINOR(dev) >> MdpMinorShift;
4305                 } else {
4306                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4307                         unit = MINOR(dev);
4308                 }
4309                 if (rdev0->preferred_minor != unit) {
4310                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4311                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4312                         break;
4313                 }
4314
4315                 md_probe(dev, NULL, NULL);
4316                 mddev = mddev_find(dev);
4317                 if (!mddev || !mddev->gendisk) {
4318                         if (mddev)
4319                                 mddev_put(mddev);
4320                         printk(KERN_ERR
4321                                 "md: cannot allocate memory for md drive.\n");
4322                         break;
4323                 }
4324                 if (mddev_lock(mddev)) 
4325                         printk(KERN_WARNING "md: %s locked, cannot run\n",
4326                                mdname(mddev));
4327                 else if (mddev->raid_disks || mddev->major_version
4328                          || !list_empty(&mddev->disks)) {
4329                         printk(KERN_WARNING 
4330                                 "md: %s already running, cannot run %s\n",
4331                                 mdname(mddev), bdevname(rdev0->bdev,b));
4332                         mddev_unlock(mddev);
4333                 } else {
4334                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
4335                         mddev->persistent = 1;
4336                         rdev_for_each_list(rdev, tmp, &candidates) {
4337                                 list_del_init(&rdev->same_set);
4338                                 if (bind_rdev_to_array(rdev, mddev))
4339                                         export_rdev(rdev);
4340                         }
4341                         autorun_array(mddev);
4342                         mddev_unlock(mddev);
4343                 }
4344                 /* on success, candidates will be empty, on error
4345                  * it won't...
4346                  */
4347                 rdev_for_each_list(rdev, tmp, &candidates) {
4348                         list_del_init(&rdev->same_set);
4349                         export_rdev(rdev);
4350                 }
4351                 mddev_put(mddev);
4352         }
4353         printk(KERN_INFO "md: ... autorun DONE.\n");
4354 }
4355 #endif /* !MODULE */
4356
4357 static int get_version(void __user * arg)
4358 {
4359         mdu_version_t ver;
4360
4361         ver.major = MD_MAJOR_VERSION;
4362         ver.minor = MD_MINOR_VERSION;
4363         ver.patchlevel = MD_PATCHLEVEL_VERSION;
4364
4365         if (copy_to_user(arg, &ver, sizeof(ver)))
4366                 return -EFAULT;
4367
4368         return 0;
4369 }
4370
4371 static int get_array_info(mddev_t * mddev, void __user * arg)
4372 {
4373         mdu_array_info_t info;
4374         int nr,working,active,failed,spare;
4375         mdk_rdev_t *rdev;
4376
4377         nr=working=active=failed=spare=0;
4378         list_for_each_entry(rdev, &mddev->disks, same_set) {
4379                 nr++;
4380                 if (test_bit(Faulty, &rdev->flags))
4381                         failed++;
4382                 else {
4383                         working++;
4384                         if (test_bit(In_sync, &rdev->flags))
4385                                 active++;       
4386                         else
4387                                 spare++;
4388                 }
4389         }
4390
4391         info.major_version = mddev->major_version;
4392         info.minor_version = mddev->minor_version;
4393         info.patch_version = MD_PATCHLEVEL_VERSION;
4394         info.ctime         = mddev->ctime;
4395         info.level         = mddev->level;
4396         info.size          = mddev->dev_sectors / 2;
4397         if (info.size != mddev->dev_sectors / 2) /* overflow */
4398                 info.size = -1;
4399         info.nr_disks      = nr;
4400         info.raid_disks    = mddev->raid_disks;
4401         info.md_minor      = mddev->md_minor;
4402         info.not_persistent= !mddev->persistent;
4403
4404         info.utime         = mddev->utime;
4405         info.state         = 0;
4406         if (mddev->in_sync)
4407                 info.state = (1<<MD_SB_CLEAN);
4408         if (mddev->bitmap && mddev->bitmap_offset)
4409                 info.state = (1<<MD_SB_BITMAP_PRESENT);
4410         info.active_disks  = active;
4411         info.working_disks = working;
4412         info.failed_disks  = failed;
4413         info.spare_disks   = spare;
4414
4415         info.layout        = mddev->layout;
4416         info.chunk_size    = mddev->chunk_size;
4417
4418         if (copy_to_user(arg, &info, sizeof(info)))
4419                 return -EFAULT;
4420
4421         return 0;
4422 }
4423
4424 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4425 {
4426         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4427         char *ptr, *buf = NULL;
4428         int err = -ENOMEM;
4429
4430         if (md_allow_write(mddev))
4431                 file = kmalloc(sizeof(*file), GFP_NOIO);
4432         else
4433                 file = kmalloc(sizeof(*file), GFP_KERNEL);
4434
4435         if (!file)
4436                 goto out;
4437
4438         /* bitmap disabled, zero the first byte and copy out */
4439         if (!mddev->bitmap || !mddev->bitmap->file) {
4440                 file->pathname[0] = '\0';
4441                 goto copy_out;
4442         }
4443
4444         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4445         if (!buf)
4446                 goto out;
4447
4448         ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4449         if (IS_ERR(ptr))
4450                 goto out;
4451
4452         strcpy(file->pathname, ptr);
4453
4454 copy_out:
4455         err = 0;
4456         if (copy_to_user(arg, file, sizeof(*file)))
4457                 err = -EFAULT;
4458 out:
4459         kfree(buf);
4460         kfree(file);
4461         return err;
4462 }
4463
4464 static int get_disk_info(mddev_t * mddev, void __user * arg)
4465 {
4466         mdu_disk_info_t info;
4467         mdk_rdev_t *rdev;
4468
4469         if (copy_from_user(&info, arg, sizeof(info)))
4470                 return -EFAULT;
4471
4472         rdev = find_rdev_nr(mddev, info.number);
4473         if (rdev) {
4474                 info.major = MAJOR(rdev->bdev->bd_dev);
4475                 info.minor = MINOR(rdev->bdev->bd_dev);
4476                 info.raid_disk = rdev->raid_disk;
4477                 info.state = 0;
4478                 if (test_bit(Faulty, &rdev->flags))
4479                         info.state |= (1<<MD_DISK_FAULTY);
4480                 else if (test_bit(In_sync, &rdev->flags)) {
4481                         info.state |= (1<<MD_DISK_ACTIVE);
4482                         info.state |= (1<<MD_DISK_SYNC);
4483                 }
4484                 if (test_bit(WriteMostly, &rdev->flags))
4485                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
4486         } else {
4487                 info.major = info.minor = 0;
4488                 info.raid_disk = -1;
4489                 info.state = (1<<MD_DISK_REMOVED);
4490         }
4491
4492         if (copy_to_user(arg, &info, sizeof(info)))
4493                 return -EFAULT;
4494
4495         return 0;
4496 }
4497
4498 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4499 {
4500         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4501         mdk_rdev_t *rdev;
4502         dev_t dev = MKDEV(info->major,info->minor);
4503
4504         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4505                 return -EOVERFLOW;
4506
4507         if (!mddev->raid_disks) {
4508                 int err;
4509                 /* expecting a device which has a superblock */
4510                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4511                 if (IS_ERR(rdev)) {
4512                         printk(KERN_WARNING 
4513                                 "md: md_import_device returned %ld\n",
4514                                 PTR_ERR(rdev));
4515                         return PTR_ERR(rdev);
4516                 }
4517                 if (!list_empty(&mddev->disks)) {
4518                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4519                                                         mdk_rdev_t, same_set);
4520                         int err = super_types[mddev->major_version]
4521                                 .load_super(rdev, rdev0, mddev->minor_version);
4522                         if (err < 0) {
4523                                 printk(KERN_WARNING 
4524                                         "md: %s has different UUID to %s\n",
4525                                         bdevname(rdev->bdev,b), 
4526                                         bdevname(rdev0->bdev,b2));
4527                                 export_rdev(rdev);
4528                                 return -EINVAL;
4529                         }
4530                 }
4531                 err = bind_rdev_to_array(rdev, mddev);
4532                 if (err)
4533                         export_rdev(rdev);
4534                 return err;
4535         }
4536
4537         /*
4538          * add_new_disk can be used once the array is assembled
4539          * to add "hot spares".  They must already have a superblock
4540          * written
4541          */
4542         if (mddev->pers) {
4543                 int err;
4544                 if (!mddev->pers->hot_add_disk) {
4545                         printk(KERN_WARNING 
4546                                 "%s: personality does not support diskops!\n",
4547                                mdname(mddev));
4548                         return -EINVAL;
4549                 }
4550                 if (mddev->persistent)
4551                         rdev = md_import_device(dev, mddev->major_version,
4552                                                 mddev->minor_version);
4553                 else
4554                         rdev = md_import_device(dev, -1, -1);
4555                 if (IS_ERR(rdev)) {
4556                         printk(KERN_WARNING 
4557                                 "md: md_import_device returned %ld\n",
4558                                 PTR_ERR(rdev));
4559                         return PTR_ERR(rdev);
4560                 }
4561                 /* set save_raid_disk if appropriate */
4562                 if (!mddev->persistent) {
4563                         if (info->state & (1<<MD_DISK_SYNC)  &&
4564                             info->raid_disk < mddev->raid_disks)
4565                                 rdev->raid_disk = info->raid_disk;
4566                         else
4567                                 rdev->raid_disk = -1;
4568                 } else
4569                         super_types[mddev->major_version].
4570                                 validate_super(mddev, rdev);
4571                 rdev->saved_raid_disk = rdev->raid_disk;
4572
4573                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4574                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4575                         set_bit(WriteMostly, &rdev->flags);
4576                 else
4577                         clear_bit(WriteMostly, &rdev->flags);
4578
4579                 rdev->raid_disk = -1;
4580                 err = bind_rdev_to_array(rdev, mddev);
4581                 if (!err && !mddev->pers->hot_remove_disk) {
4582                         /* If there is hot_add_disk but no hot_remove_disk
4583                          * then added disks for geometry changes,
4584                          * and should be added immediately.
4585                          */
4586                         super_types[mddev->major_version].
4587                                 validate_super(mddev, rdev);
4588                         err = mddev->pers->hot_add_disk(mddev, rdev);
4589                         if (err)
4590                                 unbind_rdev_from_array(rdev);
4591                 }
4592                 if (err)
4593                         export_rdev(rdev);
4594                 else
4595                         sysfs_notify_dirent(rdev->sysfs_state);
4596
4597                 md_update_sb(mddev, 1);
4598                 if (mddev->degraded)
4599                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4600                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4601                 md_wakeup_thread(mddev->thread);
4602                 return err;
4603         }
4604
4605         /* otherwise, add_new_disk is only allowed
4606          * for major_version==0 superblocks
4607          */
4608         if (mddev->major_version != 0) {
4609                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4610                        mdname(mddev));
4611                 return -EINVAL;
4612         }
4613
4614         if (!(info->state & (1<<MD_DISK_FAULTY))) {
4615                 int err;
4616                 rdev = md_import_device(dev, -1, 0);
4617                 if (IS_ERR(rdev)) {
4618                         printk(KERN_WARNING 
4619                                 "md: error, md_import_device() returned %ld\n",
4620                                 PTR_ERR(rdev));
4621                         return PTR_ERR(rdev);
4622                 }
4623                 rdev->desc_nr = info->number;
4624                 if (info->raid_disk < mddev->raid_disks)
4625                         rdev->raid_disk = info->raid_disk;
4626                 else
4627                         rdev->raid_disk = -1;
4628
4629                 if (rdev->raid_disk < mddev->raid_disks)
4630                         if (info->state & (1<<MD_DISK_SYNC))
4631                                 set_bit(In_sync, &rdev->flags);
4632
4633                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4634                         set_bit(WriteMostly, &rdev->flags);
4635
4636                 if (!mddev->persistent) {
4637                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
4638                         rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4639                 } else 
4640                         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4641                 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
4642
4643                 err = bind_rdev_to_array(rdev, mddev);
4644                 if (err) {
4645                         export_rdev(rdev);
4646                         return err;
4647                 }
4648         }
4649
4650         return 0;
4651 }
4652
4653 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4654 {
4655         char b[BDEVNAME_SIZE];
4656         mdk_rdev_t *rdev;
4657
4658         rdev = find_rdev(mddev, dev);
4659         if (!rdev)
4660                 return -ENXIO;
4661
4662         if (rdev->raid_disk >= 0)
4663                 goto busy;
4664
4665         kick_rdev_from_array(rdev);
4666         md_update_sb(mddev, 1);
4667         md_new_event(mddev);
4668
4669         return 0;
4670 busy:
4671         printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4672                 bdevname(rdev->bdev,b), mdname(mddev));
4673         return -EBUSY;
4674 }
4675
4676 static int hot_add_disk(mddev_t * mddev, dev_t dev)
4677 {
4678         char b[BDEVNAME_SIZE];
4679         int err;
4680         mdk_rdev_t *rdev;
4681
4682         if (!mddev->pers)
4683                 return -ENODEV;
4684
4685         if (mddev->major_version != 0) {
4686                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4687                         " version-0 superblocks.\n",
4688                         mdname(mddev));
4689                 return -EINVAL;
4690         }
4691         if (!mddev->pers->hot_add_disk) {
4692                 printk(KERN_WARNING 
4693                         "%s: personality does not support diskops!\n",
4694                         mdname(mddev));
4695                 return -EINVAL;
4696         }
4697
4698         rdev = md_import_device(dev, -1, 0);
4699         if (IS_ERR(rdev)) {
4700                 printk(KERN_WARNING 
4701                         "md: error, md_import_device() returned %ld\n",
4702                         PTR_ERR(rdev));
4703                 return -EINVAL;
4704         }
4705
4706         if (mddev->persistent)
4707                 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4708         else
4709                 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4710
4711         rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
4712
4713         if (test_bit(Faulty, &rdev->flags)) {
4714                 printk(KERN_WARNING 
4715                         "md: can not hot-add faulty %s disk to %s!\n",
4716                         bdevname(rdev->bdev,b), mdname(mddev));
4717                 err = -EINVAL;
4718                 goto abort_export;
4719         }
4720         clear_bit(In_sync, &rdev->flags);
4721         rdev->desc_nr = -1;
4722         rdev->saved_raid_disk = -1;
4723         err = bind_rdev_to_array(rdev, mddev);
4724         if (err)
4725                 goto abort_export;
4726
4727         /*
4728          * The rest should better be atomic, we can have disk failures
4729          * noticed in interrupt contexts ...
4730          */
4731
4732         rdev->raid_disk = -1;
4733
4734         md_update_sb(mddev, 1);
4735
4736         /*
4737          * Kick recovery, maybe this spare has to be added to the
4738          * array immediately.
4739          */
4740         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4741         md_wakeup_thread(mddev->thread);
4742         md_new_event(mddev);
4743         return 0;
4744
4745 abort_export:
4746         export_rdev(rdev);
4747         return err;
4748 }
4749
4750 static int set_bitmap_file(mddev_t *mddev, int fd)
4751 {
4752         int err;
4753
4754         if (mddev->pers) {
4755                 if (!mddev->pers->quiesce)
4756                         return -EBUSY;
4757                 if (mddev->recovery || mddev->sync_thread)
4758                         return -EBUSY;
4759                 /* we should be able to change the bitmap.. */
4760         }
4761
4762
4763         if (fd >= 0) {
4764                 if (mddev->bitmap)
4765                         return -EEXIST; /* cannot add when bitmap is present */
4766                 mddev->bitmap_file = fget(fd);
4767
4768                 if (mddev->bitmap_file == NULL) {
4769                         printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4770                                mdname(mddev));
4771                         return -EBADF;
4772                 }
4773
4774                 err = deny_bitmap_write_access(mddev->bitmap_file);
4775                 if (err) {
4776                         printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4777                                mdname(mddev));
4778                         fput(mddev->bitmap_file);
4779                         mddev->bitmap_file = NULL;
4780                         return err;
4781                 }
4782                 mddev->bitmap_offset = 0; /* file overrides offset */
4783         } else if (mddev->bitmap == NULL)
4784                 return -ENOENT; /* cannot remove what isn't there */
4785         err = 0;
4786         if (mddev->pers) {
4787                 mddev->pers->quiesce(mddev, 1);
4788                 if (fd >= 0)
4789                         err = bitmap_create(mddev);
4790                 if (fd < 0 || err) {
4791                         bitmap_destroy(mddev);
4792                         fd = -1; /* make sure to put the file */
4793                 }
4794                 mddev->pers->quiesce(mddev, 0);
4795         }
4796         if (fd < 0) {
4797                 if (mddev->bitmap_file) {
4798                         restore_bitmap_write_access(mddev->bitmap_file);
4799                         fput(mddev->bitmap_file);
4800                 }
4801                 mddev->bitmap_file = NULL;
4802         }
4803
4804         return err;
4805 }
4806
/*
 * set_array_info is used two different ways
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent,layout,chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
4820 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4821 {
4822
4823         if (info->raid_disks == 0) {
4824                 /* just setting version number for superblock loading */
4825                 if (info->major_version < 0 ||
4826                     info->major_version >= ARRAY_SIZE(super_types) ||
4827                     super_types[info->major_version].name == NULL) {
4828                         /* maybe try to auto-load a module? */
4829                         printk(KERN_INFO 
4830                                 "md: superblock version %d not known\n",
4831                                 info->major_version);
4832                         return -EINVAL;
4833                 }
4834                 mddev->major_version = info->major_version;
4835                 mddev->minor_version = info->minor_version;
4836                 mddev->patch_version = info->patch_version;
4837                 mddev->persistent = !info->not_persistent;
4838                 return 0;
4839         }
4840         mddev->major_version = MD_MAJOR_VERSION;
4841         mddev->minor_version = MD_MINOR_VERSION;
4842         mddev->patch_version = MD_PATCHLEVEL_VERSION;
4843         mddev->ctime         = get_seconds();
4844
4845         mddev->level         = info->level;
4846         mddev->clevel[0]     = 0;
4847         mddev->dev_sectors   = 2 * (sector_t)info->size;
4848         mddev->raid_disks    = info->raid_disks;
4849         /* don't set md_minor, it is determined by which /dev/md* was
4850          * openned
4851          */
4852         if (info->state & (1<<MD_SB_CLEAN))
4853                 mddev->recovery_cp = MaxSector;
4854         else
4855                 mddev->recovery_cp = 0;
4856         mddev->persistent    = ! info->not_persistent;
4857         mddev->external      = 0;
4858
4859         mddev->layout        = info->layout;
4860         mddev->chunk_size    = info->chunk_size;
4861
4862         mddev->max_disks     = MD_SB_DISKS;
4863
4864         if (mddev->persistent)
4865                 mddev->flags         = 0;
4866         set_bit(MD_CHANGE_DEVS, &mddev->flags);
4867
4868         mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4869         mddev->bitmap_offset = 0;
4870
4871         mddev->reshape_position = MaxSector;
4872
4873         /*
4874          * Generate a 128 bit UUID
4875          */
4876         get_random_bytes(mddev->uuid, 16);
4877
4878         mddev->new_level = mddev->level;
4879         mddev->new_chunk = mddev->chunk_size;
4880         mddev->new_layout = mddev->layout;
4881         mddev->delta_disks = 0;
4882
4883         return 0;
4884 }
4885
4886 static int update_size(mddev_t *mddev, sector_t num_sectors)
4887 {
4888         mdk_rdev_t *rdev;
4889         int rv;
4890         int fit = (num_sectors == 0);
4891
4892         if (mddev->pers->resize == NULL)
4893                 return -EINVAL;
4894         /* The "num_sectors" is the number of sectors of each device that
4895          * is used.  This can only make sense for arrays with redundancy.
4896          * linear and raid0 always use whatever space is available. We can only
4897          * consider changing this number if no resync or reconstruction is
4898          * happening, and if the new size is acceptable. It must fit before the
4899          * sb_start or, if that is <data_offset, it must fit before the size
4900          * of each device.  If num_sectors is zero, we find the largest size
4901          * that fits.
4902
4903          */
4904         if (mddev->sync_thread)
4905                 return -EBUSY;
4906         if (mddev->bitmap)
4907                 /* Sorry, cannot grow a bitmap yet, just remove it,
4908                  * grow, and re-add.
4909                  */
4910                 return -EBUSY;
4911         list_for_each_entry(rdev, &mddev->disks, same_set) {
4912                 sector_t avail = rdev->sectors;
4913
4914                 if (fit && (num_sectors == 0 || num_sectors > avail))
4915                         num_sectors = avail;
4916                 if (avail < num_sectors)
4917                         return -ENOSPC;
4918         }
4919         rv = mddev->pers->resize(mddev, num_sectors);
4920         if (!rv) {
4921                 struct block_device *bdev;
4922
4923                 bdev = bdget_disk(mddev->gendisk, 0);
4924                 if (bdev) {
4925                         mutex_lock(&bdev->bd_inode->i_mutex);
4926                         i_size_write(bdev->bd_inode,
4927                                      (loff_t)mddev->array_sectors << 9);
4928                         mutex_unlock(&bdev->bd_inode->i_mutex);
4929                         bdput(bdev);
4930                 }
4931         }
4932         return rv;
4933 }
4934
4935 static int update_raid_disks(mddev_t *mddev, int raid_disks)
4936 {
4937         int rv;
4938         /* change the number of raid disks */
4939         if (mddev->pers->check_reshape == NULL)
4940                 return -EINVAL;
4941         if (raid_disks <= 0 ||
4942             raid_disks >= mddev->max_disks)
4943                 return -EINVAL;
4944         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4945                 return -EBUSY;
4946         mddev->delta_disks = raid_disks - mddev->raid_disks;
4947
4948         rv = mddev->pers->check_reshape(mddev);
4949         return rv;
4950 }
4951
4952
4953 /*
4954  * update_array_info is used to change the configuration of an
4955  * on-line array.
4956  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
4957  * fields in the info are checked against the array.
4958  * Any differences that cannot be handled will cause an error.
4959  * Normally, only one change can be managed at a time.
4960  */
4961 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4962 {
4963         int rv = 0;
4964         int cnt = 0;
4965         int state = 0;
4966
4967         /* calculate expected state,ignoring low bits */
4968         if (mddev->bitmap && mddev->bitmap_offset)
4969                 state |= (1 << MD_SB_BITMAP_PRESENT);
4970
4971         if (mddev->major_version != info->major_version ||
4972             mddev->minor_version != info->minor_version ||
4973 /*          mddev->patch_version != info->patch_version || */
4974             mddev->ctime         != info->ctime         ||
4975             mddev->level         != info->level         ||
4976 /*          mddev->layout        != info->layout        || */
4977             !mddev->persistent   != info->not_persistent||
4978             mddev->chunk_size    != info->chunk_size    ||
4979             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4980             ((state^info->state) & 0xfffffe00)
4981                 )
4982                 return -EINVAL;
4983         /* Check there is only one change */
4984         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
4985                 cnt++;
4986         if (mddev->raid_disks != info->raid_disks)
4987                 cnt++;
4988         if (mddev->layout != info->layout)
4989                 cnt++;
4990         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
4991                 cnt++;
4992         if (cnt == 0)
4993                 return 0;
4994         if (cnt > 1)
4995                 return -EINVAL;
4996
4997         if (mddev->layout != info->layout) {
4998                 /* Change layout
4999                  * we don't need to do anything at the md level, the
5000                  * personality will take care of it all.
5001                  */
5002                 if (mddev->pers->reconfig == NULL)
5003                         return -EINVAL;
5004                 else
5005                         return mddev->pers->reconfig(mddev, info->layout, -1);
5006         }
5007         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5008                 rv = update_size(mddev, (sector_t)info->size * 2);
5009
5010         if (mddev->raid_disks    != info->raid_disks)
5011                 rv = update_raid_disks(mddev, info->raid_disks);
5012
5013         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5014                 if (mddev->pers->quiesce == NULL)
5015                         return -EINVAL;
5016                 if (mddev->recovery || mddev->sync_thread)
5017                         return -EBUSY;
5018                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5019                         /* add the bitmap */
5020                         if (mddev->bitmap)
5021                                 return -EEXIST;
5022                         if (mddev->default_bitmap_offset == 0)
5023                                 return -EINVAL;
5024                         mddev->bitmap_offset = mddev->default_bitmap_offset;
5025                         mddev->pers->quiesce(mddev, 1);
5026                         rv = bitmap_create(mddev);
5027                         if (rv)
5028                                 bitmap_destroy(mddev);
5029                         mddev->pers->quiesce(mddev, 0);
5030                 } else {
5031                         /* remove the bitmap */
5032                         if (!mddev->bitmap)
5033                                 return -ENOENT;
5034                         if (mddev->bitmap->file)
5035                                 return -EINVAL;
5036                         mddev->pers->quiesce(mddev, 1);
5037                         bitmap_destroy(mddev);
5038                         mddev->pers->quiesce(mddev, 0);
5039                         mddev->bitmap_offset = 0;
5040                 }
5041         }
5042         md_update_sb(mddev, 1);
5043         return rv;
5044 }
5045
5046 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5047 {
5048         mdk_rdev_t *rdev;
5049
5050         if (mddev->pers == NULL)
5051                 return -ENODEV;
5052
5053         rdev = find_rdev(mddev, dev);
5054         if (!rdev)
5055                 return -ENODEV;
5056
5057         md_error(mddev, rdev);
5058         return 0;
5059 }
5060
5061 /*
5062  * We have a problem here : there is no easy way to give a CHS
5063  * virtual geometry. We currently pretend that we have a 2 heads
5064  * 4 sectors (with a BIG number of cylinders...). This drives
5065  * dosfs just mad... ;-)
5066  */
5067 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5068 {
5069         mddev_t *mddev = bdev->bd_disk->private_data;
5070
5071         geo->heads = 2;
5072         geo->sectors = 4;
5073         geo->cylinders = get_capacity(mddev->gendisk) / 8;
5074         return 0;
5075 }
5076
5077 static int md_ioctl(struct block_device *bdev, fmode_t mode,
5078                         unsigned int cmd, unsigned long arg)
5079 {
5080         int err = 0;
5081         void __user *argp = (void __user *)arg;
5082         mddev_t *mddev = NULL;
5083
5084         if (!capable(CAP_SYS_ADMIN))
5085                 return -EACCES;
5086
5087         /*
5088          * Commands dealing with the RAID driver but not any
5089          * particular array:
5090          */
5091         switch (cmd)
5092         {
5093                 case RAID_VERSION:
5094                         err = get_version(argp);
5095                         goto done;
5096
5097                 case PRINT_RAID_DEBUG:
5098                         err = 0;
5099                         md_print_devices();
5100                         goto done;
5101
5102 #ifndef MODULE
5103                 case RAID_AUTORUN:
5104                         err = 0;
5105                         autostart_arrays(arg);
5106                         goto done;
5107 #endif
5108                 default:;
5109         }
5110
5111         /*
5112          * Commands creating/starting a new array:
5113          */
5114
5115         mddev = bdev->bd_disk->private_data;
5116
5117         if (!mddev) {
5118                 BUG();
5119                 goto abort;
5120         }
5121
5122         err = mddev_lock(mddev);
5123         if (err) {
5124                 printk(KERN_INFO 
5125                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
5126                         err, cmd);
5127                 goto abort;
5128         }
5129
5130         switch (cmd)
5131         {
5132                 case SET_ARRAY_INFO:
5133                         {
5134                                 mdu_array_info_t info;
5135                                 if (!arg)
5136                                         memset(&info, 0, sizeof(info));
5137                                 else if (copy_from_user(&info, argp, sizeof(info))) {
5138                                         err = -EFAULT;
5139                                         goto abort_unlock;
5140                                 }
5141                                 if (mddev->pers) {
5142                                         err = update_array_info(mddev, &info);
5143                                         if (err) {
5144                                                 printk(KERN_WARNING "md: couldn't update"
5145                                                        " array info. %d\n", err);
5146                                                 goto abort_unlock;
5147                                         }
5148                                         goto done_unlock;
5149                                 }
5150                                 if (!list_empty(&mddev->disks)) {
5151                                         printk(KERN_WARNING
5152                                                "md: array %s already has disks!\n",
5153                                                mdname(mddev));
5154                                         err = -EBUSY;
5155                                         goto abort_unlock;
5156                                 }
5157                                 if (mddev->raid_disks) {
5158                                         printk(KERN_WARNING
5159                                                "md: array %s already initialised!\n",
5160                                                mdname(mddev));
5161                                         err = -EBUSY;
5162                                         goto abort_unlock;
5163                                 }
5164                                 err = set_array_info(mddev, &info);
5165                                 if (err) {
5166                                         printk(KERN_WARNING "md: couldn't set"
5167                                                " array info. %d\n", err);
5168                                         goto abort_unlock;
5169                                 }
5170                         }
5171                         goto done_unlock;
5172
5173                 default:;
5174         }
5175
5176         /*
5177          * Commands querying/configuring an existing array:
5178          */
5179         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
5180          * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
5181         if ((!mddev->raid_disks && !mddev->external)
5182             && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5183             && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5184             && cmd != GET_BITMAP_FILE) {
5185                 err = -ENODEV;
5186                 goto abort_unlock;
5187         }
5188
5189         /*
5190          * Commands even a read-only array can execute:
5191          */
5192         switch (cmd)
5193         {
5194                 case GET_ARRAY_INFO:
5195                         err = get_array_info(mddev, argp);
5196                         goto done_unlock;
5197
5198                 case GET_BITMAP_FILE:
5199                         err = get_bitmap_file(mddev, argp);
5200                         goto done_unlock;
5201
5202                 case GET_DISK_INFO:
5203                         err = get_disk_info(mddev, argp);
5204                         goto done_unlock;
5205
5206                 case RESTART_ARRAY_RW:
5207                         err = restart_array(mddev);
5208                         goto done_unlock;
5209
5210                 case STOP_ARRAY:
5211                         err = do_md_stop(mddev, 0, 1);
5212                         goto done_unlock;
5213
5214                 case STOP_ARRAY_RO:
5215                         err = do_md_stop(mddev, 1, 1);
5216                         goto done_unlock;
5217
5218         }
5219
5220         /*
5221          * The remaining ioctls are changing the state of the
5222          * superblock, so we do not allow them on read-only arrays.
5223          * However non-MD ioctls (e.g. get-size) will still come through
5224          * here and hit the 'default' below, so only disallow
5225          * 'md' ioctls, and switch to rw mode if started auto-readonly.
5226          */
5227         if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5228                 if (mddev->ro == 2) {
5229                         mddev->ro = 0;
5230                         sysfs_notify_dirent(mddev->sysfs_state);
5231                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5232                         md_wakeup_thread(mddev->thread);
5233                 } else {
5234                         err = -EROFS;
5235                         goto abort_unlock;
5236                 }
5237         }
5238
5239         switch (cmd)
5240         {
5241                 case ADD_NEW_DISK:
5242                 {
5243                         mdu_disk_info_t info;
5244                         if (copy_from_user(&info, argp, sizeof(info)))
5245                                 err = -EFAULT;
5246                         else
5247                                 err = add_new_disk(mddev, &info);
5248                         goto done_unlock;
5249                 }
5250
5251                 case HOT_REMOVE_DISK:
5252                         err = hot_remove_disk(mddev, new_decode_dev(arg));
5253                         goto done_unlock;
5254
5255                 case HOT_ADD_DISK:
5256                         err = hot_add_disk(mddev, new_decode_dev(arg));
5257                         goto done_unlock;
5258
5259                 case SET_DISK_FAULTY:
5260                         err = set_disk_faulty(mddev, new_decode_dev(arg));
5261                         goto done_unlock;
5262
5263                 case RUN_ARRAY:
5264                         err = do_md_run(mddev);
5265                         goto done_unlock;
5266
5267                 case SET_BITMAP_FILE:
5268                         err = set_bitmap_file(mddev, (int)arg);
5269                         goto done_unlock;
5270
5271                 default:
5272                         err = -EINVAL;
5273                         goto abort_unlock;
5274         }
5275
5276 done_unlock:
5277 abort_unlock:
5278         if (mddev->hold_active == UNTIL_IOCTL &&
5279             err != -EINVAL)
5280                 mddev->hold_active = 0;
5281         mddev_unlock(mddev);
5282
5283         return err;
5284 done:
5285         if (err)
5286                 MD_BUG();
5287 abort:
5288         return err;
5289 }
5290
5291 static int md_open(struct block_device *bdev, fmode_t mode)
5292 {
5293         /*
5294          * Succeed if we can lock the mddev, which confirms that
5295          * it isn't being stopped right now.
5296          */
5297         mddev_t *mddev = mddev_find(bdev->bd_dev);
5298         int err;
5299
5300         if (mddev->gendisk != bdev->bd_disk) {
5301                 /* we are racing with mddev_put which is discarding this
5302                  * bd_disk.
5303                  */
5304                 mddev_put(mddev);
5305                 /* Wait until bdev->bd_disk is definitely gone */
5306                 flush_scheduled_work();
5307                 /* Then retry the open from the top */
5308                 return -ERESTARTSYS;
5309         }
5310         BUG_ON(mddev != bdev->bd_disk->private_data);
5311
5312         if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
5313                 goto out;
5314
5315         err = 0;
5316         atomic_inc(&mddev->openers);
5317         mddev_unlock(mddev);
5318
5319         check_disk_change(bdev);
5320  out:
5321         return err;
5322 }
5323
5324 static int md_release(struct gendisk *disk, fmode_t mode)
5325 {
5326         mddev_t *mddev = disk->private_data;
5327
5328         BUG_ON(!mddev);
5329         atomic_dec(&mddev->openers);
5330         mddev_put(mddev);
5331
5332         return 0;
5333 }
5334
5335 static int md_media_changed(struct gendisk *disk)
5336 {
5337         mddev_t *mddev = disk->private_data;
5338
5339         return mddev->changed;
5340 }
5341
5342 static int md_revalidate(struct gendisk *disk)
5343 {
5344         mddev_t *mddev = disk->private_data;
5345
5346         mddev->changed = 0;
5347         return 0;
5348 }
5349 static struct block_device_operations md_fops =
5350 {
5351         .owner          = THIS_MODULE,
5352         .open           = md_open,
5353         .release        = md_release,
5354         .locked_ioctl   = md_ioctl,
5355         .getgeo         = md_getgeo,
5356         .media_changed  = md_media_changed,
5357         .revalidate_disk= md_revalidate,
5358 };
5359
5360 static int md_thread(void * arg)
5361 {
5362         mdk_thread_t *thread = arg;
5363
5364         /*
5365          * md_thread is a 'system-thread', it's priority should be very
5366          * high. We avoid resource deadlocks individually in each
5367          * raid personality. (RAID5 does preallocation) We also use RR and
5368          * the very same RT priority as kswapd, thus we will never get
5369          * into a priority inversion deadlock.
5370          *
5371          * we definitely have to have equal or higher priority than
5372          * bdflush, otherwise bdflush will deadlock if there are too
5373          * many dirty RAID5 blocks.
5374          */
5375
5376         allow_signal(SIGKILL);
5377         while (!kthread_should_stop()) {
5378
5379                 /* We need to wait INTERRUPTIBLE so that
5380                  * we don't add to the load-average.
5381                  * That means we need to be sure no signals are
5382                  * pending
5383                  */
5384                 if (signal_pending(current))
5385                         flush_signals(current);
5386
5387                 wait_event_interruptible_timeout
5388                         (thread->wqueue,
5389                          test_bit(THREAD_WAKEUP, &thread->flags)
5390                          || kthread_should_stop(),
5391                          thread->timeout);
5392
5393                 clear_bit(THREAD_WAKEUP, &thread->flags);
5394
5395                 thread->run(thread->mddev);
5396         }
5397
5398         return 0;
5399 }
5400
5401 void md_wakeup_thread(mdk_thread_t *thread)
5402 {
5403         if (thread) {
5404                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5405                 set_bit(THREAD_WAKEUP, &thread->flags);
5406                 wake_up(&thread->wqueue);
5407         }
5408 }
5409
5410 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5411                                  const char *name)
5412 {
5413         mdk_thread_t *thread;
5414
5415         thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5416         if (!thread)
5417                 return NULL;
5418
5419         init_waitqueue_head(&thread->wqueue);
5420
5421         thread->run = run;
5422         thread->mddev = mddev;
5423         thread->timeout = MAX_SCHEDULE_TIMEOUT;
5424         thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
5425         if (IS_ERR(thread->tsk)) {
5426                 kfree(thread);
5427                 return NULL;
5428         }
5429         return thread;
5430 }
5431
5432 void md_unregister_thread(mdk_thread_t *thread)
5433 {
5434         if (!thread)
5435                 return;
5436         dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5437
5438         kthread_stop(thread->tsk);
5439         kfree(thread);
5440 }
5441
5442 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5443 {
5444         if (!mddev) {
5445                 MD_BUG();
5446                 return;
5447         }
5448
5449         if (!rdev || test_bit(Faulty, &rdev->flags))
5450                 return;
5451
5452         if (mddev->external)
5453                 set_bit(Blocked, &rdev->flags);
5454 /*
5455         dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5456                 mdname(mddev),
5457                 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5458                 __builtin_return_address(0),__builtin_return_address(1),
5459                 __builtin_return_address(2),__builtin_return_address(3));
5460 */
5461         if (!mddev->pers)
5462                 return;
5463         if (!mddev->pers->error_handler)
5464                 return;
5465         mddev->pers->error_handler(mddev,rdev);
5466         if (mddev->degraded)
5467                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5468         set_bit(StateChanged, &rdev->flags);
5469         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5470         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5471         md_wakeup_thread(mddev->thread);
5472         md_new_event_inintr(mddev);
5473 }
5474
5475 /* seq_file implementation /proc/mdstat */
5476
5477 static void status_unused(struct seq_file *seq)
5478 {
5479         int i = 0;
5480         mdk_rdev_t *rdev;
5481
5482         seq_printf(seq, "unused devices: ");
5483
5484         list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5485                 char b[BDEVNAME_SIZE];
5486                 i++;
5487                 seq_printf(seq, "%s ",
5488                               bdevname(rdev->bdev,b));
5489         }
5490         if (!i)
5491                 seq_printf(seq, "<none>");
5492
5493         seq_printf(seq, "\n");
5494 }
5495
5496
5497 static void status_resync(struct seq_file *seq, mddev_t * mddev)
5498 {
5499         sector_t max_blocks, resync, res;
5500         unsigned long dt, db, rt;
5501         int scale;
5502         unsigned int per_milli;
5503
5504         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
5505
5506         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5507                 max_blocks = mddev->resync_max_sectors >> 1;
5508         else
5509                 max_blocks = mddev->dev_sectors / 2;
5510
5511         /*
5512          * Should not happen.
5513          */
5514         if (!max_blocks) {
5515                 MD_BUG();
5516                 return;
5517         }
5518         /* Pick 'scale' such that (resync>>scale)*1000 will fit
5519          * in a sector_t, and (max_blocks>>scale) will fit in a
5520          * u32, as those are the requirements for sector_div.
5521          * Thus 'scale' must be at least 10
5522          */
5523         scale = 10;
5524         if (sizeof(sector_t) > sizeof(unsigned long)) {
5525                 while ( max_blocks/2 > (1ULL<<(scale+32)))
5526                         scale++;
5527         }
5528         res = (resync>>scale)*1000;
5529         sector_div(res, (u32)((max_blocks>>scale)+1));
5530
5531         per_milli = res;
5532         {
5533                 int i, x = per_milli/50, y = 20-x;
5534                 seq_printf(seq, "[");
5535                 for (i = 0; i < x; i++)
5536                         seq_printf(seq, "=");
5537                 seq_printf(seq, ">");
5538                 for (i = 0; i < y; i++)
5539                         seq_printf(seq, ".");
5540                 seq_printf(seq, "] ");
5541         }
5542         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5543                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5544                     "reshape" :
5545                     (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5546                      "check" :
5547                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5548                       "resync" : "recovery"))),
5549                    per_milli/10, per_milli % 10,
5550                    (unsigned long long) resync,
5551                    (unsigned long long) max_blocks);
5552
5553         /*
5554          * We do not want to overflow, so the order of operands and
5555          * the * 100 / 100 trick are important. We do a +1 to be
5556          * safe against division by zero. We only estimate anyway.
5557          *
5558          * dt: time from mark until now
5559          * db: blocks written from mark until now
5560          * rt: remaining time
5561          */
5562         dt = ((jiffies - mddev->resync_mark) / HZ);
5563         if (!dt) dt++;
5564         db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5565                 - mddev->resync_mark_cnt;
5566         rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
5567
5568         seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
5569
5570         seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5571 }
5572
5573 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5574 {
5575         struct list_head *tmp;
5576         loff_t l = *pos;
5577         mddev_t *mddev;
5578
5579         if (l >= 0x10000)
5580                 return NULL;
5581         if (!l--)
5582                 /* header */
5583                 return (void*)1;
5584
5585         spin_lock(&all_mddevs_lock);
5586         list_for_each(tmp,&all_mddevs)
5587                 if (!l--) {
5588                         mddev = list_entry(tmp, mddev_t, all_mddevs);
5589                         mddev_get(mddev);
5590                         spin_unlock(&all_mddevs_lock);
5591                         return mddev;
5592                 }
5593         spin_unlock(&all_mddevs_lock);
5594         if (!l--)
5595                 return (void*)2;/* tail */
5596         return NULL;
5597 }
5598
5599 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5600 {
5601         struct list_head *tmp;
5602         mddev_t *next_mddev, *mddev = v;
5603         
5604         ++*pos;
5605         if (v == (void*)2)
5606                 return NULL;
5607
5608         spin_lock(&all_mddevs_lock);
5609         if (v == (void*)1)
5610                 tmp = all_mddevs.next;
5611         else
5612                 tmp = mddev->all_mddevs.next;
5613         if (tmp != &all_mddevs)
5614                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5615         else {
5616                 next_mddev = (void*)2;
5617                 *pos = 0x10000;
5618         }               
5619         spin_unlock(&all_mddevs_lock);
5620
5621         if (v != (void*)1)
5622                 mddev_put(mddev);
5623         return next_mddev;
5624
5625 }
5626
5627 static void md_seq_stop(struct seq_file *seq, void *v)
5628 {
5629         mddev_t *mddev = v;
5630
5631         if (mddev && v != (void*)1 && v != (void*)2)
5632                 mddev_put(mddev);
5633 }
5634
/* Per-open state of /proc/mdstat, reached through seq->private. */
struct mdstat_info {
	/* md_event_count as sampled when the header line was last shown */
	int event;
};
5638
5639 static int md_seq_show(struct seq_file *seq, void *v)
5640 {
5641         mddev_t *mddev = v;
5642         sector_t sectors;
5643         mdk_rdev_t *rdev;
5644         struct mdstat_info *mi = seq->private;
5645         struct bitmap *bitmap;
5646
5647         if (v == (void*)1) {
5648                 struct mdk_personality *pers;
5649                 seq_printf(seq, "Personalities : ");
5650                 spin_lock(&pers_lock);
5651                 list_for_each_entry(pers, &pers_list, list)
5652                         seq_printf(seq, "[%s] ", pers->name);
5653
5654                 spin_unlock(&pers_lock);
5655                 seq_printf(seq, "\n");
5656                 mi->event = atomic_read(&md_event_count);
5657                 return 0;
5658         }
5659         if (v == (void*)2) {
5660                 status_unused(seq);
5661                 return 0;
5662         }
5663
5664         if (mddev_lock(mddev) < 0)
5665                 return -EINTR;
5666
5667         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5668                 seq_printf(seq, "%s : %sactive", mdname(mddev),
5669                                                 mddev->pers ? "" : "in");
5670                 if (mddev->pers) {
5671                         if (mddev->ro==1)
5672                                 seq_printf(seq, " (read-only)");
5673                         if (mddev->ro==2)
5674                                 seq_printf(seq, " (auto-read-only)");
5675                         seq_printf(seq, " %s", mddev->pers->name);
5676                 }
5677
5678                 sectors = 0;
5679                 list_for_each_entry(rdev, &mddev->disks, same_set) {
5680                         char b[BDEVNAME_SIZE];
5681                         seq_printf(seq, " %s[%d]",
5682                                 bdevname(rdev->bdev,b), rdev->desc_nr);
5683                         if (test_bit(WriteMostly, &rdev->flags))
5684                                 seq_printf(seq, "(W)");
5685                         if (test_bit(Faulty, &rdev->flags)) {
5686                                 seq_printf(seq, "(F)");
5687                                 continue;
5688                         } else if (rdev->raid_disk < 0)
5689                                 seq_printf(seq, "(S)"); /* spare */
5690                         sectors += rdev->sectors;
5691                 }
5692
5693                 if (!list_empty(&mddev->disks)) {
5694                         if (mddev->pers)
5695                                 seq_printf(seq, "\n      %llu blocks",
5696                                            (unsigned long long)
5697                                            mddev->array_sectors / 2);
5698                         else
5699                                 seq_printf(seq, "\n      %llu blocks",
5700                                            (unsigned long long)sectors / 2);
5701                 }
5702                 if (mddev->persistent) {
5703                         if (mddev->major_version != 0 ||
5704                             mddev->minor_version != 90) {
5705                                 seq_printf(seq," super %d.%d",
5706                                            mddev->major_version,
5707                                            mddev->minor_version);
5708                         }
5709                 } else if (mddev->external)
5710                         seq_printf(seq, " super external:%s",
5711                                    mddev->metadata_type);
5712                 else
5713                         seq_printf(seq, " super non-persistent");
5714
5715                 if (mddev->pers) {
5716                         mddev->pers->status(seq, mddev);
5717                         seq_printf(seq, "\n      ");
5718                         if (mddev->pers->sync_request) {
5719                                 if (mddev->curr_resync > 2) {
5720                                         status_resync(seq, mddev);
5721                                         seq_printf(seq, "\n      ");
5722                                 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5723                                         seq_printf(seq, "\tresync=DELAYED\n      ");
5724                                 else if (mddev->recovery_cp < MaxSector)
5725                                         seq_printf(seq, "\tresync=PENDING\n      ");
5726                         }
5727                 } else
5728                         seq_printf(seq, "\n       ");
5729
5730                 if ((bitmap = mddev->bitmap)) {
5731                         unsigned long chunk_kb;
5732                         unsigned long flags;
5733                         spin_lock_irqsave(&bitmap->lock, flags);
5734                         chunk_kb = bitmap->chunksize >> 10;
5735                         seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5736                                 "%lu%s chunk",
5737                                 bitmap->pages - bitmap->missing_pages,
5738                                 bitmap->pages,
5739                                 (bitmap->pages - bitmap->missing_pages)
5740                                         << (PAGE_SHIFT - 10),
5741                                 chunk_kb ? chunk_kb : bitmap->chunksize,
5742                                 chunk_kb ? "KB" : "B");
5743                         if (bitmap->file) {
5744                                 seq_printf(seq, ", file: ");
5745                                 seq_path(seq, &bitmap->file->f_path, " \t\n");
5746                         }
5747
5748                         seq_printf(seq, "\n");
5749                         spin_unlock_irqrestore(&bitmap->lock, flags);
5750                 }
5751
5752                 seq_printf(seq, "\n");
5753         }
5754         mddev_unlock(mddev);
5755         
5756         return 0;
5757 }
5758
/* Iterator callbacks for the mdstat seq_file; handed to seq_open() by
 * md_seq_open().
 */
static struct seq_operations md_seq_ops = {
        .start  = md_seq_start,
        .next   = md_seq_next,
        .stop   = md_seq_stop,
        .show   = md_seq_show,
};
5765
5766 static int md_seq_open(struct inode *inode, struct file *file)
5767 {
5768         int error;
5769         struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5770         if (mi == NULL)
5771                 return -ENOMEM;
5772
5773         error = seq_open(file, &md_seq_ops);
5774         if (error)
5775                 kfree(mi);
5776         else {
5777                 struct seq_file *p = file->private_data;
5778                 p->private = mi;
5779                 mi->event = atomic_read(&md_event_count);
5780         }
5781         return error;
5782 }
5783
5784 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
5785 {
5786         struct seq_file *m = filp->private_data;
5787         struct mdstat_info *mi = m->private;
5788         int mask;
5789
5790         poll_wait(filp, &md_event_waiters, wait);
5791
5792         /* always allow read */
5793         mask = POLLIN | POLLRDNORM;
5794
5795         if (mi->event != atomic_read(&md_event_count))
5796                 mask |= POLLERR | POLLPRI;
5797         return mask;
5798 }
5799
/*
 * File operations for the mdstat seq_file.  open and poll are md-specific;
 * read/llseek/release use the generic seq_file helpers.
 * NOTE(review): seq_release_private presumably frees the mdstat_info that
 * md_seq_open() stored in seq_file->private -- confirm against seq_file.
 */
static const struct file_operations md_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = md_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_private,
        .poll           = mdstat_poll,
};
5808
5809 int register_md_personality(struct mdk_personality *p)
5810 {
5811         spin_lock(&pers_lock);
5812         list_add_tail(&p->list, &pers_list);
5813         printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
5814         spin_unlock(&pers_lock);
5815         return 0;
5816 }
5817
5818 int unregister_md_personality(struct mdk_personality *p)
5819 {
5820         printk(KERN_INFO "md: %s personality unregistered\n", p->name);
5821         spin_lock(&pers_lock);
5822         list_del_init(&p->list);
5823         spin_unlock(&pers_lock);
5824         return 0;
5825 }
5826
5827 static int is_mddev_idle(mddev_t *mddev, int init)
5828 {
5829         mdk_rdev_t * rdev;
5830         int idle;
5831         int curr_events;
5832
5833         idle = 1;
5834         rcu_read_lock();
5835         rdev_for_each_rcu(rdev, mddev) {
5836                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5837                 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
5838                               (int)part_stat_read(&disk->part0, sectors[1]) -
5839                               atomic_read(&disk->sync_io);
5840                 /* sync IO will cause sync_io to increase before the disk_stats
5841                  * as sync_io is counted when a request starts, and
5842                  * disk_stats is counted when it completes.
5843                  * So resync activity will cause curr_events to be smaller than
5844                  * when there was no such activity.
5845                  * non-sync IO will cause disk_stat to increase without
5846                  * increasing sync_io so curr_events will (eventually)
5847                  * be larger than it was before.  Once it becomes
5848                  * substantially larger, the test below will cause
5849                  * the array to appear non-idle, and resync will slow
5850                  * down.
5851                  * If there is a lot of outstanding resync activity when
5852                  * we set last_event to curr_events, then all that activity
5853                  * completing might cause the array to appear non-idle
5854                  * and resync will be slowed down even though there might
5855                  * not have been non-resync activity.  This will only
5856                  * happen once though.  'last_events' will soon reflect
5857                  * the state where there is little or no outstanding
5858                  * resync requests, and further resync activity will
5859                  * always make curr_events less than last_events.
5860                  *
5861                  */
5862                 if (init || curr_events - rdev->last_events > 64) {
5863                         rdev->last_events = curr_events;
5864                         idle = 0;
5865                 }
5866         }
5867         rcu_read_unlock();
5868         return idle;
5869 }
5870
5871 void md_done_sync(mddev_t *mddev, int blocks, int ok)
5872 {
5873         /* another "blocks" (512byte) blocks have been synced */
5874         atomic_sub(blocks, &mddev->recovery_active);
5875         wake_up(&mddev->recovery_wait);
5876         if (!ok) {
5877                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5878                 md_wakeup_thread(mddev->thread);
5879                 // stop recovery, signal do_sync ....
5880         }
5881 }
5882
5883
/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 *
 * Does nothing for bios that are not writes.  For writes it:
 *  - BUGs if the array is read-only (ro == 1);
 *  - switches an auto-read-only array (ro == 2) to read/write and
 *    requests recovery;
 *  - increments mddev->writes_pending (md_write_end() decrements it);
 *  - clears in_sync and sets MD_CHANGE_CLEAN if the array was in_sync;
 *  - sleeps until neither MD_CHANGE_CLEAN nor MD_CHANGE_PENDING is set.
 */
void md_write_start(mddev_t *mddev, struct bio *bi)
{
        int did_change = 0;     /* set when sysfs_state must be notified */
        if (bio_data_dir(bi) != WRITE)
                return;

        BUG_ON(mddev->ro == 1);
        if (mddev->ro == 2) {
                /* need to switch to read/write */
                mddev->ro = 0;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
                md_wakeup_thread(mddev->sync_thread);
                did_change = 1;
        }
        atomic_inc(&mddev->writes_pending);
        /* a pending safemode request (1) is cancelled by this write */
        if (mddev->safemode == 1)
                mddev->safemode = 0;
        /* in_sync is tested once without the lock and re-tested under
         * write_lock before being changed
         */
        if (mddev->in_sync) {
                spin_lock_irq(&mddev->write_lock);
                if (mddev->in_sync) {
                        mddev->in_sync = 0;
                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        md_wakeup_thread(mddev->thread);
                        did_change = 1;
                }
                spin_unlock_irq(&mddev->write_lock);
        }
        if (did_change)
                sysfs_notify_dirent(mddev->sysfs_state);
        /* block the write until the metadata update has been done */
        wait_event(mddev->sb_wait,
                   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
                   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
}
5923
5924 void md_write_end(mddev_t *mddev)
5925 {
5926         if (atomic_dec_and_test(&mddev->writes_pending)) {
5927                 if (mddev->safemode == 2)
5928                         md_wakeup_thread(mddev->thread);
5929                 else if (mddev->safemode_delay)
5930                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5931         }
5932 }
5933
5934 /* md_allow_write(mddev)
5935  * Calling this ensures that the array is marked 'active' so that writes
5936  * may proceed without blocking.  It is important to call this before
5937  * attempting a GFP_KERNEL allocation while holding the mddev lock.
5938  * Must be called with mddev_lock held.
5939  *
5940  * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
5941  * is dropped, so return -EAGAIN after notifying userspace.
5942  */
5943 int md_allow_write(mddev_t *mddev)
5944 {
5945         if (!mddev->pers)
5946                 return 0;
5947         if (mddev->ro)
5948                 return 0;
5949         if (!mddev->pers->sync_request)
5950                 return 0;
5951
5952         spin_lock_irq(&mddev->write_lock);
5953         if (mddev->in_sync) {
5954                 mddev->in_sync = 0;
5955                 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5956                 if (mddev->safemode_delay &&
5957                     mddev->safemode == 0)
5958                         mddev->safemode = 1;
5959                 spin_unlock_irq(&mddev->write_lock);
5960                 md_update_sb(mddev, 0);
5961                 sysfs_notify_dirent(mddev->sysfs_state);
5962         } else
5963                 spin_unlock_irq(&mddev->write_lock);
5964
5965         if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
5966                 return -EAGAIN;
5967         else
5968                 return 0;
5969 }
5970 EXPORT_SYMBOL_GPL(md_allow_write);
5971
/* Number of (time, sector count) samples kept in mark[]/mark_cnt[] for the
 * speed calculation, and the minimum time between stepping to a new sample.
 */
#define SYNC_MARKS      10
#define SYNC_MARK_STEP  (3*HZ)
/*
 * md_do_sync(mddev)
 * Perform one resync / data-check / reshape / recovery pass over @mddev.
 *
 * Returns at once if MD_RECOVERY_DONE is already set or the array is
 * read-only.  Otherwise it:
 *  1/ waits until no array that shares physical units with this one
 *     (match_mddev_units()) is ahead of it, unless parallel_resync is set;
 *  2/ picks the start sector 'j' and the end 'max_sectors' according to
 *     the kind of pass;
 *  3/ calls pers->sync_request() repeatedly, throttling between
 *     speed_min() and speed_max() and backing off while the array is
 *     not idle;
 *  4/ waits for outstanding IO, records the checkpoint / recovery
 *     offsets, resets the resync state, sets MD_RECOVERY_DONE and wakes
 *     mddev->thread.
 * If kthread_should_stop() becomes true the pass is flagged with
 * MD_RECOVERY_INTR and wound down through the same exit path.
 */
void md_do_sync(mddev_t *mddev)
{
        mddev_t *mddev2;
        unsigned int currspeed = 0,
                 window;
        sector_t max_sectors,j, io_sectors;
        unsigned long mark[SYNC_MARKS];
        sector_t mark_cnt[SYNC_MARKS];
        int last_mark,m;
        struct list_head *tmp;
        sector_t last_check;
        int skipped = 0;
        mdk_rdev_t *rdev;
        char *desc;

        /* just in case thread restarts... */
        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
                return;
        if (mddev->ro) /* never try to sync a read-only array */
                return;

        /* pick the name used for this pass in log messages */
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
                        desc = "data-check";
                else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
                        desc = "requested-resync";
                else
                        desc = "resync";
        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                desc = "reshape";
        else
                desc = "recovery";

        /* we overload curr_resync somewhat here.
         * 0 == not engaged in resync at all
         * 2 == checking that there is no conflict with another sync
         * 1 == like 2, but have yielded to allow conflicting resync to
         *              commence
         * other == active in resync - this many blocks
         *
         * Before starting a resync we must have set curr_resync to
         * 2, and then checked that every "conflicting" array has curr_resync
         * less than ours.  When we find one that is the same or higher
         * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
         * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
         * This will mean we have to start checking from the beginning again.
         *
         */

        do {
                mddev->curr_resync = 2;

        try_again:
                if (kthread_should_stop()) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        goto skip;
                }
                for_each_mddev(mddev2, tmp) {
                        if (mddev2 == mddev)
                                continue;
                        if (!mddev->parallel_resync
                        &&  mddev2->curr_resync
                        &&  match_mddev_units(mddev, mddev2)) {
                                DEFINE_WAIT(wq);
                                if (mddev < mddev2 && mddev->curr_resync == 2) {
                                        /* arbitrarily yield */
                                        mddev->curr_resync = 1;
                                        wake_up(&resync_wait);
                                }
                                if (mddev > mddev2 && mddev->curr_resync == 1)
                                        /* no need to wait here, we can wait the next
                                         * time 'round when curr_resync == 2
                                         */
                                        continue;
                                /* We need to wait 'interruptible' so as not to
                                 * contribute to the load average, and not to
                                 * be caught by 'softlockup'
                                 */
                                prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
                                if (!kthread_should_stop() &&
                                    mddev2->curr_resync >= mddev->curr_resync) {
                                        printk(KERN_INFO "md: delaying %s of %s"
                                               " until %s has finished (they"
                                               " share one or more physical units)\n",
                                               desc, mdname(mddev), mdname(mddev2));
                                        /* leaving the loop early via goto.
                                         * NOTE(review): presumably for_each_mddev
                                         * holds a reference on mddev2 that must be
                                         * dropped here -- confirm against the macro.
                                         */
                                        mddev_put(mddev2);
                                        if (signal_pending(current))
                                                flush_signals(current);
                                        schedule();
                                        finish_wait(&resync_wait, &wq);
                                        goto try_again;
                                }
                                finish_wait(&resync_wait, &wq);
                        }
                }
        } while (mddev->curr_resync < 2);

        /* choose start sector (j) and end sector (max_sectors) */
        j = 0;
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                /* resync follows the size requested by the personality,
                 * which defaults to physical size, but can be virtual size
                 */
                max_sectors = mddev->resync_max_sectors;
                mddev->resync_mismatches = 0;
                /* we don't use the checkpoint if there's a bitmap */
                if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
                        j = mddev->resync_min;
                else if (!mddev->bitmap)
                        j = mddev->recovery_cp;

        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                max_sectors = mddev->dev_sectors;
        else {
                /* recovery follows the physical size of devices */
                max_sectors = mddev->dev_sectors;
                /* start from the smallest recovery_offset of any active,
                 * non-faulty device that is not yet In_sync
                 */
                j = MaxSector;
                list_for_each_entry(rdev, &mddev->disks, same_set)
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(Faulty, &rdev->flags) &&
                            !test_bit(In_sync, &rdev->flags) &&
                            rdev->recovery_offset < j)
                                j = rdev->recovery_offset;
        }

        printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
        printk(KERN_INFO "md: minimum _guaranteed_  speed:"
                " %d KB/sec/disk.\n", speed_min(mddev));
        printk(KERN_INFO "md: using maximum available idle IO bandwidth "
               "(but not more than %d KB/sec) for %s.\n",
               speed_max(mddev), desc);

        is_mddev_idle(mddev, 1); /* this initializes IO event counters */

        /* start every speed sample at "now, nothing done yet" */
        io_sectors = 0;
        for (m = 0; m < SYNC_MARKS; m++) {
                mark[m] = jiffies;
                mark_cnt[m] = io_sectors;
        }
        last_mark = 0;
        mddev->resync_mark = mark[last_mark];
        mddev->resync_mark_cnt = mark_cnt[last_mark];

        /*
         * Tune reconstruction:
         */
        window = 32*(PAGE_SIZE/512);
        printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
                window/2,(unsigned long long) max_sectors/2);

        atomic_set(&mddev->recovery_active, 0);
        last_check = 0;

        if (j>2) {
                printk(KERN_INFO 
                       "md: resuming %s of %s from checkpoint.\n",
                       desc, mdname(mddev));
                mddev->curr_resync = j;
        }

        while (j < max_sectors) {
                sector_t sectors;

                skipped = 0;
                /* do not go past resync_max; wait until it is raised
                 * or the thread is asked to stop
                 */
                if (j >= mddev->resync_max) {
                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
                        wait_event(mddev->recovery_wait,
                                   mddev->resync_max > j
                                   || kthread_should_stop());
                }
                if (kthread_should_stop())
                        goto interrupted;

                if (mddev->curr_resync > mddev->curr_resync_completed &&
                    (mddev->curr_resync - mddev->curr_resync_completed)
                    > (max_sectors >> 4)) {
                        /* time to update curr_resync_completed */
                        blk_unplug(mddev->queue);
                        wait_event(mddev->recovery_wait,
                                   atomic_read(&mddev->recovery_active) == 0);
                        mddev->curr_resync_completed =
                                mddev->curr_resync;
                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                }
                sectors = mddev->pers->sync_request(mddev, j, &skipped,
                                                  currspeed < speed_min(mddev));
                /* a return of 0 is treated as "cannot continue" */
                if (sectors == 0) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        goto out;
                }

                if (!skipped) { /* actual IO requested */
                        io_sectors += sectors;
                        atomic_add(sectors, &mddev->recovery_active);
                }

                j += sectors;
                if (j>1) mddev->curr_resync = j;
                mddev->curr_mark_cnt = io_sectors;
                if (last_check == 0)
                        /* this is the earliest that rebuild will be
                         * visible in /proc/mdstat
                         */
                        md_new_event(mddev);

                /* only run the throttling checks below once per 'window'
                 * sectors of real IO, and not after the final request
                 */
                if (last_check + window > io_sectors || j == max_sectors)
                        continue;

                last_check = io_sectors;

                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                        break;

        repeat:
                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
                        /* step marks */
                        int next = (last_mark+1) % SYNC_MARKS;

                        mddev->resync_mark = mark[next];
                        mddev->resync_mark_cnt = mark_cnt[next];
                        mark[next] = jiffies;
                        mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
                        last_mark = next;
                }


                if (kthread_should_stop())
                        goto interrupted;


                /*
                 * this loop exits only when either we are slower than
                 * the 'hard' speed limit, or the system was IO-idle for
                 * a jiffy.
                 * the system might be non-idle CPU-wise, but we only care
                 * about not overloading the IO subsystem. (things like an
                 * e2fsck being done on the RAID array should execute fast)
                 */
                blk_unplug(mddev->queue);
                cond_resched();

                /* sectors since the reference mark, halved, per elapsed
                 * second; the +1 terms keep divisor and result non-zero
                 */
                currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
                        /((jiffies-mddev->resync_mark)/HZ +1) +1;

                if (currspeed > speed_min(mddev)) {
                        if ((currspeed > speed_max(mddev)) ||
                                        !is_mddev_idle(mddev, 0)) {
                                msleep(500);
                                goto repeat;
                        }
                }
        }
        printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
        /*
         * this also signals 'finished resyncing' to md_stop
         */
 out:
        blk_unplug(mddev->queue);

        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

        /* tell personality that we are finished */
        mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

        /* record how far we got, except for a data-check pass or when
         * curr_resync never got past the values 0..2
         */
        if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
            mddev->curr_resync > 2) {
                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
                                if (mddev->curr_resync >= mddev->recovery_cp) {
                                        printk(KERN_INFO
                                               "md: checkpointing %s of %s.\n",
                                               desc, mdname(mddev));
                                        mddev->recovery_cp = mddev->curr_resync;
                                }
                        } else
                                mddev->recovery_cp = MaxSector;
                } else {
                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                                mddev->curr_resync = MaxSector;
                        /* advance recovery_offset of every device that is
                         * still being recovered
                         */
                        list_for_each_entry(rdev, &mddev->disks, same_set)
                                if (rdev->raid_disk >= 0 &&
                                    !test_bit(Faulty, &rdev->flags) &&
                                    !test_bit(In_sync, &rdev->flags) &&
                                    rdev->recovery_offset < mddev->curr_resync)
                                        rdev->recovery_offset = mddev->curr_resync;
                }
        }
        set_bit(MD_CHANGE_DEVS, &mddev->flags);

 skip:
        /* reset resync state, release waiters, and hand back to mddev->thread */
        mddev->curr_resync = 0;
        mddev->resync_min = 0;
        mddev->resync_max = MaxSector;
        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        wake_up(&resync_wait);
        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        return;

 interrupted:
        /*
         * got a signal, exit.
         */
        printk(KERN_INFO
               "md: md_do_sync() got signal ... exiting\n");
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        goto out;

}
EXPORT_SYMBOL_GPL(md_do_sync);
6283
6284
6285 static int remove_and_add_spares(mddev_t *mddev)
6286 {
6287         mdk_rdev_t *rdev;
6288         int spares = 0;
6289
6290         mddev->curr_resync_completed = 0;
6291
6292         list_for_each_entry(rdev, &mddev->disks, same_set)
6293                 if (rdev->raid_disk >= 0 &&
6294                     !test_bit(Blocked, &rdev->flags) &&
6295                     (test_bit(Faulty, &rdev->flags) ||
6296                      ! test_bit(In_sync, &rdev->flags)) &&
6297                     atomic_read(&rdev->nr_pending)==0) {
6298                         if (mddev->pers->hot_remove_disk(
6299                                     mddev, rdev->raid_disk)==0) {
6300                                 char nm[20];
6301                                 sprintf(nm,"rd%d", rdev->raid_disk);
6302                                 sysfs_remove_link(&mddev->kobj, nm);
6303                                 rdev->raid_disk = -1;
6304                         }
6305                 }
6306
6307         if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6308                 list_for_each_entry(rdev, &mddev->disks, same_set) {
6309                         if (rdev->raid_disk >= 0 &&
6310                             !test_bit(In_sync, &rdev->flags) &&
6311                             !test_bit(Blocked, &rdev->flags))
6312                                 spares++;
6313                         if (rdev->raid_disk < 0
6314                             && !test_bit(Faulty, &rdev->flags)) {
6315                                 rdev->recovery_offset = 0;
6316                                 if (mddev->pers->
6317                                     hot_add_disk(mddev, rdev) == 0) {
6318                                         char nm[20];
6319                                         sprintf(nm, "rd%d", rdev->raid_disk);
6320                                         if (sysfs_create_link(&mddev->kobj,
6321                                                               &rdev->kobj, nm))
6322                                                 printk(KERN_WARNING
6323                                                        "md: cannot register "
6324                                                        "%s for %s\n",
6325                                                        nm, mdname(mddev));
6326                                         spares++;
6327                                         md_new_event(mddev);
6328                                 } else
6329                                         break;
6330                         }
6331                 }
6332         }
6333         return spares;
6334 }
6335 /*
6336  * This routine is regularly called by all per-raid-array threads to
6337  * deal with generic issues like resync and super-block update.
6338  * Raid personalities that don't have a thread (linear/raid0) do not
6339  * need this as they never do any recovery or update the superblock.
6340  *
6341  * It does not do any resync itself, but rather "forks" off other threads
6342  * to do that as needed.
6343  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6344  * "->recovery" and create a thread at ->sync_thread.
6345  * When the thread finishes it sets MD_RECOVERY_DONE
6346  * and wakeups up this thread which will reap the thread and finish up.
6347  * This thread also removes any faulty devices (with nr_pending == 0).
6348  *
6349  * The overall approach is:
6350  *  1/ if the superblock needs updating, update it.
6351  *  2/ If a recovery thread is running, don't do anything else.
6352  *  3/ If recovery has finished, clean up, possibly marking spares active.
6353  *  4/ If there are any faulty devices, remove them.
6354  *  5/ If array is degraded, try to add spares devices
6355  *  6/ If array has spares or is not in-sync, start a resync thread.
6356  */
6357 void md_check_recovery(mddev_t *mddev)
6358 {
6359         mdk_rdev_t *rdev;
6360
6361
6362         if (mddev->bitmap)
6363                 bitmap_daemon_work(mddev->bitmap);
6364
6365         if (mddev->ro)
6366                 return;
6367
6368         if (signal_pending(current)) {
6369                 if (mddev->pers->sync_request && !mddev->external) {
6370                         printk(KERN_INFO "md: %s in immediate safe mode\n",
6371                                mdname(mddev));
6372                         mddev->safemode = 2;
6373                 }
6374                 flush_signals(current);
6375         }
6376
6377         if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6378                 return;
6379         if ( ! (
6380                 (mddev->flags && !mddev->external) ||
6381                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6382                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6383                 (mddev->external == 0 && mddev->safemode == 1) ||
6384                 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6385                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6386                 ))
6387                 return;
6388
6389         if (mddev_trylock(mddev)) {
6390                 int spares = 0;
6391
6392                 if (mddev->ro) {
6393                         /* Only thing we do on a ro array is remove
6394                          * failed devices.
6395                          */
6396                         remove_and_add_spares(mddev);
6397                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6398                         goto unlock;
6399                 }
6400
6401                 if (!mddev->external) {
6402                         int did_change = 0;
6403                         spin_lock_irq(&mddev->write_lock);
6404                         if (mddev->safemode &&
6405                             !atomic_read(&mddev->writes_pending) &&
6406                             !mddev->in_sync &&
6407                             mddev->recovery_cp == MaxSector) {
6408                                 mddev->in_sync = 1;
6409                                 did_change = 1;
6410                                 if (mddev->persistent)
6411                                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6412                         }
6413                         if (mddev->safemode == 1)
6414                                 mddev->safemode = 0;
6415                         spin_unlock_irq(&mddev->write_lock);
6416                         if (did_change)
6417                                 sysfs_notify_dirent(mddev->sysfs_state);
6418                 }
6419
6420                 if (mddev->flags)
6421                         md_update_sb(mddev, 0);
6422
6423                 list_for_each_entry(rdev, &mddev->disks, same_set)
6424                         if (test_and_clear_bit(StateChanged, &rdev->flags))
6425                                 sysfs_notify_dirent(rdev->sysfs_state);
6426
6427
6428                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6429                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6430                         /* resync/recovery still happening */
6431                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6432                         goto unlock;
6433                 }
6434                 if (mddev->sync_thread) {
6435                         /* resync has finished, collect result */
6436                         md_unregister_thread(mddev->sync_thread);
6437                         mddev->sync_thread = NULL;
6438                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6439                             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6440                                 /* success...*/
6441                                 /* activate any spares */
6442                                 if (mddev->pers->spare_active(mddev))
6443                                         sysfs_notify(&mddev->kobj, NULL,
6444                                                      "degraded");
6445                         }
6446                         md_update_sb(mddev, 1);
6447
6448                         /* if array is no-longer degraded, then any saved_raid_disk
6449                          * information must be scrapped
6450                          */
6451                         if (!mddev->degraded)
6452                                 list_for_each_entry(rdev, &mddev->disks, same_set)
6453                                         rdev->saved_raid_disk = -1;
6454
6455                         mddev->recovery = 0;
6456                         /* flag recovery needed just to double check */
6457                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6458                         sysfs_notify_dirent(mddev->sysfs_action);
6459                         md_new_event(mddev);
6460                         goto unlock;
6461                 }
6462                 /* Set RUNNING before clearing NEEDED to avoid
6463                  * any transients in the value of "sync_action".
6464                  */
6465                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6466                 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6467                 /* Clear some bits that don't mean anything, but
6468                  * might be left set
6469                  */
6470                 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6471                 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6472
6473                 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6474                         goto unlock;
6475                 /* no recovery is running.
6476                  * remove any failed drives, then
6477                  * add spares if possible.
6478                  * Spare are also removed and re-added, to allow
6479                  * the personality to fail the re-add.
6480                  */
6481
6482                 if (mddev->reshape_position != MaxSector) {
6483                         if (mddev->pers->check_reshape(mddev) != 0)
6484                                 /* Cannot proceed */
6485                                 goto unlock;
6486                         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6487                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6488                 } else if ((spares = remove_and_add_spares(mddev))) {
6489                         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6490                         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6491                         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6492                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6493                 } else if (mddev->recovery_cp < MaxSector) {
6494                         set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6495                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6496                 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6497                         /* nothing to be done ... */
6498                         goto unlock;
6499
6500                 if (mddev->pers->sync_request) {
6501                         if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6502                                 /* We are adding a device or devices to an array
6503                                  * which has the bitmap stored on all devices.
6504                                  * So make sure all bitmap pages get written
6505                                  */
6506                                 bitmap_write_all(mddev->bitmap);
6507                         }
6508                         mddev->sync_thread = md_register_thread(md_do_sync,
6509                                                                 mddev,
6510                                                                 "%s_resync");
6511                         if (!mddev->sync_thread) {
6512                                 printk(KERN_ERR "%s: could not start resync"
6513                                         " thread...\n", 
6514                                         mdname(mddev));
6515                                 /* leave the spares where they are, it shouldn't hurt */
6516                                 mddev->recovery = 0;
6517                         } else
6518                                 md_wakeup_thread(mddev->sync_thread);
6519                         sysfs_notify_dirent(mddev->sysfs_action);
6520                         md_new_event(mddev);
6521                 }
6522         unlock:
6523                 if (!mddev->sync_thread) {
6524                         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6525                         if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6526                                                &mddev->recovery))
6527                                 if (mddev->sysfs_action)
6528                                         sysfs_notify_dirent(mddev->sysfs_action);
6529                 }
6530                 mddev_unlock(mddev);
6531         }
6532 }
6533
6534 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
6535 {
6536         sysfs_notify_dirent(rdev->sysfs_state);
6537         wait_event_timeout(rdev->blocked_wait,
6538                            !test_bit(Blocked, &rdev->flags),
6539                            msecs_to_jiffies(5000));
6540         rdev_dec_pending(rdev, mddev);
6541 }
6542 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
6543
6544 static int md_notify_reboot(struct notifier_block *this,
6545                             unsigned long code, void *x)
6546 {
6547         struct list_head *tmp;
6548         mddev_t *mddev;
6549
6550         if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
6551
6552                 printk(KERN_INFO "md: stopping all md devices.\n");
6553
6554                 for_each_mddev(mddev, tmp)
6555                         if (mddev_trylock(mddev)) {
6556                                 /* Force a switch to readonly even array
6557                                  * appears to still be in use.  Hence
6558                                  * the '100'.
6559                                  */
6560                                 do_md_stop(mddev, 1, 100);
6561                                 mddev_unlock(mddev);
6562                         }
6563                 /*
6564                  * certain more exotic SCSI devices are known to be
6565                  * volatile wrt too early system reboots. While the
6566                  * right place to handle this issue is the given
6567                  * driver, we do want to have a safe RAID driver ...
6568                  */
6569                 mdelay(1000*1);
6570         }
6571         return NOTIFY_DONE;
6572 }
6573
6574 static struct notifier_block md_notifier = {
6575         .notifier_call  = md_notify_reboot,
6576         .next           = NULL,
6577         .priority       = INT_MAX, /* before any real devices */
6578 };
6579
6580 static void md_geninit(void)
6581 {
6582         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6583
6584         proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
6585 }
6586
6587 static int __init md_init(void)
6588 {
6589         if (register_blkdev(MD_MAJOR, "md"))
6590                 return -1;
6591         if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
6592                 unregister_blkdev(MD_MAJOR, "md");
6593                 return -1;
6594         }
6595         blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
6596                             md_probe, NULL, NULL);
6597         blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6598                             md_probe, NULL, NULL);
6599
6600         register_reboot_notifier(&md_notifier);
6601         raid_table_header = register_sysctl_table(raid_root_table);
6602
6603         md_geninit();
6604         return 0;
6605 }
6606
6607
6608 #ifndef MODULE
6609
6610 /*
6611  * Searches all registered partitions for autorun RAID arrays
6612  * at boot time.
6613  */
6614
6615 static LIST_HEAD(all_detected_devices);
6616 struct detected_devices_node {
6617         struct list_head list;
6618         dev_t dev;
6619 };
6620
6621 void md_autodetect_dev(dev_t dev)
6622 {
6623         struct detected_devices_node *node_detected_dev;
6624
6625         node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
6626         if (node_detected_dev) {
6627                 node_detected_dev->dev = dev;
6628                 list_add_tail(&node_detected_dev->list, &all_detected_devices);
6629         } else {
6630                 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
6631                         ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
6632         }
6633 }
6634
6635
6636 static void autostart_arrays(int part)
6637 {
6638         mdk_rdev_t *rdev;
6639         struct detected_devices_node *node_detected_dev;
6640         dev_t dev;
6641         int i_scanned, i_passed;
6642
6643         i_scanned = 0;
6644         i_passed = 0;
6645
6646         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6647
6648         while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6649                 i_scanned++;
6650                 node_detected_dev = list_entry(all_detected_devices.next,
6651                                         struct detected_devices_node, list);
6652                 list_del(&node_detected_dev->list);
6653                 dev = node_detected_dev->dev;
6654                 kfree(node_detected_dev);
6655                 rdev = md_import_device(dev,0, 90);
6656                 if (IS_ERR(rdev))
6657                         continue;
6658
6659                 if (test_bit(Faulty, &rdev->flags)) {
6660                         MD_BUG();
6661                         continue;
6662                 }
6663                 set_bit(AutoDetected, &rdev->flags);
6664                 list_add(&rdev->same_set, &pending_raid_disks);
6665                 i_passed++;
6666         }
6667
6668         printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6669                                                 i_scanned, i_passed);
6670
6671         autorun_devices(part);
6672 }
6673
6674 #endif /* !MODULE */
6675
6676 static __exit void md_exit(void)
6677 {
6678         mddev_t *mddev;
6679         struct list_head *tmp;
6680
6681         blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
6682         blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6683
6684         unregister_blkdev(MD_MAJOR,"md");
6685         unregister_blkdev(mdp_major, "mdp");
6686         unregister_reboot_notifier(&md_notifier);
6687         unregister_sysctl_table(raid_table_header);
6688         remove_proc_entry("mdstat", NULL);
6689         for_each_mddev(mddev, tmp) {
6690                 export_array(mddev);
6691                 mddev->hold_active = 0;
6692         }
6693 }
6694
6695 subsys_initcall(md_init);
6696 module_exit(md_exit)
6697
6698 static int get_ro(char *buffer, struct kernel_param *kp)
6699 {
6700         return sprintf(buffer, "%d", start_readonly);
6701 }
6702 static int set_ro(const char *val, struct kernel_param *kp)
6703 {
6704         char *e;
6705         int num = simple_strtoul(val, &e, 10);
6706         if (*val && (*e == '\0' || *e == '\n')) {
6707                 start_readonly = num;
6708                 return 0;
6709         }
6710         return -EINVAL;
6711 }
6712
6713 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6714 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6715
6716 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
6717
6718 EXPORT_SYMBOL(register_md_personality);
6719 EXPORT_SYMBOL(unregister_md_personality);
6720 EXPORT_SYMBOL(md_error);
6721 EXPORT_SYMBOL(md_done_sync);
6722 EXPORT_SYMBOL(md_write_start);
6723 EXPORT_SYMBOL(md_write_end);
6724 EXPORT_SYMBOL(md_register_thread);
6725 EXPORT_SYMBOL(md_unregister_thread);
6726 EXPORT_SYMBOL(md_wakeup_thread);
6727 EXPORT_SYMBOL(md_check_recovery);
6728 MODULE_LICENSE("GPL");
6729 MODULE_ALIAS("md");
6730 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);