[PATCH] md: all hot-add and hot-remove of md intent logging bitmaps
[safe/jmp/linux-2.6] / drivers / md / md.c
index 789b114..ae65446 100644 (file)
@@ -195,8 +195,7 @@ static mddev_t * mddev_find(dev_t unit)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
-                       if (new)
-                               kfree(new);
+                       kfree(new);
                        return mddev;
                }
 
@@ -224,8 +223,8 @@ static mddev_t * mddev_find(dev_t unit)
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
-       bio_list_init(&new->write_list);
        spin_lock_init(&new->write_lock);
+       init_waitqueue_head(&new->sb_wait);
 
        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
@@ -257,8 +256,7 @@ static inline void mddev_unlock(mddev_t * mddev)
 {
        up(&mddev->reconfig_sem);
 
-       if (mddev->thread)
-               md_wakeup_thread(mddev->thread);
+       md_wakeup_thread(mddev->thread);
 }
 
 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@ -285,7 +283,7 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
        return NULL;
 }
 
-inline static sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
@@ -328,6 +326,41 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 }
 
 
+static int super_written(struct bio *bio, unsigned int bytes_done, int error)
+{
+       mdk_rdev_t *rdev = bio->bi_private;
+       if (bio->bi_size)
+               return 1;
+
+       if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+               md_error(rdev->mddev, rdev);
+
+       if (atomic_dec_and_test(&rdev->mddev->pending_writes))
+               wake_up(&rdev->mddev->sb_wait);
+       bio_put(bio);
+       return 0;
+}
+
+void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+                  sector_t sector, int size, struct page *page)
+{
+       /* write first size bytes of page to sector of rdev
+        * Increment mddev->pending_writes before returning
+        * and decrement it on completion, waking up sb_wait
+        * if zero is reached.
+        * If an error occurred, call md_error
+        */
+       struct bio *bio = bio_alloc(GFP_NOIO, 1);
+
+       bio->bi_bdev = rdev->bdev;
+       bio->bi_sector = sector;
+       bio_add_page(bio, page, size, 0);
+       bio->bi_private = rdev;
+       bio->bi_end_io = super_written;
+       atomic_inc(&mddev->pending_writes);
+       submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+}
+
 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
 {
        if (bio->bi_size)
@@ -337,7 +370,7 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
        return 0;
 }
 
-static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
 {
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
@@ -424,11 +457,8 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
                ret = 1;
 
 abort:
-       if (tmp1)
-               kfree(tmp1);
-       if (tmp2)
-               kfree(tmp2);
-
+       kfree(tmp1);
+       kfree(tmp2);
        return ret;
 }
 
@@ -592,6 +622,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
                mddev->events = md_event(sb);
+               mddev->bitmap_offset = 0;
+               mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
@@ -609,6 +641,17 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
 
                mddev->max_disks = MD_SB_DISKS;
+
+               if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
+                   mddev->bitmap_file == NULL) {
+                       if (mddev->level != 1) {
+                               /* FIXME use a better test */
+                               printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+                               return -EINVAL;
+                       }
+                       mddev->bitmap_offset = mddev->default_bitmap_offset;
+               }
+
        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                __u64 ev1 = md_event(sb);
@@ -702,6 +745,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->layout = mddev->layout;
        sb->chunk_size = mddev->chunk_size;
 
+       if (mddev->bitmap && mddev->bitmap_file == NULL)
+               sb->state |= (1<<MD_SB_BITMAP_PRESENT);
+
        sb->disks[0].state = (1<<MD_DISK_REMOVED);
        ITERATE_RDEV(mddev,rdev2,tmp) {
                mdp_disk_t *d;
@@ -799,7 +845,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
        case 0:
                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
                sb_offset -= 8*2;
-               sb_offset &= ~(4*2-1);
+               sb_offset &= ~(sector_t)(4*2-1);
                /* convert from sectors to K */
                sb_offset /= 2;
                break;
@@ -893,11 +939,24 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = le64_to_cpu(sb->events);
+               mddev->bitmap_offset = 0;
+               mddev->default_bitmap_offset = 0;
+               if (mddev->minor_version == 0)
+                       mddev->default_bitmap_offset = -(64*1024)/512;
                
                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);
 
                mddev->max_disks =  (4096-256)/2;
+
+               if ((le32_to_cpu(sb->feature_map) & 1) &&
+                   mddev->bitmap_file == NULL ) {
+                       if (mddev->level != 1) {
+                               printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+                               return -EINVAL;
+                       }
+                       mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
+               }
        } else if (mddev->pers == NULL) {
                /* Insist of good event counter while assembling */
                __u64 ev1 = le64_to_cpu(sb->events);
@@ -960,6 +1019,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        else
                sb->resync_offset = cpu_to_le64(0);
 
+       if (mddev->bitmap && mddev->bitmap_file == NULL) {
+               sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
+               sb->feature_map = cpu_to_le32(1);
+       }
+
        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
                if (rdev2->desc_nr+1 > max_dev)
@@ -1240,30 +1304,6 @@ void md_print_devices(void)
 }
 
 
-static int write_disk_sb(mdk_rdev_t * rdev)
-{
-       char b[BDEVNAME_SIZE];
-       if (!rdev->sb_loaded) {
-               MD_BUG();
-               return 1;
-       }
-       if (rdev->faulty) {
-               MD_BUG();
-               return 1;
-       }
-
-       dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
-               bdevname(rdev->bdev,b),
-              (unsigned long long)rdev->sb_offset);
-  
-       if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
-               return 0;
-
-       printk("md: write_disk_sb failed for device %s\n", 
-               bdevname(rdev->bdev,b));
-       return 1;
-}
-
 static void sync_sbs(mddev_t * mddev)
 {
        mdk_rdev_t *rdev;
@@ -1278,7 +1318,7 @@ static void sync_sbs(mddev_t * mddev)
 
 static void md_update_sb(mddev_t * mddev)
 {
-       int err, count = 100;
+       int err;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;
@@ -1298,6 +1338,7 @@ repeat:
                MD_BUG();
                mddev->events --;
        }
+       mddev->sb_dirty = 2;
        sync_sbs(mddev);
 
        /*
@@ -1307,6 +1348,7 @@ repeat:
        if (!mddev->persistent) {
                mddev->sb_dirty = 0;
                spin_unlock(&mddev->write_lock);
+               wake_up(&mddev->sb_wait);
                return;
        }
        spin_unlock(&mddev->write_lock);
@@ -1324,30 +1366,31 @@ repeat:
 
                dprintk("%s ", bdevname(rdev->bdev,b));
                if (!rdev->faulty) {
-                       err += write_disk_sb(rdev);
+                       md_super_write(mddev,rdev,
+                                      rdev->sb_offset<<1, MD_SB_BYTES,
+                                      rdev->sb_page);
+                       dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+                               bdevname(rdev->bdev,b),
+                               (unsigned long long)rdev->sb_offset);
+
                } else
                        dprintk(")\n");
-               if (!err && mddev->level == LEVEL_MULTIPATH)
+               if (mddev->level == LEVEL_MULTIPATH)
                        /* only need to write one superblock... */
                        break;
        }
-       if (err) {
-               if (--count) {
-                       printk(KERN_ERR "md: errors occurred during superblock"
-                               " update, repeating\n");
-                       goto repeat;
-               }
-               printk(KERN_ERR \
-                       "md: excessive errors occurred during superblock update, exiting\n");
-       }
+       wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+       /* if there was a failure, sb_dirty was set to 1, and we re-write super */
+
        spin_lock(&mddev->write_lock);
-       if (mddev->in_sync != sync_req) {
+       if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
                /* have to write it out again */
                spin_unlock(&mddev->write_lock);
                goto repeat;
        }
        mddev->sb_dirty = 0;
        spin_unlock(&mddev->write_lock);
+       wake_up(&mddev->sb_wait);
 
 }
 
@@ -1650,6 +1693,7 @@ static int do_md_run(mddev_t * mddev)
        mddev->pers = pers[pnum];
        spin_unlock(&pers_lock);
 
+       mddev->recovery = 0;
        mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
 
        /* before we start the array running, initialise the bitmap */
@@ -1674,6 +1718,7 @@ static int do_md_run(mddev_t * mddev)
        mddev->in_sync = 1;
        
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+       md_wakeup_thread(mddev->thread);
        
        if (mddev->sb_dirty)
                md_update_sb(mddev);
@@ -1760,6 +1805,8 @@ static int do_md_stop(mddev_t * mddev, int ro)
                                goto out;
                        mddev->ro = 1;
                } else {
+                       bitmap_flush(mddev);
+                       wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
                        if (mddev->ro)
                                set_disk_ro(disk, 0);
                        blk_queue_make_request(mddev->queue, md_fail_request);
@@ -1784,6 +1831,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
                fput(mddev->bitmap_file);
                mddev->bitmap_file = NULL;
        }
+       mddev->bitmap_offset = 0;
 
        /*
         * Free resources if final stop
@@ -2029,6 +2077,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
        info.state         = 0;
        if (mddev->in_sync)
                info.state = (1<<MD_SB_CLEAN);
+       if (mddev->bitmap && mddev->bitmap_offset)
+               info.state = (1<<MD_SB_BITMAP_PRESENT);
        info.active_disks  = active;
        info.working_disks = working;
        info.failed_disks  = failed;
@@ -2043,7 +2093,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
        return 0;
 }
 
-static int get_bitmap_file(mddev_t * mddev, void * arg)
+static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 {
        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
        char *ptr, *buf = NULL;
@@ -2193,8 +2243,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                        export_rdev(rdev);
 
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-               if (mddev->thread)
-                       md_wakeup_thread(mddev->thread);
+               md_wakeup_thread(mddev->thread);
                return err;
        }
 
@@ -2387,24 +2436,51 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
 {
        int err;
 
-       if (mddev->pers)
-               return -EBUSY;
+       if (mddev->pers) {
+               if (!mddev->pers->quiesce)
+                       return -EBUSY;
+               if (mddev->recovery || mddev->sync_thread)
+                       return -EBUSY;
+               /* we should be able to change the bitmap.. */
+       }
 
-       mddev->bitmap_file = fget(fd);
 
-       if (mddev->bitmap_file == NULL) {
-               printk(KERN_ERR "%s: error: failed to get bitmap file\n",
-                       mdname(mddev));
-               return -EBADF;
-       }
+       if (fd >= 0) {
+               if (mddev->bitmap)
+                       return -EEXIST; /* cannot add when bitmap is present */
+               mddev->bitmap_file = fget(fd);
 
-       err = deny_bitmap_write_access(mddev->bitmap_file);
-       if (err) {
-               printk(KERN_ERR "%s: error: bitmap file is already in use\n",
-                       mdname(mddev));
-               fput(mddev->bitmap_file);
+               if (mddev->bitmap_file == NULL) {
+                       printk(KERN_ERR "%s: error: failed to get bitmap file\n",
+                              mdname(mddev));
+                       return -EBADF;
+               }
+
+               err = deny_bitmap_write_access(mddev->bitmap_file);
+               if (err) {
+                       printk(KERN_ERR "%s: error: bitmap file is already in use\n",
+                              mdname(mddev));
+                       fput(mddev->bitmap_file);
+                       mddev->bitmap_file = NULL;
+                       return err;
+               }
+               mddev->bitmap_offset = 0; /* file overrides offset */
+       } else if (mddev->bitmap == NULL)
+               return -ENOENT; /* cannot remove what isn't there */
+       err = 0;
+       if (mddev->pers) {
+               mddev->pers->quiesce(mddev, 1);
+               if (fd >= 0)
+                       err = bitmap_create(mddev);
+               if (fd < 0 || err)
+                       bitmap_destroy(mddev);
+               mddev->pers->quiesce(mddev, 0);
+       } else if (fd < 0) {
+               if (mddev->bitmap_file)
+                       fput(mddev->bitmap_file);
                mddev->bitmap_file = NULL;
        }
+
        return err;
 }
 
@@ -2484,6 +2560,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 {
        int rv = 0;
        int cnt = 0;
+       int state = 0;
+
+       /* calculate expected state,ignoring low bits */
+       if (mddev->bitmap && mddev->bitmap_offset)
+               state |= (1 << MD_SB_BITMAP_PRESENT);
 
        if (mddev->major_version != info->major_version ||
            mddev->minor_version != info->minor_version ||
@@ -2492,12 +2573,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
            mddev->level         != info->level         ||
 /*         mddev->layout        != info->layout        || */
            !mddev->persistent   != info->not_persistent||
-           mddev->chunk_size    != info->chunk_size    )
+           mddev->chunk_size    != info->chunk_size    ||
+           /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
+           ((state^info->state) & 0xfffffe00)
+               )
                return -EINVAL;
        /* Check there is only one change */
        if (mddev->size != info->size) cnt++;
        if (mddev->raid_disks != info->raid_disks) cnt++;
        if (mddev->layout != info->layout) cnt++;
+       if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
        if (cnt == 0) return 0;
        if (cnt > 1) return -EINVAL;
 
@@ -2576,6 +2661,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
                        }
                }
        }
+       if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
+               if (mddev->pers->quiesce == NULL)
+                       return -EINVAL;
+               if (mddev->recovery || mddev->sync_thread)
+                       return -EBUSY;
+               if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+                       /* add the bitmap */
+                       if (mddev->bitmap)
+                               return -EEXIST;
+                       if (mddev->default_bitmap_offset == 0)
+                               return -EINVAL;
+                       mddev->bitmap_offset = mddev->default_bitmap_offset;
+                       mddev->pers->quiesce(mddev, 1);
+                       rv = bitmap_create(mddev);
+                       if (rv)
+                               bitmap_destroy(mddev);
+                       mddev->pers->quiesce(mddev, 0);
+               } else {
+                       /* remove the bitmap */
+                       if (!mddev->bitmap)
+                               return -ENOENT;
+                       if (mddev->bitmap->file)
+                               return -EINVAL;
+                       mddev->pers->quiesce(mddev, 1);
+                       bitmap_destroy(mddev);
+                       mddev->pers->quiesce(mddev, 0);
+                       mddev->bitmap_offset = 0;
+               }
+       }
        md_update_sb(mddev);
        return rv;
 }
@@ -2737,7 +2851,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
                        goto done_unlock;
 
                case GET_BITMAP_FILE:
-                       err = get_bitmap_file(mddev, (void *)arg);
+                       err = get_bitmap_file(mddev, argp);
                        goto done_unlock;
 
                case GET_DISK_INFO:
@@ -2938,8 +3052,7 @@ static int md_thread(void * arg)
                wait_event_interruptible_timeout(thread->wqueue,
                                                 test_bit(THREAD_WAKEUP, &thread->flags),
                                                 thread->timeout);
-               if (current->flags & PF_FREEZE)
-                       refrigerator(PF_FREEZE);
+               try_to_freeze();
 
                clear_bit(THREAD_WAKEUP, &thread->flags);
 
@@ -3368,29 +3481,26 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
 
 /* md_write_start(mddev, bi)
  * If we need to update some array metadata (e.g. 'active' flag
- * in superblock) before writing, queue bi for later writing
- * and return 0, else return 1 and it will be written now
+ * in superblock) before writing, schedule a superblock update
+ * and wait for it to complete.
  */
-int md_write_start(mddev_t *mddev, struct bio *bi)
+void md_write_start(mddev_t *mddev, struct bio *bi)
 {
+       DEFINE_WAIT(w);
        if (bio_data_dir(bi) != WRITE)
-               return 1;
+               return;
 
        atomic_inc(&mddev->writes_pending);
-       spin_lock(&mddev->write_lock);
-       if (mddev->in_sync == 0 && mddev->sb_dirty == 0) {
-               spin_unlock(&mddev->write_lock);
-               return 1;
-       }
-       bio_list_add(&mddev->write_list, bi);
-
        if (mddev->in_sync) {
-               mddev->in_sync = 0;
-               mddev->sb_dirty = 1;
+               spin_lock(&mddev->write_lock);
+               if (mddev->in_sync) {
+                       mddev->in_sync = 0;
+                       mddev->sb_dirty = 1;
+                       md_wakeup_thread(mddev->thread);
+               }
+               spin_unlock(&mddev->write_lock);
        }
-       spin_unlock(&mddev->write_lock);
-       md_wakeup_thread(mddev->thread);
-       return 0;
+       wait_event(mddev->sb_wait, mddev->sb_dirty==0);
 }
 
 void md_write_end(mddev_t *mddev)
@@ -3449,7 +3559,6 @@ static void md_do_sync(mddev_t *mddev)
                        goto skip;
                }
                ITERATE_MDDEV(mddev2,tmp) {
-                       printk(".");
                        if (mddev2 == mddev)
                                continue;
                        if (mddev2->curr_resync && 
@@ -3685,7 +3794,6 @@ void md_check_recovery(mddev_t *mddev)
                mddev->sb_dirty ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
-               mddev->write_list.head ||
                (mddev->safemode == 1) ||
                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -3694,7 +3802,6 @@ void md_check_recovery(mddev_t *mddev)
 
        if (mddev_trylock(mddev)==0) {
                int spares =0;
-               struct bio *blist;
 
                spin_lock(&mddev->write_lock);
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
@@ -3704,21 +3811,11 @@ void md_check_recovery(mddev_t *mddev)
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
-               blist = bio_list_get(&mddev->write_list);
                spin_unlock(&mddev->write_lock);
 
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
 
-               while (blist) {
-                       struct bio *b = blist;
-                       blist = blist->bi_next;
-                       b->bi_next = NULL;
-                       generic_make_request(b);
-                       /* we already counted this, so need to un-count */
-                       md_write_end(mddev);
-               }
-
 
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -3787,6 +3884,13 @@ void md_check_recovery(mddev_t *mddev)
                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                        if (!spares)
                                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+                       if (spares && mddev->bitmap && ! mddev->bitmap->file) {
+                               /* We are adding a device or devices to an array
+                                * which has the bitmap stored on all devices.
+                                * So make sure all bitmap pages get written
+                                */
+                               bitmap_write_all(mddev->bitmap);
+                       }
                        mddev->sync_thread = md_register_thread(md_do_sync,
                                                                mddev,
                                                                "%s_resync");
@@ -3977,3 +4081,5 @@ EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_print_devices);
 EXPORT_SYMBOL(md_check_recovery);
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("md");
+MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);