Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block

[safe/jmp/linux-2.6] / fs / btrfs / volumes.c
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index 724ead5..7eda483 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,7 @@
  #include <linux/buffer_head.h>
  #include <linux/blkdev.h>
  #include <linux/random.h>
-#include <linux/version.h>
+#include <linux/iocontext.h>
  #include <asm/div64.h>
  #include "compat.h"
  #include "ctree.h"
@@ -47,7 +47,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                 struct btrfs_device *device);
  static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
  
-
  #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                             (sizeof(struct btrfs_bio_stripe) * (n)))
  
@@ -74,34 +73,29 @@ static void unlock_chunks(struct btrfs_root *root)
         mutex_unlock(&root->fs_info->chunk_mutex);
  }
  
+/* Free every device still on fs_devices->devices, then fs_devices itself. */
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+       struct btrfs_device *dev, *tmp;
+
+       WARN_ON(fs_devices->opened);
+       list_for_each_entry_safe(dev, tmp, &fs_devices->devices, dev_list) {
+               list_del(&dev->dev_list);
+               kfree(dev->name);
+               kfree(dev);
+       }
+       kfree(fs_devices);
+}
+
  int btrfs_cleanup_fs_uuids(void)
  {
         struct btrfs_fs_devices *fs_devices;
-       struct btrfs_device *dev;
  
         while (!list_empty(&fs_uuids)) {
                 fs_devices = list_entry(fs_uuids.next,
                                         struct btrfs_fs_devices, list);
                 list_del(&fs_devices->list);
-               while(!list_empty(&fs_devices->devices)) {
-                       dev = list_entry(fs_devices->devices.next,
-                                        struct btrfs_device, dev_list);
-                       if (dev->bdev) {
-                               close_bdev_exclusive(dev->bdev, dev->mode);
-                               fs_devices->open_devices--;
-                       }
-                       fs_devices->num_devices--;
-                       if (dev->writeable)
-                               fs_devices->rw_devices--;
-                       list_del(&dev->dev_list);
-                       list_del(&dev->dev_alloc_list);
-                       kfree(dev->name);
-                       kfree(dev);
-               }
-               WARN_ON(fs_devices->num_devices);
-               WARN_ON(fs_devices->open_devices);
-               WARN_ON(fs_devices->rw_devices);
-               kfree(fs_devices);
+               free_fs_devices(fs_devices);
         }
         return 0;
  }
@@ -110,10 +104,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                    u64 devid, u8 *uuid)
  {
         struct btrfs_device *dev;
-       struct list_head *cur;
  
-       list_for_each(cur, head) {
-               dev = list_entry(cur, struct btrfs_device, dev_list);
+       list_for_each_entry(dev, head, dev_list) {
                 if (dev->devid == devid &&
                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                         return dev;
@@ -124,17 +116,29 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
  
  static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
  {
-       struct list_head *cur;
         struct btrfs_fs_devices *fs_devices;
  
-       list_for_each(cur, &fs_uuids) {
-               fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
+       list_for_each_entry(fs_devices, &fs_uuids, list) {
                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                         return fs_devices;
         }
         return NULL;
  }
  
+/* Put the unsubmitted chain head..tail back at the front of pending_bios. */
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+                       struct bio *head, struct bio *tail)
+{
+       struct bio *prev_head = pending_bios->head;
+
+       pending_bios->head = head;
+       /* an empty list just adopts our tail; otherwise chain the old bios */
+       if (!pending_bios->tail)
+               pending_bios->tail = tail;
+       else
+               tail->bi_next = prev_head;
+}
+
  /*
   * we try to collect pending bios for a device so we don't get a large
   * number of procs sending bios down to the same device.  This greatly
@@ -146,35 +150,54 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
   * the list if the block device is congested.  This way, multiple devices
   * can make progress from a single worker thread.
   */
-static int noinline run_scheduled_bios(struct btrfs_device *device)
+static noinline int run_scheduled_bios(struct btrfs_device *device)
  {
         struct bio *pending;
         struct backing_dev_info *bdi;
         struct btrfs_fs_info *fs_info;
+       struct btrfs_pending_bios *pending_bios;
         struct bio *tail;
         struct bio *cur;
         int again = 0;
-       unsigned long num_run = 0;
+       unsigned long num_run;
+       unsigned long num_sync_run;
+       unsigned long batch_run = 0;
         unsigned long limit;
+       unsigned long last_waited = 0;
+       int force_reg = 0;
  
-       bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+       bdi = blk_get_backing_dev_info(device->bdev);
         fs_info = device->dev_root->fs_info;
         limit = btrfs_async_submit_limit(fs_info);
         limit = limit * 2 / 3;
  
+       /* we want to make sure that every time we switch from the sync
+        * list to the normal list, we unplug
+        */
+       num_sync_run = 0;
+
  loop:
         spin_lock(&device->io_lock);
  
+loop_lock:
+       num_run = 0;
+
         /* take all the bios off the list at once and process them
          * later on (without the lock held).  But, remember the
          * tail and other pointers so the bios can be properly reinserted
          * into the list if we hit congestion
          */
-       pending = device->pending_bios;
-       tail = device->pending_bio_tail;
+       if (!force_reg && device->pending_sync_bios.head) {
+               pending_bios = &device->pending_sync_bios;
+               force_reg = 1;
+       } else {
+               pending_bios = &device->pending_bios;
+               force_reg = 0;
+       }
+
+       pending = pending_bios->head;
+       tail = pending_bios->tail;
         WARN_ON(pending && !tail);
-       device->pending_bios = NULL;
-       device->pending_bio_tail = NULL;
  
         /*
          * if pending was null this time around, no bios need processing
@@ -184,16 +207,45 @@ loop:
          * device->running_pending is used to synchronize with the
          * schedule_bio code.
          */
-       if (pending) {
-               again = 1;
-               device->running_pending = 1;
-       } else {
+       if (device->pending_sync_bios.head == NULL &&
+           device->pending_bios.head == NULL) {
                 again = 0;
                 device->running_pending = 0;
+       } else {
+               again = 1;
+               device->running_pending = 1;
         }
+
+       pending_bios->head = NULL;
+       pending_bios->tail = NULL;
+
         spin_unlock(&device->io_lock);
  
-       while(pending) {
+       /*
+        * if we're doing the regular priority list, make sure we unplug
+        * for any high prio bios we've sent down
+        */
+       if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
+       while (pending) {
+
+               rmb();
+               /* we want to work on both lists, but do more bios on the
+                * sync list than the regular list
+                */
+               if ((num_run > 32 &&
+                   pending_bios != &device->pending_sync_bios &&
+                   device->pending_sync_bios.head) ||
+                  (num_run > 64 && pending_bios == &device->pending_sync_bios &&
+                   device->pending_bios.head)) {
+                       spin_lock(&device->io_lock);
+                       requeue_list(pending_bios, pending, tail);
+                       goto loop_lock;
+               }
+
                 cur = pending;
                 pending = pending->bi_next;
                 cur->bi_next = NULL;
@@ -204,41 +256,101 @@ loop:
                         wake_up(&fs_info->async_submit_wait);
  
                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-               bio_get(cur);
+               /* check the flag first; cur may be freed once submitted */
+               if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
+                       num_sync_run++;
                 submit_bio(cur->bi_rw, cur);
-               bio_put(cur);
                 num_run++;
+               batch_run++;
+
+               if (need_resched()) {
+                       if (num_sync_run) {
+                               blk_run_backing_dev(bdi, NULL);
+                               num_sync_run = 0;
+                       }
+                       cond_resched();
+               }
  
                 /*
                  * we made progress, there is more work to do and the bdi
                  * is now congested.  Back off and let other work structs
                  * run instead
                  */
-               if (pending && bdi_write_congested(bdi) &&
+               if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                     fs_info->fs_devices->open_devices > 1) {
-                       struct bio *old_head;
+                       struct io_context *ioc;
  
-                       spin_lock(&device->io_lock);
+                       ioc = current->io_context;
  
-                       old_head = device->pending_bios;
-                       device->pending_bios = pending;
-                       if (device->pending_bio_tail)
-                               tail->bi_next = old_head;
-                       else
-                               device->pending_bio_tail = tail;
+                       /*
+                        * the main goal here is that we don't want to
+                        * block if we're going to be able to submit
+                        * more requests without blocking.
+                        *
+                        * This code does two great things, it pokes into
+                        * the elevator code from a filesystem _and_
+                        * it makes assumptions about how batching works.
+                        */
+                       if (ioc && ioc->nr_batch_requests > 0 &&
+                           time_before(jiffies, ioc->last_waited + HZ/50UL) &&
+                           (last_waited == 0 ||
+                            ioc->last_waited == last_waited)) {
+                               /*
+                                * we want to go through our batch of
+                                * requests and stop.  So, we copy out
+                                * the ioc->last_waited time and test
+                                * against it before looping
+                                */
+                               last_waited = ioc->last_waited;
+                               if (need_resched()) {
+                                       if (num_sync_run) {
+                                               blk_run_backing_dev(bdi, NULL);
+                                               num_sync_run = 0;
+                                       }
+                                       cond_resched();
+                               }
+                               continue;
+                       }
+                       spin_lock(&device->io_lock);
+                       requeue_list(pending_bios, pending, tail);
+                       device->running_pending = 1;
  
                         spin_unlock(&device->io_lock);
                         btrfs_requeue_work(&device->work);
                         goto done;
                 }
         }
+
+       if (num_sync_run) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
+       cond_resched();
         if (again)
                 goto loop;
+
+       spin_lock(&device->io_lock);
+       if (device->pending_bios.head || device->pending_sync_bios.head)
+               goto loop_lock;
+       spin_unlock(&device->io_lock);
+
+       /*
+        * IO has already been through a long path to get here.  Checksumming,
+        * async helper threads, perhaps compression.  We've done a pretty
+        * good job of collecting a batch of IO and should just unplug
+        * the device right away.
+        *
+        * This will help anyone who is waiting on the IO, they might have
+        * already unplugged, but managed to do so before the bio they
+        * cared about found its way down here.
+        */
+       blk_run_backing_dev(bdi, NULL);
  done:
         return 0;
  }
  
-void pending_bios_fn(struct btrfs_work *work)
+static void pending_bios_fn(struct btrfs_work *work)
  {
         struct btrfs_device *device;
  
@@ -265,6 +377,7 @@ static noinline int device_list_add(const char *path,
                 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                 fs_devices->latest_devid = devid;
                 fs_devices->latest_trans = found_transid;
+               mutex_init(&fs_devices->device_list_mutex);
                 device = NULL;
         } else {
                 device = __find_device(&fs_devices->devices, devid,
@@ -291,7 +404,11 @@ static noinline int device_list_add(const char *path,
                         return -ENOMEM;
                 }
                 INIT_LIST_HEAD(&device->dev_alloc_list);
+
+               mutex_lock(&fs_devices->device_list_mutex);
                 list_add(&device->dev_list, &fs_devices->devices);
+               mutex_unlock(&fs_devices->device_list_mutex);
+
                 device->fs_devices = fs_devices;
                 fs_devices->num_devices++;
         }
@@ -304,17 +421,64 @@ static noinline int device_list_add(const char *path,
         return 0;
  }
  
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
  {
-       struct list_head *tmp;
-       struct list_head *cur;
+       struct btrfs_fs_devices *fs_devices;
         struct btrfs_device *device;
-       int seed_devices = 0;
+       struct btrfs_device *orig_dev;
+
+       fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+       if (!fs_devices)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&fs_devices->devices);
+       INIT_LIST_HEAD(&fs_devices->alloc_list);
+       INIT_LIST_HEAD(&fs_devices->list);
+       mutex_init(&fs_devices->device_list_mutex);
+       fs_devices->latest_devid = orig->latest_devid;
+       fs_devices->latest_trans = orig->latest_trans;
+       memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+
+       mutex_lock(&orig->device_list_mutex);
+       list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+               device = kzalloc(sizeof(*device), GFP_NOFS);
+               if (!device)
+                       goto error;
+
+               device->name = kstrdup(orig_dev->name, GFP_NOFS);
+               if (!device->name) {
+                       kfree(device);
+                       goto error;
+               }
+
+               device->devid = orig_dev->devid;
+               device->work.func = pending_bios_fn;
+               memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
+               device->barriers = 1;
+               spin_lock_init(&device->io_lock);
+               INIT_LIST_HEAD(&device->dev_list);
+               INIT_LIST_HEAD(&device->dev_alloc_list);
+
+               list_add(&device->dev_list, &fs_devices->devices);
+               device->fs_devices = fs_devices;
+               fs_devices->num_devices++;
+       }
+       mutex_unlock(&orig->device_list_mutex);
+       return fs_devices;
+error:
+       mutex_unlock(&orig->device_list_mutex);
+       free_fs_devices(fs_devices);
+       return ERR_PTR(-ENOMEM);
+}
+
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+{
+       struct btrfs_device *device, *next;
  
         mutex_lock(&uuid_mutex);
  again:
-       list_for_each_safe(cur, tmp, &fs_devices->devices) {
-               device = list_entry(cur, struct btrfs_device, dev_list);
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                 if (device->in_fs_metadata)
                         continue;
  
@@ -328,17 +492,15 @@ again:
                         device->writeable = 0;
                         fs_devices->rw_devices--;
                 }
-               if (!seed_devices) {
-                       list_del_init(&device->dev_list);
-                       fs_devices->num_devices--;
-                       kfree(device->name);
-                       kfree(device);
-               }
+               list_del_init(&device->dev_list);
+               fs_devices->num_devices--;
+               kfree(device->name);
+               kfree(device);
         }
+       mutex_unlock(&fs_devices->device_list_mutex);
  
         if (fs_devices->seed) {
                 fs_devices = fs_devices->seed;
-               seed_devices = 1;
                 goto again;
         }
  
@@ -348,15 +510,12 @@ again:
  
  static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
  {
-       struct btrfs_fs_devices *seed_devices;
-       struct list_head *cur;
         struct btrfs_device *device;
-again:
+
         if (--fs_devices->opened > 0)
                 return 0;
  
-       list_for_each(cur, &fs_devices->devices) {
-               device = list_entry(cur, struct btrfs_device, dev_list);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
                 if (device->bdev) {
                         close_bdev_exclusive(device->bdev, device->mode);
                         fs_devices->open_devices--;
@@ -370,35 +529,41 @@ again:
                 device->writeable = 0;
                 device->in_fs_metadata = 0;
         }
+       WARN_ON(fs_devices->open_devices);
+       WARN_ON(fs_devices->rw_devices);
         fs_devices->opened = 0;
         fs_devices->seeding = 0;
-       fs_devices->sprouted = 0;
  
-       seed_devices = fs_devices->seed;
-       fs_devices->seed = NULL;
-       if (seed_devices) {
-               fs_devices = seed_devices;
-               goto again;
-       }
         return 0;
  }
  
  int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
  {
+       struct btrfs_fs_devices *seed_devices = NULL;
         int ret;
  
         mutex_lock(&uuid_mutex);
         ret = __btrfs_close_devices(fs_devices);
+       if (!fs_devices->opened) {
+               seed_devices = fs_devices->seed;
+               fs_devices->seed = NULL;
+       }
         mutex_unlock(&uuid_mutex);
+
+       while (seed_devices) {
+               fs_devices = seed_devices;
+               seed_devices = fs_devices->seed;
+               __btrfs_close_devices(fs_devices);
+               free_fs_devices(fs_devices);
+       }
         return ret;
  }
  
-int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-                        int flags, void *holder)
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+                               fmode_t flags, void *holder)
  {
         struct block_device *bdev;
         struct list_head *head = &fs_devices->devices;
-       struct list_head *cur;
         struct btrfs_device *device;
         struct block_device *latest_bdev = NULL;
         struct buffer_head *bh;
@@ -409,8 +574,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
         int seeding = 1;
         int ret = 0;
  
-       list_for_each(cur, head) {
-               device = list_entry(cur, struct btrfs_device, dev_list);
+       list_for_each_entry(device, head, dev_list) {
                 if (device->bdev)
                         continue;
                 if (!device->name)
@@ -418,20 +582,16 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
  
                 bdev = open_bdev_exclusive(device->name, flags, holder);
                 if (IS_ERR(bdev)) {
-                       printk("open %s failed\n", device->name);
+                       printk(KERN_INFO "open %s failed\n", device->name);
                         goto error;
                 }
                 set_blocksize(bdev, 4096);
  
-               bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+               bh = btrfs_read_dev_super(bdev);
                 if (!bh)
                         goto error_close;
  
                 disk_super = (struct btrfs_super_block *)bh->b_data;
-               if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-                   sizeof(disk_super->magic)))
-                       goto error_brelse;
-
                 devid = le64_to_cpu(disk_super->dev_item.devid);
                 if (devid != device->devid)
                         goto error_brelse;
@@ -458,6 +618,9 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                 device->in_fs_metadata = 0;
                 device->mode = flags;
  
+               if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+                       fs_devices->rotating = 1;
+
                 fs_devices->open_devices++;
                 if (device->writeable) {
                         fs_devices->rw_devices++;
@@ -469,7 +632,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
  error_brelse:
                 brelse(bh);
  error_close:
-               close_bdev_exclusive(bdev, MS_RDONLY);
+               close_bdev_exclusive(bdev, FMODE_READ);
  error:
                 continue;
         }
@@ -488,18 +651,14 @@ out:
  }
  
  int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-                      int flags, void *holder)
+                      fmode_t flags, void *holder)
  {
         int ret;
  
         mutex_lock(&uuid_mutex);
         if (fs_devices->opened) {
-               if (fs_devices->sprouted) {
-                       ret = -EBUSY;
-               } else {
-                       fs_devices->opened++;
-                       ret = 0;
-               }
+               fs_devices->opened++;
+               ret = 0;
         } else {
                 ret = __btrfs_open_devices(fs_devices, flags, holder);
         }
@@ -507,7 +666,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
         return ret;
  }
  
-int btrfs_scan_one_device(const char *path, int flags, void *holder,
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                           struct btrfs_fs_devices **fs_devices_ret)
  {
         struct btrfs_super_block *disk_super;
@@ -529,31 +688,26 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
         ret = set_blocksize(bdev, 4096);
         if (ret)
                 goto error_close;
-       bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+       bh = btrfs_read_dev_super(bdev);
         if (!bh) {
                 ret = -EIO;
                 goto error_close;
         }
         disk_super = (struct btrfs_super_block *)bh->b_data;
-       if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-           sizeof(disk_super->magic))) {
-               ret = -EINVAL;
-               goto error_brelse;
-       }
         devid = le64_to_cpu(disk_super->dev_item.devid);
         transid = btrfs_super_generation(disk_super);
         if (disk_super->label[0])
-               printk("device label %s ", disk_super->label);
+               printk(KERN_INFO "device label %s ", disk_super->label);
         else {
                 /* FIXME, make a readl uuid parser */
-               printk("device fsid %llx-%llx ",
+               printk(KERN_INFO "device fsid %llx-%llx ",
                        *(unsigned long long *)disk_super->fsid,
                        *(unsigned long long *)(disk_super->fsid + 8));
         }
-       printk("devid %Lu transid %Lu %s\n", devid, transid, path);
+       printk(KERN_CONT "devid %llu transid %llu %s\n",
+              (unsigned long long)devid, (unsigned long long)transid, path);
         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
  
-error_brelse:
         brelse(bh);
  error_close:
         close_bdev_exclusive(bdev, flags);
@@ -567,9 +721,9 @@ error:
   * called very infrequently and that a given device has a small number
   * of extents
   */
-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                                        struct btrfs_device *device,
-                                        u64 num_bytes, u64 *start)
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
+                        u64 *start, u64 *max_avail)
  {
         struct btrfs_key key;
         struct btrfs_root *root = device->dev_root;
@@ -606,9 +760,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
         ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
         if (ret < 0)
                 goto error;
-       ret = btrfs_previous_item(root, path, 0, key.type);
-       if (ret < 0)
-               goto error;
+       if (ret > 0) {
+               ret = btrfs_previous_item(root, path, key.objectid, key.type);
+               if (ret < 0)
+                       goto error;
+               if (ret > 0)
+                       start_found = 1;
+       }
         l = path->nodes[0];
         btrfs_item_key_to_cpu(l, &key, path->slots[0]);
         while (1) {
@@ -651,15 +809,18 @@ no_more_items:
                         if (last_byte < search_start)
                                 last_byte = search_start;
                         hole_size = key.offset - last_byte;
+
+                       if (hole_size > *max_avail)
+                               *max_avail = hole_size;
+
                         if (key.offset > last_byte &&
                             hole_size >= num_bytes) {
                                 *start = last_byte;
                                 goto check_pending;
                         }
                 }
-               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                         goto next;
-               }
  
                 start_found = 1;
                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -686,7 +847,7 @@ error:
         return ret;
  }
  
-int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 start)
  {
@@ -900,6 +1061,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
         btrfs_set_device_group(leaf, dev_item, 0);
         btrfs_set_device_seek_speed(leaf, dev_item, 0);
         btrfs_set_device_bandwidth(leaf, dev_item, 0);
+       btrfs_set_device_start_offset(leaf, dev_item, 0);
  
         ptr = (unsigned long)btrfs_device_uuid(dev_item);
         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -974,41 +1136,44 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
  
         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
             root->fs_info->fs_devices->rw_devices <= 4) {
-               printk("btrfs: unable to go below four devices on raid10\n");
+               printk(KERN_ERR "btrfs: unable to go below four devices "
+                      "on raid10\n");
                 ret = -EINVAL;
                 goto out;
         }
  
         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
             root->fs_info->fs_devices->rw_devices <= 2) {
-               printk("btrfs: unable to go below two devices on raid1\n");
+               printk(KERN_ERR "btrfs: unable to go below two "
+                      "devices on raid1\n");
                 ret = -EINVAL;
                 goto out;
         }
  
         if (strcmp(device_path, "missing") == 0) {
-               struct list_head *cur;
                 struct list_head *devices;
                 struct btrfs_device *tmp;
  
                 device = NULL;
                 devices = &root->fs_info->fs_devices->devices;
-               list_for_each(cur, devices) {
-                       tmp = list_entry(cur, struct btrfs_device, dev_list);
+               mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+               list_for_each_entry(tmp, devices, dev_list) {
                         if (tmp->in_fs_metadata && !tmp->bdev) {
                                 device = tmp;
                                 break;
                         }
                 }
+               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
                 bdev = NULL;
                 bh = NULL;
                 disk_super = NULL;
                 if (!device) {
-                       printk("btrfs: no missing devices found to remove\n");
+                       printk(KERN_ERR "btrfs: no missing devices found to "
+                              "remove\n");
                         goto out;
                 }
         } else {
-               bdev = open_bdev_exclusive(device_path, MS_RDONLY,
+               bdev = open_bdev_exclusive(device_path, FMODE_READ,
                                       root->fs_info->bdev_holder);
                 if (IS_ERR(bdev)) {
                         ret = PTR_ERR(bdev);
@@ -1016,17 +1181,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                 }
  
                 set_blocksize(bdev, 4096);
-               bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+               bh = btrfs_read_dev_super(bdev);
                 if (!bh) {
                         ret = -EIO;
                         goto error_close;
                 }
                 disk_super = (struct btrfs_super_block *)bh->b_data;
-               if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-                           sizeof(disk_super->magic))) {
-                       ret = -ENOENT;
-                       goto error_brelse;
-               }
                 devid = le64_to_cpu(disk_super->dev_item.devid);
                 dev_uuid = disk_super->dev_item.uuid;
                 device = btrfs_find_device(root, devid, dev_uuid,
@@ -1038,7 +1198,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         }
  
         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-               printk("btrfs: unable to remove the only writeable device\n");
+               printk(KERN_ERR "btrfs: unable to remove the only writeable "
+                      "device\n");
                 ret = -EINVAL;
                 goto error_brelse;
         }
@@ -1057,12 +1218,17 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                 goto error_brelse;
  
         device->in_fs_metadata = 0;
-       if (device->fs_devices == root->fs_info->fs_devices) {
-               list_del_init(&device->dev_list);
-               root->fs_info->fs_devices->num_devices--;
-               if (device->bdev)
-                       device->fs_devices->open_devices--;
-       }
+
+       /*
+        * the device list mutex makes sure that we don't change
+        * the device list while someone else is writing out all
+        * the device supers.
+        */
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       list_del_init(&device->dev_list);
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+       device->fs_devices->num_devices--;
  
         next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                  struct btrfs_device, dev_list);
@@ -1071,34 +1237,27 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
  
+       if (device->bdev) {
+               close_bdev_exclusive(device->bdev, device->mode);
+               device->bdev = NULL;
+               device->fs_devices->open_devices--;
+       }
+
         num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
         btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
  
-       if (device->fs_devices != root->fs_info->fs_devices) {
-               BUG_ON(device->writeable);
-               brelse(bh);
-               if (bdev)
-                       close_bdev_exclusive(bdev, MS_RDONLY);
-
-               if (device->bdev) {
-                       close_bdev_exclusive(device->bdev, device->mode);
-                       device->bdev = NULL;
-                       device->fs_devices->open_devices--;
-               }
-               if (device->fs_devices->open_devices == 0) {
-                       struct btrfs_fs_devices *fs_devices;
-                       fs_devices = root->fs_info->fs_devices;
-                       while (fs_devices) {
-                               if (fs_devices->seed == device->fs_devices)
-                                       break;
-                               fs_devices = fs_devices->seed;
-                       }
-                       fs_devices->seed = device->fs_devices->seed;
-                       device->fs_devices->seed = NULL;
-                       __btrfs_close_devices(device->fs_devices);
+       if (device->fs_devices->open_devices == 0) {
+               struct btrfs_fs_devices *fs_devices;
+               fs_devices = root->fs_info->fs_devices;
+               while (fs_devices) {
+                       if (fs_devices->seed == device->fs_devices)
+                               break;
+                       fs_devices = fs_devices->seed;
                 }
-               ret = 0;
-               goto out;
+               fs_devices->seed = device->fs_devices->seed;
+               device->fs_devices->seed = NULL;
+               __btrfs_close_devices(device->fs_devices);
+               free_fs_devices(device->fs_devices);
         }
  
         /*
@@ -1113,26 +1272,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                 set_buffer_dirty(bh);
                 sync_dirty_buffer(bh);
         }
-       brelse(bh);
  
-       if (device->bdev) {
-               /* one close for the device struct or super_block */
-               close_bdev_exclusive(device->bdev, device->mode);
-       }
-       if (bdev) {
-               /* one close for us */
-               close_bdev_exclusive(bdev, MS_RDONLY);
-       }
         kfree(device->name);
         kfree(device);
         ret = 0;
-       goto out;
  
  error_brelse:
         brelse(bh);
  error_close:
         if (bdev)
-               close_bdev_exclusive(bdev, MS_RDONLY);
+               close_bdev_exclusive(bdev, FMODE_READ);
  out:
         mutex_unlock(&root->fs_info->volume_mutex);
         mutex_unlock(&uuid_mutex);
@@ -1147,34 +1296,42 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
  {
         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
         struct btrfs_fs_devices *old_devices;
+       struct btrfs_fs_devices *seed_devices;
         struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
         struct btrfs_device *device;
         u64 super_flags;
  
         BUG_ON(!mutex_is_locked(&uuid_mutex));
-       if (!fs_devices->seeding || fs_devices->opened != 1)
+       if (!fs_devices->seeding)
                 return -EINVAL;
  
-       old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-       if (!old_devices)
+       seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+       if (!seed_devices)
                 return -ENOMEM;
  
-       memcpy(old_devices, fs_devices, sizeof(*old_devices));
-       old_devices->opened = 1;
-       old_devices->sprouted = 1;
-       INIT_LIST_HEAD(&old_devices->devices);
-       INIT_LIST_HEAD(&old_devices->alloc_list);
-       list_splice_init(&fs_devices->devices, &old_devices->devices);
-       list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
-       list_for_each_entry(device, &old_devices->devices, dev_list) {
-               device->fs_devices = old_devices;
+       old_devices = clone_fs_devices(fs_devices);
+       if (IS_ERR(old_devices)) {
+               kfree(seed_devices);
+               return PTR_ERR(old_devices);
         }
+
         list_add(&old_devices->list, &fs_uuids);
  
+       memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+       seed_devices->opened = 1;
+       INIT_LIST_HEAD(&seed_devices->devices);
+       INIT_LIST_HEAD(&seed_devices->alloc_list);
+       mutex_init(&seed_devices->device_list_mutex);
+       list_splice_init(&fs_devices->devices, &seed_devices->devices);
+       list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+       list_for_each_entry(device, &seed_devices->devices, dev_list) {
+               device->fs_devices = seed_devices;
+       }
+
         fs_devices->seeding = 0;
         fs_devices->num_devices = 0;
         fs_devices->open_devices = 0;
-       fs_devices->seed = old_devices;
+       fs_devices->seed = seed_devices;
  
         generate_random_uuid(fs_devices->fsid);
         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1267,7 +1424,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         struct btrfs_trans_handle *trans;
         struct btrfs_device *device;
         struct block_device *bdev;
-       struct list_head *cur;
         struct list_head *devices;
         struct super_block *sb = root->fs_info->sb;
         u64 total_bytes;
@@ -1278,9 +1434,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                 return -EINVAL;
  
         bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
-       if (!bdev) {
+       if (!bdev)
                 return -EIO;
-       }
  
         if (root->fs_info->fs_devices->seeding) {
                 seeding_dev = 1;
@@ -1292,8 +1447,11 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         mutex_lock(&root->fs_info->volume_mutex);
  
         devices = &root->fs_info->fs_devices->devices;
-       list_for_each(cur, devices) {
-               device = list_entry(cur, struct btrfs_device, dev_list);
+       /*
+        * we have the volume lock, so we don't need the extra
+        * device list mutex while reading the list here.
+        */
+       list_for_each_entry(device, devices, dev_list) {
                 if (device->bdev == bdev) {
                         ret = -EEXIST;
                         goto error;
@@ -1333,6 +1491,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         device->io_align = root->sectorsize;
         device->sector_size = root->sectorsize;
         device->total_bytes = i_size_read(bdev->bd_inode);
+       device->disk_total_bytes = device->total_bytes;
         device->dev_root = root->fs_info->dev_root;
         device->bdev = bdev;
         device->in_fs_metadata = 1;
@@ -1346,6 +1505,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         }
  
         device->fs_devices = root->fs_info->fs_devices;
+
+       /*
+        * we don't want write_supers to jump in here with our device
+        * half setup
+        */
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
         list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
         list_add(&device->dev_alloc_list,
                  &root->fs_info->fs_devices->alloc_list);
@@ -1354,6 +1519,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         root->fs_info->fs_devices->rw_devices++;
         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
  
+       if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+               root->fs_info->fs_devices->rotating = 1;
+
         total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
         btrfs_set_super_total_bytes(&root->fs_info->super_copy,
                                     total_bytes + device->total_bytes);
@@ -1361,6 +1529,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
         btrfs_set_super_num_devices(&root->fs_info->super_copy,
                                     total_bytes + 1);
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
         if (seeding_dev) {
                 ret = init_first_rw_device(trans, root, device);
@@ -1371,6 +1540,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                 ret = btrfs_add_device(trans, root, device);
         }
  
+       /*
+        * we've got more storage, clear any full flags on the space
+        * infos
+        */
+       btrfs_clear_space_info_full(root->fs_info);
+
         unlock_chunks(root);
         btrfs_commit_transaction(trans, root);
  
@@ -1393,8 +1568,8 @@ error:
         goto out;
  }
  
-int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
-                                struct btrfs_device *device)
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+                                       struct btrfs_device *device)
  {
         int ret;
         struct btrfs_path *path;
@@ -1430,7 +1605,7 @@ int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-       btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+       btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
         btrfs_mark_buffer_dirty(leaf);
  
@@ -1456,6 +1631,9 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
         device->fs_devices->total_rw_bytes += diff;
  
         device->total_bytes = new_size;
+       device->disk_total_bytes = new_size;
+       btrfs_clear_space_info_full(device->dev_root->fs_info);
+
         return btrfs_update_device(trans, device);
  }
  
@@ -1497,7 +1675,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
         return 0;
  }
  
-int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
                         chunk_offset)
  {
         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -1543,7 +1721,7 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
         return ret;
  }
  
-int btrfs_relocate_chunk(struct btrfs_root *root,
+static int btrfs_relocate_chunk(struct btrfs_root *root,
                          u64 chunk_tree, u64 chunk_objectid,
                          u64 chunk_offset)
  {
@@ -1555,12 +1733,14 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
         int ret;
         int i;
  
-       printk("btrfs relocating chunk %llu\n",
-              (unsigned long long)chunk_offset);
         root = root->fs_info->chunk_root;
         extent_root = root->fs_info->extent_root;
         em_tree = &root->fs_info->mapping_tree.map_tree;
  
+       ret = btrfs_can_relocate(extent_root, chunk_offset);
+       if (ret)
+               return -ENOSPC;
+
         /* step one, relocate all the extents inside this chunk */
         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
         BUG_ON(ret);
@@ -1574,9 +1754,9 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
          * step two, delete the device extents and the
          * chunk tree entries
          */
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
  
         BUG_ON(em->start > chunk_offset ||
                em->start + em->len < chunk_offset);
@@ -1605,9 +1785,9 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
         BUG_ON(ret);
  
-       spin_lock(&em_tree->lock);
+       write_lock(&em_tree->lock);
         remove_extent_mapping(em_tree, em);
-       spin_unlock(&em_tree->lock);
+       write_unlock(&em_tree->lock);
  
         kfree(map);
         em->bdev = NULL;
@@ -1632,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
         struct btrfs_key found_key;
         u64 chunk_tree = chunk_root->root_key.objectid;
         u64 chunk_type;
+       bool retried = false;
+       int failed = 0;
         int ret;
  
         path = btrfs_alloc_path();
         if (!path)
                 return -ENOMEM;
  
+again:
         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
         key.offset = (u64)-1;
         key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1667,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
                                                    found_key.objectid,
                                                    found_key.offset);
-                       BUG_ON(ret);
+                       if (ret == -ENOSPC)
+                               failed++;
+                       else if (ret)
+                               BUG();
                 }
  
                 if (found_key.offset == 0)
@@ -1675,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
                 key.offset = found_key.offset - 1;
         }
         ret = 0;
+       if (failed && !retried) {
+               failed = 0;
+               retried = true;
+               goto again;
+       } else if (failed && retried) {
+               WARN_ON(1);
+               ret = -ENOSPC;
+       }
  error:
         btrfs_free_path(path);
         return ret;
@@ -1692,7 +1886,6 @@ static u64 div_factor(u64 num, int factor)
  int btrfs_balance(struct btrfs_root *dev_root)
  {
         int ret;
-       struct list_head *cur;
         struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
         struct btrfs_device *device;
         u64 old_size;
@@ -1711,8 +1904,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
         dev_root = dev_root->fs_info->dev_root;
  
         /* step one make some room on all the devices */
-       list_for_each(cur, devices) {
-               device = list_entry(cur, struct btrfs_device, dev_list);
+       list_for_each_entry(device, devices, dev_list) {
                 old_size = device->total_bytes;
                 size_to_free = div_factor(old_size, 1);
                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
@@ -1721,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
                         continue;
  
                 ret = btrfs_shrink_device(device, old_size - size_to_free);
+               if (ret == -ENOSPC)
+                       break;
                 BUG_ON(ret);
  
                 trans = btrfs_start_transaction(dev_root, 1);
@@ -1740,7 +1934,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
         key.offset = (u64)-1;
         key.type = BTRFS_CHUNK_ITEM_KEY;
  
-       while(1) {
+       while (1) {
                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                 if (ret < 0)
                         goto error;
@@ -1765,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
                 chunk = btrfs_item_ptr(path->nodes[0],
                                        path->slots[0],
                                        struct btrfs_chunk);
-               key.offset = found_key.offset;
                 /* chunk zero is special */
-               if (key.offset == 0)
+               if (found_key.offset == 0)
                         break;
  
                 btrfs_release_path(chunk_root, path);
@@ -1775,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
                                            chunk_root->root_key.objectid,
                                            found_key.objectid,
                                            found_key.offset);
-               BUG_ON(ret);
+               BUG_ON(ret && ret != -ENOSPC);
+               key.offset = found_key.offset - 1;
         }
         ret = 0;
  error:
@@ -1801,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
         u64 chunk_offset;
         int ret;
         int slot;
+       int failed = 0;
+       bool retried = false;
         struct extent_buffer *l;
         struct btrfs_key key;
         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
         u64 old_total = btrfs_super_total_bytes(super_copy);
+       u64 old_size = device->total_bytes;
         u64 diff = device->total_bytes - new_size;
  
         if (new_size >= device->total_bytes)
@@ -1814,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
         if (!path)
                 return -ENOMEM;
  
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
         path->reada = 2;
  
         lock_chunks(root);
@@ -1827,17 +2018,9 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
         device->total_bytes = new_size;
         if (device->writeable)
                 device->fs_devices->total_rw_bytes -= diff;
-       ret = btrfs_update_device(trans, device);
-       if (ret) {
-               unlock_chunks(root);
-               btrfs_end_transaction(trans, root);
-               goto done;
-       }
-       WARN_ON(diff > old_total);
-       btrfs_set_super_total_bytes(super_copy, old_total - diff);
         unlock_chunks(root);
-       btrfs_end_transaction(trans, root);
  
+again:
         key.objectid = device->devid;
         key.offset = (u64)-1;
         key.type = BTRFS_DEV_EXTENT_KEY;
@@ -1852,21 +2035,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
                         goto done;
                 if (ret) {
                         ret = 0;
-                       goto done;
+                       btrfs_release_path(root, path);
+                       break;
                 }
  
                 l = path->nodes[0];
                 slot = path->slots[0];
                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
  
-               if (key.objectid != device->devid)
-                       goto done;
+               if (key.objectid != device->devid) {
+                       btrfs_release_path(root, path);
+                       break;
+               }
  
                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                 length = btrfs_dev_extent_length(l, dev_extent);
  
-               if (key.offset + length <= new_size)
-                       goto done;
+               if (key.offset + length <= new_size) {
+                       btrfs_release_path(root, path);
+                       break;
+               }
  
                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -1875,16 +2063,54 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
  
                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
                                            chunk_offset);
-               if (ret)
+               if (ret && ret != -ENOSPC)
                         goto done;
+               if (ret == -ENOSPC)
+                       failed++;
+               key.offset -= 1;
+       }
+
+       if (failed && !retried) {
+               failed = 0;
+               retried = true;
+               goto again;
+       } else if (failed && retried) {
+               ret = -ENOSPC;
+               lock_chunks(root);
+
+               device->total_bytes = old_size;
+               if (device->writeable)
+                       device->fs_devices->total_rw_bytes += diff;
+               unlock_chunks(root);
+               goto done;
+       }
+
+       /* Shrinking succeeded, else we would be at "done". */
+       trans = btrfs_start_transaction(root, 1);
+       if (!trans) {
+               ret = -ENOMEM;
+               goto done;
         }
+       lock_chunks(root);
  
+       device->disk_total_bytes = new_size;
+       /* Now btrfs_update_device() will change the on-disk size. */
+       ret = btrfs_update_device(trans, device);
+       if (ret) {
+               unlock_chunks(root);
+               btrfs_end_transaction(trans, root);
+               goto done;
+       }
+       WARN_ON(diff > old_total);
+       btrfs_set_super_total_bytes(super_copy, old_total - diff);
+       unlock_chunks(root);
+       btrfs_end_transaction(trans, root);
  done:
         btrfs_free_path(path);
         return ret;
  }
  
-int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_key *key,
                            struct btrfs_chunk *chunk, int item_size)
@@ -1908,7 +2134,7 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
         return 0;
  }
  
-static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
                                         int num_stripes, int sub_stripes)
  {
         if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
@@ -1996,6 +2222,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                              max_chunk_size);
  
  again:
+       max_avail = 0;
         if (!map || map->num_stripes != num_stripes) {
                 kfree(map);
                 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2033,7 +2260,7 @@ again:
                 min_free += 1024 * 1024;
  
         INIT_LIST_HEAD(&private_devs);
-       while(index < num_stripes) {
+       while (index < num_stripes) {
                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
                 BUG_ON(!device->writeable);
                 if (device->total_bytes > device->bytes_used)
@@ -2044,7 +2271,8 @@ again:
  
                 if (device->in_fs_metadata && avail >= min_free) {
                         ret = find_free_dev_extent(trans, device,
-                                                  min_free, &dev_offset);
+                                                  min_free, &dev_offset,
+                                                  &max_avail);
                         if (ret == 0) {
                                 list_move_tail(&device->dev_alloc_list,
                                                &private_devs);
@@ -2107,9 +2335,9 @@ again:
         em->block_len = em->len;
  
         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
-       spin_lock(&em_tree->lock);
+       write_lock(&em_tree->lock);
         ret = add_extent_mapping(em_tree, em);
-       spin_unlock(&em_tree->lock);
+       write_unlock(&em_tree->lock);
         BUG_ON(ret);
         free_extent_map(em);
  
@@ -2234,7 +2462,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         return 0;
  }
  
-static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_device *device)
  {
@@ -2304,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
         int readonly = 0;
         int i;
  
-       spin_lock(&map_tree->map_tree.lock);
+       read_lock(&map_tree->map_tree.lock);
         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
-       spin_unlock(&map_tree->map_tree.lock);
+       read_unlock(&map_tree->map_tree.lock);
         if (!em)
                 return 1;
  
@@ -2330,12 +2558,12 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
  {
         struct extent_map *em;
  
-       while(1) {
-               spin_lock(&tree->map_tree.lock);
+       while (1) {
+               write_lock(&tree->map_tree.lock);
                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
                 if (em)
                         remove_extent_mapping(&tree->map_tree, em);
-               spin_unlock(&tree->map_tree.lock);
+               write_unlock(&tree->map_tree.lock);
                 if (!em)
                         break;
                 kfree(em->bdev);
@@ -2353,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
         struct extent_map_tree *em_tree = &map_tree->map_tree;
         int ret;
  
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
         em = lookup_extent_mapping(em_tree, logical, len);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
         BUG_ON(!em);
  
         BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2405,9 +2633,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
         int max_errors = 0;
         struct btrfs_multi_bio *multi = NULL;
  
-       if (multi_ret && !(rw & (1 << BIO_RW))) {
+       if (multi_ret && !(rw & (1 << BIO_RW)))
                 stripes_allocated = 1;
-       }
  again:
         if (multi_ret) {
                 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
@@ -2418,15 +2645,17 @@ again:
                 atomic_set(&multi->error, 0);
         }
  
-       spin_lock(&em_tree->lock);
+       read_lock(&em_tree->lock);
         em = lookup_extent_mapping(em_tree, logical, *length);
-       spin_unlock(&em_tree->lock);
+       read_unlock(&em_tree->lock);
  
         if (!em && unplug_page)
                 return 0;
  
         if (!em) {
-               printk("unable to find logical %Lu len %Lu\n", logical, *length);
+               printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+                      (unsigned long long)logical,
+                      (unsigned long long)*length);
                 BUG();
         }
  
@@ -2448,7 +2677,7 @@ again:
                         max_errors = 1;
                 }
         }
-       if (multi_ret && rw == WRITE &&
+       if (multi_ret && (rw & (1 << BIO_RW)) &&
             stripes_allocated < stripes_required) {
                 stripes_allocated = map->num_stripes;
                 free_extent_map(em);
@@ -2533,9 +2762,8 @@ again:
                         device = map->stripes[stripe_index].dev;
                         if (device->bdev) {
                                 bdi = blk_get_backing_dev_info(device->bdev);
-                               if (bdi->unplug_io_fn) {
+                               if (bdi->unplug_io_fn)
                                         bdi->unplug_io_fn(bdi, unplug_page);
-                               }
                         }
                 } else {
                         multi->stripes[i].physical =
@@ -2563,6 +2791,71 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                                  mirror_num, NULL);
  }
  
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+                    u64 chunk_start, u64 physical, u64 devid,
+                    u64 **logical, int *naddrs, int *stripe_len)
+{
+       struct extent_map_tree *em_tree = &map_tree->map_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       u64 *buf;
+       u64 bytenr;
+       u64 length;
+       u64 stripe_nr;
+       int i, j, nr = 0;
+       /*
+        * Reverse-map a physical offset back to addresses inside the chunk
+        * that starts at chunk_start.
+        *
+        * map_tree:    mapping tree holding the chunk's extent map
+        * chunk_start: start of the chunk; must match an extent map exactly
+        * physical:    physical offset to look for in the chunk's stripes
+        * devid:       only consider stripes on this device; 0 matches any
+        * logical:     out, kzalloc'd array of the distinct addresses found
+        *              (it is not freed here; presumably the caller must
+        *              kfree it -- confirm against callers)
+        * naddrs:      out, number of valid entries in *logical
+        * stripe_len:  out, the chunk's stripe length
+        *
+        * Always returns 0; a missing mapping or a failed allocation hits
+        * BUG_ON instead of returning an error.
+        *
+        * The lookup result is used after the read lock is dropped and is
+        * released with free_extent_map() at the end.
+        */
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, chunk_start, 1);
+       read_unlock(&em_tree->lock);
+
+       BUG_ON(!em || em->start != chunk_start);
+       map = (struct map_lookup *)em->bdev;
+       /*
+        * length is the span checked against each stripe's physical range
+        * below: the chunk length divided by (num_stripes / sub_stripes)
+        * for RAID10, by num_stripes for RAID0, and the whole chunk length
+        * for every other profile.
+        */
+       length = em->len;
+       if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+               do_div(length, map->num_stripes / map->sub_stripes);
+       else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+               do_div(length, map->num_stripes);
+       /* one slot per stripe: each stripe adds at most one address */
+       buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+       BUG_ON(!buf);
+       /*
+        * Visit every stripe, skipping those on a different device (when
+        * devid is non-zero) and those whose range
+        * [stripe physical, stripe physical + length) excludes 'physical'.
+        */
+       for (i = 0; i < map->num_stripes; i++) {
+               if (devid && map->stripes[i].dev->devid != devid)
+                       continue;
+               if (map->stripes[i].physical > physical ||
+                   map->stripes[i].physical + length <= physical)
+                       continue;
+               /* index of the stripe_len unit in this stripe holding 'physical' */
+               stripe_nr = physical - map->stripes[i].physical;
+               do_div(stripe_nr, map->stripe_len);
+               /*
+                * For the striped profiles, turn the per-stripe index into
+                * an index across the whole chunk: RAID0 interleaves
+                * num_stripes stripes, and RAID10 does the same and then
+                * divides by sub_stripes.  Other profiles use the index
+                * unchanged.  bytenr is chunk_start plus that many
+                * stripe_len units, and is stored only if not already in buf.
+                */
+               if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                       stripe_nr = stripe_nr * map->num_stripes + i;
+                       do_div(stripe_nr, map->sub_stripes);
+               } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+                       stripe_nr = stripe_nr * map->num_stripes + i;
+               }
+               bytenr = chunk_start + stripe_nr * map->stripe_len;
+               WARN_ON(nr >= map->num_stripes);
+               for (j = 0; j < nr; j++) {
+                       if (buf[j] == bytenr)
+                               break;
+               }
+               if (j == nr) {
+                       WARN_ON(nr >= map->num_stripes);
+                       buf[nr++] = bytenr;
+               }
+       }
+       /* hand the result array, its count and the stripe length to the caller */
+       *logical = buf;
+       *naddrs = nr;
+       *stripe_len = map->stripe_len;
+       /* done with the mapping found by lookup_extent_mapping() above */
+       free_extent_map(em);
+       return 0;
+}
+
  int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
                       u64 logical, struct page *page)
  {
@@ -2571,7 +2864,6 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
                                  NULL, 0, page);
  }
  
-
  static void end_bio_multi_stripe(struct bio *bio, int err)
  {
         struct btrfs_multi_bio *multi = bio->bi_private;
@@ -2625,11 +2917,12 @@ struct async_sched {
   * This will add one bio to the pending list for a device and make sure
   * the work struct is scheduled.
   */
-static int noinline schedule_bio(struct btrfs_root *root,
+static noinline int schedule_bio(struct btrfs_root *root,
                                  struct btrfs_device *device,
                                  int rw, struct bio *bio)
  {
         int should_queue = 1;
+       struct btrfs_pending_bios *pending_bios;
  
         /* don't bother with additional async steps for reads, right now */
         if (!(rw & (1 << BIO_RW))) {
@@ -2651,13 +2944,17 @@ static int noinline schedule_bio(struct btrfs_root *root,
         bio->bi_rw |= rw;
  
         spin_lock(&device->io_lock);
+       if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
+               pending_bios = &device->pending_sync_bios;
+       else
+               pending_bios = &device->pending_bios;
  
-       if (device->pending_bio_tail)
-               device->pending_bio_tail->bi_next = bio;
+       if (pending_bios->tail)
+               pending_bios->tail->bi_next = bio;
  
-       device->pending_bio_tail = bio;
-       if (!device->pending_bios)
-               device->pending_bios = bio;
+       pending_bios->tail = bio;
+       if (!pending_bios->head)
+               pending_bios->head = bio;
         if (device->running_pending)
                 should_queue = 0;
  
@@ -2693,8 +2990,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
  
         total_devs = multi->num_stripes;
         if (map_length < length) {
-               printk("mapping failed logical %Lu bio len %Lu "
-                      "len %Lu\n", logical, length, map_length);
+               printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+                      "len %llu\n", (unsigned long long)logical,
+                      (unsigned long long)length,
+                      (unsigned long long)map_length);
                 BUG();
         }
         multi->end_io = first_bio->bi_end_io;
@@ -2702,7 +3001,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         multi->orig_bio = first_bio;
         atomic_set(&multi->stripes_pending, multi->num_stripes);
  
-       while(dev_nr < total_devs) {
+       while (dev_nr < total_devs) {
                 if (total_devs > 1) {
                         if (dev_nr < total_devs - 1) {
                                 bio = bio_clone(first_bio, GFP_NOFS);
@@ -2769,8 +3068,10 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
         device->dev_root = root->fs_info->dev_root;
         device->devid = devid;
         device->work.func = pending_bios_fn;
+       device->fs_devices = fs_devices;
         fs_devices->num_devices++;
         spin_lock_init(&device->io_lock);
+       INIT_LIST_HEAD(&device->dev_alloc_list);
         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
         return device;
  }
@@ -2793,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
         logical = key->offset;
         length = btrfs_chunk_length(leaf, chunk);
  
-       spin_lock(&map_tree->map_tree.lock);
+       read_lock(&map_tree->map_tree.lock);
         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
-       spin_unlock(&map_tree->map_tree.lock);
+       read_unlock(&map_tree->map_tree.lock);
  
         /* already mapped? */
         if (em && em->start <= logical && em->start + em->len > logical) {
@@ -2805,10 +3106,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                 free_extent_map(em);
         }
  
-       map = kzalloc(sizeof(*map), GFP_NOFS);
-       if (!map)
-               return -ENOMEM;
-
         em = alloc_extent_map(GFP_NOFS);
         if (!em)
                 return -ENOMEM;
@@ -2858,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                 map->stripes[i].dev->in_fs_metadata = 1;
         }
  
-       spin_lock(&map_tree->map_tree.lock);
+       write_lock(&map_tree->map_tree.lock);
         ret = add_extent_mapping(&map_tree->map_tree, em);
-       spin_unlock(&map_tree->map_tree.lock);
+       write_unlock(&map_tree->map_tree.lock);
         BUG_ON(ret);
         free_extent_map(em);
  
@@ -2874,7 +3171,8 @@ static int fill_device_from_item(struct extent_buffer *leaf,
         unsigned long ptr;
  
         device->devid = btrfs_device_id(leaf, dev_item);
-       device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+       device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+       device->total_bytes = device->disk_total_bytes;
         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
         device->type = btrfs_device_type(leaf, dev_item);
         device->io_align = btrfs_device_io_align(leaf, dev_item);
@@ -2908,25 +3206,27 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
                 ret = -ENOENT;
                 goto out;
         }
-       if (fs_devices->opened) {
-               ret = -EBUSY;
+
+       fs_devices = clone_fs_devices(fs_devices);
+       if (IS_ERR(fs_devices)) {
+               ret = PTR_ERR(fs_devices);
                 goto out;
         }
  
-       ret = __btrfs_open_devices(fs_devices, MS_RDONLY,
+       ret = __btrfs_open_devices(fs_devices, FMODE_READ,
                                    root->fs_info->bdev_holder);
         if (ret)
                 goto out;
  
         if (!fs_devices->seeding) {
                 __btrfs_close_devices(fs_devices);
+               free_fs_devices(fs_devices);
                 ret = -EINVAL;
                 goto out;
         }
  
         fs_devices->seed = root->fs_info->fs_devices->seed;
         root->fs_info->fs_devices->seed = fs_devices;
-       fs_devices->sprouted = 1;
  out:
         mutex_unlock(&uuid_mutex);
         return ret;
@@ -2939,7 +3239,6 @@ static int read_one_dev(struct btrfs_root *root,
         struct btrfs_device *device;
         u64 devid;
         int ret;
-       int seed_devices = 0;
         u8 fs_uuid[BTRFS_UUID_SIZE];
         u8 dev_uuid[BTRFS_UUID_SIZE];
  
@@ -2953,18 +3252,18 @@ static int read_one_dev(struct btrfs_root *root,
  
         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
                 ret = open_seed_devices(root, fs_uuid);
-               if (ret)
+               if (ret && !btrfs_test_opt(root, DEGRADED))
                         return ret;
-               seed_devices = 1;
         }
  
         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
         if (!device || !device->bdev) {
-               if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
+               if (!btrfs_test_opt(root, DEGRADED))
                         return -EIO;
  
                 if (!device) {
-                       printk("warning devid %Lu missing\n", devid);
+                       printk(KERN_WARNING "warning devid %llu missing\n",
+                              (unsigned long long)devid);
                         device = add_missing_dev(root, devid, dev_uuid);
                         if (!device)
                                 return -ENOMEM;
@@ -2984,12 +3283,6 @@ static int read_one_dev(struct btrfs_root *root,
         if (device->writeable)
                 device->fs_devices->total_rw_bytes += device->total_bytes;
         ret = 0;
-#if 0
-       ret = btrfs_open_device(device);
-       if (ret) {
-               kfree(device);
-       }
-#endif
         return ret;
  }
  
@@ -3022,6 +3315,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
         if (!sb)
                 return -ENOMEM;
         btrfs_set_buffer_uptodate(sb);
+       btrfs_set_buffer_lockdep_class(sb, 0);
+
         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
         array_size = btrfs_super_sys_array_size(super_copy);
  
@@ -3080,7 +3375,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
         key.type = 0;
  again:
         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       while(1) {
+       while (1) {
                 leaf = path->nodes[0];
                 slot = path->slots[0];
                 if (slot >= btrfs_header_nritems(leaf)) {