Merge branch 'bkl/ioctl' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic...

[safe/jmp/linux-2.6] / fs / ceph / super.c
diff --git a/fs/ceph/super.c b/fs/ceph/super.c

index 3a25489..7c663d9 100644 (file)
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,13 +8,11 @@
  #include <linux/module.h>
  #include <linux/mount.h>
  #include <linux/parser.h>
-#include <linux/rwsem.h>
  #include <linux/sched.h>
  #include <linux/seq_file.h>
+#include <linux/slab.h>
  #include <linux/statfs.h>
  #include <linux/string.h>
-#include <linux/version.h>
-#include <linux/vmalloc.h>
  
  #include "decode.h"
  #include "super.h"
@@ -46,10 +44,20 @@ const char *ceph_file_part(const char *s, int len)
   */
  static void ceph_put_super(struct super_block *s)
  {
-       struct ceph_client *cl = ceph_client(s);
+       struct ceph_client *client = ceph_sb_to_client(s);
  
         dout("put_super\n");
-       ceph_mdsc_close_sessions(&cl->mdsc);
+       ceph_mdsc_close_sessions(&client->mdsc);
+
+       /*
+        * Unregister our bdi (only if ceph_register_bdi installed it
+        * in s_bdi) before put_anon_super releases the device name.
+        */
+       if (s->s_bdi == &client->backing_dev_info) {
+               bdi_unregister(&client->backing_dev_info);
+               s->s_bdi = NULL;
+       }
+
         return;
  }
  
@@ -96,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  static int ceph_syncfs(struct super_block *sb, int wait)
  {
         dout("sync_fs %d\n", wait);
-       ceph_osdc_sync(&ceph_client(sb)->osdc);
-       ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+       ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
+       ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
         dout("sync_fs %d done\n", wait);
         return 0;
  }
  
+static int default_congestion_kb(void)
+{
+       int congestion_kb;
+
+       /*
+        * Copied from NFS
+        *
+        * Default write congestion threshold in kB, scaled with total RAM:
+        * 16 * int_sqrt(totalram_pages) pages, converted to kB.
+        *  64MB:    8192k
+        * 128MB:   11585k
+        * 256MB:   16384k
+        * 512MB:   23170k
+        *   1GB:   32768k
+        *   2GB:   46340k
+        *   4GB:   65536k
+        *   8GB:   92681k
+        *  16GB:  131072k
+        *
+        * This allows larger machines to have larger/more transfers.
+        * The default is capped at 256MB (256*1024 kB).
+        */
+       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+       if (congestion_kb > 256*1024)
+               congestion_kb = 256*1024;
+
+       return congestion_kb;
+}
  
  /**
   * ceph_show_options - Show mount options in /proc/mounts
@@ -127,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
                 seq_puts(m, ",nocrc");
         if (args->flags & CEPH_OPT_NOASYNCREADDIR)
                 seq_puts(m, ",noasyncreaddir");
+
+       if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+               seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
+       if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+               seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
+       if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+               seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
+       /* emit the name arg_tokens parses ("osdkeepalive"), not a synonym */
+       if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+               seq_printf(m, ",osdkeepalive=%d", args->osd_keepalive_timeout);
+       if (args->wsize)
+               seq_printf(m, ",wsize=%d", args->wsize);
+       if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+               seq_printf(m, ",rsize=%d", args->rsize);
+       if (args->congestion_kb != default_congestion_kb())
+               seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
+       if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_min=%d",
+                        args->caps_wanted_delay_min);
+       if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_max=%d",
+                          args->caps_wanted_delay_max);
+       if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+               seq_printf(m, ",cap_release_safety=%d",
+                          args->cap_release_safety);
+       if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+               seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+       if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+               seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
         if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
                 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
         if (args->name)
@@ -150,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
         inode_init_once(&ci->vfs_inode);
  }
  
-static int default_congestion_kb(void)
-{
-       int congestion_kb;
-
-       /*
-        * Copied from NFS
-        *
-        * congestion size, scale with available memory.
-        *
-        *  64MB:    8192k
-        * 128MB:   11585k
-        * 256MB:   16384k
-        * 512MB:   23170k
-        *   1GB:   32768k
-        *   2GB:   46340k
-        *   4GB:   65536k
-        *   8GB:   92681k
-        *  16GB:  131072k
-        *
-        * This allows larger machines to have larger/more transfers.
-        * Limit the default to 256M
-        */
-       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-       if (congestion_kb > 256*1024)
-               congestion_kb = 256*1024;
-
-       return congestion_kb;
-}
-
  static int __init init_caches(void)
  {
         ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -292,10 +328,14 @@ enum {
         Opt_wsize,
         Opt_rsize,
         Opt_osdtimeout,
+       Opt_osdkeepalivetimeout,
         Opt_mount_timeout,
+       Opt_osd_idle_ttl,
         Opt_caps_wanted_delay_min,
         Opt_caps_wanted_delay_max,
+       Opt_cap_release_safety,
         Opt_readdir_max_entries,
+       Opt_readdir_max_bytes,
         Opt_congestion_kb,
         Opt_last_int,
         /* int args above */
@@ -321,10 +361,14 @@ static match_table_t arg_tokens = {
         {Opt_wsize, "wsize=%d"},
         {Opt_rsize, "rsize=%d"},
         {Opt_osdtimeout, "osdtimeout=%d"},
+       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
         {Opt_mount_timeout, "mount_timeout=%d"},
+       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
         {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
         {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+       {Opt_cap_release_safety, "cap_release_safety=%d"},
         {Opt_readdir_max_entries, "readdir_max_entries=%d"},
+       {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
         {Opt_congestion_kb, "write_congestion_kb=%d"},
         /* int args above */
         {Opt_snapdirname, "snapdirname=%s"},
@@ -365,14 +409,17 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
         /* start with defaults */
         args->sb_flags = flags;
         args->flags = CEPH_OPT_DEFAULT;
-       args->osd_timeout = 5;    /* seconds */
+       args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+       args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
         args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+       args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
         args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
         args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
         args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
         args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-       args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
-       args->max_readdir = 1024;
+       args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+       args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+       args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
         args->congestion_kb = default_congestion_kb();
  
         /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -465,6 +512,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                 case Opt_osdtimeout:
                         args->osd_timeout = intval;
                         break;
+               case Opt_osdkeepalivetimeout:
+                       args->osd_keepalive_timeout = intval;
+                       break;
                 case Opt_mount_timeout:
                         args->mount_timeout = intval;
                         break;
@@ -477,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                 case Opt_readdir_max_entries:
                         args->max_readdir = intval;
                         break;
+               case Opt_readdir_max_bytes:
+                       args->max_readdir_bytes = intval;
+                       break;
                 case Opt_congestion_kb:
                         args->congestion_kb = intval;
                         break;
@@ -575,6 +628,9 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
         if (!client->wb_pagevec_pool)
                 goto fail_trunc_wq;
  
+       /* caps */
+       client->min_caps = args->max_readdir;
+       ceph_adjust_min_caps(client->min_caps);
  
         /* subsystems */
         err = ceph_monc_init(&client->monc, client);
@@ -616,11 +672,15 @@ static void ceph_destroy_client(struct ceph_client *client)
         ceph_monc_stop(&client->monc);
         ceph_osdc_stop(&client->osdc);
  
+       ceph_adjust_min_caps(-client->min_caps);
+
         ceph_debugfs_client_cleanup(client);
         destroy_workqueue(client->wb_wq);
         destroy_workqueue(client->pg_inv_wq);
         destroy_workqueue(client->trunc_wq);
  
+       bdi_destroy(&client->backing_dev_info);
+
         if (client->msgr)
                 ceph_messenger_destroy(client->msgr);
         mempool_destroy(client->wb_pagevec_pool);
@@ -655,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
  /*
   * true if we have the mon map (and have thus joined the cluster)
   */
-static int have_mon_map(struct ceph_client *client)
+static int have_mon_and_osd_map(struct ceph_client *client)
  {
-       return client->monc.monmap && client->monc.monmap->epoch;
+       return client->monc.monmap && client->monc.monmap->epoch &&
+              client->osdc.osdmap && client->osdc.osdmap->epoch;
  }
  
  /*
@@ -735,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
         if (err < 0)
                 goto out;
  
-       while (!have_mon_map(client)) {
+       while (!have_mon_and_osd_map(client)) {
                 err = -EIO;
                 if (timeout && time_after_eq(jiffies, started + timeout))
                         goto out;
@@ -743,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
                 /* wait */
                 dout("mount waiting for mon_map\n");
                 err = wait_event_interruptible_timeout(client->auth_wq,
-                              have_mon_map(client) || (client->auth_err < 0),
-                              timeout);
+                      have_mon_and_osd_map(client) || (client->auth_err < 0),
+                      timeout);
                 if (err == -EINTR || err == -ERESTARTSYS)
                         goto out;
                 if (client->auth_err < 0) {
@@ -857,18 +918,21 @@ static int ceph_compare_super(struct super_block *sb, void *data)
  /*
   * construct our own bdi so we can control readahead, etc.
   */
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); /* "ceph-<n>" bdi name suffix */
+
  static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
  {
         int err;
  
-       sb->s_bdi = &client->backing_dev_info;
-
         /* set ra_pages based on rsize mount option? */
         if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
                 client->backing_dev_info.ra_pages =
                         (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
                         >> PAGE_SHIFT;
-       err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+       err = bdi_register(&client->backing_dev_info, NULL, "ceph-%ld",
+                          atomic_long_inc_return(&bdi_seq));
+       if (!err) /* publish the bdi in s_bdi only once it is registered */
+               sb->s_bdi = &client->backing_dev_info;
         return err;
  }
  
@@ -905,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
                 goto out;
         }
  
-       if (ceph_client(sb) != client) {
+       if (ceph_sb_to_client(sb) != client) {
                 ceph_destroy_client(client);
-               client = ceph_client(sb);
+               client = ceph_sb_to_client(sb);
                 dout("get_sb got existing client %p\n", client);
         } else {
                 dout("get_sb using new client %p\n", client);
@@ -925,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
  
  out_splat:
         ceph_mdsc_close_sessions(&client->mdsc);
-       up_write(&sb->s_umount);
-       deactivate_super(sb);
+       deactivate_locked_super(sb);
         goto out_final;
  
  out:
@@ -942,9 +1005,6 @@ static void ceph_kill_sb(struct super_block *s)
         dout("kill_sb %p\n", s);
         ceph_mdsc_pre_umount(&client->mdsc);
         kill_anon_super(s);    /* will call put_super after sb is r/o */
-       if (s->s_bdi == &client->backing_dev_info)
-               bdi_unregister(&client->backing_dev_info);
-       bdi_destroy(&client->backing_dev_info);
         ceph_destroy_client(client);
  }
  
@@ -981,9 +1041,10 @@ static int __init init_ceph(void)
         if (ret)
                 goto out_icache;
  
-       pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
-               CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
-               CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
+       pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
+               CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+               CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+               CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
         return 0;
  
  out_icache: