fs/minix: bugfix, number of indirect block ptrs per block depends on block size
[safe/jmp/linux-2.6] / fs / ceph / osdmap.c
index a143c51..cfdd8f4 100644 (file)
@@ -1,4 +1,7 @@
 
+#include "ceph_debug.h"
+
+#include <linux/slab.h>
 #include <asm/div64.h>
 
 #include "super.h"
@@ -6,7 +9,6 @@
 #include "crush/hash.h"
 #include "crush/mapper.h"
 #include "decode.h"
-#include "ceph_debug.h"
 
 char *ceph_osdmap_state_str(char *str, int len, int state)
 {
@@ -312,6 +314,152 @@ bad:
        return ERR_PTR(err);
 }
 
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+       /*
+        * Three-way comparison of two pgids by their raw 64-bit image:
+        * returns -1, 0 or 1 when l sorts before, equal to, or after r.
+        * The casts assume struct ceph_pg is exactly 8 bytes wide --
+        * TODO confirm against its definition.
+        */
+       const u64 lhs = *(u64 *)&l;
+       const u64 rhs = *(u64 *)&r;
+
+       if (lhs == rhs)
+               return 0;
+       return lhs < rhs ? -1 : 1;
+}
+
+/*
+ * Link @new into the rbtree rooted at @root, keyed by pgid (ordered by
+ * pgid_cmp()).
+ *
+ * Returns 0 on success, or -EEXIST, leaving the tree untouched, when an
+ * entry with the same pgid is already present.
+ */
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+                              struct rb_root *root)
+{
+       struct rb_node **link = &root->rb_node;
+       struct rb_node *parent = NULL;
+
+       /* walk down to the empty slot where @new belongs */
+       while (*link) {
+               struct ceph_pg_mapping *cur;
+               int order;
+
+               parent = *link;
+               cur = rb_entry(parent, struct ceph_pg_mapping, node);
+               order = pgid_cmp(new->pgid, cur->pgid);
+               if (order == 0)
+                       return -EEXIST;
+               link = order < 0 ? &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+       return 0;
+}
+
+/*
+ * Find the pg_mapping for @pgid in the rbtree rooted at @root.
+ *
+ * Returns the matching entry, or NULL if the tree holds none.
+ */
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+                                                  struct ceph_pg pgid)
+{
+       struct rb_node *cur = root->rb_node;
+
+       while (cur) {
+               struct ceph_pg_mapping *candidate =
+                       rb_entry(cur, struct ceph_pg_mapping, node);
+               int order = pgid_cmp(pgid, candidate->pgid);
+
+               if (order == 0)
+                       return candidate;
+               cur = order < 0 ? cur->rb_left : cur->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+       struct rb_node **link = &root->rb_node;
+       struct rb_node *parent = NULL;
+
+       /*
+        * Descend by pool id to the empty slot where @new belongs.  A pool
+        * with the same id already in the tree yields -EEXIST and leaves
+        * the tree untouched; otherwise @new is linked in and 0 returned.
+        */
+       while (*link) {
+               struct ceph_pg_pool_info *cur;
+
+               parent = *link;
+               cur = rb_entry(parent, struct ceph_pg_pool_info, node);
+               if (new->id < cur->id) {
+                       link = &parent->rb_left;
+                       continue;
+               }
+               if (new->id > cur->id) {
+                       link = &parent->rb_right;
+                       continue;
+               }
+               return -EEXIST;
+       }
+
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+       return 0;
+}
+
+/*
+ * Find the pool with the given @id in the rbtree rooted at @root.
+ *
+ * Returns the pool info, or NULL if no pool with that id is present.
+ */
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+       struct rb_node *cur = root->rb_node;
+
+       while (cur) {
+               struct ceph_pg_pool_info *candidate =
+                       rb_entry(cur, struct ceph_pg_pool_info, node);
+
+               if (id < candidate->id) {
+                       cur = cur->rb_left;
+                       continue;
+               }
+               if (id > candidate->id) {
+                       cur = cur->rb_right;
+                       continue;
+               }
+               return candidate;
+       }
+       return NULL;
+}
+
+/*
+ * Unlink @pi from the pool rbtree rooted at @root and free it together
+ * with its name string.  @pi must not be used after this returns.
+ */
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+       struct rb_node *victim = &pi->node;
+
+       rb_erase(victim, root);
+
+       /* the name is a separate allocation hanging off the pool info */
+       kfree(pi->name);
+       kfree(pi);
+}
+
+/*
+ * Decode one pool description from the buffer at *p into @pi->v,
+ * recompute the pool's pg masks, and advance *p past the trailing
+ * snapshot ids and removed-snapshot intervals without storing them.
+ *
+ * NOTE(review): there is no end-of-buffer argument, so the skip over
+ * the snapshot data is not bounds-checked here; callers appear to
+ * verify only the fixed-size part beforehand -- confirm.
+ */
+void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+       size_t snaps_len, removed_len;
+
+       ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+       calc_pg_masks(pi);
+
+       /* one u64 per snap, two u64s per removed-snap interval */
+       snaps_len = le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+       removed_len = le32_to_cpu(pi->v.num_removed_snap_intervals) *
+                     sizeof(u64) * 2;
+       *p += snaps_len;
+       *p += removed_len;
+}
+
+/*
+ * Decode the pool-name table: a 32-bit count followed by that many
+ * (pool id, name length, name bytes) records.  For each pool found in
+ * map->pg_pools the previous name is freed and replaced; records for
+ * unknown pools are skipped.
+ *
+ * Every name length is checked against @end before the bytes are
+ * copied or skipped, so a truncated or malformed message cannot make
+ * us read past the end of the buffer.
+ *
+ * A failed name allocation is not treated as an error: the pool is
+ * left with pi->name == NULL and decoding continues.
+ *
+ * Returns 0 on success, -EINVAL if the buffer is too short.
+ */
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+       struct ceph_pg_pool_info *pi;
+       u32 num, len, pool;
+
+       ceph_decode_32_safe(p, end, num, bad);
+       dout(" %d pool names\n", num);
+       while (num--) {
+               ceph_decode_32_safe(p, end, pool, bad);
+               ceph_decode_32_safe(p, end, len, bad);
+               dout("  pool %d len %d\n", pool, len);
+
+               /* len comes off the wire: bound it before using it */
+               ceph_decode_need(p, end, len, bad);
+
+               pi = __lookup_pg_pool(&map->pg_pools, pool);
+               if (pi) {
+                       kfree(pi->name);
+                       pi->name = kmalloc(len + 1, GFP_NOFS);
+                       if (pi->name) {
+                               memcpy(pi->name, *p, len);
+                               pi->name[len] = '\0';
+                               dout("  name is %s\n", pi->name);
+                       }
+               }
+               *p += len;
+       }
+       return 0;
+
+bad:
+       return -EINVAL;
+}
 
 /*
  * osd map
@@ -321,11 +469,21 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
        dout("osdmap_destroy %p\n", map);
        if (map->crush)
                crush_destroy(map->crush);
-       while (!RB_EMPTY_ROOT(&map->pg_temp))
-               rb_erase(rb_first(&map->pg_temp), &map->pg_temp);
+       while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(rb_first(&map->pg_temp),
+                                struct ceph_pg_mapping, node);
+               rb_erase(&pg->node, &map->pg_temp);
+               kfree(pg);
+       }
+       while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+               struct ceph_pg_pool_info *pi =
+                       rb_entry(rb_first(&map->pg_pools),
+                                struct ceph_pg_pool_info, node);
+               __remove_pg_pool(&map->pg_pools, pi);
+       }
        kfree(map->osd_state);
        kfree(map->osd_weight);
-       kfree(map->pg_pool);
        kfree(map->osd_addr);
        kfree(map);
 }
@@ -367,46 +525,6 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 }
 
 /*
- * Insert a new pg_temp mapping
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
-{
-       u64 a = *(u64 *)&l;
-       u64 b = *(u64 *)&r;
-
-       if (a < b)
-               return -1;
-       if (a > b)
-               return 1;
-       return 0;
-}
-
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
-                              struct rb_root *root)
-{
-       struct rb_node **p = &root->rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_pg_mapping *pg = NULL;
-       int c;
-
-       while (*p) {
-               parent = *p;
-               pg = rb_entry(parent, struct ceph_pg_mapping, node);
-               c = pgid_cmp(new->pgid, pg->pgid);
-               if (c < 0)
-                       p = &(*p)->rb_left;
-               else if (c > 0)
-                       p = &(*p)->rb_right;
-               else
-                       return -EEXIST;
-       }
-
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, root);
-       return 0;
-}
-
-/*
  * decode a full map.
  */
 struct ceph_osdmap *osdmap_decode(void **p, void *end)
@@ -417,6 +535,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
        u8 ev;
        int err = -EINVAL;
        void *start = *p;
+       struct ceph_pg_pool_info *pi;
 
        dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
 
@@ -426,6 +545,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
        map->pg_temp = RB_ROOT;
 
        ceph_decode_16_safe(p, end, version, bad);
+       if (version > CEPH_OSDMAP_VERSION) {
+               pr_warning("got unknown v %d > %d of osdmap\n", version,
+                          CEPH_OSDMAP_VERSION);
+               goto bad;
+       }
 
        ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
        ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
@@ -433,28 +557,28 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
        ceph_decode_copy(p, &map->created, sizeof(map->created));
        ceph_decode_copy(p, &map->modified, sizeof(map->modified));
 
-       map->num_pools = ceph_decode_32(p);
-       map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
-                              GFP_NOFS);
-       if (!map->pg_pool) {
-               err = -ENOMEM;
-               goto bad;
-       }
        ceph_decode_32_safe(p, end, max, bad);
        while (max--) {
-               ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad);
-               i = ceph_decode_32(p);
-               if (i >= map->num_pools)
+               ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+               pi = kzalloc(sizeof(*pi), GFP_NOFS);
+               if (!pi)
                        goto bad;
+               pi->id = ceph_decode_32(p);
                ev = ceph_decode_8(p); /* encoding version */
-               ceph_decode_copy(p, &map->pg_pool[i].v,
-                                sizeof(map->pg_pool->v));
-               calc_pg_masks(&map->pg_pool[i]);
-               p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64);
-               p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals)
-                       * sizeof(u64) * 2;
+               if (ev > CEPH_PG_POOL_VERSION) {
+                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+                                  ev, CEPH_PG_POOL_VERSION);
+                       goto bad;
+               }
+               __decode_pool(p, pi);
+               __insert_pg_pool(&map->pg_pools, pi);
        }
 
+       if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+               goto bad;
+
+       ceph_decode_32_safe(p, end, map->pool_max, bad);
+
        ceph_decode_32_safe(p, end, map->flags, bad);
 
        max = ceph_decode_32(p);
@@ -545,13 +669,18 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        u32 epoch = 0;
        struct ceph_timespec modified;
        u32 len, pool;
-       __s32 new_flags, max;
+       __s32 new_pool_max, new_flags, max;
        void *start = *p;
        int err = -EINVAL;
        u16 version;
        struct rb_node *rbp;
 
        ceph_decode_16_safe(p, end, version, bad);
+       if (version > CEPH_OSDMAP_INC_VERSION) {
+               pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+                          CEPH_OSDMAP_INC_VERSION);
+               goto bad;
+       }
 
        ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
                         bad);
@@ -559,6 +688,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        epoch = ceph_decode_32(p);
        BUG_ON(epoch != map->epoch+1);
        ceph_decode_copy(p, &modified, sizeof(modified));
+       new_pool_max = ceph_decode_32(p);
        new_flags = ceph_decode_32(p);
 
        /* full map? */
@@ -582,6 +712,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        /* new flags? */
        if (new_flags >= 0)
                map->flags = new_flags;
+       if (new_pool_max >= 0)
+               map->pool_max = new_pool_max;
 
        ceph_decode_need(p, end, 5*sizeof(u32), bad);
 
@@ -606,32 +738,41 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        ceph_decode_32_safe(p, end, len, bad);
        while (len--) {
                __u8 ev;
+               struct ceph_pg_pool_info *pi;
 
                ceph_decode_32_safe(p, end, pool, bad);
-               if (pool >= map->num_pools) {
-                       void *pg_pool = kcalloc(pool + 1,
-                                               sizeof(*map->pg_pool),
-                                               GFP_NOFS);
-                       if (!pg_pool) {
+               ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+               ev = ceph_decode_8(p);  /* encoding version */
+               if (ev > CEPH_PG_POOL_VERSION) {
+                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+                                  ev, CEPH_PG_POOL_VERSION);
+                       goto bad;
+               }
+               pi = __lookup_pg_pool(&map->pg_pools, pool);
+               if (!pi) {
+                       pi = kzalloc(sizeof(*pi), GFP_NOFS);
+                       if (!pi) {
                                err = -ENOMEM;
                                goto bad;
                        }
-                       memcpy(pg_pool, map->pg_pool,
-                              map->num_pools * sizeof(*map->pg_pool));
-                       kfree(map->pg_pool);
-                       map->pg_pool = pg_pool;
-                       map->num_pools = pool+1;
+                       pi->id = pool;
+                       __insert_pg_pool(&map->pg_pools, pi);
                }
-               ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad);
-               ev = ceph_decode_8(p);  /* encoding version */
-               ceph_decode_copy(p, &map->pg_pool[pool].v,
-                                sizeof(map->pg_pool->v));
-               calc_pg_masks(&map->pg_pool[pool]);
+               __decode_pool(p, pi);
        }
+       if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+               goto bad;
 
-       /* old_pool (ignore) */
+       /* old_pool */
        ceph_decode_32_safe(p, end, len, bad);
-       *p += len * sizeof(u32);
+       while (len--) {
+               struct ceph_pg_pool_info *pi;
+
+               ceph_decode_32_safe(p, end, pool, bad);
+               pi = __lookup_pg_pool(&map->pg_pools, pool);
+               if (pi)
+                       __remove_pg_pool(&map->pg_pools, pi);
+       }
 
        /* new_up */
        err = -EINVAL;
@@ -815,10 +956,10 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
        unsigned ps;
 
        BUG_ON(!osdmap);
-       if (poolid >= osdmap->num_pools)
-               return -EIO;
 
-       pool = &osdmap->pg_pool[poolid];
+       pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+       if (!pool)
+               return -EIO;
        ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
        if (preferred >= 0) {
                ps += preferred;
@@ -850,26 +991,17 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
                        int *osds, int *num)
 {
-       struct rb_node *n = osdmap->pg_temp.rb_node;
        struct ceph_pg_mapping *pg;
        struct ceph_pg_pool_info *pool;
        int ruleno;
        unsigned poolid, ps, pps;
        int preferred;
-       int c;
 
        /* pg_temp? */
-       while (n) {
-               pg = rb_entry(n, struct ceph_pg_mapping, node);
-               c = pgid_cmp(pgid, pg->pgid);
-               if (c < 0)
-                       n = n->rb_left;
-               else if (c > 0)
-                       n = n->rb_right;
-               else {
-                       *num = pg->len;
-                       return pg->osds;
-               }
+       pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+       if (pg) {
+               *num = pg->len;
+               return pg->osds;
        }
 
        /* crush */
@@ -882,9 +1014,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
            preferred >= osdmap->crush->max_devices)
                preferred = -1;
 
-       if (poolid >= osdmap->num_pools)
+       pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+       if (!pool)
                return NULL;
-       pool = &osdmap->pg_pool[poolid];
        ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
                                 pool->v.type, pool->v.size);
        if (ruleno < 0) {
@@ -909,12 +1041,33 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 }
 
 /*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+                       int *acting)
+{
+       int raw[CEPH_PG_MAX_SIZE];
+       int nraw = CEPH_PG_MAX_SIZE;
+       int nacting = 0;
+       int *set;
+       int idx;
+
+       /*
+        * Fills @acting with the up osds of the pg's raw mapping and
+        * returns how many were stored, or -1 if no raw mapping could be
+        * computed.  Presumably @acting must have room for at least
+        * CEPH_PG_MAX_SIZE entries -- verify against callers.
+        */
+       set = calc_pg_raw(osdmap, pgid, raw, &nraw);
+       if (!set)
+               return -1;
+
+       /* keep only the osds that are currently up, preserving order */
+       for (idx = 0; idx < nraw; idx++) {
+               if (!ceph_osd_is_up(osdmap, set[idx]))
+                       continue;
+               acting[nacting++] = set[idx];
+       }
+       return nacting;
+}
+
+/*
  * Return primary osd for given pgid, or -1 if none.
  */
 int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
-       int rawosds[10], *osds;
-       int i, num = ARRAY_SIZE(rawosds);
+       int rawosds[CEPH_PG_MAX_SIZE], *osds;
+       int i, num = CEPH_PG_MAX_SIZE;
 
        osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
        if (!osds)
@@ -922,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 
        /* primary is first up osd */
        for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i])) {
+               if (ceph_osd_is_up(osdmap, osds[i]))
                        return osds[i];
-                       break;
-               }
        return -1;
 }