+#include "ceph_debug.h"
+
+#include <linux/slab.h>
#include <asm/div64.h>
#include "super.h"
#include "crush/hash.h"
#include "crush/mapper.h"
#include "decode.h"
-#include "ceph_debug.h"
char *ceph_osdmap_state_str(char *str, int len, int state)
{
size = sizeof(struct crush_bucket_straw);
break;
default:
+ err = -EINVAL;
goto bad;
}
BUG_ON(size == 0);
ceph_decode_need(p, end, 4*sizeof(u32), bad);
b->id = ceph_decode_32(p);
b->type = ceph_decode_16(p);
- b->alg = ceph_decode_16(p);
+ b->alg = ceph_decode_8(p);
+ b->hash = ceph_decode_8(p);
b->weight = ceph_decode_32(p);
b->size = ceph_decode_32(p);
/* len */
ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
+ err = -EINVAL;
if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
goto bad;
#endif
return ERR_PTR(err);
}
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+ u64 a = *(u64 *)&l;
+ u64 b = *(u64 *)&r;
+
+ if (a < b)
+ return -1;
+ if (a > b)
+ return 1;
+ return 0;
+}
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+ struct rb_root *root)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_mapping *pg = NULL;
+ int c;
+
+ while (*p) {
+ parent = *p;
+ pg = rb_entry(parent, struct ceph_pg_mapping, node);
+ c = pgid_cmp(new->pgid, pg->pgid);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+ struct ceph_pg pgid)
+{
+ struct rb_node *n = root->rb_node;
+ struct ceph_pg_mapping *pg;
+ int c;
+
+ while (n) {
+ pg = rb_entry(n, struct ceph_pg_mapping, node);
+ c = pgid_cmp(pgid, pg->pgid);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else
+ return pg;
+ }
+ return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_pool_info *pi = NULL;
+
+ while (*p) {
+ parent = *p;
+ pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+ if (new->id < pi->id)
+ p = &(*p)->rb_left;
+ else if (new->id > pi->id)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+ struct ceph_pg_pool_info *pi;
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ pi = rb_entry(n, struct ceph_pg_pool_info, node);
+ if (id < pi->id)
+ n = n->rb_left;
+ else if (id > pi->id)
+ n = n->rb_right;
+ else
+ return pi;
+ }
+ return NULL;
+}
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+ rb_erase(&pi->node, root);
+ kfree(pi->name);
+ kfree(pi);
+}
+
+void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+ ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+ calc_pg_masks(pi);
+ *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+ *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+}
+
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+ struct ceph_pg_pool_info *pi;
+ u32 num, len, pool;
+
+ ceph_decode_32_safe(p, end, num, bad);
+ dout(" %d pool names\n", num);
+ while (num--) {
+ ceph_decode_32_safe(p, end, pool, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ dout(" pool %d len %d\n", pool, len);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi) {
+ kfree(pi->name);
+ pi->name = kmalloc(len + 1, GFP_NOFS);
+ if (pi->name) {
+ memcpy(pi->name, *p, len);
+ pi->name[len] = '\0';
+ dout(" name is %s\n", pi->name);
+ }
+ }
+ *p += len;
+ }
+ return 0;
+
+bad:
+ return -EINVAL;
+}
/*
* osd map
dout("osdmap_destroy %p\n", map);
if (map->crush)
crush_destroy(map->crush);
- while (!RB_EMPTY_ROOT(&map->pg_temp))
- rb_erase(rb_first(&map->pg_temp), &map->pg_temp);
+ while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_temp);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rb_first(&map->pg_pools),
+ struct ceph_pg_pool_info, node);
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
kfree(map->osd_state);
kfree(map->osd_weight);
- kfree(map->pg_pool);
kfree(map->osd_addr);
kfree(map);
}
}
/*
- * Insert a new pg_temp mapping
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
-{
- u64 a = *(u64 *)&l;
- u64 b = *(u64 *)&r;
-
- if (a < b)
- return -1;
- if (a > b)
- return 1;
- return 0;
-}
-
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
- struct rb_root *root)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_pg_mapping *pg = NULL;
- int c;
-
- while (*p) {
- parent = *p;
- pg = rb_entry(parent, struct ceph_pg_mapping, node);
- c = pgid_cmp(new->pgid, pg->pgid);
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
- return 0;
-}
-
-/*
* decode a full map.
*/
struct ceph_osdmap *osdmap_decode(void **p, void *end)
struct ceph_osdmap *map;
u16 version;
u32 len, max, i;
+ u8 ev;
int err = -EINVAL;
void *start = *p;
+ struct ceph_pg_pool_info *pi;
dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
map->pg_temp = RB_ROOT;
ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_VERSION) {
+ pr_warning("got unknown v %d > %d of osdmap\n", version,
+ CEPH_OSDMAP_VERSION);
+ goto bad;
+ }
ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
ceph_decode_copy(p, &map->created, sizeof(map->created));
ceph_decode_copy(p, &map->modified, sizeof(map->modified));
- map->num_pools = ceph_decode_32(p);
- map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
- GFP_NOFS);
- if (!map->pg_pool) {
- err = -ENOMEM;
- goto bad;
- }
ceph_decode_32_safe(p, end, max, bad);
while (max--) {
- ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad);
- i = ceph_decode_32(p);
- if (i >= map->num_pools)
+ ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ goto bad;
+ pi->id = ceph_decode_32(p);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
goto bad;
- ceph_decode_copy(p, &map->pg_pool[i].v,
- sizeof(map->pg_pool->v));
- calc_pg_masks(&map->pg_pool[i]);
- p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64);
- p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals)
- * sizeof(u64) * 2;
+ }
+ __decode_pool(p, pi);
+ __insert_pg_pool(&map->pg_pools, pi);
}
+ if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+ goto bad;
+
+ ceph_decode_32_safe(p, end, map->pool_max, bad);
+
ceph_decode_32_safe(p, end, map->flags, bad);
max = ceph_decode_32(p);
ceph_decode_copy(p, &pgid, sizeof(pgid));
n = ceph_decode_32(p);
ceph_decode_need(p, end, n * sizeof(u32), bad);
+ err = -ENOMEM;
pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
- if (!pg) {
- err = -ENOMEM;
+ if (!pg)
goto bad;
- }
pg->pgid = pgid;
pg->len = n;
for (j = 0; j < n; j++)
struct ceph_osdmap *map,
struct ceph_messenger *msgr)
{
- struct ceph_osdmap *newmap = map;
struct crush_map *newcrush = NULL;
struct ceph_fsid fsid;
u32 epoch = 0;
struct ceph_timespec modified;
u32 len, pool;
- __s32 new_flags, max;
+ __s32 new_pool_max, new_flags, max;
void *start = *p;
int err = -EINVAL;
u16 version;
struct rb_node *rbp;
ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_INC_VERSION) {
+ pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+ CEPH_OSDMAP_INC_VERSION);
+ goto bad;
+ }
ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
bad);
epoch = ceph_decode_32(p);
BUG_ON(epoch != map->epoch+1);
ceph_decode_copy(p, &modified, sizeof(modified));
+ new_pool_max = ceph_decode_32(p);
new_flags = ceph_decode_32(p);
/* full map? */
if (len > 0) {
dout("apply_incremental full map len %d, %p to %p\n",
len, *p, end);
- newmap = osdmap_decode(p, min(*p+len, end));
- return newmap; /* error or not */
+ return osdmap_decode(p, min(*p+len, end));
}
/* new crush? */
/* new flags? */
if (new_flags >= 0)
map->flags = new_flags;
+ if (new_pool_max >= 0)
+ map->pool_max = new_pool_max;
ceph_decode_need(p, end, 5*sizeof(u32), bad);
/* new_pool */
ceph_decode_32_safe(p, end, len, bad);
while (len--) {
+ __u8 ev;
+ struct ceph_pg_pool_info *pi;
+
ceph_decode_32_safe(p, end, pool, bad);
- if (pool >= map->num_pools) {
- void *pg_pool = kcalloc(pool + 1,
- sizeof(*map->pg_pool),
- GFP_NOFS);
- if (!pg_pool) {
+ ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
+ goto bad;
+ }
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!pi) {
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi) {
err = -ENOMEM;
goto bad;
}
- memcpy(pg_pool, map->pg_pool,
- map->num_pools * sizeof(*map->pg_pool));
- kfree(map->pg_pool);
- map->pg_pool = pg_pool;
- map->num_pools = pool+1;
+ pi->id = pool;
+ __insert_pg_pool(&map->pg_pools, pi);
}
- ceph_decode_copy(p, &map->pg_pool[pool].v,
- sizeof(map->pg_pool->v));
- calc_pg_masks(&map->pg_pool[pool]);
+ __decode_pool(p, pi);
}
+ if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+ goto bad;
- /* old_pool (ignore) */
+ /* old_pool */
ceph_decode_32_safe(p, end, len, bad);
- *p += len * sizeof(u32);
+ while (len--) {
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi)
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
/* new_up */
err = -EINVAL;
}
pg->pgid = pgid;
pg->len = pglen;
- for (j = 0; j < len; j++)
+ for (j = 0; j < pglen; j++)
pg->osds[j] = ceph_decode_32(p);
err = __insert_pg_mapping(pg, &map->pg_temp);
if (err)
bad:
pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
if (newcrush)
crush_destroy(newcrush);
return ERR_PTR(err);
struct ceph_pg_pool_info *pool;
unsigned ps;
- if (poolid >= osdmap->num_pools)
- return -EIO;
+ BUG_ON(!osdmap);
- pool = &osdmap->pg_pool[poolid];
- ps = ceph_full_name_hash(oid, strlen(oid));
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return -EIO;
+ ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
if (preferred >= 0) {
ps += preferred;
num = le32_to_cpu(pool->v.lpg_num);
static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
int *osds, int *num)
{
- struct rb_node *n = osdmap->pg_temp.rb_node;
struct ceph_pg_mapping *pg;
struct ceph_pg_pool_info *pool;
int ruleno;
unsigned poolid, ps, pps;
int preferred;
- int c;
/* pg_temp? */
- while (n) {
- pg = rb_entry(n, struct ceph_pg_mapping, node);
- c = pgid_cmp(pgid, pg->pgid);
- if (c < 0)
- n = n->rb_left;
- else if (c > 0)
- n = n->rb_right;
- else {
- *num = pg->len;
- return pg->osds;
- }
+ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+ if (pg) {
+ *num = pg->len;
+ return pg->osds;
}
/* crush */
ps = le16_to_cpu(pgid.ps);
preferred = (s16)le16_to_cpu(pgid.preferred);
- if (poolid >= osdmap->num_pools)
+ /* don't forcefeed bad device ids to crush */
+ if (preferred >= osdmap->max_osd ||
+ preferred >= osdmap->crush->max_devices)
+ preferred = -1;
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
return NULL;
- pool = &osdmap->pg_pool[poolid];
ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
pool->v.type, pool->v.size);
if (ruleno < 0) {
}
/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *acting)
+{
+ int rawosds[CEPH_PG_MAX_SIZE], *osds;
+ int i, o, num = CEPH_PG_MAX_SIZE;
+
+ osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+ if (!osds)
+ return -1;
+
+ /* primary is first up osd */
+ o = 0;
+ for (i = 0; i < num; i++)
+ if (ceph_osd_is_up(osdmap, osds[i]))
+ acting[o++] = osds[i];
+ return o;
+}
+
+/*
* Return primary osd for given pgid, or -1 if none.
*/
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
- int rawosds[10], *osds;
- int i, num = ARRAY_SIZE(rawosds);
+ int rawosds[CEPH_PG_MAX_SIZE], *osds;
+ int i, num = CEPH_PG_MAX_SIZE;
osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
if (!osds)
/* primary is first up osd */
for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i])) {
+ if (ceph_osd_is_up(osdmap, osds[i]))
return osds[i];
- break;
- }
return -1;
}