+#include "ceph_debug.h"
+
+#include <linux/slab.h>
#include <asm/div64.h>
#include "super.h"
#include "crush/hash.h"
#include "crush/mapper.h"
#include "decode.h"
-#include "ceph_debug.h"
char *ceph_osdmap_state_str(char *str, int len, int state)
{
return ERR_PTR(err);
}
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
- dout("osdmap_destroy %p\n", map);
- if (map->crush)
- crush_destroy(map->crush);
- while (!RB_EMPTY_ROOT(&map->pg_temp)) {
- struct ceph_pg_mapping *pg =
- rb_entry(rb_first(&map->pg_temp),
- struct ceph_pg_mapping, node);
- rb_erase(&pg->node, &map->pg_temp);
- kfree(pg);
- }
- kfree(map->osd_state);
- kfree(map->osd_weight);
- kfree(map->pg_pool);
- kfree(map->osd_addr);
- kfree(map);
-}
-
-/*
- * adjust max osd value. reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
- u8 *state;
- struct ceph_entity_addr *addr;
- u32 *weight;
-
- state = kcalloc(max, sizeof(*state), GFP_NOFS);
- addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
- weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
- if (state == NULL || addr == NULL || weight == NULL) {
- kfree(state);
- kfree(addr);
- kfree(weight);
- return -ENOMEM;
- }
-
- /* copy old? */
- if (map->osd_state) {
- memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
- memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
- memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
- kfree(map->osd_state);
- kfree(map->osd_addr);
- kfree(map->osd_weight);
- }
-
- map->osd_state = state;
- map->osd_weight = weight;
- map->osd_addr = addr;
- map->max_osd = max;
- return 0;
-}
-
/*
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
* to a set of osds)
}
/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_pool_info *pi = NULL;
+
+ while (*p) {
+ parent = *p;
+ pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+ if (new->id < pi->id)
+ p = &(*p)->rb_left;
+ else if (new->id > pi->id)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+ struct ceph_pg_pool_info *pi;
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ pi = rb_entry(n, struct ceph_pg_pool_info, node);
+ if (id < pi->id)
+ n = n->rb_left;
+ else if (id > pi->id)
+ n = n->rb_right;
+ else
+ return pi;
+ }
+ return NULL;
+}
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+ rb_erase(&pi->node, root);
+ kfree(pi->name);
+ kfree(pi);
+}
+
+void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+ ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+ calc_pg_masks(pi);
+ *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+ *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+}
+
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+ struct ceph_pg_pool_info *pi;
+ u32 num, len, pool;
+
+ ceph_decode_32_safe(p, end, num, bad);
+ dout(" %d pool names\n", num);
+ while (num--) {
+ ceph_decode_32_safe(p, end, pool, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ dout(" pool %d len %d\n", pool, len);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi) {
+ kfree(pi->name);
+ pi->name = kmalloc(len + 1, GFP_NOFS);
+ if (pi->name) {
+ memcpy(pi->name, *p, len);
+ pi->name[len] = '\0';
+ dout(" name is %s\n", pi->name);
+ }
+ }
+ *p += len;
+ }
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+ dout("osdmap_destroy %p\n", map);
+ if (map->crush)
+ crush_destroy(map->crush);
+ while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_temp);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rb_first(&map->pg_pools),
+ struct ceph_pg_pool_info, node);
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
+ kfree(map->osd_state);
+ kfree(map->osd_weight);
+ kfree(map->osd_addr);
+ kfree(map);
+}
+
+/*
+ * adjust max osd value. reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+ u8 *state;
+ struct ceph_entity_addr *addr;
+ u32 *weight;
+
+ state = kcalloc(max, sizeof(*state), GFP_NOFS);
+ addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+ weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+ if (state == NULL || addr == NULL || weight == NULL) {
+ kfree(state);
+ kfree(addr);
+ kfree(weight);
+ return -ENOMEM;
+ }
+
+ /* copy old? */
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+ memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+ memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+ kfree(map->osd_state);
+ kfree(map->osd_addr);
+ kfree(map->osd_weight);
+ }
+
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
+ map->max_osd = max;
+ return 0;
+}
+
+/*
* decode a full map.
*/
struct ceph_osdmap *osdmap_decode(void **p, void *end)
u8 ev;
int err = -EINVAL;
void *start = *p;
+ struct ceph_pg_pool_info *pi;
dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
ceph_decode_copy(p, &map->created, sizeof(map->created));
ceph_decode_copy(p, &map->modified, sizeof(map->modified));
- map->num_pools = ceph_decode_32(p);
- map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
- GFP_NOFS);
- if (!map->pg_pool) {
- err = -ENOMEM;
- goto bad;
- }
ceph_decode_32_safe(p, end, max, bad);
while (max--) {
- ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad);
- i = ceph_decode_32(p);
- if (i >= map->num_pools)
+ ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
goto bad;
+ pi->id = ceph_decode_32(p);
ev = ceph_decode_8(p); /* encoding version */
if (ev > CEPH_PG_POOL_VERSION) {
pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
ev, CEPH_PG_POOL_VERSION);
goto bad;
}
- ceph_decode_copy(p, &map->pg_pool[i].v,
- sizeof(map->pg_pool->v));
- calc_pg_masks(&map->pg_pool[i]);
- p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64);
- p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals)
- * sizeof(u64) * 2;
+ __decode_pool(p, pi);
+ __insert_pg_pool(&map->pg_pools, pi);
}
+ if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+ goto bad;
+
+ ceph_decode_32_safe(p, end, map->pool_max, bad);
+
ceph_decode_32_safe(p, end, map->flags, bad);
max = ceph_decode_32(p);
u32 epoch = 0;
struct ceph_timespec modified;
u32 len, pool;
- __s32 new_flags, max;
+ __s32 new_pool_max, new_flags, max;
void *start = *p;
int err = -EINVAL;
u16 version;
epoch = ceph_decode_32(p);
BUG_ON(epoch != map->epoch+1);
ceph_decode_copy(p, &modified, sizeof(modified));
+ new_pool_max = ceph_decode_32(p);
new_flags = ceph_decode_32(p);
/* full map? */
/* new flags? */
if (new_flags >= 0)
map->flags = new_flags;
+ if (new_pool_max >= 0)
+ map->pool_max = new_pool_max;
ceph_decode_need(p, end, 5*sizeof(u32), bad);
ceph_decode_32_safe(p, end, len, bad);
while (len--) {
__u8 ev;
+ struct ceph_pg_pool_info *pi;
ceph_decode_32_safe(p, end, pool, bad);
- if (pool >= map->num_pools) {
- void *pg_pool = kcalloc(pool + 1,
- sizeof(*map->pg_pool),
- GFP_NOFS);
- if (!pg_pool) {
- err = -ENOMEM;
- goto bad;
- }
- memcpy(pg_pool, map->pg_pool,
- map->num_pools * sizeof(*map->pg_pool));
- kfree(map->pg_pool);
- map->pg_pool = pg_pool;
- map->num_pools = pool+1;
- }
- ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad);
+ ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
ev = ceph_decode_8(p); /* encoding version */
if (ev > CEPH_PG_POOL_VERSION) {
pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
ev, CEPH_PG_POOL_VERSION);
goto bad;
}
- ceph_decode_copy(p, &map->pg_pool[pool].v,
- sizeof(map->pg_pool->v));
- calc_pg_masks(&map->pg_pool[pool]);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!pi) {
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pi->id = pool;
+ __insert_pg_pool(&map->pg_pools, pi);
+ }
+ __decode_pool(p, pi);
}
+ if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+ goto bad;
- /* old_pool (ignore) */
+ /* old_pool */
ceph_decode_32_safe(p, end, len, bad);
- *p += len * sizeof(u32);
+ while (len--) {
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi)
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
/* new_up */
err = -EINVAL;
unsigned ps;
BUG_ON(!osdmap);
- if (poolid >= osdmap->num_pools)
- return -EIO;
- pool = &osdmap->pg_pool[poolid];
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return -EIO;
ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
if (preferred >= 0) {
ps += preferred;
preferred >= osdmap->crush->max_devices)
preferred = -1;
- if (poolid >= osdmap->num_pools)
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
return NULL;
- pool = &osdmap->pg_pool[poolid];
ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
pool->v.type, pool->v.size);
if (ruleno < 0) {
}
/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *acting)
+{
+ int rawosds[CEPH_PG_MAX_SIZE], *osds;
+ int i, o, num = CEPH_PG_MAX_SIZE;
+
+ osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+ if (!osds)
+ return -1;
+
+ /* primary is first up osd */
+ o = 0;
+ for (i = 0; i < num; i++)
+ if (ceph_osd_is_up(osdmap, osds[i]))
+ acting[o++] = osds[i];
+ return o;
+}
+
+/*
* Return primary osd for given pgid, or -1 if none.
*/
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
- int rawosds[10], *osds;
- int i, num = ARRAY_SIZE(rawosds);
+ int rawosds[CEPH_PG_MAX_SIZE], *osds;
+ int i, num = CEPH_PG_MAX_SIZE;
osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
if (!osds)
/* primary is first up osd */
for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i])) {
+ if (ceph_osd_is_up(osdmap, osds[i]))
return osds[i];
- break;
- }
return -1;
}