#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/err.h>
+#include <linux/debugfs.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "dlmapi.h"
#include "dlmcommon.h"
-
-#include "dlmdebug.h"
#include "dlmdomain.h"
+#include "dlmdebug.h"
#include "dlmver.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
+/*
+ * ocfs2 node maps are arrays of long ints, which cannot be sent freely
+ * across the wire because of endianness issues. To work around this, we
+ * convert the long ints to byte arrays. The following three routines are
+ * helper functions to set/test/copy bits within those byte arrays.
+ */
+/* Set bit @nr in the byte-array bitmap @map; bit 0 is the LSB of map[0]. */
+static inline void byte_set_bit(u8 nr, u8 map[])
+{
+	u8 mask = 1UL << (nr % 8);
+
+	map[nr / 8] |= mask;
+}
+
+/* Return 1 if bit @nr is set in the byte-array bitmap @map, else 0. */
+static inline int byte_test_bit(u8 nr, u8 map[])
+{
+	return (map[nr / 8] >> (nr % 8)) & 1;
+}
+
+/*
+ * Convert the first @sz bits of the long-int bitmap @smap into the
+ * byte-array bitmap @dmap.
+ *
+ * @dmap must have room for (sz + 7) / 8 bytes; those bytes are zeroed
+ * before the set bits are copied over.  Nothing is written when @sz is 0.
+ *
+ * The destination bit is set with the full-width index rather than through
+ * byte_set_bit(), whose u8 bit-number argument would silently wrap (and
+ * alias low bits) for any @sz larger than 256.
+ */
+static inline void byte_copymap(u8 dmap[], unsigned long smap[],
+			unsigned int sz)
+{
+	unsigned int nn;
+
+	if (!sz)
+		return;
+
+	memset(dmap, 0, ((sz + 7) >> 3));
+	for (nn = 0 ; nn < sz; nn++)
+		if (test_bit(nn, smap))
+			dmap[nn >> 3] |= (u8)(1U << (nn & 7));
+}
+
static void dlm_free_pagevec(void **vec, int pages)
{
while (pages--)
if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
goto out_free;
- mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %Zd buckets per page\n",
- pages, DLM_HASH_PAGES, DLM_BUCKETS_PER_PAGE);
+ mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
+ pages, (unsigned long)DLM_HASH_PAGES,
+ (unsigned long)DLM_BUCKETS_PER_PAGE);
return vec;
out_free:
dlm_free_pagevec(vec, i);
*
*/
-spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(dlm_domain_lock);
LIST_HEAD(dlm_domains);
static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
+/*
+ * DLM communication protocol version supported by this code.
+ *
+ * A running domain negotiates a version whose major number matches this
+ * one and whose minor number is equal or smaller.  To find out what a
+ * running domain actually uses, look at dlm_ctxt->dlm_locking_proto
+ * rather than at this constant.
+ */
+static const struct dlm_protocol_version dlm_protocol = {
+	.pv_major	= 1,
+	.pv_minor	= 0,
+};
+
#define DLM_DOMAIN_BACKOFF_MS 200
-static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
-static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
-static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
-static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_protocol_compare(struct dlm_protocol_version *existing,
+ struct dlm_protocol_version *request);
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
+/*
+ * Take @lockres off its hash chain and call dlm_lockres_put() on it
+ * (presumably releasing the reference held on behalf of the hash --
+ * confirm against __dlm_insert_lockres).
+ *
+ * With this change the function is a no-op when @lockres is not currently
+ * hashed, so the put is only issued when an unhash actually happened.
+ */
void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
{
- hlist_del_init(&lockres->hash_node);
- dlm_lockres_put(lockres);
+ if (!hlist_unhashed(&lockres->hash_node)) {
+ hlist_del_init(&lockres->hash_node);
+ dlm_lockres_put(lockres);
+ }
}
void __dlm_insert_lockres(struct dlm_ctxt *dlm,
hlist_add_head(&res->hash_node, bucket);
}
-struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
- const char *name,
- unsigned int len,
- unsigned int hash)
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
+ const char *name,
+ unsigned int len,
+ unsigned int hash)
{
struct hlist_head *bucket;
struct hlist_node *list;
return NULL;
}
+/*
+ * Lookup variant for callers that must not see lock resources which are
+ * being purged (most of the net _handler functions).
+ *
+ * Wraps __dlm_lookup_lockres_full(): a resource that is found but has
+ * DLM_LOCK_RES_DROPPING_REF set in its state (i.e. it is in the middle of
+ * dropping its mastery reference) is put again and NULL is returned.
+ * Callers that need the resource regardless (e.g. dlm_get_lock_resource)
+ * should use __dlm_lookup_lockres_full() directly.
+ *
+ * Must be called with dlm->spinlock held.
+ */
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+						const char *name,
+						unsigned int len,
+						unsigned int hash)
+{
+	struct dlm_lock_resource *found;
+	int dropping;
+
+	mlog_entry("%.*s\n", len, name);
+
+	assert_spin_locked(&dlm->spinlock);
+
+	found = __dlm_lookup_lockres_full(dlm, name, len, hash);
+	if (!found)
+		return NULL;
+
+	/* sample the state flag under the resource's own lock */
+	spin_lock(&found->spinlock);
+	dropping = (found->state & DLM_LOCK_RES_DROPPING_REF) != 0;
+	spin_unlock(&found->spinlock);
+
+	if (dropping) {
+		dlm_lockres_put(found);
+		return NULL;
+	}
+
+	return found;
+}
+
struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int len)
static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
{
+ dlm_destroy_debugfs_subroot(dlm);
+
if (dlm->lockres_hash)
dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+ if (dlm->master_hash)
+ dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
if (dlm->name)
kfree(dlm->name);
return ret;
}
+/*
+ * Flush and destroy the domain's workqueue, if one was created, and clear
+ * dlm->dlm_worker so a repeated call does nothing.
+ */
+static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
+{
+	if (!dlm->dlm_worker)
+		return;
+
+	flush_workqueue(dlm->dlm_worker);
+	destroy_workqueue(dlm->dlm_worker);
+	dlm->dlm_worker = NULL;
+}
+
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
dlm_unregister_domain_handlers(dlm);
+ dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
+ dlm_destroy_dlm_worker(dlm);
/* We've left the domain. Now we can take ourselves out of the
* list and allow the kref stuff to help us free the
wake_up(&dlm_domain_events);
}
-static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+/*
+ * Make one pass over every bucket of the lockres hash, calling
+ * dlm_empty_lockres() and __dlm_lockres_calc_usage() on each lock resource
+ * found, then wake the dlm thread so it can do the actual purging.
+ *
+ * Returns 0 when the pass found the hash empty, or -EAGAIN when at least
+ * one lock resource was still hashed; callers are expected to retry until
+ * 0 is returned.
+ */
+static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
{
- int i;
+ int i, num, n, ret = 0;
struct dlm_lock_resource *res;
+ struct hlist_node *iter;
+ struct hlist_head *bucket;
+ int dropped;
mlog(0, "Migrating locks from domain %s\n", dlm->name);
-restart:
+
+ num = 0;
spin_lock(&dlm->spinlock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
- while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
- res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
- struct dlm_lock_resource, hash_node);
- /* need reference when manually grabbing lockres */
+redo_bucket:
+ /* n counts the entries visited on this scan of bucket i; it is
+ * reset every time the bucket is rescanned */
+ n = 0;
+ bucket = dlm_lockres_hash(dlm, i);
+ iter = bucket->first;
+ while (iter) {
+ n++;
+ res = hlist_entry(iter, struct dlm_lock_resource,
+ hash_node);
dlm_lockres_get(res);
- /* this should unhash the lockres
- * and exit with dlm->spinlock */
- mlog(0, "purging res=%p\n", res);
- if (dlm_lockres_is_dirty(dlm, res)) {
- /* HACK! this should absolutely go.
- * need to figure out why some empty
- * lockreses are still marked dirty */
- mlog(ML_ERROR, "lockres %.*s dirty!\n",
- res->lockname.len, res->lockname.name);
-
- spin_unlock(&dlm->spinlock);
- dlm_kick_thread(dlm, res);
- wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
- dlm_lockres_put(res);
- goto restart;
- }
- dlm_purge_lockres(dlm, res);
+ /* migrate, if necessary. this will drop the dlm
+ * spinlock and retake it if it does migration. */
+ dropped = dlm_empty_lockres(dlm, res);
+
+ /* pick up the next chain entry while we still hold our
+ * reference on res */
+ spin_lock(&res->spinlock);
+ __dlm_lockres_calc_usage(dlm, res);
+ iter = res->hash_node.next;
+ spin_unlock(&res->spinlock);
+
dlm_lockres_put(res);
+
+ /* presumably nonzero means dlm->spinlock was released
+ * above, so the saved iterator cannot be trusted:
+ * rescan this bucket from its head */
+ if (dropped)
+ goto redo_bucket;
}
+ cond_resched_lock(&dlm->spinlock);
+ num += n;
+ mlog(0, "%s: touched %d lockreses in bucket %d "
+ "(tot=%d)\n", dlm->name, n, i, num);
}
spin_unlock(&dlm->spinlock);
-
+ wake_up(&dlm->dlm_thread_wq);
+
+ /* let the dlm thread take care of purging, keep scanning until
+ * nothing remains in the hash */
+ if (num) {
+ mlog(0, "%s: %d lock resources in hash last pass\n",
+ dlm->name, num);
+ ret = -EAGAIN;
+ }
mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+ return ret;
}
static int dlm_no_joining_node(struct dlm_ctxt *dlm)
assert_spin_locked(&dlm->spinlock);
- mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
+ printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, " node %d\n", node);
+ printk("%d ", node);
}
+ printk("\n");
}
-static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
unsigned int node;
node = exit_msg->node_idx;
- mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
+ printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
void dlm_unregister_domain(struct dlm_ctxt *dlm)
{
int leave = 0;
+ struct dlm_lock_resource *res;
spin_lock(&dlm_domain_lock);
BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
/* We changed dlm state, notify the thread */
dlm_kick_thread(dlm, NULL);
- dlm_migrate_all_locks(dlm);
+ while (dlm_migrate_all_locks(dlm)) {
+ /* Give dlm_thread time to purge the lockres' */
+ msleep(500);
+ mlog(0, "%s: more migration to do\n", dlm->name);
+ }
+
+ /* This list should be empty. If not, print remaining lockres */
+ if (!list_empty(&dlm->tracking_list)) {
+ mlog(ML_ERROR, "Following lockres' are still on the "
+ "tracking list:\n");
+ list_for_each_entry(res, &dlm->tracking_list, tracking)
+ dlm_print_one_lock_resource(res);
+ }
+
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
dlm_complete_dlm_shutdown(dlm);
}
EXPORT_SYMBOL_GPL(dlm_unregister_domain);
-static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
+/*
+ * Check one locking protocol (@proto_type names it in the log output) of a
+ * join request from @node against the version we run (@ours).
+ *
+ * The comparison is done on a scratch copy of @request.  When the versions
+ * are compatible, request->pv_minor is overwritten with the minor version
+ * chosen by dlm_protocol_compare() and 0 is returned.  When they are
+ * incompatible, @request is left untouched and 1 is returned.
+ */
+static int dlm_query_join_proto_check(char *proto_type, int node,
+				      struct dlm_protocol_version *ours,
+				      struct dlm_protocol_version *request)
+{
+	struct dlm_protocol_version proto = *request;
+
+	if (dlm_protocol_compare(ours, &proto)) {
+		mlog(ML_NOTICE,
+		     "Node %u wanted to join with %s locking "
+		     "protocol %u.%u, but we have %u.%u, disallowing\n",
+		     node, proto_type,
+		     request->pv_major,
+		     request->pv_minor,
+		     ours->pv_major,
+		     ours->pv_minor);
+		return 1;
+	}
+
+	/* log the original request before the minor is replaced below */
+	mlog(0,
+	     "node %u wanted to join with %s locking protocol "
+	     "%u.%u, we respond with %u.%u\n",
+	     node, proto_type,
+	     request->pv_major,
+	     request->pv_minor,
+	     proto.pv_major, proto.pv_minor);
+	request->pv_minor = proto.pv_minor;
+
+	return 0;
+}
+
+/*
+ * struct dlm_query_join_packet is made up of four one-byte fields. They
+ * are effectively in big-endian order already. However, little-endian
+ * machines swap them before putting the packet on the wire (because
+ * query_join's response is a status, and that status is treated as a u32
+ * on the wire). Thus, big-endian and little-endian machines will treat
+ * this structure differently.
+ *
+ * The solution is to have little-endian machines swap the structure when
+ * converting from the structure to the u32 representation. This will
+ * result in the structure having the correct format on the wire no matter
+ * the host endian format.
+ */
+/* Pack @packet into the u32 status value that is sent back on the wire. */
+static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
+					  u32 *wire)
+{
+	union dlm_query_join_response response = { .packet = *packet };
+
+	*wire = cpu_to_be32(response.intval);
+}
+
+/* Unpack the u32 status value received from the wire into @packet. */
+static void dlm_query_join_wire_to_packet(u32 wire,
+					  struct dlm_query_join_packet *packet)
+{
+	union dlm_query_join_response response = {
+		.intval = cpu_to_be32(wire),
+	};
+
+	*packet = response.packet;
+}
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_query_join_request *query;
- enum dlm_query_join_response response;
+ struct dlm_query_join_packet packet = {
+ .code = JOIN_DISALLOW,
+ };
struct dlm_ctxt *dlm = NULL;
+ u32 response;
+ u8 nodenum;
query = (struct dlm_query_join_request *) msg->buf;
mlog(0, "node %u is not in our live map yet\n",
query->node_idx);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
goto respond;
}
- response = JOIN_OK_NO_MAP;
+ packet.code = JOIN_OK_NO_MAP;
spin_lock(&dlm_domain_lock);
dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+ if (!dlm)
+ goto unlock_respond;
+
+ /*
+ * There is a small window where the joining node may not see the
+ * node(s) that just left but still part of the cluster. DISALLOW
+ * join request if joining node has different node map.
+ */
+ nodenum=0;
+ while (nodenum < O2NM_MAX_NODES) {
+ if (test_bit(nodenum, dlm->domain_map)) {
+ if (!byte_test_bit(nodenum, query->node_map)) {
+ mlog(0, "disallow join as node %u does not "
+ "have node %u in its nodemap\n",
+ query->node_idx, nodenum);
+ packet.code = JOIN_DISALLOW;
+ goto unlock_respond;
+ }
+ }
+ nodenum++;
+ }
+
/* Once the dlm ctxt is marked as leaving then we don't want
* to be put in someone's domain map.
* Also, explicitly disallow joining at certain troublesome
/*If this is a brand new context and we
* haven't started our join process yet, then
* the other node won the race. */
- response = JOIN_OK_NO_MAP;
+ packet.code = JOIN_OK_NO_MAP;
} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
/* Disallow parallel joins. */
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
- mlog(ML_NOTICE, "node %u trying to join, but recovery "
+ mlog(0, "node %u trying to join, but recovery "
"is ongoing.\n", bit);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else if (test_bit(bit, dlm->recovery_map)) {
- mlog(ML_NOTICE, "node %u trying to join, but it "
+ mlog(0, "node %u trying to join, but it "
"still needs recovery.\n", bit);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else if (test_bit(bit, dlm->domain_map)) {
- mlog(ML_NOTICE, "node %u trying to join, but it "
+ mlog(0, "node %u trying to join, but it "
"is still in the domain! needs recovery?\n",
bit);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else {
/* Alright we're fully a part of this domain
* so we keep some state as to who's joining
* and indicate to him that needs to be fixed
* up. */
- response = JOIN_OK;
- __dlm_set_joining_node(dlm, query->node_idx);
+
+ /* Make sure we speak compatible locking protocols. */
+ if (dlm_query_join_proto_check("DLM", bit,
+ &dlm->dlm_locking_proto,
+ &query->dlm_proto)) {
+ packet.code = JOIN_PROTOCOL_MISMATCH;
+ } else if (dlm_query_join_proto_check("fs", bit,
+ &dlm->fs_locking_proto,
+ &query->fs_proto)) {
+ packet.code = JOIN_PROTOCOL_MISMATCH;
+ } else {
+ packet.dlm_minor = query->dlm_proto.pv_minor;
+ packet.fs_minor = query->fs_proto.pv_minor;
+ packet.code = JOIN_OK;
+ __dlm_set_joining_node(dlm, query->node_idx);
+ }
}
spin_unlock(&dlm->spinlock);
}
+unlock_respond:
spin_unlock(&dlm_domain_lock);
respond:
- mlog(0, "We respond with %u\n", response);
+ mlog(0, "We respond with %u\n", packet.code);
+ dlm_query_join_packet_to_wire(&packet, &response);
return response;
}
-static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_assert_joined *assert;
struct dlm_ctxt *dlm = NULL;
set_bit(assert->node_idx, dlm->domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+ printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+ assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
return 0;
}
-static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_cancel_join *cancel;
struct dlm_ctxt *dlm = NULL;
sizeof(unsigned long))) {
mlog(ML_ERROR,
"map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
- map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
+ map_size, (unsigned)BITS_TO_LONGS(O2NM_MAX_NODES));
return -EINVAL;
}
static int dlm_request_join(struct dlm_ctxt *dlm,
int node,
- enum dlm_query_join_response *response)
+ enum dlm_query_join_response_code *response)
{
- int status, retval;
+ int status;
struct dlm_query_join_request join_msg;
+ struct dlm_query_join_packet packet;
+ u32 join_resp;
mlog(0, "querying node %d\n", node);
join_msg.node_idx = dlm->node_num;
join_msg.name_len = strlen(dlm->name);
memcpy(join_msg.domain, dlm->name, join_msg.name_len);
+ join_msg.dlm_proto = dlm->dlm_locking_proto;
+ join_msg.fs_proto = dlm->fs_locking_proto;
+
+ /* copy live node map to join message */
+ byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
- sizeof(join_msg), node, &retval);
+ sizeof(join_msg), node,
+ &join_resp);
if (status < 0 && status != -ENOPROTOOPT) {
mlog_errno(status);
goto bail;
}
+ dlm_query_join_wire_to_packet(join_resp, &packet);
/* -ENOPROTOOPT from the net code means the other side isn't
listening for our message type -- that's fine, it means
if (status == -ENOPROTOOPT) {
status = 0;
*response = JOIN_OK_NO_MAP;
- } else if (retval == JOIN_DISALLOW ||
- retval == JOIN_OK ||
- retval == JOIN_OK_NO_MAP) {
- *response = retval;
+ } else if (packet.code == JOIN_DISALLOW ||
+ packet.code == JOIN_OK_NO_MAP) {
+ *response = packet.code;
+ } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
+ mlog(ML_NOTICE,
+ "This node requested DLM locking protocol %u.%u and "
+ "filesystem locking protocol %u.%u. At least one of "
+ "the protocol versions on node %d is not compatible, "
+ "disconnecting\n",
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor,
+ node);
+ status = -EPROTO;
+ *response = packet.code;
+ } else if (packet.code == JOIN_OK) {
+ *response = packet.code;
+ /* Use the same locking protocol as the remote node */
+ dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
+ dlm->fs_locking_proto.pv_minor = packet.fs_minor;
+ mlog(0,
+ "Node %d responds JOIN_OK with DLM locking protocol "
+ "%u.%u and fs locking protocol %u.%u\n",
+ node,
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor);
} else {
status = -EINVAL;
- mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
- node);
+ mlog(ML_ERROR, "invalid response %d from node %u\n",
+ packet.code, node);
}
mlog(0, "status %d, node %d response is %d\n", status, node,
- *response);
+ *response);
bail:
return status;
static int dlm_should_restart_join(struct dlm_ctxt *dlm,
struct domain_join_ctxt *ctxt,
- enum dlm_query_join_response response)
+ enum dlm_query_join_response_code response)
{
int ret;
{
int status = 0, tmpstat, node;
struct domain_join_ctxt *ctxt;
- enum dlm_query_join_response response;
+ enum dlm_query_join_response_code response = JOIN_DISALLOW;
mlog_entry("%p", dlm);
- ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
+ ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (!ctxt) {
status = -ENOMEM;
mlog_errno(status);
+/*
+ * Undo the per-domain registrations: the heartbeat node-up and node-down
+ * callbacks and every o2net handler on dlm->dlm_domain_handlers.
+ * o2hb_unregister_callback() now takes a leading argument, passed as NULL
+ * here.
+ */
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
{
- o2hb_unregister_callback(&dlm->dlm_hb_up);
- o2hb_unregister_callback(&dlm->dlm_hb_down);
+ o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
+ o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
}
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
- status = o2hb_register_callback(&dlm->dlm_hb_down);
+ status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
if (status)
goto bail;
o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
- status = o2hb_register_callback(&dlm->dlm_hb_up);
+ status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
if (status)
goto bail;
status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
sizeof(struct dlm_master_request),
dlm_master_request_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
sizeof(struct dlm_assert_master),
dlm_assert_master_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, dlm_assert_master_post_handler,
+ &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
sizeof(struct dlm_create_lock),
dlm_create_lock_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
DLM_CONVERT_LOCK_MAX_LEN,
dlm_convert_lock_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
DLM_UNLOCK_LOCK_MAX_LEN,
dlm_unlock_lock_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
DLM_PROXY_AST_MAX_LEN,
dlm_proxy_ast_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
sizeof(struct dlm_exit_domain),
dlm_exit_domain_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key,
+ sizeof(struct dlm_deref_lockres),
+ dlm_deref_lockres_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
sizeof(struct dlm_migrate_request),
dlm_migrate_request_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
DLM_MIG_LOCKRES_MAX_LEN,
dlm_mig_lockres_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
sizeof(struct dlm_master_requery),
dlm_master_requery_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
sizeof(struct dlm_lock_request),
dlm_request_all_locks_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
sizeof(struct dlm_reco_data_done),
dlm_reco_data_done_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
sizeof(struct dlm_begin_reco),
dlm_begin_reco_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
sizeof(struct dlm_finalize_reco),
dlm_finalize_reco_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
static int dlm_join_domain(struct dlm_ctxt *dlm)
{
int status;
+ unsigned int backoff;
+ unsigned int total_backoff = 0;
BUG_ON(!dlm);
goto bail;
}
+ status = dlm_debug_init(dlm);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
status = dlm_launch_thread(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+ if (!dlm->dlm_worker) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
do {
- unsigned int backoff;
status = dlm_try_to_join_domain(dlm);
/* If we're racing another node to the join, then we
* need to back off temporarily and let them
* complete. */
+#define DLM_JOIN_TIMEOUT_MSECS 90000
if (status == -EAGAIN) {
if (signal_pending(current)) {
status = -ERESTARTSYS;
goto bail;
}
+ if (total_backoff >
+ msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
+ status = -ERESTARTSYS;
+ mlog(ML_NOTICE, "Timed out joining dlm domain "
+ "%s after %u msecs\n", dlm->name,
+ jiffies_to_msecs(total_backoff));
+ goto bail;
+ }
+
/*
* <chip> After you!
* <dale> No, after you!
*/
backoff = (unsigned int)(jiffies & 0x3);
backoff *= DLM_DOMAIN_BACKOFF_MS;
+ total_backoff += backoff;
mlog(0, "backoff %d\n", backoff);
msleep(backoff);
}
if (status) {
dlm_unregister_domain_handlers(dlm);
+ dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
+ dlm_destroy_dlm_worker(dlm);
}
return status;
u32 key)
{
int i;
+ int ret;
struct dlm_ctxt *dlm = NULL;
- dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
+ dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
if (!dlm) {
mlog_errno(-ENOMEM);
goto leave;
for (i = 0; i < DLM_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
+ dlm->master_hash = (struct hlist_head **)
+ dlm_alloc_pagevec(DLM_HASH_PAGES);
+ if (!dlm->master_hash) {
+ mlog_errno(-ENOMEM);
+ dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ goto leave;
+ }
+
+ for (i = 0; i < DLM_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
strcpy(dlm->name, domain);
dlm->key = key;
dlm->node_num = o2nm_this_node();
+ ret = dlm_create_debugfs_subroot(dlm);
+ if (ret < 0) {
+ dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+ dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ goto leave;
+ }
+
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
spin_lock_init(&dlm->ast_lock);
+ spin_lock_init(&dlm->track_lock);
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
INIT_LIST_HEAD(&dlm->reco.node_data);
INIT_LIST_HEAD(&dlm->purge_list);
INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+ INIT_LIST_HEAD(&dlm->tracking_list);
dlm->reco.state = 0;
INIT_LIST_HEAD(&dlm->pending_asts);
dlm->dlm_thread_task = NULL;
dlm->dlm_reco_thread_task = NULL;
+ dlm->dlm_worker = NULL;
init_waitqueue_head(&dlm->dlm_thread_wq);
init_waitqueue_head(&dlm->dlm_reco_thread_wq);
init_waitqueue_head(&dlm->reco.event);
init_waitqueue_head(&dlm->ast_wq);
init_waitqueue_head(&dlm->migration_wq);
- INIT_LIST_HEAD(&dlm->master_list);
INIT_LIST_HEAD(&dlm->mle_hb_events);
dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
- atomic_set(&dlm->local_resources, 0);
- atomic_set(&dlm->remote_resources, 0);
- atomic_set(&dlm->unknown_resources, 0);
+
+ atomic_set(&dlm->res_tot_count, 0);
+ atomic_set(&dlm->res_cur_count, 0);
+ for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+ atomic_set(&dlm->mle_tot_count[i], 0);
+ atomic_set(&dlm->mle_cur_count[i], 0);
+ }
spin_lock_init(&dlm->work_lock);
INIT_LIST_HEAD(&dlm->work_list);
- INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
+ INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work);
kref_init(&dlm->dlm_refs);
dlm->dlm_state = DLM_CTXT_NEW;
}
/*
- * dlm_register_domain: one-time setup per "domain"
+ * Compare a requested locking protocol version against the current one.
+ *
+ * If the major numbers are different, they are incompatible.
+ * If the current minor is greater than the request, they are incompatible.
+ * If the current minor is less than or equal to the request, they are
+ * compatible, and the requester should run at the current minor version.
+ */
+static int dlm_protocol_compare(struct dlm_protocol_version *existing,
+				struct dlm_protocol_version *request)
+{
+	/* differing majors, or a request older than what is running */
+	if (existing->pv_major != request->pv_major ||
+	    existing->pv_minor > request->pv_minor)
+		return 1;
+
+	/* compatible: the requester must come down to the running minor */
+	if (request->pv_minor > existing->pv_minor)
+		request->pv_minor = existing->pv_minor;
+
+	return 0;
+}
+
+/*
+ * dlm_register_domain: one-time setup per "domain".
+ *
+ * The filesystem passes in the requested locking version via proto.
+ * If registration was successful, proto will contain the negotiated
+ * locking protocol.
*/
struct dlm_ctxt * dlm_register_domain(const char *domain,
- u32 key)
+ u32 key,
+ struct dlm_protocol_version *fs_proto)
{
int ret;
struct dlm_ctxt *dlm = NULL;
goto retry;
}
+ if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+ mlog(ML_ERROR,
+ "Requested locking protocol version is not "
+ "compatible with already registered domain "
+ "\"%s\"\n", domain);
+ ret = -EPROTO;
+ goto leave;
+ }
+
__dlm_get(dlm);
dlm->num_joins++;
list_add_tail(&dlm->list, &dlm_domains);
spin_unlock(&dlm_domain_lock);
+ /*
+ * Pass the locking protocol version into the join. If the join
+ * succeeds, it will have the negotiated protocol set.
+ */
+ dlm->dlm_locking_proto = dlm_protocol;
+ dlm->fs_locking_proto = *fs_proto;
+
ret = dlm_join_domain(dlm);
if (ret) {
mlog_errno(ret);
goto leave;
}
+ /* Tell the caller what locking protocol we negotiated */
+ *fs_proto = dlm->fs_locking_proto;
+
ret = 0;
leave:
if (new_ctxt)
status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
sizeof(struct dlm_query_join_request),
dlm_query_join_handler,
- NULL, &dlm_join_handlers);
+ NULL, NULL, &dlm_join_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
sizeof(struct dlm_assert_joined),
dlm_assert_joined_handler,
- NULL, &dlm_join_handlers);
+ NULL, NULL, &dlm_join_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
sizeof(struct dlm_cancel_join),
dlm_cancel_join_handler,
- NULL, &dlm_join_handlers);
+ NULL, NULL, &dlm_join_handlers);
bail:
if (status < 0)
dlm_print_version();
status = dlm_init_mle_cache();
- if (status)
- return -1;
+ if (status) {
+ mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
+ goto error;
+ }
+
+ status = dlm_init_master_caches();
+ if (status) {
+ mlog(ML_ERROR, "Could not create o2dlm_lockres and "
+ "o2dlm_lockname slabcaches\n");
+ goto error;
+ }
+
+ status = dlm_init_lock_cache();
+ if (status) {
+ mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
+ goto error;
+ }
status = dlm_register_net_handlers();
if (status) {
- dlm_destroy_mle_cache();
- return -1;
+ mlog(ML_ERROR, "Unable to register network handlers\n");
+ goto error;
}
+ status = dlm_create_debugfs_root();
+ if (status)
+ goto error;
+
return 0;
+error:
+ dlm_unregister_net_handlers();
+ dlm_destroy_lock_cache();
+ dlm_destroy_master_caches();
+ dlm_destroy_mle_cache();
+ return -1;
}
+/*
+ * Module exit: remove the debugfs root, unregister the network handlers
+ * and destroy the lock, master and mle caches.
+ */
static void __exit dlm_exit (void)
{
+ dlm_destroy_debugfs_root();
dlm_unregister_net_handlers();
+ dlm_destroy_lock_cache();
+ dlm_destroy_master_caches();
dlm_destroy_mle_cache();
}