1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/smp_lock.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
59 struct after_state_chg_work {
60         struct drbd_work w;
61         union drbd_state os;
62         union drbd_state ns;
63         enum chg_state_flags flags;
64         struct completion *done;
65 };
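/* Note: instances of this struct are allocated and queued from
 * __drbd_set_state() and processed by w_after_state_ch() on the worker
 * thread, so that the (possibly sleeping) after_state_ch() actions run
 * outside the req_lock. */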
66
67 int drbdd_init(struct drbd_thread *);
68 int drbd_worker(struct drbd_thread *);
69 int drbd_asender(struct drbd_thread *);
70
71 int drbd_init(void);
72 static int drbd_open(struct block_device *bdev, fmode_t mode);
73 static int drbd_release(struct gendisk *gd, fmode_t mode);
74 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76                            union drbd_state ns, enum chg_state_flags flags);
77 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78 static void md_sync_timer_fn(unsigned long data);
79 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
81 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82               "Lars Ellenberg <lars@linbit.com>");
83 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
84 MODULE_VERSION(REL_VERSION);
85 MODULE_LICENSE("GPL");
86 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
87 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
88
89 #include <linux/moduleparam.h>
90 /* allow_open_on_secondary */
91 MODULE_PARM_DESC(allow_oos, "DONT USE!");
92 /* thanks to these macros, if compiled into the kernel (not as a module),
93  * this becomes the boot parameter drbd.minor_count */
94 module_param(minor_count, uint, 0444);
95 module_param(disable_sendpage, bool, 0644);
96 module_param(allow_oos, bool, 0);
97 module_param(cn_idx, uint, 0444);
98 module_param(proc_details, int, 0644);
99
100 #ifdef CONFIG_DRBD_FAULT_INJECTION
101 int enable_faults;
102 int fault_rate;
103 static int fault_count;
104 int fault_devs;
105 /* bitmap of enabled faults */
106 module_param(enable_faults, int, 0664);
107 /* fault rate % value - applies to all enabled faults */
108 module_param(fault_rate, int, 0664);
109 /* count of faults inserted */
110 module_param(fault_count, int, 0664);
111 /* bitmap of devices to insert faults on */
112 module_param(fault_devs, int, 0644);
113 #endif
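/* Usage sketch (illustrative values only): when DRBD is built as a module,
 * the fault injection knobs above can be set at load time, e.g.
 *
 *	modprobe drbd enable_faults=255 fault_rate=1 fault_devs=1
 *
 * or adjusted later via /sys/module/drbd/parameters/, subject to the
 * permission bits passed to module_param(). */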
114
115 /* module parameters, defined here */
116 unsigned int minor_count = 32;
117 int disable_sendpage;
118 int allow_oos;
119 unsigned int cn_idx = CN_IDX_DRBD;
120 int proc_details;       /* Detail level in /proc/drbd */
121
122 /* Module parameter for setting the user mode helper program
123  * to run. Default is /sbin/drbdadm */
124 char usermode_helper[80] = "/sbin/drbdadm";
125
126 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
127
128 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
129  * as member "struct gendisk *vdisk;"
130  */
131 struct drbd_conf **minor_table;
132
133 struct kmem_cache *drbd_request_cache;
134 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
135 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
136 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
137 mempool_t *drbd_request_mempool;
138 mempool_t *drbd_ee_mempool;
139
140 /* I do not use a standard mempool, because:
141    1) I want to hand out the pre-allocated objects first.
142    2) I want to be able to interrupt sleeping allocation with a signal.
143    Note: This is a singly linked list; the next pointer is the private
144          member of struct page.
145  */
146 struct page *drbd_pp_pool;
147 spinlock_t   drbd_pp_lock;
148 int          drbd_pp_vacant;
149 wait_queue_head_t drbd_pp_wait;
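/* Illustrative sketch (not part of the driver): a pool threaded through the
 * page private member, as described above, would typically be pushed and
 * popped like this while holding drbd_pp_lock.  The helper names are made up
 * for illustration only.
 *
 *	static void example_pp_push(struct page *page)
 *	{
 *		set_page_private(page, (unsigned long)drbd_pp_pool);
 *		drbd_pp_pool = page;
 *		drbd_pp_vacant++;
 *	}
 *
 *	static struct page *example_pp_pop(void)
 *	{
 *		struct page *page = drbd_pp_pool;
 *		if (page) {
 *			drbd_pp_pool = (struct page *)page_private(page);
 *			drbd_pp_vacant--;
 *		}
 *		return page;
 *	}
 */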
150
151 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
153 static const struct block_device_operations drbd_ops = {
154         .owner =   THIS_MODULE,
155         .open =    drbd_open,
156         .release = drbd_release,
157 };
158
159 #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
160
161 #ifdef __CHECKER__
162 /* When checking with sparse, and this is an inline function, sparse will
163    give tons of false positives. When this is a real function, sparse works.
164  */
165 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
166 {
167         int io_allowed;
168
169         atomic_inc(&mdev->local_cnt);
170         io_allowed = (mdev->state.disk >= mins);
171         if (!io_allowed) {
172                 if (atomic_dec_and_test(&mdev->local_cnt))
173                         wake_up(&mdev->misc_wait);
174         }
175         return io_allowed;
176 }
177
178 #endif
179
180 /**
181  * DOC: The transfer log
182  *
183  * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
184  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185  * of the list. There is always at least one &struct drbd_tl_epoch object.
186  *
187  * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
188  * attached.
189  */
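/* Illustrative sketch (documentation only, not part of the driver): walking
 * the transfer log from the oldest to the newest epoch, and over the requests
 * attached to each epoch, would look roughly like this with req_lock held;
 * handle() is a hypothetical per-request action.
 *
 *	struct drbd_tl_epoch *b;
 *	struct drbd_request *req;
 *
 *	for (b = mdev->oldest_tle; b != NULL; b = b->next)
 *		list_for_each_entry(req, &b->requests, tl_requests)
 *			handle(req);
 */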
190 static int tl_init(struct drbd_conf *mdev)
191 {
192         struct drbd_tl_epoch *b;
193
194         /* during device minor initialization, we may well use GFP_KERNEL */
195         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196         if (!b)
197                 return 0;
198         INIT_LIST_HEAD(&b->requests);
199         INIT_LIST_HEAD(&b->w.list);
200         b->next = NULL;
201         b->br_number = 4711;
202         b->n_req = 0;
203         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205         mdev->oldest_tle = b;
206         mdev->newest_tle = b;
207         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209         mdev->tl_hash = NULL;
210         mdev->tl_hash_s = 0;
211
212         return 1;
213 }
214
215 static void tl_cleanup(struct drbd_conf *mdev)
216 {
217         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219         kfree(mdev->oldest_tle);
220         mdev->oldest_tle = NULL;
221         kfree(mdev->unused_spare_tle);
222         mdev->unused_spare_tle = NULL;
223         kfree(mdev->tl_hash);
224         mdev->tl_hash = NULL;
225         mdev->tl_hash_s = 0;
226 }
227
228 /**
229  * _tl_add_barrier() - Adds a barrier to the transfer log
230  * @mdev:       DRBD device.
231  * @new:        Barrier to be added before the current head of the TL.
232  *
233  * The caller must hold the req_lock.
234  */
235 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236 {
237         struct drbd_tl_epoch *newest_before;
238
239         INIT_LIST_HEAD(&new->requests);
240         INIT_LIST_HEAD(&new->w.list);
241         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242         new->next = NULL;
243         new->n_req = 0;
244
245         newest_before = mdev->newest_tle;
246         /* never send a barrier number == 0, because that is special-cased
247          * when using TCQ for our write ordering code */
248         new->br_number = (newest_before->br_number+1) ?: 1;
249         if (mdev->newest_tle != new) {
250                 mdev->newest_tle->next = new;
251                 mdev->newest_tle = new;
252         }
253 }
254
255 /**
256  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
257  * @mdev:       DRBD device.
258  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
259  * @set_size:   Expected number of requests before that barrier.
260  *
261  * In case the passed barrier_nr or set_size does not match the oldest
262  * &struct drbd_tl_epoch object, this function will cause a termination
263  * of the connection.
264  */
265 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
266                        unsigned int set_size)
267 {
268         struct drbd_tl_epoch *b, *nob; /* next old barrier */
269         struct list_head *le, *tle;
270         struct drbd_request *r;
271
272         spin_lock_irq(&mdev->req_lock);
273
274         b = mdev->oldest_tle;
275
276         /* first some paranoia code */
277         if (b == NULL) {
278                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
279                         barrier_nr);
280                 goto bail;
281         }
282         if (b->br_number != barrier_nr) {
283                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
284                         barrier_nr, b->br_number);
285                 goto bail;
286         }
287         if (b->n_req != set_size) {
288                 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
289                         barrier_nr, set_size, b->n_req);
290                 goto bail;
291         }
292
293         /* Clean up list of requests processed during current epoch */
294         list_for_each_safe(le, tle, &b->requests) {
295                 r = list_entry(le, struct drbd_request, tl_requests);
296                 _req_mod(r, barrier_acked);
297         }
298         /* There could be requests on the list waiting for completion
299            of the write to the local disk. To avoid corruption of the
300            slab's data structures we have to remove the list's head.
301
302            Also there could have been a barrier ack out of sequence, overtaking
303            the write acks - which would be a bug and a violation of write ordering.
304            To avoid deadlocking in case we lose the connection while such requests
305            are still pending, we need some way to find them for
306            _req_mod(connection_lost_while_pending).
307
308            These have been list_move'd to the out_of_sequence_requests list in
309            _req_mod(, barrier_acked) above.
310            */
311         list_del_init(&b->requests);
312
313         nob = b->next;
314         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
315                 _tl_add_barrier(mdev, b);
316                 if (nob)
317                         mdev->oldest_tle = nob;
318                 /* if nob == NULL, b was the only barrier and becomes the new
319                    barrier. Therefore mdev->oldest_tle already points to b */
320         } else {
321                 D_ASSERT(nob != NULL);
322                 mdev->oldest_tle = nob;
323                 kfree(b);
324         }
325
326         spin_unlock_irq(&mdev->req_lock);
327         dec_ap_pending(mdev);
328
329         return;
330
331 bail:
332         spin_unlock_irq(&mdev->req_lock);
333         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334 }
335
336
337 /**
338  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
339  * @mdev:       DRBD device.
340  *
341  * This is called after the connection to the peer was lost. The storage covered
342  * by the requests on the transfer log gets marked as out of sync. Called from the
343  * receiver thread and the worker thread.
344  */
345 void tl_clear(struct drbd_conf *mdev)
346 {
347         struct drbd_tl_epoch *b, *tmp;
348         struct list_head *le, *tle;
349         struct drbd_request *r;
350         int new_initial_bnr = net_random();
351
352         spin_lock_irq(&mdev->req_lock);
353
354         b = mdev->oldest_tle;
355         while (b) {
356                 list_for_each_safe(le, tle, &b->requests) {
357                         r = list_entry(le, struct drbd_request, tl_requests);
358                         /* It would be nice to complete outside of spinlock.
359                          * But this is easier for now. */
360                         _req_mod(r, connection_lost_while_pending);
361                 }
362                 tmp = b->next;
363
364                 /* there could still be requests on that ring list,
365                  * in case local io is still pending */
366                 list_del(&b->requests);
367
368                 /* dec_ap_pending corresponding to queue_barrier.
369                  * the newest barrier may not have been queued yet,
370                  * in which case w.cb is still NULL. */
371                 if (b->w.cb != NULL)
372                         dec_ap_pending(mdev);
373
374                 if (b == mdev->newest_tle) {
375                         /* recycle, but reinit! */
376                         D_ASSERT(tmp == NULL);
377                         INIT_LIST_HEAD(&b->requests);
378                         INIT_LIST_HEAD(&b->w.list);
379                         b->w.cb = NULL;
380                         b->br_number = new_initial_bnr;
381                         b->n_req = 0;
382
383                         mdev->oldest_tle = b;
384                         break;
385                 }
386                 kfree(b);
387                 b = tmp;
388         }
389
390         /* we expect this list to be empty. */
391         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
392
393         /* but just in case, clean it up anyway! */
394         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
395                 r = list_entry(le, struct drbd_request, tl_requests);
396                 /* It would be nice to complete outside of spinlock.
397                  * But this is easier for now. */
398                 _req_mod(r, connection_lost_while_pending);
399         }
400
401         /* ensure that the bit indicating that a barrier is required is cleared */
402         clear_bit(CREATE_BARRIER, &mdev->flags);
403
404         spin_unlock_irq(&mdev->req_lock);
405 }
406
407 /**
408  * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
409  * @mdev:       DRBD device.
410  * @os:         old (current) state.
411  * @ns:         new (wanted) state.
412  */
413 static int cl_wide_st_chg(struct drbd_conf *mdev,
414                           union drbd_state os, union drbd_state ns)
415 {
416         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
417                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
418                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
419                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
420                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
421                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
422                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
423 }
424
425 int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
426                       union drbd_state mask, union drbd_state val)
427 {
428         unsigned long flags;
429         union drbd_state os, ns;
430         int rv;
431
432         spin_lock_irqsave(&mdev->req_lock, flags);
433         os = mdev->state;
434         ns.i = (os.i & ~mask.i) | val.i;
435         rv = _drbd_set_state(mdev, ns, f, NULL);
436         ns = mdev->state;
437         spin_unlock_irqrestore(&mdev->req_lock, flags);
438
439         return rv;
440 }
441
442 /**
443  * drbd_force_state() - Impose a change which happens outside our control on our state
444  * @mdev:       DRBD device.
445  * @mask:       mask of state bits to change.
446  * @val:        value of new state bits.
447  */
448 void drbd_force_state(struct drbd_conf *mdev,
449         union drbd_state mask, union drbd_state val)
450 {
451         drbd_change_state(mdev, CS_HARD, mask, val);
452 }
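/* Usage note: callers build the mask/val pair with the NS() helper macros,
 * e.g. drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)) as seen in
 * tl_release() above.  Only the bits selected by mask are replaced in
 * mdev->state (ns.i = (os.i & ~mask.i) | val.i); all other state bits are
 * carried over unchanged. */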
453
454 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
455 static int is_valid_state_transition(struct drbd_conf *,
456                                      union drbd_state, union drbd_state);
457 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
458                                        union drbd_state ns, int *warn_sync_abort);
459 int drbd_send_state_req(struct drbd_conf *,
460                         union drbd_state, union drbd_state);
461
462 static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
463                                     union drbd_state mask, union drbd_state val)
464 {
465         union drbd_state os, ns;
466         unsigned long flags;
467         int rv;
468
469         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
470                 return SS_CW_SUCCESS;
471
472         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
473                 return SS_CW_FAILED_BY_PEER;
474
475         rv = 0;
476         spin_lock_irqsave(&mdev->req_lock, flags);
477         os = mdev->state;
478         ns.i = (os.i & ~mask.i) | val.i;
479         ns = sanitize_state(mdev, os, ns, NULL);
480
481         if (!cl_wide_st_chg(mdev, os, ns))
482                 rv = SS_CW_NO_NEED;
483         if (!rv) {
484                 rv = is_valid_state(mdev, ns);
485                 if (rv == SS_SUCCESS) {
486                         rv = is_valid_state_transition(mdev, ns, os);
487                         if (rv == SS_SUCCESS)
488                                 rv = 0; /* 0 = continue waiting, otherwise fail. */
489                 }
490         }
491         spin_unlock_irqrestore(&mdev->req_lock, flags);
492
493         return rv;
494 }
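/* Note: _req_st_cond() is the condition evaluated by the wait_event() in
 * drbd_req_state() below; a return value of 0 keeps the caller waiting for
 * the peer's reply, any other value ends the wait with that status code. */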
495
496 /**
497  * drbd_req_state() - Perform a possibly cluster-wide state change
498  * @mdev:       DRBD device.
499  * @mask:       mask of state bits to change.
500  * @val:        value of new state bits.
501  * @f:          flags
502  *
503  * Should not be called directly, use drbd_request_state() or
504  * _drbd_request_state().
505  */
506 static int drbd_req_state(struct drbd_conf *mdev,
507                           union drbd_state mask, union drbd_state val,
508                           enum chg_state_flags f)
509 {
510         struct completion done;
511         unsigned long flags;
512         union drbd_state os, ns;
513         int rv;
514
515         init_completion(&done);
516
517         if (f & CS_SERIALIZE)
518                 mutex_lock(&mdev->state_mutex);
519
520         spin_lock_irqsave(&mdev->req_lock, flags);
521         os = mdev->state;
522         ns.i = (os.i & ~mask.i) | val.i;
523         ns = sanitize_state(mdev, os, ns, NULL);
524
525         if (cl_wide_st_chg(mdev, os, ns)) {
526                 rv = is_valid_state(mdev, ns);
527                 if (rv == SS_SUCCESS)
528                         rv = is_valid_state_transition(mdev, ns, os);
529                 spin_unlock_irqrestore(&mdev->req_lock, flags);
530
531                 if (rv < SS_SUCCESS) {
532                         if (f & CS_VERBOSE)
533                                 print_st_err(mdev, os, ns, rv);
534                         goto abort;
535                 }
536
537                 drbd_state_lock(mdev);
538                 if (!drbd_send_state_req(mdev, mask, val)) {
539                         drbd_state_unlock(mdev);
540                         rv = SS_CW_FAILED_BY_PEER;
541                         if (f & CS_VERBOSE)
542                                 print_st_err(mdev, os, ns, rv);
543                         goto abort;
544                 }
545
546                 wait_event(mdev->state_wait,
547                         (rv = _req_st_cond(mdev, mask, val)));
548
549                 if (rv < SS_SUCCESS) {
550                         drbd_state_unlock(mdev);
551                         if (f & CS_VERBOSE)
552                                 print_st_err(mdev, os, ns, rv);
553                         goto abort;
554                 }
555                 spin_lock_irqsave(&mdev->req_lock, flags);
556                 os = mdev->state;
557                 ns.i = (os.i & ~mask.i) | val.i;
558                 rv = _drbd_set_state(mdev, ns, f, &done);
559                 drbd_state_unlock(mdev);
560         } else {
561                 rv = _drbd_set_state(mdev, ns, f, &done);
562         }
563
564         spin_unlock_irqrestore(&mdev->req_lock, flags);
565
566         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
567                 D_ASSERT(current != mdev->worker.task);
568                 wait_for_completion(&done);
569         }
570
571 abort:
572         if (f & CS_SERIALIZE)
573                 mutex_unlock(&mdev->state_mutex);
574
575         return rv;
576 }
577
578 /**
579  * _drbd_request_state() - Request a state change (with flags)
580  * @mdev:       DRBD device.
581  * @mask:       mask of state bits to change.
582  * @val:        value of new state bits.
583  * @f:          flags
584  *
585  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
586  * flag, or when logging of failed state change requests is not desired.
587  */
588 int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
589                         union drbd_state val,   enum chg_state_flags f)
590 {
591         int rv;
592
593         wait_event(mdev->state_wait,
594                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
595
596         return rv;
597 }
598
599 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
600 {
601         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
602             name,
603             drbd_conn_str(ns.conn),
604             drbd_role_str(ns.role),
605             drbd_role_str(ns.peer),
606             drbd_disk_str(ns.disk),
607             drbd_disk_str(ns.pdsk),
608             ns.susp ? 's' : 'r',
609             ns.aftr_isp ? 'a' : '-',
610             ns.peer_isp ? 'p' : '-',
611             ns.user_isp ? 'u' : '-'
612             );
613 }
614
615 void print_st_err(struct drbd_conf *mdev,
616         union drbd_state os, union drbd_state ns, int err)
617 {
618         if (err == SS_IN_TRANSIENT_STATE)
619                 return;
620         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
621         print_st(mdev, " state", os);
622         print_st(mdev, "wanted", ns);
623 }
624
625
626 #define drbd_peer_str drbd_role_str
627 #define drbd_pdsk_str drbd_disk_str
628
629 #define drbd_susp_str(A)     ((A) ? "1" : "0")
630 #define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
631 #define drbd_peer_isp_str(A) ((A) ? "1" : "0")
632 #define drbd_user_isp_str(A) ((A) ? "1" : "0")
633
634 #define PSC(A) \
635         ({ if (ns.A != os.A) { \
636                 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
637                               drbd_##A##_str(os.A), \
638                               drbd_##A##_str(ns.A)); \
639         } })
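/* Illustrative note: PSC() appends one "field( old -> new )" chunk to the
 * report string assembled in __drbd_set_state() below, so a state change is
 * logged along the lines of (example output, not copied from a real log):
 *
 *	role( Secondary -> Primary ) disk( Inconsistent -> UpToDate )
 */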
640
641 /**
642  * is_valid_state() - Returns an SS_ error code if ns is not valid
643  * @mdev:       DRBD device.
644  * @ns:         State to consider.
645  */
646 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
647 {
648         /* See drbd_state_sw_errors in drbd_strings.c */
649
650         enum drbd_fencing_p fp;
651         int rv = SS_SUCCESS;
652
653         fp = FP_DONT_CARE;
654         if (get_ldev(mdev)) {
655                 fp = mdev->ldev->dc.fencing;
656                 put_ldev(mdev);
657         }
658
659         if (get_net_conf(mdev)) {
660                 if (!mdev->net_conf->two_primaries &&
661                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
662                         rv = SS_TWO_PRIMARIES;
663                 put_net_conf(mdev);
664         }
665
666         if (rv <= 0)
667                 /* already found a reason to abort */;
668         else if (ns.role == R_SECONDARY && mdev->open_cnt)
669                 rv = SS_DEVICE_IN_USE;
670
671         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
672                 rv = SS_NO_UP_TO_DATE_DISK;
673
674         else if (fp >= FP_RESOURCE &&
675                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
676                 rv = SS_PRIMARY_NOP;
677
678         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
679                 rv = SS_NO_UP_TO_DATE_DISK;
680
681         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
682                 rv = SS_NO_LOCAL_DISK;
683
684         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685                 rv = SS_NO_REMOTE_DISK;
686
687         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
688                 rv = SS_NO_UP_TO_DATE_DISK;
689
690         else if ((ns.conn == C_CONNECTED ||
691                   ns.conn == C_WF_BITMAP_S ||
692                   ns.conn == C_SYNC_SOURCE ||
693                   ns.conn == C_PAUSED_SYNC_S) &&
694                   ns.disk == D_OUTDATED)
695                 rv = SS_CONNECTED_OUTDATES;
696
697         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
698                  (mdev->sync_conf.verify_alg[0] == 0))
699                 rv = SS_NO_VERIFY_ALG;
700
701         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
702                   mdev->agreed_pro_version < 88)
703                 rv = SS_NOT_SUPPORTED;
704
705         return rv;
706 }
707
708 /**
709  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
710  * @mdev:       DRBD device.
711  * @ns:         new state.
712  * @os:         old state.
713  */
714 static int is_valid_state_transition(struct drbd_conf *mdev,
715                                      union drbd_state ns, union drbd_state os)
716 {
717         int rv = SS_SUCCESS;
718
719         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
720             os.conn > C_CONNECTED)
721                 rv = SS_RESYNC_RUNNING;
722
723         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
724                 rv = SS_ALREADY_STANDALONE;
725
726         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
727                 rv = SS_IS_DISKLESS;
728
729         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
730                 rv = SS_NO_NET_CONFIG;
731
732         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
733                 rv = SS_LOWER_THAN_OUTDATED;
734
735         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
736                 rv = SS_IN_TRANSIENT_STATE;
737
738         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
739                 rv = SS_IN_TRANSIENT_STATE;
740
741         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
742                 rv = SS_NEED_CONNECTION;
743
744         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
745             ns.conn != os.conn && os.conn > C_CONNECTED)
746                 rv = SS_RESYNC_RUNNING;
747
748         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
749             os.conn < C_CONNECTED)
750                 rv = SS_NEED_CONNECTION;
751
752         return rv;
753 }
754
755 /**
756  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
757  * @mdev:       DRBD device.
758  * @os:         old state.
759  * @ns:         new state.
760  * @warn_sync_abort: if not NULL, set to 1 when the resulting state implies aborting a running resync.
761  *
762  * When we lose the connection, we have to set the state of the peer's disk (pdsk)
763  * to D_UNKNOWN. This rule and many more along those lines are in this function.
764  */
765 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
766                                        union drbd_state ns, int *warn_sync_abort)
767 {
768         enum drbd_fencing_p fp;
769
770         fp = FP_DONT_CARE;
771         if (get_ldev(mdev)) {
772                 fp = mdev->ldev->dc.fencing;
773                 put_ldev(mdev);
774         }
775
776         /* A network error must not implicitly configure a device's unconfigured (or disconnecting) network part */
777         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
778             os.conn <= C_DISCONNECTING)
779                 ns.conn = os.conn;
780
781         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
782         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
783             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
784                 ns.conn = os.conn;
785
786         /* After C_DISCONNECTING only C_STANDALONE may follow */
787         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
788                 ns.conn = os.conn;
789
790         if (ns.conn < C_CONNECTED) {
791                 ns.peer_isp = 0;
792                 ns.peer = R_UNKNOWN;
793                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
794                         ns.pdsk = D_UNKNOWN;
795         }
796
797         /* Clear the aftr_isp when becoming unconfigured */
798         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
799                 ns.aftr_isp = 0;
800
801         if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
802                 ns.pdsk = D_UNKNOWN;
803
804         /* Abort resync if a disk fails/detaches */
805         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
806             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
807                 if (warn_sync_abort)
808                         *warn_sync_abort = 1;
809                 ns.conn = C_CONNECTED;
810         }
811
812         if (ns.conn >= C_CONNECTED &&
813             ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
814              (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
815                 switch (ns.conn) {
816                 case C_WF_BITMAP_T:
817                 case C_PAUSED_SYNC_T:
818                         ns.disk = D_OUTDATED;
819                         break;
820                 case C_CONNECTED:
821                 case C_WF_BITMAP_S:
822                 case C_SYNC_SOURCE:
823                 case C_PAUSED_SYNC_S:
824                         ns.disk = D_UP_TO_DATE;
825                         break;
826                 case C_SYNC_TARGET:
827                         ns.disk = D_INCONSISTENT;
828                         dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
829                         break;
830                 }
831                 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
832                         dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
833         }
834
835         if (ns.conn >= C_CONNECTED &&
836             (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
837                 switch (ns.conn) {
838                 case C_CONNECTED:
839                 case C_WF_BITMAP_T:
840                 case C_PAUSED_SYNC_T:
841                 case C_SYNC_TARGET:
842                         ns.pdsk = D_UP_TO_DATE;
843                         break;
844                 case C_WF_BITMAP_S:
845                 case C_PAUSED_SYNC_S:
846                         /* remap any consistent state to D_OUTDATED,
847                          * but disallow "upgrade" of not even consistent states.
848                          */
849                         ns.pdsk =
850                                 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
851                                 ? os.pdsk : D_OUTDATED;
852                         break;
853                 case C_SYNC_SOURCE:
854                         ns.pdsk = D_INCONSISTENT;
855                         dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
856                         break;
857                 }
858                 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
859                         dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
860         }
861
862         /* Connection breaks down before we finished "Negotiating" */
863         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
864             get_ldev_if_state(mdev, D_NEGOTIATING)) {
865                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
866                         ns.disk = mdev->new_state_tmp.disk;
867                         ns.pdsk = mdev->new_state_tmp.pdsk;
868                 } else {
869                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
870                         ns.disk = D_DISKLESS;
871                         ns.pdsk = D_UNKNOWN;
872                 }
873                 put_ldev(mdev);
874         }
875
876         if (fp == FP_STONITH &&
877             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
878             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
879                 ns.susp = 1;
880
881         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
882                 if (ns.conn == C_SYNC_SOURCE)
883                         ns.conn = C_PAUSED_SYNC_S;
884                 if (ns.conn == C_SYNC_TARGET)
885                         ns.conn = C_PAUSED_SYNC_T;
886         } else {
887                 if (ns.conn == C_PAUSED_SYNC_S)
888                         ns.conn = C_SYNC_SOURCE;
889                 if (ns.conn == C_PAUSED_SYNC_T)
890                         ns.conn = C_SYNC_TARGET;
891         }
892
893         return ns;
894 }
895
896 /* helper for __drbd_set_state */
897 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
898 {
899         if (cs == C_VERIFY_T) {
900                 /* starting online verify from an arbitrary position
901                  * does not fit well into the existing protocol.
902                  * on C_VERIFY_T, we initialize ov_left and friends
903                  * implicitly in receive_DataRequest once the
904                  * first P_OV_REQUEST is received */
905                 mdev->ov_start_sector = ~(sector_t)0;
906         } else {
907                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
908                 if (bit >= mdev->rs_total)
909                         mdev->ov_start_sector =
910                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
911                 mdev->ov_position = mdev->ov_start_sector;
912         }
913 }
914
915 /**
916  * __drbd_set_state() - Set a new DRBD state
917  * @mdev:       DRBD device.
918  * @ns:         new state.
919  * @flags:      Flags
920  * @done:       Optional completion that will be completed after after_state_ch() has finished
921  *
922  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
923  */
924 int __drbd_set_state(struct drbd_conf *mdev,
925                     union drbd_state ns, enum chg_state_flags flags,
926                     struct completion *done)
927 {
928         union drbd_state os;
929         int rv = SS_SUCCESS;
930         int warn_sync_abort = 0;
931         struct after_state_chg_work *ascw;
932
933         os = mdev->state;
934
935         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
936
937         if (ns.i == os.i)
938                 return SS_NOTHING_TO_DO;
939
940         if (!(flags & CS_HARD)) {
941                 /*  pre-state-change checks ; only look at ns  */
942                 /* See drbd_state_sw_errors in drbd_strings.c */
943
944                 rv = is_valid_state(mdev, ns);
945                 if (rv < SS_SUCCESS) {
946                         /* If the old state was illegal as well, then let
947                            this happen...*/
948
949                         if (is_valid_state(mdev, os) == rv) {
950                                 dev_err(DEV, "Considering state change from bad state. "
951                                     "Error would be: '%s'\n",
952                                     drbd_set_st_err_str(rv));
953                                 print_st(mdev, "old", os);
954                                 print_st(mdev, "new", ns);
955                                 rv = is_valid_state_transition(mdev, ns, os);
956                         }
957                 } else
958                         rv = is_valid_state_transition(mdev, ns, os);
959         }
960
961         if (rv < SS_SUCCESS) {
962                 if (flags & CS_VERBOSE)
963                         print_st_err(mdev, os, ns, rv);
964                 return rv;
965         }
966
967         if (warn_sync_abort)
968                 dev_warn(DEV, "Resync aborted.\n");
969
970         {
971                 char *pbp, pb[300];
972                 pbp = pb;
973                 *pbp = 0;
974                 PSC(role);
975                 PSC(peer);
976                 PSC(conn);
977                 PSC(disk);
978                 PSC(pdsk);
979                 PSC(susp);
980                 PSC(aftr_isp);
981                 PSC(peer_isp);
982                 PSC(user_isp);
983                 dev_info(DEV, "%s\n", pb);
984         }
985
986         /* solve the race between becoming unconfigured,
987          * worker doing the cleanup, and
988          * admin reconfiguring us:
989          * on (re)configure, first set CONFIG_PENDING,
990          * then wait for a potentially exiting worker,
991          * start the worker, and schedule one no_op.
992          * then proceed with configuration.
993          */
994         if (ns.disk == D_DISKLESS &&
995             ns.conn == C_STANDALONE &&
996             ns.role == R_SECONDARY &&
997             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
998                 set_bit(DEVICE_DYING, &mdev->flags);
999
1000         mdev->state.i = ns.i;
1001         wake_up(&mdev->misc_wait);
1002         wake_up(&mdev->state_wait);
1003
1004         /*   post-state-change actions   */
1005         if (os.conn >= C_SYNC_SOURCE   && ns.conn <= C_CONNECTED) {
1006                 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1007                 mod_timer(&mdev->resync_timer, jiffies);
1008         }
1009
1010         /* aborted verify run. log the last position */
1011         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1012             ns.conn < C_CONNECTED) {
1013                 mdev->ov_start_sector =
1014                         BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1015                 dev_info(DEV, "Online Verify reached sector %llu\n",
1016                         (unsigned long long)mdev->ov_start_sector);
1017         }
1018
1019         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1020             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1021                 dev_info(DEV, "Syncer continues.\n");
1022                 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1023                 if (ns.conn == C_SYNC_TARGET) {
1024                         if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1025                                 mod_timer(&mdev->resync_timer, jiffies);
1026                         /* This if (!test_bit) is only needed for the case
1027                            that a device that has ceased to use its timer,
1028                            i.e. it is already in drbd_resync_finished(), gets
1029                            paused and resumed. */
1030                 }
1031         }
1032
1033         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1034             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1035                 dev_info(DEV, "Resync suspended\n");
1036                 mdev->rs_mark_time = jiffies;
1037                 if (ns.conn == C_PAUSED_SYNC_T)
1038                         set_bit(STOP_SYNC_TIMER, &mdev->flags);
1039         }
1040
1041         if (os.conn == C_CONNECTED &&
1042             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1043                 mdev->ov_position = 0;
1044                 mdev->rs_total =
1045                 mdev->rs_mark_left = drbd_bm_bits(mdev);
1046                 if (mdev->agreed_pro_version >= 90)
1047                         set_ov_position(mdev, ns.conn);
1048                 else
1049                         mdev->ov_start_sector = 0;
1050                 mdev->ov_left = mdev->rs_total
1051                               - BM_SECT_TO_BIT(mdev->ov_position);
1052                 mdev->rs_start     =
1053                 mdev->rs_mark_time = jiffies;
1054                 mdev->ov_last_oos_size = 0;
1055                 mdev->ov_last_oos_start = 0;
1056
1057                 if (ns.conn == C_VERIFY_S) {
1058                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1059                                         (unsigned long long)mdev->ov_position);
1060                         mod_timer(&mdev->resync_timer, jiffies);
1061                 }
1062         }
1063
1064         if (get_ldev(mdev)) {
1065                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1066                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1067                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1068
1069                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1070                         mdf |= MDF_CRASHED_PRIMARY;
1071                 if (mdev->state.role == R_PRIMARY ||
1072                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1073                         mdf |= MDF_PRIMARY_IND;
1074                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1075                         mdf |= MDF_CONNECTED_IND;
1076                 if (mdev->state.disk > D_INCONSISTENT)
1077                         mdf |= MDF_CONSISTENT;
1078                 if (mdev->state.disk > D_OUTDATED)
1079                         mdf |= MDF_WAS_UP_TO_DATE;
1080                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1081                         mdf |= MDF_PEER_OUT_DATED;
1082                 if (mdf != mdev->ldev->md.flags) {
1083                         mdev->ldev->md.flags = mdf;
1084                         drbd_md_mark_dirty(mdev);
1085                 }
1086                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1087                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1088                 put_ldev(mdev);
1089         }
1090
1091         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1092         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1093             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1094                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1095
1096         /* Receiver should clean up itself */
1097         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1098                 drbd_thread_stop_nowait(&mdev->receiver);
1099
1100         /* Now the receiver finished cleaning up itself, it should die */
1101         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1102                 drbd_thread_stop_nowait(&mdev->receiver);
1103
1104         /* Upon network failure, we need to restart the receiver. */
1105         if (os.conn > C_TEAR_DOWN &&
1106             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1107                 drbd_thread_restart_nowait(&mdev->receiver);
1108
1109         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1110         if (ascw) {
1111                 ascw->os = os;
1112                 ascw->ns = ns;
1113                 ascw->flags = flags;
1114                 ascw->w.cb = w_after_state_ch;
1115                 ascw->done = done;
1116                 drbd_queue_work(&mdev->data.work, &ascw->w);
1117         } else {
1118                 dev_warn(DEV, "Could not kmalloc an ascw\n");
1119         }
1120
1121         return rv;
1122 }
1123
1124 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1125 {
1126         struct after_state_chg_work *ascw =
1127                 container_of(w, struct after_state_chg_work, w);
1128         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1129         if (ascw->flags & CS_WAIT_COMPLETE) {
1130                 D_ASSERT(ascw->done != NULL);
1131                 complete(ascw->done);
1132         }
1133         kfree(ascw);
1134
1135         return 1;
1136 }
1137
1138 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1139 {
1140         if (rv) {
1141                 dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1142                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1143                 return;
1144         }
1145
1146         switch (mdev->state.conn) {
1147         case C_STARTING_SYNC_T:
1148                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1149                 break;
1150         case C_STARTING_SYNC_S:
1151                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1152                 break;
1153         }
1154 }
1155
1156 /**
1157  * after_state_ch() - Perform after-state-change actions that may sleep
1158  * @mdev:       DRBD device.
1159  * @os:         old state.
1160  * @ns:         new state.
1161  * @flags:      Flags
1162  */
1163 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1164                            union drbd_state ns, enum chg_state_flags flags)
1165 {
1166         enum drbd_fencing_p fp;
1167
1168         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1169                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1170                 if (mdev->p_uuid)
1171                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1172         }
1173
1174         fp = FP_DONT_CARE;
1175         if (get_ldev(mdev)) {
1176                 fp = mdev->ldev->dc.fencing;
1177                 put_ldev(mdev);
1178         }
1179
1180         /* Inform userspace about the change... */
1181         drbd_bcast_state(mdev, ns);
1182
1183         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1184             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1185                 drbd_khelper(mdev, "pri-on-incon-degr");
1186
1187         /* Here we have the actions that are performed after a
1188            state change. This function might sleep */
1189
1190         if (fp == FP_STONITH && ns.susp) {
1191                 /* case1: The outdate peer handler is successful:
1192                  * case2: The connection was established again: */
1193                 if ((os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) ||
1194                     (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1195                         tl_clear(mdev);
1196                         spin_lock_irq(&mdev->req_lock);
1197                         _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1198                         spin_unlock_irq(&mdev->req_lock);
1199                 }
1200         }
1201         /* Do not change the order of the if above and the two below... */
1202         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1203                 drbd_send_uuids(mdev);
1204                 drbd_send_state(mdev);
1205         }
1206         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1207                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1208
1209         /* Lost contact to peer's copy of the data */
1210         if ((os.pdsk >= D_INCONSISTENT &&
1211              os.pdsk != D_UNKNOWN &&
1212              os.pdsk != D_OUTDATED)
1213         &&  (ns.pdsk < D_INCONSISTENT ||
1214              ns.pdsk == D_UNKNOWN ||
1215              ns.pdsk == D_OUTDATED)) {
1216                 kfree(mdev->p_uuid);
1217                 mdev->p_uuid = NULL;
1218                 if (get_ldev(mdev)) {
1219                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1220                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE)
1221                                 atomic_set(&mdev->new_c_uuid, 2);
1222                         put_ldev(mdev);
1223                 }
1224         }
1225
1226         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1227                 /* Diskless peer becomes primary, or we got connected to a diskless, primary peer. */
1228                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1229                         atomic_set(&mdev->new_c_uuid, 2);
1230
1231                 /* D_DISKLESS Peer becomes secondary */
1232                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1233                         drbd_al_to_on_disk_bm(mdev);
1234                 put_ldev(mdev);
1235         }
1236
1237         /* Last part of the attaching process ... */
1238         if (ns.conn >= C_CONNECTED &&
1239             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1240                 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1241                 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1242                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1243                 drbd_send_uuids(mdev);
1244                 drbd_send_state(mdev);
1245         }
1246
1247         /* We want to pause/continue resync, tell peer. */
1248         if (ns.conn >= C_CONNECTED &&
1249              ((os.aftr_isp != ns.aftr_isp) ||
1250               (os.user_isp != ns.user_isp)))
1251                 drbd_send_state(mdev);
1252
1253         /* In case one of the isp bits got set, suspend other devices. */
1254         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1255             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1256                 suspend_other_sg(mdev);
1257
1258         /* Make sure the peer gets informed about any state
1259            changes (ISP bits) that happened while we were in WFReportParams. */
1260         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1261                 drbd_send_state(mdev);
1262
1263         /* We are in the process of starting a full sync... */
1264         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1265             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1266                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1267
1268         /* We are invalidating ourselves... */
1269         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1270             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1271                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1272
1273         if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1274                 enum drbd_io_error_p eh;
1275
1276                 eh = EP_PASS_ON;
1277                 if (get_ldev_if_state(mdev, D_FAILED)) {
1278                         eh = mdev->ldev->dc.on_io_error;
1279                         put_ldev(mdev);
1280                 }
1281
1282                 drbd_rs_cancel_all(mdev);
1283                 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1284                    and it is D_DISKLESS here, local_cnt can only go down, it can
1285                    not increase... It will reach zero */
1286                 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1287                 mdev->rs_total = 0;
1288                 mdev->rs_failed = 0;
1289                 atomic_set(&mdev->rs_pending_cnt, 0);
1290
1291                 spin_lock_irq(&mdev->req_lock);
1292                 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1293                 spin_unlock_irq(&mdev->req_lock);
1294
1295                 if (eh == EP_CALL_HELPER)
1296                         drbd_khelper(mdev, "local-io-error");
1297         }
1298
1299         if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1300
1301                 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1302                         if (drbd_send_state(mdev))
1303                                 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1304                         else
1305                                 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1306                 }
1307
1308                 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1309                 lc_destroy(mdev->resync);
1310                 mdev->resync = NULL;
1311                 lc_destroy(mdev->act_log);
1312                 mdev->act_log = NULL;
1313                 __no_warn(local,
1314                         drbd_free_bc(mdev->ldev);
1315                         mdev->ldev = NULL;);
1316
1317                 if (mdev->md_io_tmpp)
1318                         __free_page(mdev->md_io_tmpp);
1319         }
1320
1321         /* Disks got bigger while they were detached */
1322         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1323             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1324                 if (ns.conn == C_CONNECTED)
1325                         resync_after_online_grow(mdev);
1326         }
1327
1328         /* A resync finished or aborted, wake paused devices... */
1329         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1330             (os.peer_isp && !ns.peer_isp) ||
1331             (os.user_isp && !ns.user_isp))
1332                 resume_next_sg(mdev);
1333
1334         /* Upon network connection, we need to start the receiver */
1335         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1336                 drbd_thread_start(&mdev->receiver);
1337
1338         /* Terminate worker thread if we are unconfigured - it will be
1339            restarted as needed... */
1340         if (ns.disk == D_DISKLESS &&
1341             ns.conn == C_STANDALONE &&
1342             ns.role == R_SECONDARY) {
1343                 if (os.aftr_isp != ns.aftr_isp)
1344                         resume_next_sg(mdev);
1345                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1346                 if (test_bit(DEVICE_DYING, &mdev->flags))
1347                         drbd_thread_stop_nowait(&mdev->worker);
1348         }
1349
1350         drbd_md_sync(mdev);
1351 }
1352
1353 static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1354 {
1355         if (get_ldev(mdev)) {
1356                 drbd_uuid_new_current(mdev);
1357                 drbd_send_uuids(mdev);
1358                 drbd_md_sync(mdev);
1359                 put_ldev(mdev);
1360         }
1361         atomic_dec(&mdev->new_c_uuid);
1362         wake_up(&mdev->misc_wait);
1363
1364         return 1;
1365 }
1366
1367 static int drbd_thread_setup(void *arg)
1368 {
1369         struct drbd_thread *thi = (struct drbd_thread *) arg;
1370         struct drbd_conf *mdev = thi->mdev;
1371         unsigned long flags;
1372         int retval;
1373
1374 restart:
1375         retval = thi->function(thi);
1376
1377         spin_lock_irqsave(&thi->t_lock, flags);
1378
1379         /* if the receiver has been "Exiting", the last thing it did
1380          * was set the conn state to "StandAlone",
1381          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1382          * and receiver thread will be "started".
1383          * drbd_thread_start needs to set "Restarting" in that case.
1384          * t_state check and assignment need to be within the same spinlock,
1385          * so either thread_start sees Exiting, and can remap to Restarting,
1386          * or thread_start sees None, and can proceed as normal.
1387          */
1388
1389         if (thi->t_state == Restarting) {
1390                 dev_info(DEV, "Restarting %s\n", current->comm);
1391                 thi->t_state = Running;
1392                 spin_unlock_irqrestore(&thi->t_lock, flags);
1393                 goto restart;
1394         }
1395
1396         thi->task = NULL;
1397         thi->t_state = None;
1398         smp_mb();
1399         complete(&thi->stop);
1400         spin_unlock_irqrestore(&thi->t_lock, flags);
1401
1402         dev_info(DEV, "Terminating %s\n", current->comm);
1403
1404         /* Release mod reference taken when thread was started */
1405         module_put(THIS_MODULE);
1406         return retval;
1407 }
1408
1409 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1410                       int (*func) (struct drbd_thread *))
1411 {
1412         spin_lock_init(&thi->t_lock);
1413         thi->task    = NULL;
1414         thi->t_state = None;
1415         thi->function = func;
1416         thi->mdev = mdev;
1417 }
1418
1419 int drbd_thread_start(struct drbd_thread *thi)
1420 {
1421         struct drbd_conf *mdev = thi->mdev;
1422         struct task_struct *nt;
1423         unsigned long flags;
1424
1425         const char *me =
1426                 thi == &mdev->receiver ? "receiver" :
1427                 thi == &mdev->asender  ? "asender"  :
1428                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1429
1430         /* is used from state engine doing drbd_thread_stop_nowait,
1431          * while holding the req lock irqsave */
1432         spin_lock_irqsave(&thi->t_lock, flags);
1433
1434         switch (thi->t_state) {
1435         case None:
1436                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1437                                 me, current->comm, current->pid);
1438
1439                 /* Get ref on module for thread - this is released when thread exits */
1440                 if (!try_module_get(THIS_MODULE)) {
1441                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1442                         spin_unlock_irqrestore(&thi->t_lock, flags);
1443                         return FALSE;
1444                 }
1445
1446                 init_completion(&thi->stop);
1447                 D_ASSERT(thi->task == NULL);
1448                 thi->reset_cpu_mask = 1;
1449                 thi->t_state = Running;
1450                 spin_unlock_irqrestore(&thi->t_lock, flags);
1451                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1452
1453                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1454                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1455
1456                 if (IS_ERR(nt)) {
1457                         dev_err(DEV, "Couldn't start thread\n");
1458
1459                         module_put(THIS_MODULE);
1460                         return FALSE;
1461                 }
1462                 spin_lock_irqsave(&thi->t_lock, flags);
1463                 thi->task = nt;
1464                 thi->t_state = Running;
1465                 spin_unlock_irqrestore(&thi->t_lock, flags);
1466                 wake_up_process(nt);
1467                 break;
1468         case Exiting:
1469                 thi->t_state = Restarting;
1470                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1471                                 me, current->comm, current->pid);
1472                 /* fall through */
1473         case Running:
1474         case Restarting:
1475         default:
1476                 spin_unlock_irqrestore(&thi->t_lock, flags);
1477                 break;
1478         }
1479
1480         return TRUE;
1481 }
1482
1483
1484 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1485 {
1486         unsigned long flags;
1487
1488         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1489
1490         /* may be called from state engine, holding the req lock irqsave */
1491         spin_lock_irqsave(&thi->t_lock, flags);
1492
1493         if (thi->t_state == None) {
1494                 spin_unlock_irqrestore(&thi->t_lock, flags);
1495                 if (restart)
1496                         drbd_thread_start(thi);
1497                 return;
1498         }
1499
1500         if (thi->t_state != ns) {
1501                 if (thi->task == NULL) {
1502                         spin_unlock_irqrestore(&thi->t_lock, flags);
1503                         return;
1504                 }
1505
1506                 thi->t_state = ns;
1507                 smp_mb();
1508                 init_completion(&thi->stop);
1509                 if (thi->task != current)
1510                         force_sig(DRBD_SIGKILL, thi->task);
1511
1512         }
1513
1514         spin_unlock_irqrestore(&thi->t_lock, flags);
1515
1516         if (wait)
1517                 wait_for_completion(&thi->stop);
1518 }
1519
1520 #ifdef CONFIG_SMP
1521 /**
1522  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1523  * @mdev:       DRBD device.
1524  *
1525  * Forces all threads of a device onto the same CPU. This is beneficial for
1526  * DRBD's performance. May be overridden by the user's configuration.
1527  */
1528 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1529 {
1530         int ord, cpu;
1531
1532         /* user override. */
1533         if (cpumask_weight(mdev->cpu_mask))
1534                 return;
1535
1536         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1537         for_each_online_cpu(cpu) {
1538                 if (ord-- == 0) {
1539                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1540                         return;
1541                 }
1542         }
1543         /* should not be reached */
1544         cpumask_setall(mdev->cpu_mask);
1545 }
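
/*
 * Illustrative sketch only (not part of the driver): the spreading
 * scheme above simply takes the minor number modulo the number of
 * online CPUs, so with 4 CPUs online, minors 0..7 end up on CPU
 * ordinals 0,1,2,3,0,1,2,3.  The helper name is hypothetical; the real
 * function above pins the mask to the cpu_online_mask entry with that
 * ordinal.
 */
static inline unsigned int drbd_sketch_cpu_ord(unsigned int minor,
                                               unsigned int nr_online_cpus)
{
        return minor % nr_online_cpus;
}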
1546
1547 /**
1548  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1549  * @mdev:       DRBD device.
1550  *
1551  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1552  * prematurely.
1553  */
1554 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1555 {
1556         struct task_struct *p = current;
1557         struct drbd_thread *thi =
1558                 p == mdev->asender.task  ? &mdev->asender  :
1559                 p == mdev->receiver.task ? &mdev->receiver :
1560                 p == mdev->worker.task   ? &mdev->worker   :
1561                 NULL;
1562         ERR_IF(thi == NULL)
1563                 return;
1564         if (!thi->reset_cpu_mask)
1565                 return;
1566         thi->reset_cpu_mask = 0;
1567         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1568 }
1569 #endif
1570
1571 /* the appropriate socket mutex must be held already */
1572 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1573                           enum drbd_packets cmd, struct p_header *h,
1574                           size_t size, unsigned msg_flags)
1575 {
1576         int sent, ok;
1577
1578         ERR_IF(!h) return FALSE;
1579         ERR_IF(!size) return FALSE;
1580
1581         h->magic   = BE_DRBD_MAGIC;
1582         h->command = cpu_to_be16(cmd);
1583         h->length  = cpu_to_be16(size-sizeof(struct p_header));
1584
1585         sent = drbd_send(mdev, sock, h, size, msg_flags);
1586
1587         ok = (sent == size);
1588         if (!ok)
1589                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1590                     cmdname(cmd), (int)size, sent);
1591         return ok;
1592 }
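
/*
 * Illustrative sketch only: how a caller would prepare a p_header for
 * the convention enforced by _drbd_send_cmd() above -- magic and
 * command go out big-endian, and the length field counts only the
 * payload that follows the header, not the header itself.  The helper
 * name is hypothetical.
 */
static inline void drbd_sketch_prep_header(struct p_header *h,
                                           enum drbd_packets cmd,
                                           size_t packet_size)
{
        h->magic   = BE_DRBD_MAGIC;
        h->command = cpu_to_be16(cmd);
        /* on the wire, length excludes struct p_header */
        h->length  = cpu_to_be16(packet_size - sizeof(struct p_header));
}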
1593
1594 /* don't pass the socket. we may only look at it
1595  * when we hold the appropriate socket mutex.
1596  */
1597 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1598                   enum drbd_packets cmd, struct p_header *h, size_t size)
1599 {
1600         int ok = 0;
1601         struct socket *sock;
1602
1603         if (use_data_socket) {
1604                 mutex_lock(&mdev->data.mutex);
1605                 sock = mdev->data.socket;
1606         } else {
1607                 mutex_lock(&mdev->meta.mutex);
1608                 sock = mdev->meta.socket;
1609         }
1610
1611         /* drbd_disconnect() could have called drbd_free_sock()
1612          * while we were waiting in down()... */
1613         if (likely(sock != NULL))
1614                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1615
1616         if (use_data_socket)
1617                 mutex_unlock(&mdev->data.mutex);
1618         else
1619                 mutex_unlock(&mdev->meta.mutex);
1620         return ok;
1621 }
1622
1623 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1624                    size_t size)
1625 {
1626         struct p_header h;
1627         int ok;
1628
1629         h.magic   = BE_DRBD_MAGIC;
1630         h.command = cpu_to_be16(cmd);
1631         h.length  = cpu_to_be16(size);
1632
1633         if (!drbd_get_data_sock(mdev))
1634                 return 0;
1635
1636         ok = (sizeof(h) ==
1637                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1638         ok = ok && (size ==
1639                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1640
1641         drbd_put_data_sock(mdev);
1642
1643         return ok;
1644 }
1645
1646 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1647 {
1648         struct p_rs_param_89 *p;
1649         struct socket *sock;
1650         int size, rv;
1651         const int apv = mdev->agreed_pro_version;
1652
1653         size = apv <= 87 ? sizeof(struct p_rs_param)
1654                 : apv == 88 ? sizeof(struct p_rs_param)
1655                         + strlen(mdev->sync_conf.verify_alg) + 1
1656                 : /* 89 */    sizeof(struct p_rs_param_89);
1657
1658         /* used from admin command context and receiver/worker context.
1659          * to avoid kmalloc, grab the socket right here,
1660          * then use the pre-allocated sbuf there */
1661         mutex_lock(&mdev->data.mutex);
1662         sock = mdev->data.socket;
1663
1664         if (likely(sock != NULL)) {
1665                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1666
1667                 p = &mdev->data.sbuf.rs_param_89;
1668
1669                 /* initialize verify_alg and csums_alg */
1670                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1671
1672                 p->rate = cpu_to_be32(sc->rate);
1673
1674                 if (apv >= 88)
1675                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1676                 if (apv >= 89)
1677                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1678
1679                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1680         } else
1681                 rv = 0; /* not ok */
1682
1683         mutex_unlock(&mdev->data.mutex);
1684
1685         return rv;
1686 }
1687
1688 int drbd_send_protocol(struct drbd_conf *mdev)
1689 {
1690         struct p_protocol *p;
1691         int size, cf, rv;
1692
1693         size = sizeof(struct p_protocol);
1694
1695         if (mdev->agreed_pro_version >= 87)
1696                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1697
1698         /* we must not recurse into our own queue,
1699          * as that is blocked during handshake */
1700         p = kmalloc(size, GFP_NOIO);
1701         if (p == NULL)
1702                 return 0;
1703
1704         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1705         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1706         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1707         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1708         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1709
1710         cf = 0;
1711         if (mdev->net_conf->want_lose)
1712                 cf |= CF_WANT_LOSE;
1713         if (mdev->net_conf->dry_run) {
1714                 if (mdev->agreed_pro_version >= 92)
1715                         cf |= CF_DRY_RUN;
1716                 else {
1717                         dev_err(DEV, "--dry-run is not supported by peer\n");
1718                         kfree(p);
1719                         return 0;
1720                 }
1721         }
1722         p->conn_flags    = cpu_to_be32(cf);
1723
1724         if (mdev->agreed_pro_version >= 87)
1725                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1726
1727         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1728                            (struct p_header *)p, size);
1729         kfree(p);
1730         return rv;
1731 }
1732
1733 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1734 {
1735         struct p_uuids p;
1736         int i;
1737
1738         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1739                 return 1;
1740
1741         for (i = UI_CURRENT; i < UI_SIZE; i++)
1742                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1743
1744         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1745         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1746         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1747         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1748         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1749         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1750
1751         put_ldev(mdev);
1752
1753         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1754                              (struct p_header *)&p, sizeof(p));
1755 }
1756
1757 int drbd_send_uuids(struct drbd_conf *mdev)
1758 {
1759         return _drbd_send_uuids(mdev, 0);
1760 }
1761
1762 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1763 {
1764         return _drbd_send_uuids(mdev, 8);
1765 }
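
/*
 * Sketch: the literal values or'ed into uuid_flags above, spelled out.
 * The enum and its member names are hypothetical; the values are the
 * ones used in _drbd_send_uuids() (1, 2, 4) and in
 * drbd_send_uuids_skip_initial_sync() (8).
 */
enum drbd_uuid_flags_sketch {
        UUID_FLAG_WANT_LOSE_SKETCH         = 1, /* net_conf->want_lose set */
        UUID_FLAG_CRASHED_PRIMARY_SKETCH   = 2, /* CRASHED_PRIMARY flag set */
        UUID_FLAG_INCONSISTENT_SKETCH      = 4, /* new_state_tmp.disk == D_INCONSISTENT */
        UUID_FLAG_SKIP_INITIAL_SYNC_SKETCH = 8, /* skip-initial-sync variant */
};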
1766
1767
1768 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1769 {
1770         struct p_rs_uuid p;
1771
1772         p.uuid = cpu_to_be64(val);
1773
1774         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1775                              (struct p_header *)&p, sizeof(p));
1776 }
1777
1778 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1779 {
1780         struct p_sizes p;
1781         sector_t d_size, u_size;
1782         int q_order_type;
1783         int ok;
1784
1785         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1786                 D_ASSERT(mdev->ldev->backing_bdev);
1787                 d_size = drbd_get_max_capacity(mdev->ldev);
1788                 u_size = mdev->ldev->dc.disk_size;
1789                 q_order_type = drbd_queue_order_type(mdev);
1790                 put_ldev(mdev);
1791         } else {
1792                 d_size = 0;
1793                 u_size = 0;
1794                 q_order_type = QUEUE_ORDERED_NONE;
1795         }
1796
1797         p.d_size = cpu_to_be64(d_size);
1798         p.u_size = cpu_to_be64(u_size);
1799         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1800         p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1801         p.queue_order_type = cpu_to_be16(q_order_type);
1802         p.dds_flags = cpu_to_be16(flags);
1803
1804         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1805                            (struct p_header *)&p, sizeof(p));
1806         return ok;
1807 }
1808
1809 /**
1810  * drbd_send_state() - Sends the drbd state to the peer
1811  * @mdev:       DRBD device.
1812  */
1813 int drbd_send_state(struct drbd_conf *mdev)
1814 {
1815         struct socket *sock;
1816         struct p_state p;
1817         int ok = 0;
1818
1819         /* Grab state lock so we won't send state if we're in the middle
1820          * of a cluster wide state change on another thread */
1821         drbd_state_lock(mdev);
1822
1823         mutex_lock(&mdev->data.mutex);
1824
1825         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1826         sock = mdev->data.socket;
1827
1828         if (likely(sock != NULL)) {
1829                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1830                                     (struct p_header *)&p, sizeof(p), 0);
1831         }
1832
1833         mutex_unlock(&mdev->data.mutex);
1834
1835         drbd_state_unlock(mdev);
1836         return ok;
1837 }
1838
1839 int drbd_send_state_req(struct drbd_conf *mdev,
1840         union drbd_state mask, union drbd_state val)
1841 {
1842         struct p_req_state p;
1843
1844         p.mask    = cpu_to_be32(mask.i);
1845         p.val     = cpu_to_be32(val.i);
1846
1847         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1848                              (struct p_header *)&p, sizeof(p));
1849 }
1850
1851 int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1852 {
1853         struct p_req_state_reply p;
1854
1855         p.retcode    = cpu_to_be32(retcode);
1856
1857         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1858                              (struct p_header *)&p, sizeof(p));
1859 }
1860
1861 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1862         struct p_compressed_bm *p,
1863         struct bm_xfer_ctx *c)
1864 {
1865         struct bitstream bs;
1866         unsigned long plain_bits;
1867         unsigned long tmp;
1868         unsigned long rl;
1869         unsigned len;
1870         unsigned toggle;
1871         int bits;
1872
1873         /* may we use this feature? */
1874         if ((mdev->sync_conf.use_rle == 0) ||
1875                 (mdev->agreed_pro_version < 90))
1876                         return 0;
1877
1878         if (c->bit_offset >= c->bm_bits)
1879                 return 0; /* nothing to do. */
1880
1881         /* use at most this many bytes */
1882         bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1883         memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1884         /* plain bits covered in this code string */
1885         plain_bits = 0;
1886
1887         /* p->encoding & 0x80 stores whether the first run length is set.
1888          * bit offset is implicit.
1889          * start with toggle == 2 to be able to tell the first iteration */
1890         toggle = 2;
1891
1892         /* see how many plain bits we can stuff into one packet
1893          * using RLE and VLI. */
1894         do {
1895                 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1896                                     : _drbd_bm_find_next(mdev, c->bit_offset);
1897                 if (tmp == -1UL)
1898                         tmp = c->bm_bits;
1899                 rl = tmp - c->bit_offset;
1900
1901                 if (toggle == 2) { /* first iteration */
1902                         if (rl == 0) {
1903                                 /* the first checked bit was set,
1904                                  * store start value, */
1905                                 DCBP_set_start(p, 1);
1906                                 /* but skip encoding of zero run length */
1907                                 toggle = !toggle;
1908                                 continue;
1909                         }
1910                         DCBP_set_start(p, 0);
1911                 }
1912
1913                 /* paranoia: catch zero runlength.
1914                  * can only happen if bitmap is modified while we scan it. */
1915                 if (rl == 0) {
1916                         dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1917                             "t:%u bo:%lu\n", toggle, c->bit_offset);
1918                         return -1;
1919                 }
1920
1921                 bits = vli_encode_bits(&bs, rl);
1922                 if (bits == -ENOBUFS) /* buffer full */
1923                         break;
1924                 if (bits <= 0) {
1925                         dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1926                         return 0;
1927                 }
1928
1929                 toggle = !toggle;
1930                 plain_bits += rl;
1931                 c->bit_offset = tmp;
1932         } while (c->bit_offset < c->bm_bits);
1933
1934         len = bs.cur.b - p->code + !!bs.cur.bit;
1935
1936         if (plain_bits < (len << 3)) {
1937                 /* incompressible with this method.
1938                  * we need to rewind both word and bit position. */
1939                 c->bit_offset -= plain_bits;
1940                 bm_xfer_ctx_bit_to_word_offset(c);
1941                 c->bit_offset = c->word_offset * BITS_PER_LONG;
1942                 return 0;
1943         }
1944
1945         /* RLE + VLI was able to compress it just fine.
1946          * update c->word_offset. */
1947         bm_xfer_ctx_bit_to_word_offset(c);
1948
1949         /* store pad_bits */
1950         DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1951
1952         return len;
1953 }
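
/*
 * Minimal sketch (not used by the driver) of the run-length idea the
 * encoder above is built on: the bitmap is described as alternating
 * runs of clear and set bits, starting with a clear-run -- a set first
 * bit is signalled out of band via DCBP_set_start().  The encoded runs
 * are only kept if they cover at least as many plain bits as the
 * encoding costs (plain_bits >= len * 8); otherwise the plain bitmap
 * words are sent instead.  This sketch walks a plain bit array and
 * does not use the kernel bitmap or VLI helpers; the function name is
 * hypothetical.
 */
static unsigned long drbd_sketch_next_run(const unsigned long *bm,
                                          unsigned long nbits,
                                          unsigned long offset,
                                          int run_is_set)
{
        unsigned long i;

        for (i = offset; i < nbits; i++)
                if (!!test_bit(i, bm) != run_is_set)
                        break;

        return i - offset; /* run length; 0 means "toggle immediately" */
}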
1954
1955 enum { OK, FAILED, DONE }
1956 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1957         struct p_header *h, struct bm_xfer_ctx *c)
1958 {
1959         struct p_compressed_bm *p = (void *)h;
1960         unsigned long num_words;
1961         int len;
1962         int ok;
1963
1964         len = fill_bitmap_rle_bits(mdev, p, c);
1965
1966         if (len < 0)
1967                 return FAILED;
1968
1969         if (len) {
1970                 DCBP_set_code(p, RLE_VLI_Bits);
1971                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1972                         sizeof(*p) + len, 0);
1973
1974                 c->packets[0]++;
1975                 c->bytes[0] += sizeof(*p) + len;
1976
1977                 if (c->bit_offset >= c->bm_bits)
1978                         len = 0; /* DONE */
1979         } else {
1980                 /* was not compressible.
1981                  * send a buffer full of plain text bits instead. */
1982                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1983                 len = num_words * sizeof(long);
1984                 if (len)
1985                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long *)h->payload);
1986                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1987                                    h, sizeof(struct p_header) + len, 0);
1988                 c->word_offset += num_words;
1989                 c->bit_offset = c->word_offset * BITS_PER_LONG;
1990
1991                 c->packets[1]++;
1992                 c->bytes[1] += sizeof(struct p_header) + len;
1993
1994                 if (c->bit_offset > c->bm_bits)
1995                         c->bit_offset = c->bm_bits;
1996         }
1997         ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1998
1999         if (ok == DONE)
2000                 INFO_bm_xfer_stats(mdev, "send", c);
2001         return ok;
2002 }
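
/*
 * Sketch of the tri-state result mapping hidden in the ternary above:
 * a failed send is FAILED, a successful send with nothing left to
 * transfer is DONE, and a successful send with bits remaining is OK
 * ("call me again").  The helper name is hypothetical.
 */
static inline int drbd_sketch_bm_xfer_result(int send_ok, int finished)
{
        if (!send_ok)
                return FAILED;
        return finished ? DONE : OK;
}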
2003
2004 /* See the comment at receive_bitmap() */
2005 int _drbd_send_bitmap(struct drbd_conf *mdev)
2006 {
2007         struct bm_xfer_ctx c;
2008         struct p_header *p;
2009         int ret;
2010
2011         ERR_IF(!mdev->bitmap) return FALSE;
2012
2013         /* maybe we should use some per thread scratch page,
2014          * and allocate that during initial device creation? */
2015         p = (struct p_header *) __get_free_page(GFP_NOIO);
2016         if (!p) {
2017                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2018                 return FALSE;
2019         }
2020
2021         if (get_ldev(mdev)) {
2022                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2023                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2024                         drbd_bm_set_all(mdev);
2025                         if (drbd_bm_write(mdev)) {
2026                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2027                                  * but otherwise process as per normal - need to tell other
2028                                  * side that a full resync is required! */
2029                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2030                         } else {
2031                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2032                                 drbd_md_sync(mdev);
2033                         }
2034                 }
2035                 put_ldev(mdev);
2036         }
2037
2038         c = (struct bm_xfer_ctx) {
2039                 .bm_bits = drbd_bm_bits(mdev),
2040                 .bm_words = drbd_bm_words(mdev),
2041         };
2042
2043         do {
2044                 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2045         } while (ret == OK);
2046
2047         free_page((unsigned long) p);
2048         return (ret == DONE);
2049 }
2050
2051 int drbd_send_bitmap(struct drbd_conf *mdev)
2052 {
2053         int err;
2054
2055         if (!drbd_get_data_sock(mdev))
2056                 return -1;
2057         err = !_drbd_send_bitmap(mdev);
2058         drbd_put_data_sock(mdev);
2059         return err;
2060 }
2061
2062 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2063 {
2064         int ok;
2065         struct p_barrier_ack p;
2066
2067         p.barrier  = barrier_nr;
2068         p.set_size = cpu_to_be32(set_size);
2069
2070         if (mdev->state.conn < C_CONNECTED)
2071                 return FALSE;
2072         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2073                         (struct p_header *)&p, sizeof(p));
2074         return ok;
2075 }
2076
2077 /**
2078  * _drbd_send_ack() - Sends an ack packet
2079  * @mdev:       DRBD device.
2080  * @cmd:        Packet command code.
2081  * @sector:     sector, needs to be in big endian byte order
2082  * @blksize:    size in bytes, needs to be in big endian byte order
2083  * @block_id:   Id, big endian byte order
2084  */
2085 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2086                           u64 sector,
2087                           u32 blksize,
2088                           u64 block_id)
2089 {
2090         int ok;
2091         struct p_block_ack p;
2092
2093         p.sector   = sector;
2094         p.block_id = block_id;
2095         p.blksize  = blksize;
2096         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2097
2098         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2099                 return FALSE;
2100         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2101                                 (struct p_header *)&p, sizeof(p));
2102         return ok;
2103 }
2104
2105 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2106                      struct p_data *dp)
2107 {
2108         const int header_size = sizeof(struct p_data)
2109                               - sizeof(struct p_header);
2110         int data_size  = ((struct p_header *)dp)->length - header_size;
2111
2112         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2113                               dp->block_id);
2114 }
2115
2116 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2117                      struct p_block_req *rp)
2118 {
2119         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2120 }
2121
2122 /**
2123  * drbd_send_ack() - Sends an ack packet
2124  * @mdev:       DRBD device.
2125  * @cmd:        Packet command code.
2126  * @e:          Epoch entry.
2127  */
2128 int drbd_send_ack(struct drbd_conf *mdev,
2129         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2130 {
2131         return _drbd_send_ack(mdev, cmd,
2132                               cpu_to_be64(e->sector),
2133                               cpu_to_be32(e->size),
2134                               e->block_id);
2135 }
2136
2137 /* This function misuses the block_id field to signal if the blocks
2138  * are in sync or not. */
2139 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2140                      sector_t sector, int blksize, u64 block_id)
2141 {
2142         return _drbd_send_ack(mdev, cmd,
2143                               cpu_to_be64(sector),
2144                               cpu_to_be32(blksize),
2145                               cpu_to_be64(block_id));
2146 }
2147
2148 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2149                        sector_t sector, int size, u64 block_id)
2150 {
2151         int ok;
2152         struct p_block_req p;
2153
2154         p.sector   = cpu_to_be64(sector);
2155         p.block_id = block_id;
2156         p.blksize  = cpu_to_be32(size);
2157
2158         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2159                                 (struct p_header *)&p, sizeof(p));
2160         return ok;
2161 }
2162
2163 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2164                             sector_t sector, int size,
2165                             void *digest, int digest_size,
2166                             enum drbd_packets cmd)
2167 {
2168         int ok;
2169         struct p_block_req p;
2170
2171         p.sector   = cpu_to_be64(sector);
2172         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2173         p.blksize  = cpu_to_be32(size);
2174
2175         p.head.magic   = BE_DRBD_MAGIC;
2176         p.head.command = cpu_to_be16(cmd);
2177         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2178
2179         mutex_lock(&mdev->data.mutex);
2180
2181         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2182         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2183
2184         mutex_unlock(&mdev->data.mutex);
2185
2186         return ok;
2187 }
2188
2189 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2190 {
2191         int ok;
2192         struct p_block_req p;
2193
2194         p.sector   = cpu_to_be64(sector);
2195         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2196         p.blksize  = cpu_to_be32(size);
2197
2198         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2199                            (struct p_header *)&p, sizeof(p));
2200         return ok;
2201 }
2202
2203 static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
2204 {
2205         struct p_delay_probe dp;
2206         int offset, ok = 0;
2207         struct timeval now;
2208
2209         mutex_lock(&ds->mutex);
2210         if (likely(ds->socket)) {
2211                 do_gettimeofday(&now);
2212                 offset = now.tv_usec - mdev->dps_time.tv_usec +
2213                          (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
2214                 dp.seq_num  = cpu_to_be32(mdev->delay_seq);
2215                 dp.offset   = cpu_to_be32(offset);
2216
2217                 ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
2218                                     (struct p_header *)&dp, sizeof(dp), 0);
2219         }
2220         mutex_unlock(&ds->mutex);
2221
2222         return ok;
2223 }
2224
2225 static int drbd_send_delay_probes(struct drbd_conf *mdev)
2226 {
2227         int ok;
2228
2229         mdev->delay_seq++;
2230         do_gettimeofday(&mdev->dps_time);
2231         ok = drbd_send_delay_probe(mdev, &mdev->meta);
2232         ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
2233
2234         mdev->dp_volume_last = mdev->send_cnt;
2235         mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
2236
2237         return ok;
2238 }
2239
2240 /* called on sndtimeo
2241  * returns FALSE if we should retry,
2242  * TRUE if we think the connection is dead
2243  */
2244 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2245 {
2246         int drop_it;
2247         /* long elapsed = (long)(jiffies - mdev->last_received); */
2248
2249         drop_it =   mdev->meta.socket == sock
2250                 || !mdev->asender.task
2251                 || get_t_state(&mdev->asender) != Running
2252                 || mdev->state.conn < C_CONNECTED;
2253
2254         if (drop_it)
2255                 return TRUE;
2256
2257         drop_it = !--mdev->ko_count;
2258         if (!drop_it) {
2259                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2260                        current->comm, current->pid, mdev->ko_count);
2261                 request_ping(mdev);
2262         }
2263
2264         return drop_it; /* && (mdev->state == R_PRIMARY) */
2265 }
2266
2267 /* The idea of sendpage seems to be to put some kind of reference
2268  * to the page into the skb, and to hand it over to the NIC. In
2269  * this process get_page() gets called.
2270  *
2271  * As soon as the page was really sent over the network put_page()
2272  * gets called by some part of the network layer. [ NIC driver? ]
2273  *
2274  * [ get_page() / put_page() increment/decrement the count. If count
2275  *   reaches 0 the page will be freed. ]
2276  *
2277  * This works nicely with pages from FSs.
2278  * But this means that in protocol A we might signal IO completion too early!
2279  *
2280  * In order not to corrupt data during a resync we must make sure
2281  * that we do not reuse our own buffer pages (EEs) too early, therefore
2282  * we have the net_ee list.
2283  *
2284  * XFS seems to have problems, still, it submits pages with page_count == 0!
2285  * As a workaround, we disable sendpage on pages
2286  * with page_count == 0 or PageSlab.
2287  */
2288 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2289                    int offset, size_t size)
2290 {
2291         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2292         kunmap(page);
2293         if (sent == size)
2294                 mdev->send_cnt += size>>9;
2295         return sent == size;
2296 }
2297
2298 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2299                     int offset, size_t size)
2300 {
2301         mm_segment_t oldfs = get_fs();
2302         int sent, ok;
2303         int len = size;
2304
2305         /* e.g. XFS meta- & log-data is in slab pages, which have a
2306          * page_count of 0 and/or have PageSlab() set.
2307          * we cannot use send_page for those, as that does get_page();
2308          * put_page(); and would cause either a VM_BUG directly, or
2309          * __page_cache_release a page that would actually still be referenced
2310          * by someone, leading to some obscure delayed Oops somewhere else. */
2311         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2312                 return _drbd_no_send_page(mdev, page, offset, size);
2313
2314         drbd_update_congested(mdev);
2315         set_fs(KERNEL_DS);
2316         do {
2317                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2318                                                         offset, len,
2319                                                         MSG_NOSIGNAL);
2320                 if (sent == -EAGAIN) {
2321                         if (we_should_drop_the_connection(mdev,
2322                                                           mdev->data.socket))
2323                                 break;
2324                         else
2325                                 continue;
2326                 }
2327                 if (sent <= 0) {
2328                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2329                              __func__, (int)size, len, sent);
2330                         break;
2331                 }
2332                 len    -= sent;
2333                 offset += sent;
2334         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2335         set_fs(oldfs);
2336         clear_bit(NET_CONGESTED, &mdev->flags);
2337
2338         ok = (len == 0);
2339         if (likely(ok))
2340                 mdev->send_cnt += size>>9;
2341         return ok;
2342 }
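
/*
 * Sketch of the "is zero-copy safe for this page?" test applied above,
 * pulled out as a hypothetical predicate: pages with a reference count
 * of zero or with PageSlab() set (e.g. XFS meta/log data) must take
 * the copying path in _drbd_no_send_page(), as must everything when
 * sendpage is disabled via the module parameter.
 */
static inline int drbd_sketch_may_sendpage(struct page *page)
{
        return !disable_sendpage && page_count(page) >= 1 && !PageSlab(page);
}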
2343
2344 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2345 {
2346         struct bio_vec *bvec;
2347         int i;
2348         __bio_for_each_segment(bvec, bio, i, 0) {
2349                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2350                                      bvec->bv_offset, bvec->bv_len))
2351                         return 0;
2352         }
2353         return 1;
2354 }
2355
2356 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2357 {
2358         struct bio_vec *bvec;
2359         int i;
2360         __bio_for_each_segment(bvec, bio, i, 0) {
2361                 if (!_drbd_send_page(mdev, bvec->bv_page,
2362                                      bvec->bv_offset, bvec->bv_len))
2363                         return 0;
2364         }
2365
2366         return 1;
2367 }
2368
2369 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2370 {
2371         struct page *page = e->pages;
2372         unsigned len = e->size;
2373         page_chain_for_each(page) {
2374                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2375                 if (!_drbd_send_page(mdev, page, 0, l))
2376                         return 0;
2377                 len -= l;
2378         }
2379         return 1;
2380 }
2381
2382 static void consider_delay_probes(struct drbd_conf *mdev)
2383 {
2384         if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93)
2385                 return;
2386
2387         if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
2388                 drbd_send_delay_probes(mdev);
2389 }
2390
2391 static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
2392 {
2393         if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
2394                 drbd_send_delay_probes(mdev);
2395
2396         return 1;
2397 }
2398
2399 static void delay_probe_timer_fn(unsigned long data)
2400 {
2401         struct drbd_conf *mdev = (struct drbd_conf *) data;
2402
2403         if (list_empty(&mdev->delay_probe_work.list))
2404                 drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
2405 }
2406
2407 /* Used to send write requests
2408  * R_PRIMARY -> Peer    (P_DATA)
2409  */
2410 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2411 {
2412         int ok = 1;
2413         struct p_data p;
2414         unsigned int dp_flags = 0;
2415         void *dgb;
2416         int dgs;
2417
2418         if (!drbd_get_data_sock(mdev))
2419                 return 0;
2420
2421         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2422                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2423
2424         p.head.magic   = BE_DRBD_MAGIC;
2425         p.head.command = cpu_to_be16(P_DATA);
2426         p.head.length  =
2427                 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2428
2429         p.sector   = cpu_to_be64(req->sector);
2430         p.block_id = (unsigned long)req;
2431         p.seq_num  = cpu_to_be32(req->seq_num =
2432                                  atomic_add_return(1, &mdev->packet_seq));
2433         dp_flags = 0;
2434
2435         /* NOTE: no need to check if barriers supported here as we would
2436          *       not pass the test in make_request_common in that case
2437          */
2438         if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2439                 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2440                 /* dp_flags |= DP_HARDBARRIER; */
2441         }
2442         if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2443                 dp_flags |= DP_RW_SYNC;
2444         /* for now handle SYNCIO and UNPLUG
2445          * as if they still were one and the same flag */
2446         if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2447                 dp_flags |= DP_RW_SYNC;
2448         if (mdev->state.conn >= C_SYNC_SOURCE &&
2449             mdev->state.conn <= C_PAUSED_SYNC_T)
2450                 dp_flags |= DP_MAY_SET_IN_SYNC;
2451
2452         p.dp_flags = cpu_to_be32(dp_flags);
2453         set_bit(UNPLUG_REMOTE, &mdev->flags);
2454         ok = (sizeof(p) ==
2455                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2456         if (ok && dgs) {
2457                 dgb = mdev->int_dig_out;
2458                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2459                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2460         }
2461         if (ok) {
2462                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2463                         ok = _drbd_send_bio(mdev, req->master_bio);
2464                 else
2465                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2466         }
2467
2468         drbd_put_data_sock(mdev);
2469
2470         if (ok)
2471                 consider_delay_probes(mdev);
2472
2473         return ok;
2474 }
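
/*
 * Sketch of the on-wire length calculation used for P_DATA above: the
 * header's length field covers the p_data members that follow the
 * common p_header, plus the optional integrity digest, plus the block
 * payload itself.  The helper name is hypothetical.
 */
static inline u16 drbd_sketch_data_payload_len(unsigned int data_size,
                                               int digest_size)
{
        return sizeof(struct p_data) - sizeof(struct p_header)
                + digest_size + data_size;
}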
2475
2476 /* answer packet, used to send data back for read requests:
2477  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2478  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2479  */
2480 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2481                     struct drbd_epoch_entry *e)
2482 {
2483         int ok;
2484         struct p_data p;
2485         void *dgb;
2486         int dgs;
2487
2488         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2489                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2490
2491         p.head.magic   = BE_DRBD_MAGIC;
2492         p.head.command = cpu_to_be16(cmd);
2493         p.head.length  =
2494                 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2495
2496         p.sector   = cpu_to_be64(e->sector);
2497         p.block_id = e->block_id;
2498         /* p.seq_num  = 0;    No sequence numbers here.. */
2499
2500         /* Only called by our kernel thread.
2501          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2502          * in response to admin command or module unload.
2503          */
2504         if (!drbd_get_data_sock(mdev))
2505                 return 0;
2506
2507         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2508                                         sizeof(p), MSG_MORE);
2509         if (ok && dgs) {
2510                 dgb = mdev->int_dig_out;
2511                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2512                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2513         }
2514         if (ok)
2515                 ok = _drbd_send_zc_ee(mdev, e);
2516
2517         drbd_put_data_sock(mdev);
2518
2519         if (ok)
2520                 consider_delay_probes(mdev);
2521
2522         return ok;
2523 }
2524
2525 /*
2526   drbd_send distinguishes two cases:
2527
2528   Packets sent via the data socket "sock"
2529   and packets sent via the meta data socket "msock"
2530
2531                     sock                      msock
2532   -----------------+-------------------------+------------------------------
2533   timeout           conf.timeout / 2          conf.timeout / 2
2534   timeout action    send a ping via msock     Abort communication
2535                                               and close all sockets
2536 */
2537
2538 /*
2539  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2540  */
2541 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2542               void *buf, size_t size, unsigned msg_flags)
2543 {
2544         struct kvec iov;
2545         struct msghdr msg;
2546         int rv, sent = 0;
2547
2548         if (!sock)
2549                 return -1000;
2550
2551         /* THINK  if (signal_pending) return ... ? */
2552
2553         iov.iov_base = buf;
2554         iov.iov_len  = size;
2555
2556         msg.msg_name       = NULL;
2557         msg.msg_namelen    = 0;
2558         msg.msg_control    = NULL;
2559         msg.msg_controllen = 0;
2560         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2561
2562         if (sock == mdev->data.socket) {
2563                 mdev->ko_count = mdev->net_conf->ko_count;
2564                 drbd_update_congested(mdev);
2565         }
2566         do {
2567                 /* STRANGE
2568                  * tcp_sendmsg does _not_ use its size parameter at all ?
2569                  *
2570                  * -EAGAIN on timeout, -EINTR on signal.
2571                  */
2572 /* THINK
2573  * do we need to block DRBD_SIG if sock == &meta.socket ??
2574  * otherwise wake_asender() might interrupt some send_*Ack !
2575  */
2576                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2577                 if (rv == -EAGAIN) {
2578                         if (we_should_drop_the_connection(mdev, sock))
2579                                 break;
2580                         else
2581                                 continue;
2582                 }
2583                 D_ASSERT(rv != 0);
2584                 if (rv == -EINTR) {
2585                         flush_signals(current);
2586                         rv = 0;
2587                 }
2588                 if (rv < 0)
2589                         break;
2590                 sent += rv;
2591                 iov.iov_base += rv;
2592                 iov.iov_len  -= rv;
2593         } while (sent < size);
2594
2595         if (sock == mdev->data.socket)
2596                 clear_bit(NET_CONGESTED, &mdev->flags);
2597
2598         if (rv <= 0) {
2599                 if (rv != -EAGAIN) {
2600                         dev_err(DEV, "%s_sendmsg returned %d\n",
2601                             sock == mdev->meta.socket ? "msock" : "sock",
2602                             rv);
2603                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2604                 } else
2605                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2606         }
2607
2608         return sent;
2609 }
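
/*
 * Sketch of the caller-side convention for drbd_send(): success means
 * every byte went out, i.e. the return value equals the requested
 * size; a short send, the -1000 "no socket" value, or a negative errno
 * all count as failure, which is exactly how _drbd_send_cmd() checks
 * it above.  The wrapper name is hypothetical.
 */
static inline int drbd_sketch_send_all(struct drbd_conf *mdev,
                                       struct socket *sock,
                                       void *buf, size_t size)
{
        return drbd_send(mdev, sock, buf, size, 0) == (int)size;
}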
2610
2611 static int drbd_open(struct block_device *bdev, fmode_t mode)
2612 {
2613         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2614         unsigned long flags;
2615         int rv = 0;
2616
2617         spin_lock_irqsave(&mdev->req_lock, flags);
2618         /* to have a stable mdev->state.role
2619          * and no race with updating open_cnt */
2620
2621         if (mdev->state.role != R_PRIMARY) {
2622                 if (mode & FMODE_WRITE)
2623                         rv = -EROFS;
2624                 else if (!allow_oos)
2625                         rv = -EMEDIUMTYPE;
2626         }
2627
2628         if (!rv)
2629                 mdev->open_cnt++;
2630         spin_unlock_irqrestore(&mdev->req_lock, flags);
2631
2632         return rv;
2633 }
2634
2635 static int drbd_release(struct gendisk *gd, fmode_t mode)
2636 {
2637         struct drbd_conf *mdev = gd->private_data;
2638         mdev->open_cnt--;
2639         return 0;
2640 }
2641
2642 static void drbd_unplug_fn(struct request_queue *q)
2643 {
2644         struct drbd_conf *mdev = q->queuedata;
2645
2646         /* unplug FIRST */
2647         spin_lock_irq(q->queue_lock);
2648         blk_remove_plug(q);
2649         spin_unlock_irq(q->queue_lock);
2650
2651         /* only if connected */
2652         spin_lock_irq(&mdev->req_lock);
2653         if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2654                 D_ASSERT(mdev->state.role == R_PRIMARY);
2655                 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2656                         /* add to the data.work queue,
2657                          * unless already queued.
2658                          * XXX this might be a good addition to drbd_queue_work
2659                          * anyways, to detect "double queuing" ... */
2660                         if (list_empty(&mdev->unplug_work.list))
2661                                 drbd_queue_work(&mdev->data.work,
2662                                                 &mdev->unplug_work);
2663                 }
2664         }
2665         spin_unlock_irq(&mdev->req_lock);
2666
2667         if (mdev->state.disk >= D_INCONSISTENT)
2668                 drbd_kick_lo(mdev);
2669 }
2670
2671 static void drbd_set_defaults(struct drbd_conf *mdev)
2672 {
2673         mdev->sync_conf.after      = DRBD_AFTER_DEF;
2674         mdev->sync_conf.rate       = DRBD_RATE_DEF;
2675         mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2676         mdev->state = (union drbd_state) {
2677                 { .role = R_SECONDARY,
2678                   .peer = R_UNKNOWN,
2679                   .conn = C_STANDALONE,
2680                   .disk = D_DISKLESS,
2681                   .pdsk = D_UNKNOWN,
2682                   .susp = 0
2683                 } };
2684 }
2685
2686 void drbd_init_set_defaults(struct drbd_conf *mdev)
2687 {
2688         /* the memset(,0,) did most of this.
2689          * note: only assignments, no allocation in here */
2690
2691         drbd_set_defaults(mdev);
2692
2693         /* for now, we do NOT yet support it,
2694          * even though we start some framework
2695          * to eventually support barriers */
2696         set_bit(NO_BARRIER_SUPP, &mdev->flags);
2697
2698         atomic_set(&mdev->ap_bio_cnt, 0);
2699         atomic_set(&mdev->ap_pending_cnt, 0);
2700         atomic_set(&mdev->rs_pending_cnt, 0);
2701         atomic_set(&mdev->unacked_cnt, 0);
2702         atomic_set(&mdev->local_cnt, 0);
2703         atomic_set(&mdev->net_cnt, 0);
2704         atomic_set(&mdev->packet_seq, 0);
2705         atomic_set(&mdev->pp_in_use, 0);
2706         atomic_set(&mdev->new_c_uuid, 0);
2707
2708         mutex_init(&mdev->md_io_mutex);
2709         mutex_init(&mdev->data.mutex);
2710         mutex_init(&mdev->meta.mutex);
2711         sema_init(&mdev->data.work.s, 0);
2712         sema_init(&mdev->meta.work.s, 0);
2713         mutex_init(&mdev->state_mutex);
2714
2715         spin_lock_init(&mdev->data.work.q_lock);
2716         spin_lock_init(&mdev->meta.work.q_lock);
2717
2718         spin_lock_init(&mdev->al_lock);
2719         spin_lock_init(&mdev->req_lock);
2720         spin_lock_init(&mdev->peer_seq_lock);
2721         spin_lock_init(&mdev->epoch_lock);
2722
2723         INIT_LIST_HEAD(&mdev->active_ee);
2724         INIT_LIST_HEAD(&mdev->sync_ee);
2725         INIT_LIST_HEAD(&mdev->done_ee);
2726         INIT_LIST_HEAD(&mdev->read_ee);
2727         INIT_LIST_HEAD(&mdev->net_ee);
2728         INIT_LIST_HEAD(&mdev->resync_reads);
2729         INIT_LIST_HEAD(&mdev->data.work.q);
2730         INIT_LIST_HEAD(&mdev->meta.work.q);
2731         INIT_LIST_HEAD(&mdev->resync_work.list);
2732         INIT_LIST_HEAD(&mdev->unplug_work.list);
2733         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2734         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2735         INIT_LIST_HEAD(&mdev->delay_probes);
2736         INIT_LIST_HEAD(&mdev->delay_probe_work.list);
2737         INIT_LIST_HEAD(&mdev->uuid_work.list);
2738
2739         mdev->resync_work.cb  = w_resync_inactive;
2740         mdev->unplug_work.cb  = w_send_write_hint;
2741         mdev->md_sync_work.cb = w_md_sync;
2742         mdev->bm_io_work.w.cb = w_bitmap_io;
2743         mdev->delay_probe_work.cb = w_delay_probes;
2744         mdev->uuid_work.cb = w_new_current_uuid;
2745         init_timer(&mdev->resync_timer);
2746         init_timer(&mdev->md_sync_timer);
2747         init_timer(&mdev->delay_probe_timer);
2748         mdev->resync_timer.function = resync_timer_fn;
2749         mdev->resync_timer.data = (unsigned long) mdev;
2750         mdev->md_sync_timer.function = md_sync_timer_fn;
2751         mdev->md_sync_timer.data = (unsigned long) mdev;
2752         mdev->delay_probe_timer.function = delay_probe_timer_fn;
2753         mdev->delay_probe_timer.data = (unsigned long) mdev;
2754
2755
2756         init_waitqueue_head(&mdev->misc_wait);
2757         init_waitqueue_head(&mdev->state_wait);
2758         init_waitqueue_head(&mdev->ee_wait);
2759         init_waitqueue_head(&mdev->al_wait);
2760         init_waitqueue_head(&mdev->seq_wait);
2761
2762         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2763         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2764         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2765
2766         mdev->agreed_pro_version = PRO_VERSION_MAX;
2767         mdev->write_ordering = WO_bio_barrier;
2768         mdev->resync_wenr = LC_FREE;
2769 }
2770
2771 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2772 {
2773         if (mdev->receiver.t_state != None)
2774                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2775                                 mdev->receiver.t_state);
2776
2777         /* no need to lock it, I'm the only thread alive */
2778         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2779                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2780         mdev->al_writ_cnt  =
2781         mdev->bm_writ_cnt  =
2782         mdev->read_cnt     =
2783         mdev->recv_cnt     =
2784         mdev->send_cnt     =
2785         mdev->writ_cnt     =
2786         mdev->p_size       =
2787         mdev->rs_start     =
2788         mdev->rs_total     =
2789         mdev->rs_failed    =
2790         mdev->rs_mark_left =
2791         mdev->rs_mark_time = 0;
2792         D_ASSERT(mdev->net_conf == NULL);
2793
2794         drbd_set_my_capacity(mdev, 0);
2795         if (mdev->bitmap) {
2796                 /* maybe never allocated. */
2797                 drbd_bm_resize(mdev, 0, 1);
2798                 drbd_bm_cleanup(mdev);
2799         }
2800
2801         drbd_free_resources(mdev);
2802
2803         /*
2804          * currently we call drbd_init_ee only on module load, so
2805          * we may call drbd_release_ee only on module unload!
2806          */
2807         D_ASSERT(list_empty(&mdev->active_ee));
2808         D_ASSERT(list_empty(&mdev->sync_ee));
2809         D_ASSERT(list_empty(&mdev->done_ee));
2810         D_ASSERT(list_empty(&mdev->read_ee));
2811         D_ASSERT(list_empty(&mdev->net_ee));
2812         D_ASSERT(list_empty(&mdev->resync_reads));
2813         D_ASSERT(list_empty(&mdev->data.work.q));
2814         D_ASSERT(list_empty(&mdev->meta.work.q));
2815         D_ASSERT(list_empty(&mdev->resync_work.list));
2816         D_ASSERT(list_empty(&mdev->unplug_work.list));
2817
2818 }
2819
2820
2821 static void drbd_destroy_mempools(void)
2822 {
2823         struct page *page;
2824
2825         while (drbd_pp_pool) {
2826                 page = drbd_pp_pool;
2827                 drbd_pp_pool = (struct page *)page_private(page);
2828                 __free_page(page);
2829                 drbd_pp_vacant--;
2830         }
2831
2832         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2833
2834         if (drbd_ee_mempool)
2835                 mempool_destroy(drbd_ee_mempool);
2836         if (drbd_request_mempool)
2837                 mempool_destroy(drbd_request_mempool);
2838         if (drbd_ee_cache)
2839                 kmem_cache_destroy(drbd_ee_cache);
2840         if (drbd_request_cache)
2841                 kmem_cache_destroy(drbd_request_cache);
2842         if (drbd_bm_ext_cache)
2843                 kmem_cache_destroy(drbd_bm_ext_cache);
2844         if (drbd_al_ext_cache)
2845                 kmem_cache_destroy(drbd_al_ext_cache);
2846
2847         drbd_ee_mempool      = NULL;
2848         drbd_request_mempool = NULL;
2849         drbd_ee_cache        = NULL;
2850         drbd_request_cache   = NULL;
2851         drbd_bm_ext_cache    = NULL;
2852         drbd_al_ext_cache    = NULL;
2853
2854         return;
2855 }
2856
2857 static int drbd_create_mempools(void)
2858 {
2859         struct page *page;
2860         const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2861         int i;
2862
2863         /* prepare our caches and mempools */
             drbd_ee_mempool      = NULL;
2864         drbd_request_mempool = NULL;
2865         drbd_ee_cache        = NULL;
2866         drbd_request_cache   = NULL;
2867         drbd_bm_ext_cache    = NULL;
2868         drbd_al_ext_cache    = NULL;
2869         drbd_pp_pool         = NULL;
2870
2871         /* caches */
2872         drbd_request_cache = kmem_cache_create(
2873                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2874         if (drbd_request_cache == NULL)
2875                 goto Enomem;
2876
2877         drbd_ee_cache = kmem_cache_create(
2878                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2879         if (drbd_ee_cache == NULL)
2880                 goto Enomem;
2881
2882         drbd_bm_ext_cache = kmem_cache_create(
2883                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2884         if (drbd_bm_ext_cache == NULL)
2885                 goto Enomem;
2886
2887         drbd_al_ext_cache = kmem_cache_create(
2888                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2889         if (drbd_al_ext_cache == NULL)
2890                 goto Enomem;
2891
2892         /* mempools */
2893         drbd_request_mempool = mempool_create(number,
2894                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2895         if (drbd_request_mempool == NULL)
2896                 goto Enomem;
2897
2898         drbd_ee_mempool = mempool_create(number,
2899                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2900         if (drbd_ee_mempool == NULL)
2901                 goto Enomem;
2902
2903         /* drbd's page pool */
2904         spin_lock_init(&drbd_pp_lock);
2905
2906         for (i = 0; i < number; i++) {
2907                 page = alloc_page(GFP_HIGHUSER);
2908                 if (!page)
2909                         goto Enomem;
2910                 set_page_private(page, (unsigned long)drbd_pp_pool);
2911                 drbd_pp_pool = page;
2912         }
2913         drbd_pp_vacant = number;
2914
2915         return 0;
2916
2917 Enomem:
2918         drbd_destroy_mempools(); /* in case we allocated some */
2919         return -ENOMEM;
2920 }
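/*
 * Rough sizing illustration for the pools above (the concrete values are
 * assumptions, not taken from this file): with a 4 KiB PAGE_SIZE and a
 * DRBD_MAX_SEGMENT_SIZE of 32 KiB, "number" is 8 * minor_count, so a
 * minor_count of 32 would give 256 preallocated pages (1 MiB) plus 256
 * guaranteed elements in each of the request and ee mempools.
 */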
2921
2922 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2923         void *unused)
2924 {
2925         /* just so we have it.  you never know what interesting things we
2926          * might want to do here some day...
2927          */
2928
2929         return NOTIFY_DONE;
2930 }
2931
2932 static struct notifier_block drbd_notifier = {
2933         .notifier_call = drbd_notify_sys,
2934 };
2935
2936 static void drbd_release_ee_lists(struct drbd_conf *mdev)
2937 {
2938         int rr;
2939
2940         rr = drbd_release_ee(mdev, &mdev->active_ee);
2941         if (rr)
2942                 dev_err(DEV, "%d EEs in active list found!\n", rr);
2943
2944         rr = drbd_release_ee(mdev, &mdev->sync_ee);
2945         if (rr)
2946                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2947
2948         rr = drbd_release_ee(mdev, &mdev->read_ee);
2949         if (rr)
2950                 dev_err(DEV, "%d EEs in read list found!\n", rr);
2951
2952         rr = drbd_release_ee(mdev, &mdev->done_ee);
2953         if (rr)
2954                 dev_err(DEV, "%d EEs in done list found!\n", rr);
2955
2956         rr = drbd_release_ee(mdev, &mdev->net_ee);
2957         if (rr)
2958                 dev_err(DEV, "%d EEs in net list found!\n", rr);
2959 }
2960
2961 /* caution. no locking.
2962  * currently only used from module cleanup code. */
2963 static void drbd_delete_device(unsigned int minor)
2964 {
2965         struct drbd_conf *mdev = minor_to_mdev(minor);
2966
2967         if (!mdev)
2968                 return;
2969
2970         /* paranoia asserts */
2971         if (mdev->open_cnt != 0)
2972                 dev_err(DEV, "open_cnt = %d in %s:%u\n", mdev->open_cnt,
2973                                 __FILE__, __LINE__);
2974
2975         ERR_IF (!list_empty(&mdev->data.work.q)) {
2976                 struct list_head *lp;
2977                 list_for_each(lp, &mdev->data.work.q) {
2978                         dev_err(DEV, "lp = %p\n", lp);
2979                 }
2980         };
2981         /* end paranoia asserts */
2982
2983         del_gendisk(mdev->vdisk);
2984
2985         /* cleanup stuff that may have been allocated during
2986          * device (re-)configuration or state changes */
2987
2988         if (mdev->this_bdev)
2989                 bdput(mdev->this_bdev);
2990
2991         drbd_free_resources(mdev);
2992
2993         drbd_release_ee_lists(mdev);
2994
2995         /* should be free'd on disconnect? */
2996         kfree(mdev->ee_hash);
2997         /*
2998         mdev->ee_hash_s = 0;
2999         mdev->ee_hash = NULL;
3000         */
3001
3002         lc_destroy(mdev->act_log);
3003         lc_destroy(mdev->resync);
3004
3005         kfree(mdev->p_uuid);
3006         /* mdev->p_uuid = NULL; */
3007
3008         kfree(mdev->int_dig_out);
3009         kfree(mdev->int_dig_in);
3010         kfree(mdev->int_dig_vv);
3011
3012         /* cleanup the rest that has been
3013          * allocated from drbd_new_device
3014          * and actually free the mdev itself */
3015         drbd_free_mdev(mdev);
3016 }
3017
3018 static void drbd_cleanup(void)
3019 {
3020         unsigned int i;
3021
3022         unregister_reboot_notifier(&drbd_notifier);
3023
3024         drbd_nl_cleanup();
3025
3026         if (minor_table) {
3027                 if (drbd_proc)
3028                         remove_proc_entry("drbd", NULL);
3029                 i = minor_count;
3030                 while (i--)
3031                         drbd_delete_device(i);
3032                 drbd_destroy_mempools();
3033         }
3034
3035         kfree(minor_table);
3036
3037         unregister_blkdev(DRBD_MAJOR, "drbd");
3038
3039         printk(KERN_INFO "drbd: module cleanup done.\n");
3040 }
3041
3042 /**
3043  * drbd_congested() - Callback for pdflush
3044  * @congested_data:     User data
3045  * @bdi_bits:           Bits pdflush is currently interested in
3046  *
3047  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3048  */
3049 static int drbd_congested(void *congested_data, int bdi_bits)
3050 {
3051         struct drbd_conf *mdev = congested_data;
3052         struct request_queue *q;
3053         char reason = '-';
3054         int r = 0;
3055
3056         if (!__inc_ap_bio_cond(mdev)) {
3057                 /* DRBD has frozen IO */
3058                 r = bdi_bits;
3059                 reason = 'd';
3060                 goto out;
3061         }
3062
3063         if (get_ldev(mdev)) {
3064                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3065                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3066                 put_ldev(mdev);
3067                 if (r)
3068                         reason = 'b';
3069         }
3070
3071         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3072                 r |= (1 << BDI_async_congested);
3073                 reason = reason == 'b' ? 'a' : 'n';
3074         }
3075
3076 out:
3077         mdev->congestion_reason = reason;
3078         return r;
3079 }
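/*
 * The congestion_reason character recorded above encodes why congestion was
 * reported: 'd' - DRBD itself has frozen IO, 'b' - the backing device is
 * congested, 'n' - only the network send path is congested, 'a' - both the
 * backing device and the network are congested, '-' - not congested.
 */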
3080
3081 struct drbd_conf *drbd_new_device(unsigned int minor)
3082 {
3083         struct drbd_conf *mdev;
3084         struct gendisk *disk;
3085         struct request_queue *q;
3086
3087         /* GFP_KERNEL, we are outside of all write-out paths */
3088         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3089         if (!mdev)
3090                 return NULL;
3091         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3092                 goto out_no_cpumask;
3093
3094         mdev->minor = minor;
3095
3096         drbd_init_set_defaults(mdev);
3097
3098         q = blk_alloc_queue(GFP_KERNEL);
3099         if (!q)
3100                 goto out_no_q;
3101         mdev->rq_queue = q;
3102         q->queuedata   = mdev;
3103
3104         disk = alloc_disk(1);
3105         if (!disk)
3106                 goto out_no_disk;
3107         mdev->vdisk = disk;
3108
3109         set_disk_ro(disk, TRUE);
3110
3111         disk->queue = q;
3112         disk->major = DRBD_MAJOR;
3113         disk->first_minor = minor;
3114         disk->fops = &drbd_ops;
3115         sprintf(disk->disk_name, "drbd%d", minor);
3116         disk->private_data = mdev;
3117
3118         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3119         /* we have no partitions. we contain only ourselves. */
3120         mdev->this_bdev->bd_contains = mdev->this_bdev;
3121
3122         q->backing_dev_info.congested_fn = drbd_congested;
3123         q->backing_dev_info.congested_data = mdev;
3124
3125         blk_queue_make_request(q, drbd_make_request_26);
3126         blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
3127         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3128         blk_queue_merge_bvec(q, drbd_merge_bvec);
3129         q->queue_lock = &mdev->req_lock; /* needed since we use plugging
3130                                           * on a queue that actually has no requests! */
3131         q->unplug_fn = drbd_unplug_fn;
3132
3133         mdev->md_io_page = alloc_page(GFP_KERNEL);
3134         if (!mdev->md_io_page)
3135                 goto out_no_io_page;
3136
3137         if (drbd_bm_init(mdev))
3138                 goto out_no_bitmap;
3139         /* no need to lock access, we are still initializing this minor device. */
3140         if (!tl_init(mdev))
3141                 goto out_no_tl;
3142
3143         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3144         if (!mdev->app_reads_hash)
3145                 goto out_no_app_reads;
3146
3147         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3148         if (!mdev->current_epoch)
3149                 goto out_no_epoch;
3150
3151         INIT_LIST_HEAD(&mdev->current_epoch->list);
3152         mdev->epochs = 1;
3153
3154         return mdev;
3155
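/* Error unwinding: each label below releases exactly what was successfully
 * allocated before the corresponding failure point, in reverse order of
 * allocation, and falls through to the next label. */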
3156 /* out_whatever_else:
3157         kfree(mdev->current_epoch); */
3158 out_no_epoch:
3159         kfree(mdev->app_reads_hash);
3160 out_no_app_reads:
3161         tl_cleanup(mdev);
3162 out_no_tl:
3163         drbd_bm_cleanup(mdev);
3164 out_no_bitmap:
3165         __free_page(mdev->md_io_page);
3166 out_no_io_page:
3167         put_disk(disk);
3168 out_no_disk:
3169         blk_cleanup_queue(q);
3170 out_no_q:
3171         free_cpumask_var(mdev->cpu_mask);
3172 out_no_cpumask:
3173         kfree(mdev);
3174         return NULL;
3175 }
3176
3177 /* counterpart of drbd_new_device.
3178  * last part of drbd_delete_device. */
3179 void drbd_free_mdev(struct drbd_conf *mdev)
3180 {
3181         kfree(mdev->current_epoch);
3182         kfree(mdev->app_reads_hash);
3183         tl_cleanup(mdev);
3184         if (mdev->bitmap) /* should no longer be there. */
3185                 drbd_bm_cleanup(mdev);
3186         __free_page(mdev->md_io_page);
3187         put_disk(mdev->vdisk);
3188         blk_cleanup_queue(mdev->rq_queue);
3189         free_cpumask_var(mdev->cpu_mask);
3190         kfree(mdev);
3191 }
3192
3193
3194 int __init drbd_init(void)
3195 {
3196         int err;
3197
3198         if (sizeof(struct p_handshake) != 80) {
3199                 printk(KERN_ERR
3200                        "drbd: never change the size or layout "
3201                        "of the HandShake packet.\n");
3202                 return -EINVAL;
3203         }
3204
3205         if (1 > minor_count || minor_count > 255) {
3206                 printk(KERN_ERR
3207                         "drbd: invalid minor_count (%d)\n", minor_count);
3208 #ifdef MODULE
3209                 return -EINVAL;
3210 #else
3211                 minor_count = 8;
3212 #endif
3213         }
3214
3215         err = drbd_nl_init();
3216         if (err)
3217                 return err;
3218
3219         err = register_blkdev(DRBD_MAJOR, "drbd");
3220         if (err) {
3221                 printk(KERN_ERR
3222                        "drbd: unable to register block device major %d\n",
3223                        DRBD_MAJOR);
3224                 return err;
3225         }
3226
3227         register_reboot_notifier(&drbd_notifier);
3228
3229         /*
3230          * allocate all necessary structs
3231          */
3232         err = -ENOMEM;
3233
3234         init_waitqueue_head(&drbd_pp_wait);
3235
3236         drbd_proc = NULL; /* play safe for drbd_cleanup */
3237         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3238                                 GFP_KERNEL);
3239         if (!minor_table)
3240                 goto Enomem;
3241
3242         err = drbd_create_mempools();
3243         if (err)
3244                 goto Enomem;
3245
3246         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3247         if (!drbd_proc) {
3248                 printk(KERN_ERR "drbd: unable to register proc file\n");
3249                 goto Enomem;
3250         }
3251
3252         rwlock_init(&global_state_lock);
3253
3254         printk(KERN_INFO "drbd: initialized. "
3255                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3256                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3257         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3258         printk(KERN_INFO "drbd: registered as block device major %d\n",
3259                 DRBD_MAJOR);
3260         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3261
3262         return 0; /* Success! */
3263
3264 Enomem:
3265         drbd_cleanup();
3266         if (err == -ENOMEM)
3267                 /* currently always the case */
3268                 printk(KERN_ERR "drbd: ran out of memory\n");
3269         else
3270                 printk(KERN_ERR "drbd: initialization failure\n");
3271         return err;
3272 }
3273
3274 void drbd_free_bc(struct drbd_backing_dev *ldev)
3275 {
3276         if (ldev == NULL)
3277                 return;
3278
3279         bd_release(ldev->backing_bdev);
3280         bd_release(ldev->md_bdev);
3281
3282         fput(ldev->lo_file);
3283         fput(ldev->md_file);
3284
3285         kfree(ldev);
3286 }
3287
3288 void drbd_free_sock(struct drbd_conf *mdev)
3289 {
3290         if (mdev->data.socket) {
3291                 mutex_lock(&mdev->data.mutex);
3292                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3293                 sock_release(mdev->data.socket);
3294                 mdev->data.socket = NULL;
3295                 mutex_unlock(&mdev->data.mutex);
3296         }
3297         if (mdev->meta.socket) {
3298                 mutex_lock(&mdev->meta.mutex);
3299                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3300                 sock_release(mdev->meta.socket);
3301                 mdev->meta.socket = NULL;
3302                 mutex_unlock(&mdev->meta.mutex);
3303         }
3304 }
3305
3306
3307 void drbd_free_resources(struct drbd_conf *mdev)
3308 {
3309         crypto_free_hash(mdev->csums_tfm);
3310         mdev->csums_tfm = NULL;
3311         crypto_free_hash(mdev->verify_tfm);
3312         mdev->verify_tfm = NULL;
3313         crypto_free_hash(mdev->cram_hmac_tfm);
3314         mdev->cram_hmac_tfm = NULL;
3315         crypto_free_hash(mdev->integrity_w_tfm);
3316         mdev->integrity_w_tfm = NULL;
3317         crypto_free_hash(mdev->integrity_r_tfm);
3318         mdev->integrity_r_tfm = NULL;
3319
3320         drbd_free_sock(mdev);
3321
3322         __no_warn(local,
3323                   drbd_free_bc(mdev->ldev);
3324                   mdev->ldev = NULL;);
3325 }
3326
3327 /* meta data management */
3328
3329 struct meta_data_on_disk {
3330         u64 la_size;           /* last agreed size. */
3331         u64 uuid[UI_SIZE];   /* UUIDs. */
3332         u64 device_uuid;
3333         u64 reserved_u64_1;
3334         u32 flags;             /* MDF */
3335         u32 magic;
3336         u32 md_size_sect;
3337         u32 al_offset;         /* offset to this block */
3338         u32 al_nr_extents;     /* important for restoring the AL */
3339               /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3340         u32 bm_offset;         /* offset to the bitmap, from here */
3341         u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3342         u32 reserved_u32[4];
3343
3344 } __packed;
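/*
 * All multi-byte fields of struct meta_data_on_disk are stored big-endian
 * (see the cpu_to_be*() conversions in drbd_md_sync() and the matching
 * be*_to_cpu() conversions in drbd_md_read() below); drbd_md_sync() zeroes
 * the full 512-byte meta data buffer before filling it in, so the reserved
 * fields and any trailing padding are written as zero.
 */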
3345
3346 /**
3347  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3348  * @mdev:       DRBD device.
3349  */
3350 void drbd_md_sync(struct drbd_conf *mdev)
3351 {
3352         struct meta_data_on_disk *buffer;
3353         sector_t sector;
3354         int i;
3355
3356         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3357                 return;
3358         del_timer(&mdev->md_sync_timer);
3359
3360         /* Here we use D_FAILED and not D_ATTACHING because we try to write
3361          * metadata even if we detach due to a disk failure! */
3362         if (!get_ldev_if_state(mdev, D_FAILED))
3363                 return;
3364
3365         mutex_lock(&mdev->md_io_mutex);
3366         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3367         memset(buffer, 0, 512);
3368
3369         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3370         for (i = UI_CURRENT; i < UI_SIZE; i++)
3371                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3372         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3373         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3374
3375         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3376         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3377         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3378         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3379         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3380
3381         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3382
3383         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3384         sector = mdev->ldev->md.md_offset;
3385
3386         if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3387                 clear_bit(MD_DIRTY, &mdev->flags);
3388         } else {
3389                 /* this was a try anyway ... */
3390                 dev_err(DEV, "meta data update failed!\n");
3391
3392                 drbd_chk_io_error(mdev, 1, TRUE);
3393         }
3394
3395         /* Update mdev->ldev->md.la_size_sect,
3396          * since we updated it on metadata. */
3397         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3398
3399         mutex_unlock(&mdev->md_io_mutex);
3400         put_ldev(mdev);
3401 }
3402
3403 /**
3404  * drbd_md_read() - Reads in the meta data super block
3405  * @mdev:       DRBD device.
3406  * @bdev:       Device from which the meta data should be read in.
3407  *
3408  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3409  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3410  */
3411 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3412 {
3413         struct meta_data_on_disk *buffer;
3414         int i, rv = NO_ERROR;
3415
3416         if (!get_ldev_if_state(mdev, D_ATTACHING))
3417                 return ERR_IO_MD_DISK;
3418
3419         mutex_lock(&mdev->md_io_mutex);
3420         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3421
3422         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3423                 /* NOTE: can't do normal error processing here as this is
3424                    called BEFORE disk is attached */
3425                 dev_err(DEV, "Error while reading metadata.\n");
3426                 rv = ERR_IO_MD_DISK;
3427                 goto err;
3428         }
3429
3430         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3431                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3432                 rv = ERR_MD_INVALID;
3433                 goto err;
3434         }
3435         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3436                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3437                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3438                 rv = ERR_MD_INVALID;
3439                 goto err;
3440         }
3441         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3442                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3443                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3444                 rv = ERR_MD_INVALID;
3445                 goto err;
3446         }
3447         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3448                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3449                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3450                 rv = ERR_MD_INVALID;
3451                 goto err;
3452         }
3453
3454         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3455                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3456                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3457                 rv = ERR_MD_INVALID;
3458                 goto err;
3459         }
3460
3461         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3462         for (i = UI_CURRENT; i < UI_SIZE; i++)
3463                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3464         bdev->md.flags = be32_to_cpu(buffer->flags);
3465         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3466         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3467
3468         if (mdev->sync_conf.al_extents < 7)
3469                 mdev->sync_conf.al_extents = 127;
3470
3471  err:
3472         mutex_unlock(&mdev->md_io_mutex);
3473         put_ldev(mdev);
3474
3475         return rv;
3476 }
3477
3478 /**
3479  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3480  * @mdev:       DRBD device.
3481  *
3482  * Call this function if you change anything that should be written to
3483  * the meta-data super block. This function sets MD_DIRTY, and starts a
3484  * timer that ensures drbd_md_sync() gets called within five seconds.
3485  */
3486 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3487 {
3488         set_bit(MD_DIRTY, &mdev->flags);
3489         mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3490 }
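/*
 * Illustrative usage (a sketch, mirroring drbd_bmio_set_n_write() further
 * down in this file): mark the super block dirty and write it out right
 * away,
 *
 *	drbd_md_set_flag(mdev, MDF_FULL_SYNC);
 *	drbd_md_sync(mdev);
 *
 * Without the explicit drbd_md_sync(), the md_sync_timer armed here would
 * have the worker flush the dirty super block within roughly five seconds
 * (see md_sync_timer_fn() and w_md_sync() near the end of this file).
 */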
3491
3492
3493 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3494 {
3495         int i;
3496
3497         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3498                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3499 }
3500
3501 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3502 {
3503         if (idx == UI_CURRENT) {
3504                 if (mdev->state.role == R_PRIMARY)
3505                         val |= 1;
3506                 else
3507                         val &= ~((u64)1);
3508
3509                 drbd_set_ed_uuid(mdev, val);
3510         }
3511
3512         mdev->ldev->md.uuid[idx] = val;
3513         drbd_md_mark_dirty(mdev);
3514 }
3515
3516
3517 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3518 {
3519         if (mdev->ldev->md.uuid[idx]) {
3520                 drbd_uuid_move_history(mdev);
3521                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3522         }
3523         _drbd_uuid_set(mdev, idx, val);
3524 }
3525
3526 /**
3527  * drbd_uuid_new_current() - Creates a new current UUID
3528  * @mdev:       DRBD device.
3529  *
3530  * Creates a new current UUID, and rotates the old current UUID into
3531  * the bitmap slot. Causes an incremental resync upon next connect.
3532  */
3533 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3534 {
3535         u64 val;
3536
3537         dev_info(DEV, "Creating new current UUID\n");
3538         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3539         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3540
3541         get_random_bytes(&val, sizeof(u64));
3542         _drbd_uuid_set(mdev, UI_CURRENT, val);
3543 }
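/*
 * Effect on the UUID slots (illustrative): if UI_CURRENT holds C and
 * UI_BITMAP is 0 before the call, then afterwards UI_BITMAP holds C and
 * UI_CURRENT holds a fresh random value whose lowest bit is forced by
 * _drbd_uuid_set() to reflect the current role (set for R_PRIMARY,
 * cleared otherwise).
 */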
3544
3545 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3546 {
3547         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3548                 return;
3549
3550         if (val == 0) {
3551                 drbd_uuid_move_history(mdev);
3552                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3553                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3554         } else {
3555                 if (mdev->ldev->md.uuid[UI_BITMAP])
3556                         dev_warn(DEV, "bm UUID already set\n");
3557
3558                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3559                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3560
3561         }
3562         drbd_md_mark_dirty(mdev);
3563 }
3564
3565 /**
3566  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3567  * @mdev:       DRBD device.
3568  *
3569  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3570  */
3571 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3572 {
3573         int rv = -EIO;
3574
3575         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3576                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3577                 drbd_md_sync(mdev);
3578                 drbd_bm_set_all(mdev);
3579
3580                 rv = drbd_bm_write(mdev);
3581
3582                 if (!rv) {
3583                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3584                         drbd_md_sync(mdev);
3585                 }
3586
3587                 put_ldev(mdev);
3588         }
3589
3590         return rv;
3591 }
3592
3593 /**
3594  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3595  * @mdev:       DRBD device.
3596  *
3597  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3598  */
3599 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3600 {
3601         int rv = -EIO;
3602
3603         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3604                 drbd_bm_clear_all(mdev);
3605                 rv = drbd_bm_write(mdev);
3606                 put_ldev(mdev);
3607         }
3608
3609         return rv;
3610 }
3611
3612 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3613 {
3614         struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3615         int rv;
3616
3617         D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3618
3619         drbd_bm_lock(mdev, work->why);
3620         rv = work->io_fn(mdev);
3621         drbd_bm_unlock(mdev);
3622
3623         clear_bit(BITMAP_IO, &mdev->flags);
3624         wake_up(&mdev->misc_wait);
3625
3626         if (work->done)
3627                 work->done(mdev, rv);
3628
3629         clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3630         work->why = NULL;
3631
3632         return 1;
3633 }
3634
3635 /**
3636  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3637  * @mdev:       DRBD device.
3638  * @io_fn:      IO callback to be called when bitmap IO is possible
3639  * @done:       callback to be called after the bitmap IO was performed
3640  * @why:        Descriptive text of the reason for doing the IO
3641  *
3642  * While IO on the bitmap happens we freeze application IO, thus ensuring
3643  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3644  * called from worker context. It MUST NOT be used while a previous such
3645  * work is still pending!
3646  */
3647 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3648                           int (*io_fn)(struct drbd_conf *),
3649                           void (*done)(struct drbd_conf *, int),
3650                           char *why)
3651 {
3652         D_ASSERT(current == mdev->worker.task);
3653
3654         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3655         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3656         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3657         if (mdev->bm_io_work.why)
3658                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3659                         why, mdev->bm_io_work.why);
3660
3661         mdev->bm_io_work.io_fn = io_fn;
3662         mdev->bm_io_work.done = done;
3663         mdev->bm_io_work.why = why;
3664
3665         set_bit(BITMAP_IO, &mdev->flags);
3666         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3667                 if (list_empty(&mdev->bm_io_work.w.list)) {
3668                         set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3669                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3670                 } else
3671                         dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3672         }
3673 }
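/*
 * Usage sketch (illustrative; must run in worker context, and the name of
 * the done callback below is hypothetical):
 *
 *	static void after_set_n_write(struct drbd_conf *mdev, int rv)
 *	{
 *		... react to the result of the bitmap IO ...
 *	}
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     &after_set_n_write, "set_n_write sketch");
 */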
3674
3675 /**
3676  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3677  * @mdev:       DRBD device.
3678  * @io_fn:      IO callback to be called when bitmap IO is possible
3679  * @why:        Descriptive text of the reason for doing the IO
3680  *
3681  * Freezes application IO while the actual IO operation runs. This
3682  * function MAY NOT be called from worker context.
3683  */
3684 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3685 {
3686         int rv;
3687
3688         D_ASSERT(current != mdev->worker.task);
3689
3690         drbd_suspend_io(mdev);
3691
3692         drbd_bm_lock(mdev, why);
3693         rv = io_fn(mdev);
3694         drbd_bm_unlock(mdev);
3695
3696         drbd_resume_io(mdev);
3697
3698         return rv;
3699 }
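/*
 * Usage sketch (illustrative, from non-worker context):
 *
 *	int rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *				"clear_n_write sketch");
 *
 * This suspends application IO, clears and writes out the whole bitmap
 * under the bitmap lock, resumes IO and returns the io_fn's result.
 */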
3700
3701 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3702 {
3703         if ((mdev->ldev->md.flags & flag) != flag) {
3704                 drbd_md_mark_dirty(mdev);
3705                 mdev->ldev->md.flags |= flag;
3706         }
3707 }
3708
3709 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3710 {
3711         if ((mdev->ldev->md.flags & flag) != 0) {
3712                 drbd_md_mark_dirty(mdev);
3713                 mdev->ldev->md.flags &= ~flag;
3714         }
3715 }
3716 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3717 {
3718         return (bdev->md.flags & flag) != 0;
3719 }
3720
3721 static void md_sync_timer_fn(unsigned long data)
3722 {
3723         struct drbd_conf *mdev = (struct drbd_conf *) data;
3724
3725         drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3726 }
3727
3728 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3729 {
3730         dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3731         drbd_md_sync(mdev);
3732
3733         return 1;
3734 }
3735
3736 #ifdef CONFIG_DRBD_FAULT_INJECTION
3737 /* Fault insertion support including random number generator shamelessly
3738  * stolen from kernel/rcutorture.c */
3739 struct fault_random_state {
3740         unsigned long state;
3741         unsigned long count;
3742 };
3743
3744 #define FAULT_RANDOM_MULT 39916801  /* prime */
3745 #define FAULT_RANDOM_ADD        479001701 /* prime */
3746 #define FAULT_RANDOM_REFRESH 10000
3747
3748 /*
3749  * Crude but fast random-number generator.  Uses a linear congruential
3750  * generator, with occasional help from get_random_bytes().
3751  */
3752 static unsigned long
3753 _drbd_fault_random(struct fault_random_state *rsp)
3754 {
3755         long refresh;
3756
3757         if (!rsp->count--) {
3758                 get_random_bytes(&refresh, sizeof(refresh));
3759                 rsp->state += refresh;
3760                 rsp->count = FAULT_RANDOM_REFRESH;
3761         }
3762         rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3763         return swahw32(rsp->state);
3764 }
3765
3766 static char *
3767 _drbd_fault_str(unsigned int type) {
3768         static char *_faults[] = {
3769                 [DRBD_FAULT_MD_WR] = "Meta-data write",
3770                 [DRBD_FAULT_MD_RD] = "Meta-data read",
3771                 [DRBD_FAULT_RS_WR] = "Resync write",
3772                 [DRBD_FAULT_RS_RD] = "Resync read",
3773                 [DRBD_FAULT_DT_WR] = "Data write",
3774                 [DRBD_FAULT_DT_RD] = "Data read",
3775                 [DRBD_FAULT_DT_RA] = "Data read ahead",
3776                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3777                 [DRBD_FAULT_AL_EE] = "EE allocation",
3778                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3779         };
3780
3781         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3782 }
3783
3784 unsigned int
3785 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3786 {
3787         static struct fault_random_state rrs = {0, 0};
3788
3789         unsigned int ret = (
3790                 (fault_devs == 0 ||
3791                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3792                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3793
3794         if (ret) {
3795                 fault_count++;
3796
3797                 if (printk_ratelimit())
3798                         dev_warn(DEV, "***Simulating %s failure\n",
3799                                 _drbd_fault_str(type));
3800         }
3801
3802         return ret;
3803 }
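/*
 * In other words: a fault is injected only if fault_devs is 0 (all devices)
 * or has the bit for this minor set, and then with a probability of roughly
 * fault_rate percent per call; fault_count tallies how many faults were
 * actually injected.
 */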
3804 #endif
3805
3806 const char *drbd_buildtag(void)
3807 {
3808         /* When DRBD is built from external sources, this holds a reference
3809            to the git hash of the source code. */
3810
3811         static char buildtag[38] = "\0uilt-in";
3812
3813         if (buildtag[0] == 0) {
3814 #ifdef CONFIG_MODULES
3815                 if (THIS_MODULE != NULL)
3816                         sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3817                 else
3818 #endif
3819                         buildtag[0] = 'b';
3820         }
3821
3822         return buildtag;
3823 }
3824
3825 module_init(drbd_init)
3826 module_exit(drbd_cleanup)
3827
3828 EXPORT_SYMBOL(drbd_conn_str);
3829 EXPORT_SYMBOL(drbd_role_str);
3830 EXPORT_SYMBOL(drbd_disk_str);
3831 EXPORT_SYMBOL(drbd_set_st_err_str);