drbd: distribute former syncer_conf settings to disk, connection, and resource level
drivers/block/drbd/drbd_main.c
/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_md_sync(struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
int proc_details;       /* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct idr minors;
struct list_head drbd_tconns;  /* list of struct drbd_tconn */
DEFINE_MUTEX(drbd_cfg_mutex);

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* peer requests */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
mempool_t *drbd_md_io_page_pool;
struct bio_set *drbd_md_io_bio_set;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

static void bio_destructor_drbd(struct bio *bio)
{
	bio_free(bio, drbd_md_io_bio_set);
}

struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
	struct bio *bio;

	if (!drbd_md_io_bio_set)
		return bio_alloc(gfp_mask, 1);

	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
	if (!bio)
		return NULL;
	bio->bi_destructor = bio_destructor_drbd;
	return bio;
}

b411b363
PR
172#ifdef __CHECKER__
173/* When checking with sparse, and this is an inline function, sparse will
174 give tons of false positives. When this is a real functions sparse works.
175 */
176int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
177{
178 int io_allowed;
179
180 atomic_inc(&mdev->local_cnt);
181 io_allowed = (mdev->state.disk >= mins);
182 if (!io_allowed) {
183 if (atomic_dec_and_test(&mdev->local_cnt))
184 wake_up(&mdev->misc_wait);
185 }
186 return io_allowed;
187}
188
189#endif
190
/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
static int tl_init(struct drbd_tconn *tconn)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	tconn->oldest_tle = b;
	tconn->newest_tle = b;
	INIT_LIST_HEAD(&tconn->out_of_sequence_requests);

	return 1;
}

static void tl_cleanup(struct drbd_tconn *tconn)
{
	if (tconn->oldest_tle != tconn->newest_tle)
		conn_err(tconn, "ASSERT FAILED: oldest_tle == newest_tle\n");
	if (!list_empty(&tconn->out_of_sequence_requests))
		conn_err(tconn, "ASSERT FAILED: list_empty(out_of_sequence_requests)\n");
	kfree(tconn->oldest_tle);
	tconn->oldest_tle = NULL;
	kfree(tconn->unused_spare_tle);
	tconn->unused_spare_tle = NULL;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_tconn *tconn, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = tconn->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (tconn->newest_tle != new) {
		tconn->newest_tle->next = new;
		tconn->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_conf *mdev;
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&tconn->req_lock);

	b = tconn->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			 barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
			 barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			 barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, BARRIER_ACKED);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruption of the
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(CONNECTION_LOST_WHILE_PENDING).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, BARRIER_ACKED) above.
	   */
	list_del_init(&b->requests);
	mdev = b->w.mdev;

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(tconn, b);
		if (nob)
			tconn->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore tconn->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		tconn->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&tconn->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&tconn->req_lock);
	conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}

/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
 * RESTART_FROZEN_DISK_IO.
 */
void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = tconn->oldest_tle;
	pn = &tconn->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == RESEND) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(b->w.mdev);
					set_bit(CREATE_BARRIER, &b->w.mdev->flags);
				}

				drbd_queue_work(&tconn->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(b->w.mdev);

			if (b == tconn->newest_tle) {
				/* recycle, but reinit! */
				if (tmp != NULL)
					conn_err(tconn, "ASSERT FAILED tmp == NULL");
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}

/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	struct list_head *le, *tle;
	struct drbd_request *r;
	int minor;

	spin_lock_irq(&tconn->req_lock);

	_tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);

	/* we expect this list to be empty. */
	if (!list_empty(&tconn->out_of_sequence_requests))
		conn_err(tconn, "ASSERT FAILED list_empty(&out_of_sequence_requests)\n");

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &tconn->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, CONNECTION_LOST_WHILE_PENDING);
	}

	/* ensure bit indicating barrier is required is clear */
	idr_for_each_entry(&tconn->volumes, mdev, minor)
		clear_bit(CREATE_BARRIER, &mdev->flags);

	spin_unlock_irq(&tconn->req_lock);
}

void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
	spin_lock_irq(&tconn->req_lock);
	_tl_restart(tconn, what);
	spin_unlock_irq(&tconn->req_lock);
}

static int drbd_thread_setup(void *arg)
{
	struct drbd_thread *thi = (struct drbd_thread *) arg;
	struct drbd_tconn *tconn = thi->tconn;
	unsigned long flags;
	int retval;

	snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
		 thi->name[0], thi->tconn->name);

restart:
	retval = thi->function(thi);

	spin_lock_irqsave(&thi->t_lock, flags);

	/* if the receiver has been "EXITING", the last thing it did
	 * was set the conn state to "StandAlone",
	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
	 * and receiver thread will be "started".
	 * drbd_thread_start needs to set "RESTARTING" in that case.
	 * t_state check and assignment needs to be within the same spinlock,
	 * so either thread_start sees EXITING, and can remap to RESTARTING,
	 * or thread_start see NONE, and can proceed as normal.
	 */

	if (thi->t_state == RESTARTING) {
		conn_info(tconn, "Restarting %s thread\n", thi->name);
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		goto restart;
	}

	thi->task = NULL;
	thi->t_state = NONE;
	smp_mb();
	complete(&thi->stop);
	spin_unlock_irqrestore(&thi->t_lock, flags);

	conn_info(tconn, "Terminating %s\n", current->comm);

	/* Release mod reference taken when thread was started */
	module_put(THIS_MODULE);
	return retval;
}

static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
			     int (*func) (struct drbd_thread *), char *name)
{
	spin_lock_init(&thi->t_lock);
	thi->task    = NULL;
	thi->t_state = NONE;
	thi->function = func;
	thi->tconn = tconn;
	strncpy(thi->name, name, ARRAY_SIZE(thi->name));
}

int drbd_thread_start(struct drbd_thread *thi)
{
	struct drbd_tconn *tconn = thi->tconn;
	struct task_struct *nt;
	unsigned long flags;

	/* is used from state engine doing drbd_thread_stop_nowait,
	 * while holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	switch (thi->t_state) {
	case NONE:
		conn_info(tconn, "Starting %s thread (from %s [%d])\n",
			  thi->name, current->comm, current->pid);

		/* Get ref on module for thread - this is released when thread exits */
		if (!try_module_get(THIS_MODULE)) {
			conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return false;
		}

		init_completion(&thi->stop);
		thi->reset_cpu_mask = 1;
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */

		nt = kthread_create(drbd_thread_setup, (void *) thi,
				    "drbd_%c_%s", thi->name[0], thi->tconn->name);

		if (IS_ERR(nt)) {
			conn_err(tconn, "Couldn't start thread\n");

			module_put(THIS_MODULE);
			return false;
		}
		spin_lock_irqsave(&thi->t_lock, flags);
		thi->task = nt;
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		wake_up_process(nt);
		break;
	case EXITING:
		thi->t_state = RESTARTING;
		conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
			  thi->name, current->comm, current->pid);
		/* fall through */
	case RUNNING:
	case RESTARTING:
	default:
		spin_unlock_irqrestore(&thi->t_lock, flags);
		break;
	}

	return true;
}


void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
	unsigned long flags;

	enum drbd_thread_state ns = restart ? RESTARTING : EXITING;

	/* may be called from state engine, holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	if (thi->t_state == NONE) {
		spin_unlock_irqrestore(&thi->t_lock, flags);
		if (restart)
			drbd_thread_start(thi);
		return;
	}

	if (thi->t_state != ns) {
		if (thi->task == NULL) {
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return;
		}

		thi->t_state = ns;
		smp_mb();
		init_completion(&thi->stop);
		if (thi->task != current)
			force_sig(DRBD_SIGKILL, thi->task);
	}

	spin_unlock_irqrestore(&thi->t_lock, flags);

	if (wait)
		wait_for_completion(&thi->stop);
}

static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
{
	struct drbd_thread *thi =
		task == tconn->receiver.task ? &tconn->receiver :
		task == tconn->asender.task  ? &tconn->asender :
		task == tconn->worker.task   ? &tconn->worker : NULL;

	return thi;
}

char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
{
	struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
	return thi ? thi->name : task->comm;
}

int conn_lowest_minor(struct drbd_tconn *tconn)
{
	int minor = 0;

	if (!idr_get_next(&tconn->volumes, &minor))
		return -1;
	return minor;
}

#ifdef CONFIG_SMP
/**
 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
 * @mdev:	DRBD device.
 *
 * Forces all threads of a device onto the same CPU. This is beneficial for
 * DRBD's performance. May be overwritten by user's configuration.
 */
void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
{
	int ord, cpu;

	/* user override. */
	if (cpumask_weight(tconn->cpu_mask))
		return;

	ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
	for_each_online_cpu(cpu) {
		if (ord-- == 0) {
			cpumask_set_cpu(cpu, tconn->cpu_mask);
			return;
		}
	}
	/* should not be reached */
	cpumask_setall(tconn->cpu_mask);
}

/**
 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
 * @mdev:	DRBD device.
 * @thi:	drbd_thread object
 *
 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
 * prematurely.
 */
void drbd_thread_current_set_cpu(struct drbd_thread *thi)
{
	struct task_struct *p = current;

	if (!thi->reset_cpu_mask)
		return;
	thi->reset_cpu_mask = 0;
	set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
}
#endif

static void prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
{
	h->magic   = cpu_to_be32(DRBD_MAGIC);
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be16(size);
}

static void prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
{
	h->magic   = cpu_to_be16(DRBD_MAGIC_BIG);
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be32(size);
}

static void _prepare_header(struct drbd_tconn *tconn, int vnr, struct p_header *h,
			    enum drbd_packet cmd, int size)
{
	if (tconn->agreed_pro_version >= 100 || size > DRBD_MAX_SIZE_H80_PACKET)
		prepare_header95(&h->h95, cmd, size);
	else
		prepare_header80(&h->h80, cmd, size);
}

static void prepare_header(struct drbd_conf *mdev, struct p_header *h,
			   enum drbd_packet cmd, int size)
{
	_prepare_header(mdev->tconn, mdev->vnr, h, cmd, size);
}

/* the appropriate socket mutex must be held already */
int _conn_send_cmd(struct drbd_tconn *tconn, int vnr, struct socket *sock,
		   enum drbd_packet cmd, struct p_header *h, size_t size,
		   unsigned msg_flags)
{
	int sent, ok;

	_prepare_header(tconn, vnr, h, cmd, size - sizeof(struct p_header));

	sent = drbd_send(tconn, sock, h, size, msg_flags);

	ok = (sent == size);
	if (!ok && !signal_pending(current))
		conn_warn(tconn, "short sent %s size=%d sent=%d\n",
			  cmdname(cmd), (int)size, sent);
	return ok;
}

/* don't pass the socket. we may only look at it
 * when we hold the appropriate socket mutex.
 */
int conn_send_cmd(struct drbd_tconn *tconn, int vnr, int use_data_socket,
		  enum drbd_packet cmd, struct p_header *h, size_t size)
{
	int ok = 0;
	struct socket *sock;

	if (use_data_socket) {
		mutex_lock(&tconn->data.mutex);
		sock = tconn->data.socket;
	} else {
		mutex_lock(&tconn->meta.mutex);
		sock = tconn->meta.socket;
	}

	/* drbd_disconnect() could have called drbd_free_sock()
	 * while we were waiting in down()... */
	if (likely(sock != NULL))
		ok = _conn_send_cmd(tconn, vnr, sock, cmd, h, size, 0);

	if (use_data_socket)
		mutex_unlock(&tconn->data.mutex);
	else
		mutex_unlock(&tconn->meta.mutex);
	return ok;
}

int conn_send_cmd2(struct drbd_tconn *tconn, enum drbd_packet cmd, char *data,
		   size_t size)
{
	struct p_header80 h;
	int ok;

	prepare_header80(&h, cmd, size);

	if (!drbd_get_data_sock(tconn))
		return 0;

	ok = (sizeof(h) ==
		drbd_send(tconn, tconn->data.socket, &h, sizeof(h), 0));
	ok = ok && (size ==
		drbd_send(tconn, tconn->data.socket, data, size, 0));

	drbd_put_data_sock(tconn);

	return ok;
}

int drbd_send_sync_param(struct drbd_conf *mdev)
{
	struct p_rs_param_95 *p;
	struct socket *sock;
	int size, rv;
	const int apv = mdev->tconn->agreed_pro_version;

	size = apv <= 87 ? sizeof(struct p_rs_param)
		: apv == 88 ? sizeof(struct p_rs_param)
			+ strlen(mdev->tconn->net_conf->verify_alg) + 1
		: apv <= 94 ? sizeof(struct p_rs_param_89)
		: /* apv >= 95 */ sizeof(struct p_rs_param_95);

	/* used from admin command context and receiver/worker context.
	 * to avoid kmalloc, grab the socket right here,
	 * then use the pre-allocated sbuf there */
	mutex_lock(&mdev->tconn->data.mutex);
	sock = mdev->tconn->data.socket;

	if (likely(sock != NULL)) {
		enum drbd_packet cmd =
			apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;

		p = &mdev->tconn->data.sbuf.rs_param_95;

		/* initialize verify_alg and csums_alg */
		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);

		if (get_ldev(mdev)) {
			p->rate = cpu_to_be32(mdev->ldev->dc.resync_rate);
			p->c_plan_ahead = cpu_to_be32(mdev->ldev->dc.c_plan_ahead);
			p->c_delay_target = cpu_to_be32(mdev->ldev->dc.c_delay_target);
			p->c_fill_target = cpu_to_be32(mdev->ldev->dc.c_fill_target);
			p->c_max_rate = cpu_to_be32(mdev->ldev->dc.c_max_rate);
			put_ldev(mdev);
		} else {
			p->rate = cpu_to_be32(DRBD_RATE_DEF);
			p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
			p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
			p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
			p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
		}

		if (apv >= 88)
			strcpy(p->verify_alg, mdev->tconn->net_conf->verify_alg);
		if (apv >= 89)
			strcpy(p->csums_alg, mdev->tconn->net_conf->csums_alg);

		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
	} else
		rv = 0; /* not ok */

	mutex_unlock(&mdev->tconn->data.mutex);

	return rv;
}

int drbd_send_protocol(struct drbd_tconn *tconn)
{
	struct p_protocol *p;
	int size, cf, rv;

	size = sizeof(struct p_protocol);

	if (tconn->agreed_pro_version >= 87)
		size += strlen(tconn->net_conf->integrity_alg) + 1;

	/* we must not recurse into our own queue,
	 * as that is blocked during handshake */
	p = kmalloc(size, GFP_NOIO);
	if (p == NULL)
		return 0;

	p->protocol      = cpu_to_be32(tconn->net_conf->wire_protocol);
	p->after_sb_0p   = cpu_to_be32(tconn->net_conf->after_sb_0p);
	p->after_sb_1p   = cpu_to_be32(tconn->net_conf->after_sb_1p);
	p->after_sb_2p   = cpu_to_be32(tconn->net_conf->after_sb_2p);
	p->two_primaries = cpu_to_be32(tconn->net_conf->two_primaries);

	cf = 0;
	if (tconn->net_conf->want_lose)
		cf |= CF_WANT_LOSE;
	if (tconn->net_conf->dry_run) {
		if (tconn->agreed_pro_version >= 92)
			cf |= CF_DRY_RUN;
		else {
			conn_err(tconn, "--dry-run is not supported by peer");
			kfree(p);
			return -1;
		}
	}
	p->conn_flags    = cpu_to_be32(cf);

	if (tconn->agreed_pro_version >= 87)
		strcpy(p->integrity_alg, tconn->net_conf->integrity_alg);

	rv = conn_send_cmd2(tconn, P_PROTOCOL, p->head.payload, size - sizeof(struct p_header));
	kfree(p);
	return rv;
}

887
888int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
889{
890 struct p_uuids p;
891 int i;
892
893 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
894 return 1;
895
896 for (i = UI_CURRENT; i < UI_SIZE; i++)
897 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
898
899 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
900 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
89e58e75 901 uuid_flags |= mdev->tconn->net_conf->want_lose ? 1 : 0;
b411b363
PR
902 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
903 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
904 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
905
906 put_ldev(mdev);
907
c012949a 908 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, &p.head, sizeof(p));
b411b363
PR
909}
910
911int drbd_send_uuids(struct drbd_conf *mdev)
912{
913 return _drbd_send_uuids(mdev, 0);
914}
915
916int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
917{
918 return _drbd_send_uuids(mdev, 8);
919}
920
62b0da3a
LE
921void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
922{
923 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
924 u64 *uuid = mdev->ldev->md.uuid;
925 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
926 text,
927 (unsigned long long)uuid[UI_CURRENT],
928 (unsigned long long)uuid[UI_BITMAP],
929 (unsigned long long)uuid[UI_HISTORY_START],
930 (unsigned long long)uuid[UI_HISTORY_END]);
931 put_ldev(mdev);
932 } else {
933 dev_info(DEV, "%s effective data uuid: %016llX\n",
934 text,
935 (unsigned long long)mdev->ed_uuid);
936 }
937}
938
5a22db89 939int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
940{
941 struct p_rs_uuid p;
5a22db89
LE
942 u64 uuid;
943
944 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 945
4a23f264 946 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
5a22db89 947 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 948 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
949 drbd_md_sync(mdev);
950 p.uuid = cpu_to_be64(uuid);
b411b363 951
c012949a 952 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, &p.head, sizeof(p));
b411b363
PR
953}
954
e89b591c 955int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
956{
957 struct p_sizes p;
958 sector_t d_size, u_size;
99432fcc 959 int q_order_type, max_bio_size;
b411b363
PR
960 int ok;
961
962 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
963 D_ASSERT(mdev->ldev->backing_bdev);
964 d_size = drbd_get_max_capacity(mdev->ldev);
965 u_size = mdev->ldev->dc.disk_size;
966 q_order_type = drbd_queue_order_type(mdev);
99432fcc
PR
967 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
968 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
969 put_ldev(mdev);
970 } else {
971 d_size = 0;
972 u_size = 0;
973 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 974 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
975 }
976
977 p.d_size = cpu_to_be64(d_size);
978 p.u_size = cpu_to_be64(u_size);
979 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 980 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
981 p.queue_order_type = cpu_to_be16(q_order_type);
982 p.dds_flags = cpu_to_be16(flags);
b411b363 983
c012949a 984 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, &p.head, sizeof(p));
b411b363
PR
985 return ok;
986}
987
988/**
989 * drbd_send_state() - Sends the drbd state to the peer
990 * @mdev: DRBD device.
991 */
992int drbd_send_state(struct drbd_conf *mdev)
993{
994 struct socket *sock;
995 struct p_state p;
996 int ok = 0;
997
e42325a5 998 mutex_lock(&mdev->tconn->data.mutex);
b411b363
PR
999
1000 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
e42325a5 1001 sock = mdev->tconn->data.socket;
b411b363
PR
1002
1003 if (likely(sock != NULL)) {
c012949a 1004 ok = _drbd_send_cmd(mdev, sock, P_STATE, &p.head, sizeof(p), 0);
b411b363
PR
1005 }
1006
e42325a5 1007 mutex_unlock(&mdev->tconn->data.mutex);
b411b363 1008
b411b363
PR
1009 return ok;
1010}
1011
cf29c9d8
PR
1012int _conn_send_state_req(struct drbd_tconn *tconn, int vnr, enum drbd_packet cmd,
1013 union drbd_state mask, union drbd_state val)
b411b363
PR
1014{
1015 struct p_req_state p;
1016
1017 p.mask = cpu_to_be32(mask.i);
1018 p.val = cpu_to_be32(val.i);
1019
cf29c9d8 1020 return conn_send_cmd(tconn, vnr, USE_DATA_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
1021}
1022
bf885f8a 1023int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
1024{
1025 struct p_req_state_reply p;
1026
1027 p.retcode = cpu_to_be32(retcode);
1028
c012949a 1029 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, &p.head, sizeof(p));
b411b363
PR
1030}
1031
047cd4a6
PR
1032int conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
1033{
1034 struct p_req_state_reply p;
1035 enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
1036
1037 p.retcode = cpu_to_be32(retcode);
1038
1039 return conn_send_cmd(tconn, 0, USE_META_SOCKET, cmd, &p.head, sizeof(p));
1040}
1041
b411b363
PR
1042int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1043 struct p_compressed_bm *p,
1044 struct bm_xfer_ctx *c)
1045{
1046 struct bitstream bs;
1047 unsigned long plain_bits;
1048 unsigned long tmp;
1049 unsigned long rl;
1050 unsigned len;
1051 unsigned toggle;
1052 int bits;
1053
1054 /* may we use this feature? */
f399002e 1055 if ((mdev->tconn->net_conf->use_rle == 0) ||
31890f4a 1056 (mdev->tconn->agreed_pro_version < 90))
b411b363
PR
1057 return 0;
1058
1059 if (c->bit_offset >= c->bm_bits)
1060 return 0; /* nothing to do. */
1061
1062 /* use at most thus many bytes */
1063 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1064 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1065 /* plain bits covered in this code string */
1066 plain_bits = 0;
1067
1068 /* p->encoding & 0x80 stores whether the first run length is set.
1069 * bit offset is implicit.
1070 * start with toggle == 2 to be able to tell the first iteration */
1071 toggle = 2;
1072
1073 /* see how much plain bits we can stuff into one packet
1074 * using RLE and VLI. */
1075 do {
1076 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1077 : _drbd_bm_find_next(mdev, c->bit_offset);
1078 if (tmp == -1UL)
1079 tmp = c->bm_bits;
1080 rl = tmp - c->bit_offset;
1081
1082 if (toggle == 2) { /* first iteration */
1083 if (rl == 0) {
1084 /* the first checked bit was set,
1085 * store start value, */
1086 DCBP_set_start(p, 1);
1087 /* but skip encoding of zero run length */
1088 toggle = !toggle;
1089 continue;
1090 }
1091 DCBP_set_start(p, 0);
1092 }
1093
1094 /* paranoia: catch zero runlength.
1095 * can only happen if bitmap is modified while we scan it. */
1096 if (rl == 0) {
1097 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1098 "t:%u bo:%lu\n", toggle, c->bit_offset);
1099 return -1;
1100 }
1101
1102 bits = vli_encode_bits(&bs, rl);
1103 if (bits == -ENOBUFS) /* buffer full */
1104 break;
1105 if (bits <= 0) {
1106 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1107 return 0;
1108 }
1109
1110 toggle = !toggle;
1111 plain_bits += rl;
1112 c->bit_offset = tmp;
1113 } while (c->bit_offset < c->bm_bits);
1114
1115 len = bs.cur.b - p->code + !!bs.cur.bit;
1116
1117 if (plain_bits < (len << 3)) {
1118 /* incompressible with this method.
1119 * we need to rewind both word and bit position. */
1120 c->bit_offset -= plain_bits;
1121 bm_xfer_ctx_bit_to_word_offset(c);
1122 c->bit_offset = c->word_offset * BITS_PER_LONG;
1123 return 0;
1124 }
1125
1126 /* RLE + VLI was able to compress it just fine.
1127 * update c->word_offset. */
1128 bm_xfer_ctx_bit_to_word_offset(c);
1129
1130 /* store pad_bits */
1131 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1132
1133 return len;
1134}
1135
/**
 * send_bitmap_rle_or_plain
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
 * code upon failure.
 */
static int
send_bitmap_rle_or_plain(struct drbd_conf *mdev,
			 struct p_header *h, struct bm_xfer_ctx *c)
{
	struct p_compressed_bm *p = (void*)h;
	unsigned long num_words;
	int len;
	int ok;

	len = fill_bitmap_rle_bits(mdev, p, c);

	if (len < 0)
		return -EIO;

	if (len) {
		DCBP_set_code(p, RLE_VLI_Bits);
		ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_COMPRESSED_BITMAP, h,
				    sizeof(*p) + len, 0);

		c->packets[0]++;
		c->bytes[0] += sizeof(*p) + len;

		if (c->bit_offset >= c->bm_bits)
			len = 0; /* DONE */
	} else {
		/* was not compressible.
		 * send a buffer full of plain text bits instead. */
		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
		len = num_words * sizeof(long);
		if (len)
			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
		ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_BITMAP,
				    h, sizeof(struct p_header80) + len, 0);
		c->word_offset += num_words;
		c->bit_offset = c->word_offset * BITS_PER_LONG;

		c->packets[1]++;
		c->bytes[1] += sizeof(struct p_header80) + len;

		if (c->bit_offset > c->bm_bits)
			c->bit_offset = c->bm_bits;
	}
	if (ok) {
		if (len == 0) {
			INFO_bm_xfer_stats(mdev, "send", c);
			return 0;
		} else
			return 1;
	}
	return -EIO;
}

/* See the comment at receive_bitmap() */
int _drbd_send_bitmap(struct drbd_conf *mdev)
{
	struct bm_xfer_ctx c;
	struct p_header *p;
	int err;

	if (!expect(mdev->bitmap))
		return false;

	/* maybe we should use some per thread scratch page,
	 * and allocate that during initial device creation? */
	p = (struct p_header *) __get_free_page(GFP_NOIO);
	if (!p) {
		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
		return false;
	}

	if (get_ldev(mdev)) {
		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
			drbd_bm_set_all(mdev);
			if (drbd_bm_write(mdev)) {
				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
				 * but otherwise process as per normal - need to tell other
				 * side that a full resync is required! */
				dev_err(DEV, "Failed to write bitmap to disk!\n");
			} else {
				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
				drbd_md_sync(mdev);
			}
		}
		put_ldev(mdev);
	}

	c = (struct bm_xfer_ctx) {
		.bm_bits = drbd_bm_bits(mdev),
		.bm_words = drbd_bm_words(mdev),
	};

	do {
		err = send_bitmap_rle_or_plain(mdev, p, &c);
	} while (err > 0);

	free_page((unsigned long) p);
	return err == 0;
}

int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int err;

	if (!drbd_get_data_sock(mdev->tconn))
		return -1;
	err = !_drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev->tconn);
	return err;
}

int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
{
	int ok;
	struct p_barrier_ack p;

	p.barrier  = barrier_nr;
	p.set_size = cpu_to_be32(set_size);

	if (mdev->state.conn < C_CONNECTED)
		return false;
	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, &p.head, sizeof(p));
	return ok;
}

/**
 * _drbd_send_ack() - Sends an ack packet
 * @mdev:	DRBD device.
 * @cmd:	Packet command code.
 * @sector:	sector, needs to be in big endian byte order
 * @blksize:	size in byte, needs to be in big endian byte order
 * @block_id:	Id, big endian byte order
 */
static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
			  u64 sector, u32 blksize, u64 block_id)
{
	int ok;
	struct p_block_ack p;

	p.sector   = sector;
	p.block_id = block_id;
	p.blksize  = blksize;
	p.seq_num  = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));

	if (!mdev->tconn->meta.socket || mdev->state.conn < C_CONNECTED)
		return false;
	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, &p.head, sizeof(p));
	return ok;
}

/* dp->sector and dp->block_id already/still in network byte order,
 * data_size is payload size according to dp->head,
 * and may need to be corrected for digest size. */
int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
		     struct p_data *dp, int data_size)
{
	data_size -= (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
		crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
			      dp->block_id);
}

int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
		     struct p_block_req *rp)
{
	return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
}

/**
 * drbd_send_ack() - Sends an ack packet
 * @mdev:	DRBD device
 * @cmd:	packet command code
 * @peer_req:	peer request
 */
int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
		  struct drbd_peer_request *peer_req)
{
	return _drbd_send_ack(mdev, cmd,
			      cpu_to_be64(peer_req->i.sector),
			      cpu_to_be32(peer_req->i.size),
			      peer_req->block_id);
}

/* This function misuses the block_id field to signal if the blocks
 * are in sync or not. */
int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
		     sector_t sector, int blksize, u64 block_id)
{
	return _drbd_send_ack(mdev, cmd,
			      cpu_to_be64(sector),
			      cpu_to_be32(blksize),
			      cpu_to_be64(block_id));
}

int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
		       sector_t sector, int size, u64 block_id)
{
	int ok;
	struct p_block_req p;

	p.sector   = cpu_to_be64(sector);
	p.block_id = block_id;
	p.blksize  = cpu_to_be32(size);

	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &p.head, sizeof(p));
	return ok;
}

int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
			    void *digest, int digest_size, enum drbd_packet cmd)
{
	int ok;
	struct p_block_req p;

	prepare_header(mdev, &p.head, cmd, sizeof(p) - sizeof(struct p_header) + digest_size);
	p.sector   = cpu_to_be64(sector);
	p.block_id = ID_SYNCER /* unused */;
	p.blksize  = cpu_to_be32(size);

	mutex_lock(&mdev->tconn->data.mutex);

	ok = (sizeof(p) == drbd_send(mdev->tconn, mdev->tconn->data.socket, &p, sizeof(p), 0));
	ok = ok && (digest_size == drbd_send(mdev->tconn, mdev->tconn->data.socket, digest, digest_size, 0));

	mutex_unlock(&mdev->tconn->data.mutex);

	return ok;
}

int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
{
	int ok;
	struct p_block_req p;

	p.sector   = cpu_to_be64(sector);
	p.block_id = ID_SYNCER /* unused */;
	p.blksize  = cpu_to_be32(size);

	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, &p.head, sizeof(p));
	return ok;
}

/* called on sndtimeo
 * returns false if we should retry,
 * true if we think connection is dead
 */
static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
{
	int drop_it;
	/* long elapsed = (long)(jiffies - mdev->last_received); */

	drop_it =   tconn->meta.socket == sock
		|| !tconn->asender.task
		|| get_t_state(&tconn->asender) != RUNNING
		|| tconn->cstate < C_WF_REPORT_PARAMS;

	if (drop_it)
		return true;

	drop_it = !--tconn->ko_count;
	if (!drop_it) {
		conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
			 current->comm, current->pid, tconn->ko_count);
		request_ping(tconn);
	}

	return drop_it; /* && (mdev->state == R_PRIMARY) */;
}

static void drbd_update_congested(struct drbd_tconn *tconn)
{
	struct sock *sk = tconn->data.socket->sk;
	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
		set_bit(NET_CONGESTED, &tconn->flags);
}

/* The idea of sendpage seems to be to put some kind of reference
 * to the page into the skb, and to hand it over to the NIC. In
 * this process get_page() gets called.
 *
 * As soon as the page was really sent over the network put_page()
 * gets called by some part of the network layer. [ NIC driver? ]
 *
 * [ get_page() / put_page() increment/decrement the count. If count
 *   reaches 0 the page will be freed. ]
 *
 * This works nicely with pages from FSs.
 * But this means that in protocol A we might signal IO completion too early!
 *
 * In order not to corrupt data during a resync we must make sure
 * that we do not reuse our own buffer pages (EEs) too early, therefore
 * we have the net_ee list.
 *
 * XFS seems to have problems, still, it submits pages with page_count == 0!
 * As a workaround, we disable sendpage on pages
 * with page_count == 0 or PageSlab.
 */
static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
			      int offset, size_t size, unsigned msg_flags)
{
	int sent = drbd_send(mdev->tconn, mdev->tconn->data.socket, kmap(page) + offset, size, msg_flags);
	kunmap(page);
	if (sent == size)
		mdev->send_cnt += size>>9;
	return sent == size;
}

static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
			   int offset, size_t size, unsigned msg_flags)
{
	mm_segment_t oldfs = get_fs();
	int sent, ok;
	int len = size;

	/* e.g. XFS meta- & log-data is in slab pages, which have a
	 * page_count of 0 and/or have PageSlab() set.
	 * we cannot use send_page for those, as that does get_page();
	 * put_page(); and would cause either a VM_BUG directly, or
	 * __page_cache_release a page that would actually still be referenced
	 * by someone, leading to some obscure delayed Oops somewhere else. */
	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
		return _drbd_no_send_page(mdev, page, offset, size, msg_flags);

	msg_flags |= MSG_NOSIGNAL;
	drbd_update_congested(mdev->tconn);
	set_fs(KERNEL_DS);
	do {
		sent = mdev->tconn->data.socket->ops->sendpage(mdev->tconn->data.socket, page,
							       offset, len,
							       msg_flags);
		if (sent == -EAGAIN) {
			if (we_should_drop_the_connection(mdev->tconn,
							  mdev->tconn->data.socket))
				break;
			else
				continue;
		}
		if (sent <= 0) {
			dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
				 __func__, (int)size, len, sent);
			break;
		}
		len    -= sent;
		offset += sent;
	} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
	set_fs(oldfs);
	clear_bit(NET_CONGESTED, &mdev->tconn->flags);

	ok = (len == 0);
	if (likely(ok))
		mdev->send_cnt += size>>9;
	return ok;
}

static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
{
	struct bio_vec *bvec;
	int i;
	/* hint all but last page with MSG_MORE */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (!_drbd_no_send_page(mdev, bvec->bv_page,
					bvec->bv_offset, bvec->bv_len,
					i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
			return 0;
	}
	return 1;
}

static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
{
	struct bio_vec *bvec;
	int i;
	/* hint all but last page with MSG_MORE */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (!_drbd_send_page(mdev, bvec->bv_page,
				     bvec->bv_offset, bvec->bv_len,
				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
			return 0;
	}
	return 1;
}

static int _drbd_send_zc_ee(struct drbd_conf *mdev,
			    struct drbd_peer_request *peer_req)
{
	struct page *page = peer_req->pages;
	unsigned len = peer_req->i.size;

	/* hint all but last page with MSG_MORE */
	page_chain_for_each(page) {
		unsigned l = min_t(unsigned, len, PAGE_SIZE);
		if (!_drbd_send_page(mdev, page, 0, l,
				     page_chain_next(page) ? MSG_MORE : 0))
			return 0;
		len -= l;
	}
	return 1;
}

static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
{
	if (mdev->tconn->agreed_pro_version >= 95)
		return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
			(bi_rw & REQ_FUA ? DP_FUA : 0) |
			(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
			(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
	else
		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
}

b411b363
PR
1552/* Used to send write requests
1553 * R_PRIMARY -> Peer (P_DATA)
1554 */
1555int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
1556{
1557 int ok = 1;
1558 struct p_data p;
1559 unsigned int dp_flags = 0;
1560 void *dgb;
1561 int dgs;
1562
61120870 1563 if (!drbd_get_data_sock(mdev->tconn))
b411b363
PR
1564 return 0;
1565
a0638456
PR
1566 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1567 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1568
fd340c12 1569 prepare_header(mdev, &p.head, P_DATA, sizeof(p) - sizeof(struct p_header) + dgs + req->i.size);
ace652ac 1570 p.sector = cpu_to_be64(req->i.sector);
b411b363 1571 p.block_id = (unsigned long)req;
8ccf218e 1572 p.seq_num = cpu_to_be32(req->seq_num = atomic_inc_return(&mdev->packet_seq));
b411b363 1573
76d2e7ec
PR
1574 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
1575
b411b363
PR
1576 if (mdev->state.conn >= C_SYNC_SOURCE &&
1577 mdev->state.conn <= C_PAUSED_SYNC_T)
1578 dp_flags |= DP_MAY_SET_IN_SYNC;
1579
1580 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
1581 set_bit(UNPLUG_REMOTE, &mdev->flags);
1582 ok = (sizeof(p) ==
bedbd2a5 1583 drbd_send(mdev->tconn, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363 1584 if (ok && dgs) {
a0638456
PR
1585 dgb = mdev->tconn->int_dig_out;
1586 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, dgb);
bedbd2a5 1587 ok = dgs == drbd_send(mdev->tconn, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
1588 }
1589 if (ok) {
470be44a
LE
1590 /* For protocol A, we have to memcpy the payload into
1591 * socket buffers, as we may complete right away
1592 * as soon as we handed it over to tcp, at which point the data
1593 * pages may become invalid.
1594 *
1595 * For data-integrity enabled, we copy it as well, so we can be
1596 * sure that even if the bio pages may still be modified, it
1597 * won't change the data on the wire, thus if the digest checks
1598 * out ok after sending on this side, but does not fit on the
1599 * receiving side, we sure have detected corruption elsewhere.
1600 */
89e58e75 1601 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
1602 ok = _drbd_send_bio(mdev, req->master_bio);
1603 else
1604 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
1605
1606 /* double check digest, sometimes buffers have been modified in flight. */
1607 if (dgs > 0 && dgs <= 64) {
24c4830c 1608 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
1609 * currently supported in kernel crypto. */
1610 unsigned char digest[64];
a0638456
PR
1611 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, digest);
1612 if (memcmp(mdev->tconn->int_dig_out, digest, dgs)) {
470be44a
LE
1613 dev_warn(DEV,
1614 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
ace652ac 1615 (unsigned long long)req->i.sector, req->i.size);
470be44a
LE
1616 }
1617 } /* else if (dgs > 64) {
1618 ... Be noisy about digest too large ...
1619 } */
b411b363
PR
1620 }
1621
61120870 1622 drbd_put_data_sock(mdev->tconn);
bd26bfc5 1623
b411b363
PR
1624 return ok;
1625}
1626
1627/* answer packet, used to send data back for read requests:
1628 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1629 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1630 */
d8763023 1631int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1632 struct drbd_peer_request *peer_req)
b411b363
PR
1633{
1634 int ok;
1635 struct p_data p;
1636 void *dgb;
1637 int dgs;
1638
a0638456
PR
1639 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1640 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1641
db830c46
AG
1642 prepare_header(mdev, &p.head, cmd, sizeof(p) -
1643 sizeof(struct p_header80) +
1644 dgs + peer_req->i.size);
1645 p.sector = cpu_to_be64(peer_req->i.sector);
1646 p.block_id = peer_req->block_id;
cc378270 1647 p.seq_num = 0; /* unused */
b411b363
PR
1648
1649 /* Only called by our kernel thread.
1650 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
1651 * in response to an admin command or module unload.
1652 */
61120870 1653 if (!drbd_get_data_sock(mdev->tconn))
b411b363
PR
1654 return 0;
1655
bedbd2a5 1656 ok = sizeof(p) == drbd_send(mdev->tconn, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363 1657 if (ok && dgs) {
a0638456 1658 dgb = mdev->tconn->int_dig_out;
db830c46 1659 drbd_csum_ee(mdev, mdev->tconn->integrity_w_tfm, peer_req, dgb);
bedbd2a5 1660 ok = dgs == drbd_send(mdev->tconn, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
1661 }
1662 if (ok)
db830c46 1663 ok = _drbd_send_zc_ee(mdev, peer_req);
b411b363 1664
61120870 1665 drbd_put_data_sock(mdev->tconn);
bd26bfc5 1666
b411b363
PR
1667 return ok;
1668}
1669
73a01a18
PR
1670int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
1671{
1672 struct p_block_desc p;
1673
ace652ac
AG
1674 p.sector = cpu_to_be64(req->i.sector);
1675 p.blksize = cpu_to_be32(req->i.size);
73a01a18
PR
1676
1677 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
1678}
1679
b411b363
PR
1680/*
1681 drbd_send distinguishes two cases:
1682
1683 Packets sent via the data socket "sock"
1684 and packets sent via the meta data socket "msock"
1685
1686 sock msock
1687 -----------------+-------------------------+------------------------------
1688 timeout conf.timeout / 2 conf.timeout / 2
1689 timeout action send a ping via msock Abort communication
1690 and close all sockets
1691*/
1692
1693/*
1694 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1695 */
bedbd2a5 1696int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
b411b363
PR
1697 void *buf, size_t size, unsigned msg_flags)
1698{
1699 struct kvec iov;
1700 struct msghdr msg;
1701 int rv, sent = 0;
1702
1703 if (!sock)
1704 return -1000;
1705
1706 /* THINK if (signal_pending) return ... ? */
1707
1708 iov.iov_base = buf;
1709 iov.iov_len = size;
1710
1711 msg.msg_name = NULL;
1712 msg.msg_namelen = 0;
1713 msg.msg_control = NULL;
1714 msg.msg_controllen = 0;
1715 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
1716
bedbd2a5
PR
1717 if (sock == tconn->data.socket) {
1718 tconn->ko_count = tconn->net_conf->ko_count;
1719 drbd_update_congested(tconn);
b411b363
PR
1720 }
1721 do {
1722 /* STRANGE
1723 * tcp_sendmsg does _not_ use its size parameter at all ?
1724 *
1725 * -EAGAIN on timeout, -EINTR on signal.
1726 */
1727/* THINK
1728 * do we need to block DRBD_SIG if sock == &meta.socket ??
1729 * otherwise wake_asender() might interrupt some send_*Ack !
1730 */
1731 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1732 if (rv == -EAGAIN) {
bedbd2a5 1733 if (we_should_drop_the_connection(tconn, sock))
b411b363
PR
1734 break;
1735 else
1736 continue;
1737 }
b411b363
PR
1738 if (rv == -EINTR) {
1739 flush_signals(current);
1740 rv = 0;
1741 }
1742 if (rv < 0)
1743 break;
1744 sent += rv;
1745 iov.iov_base += rv;
1746 iov.iov_len -= rv;
1747 } while (sent < size);
1748
bedbd2a5
PR
1749 if (sock == tconn->data.socket)
1750 clear_bit(NET_CONGESTED, &tconn->flags);
b411b363
PR
1751
1752 if (rv <= 0) {
1753 if (rv != -EAGAIN) {
bedbd2a5
PR
1754 conn_err(tconn, "%s_sendmsg returned %d\n",
1755 sock == tconn->meta.socket ? "msock" : "sock",
1756 rv);
bbeb641c 1757 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363 1758 } else
bbeb641c 1759 conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
b411b363
PR
1760 }
1761
1762 return sent;
1763}
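/*
 * Illustrative sketch (not part of the driver): the bare "send until done"
 * loop that drbd_send() above builds on, stripped of the congestion
 * accounting and connection state handling.  Returns bytes sent, or a
 * negative error.  Assumes the includes already present in this file.
 */
static int sendall_sketch(struct socket *sock, void *buf, size_t size)
{
	struct kvec iov = { .iov_base = buf, .iov_len = size };
	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
	int sent = 0;

	while (sent < size) {
		int rv = kernel_sendmsg(sock, &msg, &iov, 1, size - sent);
		if (rv == -EINTR)
			continue;	/* a real caller would also flush signals */
		if (rv <= 0)
			return rv;	/* -EAGAIN on timeout, or a hard error */
		sent += rv;
		iov.iov_base += rv;
		iov.iov_len -= rv;
	}
	return sent;
}
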
1764
1765static int drbd_open(struct block_device *bdev, fmode_t mode)
1766{
1767 struct drbd_conf *mdev = bdev->bd_disk->private_data;
1768 unsigned long flags;
1769 int rv = 0;
1770
2a48fc0a 1771 mutex_lock(&drbd_main_mutex);
87eeee41 1772 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
1773 /* to have a stable mdev->state.role
1774 * and no race with updating open_cnt */
1775
1776 if (mdev->state.role != R_PRIMARY) {
1777 if (mode & FMODE_WRITE)
1778 rv = -EROFS;
1779 else if (!allow_oos)
1780 rv = -EMEDIUMTYPE;
1781 }
1782
1783 if (!rv)
1784 mdev->open_cnt++;
87eeee41 1785 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
2a48fc0a 1786 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1787
1788 return rv;
1789}
1790
1791static int drbd_release(struct gendisk *gd, fmode_t mode)
1792{
1793 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 1794 mutex_lock(&drbd_main_mutex);
b411b363 1795 mdev->open_cnt--;
2a48fc0a 1796 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1797 return 0;
1798}
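/*
 * Illustrative sketch (not part of the driver): how drbd_open()/drbd_release()
 * above are typically wired into a block_device_operations table.  The
 * driver's real table, drbd_ops, is defined elsewhere in this file and is
 * what conn_new_minor() assigns to disk->fops further down.
 */
static const struct block_device_operations drbd_ops_sketch = {
	.owner   = THIS_MODULE,
	.open    = drbd_open,
	.release = drbd_release,
};
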
1799
b411b363
PR
1800static void drbd_set_defaults(struct drbd_conf *mdev)
1801{
f399002e
LE
1802 /* Beware! The actual layout differs
1803 * between big endian and little endian */
b411b363
PR
1804 mdev->state = (union drbd_state) {
1805 { .role = R_SECONDARY,
1806 .peer = R_UNKNOWN,
1807 .conn = C_STANDALONE,
1808 .disk = D_DISKLESS,
1809 .pdsk = D_UNKNOWN,
fb22c402
PR
1810 .susp = 0,
1811 .susp_nod = 0,
1812 .susp_fen = 0
b411b363
PR
1813 } };
1814}
1815
1816void drbd_init_set_defaults(struct drbd_conf *mdev)
1817{
1818 /* the memset(,0,) did most of this.
1819 * note: only assignments, no allocation in here */
1820
1821 drbd_set_defaults(mdev);
1822
b411b363
PR
1823 atomic_set(&mdev->ap_bio_cnt, 0);
1824 atomic_set(&mdev->ap_pending_cnt, 0);
1825 atomic_set(&mdev->rs_pending_cnt, 0);
1826 atomic_set(&mdev->unacked_cnt, 0);
1827 atomic_set(&mdev->local_cnt, 0);
b411b363 1828 atomic_set(&mdev->pp_in_use, 0);
435f0740 1829 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 1830 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 1831 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 1832 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
1833
1834 mutex_init(&mdev->md_io_mutex);
8410da8f
PR
1835 mutex_init(&mdev->own_state_mutex);
1836 mdev->state_mutex = &mdev->own_state_mutex;
b411b363 1837
b411b363 1838 spin_lock_init(&mdev->al_lock);
b411b363
PR
1839 spin_lock_init(&mdev->peer_seq_lock);
1840 spin_lock_init(&mdev->epoch_lock);
1841
1842 INIT_LIST_HEAD(&mdev->active_ee);
1843 INIT_LIST_HEAD(&mdev->sync_ee);
1844 INIT_LIST_HEAD(&mdev->done_ee);
1845 INIT_LIST_HEAD(&mdev->read_ee);
1846 INIT_LIST_HEAD(&mdev->net_ee);
1847 INIT_LIST_HEAD(&mdev->resync_reads);
b411b363
PR
1848 INIT_LIST_HEAD(&mdev->resync_work.list);
1849 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 1850 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 1851 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 1852 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 1853 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 1854
794abb75 1855 mdev->resync_work.cb = w_resync_timer;
b411b363 1856 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 1857 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
1858 mdev->md_sync_work.cb = w_md_sync;
1859 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 1860 mdev->start_resync_work.cb = w_start_resync;
a21e9298
PR
1861
1862 mdev->resync_work.mdev = mdev;
1863 mdev->unplug_work.mdev = mdev;
1864 mdev->go_diskless.mdev = mdev;
1865 mdev->md_sync_work.mdev = mdev;
1866 mdev->bm_io_work.w.mdev = mdev;
1867 mdev->start_resync_work.mdev = mdev;
1868
b411b363
PR
1869 init_timer(&mdev->resync_timer);
1870 init_timer(&mdev->md_sync_timer);
370a43e7 1871 init_timer(&mdev->start_resync_timer);
7fde2be9 1872 init_timer(&mdev->request_timer);
b411b363
PR
1873 mdev->resync_timer.function = resync_timer_fn;
1874 mdev->resync_timer.data = (unsigned long) mdev;
1875 mdev->md_sync_timer.function = md_sync_timer_fn;
1876 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
1877 mdev->start_resync_timer.function = start_resync_timer_fn;
1878 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
1879 mdev->request_timer.function = request_timer_fn;
1880 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
1881
1882 init_waitqueue_head(&mdev->misc_wait);
1883 init_waitqueue_head(&mdev->state_wait);
1884 init_waitqueue_head(&mdev->ee_wait);
1885 init_waitqueue_head(&mdev->al_wait);
1886 init_waitqueue_head(&mdev->seq_wait);
1887
fd340c12 1888 /* mdev->tconn->agreed_pro_version gets initialized in drbd_connect() */
2451fc3b 1889 mdev->write_ordering = WO_bdev_flush;
b411b363 1890 mdev->resync_wenr = LC_FREE;
99432fcc
PR
1891 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
1892 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
1893}
1894
1895void drbd_mdev_cleanup(struct drbd_conf *mdev)
1896{
1d7734a0 1897 int i;
e6b3ea83 1898 if (mdev->tconn->receiver.t_state != NONE)
b411b363 1899 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
e6b3ea83 1900 mdev->tconn->receiver.t_state);
b411b363
PR
1901
1902 /* no need to lock it, I'm the only thread alive */
1903 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
1904 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
1905 mdev->al_writ_cnt =
1906 mdev->bm_writ_cnt =
1907 mdev->read_cnt =
1908 mdev->recv_cnt =
1909 mdev->send_cnt =
1910 mdev->writ_cnt =
1911 mdev->p_size =
1912 mdev->rs_start =
1913 mdev->rs_total =
1d7734a0
LE
1914 mdev->rs_failed = 0;
1915 mdev->rs_last_events = 0;
0f0601f4 1916 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
1917 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1918 mdev->rs_mark_left[i] = 0;
1919 mdev->rs_mark_time[i] = 0;
1920 }
89e58e75 1921 D_ASSERT(mdev->tconn->net_conf == NULL);
b411b363
PR
1922
1923 drbd_set_my_capacity(mdev, 0);
1924 if (mdev->bitmap) {
1925 /* maybe never allocated. */
02d9a94b 1926 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
1927 drbd_bm_cleanup(mdev);
1928 }
1929
1930 drbd_free_resources(mdev);
0778286a 1931 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
1932
1933 /*
1934 * currently we call drbd_init_ee only on module load, so
1935 * we may call drbd_release_ee only on module unload!
1936 */
1937 D_ASSERT(list_empty(&mdev->active_ee));
1938 D_ASSERT(list_empty(&mdev->sync_ee));
1939 D_ASSERT(list_empty(&mdev->done_ee));
1940 D_ASSERT(list_empty(&mdev->read_ee));
1941 D_ASSERT(list_empty(&mdev->net_ee));
1942 D_ASSERT(list_empty(&mdev->resync_reads));
e42325a5
PR
1943 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
1944 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
b411b363
PR
1945 D_ASSERT(list_empty(&mdev->resync_work.list));
1946 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 1947 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
1948
1949 drbd_set_defaults(mdev);
b411b363
PR
1950}
1951
1952
1953static void drbd_destroy_mempools(void)
1954{
1955 struct page *page;
1956
1957 while (drbd_pp_pool) {
1958 page = drbd_pp_pool;
1959 drbd_pp_pool = (struct page *)page_private(page);
1960 __free_page(page);
1961 drbd_pp_vacant--;
1962 }
1963
1964 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
1965
da4a75d2
LE
1966 if (drbd_md_io_bio_set)
1967 bioset_free(drbd_md_io_bio_set);
35abf594
LE
1968 if (drbd_md_io_page_pool)
1969 mempool_destroy(drbd_md_io_page_pool);
b411b363
PR
1970 if (drbd_ee_mempool)
1971 mempool_destroy(drbd_ee_mempool);
1972 if (drbd_request_mempool)
1973 mempool_destroy(drbd_request_mempool);
1974 if (drbd_ee_cache)
1975 kmem_cache_destroy(drbd_ee_cache);
1976 if (drbd_request_cache)
1977 kmem_cache_destroy(drbd_request_cache);
1978 if (drbd_bm_ext_cache)
1979 kmem_cache_destroy(drbd_bm_ext_cache);
1980 if (drbd_al_ext_cache)
1981 kmem_cache_destroy(drbd_al_ext_cache);
1982
da4a75d2 1983 drbd_md_io_bio_set = NULL;
35abf594 1984 drbd_md_io_page_pool = NULL;
b411b363
PR
1985 drbd_ee_mempool = NULL;
1986 drbd_request_mempool = NULL;
1987 drbd_ee_cache = NULL;
1988 drbd_request_cache = NULL;
1989 drbd_bm_ext_cache = NULL;
1990 drbd_al_ext_cache = NULL;
1991
1992 return;
1993}
1994
1995static int drbd_create_mempools(void)
1996{
1997 struct page *page;
1816a2b4 1998 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
1999 int i;
2000
2001 /* prepare our caches and mempools */
2002 drbd_request_mempool = NULL;
2003 drbd_ee_cache = NULL;
2004 drbd_request_cache = NULL;
2005 drbd_bm_ext_cache = NULL;
2006 drbd_al_ext_cache = NULL;
2007 drbd_pp_pool = NULL;
35abf594 2008 drbd_md_io_page_pool = NULL;
da4a75d2 2009 drbd_md_io_bio_set = NULL;
b411b363
PR
2010
2011 /* caches */
2012 drbd_request_cache = kmem_cache_create(
2013 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2014 if (drbd_request_cache == NULL)
2015 goto Enomem;
2016
2017 drbd_ee_cache = kmem_cache_create(
f6ffca9f 2018 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
b411b363
PR
2019 if (drbd_ee_cache == NULL)
2020 goto Enomem;
2021
2022 drbd_bm_ext_cache = kmem_cache_create(
2023 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2024 if (drbd_bm_ext_cache == NULL)
2025 goto Enomem;
2026
2027 drbd_al_ext_cache = kmem_cache_create(
2028 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2029 if (drbd_al_ext_cache == NULL)
2030 goto Enomem;
2031
2032 /* mempools */
da4a75d2
LE
2033 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
2034 if (drbd_md_io_bio_set == NULL)
2035 goto Enomem;
2036
35abf594
LE
2037 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
2038 if (drbd_md_io_page_pool == NULL)
2039 goto Enomem;
2040
b411b363
PR
2041 drbd_request_mempool = mempool_create(number,
2042 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2043 if (drbd_request_mempool == NULL)
2044 goto Enomem;
2045
2046 drbd_ee_mempool = mempool_create(number,
2047 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 2048 if (drbd_ee_mempool == NULL)
b411b363
PR
2049 goto Enomem;
2050
2051 /* drbd's page pool */
2052 spin_lock_init(&drbd_pp_lock);
2053
2054 for (i = 0; i < number; i++) {
2055 page = alloc_page(GFP_HIGHUSER);
2056 if (!page)
2057 goto Enomem;
2058 set_page_private(page, (unsigned long)drbd_pp_pool);
2059 drbd_pp_pool = page;
2060 }
2061 drbd_pp_vacant = number;
2062
2063 return 0;
2064
2065Enomem:
2066 drbd_destroy_mempools(); /* in case we allocated some */
2067 return -ENOMEM;
2068}
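/*
 * Illustrative sketch (not part of the driver): consuming the request
 * mempool set up above.  The pool guarantees a minimum of "number"
 * preallocated objects, so the writeout path cannot deadlock under memory
 * pressure; GFP_NOIO keeps the allocation from recursing into the I/O path.
 * The real allocation sites live outside this file.
 */
static struct drbd_request *alloc_request_sketch(void)
{
	struct drbd_request *req = mempool_alloc(drbd_request_mempool, GFP_NOIO);

	if (req)
		memset(req, 0, sizeof(*req));
	return req;	/* released later via mempool_free(req, drbd_request_mempool) */
}
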
2069
2070static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2071 void *unused)
2072{
2073 /* Just so we have it. You never know what interesting things we
2074 * might want to do here some day...
2075 */
2076
2077 return NOTIFY_DONE;
2078}
2079
2080static struct notifier_block drbd_notifier = {
2081 .notifier_call = drbd_notify_sys,
2082};
2083
2084static void drbd_release_ee_lists(struct drbd_conf *mdev)
2085{
2086 int rr;
2087
2088 rr = drbd_release_ee(mdev, &mdev->active_ee);
2089 if (rr)
2090 dev_err(DEV, "%d EEs in active list found!\n", rr);
2091
2092 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2093 if (rr)
2094 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2095
2096 rr = drbd_release_ee(mdev, &mdev->read_ee);
2097 if (rr)
2098 dev_err(DEV, "%d EEs in read list found!\n", rr);
2099
2100 rr = drbd_release_ee(mdev, &mdev->done_ee);
2101 if (rr)
2102 dev_err(DEV, "%d EEs in done list found!\n", rr);
2103
2104 rr = drbd_release_ee(mdev, &mdev->net_ee);
2105 if (rr)
2106 dev_err(DEV, "%d EEs in net list found!\n", rr);
2107}
2108
774b3055
PR
2109/* caution. no locking. */
2110void drbd_delete_device(unsigned int minor)
b411b363
PR
2111{
2112 struct drbd_conf *mdev = minor_to_mdev(minor);
2113
2114 if (!mdev)
2115 return;
2116
569083c0
LE
2117 idr_remove(&mdev->tconn->volumes, mdev->vnr);
2118 idr_remove(&minors, minor);
2119 synchronize_rcu();
774b3055 2120
b411b363 2121 /* paranoia asserts */
70dc65e1 2122 D_ASSERT(mdev->open_cnt == 0);
e42325a5 2123 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
b411b363
PR
2124 /* end paranoia asserts */
2125
2126 del_gendisk(mdev->vdisk);
2127
2128 /* cleanup stuff that may have been allocated during
2129 * device (re-)configuration or state changes */
2130
2131 if (mdev->this_bdev)
2132 bdput(mdev->this_bdev);
2133
2134 drbd_free_resources(mdev);
2135
2136 drbd_release_ee_lists(mdev);
2137
b411b363
PR
2138 lc_destroy(mdev->act_log);
2139 lc_destroy(mdev->resync);
2140
2141 kfree(mdev->p_uuid);
2142 /* mdev->p_uuid = NULL; */
2143
b411b363
PR
2144 /* cleanup the rest that has been
2145 * allocated from drbd_new_device
2146 * and actually free the mdev itself */
2147 drbd_free_mdev(mdev);
2148}
2149
2150static void drbd_cleanup(void)
2151{
2152 unsigned int i;
81a5d60e 2153 struct drbd_conf *mdev;
b411b363
PR
2154
2155 unregister_reboot_notifier(&drbd_notifier);
2156
17a93f30
LE
2157 /* first remove proc,
2158 * drbdsetup uses its presence to detect
2159 * whether DRBD is loaded.
2160 * If we were to get stuck in proc removal,
2161 * but had netlink already deregistered,
2162 * some drbdsetup commands might wait forever
2163 * for an answer.
2164 */
2165 if (drbd_proc)
2166 remove_proc_entry("drbd", NULL);
2167
3b98c0c2 2168 drbd_genl_unregister();
b411b363 2169
81a5d60e
PR
2170 idr_for_each_entry(&minors, mdev, i)
2171 drbd_delete_device(i);
2172 drbd_destroy_mempools();
b411b363
PR
2173 unregister_blkdev(DRBD_MAJOR, "drbd");
2174
81a5d60e
PR
2175 idr_destroy(&minors);
2176
b411b363
PR
2177 printk(KERN_INFO "drbd: module cleanup done.\n");
2178}
2179
2180/**
2181 * drbd_congested() - Callback for pdflush
2182 * @congested_data: User data
2183 * @bdi_bits: Bits pdflush is currently interested in
2184 *
2185 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2186 */
2187static int drbd_congested(void *congested_data, int bdi_bits)
2188{
2189 struct drbd_conf *mdev = congested_data;
2190 struct request_queue *q;
2191 char reason = '-';
2192 int r = 0;
2193
1b881ef7 2194 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
2195 /* DRBD has frozen IO */
2196 r = bdi_bits;
2197 reason = 'd';
2198 goto out;
2199 }
2200
2201 if (get_ldev(mdev)) {
2202 q = bdev_get_queue(mdev->ldev->backing_bdev);
2203 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2204 put_ldev(mdev);
2205 if (r)
2206 reason = 'b';
2207 }
2208
01a311a5 2209 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
b411b363
PR
2210 r |= (1 << BDI_async_congested);
2211 reason = reason == 'b' ? 'a' : 'n';
2212 }
2213
2214out:
2215 mdev->congestion_reason = reason;
2216 return r;
2217}
2218
6699b655
PR
2219static void drbd_init_workqueue(struct drbd_work_queue* wq)
2220{
2221 sema_init(&wq->s, 0);
2222 spin_lock_init(&wq->q_lock);
2223 INIT_LIST_HEAD(&wq->q);
2224}
2225
1aba4d7f
PR
2226struct drbd_tconn *conn_by_name(const char *name)
2227{
2228 struct drbd_tconn *tconn;
2229
3b98c0c2
LE
2230 if (!name || !name[0])
2231 return NULL;
2232
543cc10b 2233 mutex_lock(&drbd_cfg_mutex);
1aba4d7f
PR
2234 list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
2235 if (!strcmp(tconn->name, name))
2236 goto found;
2237 }
2238 tconn = NULL;
2239found:
543cc10b 2240 mutex_unlock(&drbd_cfg_mutex);
1aba4d7f
PR
2241 return tconn;
2242}
2243
3b98c0c2 2244struct drbd_tconn *drbd_new_tconn(const char *name)
2111438b
PR
2245{
2246 struct drbd_tconn *tconn;
2247
2248 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
2249 if (!tconn)
2250 return NULL;
2251
2252 tconn->name = kstrdup(name, GFP_KERNEL);
2253 if (!tconn->name)
2254 goto fail;
2255
774b3055
PR
2256 if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
2257 goto fail;
2258
2f5cdd0b
PR
2259 if (!tl_init(tconn))
2260 goto fail;
2261
bbeb641c 2262 tconn->cstate = C_STANDALONE;
8410da8f 2263 mutex_init(&tconn->cstate_mutex);
6699b655 2264 spin_lock_init(&tconn->req_lock);
b2fb6dbe
PR
2265 atomic_set(&tconn->net_cnt, 0);
2266 init_waitqueue_head(&tconn->net_cnt_wait);
2a67d8b9 2267 init_waitqueue_head(&tconn->ping_wait);
062e879c 2268 idr_init(&tconn->volumes);
b2fb6dbe 2269
6699b655
PR
2270 drbd_init_workqueue(&tconn->data.work);
2271 mutex_init(&tconn->data.mutex);
2272
2273 drbd_init_workqueue(&tconn->meta.work);
2274 mutex_init(&tconn->meta.mutex);
2275
392c8801
PR
2276 drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
2277 drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
2278 drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
2279
f399002e
LE
2280 tconn->res_opts = (struct res_opts) {
2281 {}, 0, /* cpu_mask */
2282 DRBD_ON_NO_DATA_DEF, /* on_no_data */
2283 };
2284
543cc10b
LE
2285 mutex_lock(&drbd_cfg_mutex);
2286 list_add_tail(&tconn->all_tconn, &drbd_tconns);
2287 mutex_unlock(&drbd_cfg_mutex);
2111438b
PR
2288
2289 return tconn;
2290
2291fail:
2f5cdd0b 2292 tl_cleanup(tconn);
774b3055 2293 free_cpumask_var(tconn->cpu_mask);
2111438b
PR
2294 kfree(tconn->name);
2295 kfree(tconn);
2296
2297 return NULL;
2298}
2299
2300void drbd_free_tconn(struct drbd_tconn *tconn)
2301{
2111438b 2302 list_del(&tconn->all_tconn);
062e879c 2303 idr_destroy(&tconn->volumes);
2111438b 2304
774b3055 2305 free_cpumask_var(tconn->cpu_mask);
2111438b 2306 kfree(tconn->name);
b42a70ad
PR
2307 kfree(tconn->int_dig_out);
2308 kfree(tconn->int_dig_in);
2309 kfree(tconn->int_dig_vv);
2111438b
PR
2310 kfree(tconn);
2311}
2312
774b3055 2313enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
b411b363
PR
2314{
2315 struct drbd_conf *mdev;
2316 struct gendisk *disk;
2317 struct request_queue *q;
774b3055 2318 int vnr_got = vnr;
81a5d60e 2319 int minor_got = minor;
8432b314 2320 enum drbd_ret_code err = ERR_NOMEM;
774b3055
PR
2321
2322 mdev = minor_to_mdev(minor);
2323 if (mdev)
2324 return ERR_MINOR_EXISTS;
b411b363
PR
2325
2326 /* GFP_KERNEL, we are outside of all write-out paths */
2327 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2328 if (!mdev)
774b3055
PR
2329 return ERR_NOMEM;
2330
2331 mdev->tconn = tconn;
b411b363 2332 mdev->minor = minor;
3b98c0c2 2333 mdev->vnr = vnr;
b411b363
PR
2334
2335 drbd_init_set_defaults(mdev);
2336
2337 q = blk_alloc_queue(GFP_KERNEL);
2338 if (!q)
2339 goto out_no_q;
2340 mdev->rq_queue = q;
2341 q->queuedata = mdev;
b411b363
PR
2342
2343 disk = alloc_disk(1);
2344 if (!disk)
2345 goto out_no_disk;
2346 mdev->vdisk = disk;
2347
81e84650 2348 set_disk_ro(disk, true);
b411b363
PR
2349
2350 disk->queue = q;
2351 disk->major = DRBD_MAJOR;
2352 disk->first_minor = minor;
2353 disk->fops = &drbd_ops;
2354 sprintf(disk->disk_name, "drbd%d", minor);
2355 disk->private_data = mdev;
2356
2357 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2358 /* we have no partitions. we contain only ourselves. */
2359 mdev->this_bdev->bd_contains = mdev->this_bdev;
2360
2361 q->backing_dev_info.congested_fn = drbd_congested;
2362 q->backing_dev_info.congested_data = mdev;
2363
2f58dcfc 2364 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
2365 /* Setting max_hw_sectors to an odd value of 8 KiByte here;
2366 this triggers a max_bio_size message upon first attach or connect */
2367 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
2368 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
2369 blk_queue_merge_bvec(q, drbd_merge_bvec);
87eeee41 2370 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
b411b363
PR
2371
2372 mdev->md_io_page = alloc_page(GFP_KERNEL);
2373 if (!mdev->md_io_page)
2374 goto out_no_io_page;
2375
2376 if (drbd_bm_init(mdev))
2377 goto out_no_bitmap;
dac1389c 2378 mdev->read_requests = RB_ROOT;
de696716 2379 mdev->write_requests = RB_ROOT;
b411b363 2380
b411b363
PR
2381 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2382 if (!mdev->current_epoch)
2383 goto out_no_epoch;
2384
2385 INIT_LIST_HEAD(&mdev->current_epoch->list);
2386 mdev->epochs = 1;
2387
81a5d60e 2388 if (!idr_pre_get(&minors, GFP_KERNEL))
8432b314
LE
2389 goto out_no_minor_idr;
2390 if (idr_get_new_above(&minors, mdev, minor, &minor_got))
2391 goto out_no_minor_idr;
81a5d60e 2392 if (minor_got != minor) {
8432b314
LE
2393 err = ERR_MINOR_EXISTS;
2394 drbd_msg_put_info("requested minor exists already");
569083c0 2395 goto out_idr_remove_minor;
81a5d60e 2396 }
8432b314
LE
2397
2398 if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
2399 goto out_idr_remove_minor;
2400 if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
2401 goto out_idr_remove_minor;
2402 if (vnr_got != vnr) {
2403 err = ERR_INVALID_REQUEST;
2404 drbd_msg_put_info("requested volume exists already");
2405 goto out_idr_remove_vol;
2406 }
774b3055
PR
2407 add_disk(disk);
2408
2325eb66
PR
2409 /* inherit the connection state */
2410 mdev->state.conn = tconn->cstate;
2411 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2412 drbd_connected(vnr, mdev, tconn);
2413
774b3055 2414 return NO_ERROR;
b411b363 2415
569083c0
LE
2416out_idr_remove_vol:
2417 idr_remove(&tconn->volumes, vnr_got);
8432b314
LE
2418out_idr_remove_minor:
2419 idr_remove(&minors, minor_got);
569083c0 2420 synchronize_rcu();
8432b314 2421out_no_minor_idr:
81a5d60e 2422 kfree(mdev->current_epoch);
b411b363 2423out_no_epoch:
b411b363
PR
2424 drbd_bm_cleanup(mdev);
2425out_no_bitmap:
2426 __free_page(mdev->md_io_page);
2427out_no_io_page:
2428 put_disk(disk);
2429out_no_disk:
2430 blk_cleanup_queue(q);
2431out_no_q:
b411b363 2432 kfree(mdev);
8432b314 2433 return err;
b411b363
PR
2434}
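/*
 * Illustrative sketch (not part of the driver): the old two-step idr API
 * used above.  idr_pre_get() preloads memory (it returns 0 on failure),
 * idr_get_new_above() then inserts the pointer at the lowest free id at or
 * above the requested one; the caller must check that it actually got the
 * id it asked for.  Error handling is simplified here (-EAGAIN from
 * idr_get_new_above() would normally mean: preload again and retry).
 */
static int idr_insert_sketch(struct idr *idr, void *ptr, int want)
{
	int got;

	if (!idr_pre_get(idr, GFP_KERNEL))
		return -ENOMEM;
	if (idr_get_new_above(idr, ptr, want, &got))
		return -ENOMEM;
	if (got != want) {		/* requested id already taken */
		idr_remove(idr, got);
		return -EBUSY;
	}
	return 0;
}
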
2435
2436/* counterpart of drbd_new_device.
2437 * last part of drbd_delete_device. */
2438void drbd_free_mdev(struct drbd_conf *mdev)
2439{
2440 kfree(mdev->current_epoch);
b411b363
PR
2441 if (mdev->bitmap) /* should no longer be there. */
2442 drbd_bm_cleanup(mdev);
2443 __free_page(mdev->md_io_page);
2444 put_disk(mdev->vdisk);
2445 blk_cleanup_queue(mdev->rq_queue);
b411b363
PR
2446 kfree(mdev);
2447}
2448
2449
2450int __init drbd_init(void)
2451{
2452 int err;
2453
fd340c12
PR
2454 BUILD_BUG_ON(sizeof(struct p_header80) != sizeof(struct p_header95));
2455 BUILD_BUG_ON(sizeof(struct p_handshake) != 80);
b411b363 2456
2b8a90b5 2457 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363 2458 printk(KERN_ERR
81a5d60e 2459 "drbd: invalid minor_count (%d)\n", minor_count);
b411b363
PR
2460#ifdef MODULE
2461 return -EINVAL;
2462#else
2463 minor_count = 8;
2464#endif
2465 }
2466
b411b363
PR
2467 err = register_blkdev(DRBD_MAJOR, "drbd");
2468 if (err) {
2469 printk(KERN_ERR
2470 "drbd: unable to register block device major %d\n",
2471 DRBD_MAJOR);
2472 return err;
2473 }
2474
3b98c0c2
LE
2475 err = drbd_genl_register();
2476 if (err) {
2477 printk(KERN_ERR "drbd: unable to register generic netlink family\n");
2478 goto fail;
2479 }
2480
2481
b411b363
PR
2482 register_reboot_notifier(&drbd_notifier);
2483
2484 /*
2485 * allocate all necessary structs
2486 */
2487 err = -ENOMEM;
2488
2489 init_waitqueue_head(&drbd_pp_wait);
2490
2491 drbd_proc = NULL; /* play safe for drbd_cleanup */
81a5d60e 2492 idr_init(&minors);
b411b363
PR
2493
2494 err = drbd_create_mempools();
2495 if (err)
3b98c0c2 2496 goto fail;
b411b363 2497
8c484ee4 2498 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
2499 if (!drbd_proc) {
2500 printk(KERN_ERR "drbd: unable to register proc file\n");
3b98c0c2 2501 goto fail;
b411b363
PR
2502 }
2503
2504 rwlock_init(&global_state_lock);
2111438b 2505 INIT_LIST_HEAD(&drbd_tconns);
b411b363
PR
2506
2507 printk(KERN_INFO "drbd: initialized. "
2508 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2509 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2510 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
2511 printk(KERN_INFO "drbd: registered as block device major %d\n",
2512 DRBD_MAJOR);
b411b363
PR
2513
2514 return 0; /* Success! */
2515
3b98c0c2 2516fail:
b411b363
PR
2517 drbd_cleanup();
2518 if (err == -ENOMEM)
2519 /* currently always the case */
2520 printk(KERN_ERR "drbd: ran out of memory\n");
2521 else
2522 printk(KERN_ERR "drbd: initialization failure\n");
2523 return err;
2524}
2525
2526void drbd_free_bc(struct drbd_backing_dev *ldev)
2527{
2528 if (ldev == NULL)
2529 return;
2530
e525fd89
TH
2531 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2532 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
2533
2534 kfree(ldev);
2535}
2536
360cc740
PR
2537void drbd_free_sock(struct drbd_tconn *tconn)
2538{
2539 if (tconn->data.socket) {
2540 mutex_lock(&tconn->data.mutex);
2541 kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
2542 sock_release(tconn->data.socket);
2543 tconn->data.socket = NULL;
2544 mutex_unlock(&tconn->data.mutex);
b411b363 2545 }
360cc740
PR
2546 if (tconn->meta.socket) {
2547 mutex_lock(&tconn->meta.mutex);
2548 kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
2549 sock_release(tconn->meta.socket);
2550 tconn->meta.socket = NULL;
2551 mutex_unlock(&tconn->meta.mutex);
b411b363
PR
2552 }
2553}
2554
2555
2556void drbd_free_resources(struct drbd_conf *mdev)
2557{
f399002e
LE
2558 crypto_free_hash(mdev->tconn->csums_tfm);
2559 mdev->tconn->csums_tfm = NULL;
2560 crypto_free_hash(mdev->tconn->verify_tfm);
2561 mdev->tconn->verify_tfm = NULL;
a0638456
PR
2562 crypto_free_hash(mdev->tconn->cram_hmac_tfm);
2563 mdev->tconn->cram_hmac_tfm = NULL;
2564 crypto_free_hash(mdev->tconn->integrity_w_tfm);
2565 mdev->tconn->integrity_w_tfm = NULL;
2566 crypto_free_hash(mdev->tconn->integrity_r_tfm);
2567 mdev->tconn->integrity_r_tfm = NULL;
b411b363 2568
360cc740 2569 drbd_free_sock(mdev->tconn);
b411b363
PR
2570
2571 __no_warn(local,
2572 drbd_free_bc(mdev->ldev);
2573 mdev->ldev = NULL;);
2574}
2575
2576/* meta data management */
2577
2578struct meta_data_on_disk {
2579 u64 la_size; /* last agreed size. */
2580 u64 uuid[UI_SIZE]; /* UUIDs. */
2581 u64 device_uuid;
2582 u64 reserved_u64_1;
2583 u32 flags; /* MDF */
2584 u32 magic;
2585 u32 md_size_sect;
2586 u32 al_offset; /* offset to this block */
2587 u32 al_nr_extents; /* important for restoring the AL */
f399002e 2588 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
b411b363
PR
2589 u32 bm_offset; /* offset to the bitmap, from here */
2590 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
2591 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2592 u32 reserved_u32[3];
b411b363
PR
2593
2594} __packed;
2595
2596/**
2597 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
2598 * @mdev: DRBD device.
2599 */
2600void drbd_md_sync(struct drbd_conf *mdev)
2601{
2602 struct meta_data_on_disk *buffer;
2603 sector_t sector;
2604 int i;
2605
ee15b038
LE
2606 del_timer(&mdev->md_sync_timer);
2607 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
2608 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2609 return;
b411b363
PR
2610
2611 /* We use D_FAILED here and not D_ATTACHING because we try to write
2612 * metadata even if we detach due to a disk failure! */
2613 if (!get_ldev_if_state(mdev, D_FAILED))
2614 return;
2615
b411b363
PR
2616 mutex_lock(&mdev->md_io_mutex);
2617 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2618 memset(buffer, 0, 512);
2619
2620 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2621 for (i = UI_CURRENT; i < UI_SIZE; i++)
2622 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2623 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
2624 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
2625
2626 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
2627 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
2628 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
2629 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
2630 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
2631
2632 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 2633 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
2634
2635 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
2636 sector = mdev->ldev->md.md_offset;
2637
3f3a9b84 2638 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
2639 /* this was a try anyways ... */
2640 dev_err(DEV, "meta data update failed!\n");
81e84650 2641 drbd_chk_io_error(mdev, 1, true);
b411b363
PR
2642 }
2643
2644 /* Update mdev->ldev->md.la_size_sect,
2645 * since we updated it on metadata. */
2646 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
2647
2648 mutex_unlock(&mdev->md_io_mutex);
2649 put_ldev(mdev);
2650}
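/*
 * Illustrative sketch (not part of the driver): the fixed-endian convention
 * behind struct meta_data_on_disk.  Every field is stored big-endian on
 * disk, so the write path above (cpu_to_be*) and the read path in
 * drbd_md_read() below (be*_to_cpu) must always come in matched pairs.
 */
static inline u32 md_flags_roundtrip_sketch(u32 flags_cpu)
{
	__be32 on_disk = cpu_to_be32(flags_cpu);	/* as written to the super block */
	return be32_to_cpu(on_disk);			/* recovered unchanged on any host */
}
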
2651
2652/**
2653 * drbd_md_read() - Reads in the meta data super block
2654 * @mdev: DRBD device.
2655 * @bdev: Device from which the meta data should be read in.
2656 *
116676ca 2657 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
2658 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
2659 */
2660int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2661{
2662 struct meta_data_on_disk *buffer;
2663 int i, rv = NO_ERROR;
2664
2665 if (!get_ldev_if_state(mdev, D_ATTACHING))
2666 return ERR_IO_MD_DISK;
2667
b411b363
PR
2668 mutex_lock(&mdev->md_io_mutex);
2669 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2670
2671 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 2672 /* NOTE: can't do normal error processing here as this is
b411b363
PR
2673 called BEFORE disk is attached */
2674 dev_err(DEV, "Error while reading metadata.\n");
2675 rv = ERR_IO_MD_DISK;
2676 goto err;
2677 }
2678
e7fad8af 2679 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
b411b363
PR
2680 dev_err(DEV, "Error while reading metadata, magic not found.\n");
2681 rv = ERR_MD_INVALID;
2682 goto err;
2683 }
2684 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
2685 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
2686 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
2687 rv = ERR_MD_INVALID;
2688 goto err;
2689 }
2690 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2691 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2692 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2693 rv = ERR_MD_INVALID;
2694 goto err;
2695 }
2696 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2697 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2698 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2699 rv = ERR_MD_INVALID;
2700 goto err;
2701 }
2702
2703 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2704 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2705 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2706 rv = ERR_MD_INVALID;
2707 goto err;
2708 }
2709
2710 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2711 for (i = UI_CURRENT; i < UI_SIZE; i++)
2712 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2713 bdev->md.flags = be32_to_cpu(buffer->flags);
f399002e 2714 bdev->dc.al_extents = be32_to_cpu(buffer->al_nr_extents);
b411b363
PR
2715 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2716
87eeee41 2717 spin_lock_irq(&mdev->tconn->req_lock);
99432fcc
PR
2718 if (mdev->state.conn < C_CONNECTED) {
2719 int peer;
2720 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
2721 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
2722 mdev->peer_max_bio_size = peer;
2723 }
87eeee41 2724 spin_unlock_irq(&mdev->tconn->req_lock);
99432fcc 2725
f399002e
LE
2726 if (bdev->dc.al_extents < 7)
2727 bdev->dc.al_extents = 127;
b411b363
PR
2728
2729 err:
2730 mutex_unlock(&mdev->md_io_mutex);
2731 put_ldev(mdev);
2732
2733 return rv;
2734}
2735
2736/**
2737 * drbd_md_mark_dirty() - Mark meta data super block as dirty
2738 * @mdev: DRBD device.
2739 *
2740 * Call this function if you change anything that should be written to
2741 * the meta-data super block. This function sets MD_DIRTY, and starts a
2742 * timer that ensures that within five seconds you have to call drbd_md_sync().
2743 */
ca0e6098 2744#ifdef DEBUG
ee15b038
LE
2745void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
2746{
2747 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
2748 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
2749 mdev->last_md_mark_dirty.line = line;
2750 mdev->last_md_mark_dirty.func = func;
2751 }
2752}
2753#else
b411b363
PR
2754void drbd_md_mark_dirty(struct drbd_conf *mdev)
2755{
ee15b038 2756 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 2757 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 2758}
ee15b038 2759#endif
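/*
 * Illustrative sketch (not part of the driver): the debounce pattern behind
 * drbd_md_mark_dirty()/drbd_md_sync() above.  The first caller to set the
 * dirty bit arms a timer; later callers find the bit already set and do
 * nothing, so any number of metadata changes collapses into one deferred
 * sync.  All "example_*" names are hypothetical.
 */
static unsigned long example_flags;
static struct timer_list example_timer;	/* set up with init_timer() + .function/.data, as above */
#define EXAMPLE_DIRTY 0

static void example_mark_dirty(void)
{
	if (!test_and_set_bit(EXAMPLE_DIRTY, &example_flags))
		mod_timer(&example_timer, jiffies + 5*HZ);
}

static void example_sync(void)
{
	del_timer(&example_timer);
	if (!test_and_clear_bit(EXAMPLE_DIRTY, &example_flags))
		return;			/* nothing was dirtied since the last sync */
	/* ... write the metadata out here ... */
}
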
b411b363
PR
2760
2761static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
2762{
2763 int i;
2764
62b0da3a 2765 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 2766 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
2767}
2768
2769void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2770{
2771 if (idx == UI_CURRENT) {
2772 if (mdev->state.role == R_PRIMARY)
2773 val |= 1;
2774 else
2775 val &= ~((u64)1);
2776
2777 drbd_set_ed_uuid(mdev, val);
2778 }
2779
2780 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
2781 drbd_md_mark_dirty(mdev);
2782}
2783
2784
2785void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2786{
2787 if (mdev->ldev->md.uuid[idx]) {
2788 drbd_uuid_move_history(mdev);
2789 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
2790 }
2791 _drbd_uuid_set(mdev, idx, val);
2792}
2793
2794/**
2795 * drbd_uuid_new_current() - Creates a new current UUID
2796 * @mdev: DRBD device.
2797 *
2798 * Creates a new current UUID, and rotates the old current UUID into
2799 * the bitmap slot. Causes an incremental resync upon next connect.
2800 */
2801void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
2802{
2803 u64 val;
62b0da3a
LE
2804 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2805
2806 if (bm_uuid)
2807 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2808
b411b363 2809 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
2810
2811 get_random_bytes(&val, sizeof(u64));
2812 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 2813 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
2814 /* get it to stable storage _now_ */
2815 drbd_md_sync(mdev);
b411b363
PR
2816}
2817
2818void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
2819{
2820 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
2821 return;
2822
2823 if (val == 0) {
2824 drbd_uuid_move_history(mdev);
2825 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
2826 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 2827 } else {
62b0da3a
LE
2828 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2829 if (bm_uuid)
2830 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2831
62b0da3a 2832 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
2833 }
2834 drbd_md_mark_dirty(mdev);
2835}
2836
2837/**
2838 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2839 * @mdev: DRBD device.
2840 *
2841 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
2842 */
2843int drbd_bmio_set_n_write(struct drbd_conf *mdev)
2844{
2845 int rv = -EIO;
2846
2847 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2848 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
2849 drbd_md_sync(mdev);
2850 drbd_bm_set_all(mdev);
2851
2852 rv = drbd_bm_write(mdev);
2853
2854 if (!rv) {
2855 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2856 drbd_md_sync(mdev);
2857 }
2858
2859 put_ldev(mdev);
2860 }
2861
2862 return rv;
2863}
2864
2865/**
2866 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2867 * @mdev: DRBD device.
2868 *
2869 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
2870 */
2871int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
2872{
2873 int rv = -EIO;
2874
0778286a 2875 drbd_resume_al(mdev);
b411b363
PR
2876 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2877 drbd_bm_clear_all(mdev);
2878 rv = drbd_bm_write(mdev);
2879 put_ldev(mdev);
2880 }
2881
2882 return rv;
2883}
2884
00d56944 2885static int w_bitmap_io(struct drbd_work *w, int unused)
b411b363
PR
2886{
2887 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
00d56944 2888 struct drbd_conf *mdev = w->mdev;
02851e9f 2889 int rv = -EIO;
b411b363
PR
2890
2891 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
2892
02851e9f 2893 if (get_ldev(mdev)) {
20ceb2b2 2894 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
2895 rv = work->io_fn(mdev);
2896 drbd_bm_unlock(mdev);
2897 put_ldev(mdev);
2898 }
b411b363 2899
4738fa16 2900 clear_bit_unlock(BITMAP_IO, &mdev->flags);
b411b363
PR
2901 wake_up(&mdev->misc_wait);
2902
2903 if (work->done)
2904 work->done(mdev, rv);
2905
2906 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
2907 work->why = NULL;
20ceb2b2 2908 work->flags = 0;
b411b363
PR
2909
2910 return 1;
2911}
2912
82f59cc6
LE
2913void drbd_ldev_destroy(struct drbd_conf *mdev)
2914{
2915 lc_destroy(mdev->resync);
2916 mdev->resync = NULL;
2917 lc_destroy(mdev->act_log);
2918 mdev->act_log = NULL;
2919 __no_warn(local,
2920 drbd_free_bc(mdev->ldev);
2921 mdev->ldev = NULL;);
2922
82f59cc6
LE
2923 clear_bit(GO_DISKLESS, &mdev->flags);
2924}
2925
00d56944 2926static int w_go_diskless(struct drbd_work *w, int unused)
e9e6f3ec 2927{
00d56944
PR
2928 struct drbd_conf *mdev = w->mdev;
2929
e9e6f3ec 2930 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
2931 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
2932 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
2933 * the protected members anymore, though, so once the local_cnt reference
2934 * count drops to zero again (put_ldev), it will be safe to free them. */
e9e6f3ec 2935 drbd_force_state(mdev, NS(disk, D_DISKLESS));
e9e6f3ec
LE
2936 return 1;
2937}
2938
2939void drbd_go_diskless(struct drbd_conf *mdev)
2940{
2941 D_ASSERT(mdev->state.disk == D_FAILED);
2942 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
e42325a5 2943 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
e9e6f3ec
LE
2944}
2945
b411b363
PR
2946/**
2947 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
2948 * @mdev: DRBD device.
2949 * @io_fn: IO callback to be called when bitmap IO is possible
2950 * @done: callback to be called after the bitmap IO was performed
2951 * @why: Descriptive text of the reason for doing the IO
2952 *
2953 * While IO on the bitmap happens we freeze application IO, thus ensuring
2954 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
2955 * called from worker context. It MUST NOT be used while a previous such
2956 * work is still pending!
2957 */
2958void drbd_queue_bitmap_io(struct drbd_conf *mdev,
2959 int (*io_fn)(struct drbd_conf *),
2960 void (*done)(struct drbd_conf *, int),
20ceb2b2 2961 char *why, enum bm_flag flags)
b411b363 2962{
e6b3ea83 2963 D_ASSERT(current == mdev->tconn->worker.task);
b411b363
PR
2964
2965 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
2966 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
2967 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
2968 if (mdev->bm_io_work.why)
2969 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
2970 why, mdev->bm_io_work.why);
2971
2972 mdev->bm_io_work.io_fn = io_fn;
2973 mdev->bm_io_work.done = done;
2974 mdev->bm_io_work.why = why;
20ceb2b2 2975 mdev->bm_io_work.flags = flags;
b411b363 2976
87eeee41 2977 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
2978 set_bit(BITMAP_IO, &mdev->flags);
2979 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 2980 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
e42325a5 2981 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
b411b363 2982 }
87eeee41 2983 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
2984}
2985
2986/**
2987 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
2988 * @mdev: DRBD device.
2989 * @io_fn: IO callback to be called when bitmap IO is possible
2990 * @why: Descriptive text of the reason for doing the IO
2991 *
2992 * Freezes application IO while the actual IO operation runs. This
2993 * function MAY NOT be called from worker context.
2994 */
20ceb2b2
LE
2995int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
2996 char *why, enum bm_flag flags)
b411b363
PR
2997{
2998 int rv;
2999
e6b3ea83 3000 D_ASSERT(current != mdev->tconn->worker.task);
b411b363 3001
20ceb2b2
LE
3002 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3003 drbd_suspend_io(mdev);
b411b363 3004
20ceb2b2 3005 drbd_bm_lock(mdev, why, flags);
b411b363
PR
3006 rv = io_fn(mdev);
3007 drbd_bm_unlock(mdev);
3008
20ceb2b2
LE
3009 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3010 drbd_resume_io(mdev);
b411b363
PR
3011
3012 return rv;
3013}
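/*
 * Illustrative sketch (not part of the driver): picking between the two
 * bitmap-IO entry points above.  From the worker thread itself the IO must
 * be queued via drbd_queue_bitmap_io(); from any other context it may run
 * synchronously via drbd_bitmap_io().  io_fn/done_fn/why/flags are whatever
 * the caller needs, e.g. drbd_bmio_set_n_write() defined above.
 */
static int bitmap_io_entry_sketch(struct drbd_conf *mdev,
				  int (*io_fn)(struct drbd_conf *),
				  void (*done_fn)(struct drbd_conf *, int),
				  char *why, enum bm_flag flags)
{
	if (current == mdev->tconn->worker.task) {
		drbd_queue_bitmap_io(mdev, io_fn, done_fn, why, flags);
		return 0;	/* result is delivered through done_fn later */
	}
	return drbd_bitmap_io(mdev, io_fn, why, flags);
}
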
3014
3015void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3016{
3017 if ((mdev->ldev->md.flags & flag) != flag) {
3018 drbd_md_mark_dirty(mdev);
3019 mdev->ldev->md.flags |= flag;
3020 }
3021}
3022
3023void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3024{
3025 if ((mdev->ldev->md.flags & flag) != 0) {
3026 drbd_md_mark_dirty(mdev);
3027 mdev->ldev->md.flags &= ~flag;
3028 }
3029}
3030int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3031{
3032 return (bdev->md.flags & flag) != 0;
3033}
3034
3035static void md_sync_timer_fn(unsigned long data)
3036{
3037 struct drbd_conf *mdev = (struct drbd_conf *) data;
3038
e42325a5 3039 drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
b411b363
PR
3040}
3041
00d56944 3042static int w_md_sync(struct drbd_work *w, int unused)
b411b363 3043{
00d56944
PR
3044 struct drbd_conf *mdev = w->mdev;
3045
b411b363 3046 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
3047#ifdef DEBUG
3048 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3049 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3050#endif
b411b363 3051 drbd_md_sync(mdev);
b411b363
PR
3052 return 1;
3053}
3054
d8763023 3055const char *cmdname(enum drbd_packet cmd)
f2ad9063
AG
3056{
3057 /* THINK may need to become several global tables
3058 * when we want to support more than
3059 * one PRO_VERSION */
3060 static const char *cmdnames[] = {
3061 [P_DATA] = "Data",
3062 [P_DATA_REPLY] = "DataReply",
3063 [P_RS_DATA_REPLY] = "RSDataReply",
3064 [P_BARRIER] = "Barrier",
3065 [P_BITMAP] = "ReportBitMap",
3066 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
3067 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
3068 [P_UNPLUG_REMOTE] = "UnplugRemote",
3069 [P_DATA_REQUEST] = "DataRequest",
3070 [P_RS_DATA_REQUEST] = "RSDataRequest",
3071 [P_SYNC_PARAM] = "SyncParam",
3072 [P_SYNC_PARAM89] = "SyncParam89",
3073 [P_PROTOCOL] = "ReportProtocol",
3074 [P_UUIDS] = "ReportUUIDs",
3075 [P_SIZES] = "ReportSizes",
3076 [P_STATE] = "ReportState",
3077 [P_SYNC_UUID] = "ReportSyncUUID",
3078 [P_AUTH_CHALLENGE] = "AuthChallenge",
3079 [P_AUTH_RESPONSE] = "AuthResponse",
3080 [P_PING] = "Ping",
3081 [P_PING_ACK] = "PingAck",
3082 [P_RECV_ACK] = "RecvAck",
3083 [P_WRITE_ACK] = "WriteAck",
3084 [P_RS_WRITE_ACK] = "RSWriteAck",
7be8da07 3085 [P_DISCARD_WRITE] = "DiscardWrite",
f2ad9063
AG
3086 [P_NEG_ACK] = "NegAck",
3087 [P_NEG_DREPLY] = "NegDReply",
3088 [P_NEG_RS_DREPLY] = "NegRSDReply",
3089 [P_BARRIER_ACK] = "BarrierAck",
3090 [P_STATE_CHG_REQ] = "StateChgRequest",
3091 [P_STATE_CHG_REPLY] = "StateChgReply",
3092 [P_OV_REQUEST] = "OVRequest",
3093 [P_OV_REPLY] = "OVReply",
3094 [P_OV_RESULT] = "OVResult",
3095 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
3096 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
3097 [P_COMPRESSED_BITMAP] = "CBitmap",
3098 [P_DELAY_PROBE] = "DelayProbe",
3099 [P_OUT_OF_SYNC] = "OutOfSync",
7be8da07 3100 [P_RETRY_WRITE] = "RetryWrite",
f2ad9063
AG
3101 };
3102
3103 if (cmd == P_HAND_SHAKE_M)
3104 return "HandShakeM";
3105 if (cmd == P_HAND_SHAKE_S)
3106 return "HandShakeS";
3107 if (cmd == P_HAND_SHAKE)
3108 return "HandShake";
6e849ce8 3109 if (cmd >= ARRAY_SIZE(cmdnames))
f2ad9063
AG
3110 return "Unknown";
3111 return cmdnames[cmd];
3112}
3113
7be8da07
AG
3114/**
3115 * drbd_wait_misc - wait for a request to make progress
3116 * @mdev: device associated with the request
3117 * @i: the struct drbd_interval embedded in struct drbd_request or
3118 * struct drbd_peer_request
3119 */
3120int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
3121{
3122 struct net_conf *net_conf = mdev->tconn->net_conf;
3123 DEFINE_WAIT(wait);
3124 long timeout;
3125
3126 if (!net_conf)
3127 return -ETIMEDOUT;
3128 timeout = MAX_SCHEDULE_TIMEOUT;
3129 if (net_conf->ko_count)
3130 timeout = net_conf->timeout * HZ / 10 * net_conf->ko_count;
3131
3132 /* Indicate to wake up mdev->misc_wait on progress. */
3133 i->waiting = true;
3134 prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
3135 spin_unlock_irq(&mdev->tconn->req_lock);
3136 timeout = schedule_timeout(timeout);
3137 finish_wait(&mdev->misc_wait, &wait);
3138 spin_lock_irq(&mdev->tconn->req_lock);
3139 if (!timeout || mdev->state.conn < C_CONNECTED)
3140 return -ETIMEDOUT;
3141 if (signal_pending(current))
3142 return -ERESTARTSYS;
3143 return 0;
3144}
3145
b411b363
PR
3146#ifdef CONFIG_DRBD_FAULT_INJECTION
3147/* Fault insertion support including random number generator shamelessly
3148 * stolen from kernel/rcutorture.c */
3149struct fault_random_state {
3150 unsigned long state;
3151 unsigned long count;
3152};
3153
3154#define FAULT_RANDOM_MULT 39916801 /* prime */
3155#define FAULT_RANDOM_ADD 479001701 /* prime */
3156#define FAULT_RANDOM_REFRESH 10000
3157
3158/*
3159 * Crude but fast random-number generator. Uses a linear congruential
3160 * generator, with occasional help from get_random_bytes().
3161 */
3162static unsigned long
3163_drbd_fault_random(struct fault_random_state *rsp)
3164{
3165 long refresh;
3166
49829ea7 3167 if (!rsp->count--) {
b411b363
PR
3168 get_random_bytes(&refresh, sizeof(refresh));
3169 rsp->state += refresh;
3170 rsp->count = FAULT_RANDOM_REFRESH;
3171 }
3172 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3173 return swahw32(rsp->state);
3174}
3175
3176static char *
3177_drbd_fault_str(unsigned int type) {
3178 static char *_faults[] = {
3179 [DRBD_FAULT_MD_WR] = "Meta-data write",
3180 [DRBD_FAULT_MD_RD] = "Meta-data read",
3181 [DRBD_FAULT_RS_WR] = "Resync write",
3182 [DRBD_FAULT_RS_RD] = "Resync read",
3183 [DRBD_FAULT_DT_WR] = "Data write",
3184 [DRBD_FAULT_DT_RD] = "Data read",
3185 [DRBD_FAULT_DT_RA] = "Data read ahead",
3186 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
3187 [DRBD_FAULT_AL_EE] = "EE allocation",
3188 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
3189 };
3190
3191 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3192}
3193
3194unsigned int
3195_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3196{
3197 static struct fault_random_state rrs = {0, 0};
3198
3199 unsigned int ret = (
3200 (fault_devs == 0 ||
3201 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3202 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3203
3204 if (ret) {
3205 fault_count++;
3206
7383506c 3207 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
3208 dev_warn(DEV, "***Simulating %s failure\n",
3209 _drbd_fault_str(type));
3210 }
3211
3212 return ret;
3213}
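/*
 * Illustrative sketch (not part of the driver): how a fault-injection site
 * gates an error path on _drbd_insert_fault().  submit_sketch() and its bio
 * handling are hypothetical; only the fault_rate/_drbd_insert_fault() check
 * mirrors the real mechanism.
 */
static void submit_sketch(struct drbd_conf *mdev, struct bio *bio)
{
	if (fault_rate && _drbd_insert_fault(mdev, DRBD_FAULT_DT_WR))
		bio_endio(bio, -EIO);		/* pretend the data write failed */
	else
		generic_make_request(bio);
}
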
3214#endif
3215
3216const char *drbd_buildtag(void)
3217{
3218 /* DRBD built from external sources carries a reference to the
3219 git hash of the source code here. */
3220
3221 static char buildtag[38] = "\0uilt-in";
3222
3223 if (buildtag[0] == 0) {
3224#ifdef CONFIG_MODULES
3225 if (THIS_MODULE != NULL)
3226 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3227 else
3228#endif
3229 buildtag[0] = 'b';
3230 }
3231
3232 return buildtag;
3233}
3234
3235module_init(drbd_init)
3236module_exit(drbd_cleanup)
3237
b411b363
PR
3238EXPORT_SYMBOL(drbd_conn_str);
3239EXPORT_SYMBOL(drbd_role_str);
3240EXPORT_SYMBOL(drbd_disk_str);
3241EXPORT_SYMBOL(drbd_set_st_err_str);