__EE_CALL_AL_COMPLETE_IO,
__EE_MAY_SET_IN_SYNC,
+ /* is this a TRIM aka REQ_DISCARD? */
+ __EE_IS_TRIM,
+ /* our lower level cannot handle trim,
+ * and we want to fall back to zeroout instead */
+ __EE_IS_TRIM_USE_ZEROOUT,
+
/* In case a barrier failed,
* we need to resubmit without the barrier flag. */
__EE_RESUBMITTED,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
-#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
+#define EE_IS_TRIM (1<<__EE_IS_TRIM)
+#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
+#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
+/* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE
+#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+
extern int drbd_bm_init(struct drbd_device *device);
extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
extern void drbd_bm_cleanup(struct drbd_device *device);
extern void resync_timer_fn(unsigned long data);
extern void start_resync_timer_fn(unsigned long data);
+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
/* drbd_receiver.c */
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_asender(struct drbd_thread *thi);
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
sector_t, unsigned int,
+ bool,
gfp_t) __must_hold(local);
extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
int);
static int drbd_do_features(struct drbd_connection *connection);
static int drbd_do_auth(struct drbd_connection *connection);
static int drbd_disconnected(struct drbd_peer_device *);
-
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
static int e_end_block(struct drbd_work *, int);
struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
- unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
+ unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
{
struct drbd_device *device = peer_device->device;
struct drbd_peer_request *peer_req;
return NULL;
}
- if (data_size) {
+ if (has_payload && data_size) {
page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
if (!page)
goto fail;
unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
int err = -ENOMEM;
+ if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+ /* wait for all pending IO completions, before we start
+ * zeroing things out. */
+ conn_wait_active_ee_empty(first_peer_device(device)->connection);
+ if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+ sector, ds >> 9, GFP_NOIO))
+ peer_req->flags |= EE_WAS_ERROR;
+ drbd_endio_write_sec_final(peer_req);
+ return 0;
+ }
+
+ if (peer_req->flags & EE_IS_TRIM)
+ nr_pages = 0; /* discards don't have any payload. */
+
/* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this
* side than those of the sending peer, we may need to submit the
next_bio:
bio = bio_alloc(GFP_NOIO, nr_pages);
if (!bio) {
- drbd_err(device, "submit_ee: Allocation of a bio failed\n");
+ drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
goto fail;
}
/* > peer_req->i.sector, unless this is the first bio */
bios = bio;
++n_bios;
+ if (rw & REQ_DISCARD) {
+ bio->bi_iter.bi_size = ds;
+ goto submit;
+ }
+
page_chain_for_each(page) {
unsigned len = min_t(unsigned, ds, PAGE_SIZE);
if (!bio_add_page(bio, page, len, 0)) {
sector += len >> 9;
--nr_pages;
}
- D_ASSERT(device, page == NULL);
D_ASSERT(device, ds == 0);
+submit:
+ D_ASSERT(device, page == NULL);
atomic_set(&peer_req->pending_bios, n_bios);
do {
* and from receive_Data */
static struct drbd_peer_request *
read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
- int data_size) __must_hold(local)
+ struct packet_info *pi) __must_hold(local)
{
struct drbd_device *device = peer_device->device;
const sector_t capacity = drbd_get_capacity(device->this_bdev);
struct drbd_peer_request *peer_req;
struct page *page;
int dgs, ds, err;
+ int data_size = pi->size;
void *dig_in = peer_device->connection->int_dig_in;
void *dig_vv = peer_device->connection->int_dig_vv;
unsigned long *data;
+ struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
dgs = 0;
- if (peer_device->connection->peer_integrity_tfm) {
+ if (!trim && peer_device->connection->peer_integrity_tfm) {
dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
/*
* FIXME: Receive the incoming digest into the receive buffer
data_size -= dgs;
}
+ if (trim) {
+ D_ASSERT(peer_device, data_size == 0);
+ data_size = be32_to_cpu(trim->size);
+ }
+
if (!expect(IS_ALIGNED(data_size, 512)))
return NULL;
- if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
+ /* prepare for larger trim requests. */
+ if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
return NULL;
/* even though we trust out peer,
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO);
+ peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
if (!peer_req)
return NULL;
- if (!data_size)
+ if (trim)
return peer_req;
ds = data_size;
}
static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
- int data_size) __releases(local)
+ struct packet_info *pi) __releases(local)
{
struct drbd_device *device = peer_device->device;
struct drbd_peer_request *peer_req;
- peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size);
+ peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
if (!peer_req)
goto fail;
list_add(&peer_req->w.list, &device->sync_ee);
spin_unlock_irq(&device->resource->req_lock);
- atomic_add(data_size >> 9, &device->rs_sect_ev);
+ atomic_add(pi->size >> 9, &device->rs_sect_ev);
if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
return 0;
/* data is submitted to disk within recv_resync_read.
* corresponding put_ldev done below on error,
* or in drbd_peer_request_endio. */
- err = recv_resync_read(peer_device, sector, pi->size);
+ err = recv_resync_read(peer_device, sector, pi);
} else {
if (__ratelimit(&drbd_ratelimit_state))
drbd_err(device, "Can not write resync data to local disk.\n");
*/
sector = be64_to_cpu(p->sector);
- peer_req = read_in_block(peer_device, p->block_id, sector, pi->size);
+ peer_req = read_in_block(peer_device, p->block_id, sector, pi);
if (!peer_req) {
put_ldev(device);
return -EIO;
dp_flags = be32_to_cpu(p->dp_flags);
rw |= wire_flags_to_bio(dp_flags);
- if (peer_req->pages == NULL) {
+ if (pi->cmd == P_TRIM) {
+ struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+ peer_req->flags |= EE_IS_TRIM;
+ if (!blk_queue_discard(q))
+ peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+ D_ASSERT(peer_device, peer_req->i.size > 0);
+ D_ASSERT(peer_device, rw & REQ_DISCARD);
+ D_ASSERT(peer_device, peer_req->pages == NULL);
+ } else if (peer_req->pages == NULL) {
D_ASSERT(device, peer_req->i.size == 0);
D_ASSERT(device, dp_flags & DP_FLUSH);
}
update_peer_seq(peer_device, peer_seq);
spin_lock_irq(&device->resource->req_lock);
}
- list_add(&peer_req->w.list, &device->active_ee);
+ /* if we use the zeroout fallback code, we process synchronously
+ * and we wait for all pending requests, respectively wait for
+ * active_ee to become empty in drbd_submit_peer_request();
+ * better not add ourselves here. */
+ if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+ list_add(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
if (device->state.conn == C_SYNC_TARGET)
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO);
+ peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+ true /* has real payload */, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
[P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
};
static void drbdd(struct drbd_connection *connection)
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver, final stage. */
-static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
struct drbd_peer_device *peer_device = peer_req->peer_device;
do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
- if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+ /* FIXME do we want to detach for failed REQ_DISCARD?
+ * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+ if (peer_req->flags & EE_WAS_ERROR)
__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
spin_unlock_irqrestore(&device->resource->req_lock, flags);
struct drbd_device *device = peer_req->peer_device->device;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
int is_write = bio_data_dir(bio) == WRITE;
+ int is_discard = !!(bio->bi_rw & REQ_DISCARD);
if (error && __ratelimit(&drbd_ratelimit_state))
drbd_warn(device, "%s: error=%d s=%llus\n",
- is_write ? "write" : "read", error,
+ is_write ? (is_discard ? "discard" : "write")
+ : "read", error,
(unsigned long long)peer_req->i.sector);
if (!error && !uptodate) {
if (__ratelimit(&drbd_ratelimit_state))
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
- size, GFP_TRY);
+ size, true /* has real payload */, GFP_TRY);
if (!peer_req)
goto defer;