spinlock_t stripe_in_journal_lock;
atomic_t stripe_in_journal_count;
+
+ /* to submit async io_units, to fulfill ordering of flush */
+ struct work_struct deferred_io_work;
};
/*
int state;
bool need_split_bio;
+ struct bio *split_bio;
+
+ unsigned int has_flush:1; /* include flush request */
+ unsigned int has_fua:1; /* include fua request */
+ unsigned int has_null_flush:1; /* include empty flush request */
+ /*
+ * io isn't sent yet, flush/fua request can only be submitted till it's
+ * the first IO in running_ios list
+ */
+ unsigned int io_deferred:1;
+
+ struct bio_list flush_barriers; /* size == 0 flush bios */
};
/* r5l_io_unit state */
}
}
+static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
static void r5l_log_endio(struct bio *bio)
{
struct r5l_io_unit *io = bio->bi_private;
+ struct r5l_io_unit *io_deferred;
struct r5l_log *log = io->log;
unsigned long flags;
r5l_move_to_end_ios(log);
else
r5l_log_run_stripes(log);
+ if (!list_empty(&log->running_ios)) {
+ /*
+ * FLUSH/FUA io_unit is deferred because of ordering, now we
+ * can dispatch it
+ */
+ io_deferred = list_first_entry(&log->running_ios,
+ struct r5l_io_unit, log_sibling);
+ if (io_deferred->io_deferred)
+ schedule_work(&log->deferred_io_work);
+ }
+
spin_unlock_irqrestore(&log->io_list_lock, flags);
if (log->need_cache_flush)
md_wakeup_thread(log->rdev->mddev->thread);
+
+ if (io->has_null_flush) {
+ struct bio *bi;
+
+ WARN_ON(bio_list_empty(&io->flush_barriers));
+ while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+ bio_endio(bi);
+ atomic_dec(&io->pending_stripe);
+ }
+ if (atomic_read(&io->pending_stripe) == 0)
+ __r5l_stripe_write_finished(io);
+ }
+}
+
+static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ if (io->has_flush)
+ bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
+ if (io->has_fua)
+ bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
+ submit_bio(io->current_bio);
+
+ if (!io->split_bio)
+ return;
+
+ if (io->has_flush)
+ bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
+ if (io->has_fua)
+ bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
+ submit_bio(io->split_bio);
+}
+
+/* deferred io_unit will be dispatched here */
+static void r5l_submit_io_async(struct work_struct *work)
+{
+ struct r5l_log *log = container_of(work, struct r5l_log,
+ deferred_io_work);
+ struct r5l_io_unit *io = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ if (!list_empty(&log->running_ios)) {
+ io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+ log_sibling);
+ if (!io->io_deferred)
+ io = NULL;
+ else
+ io->io_deferred = 0;
+ }
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ if (io)
+ r5l_do_submit_io(log, io);
}
static void r5l_submit_current_io(struct r5l_log *log)
{
struct r5l_io_unit *io = log->current_io;
+ struct bio *bio;
struct r5l_meta_block *block;
unsigned long flags;
u32 crc;
+ bool do_submit = true;
if (!io)
return;
block->meta_size = cpu_to_le32(io->meta_offset);
crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
block->checksum = cpu_to_le32(crc);
+ bio = io->current_bio;
log->current_io = NULL;
spin_lock_irqsave(&log->io_list_lock, flags);
- __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ if (io->has_flush || io->has_fua) {
+ if (io != list_first_entry(&log->running_ios,
+ struct r5l_io_unit, log_sibling)) {
+ io->io_deferred = 1;
+ do_submit = false;
+ }
+ }
spin_unlock_irqrestore(&log->io_list_lock, flags);
-
- submit_bio(io->current_bio);
+ if (do_submit)
+ r5l_do_submit_io(log, io);
}
static struct bio *r5l_bio_alloc(struct r5l_log *log)
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
+ bio_list_init(&io->flush_barriers);
io->state = IO_UNIT_RUNNING;
io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
struct r5l_io_unit *io = log->current_io;
if (io->need_split_bio) {
- struct bio *prev = io->current_bio;
-
+ BUG_ON(io->split_bio);
+ io->split_bio = io->current_bio;
io->current_bio = r5l_bio_alloc(log);
- bio_chain(io->current_bio, prev);
-
- submit_bio(prev);
+ bio_chain(io->current_bio, io->split_bio);
+ io->need_split_bio = false;
}
if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
io = log->current_io;
+ if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
+ io->has_flush = 1;
+
for (i = 0; i < sh->disks; i++) {
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
test_bit(R5_InJournal, &sh->dev[i].flags))
continue;
if (i == sh->pd_idx || i == sh->qd_idx)
continue;
+ if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
+ log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
+ io->has_fua = 1;
+ /*
+ * we need to flush journal to make sure recovery can
+ * reach the data with fua flag
+ */
+ io->has_flush = 1;
+ }
r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
raid5_compute_blocknr(sh, i, 0),
sh->dev[i].log_checksum, 0, false);
{
if (!log)
return -ENODEV;
- /*
- * we flush log disk cache first, then write stripe data to raid disks.
- * So if bio is finished, the log disk cache is flushed already. The
- * recovery guarantees we can recovery the bio from log disk, so we
- * don't need to flush again
- */
- if (bio->bi_iter.bi_size == 0) {
- bio_endio(bio);
- return 0;
+
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
+ /*
+ * in write through (journal only)
+ * we flush log disk cache first, then write stripe data to
+ * raid disks. So if bio is finished, the log disk cache is
+ * flushed already. The recovery guarantees we can recovery
+ * the bio from log disk, so we don't need to flush again
+ */
+ if (bio->bi_iter.bi_size == 0) {
+ bio_endio(bio);
+ return 0;
+ }
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ } else {
+ /* write back (with cache) */
+ if (bio->bi_iter.bi_size == 0) {
+ mutex_lock(&log->io_mutex);
+ r5l_get_meta(log, 0);
+ bio_list_add(&log->current_io->flush_barriers, bio);
+ log->current_io->has_flush = 1;
+ log->current_io->has_null_flush = 1;
+ atomic_inc(&log->current_io->pending_stripe);
+ r5l_submit_current_io(log);
+ mutex_unlock(&log->io_mutex);
+ return 0;
+ }
}
- bio->bi_opf &= ~REQ_PREFLUSH;
return -EAGAIN;
}
INIT_LIST_HEAD(&log->no_space_stripes);
spin_lock_init(&log->no_space_stripes_lock);
+ INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
INIT_LIST_HEAD(&log->stripe_in_journal_list);
spin_lock_init(&log->stripe_in_journal_lock);