#define BLOCK_SECTORS (8)
/*
- * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent
- * recovery scans a very long log
+ * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
+ *
+ * In write through mode, the reclaim runs every log->max_free_space.
+ * This can prevent the recovery scans for too long
*/
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
+/* wake up reclaim thread periodically */
+#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
+/* start flush with these full stripes */
+#define R5C_FULL_STRIPE_FLUSH_BATCH 256
+/* reclaim stripes in groups */
+#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
+
/*
* We only need 2 bios per I/O unit to make progress, but ensure we
* have a few more available to not get too tight.
/* for r5c_cache */
enum r5c_journal_mode r5c_journal_mode;
+
+ /* all stripes in r5cache, in the order of seq at sh->log_start */
+ struct list_head stripe_in_journal_list;
+
+ spinlock_t stripe_in_journal_lock;
+ atomic_t stripe_in_journal_count;
};
/*
}
}
+/* Check whether we should flush some stripes to free up stripe cache */
+void r5c_check_stripe_cache_usage(struct r5conf *conf)
+{
+ int total_cached;
+
+ if (!r5c_is_writeback(conf->log))
+ return;
+
+ total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
+ atomic_read(&conf->r5c_cached_full_stripes);
+
+ /*
+ * The following condition is true for either of the following:
+ * - stripe cache pressure high:
+ * total_cached > 3/4 min_nr_stripes ||
+ * empty_inactive_list_nr > 0
+ * - stripe cache pressure moderate:
+ * total_cached > 1/2 min_nr_stripes
+ */
+ if (total_cached > conf->min_nr_stripes * 1 / 2 ||
+ atomic_read(&conf->empty_inactive_list_nr) > 0)
+ r5l_wake_reclaim(conf->log, 0);
+}
+
+/*
+ * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
+ * stripes in the cache
+ */
+void r5c_check_cached_full_stripe(struct r5conf *conf)
+{
+ if (!r5c_is_writeback(conf->log))
+ return;
+
+ /*
+ * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
+ * or a full stripe (chunk size / 4k stripes).
+ */
+ if (atomic_read(&conf->r5c_cached_full_stripes) >=
+ min(R5C_FULL_STRIPE_FLUSH_BATCH,
+ conf->chunk_sectors >> STRIPE_SHIFT))
+ r5l_wake_reclaim(conf->log, 0);
+}
+
+/*
+ * Total log space (in sectors) needed to flush all data in cache
+ *
+ * Currently, writing-out phase automatically includes all pending writes
+ * to the same sector. So the reclaim of each stripe takes up to
+ * (conf->raid_disks + 1) pages of log space.
+ *
+ * To totally avoid deadlock due to log space, the code reserves
+ * (conf->raid_disks + 1) pages for each stripe in cache, which is not
+ * necessary in most cases.
+ *
+ * To improve this, we will need writing-out phase to be able to NOT include
+ * pending writes, which will reduce the requirement to
+ * (conf->max_degraded + 1) pages per stripe in cache.
+ */
+static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
+{
+ struct r5l_log *log = conf->log;
+
+ if (!r5c_is_writeback(log))
+ return 0;
+
+ return BLOCK_SECTORS * (conf->raid_disks + 1) *
+ atomic_read(&log->stripe_in_journal_count);
+}
+
+/*
+ * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
+ *
+ * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
+ * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
+ * device is less than 2x of reclaim_required_space.
+ */
+static inline void r5c_update_log_state(struct r5l_log *log)
+{
+ struct r5conf *conf = log->rdev->mddev->private;
+ sector_t free_space;
+ sector_t reclaim_space;
+
+ if (!r5c_is_writeback(log))
+ return;
+
+ free_space = r5l_ring_distance(log, log->log_start,
+ log->last_checkpoint);
+ reclaim_space = r5c_log_required_to_flush_cache(conf);
+ if (free_space < 2 * reclaim_space)
+ set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+ else
+ clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+ if (free_space < 3 * reclaim_space)
+ set_bit(R5C_LOG_TIGHT, &conf->cache_state);
+ else
+ clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
+}
+
/*
* Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
* This function should only be called in write-back mode.
*/
-static void r5c_make_stripe_write_out(struct stripe_head *sh)
+void r5c_make_stripe_write_out(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
struct r5l_log *log = conf->log;
{
log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
+ r5c_update_log_state(log);
/*
* If we filled up the log device start from the beginning again,
* which will require a new bio.
atomic_inc(&io->pending_stripe);
sh->log_io = io;
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+ return 0;
+
+ if (sh->log_start == MaxSector) {
+ BUG_ON(!list_empty(&sh->r5c));
+ sh->log_start = io->log_start;
+ spin_lock_irq(&log->stripe_in_journal_lock);
+ list_add_tail(&sh->r5c,
+ &log->stripe_in_journal_list);
+ spin_unlock_irq(&log->stripe_in_journal_lock);
+ atomic_inc(&log->stripe_in_journal_count);
+ }
return 0;
}
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+/* add stripe to no_space_stripes, and then wake up reclaim */
+static inline void r5l_add_no_space_stripe(struct r5l_log *log,
+ struct stripe_head *sh)
+{
+ spin_lock(&log->no_space_stripes_lock);
+ list_add_tail(&sh->log_list, &log->no_space_stripes);
+ spin_unlock(&log->no_space_stripes_lock);
+}
+
/*
* running in raid5d, where reclaim could wait for raid5d too (when it flushes
* data from log to raid disks), so we shouldn't wait for reclaim here
*/
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
+ struct r5conf *conf = sh->raid_conf;
int write_disks = 0;
int data_pages, parity_pages;
int reserve;
int i;
int ret = 0;
+ bool wake_reclaim = false;
if (!log)
return -EAGAIN;
mutex_lock(&log->io_mutex);
/* meta + data */
reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
- if (!r5l_has_free_space(log, reserve)) {
- spin_lock(&log->no_space_stripes_lock);
- list_add_tail(&sh->log_list, &log->no_space_stripes);
- spin_unlock(&log->no_space_stripes_lock);
- r5l_wake_reclaim(log, reserve);
- } else {
- ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
- if (ret) {
- spin_lock_irq(&log->io_list_lock);
- list_add_tail(&sh->log_list, &log->no_mem_stripes);
- spin_unlock_irq(&log->io_list_lock);
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
+ if (!r5l_has_free_space(log, reserve)) {
+ r5l_add_no_space_stripe(log, sh);
+ wake_reclaim = true;
+ } else {
+ ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+ if (ret) {
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&sh->log_list,
+ &log->no_mem_stripes);
+ spin_unlock_irq(&log->io_list_lock);
+ }
+ }
+ } else { /* R5C_JOURNAL_MODE_WRITE_BACK */
+ /*
+ * log space critical, do not process stripes that are
+ * not in cache yet (sh->log_start == MaxSector).
+ */
+ if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+ sh->log_start == MaxSector) {
+ r5l_add_no_space_stripe(log, sh);
+ wake_reclaim = true;
+ reserve = 0;
+ } else if (!r5l_has_free_space(log, reserve)) {
+ if (sh->log_start == log->last_checkpoint)
+ BUG();
+ else
+ r5l_add_no_space_stripe(log, sh);
+ } else {
+ ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+ if (ret) {
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&sh->log_list,
+ &log->no_mem_stripes);
+ spin_unlock_irq(&log->io_list_lock);
+ }
}
}
mutex_unlock(&log->io_mutex);
+ if (wake_reclaim)
+ r5l_wake_reclaim(log, reserve);
return 0;
}
spin_unlock(&log->no_space_stripes_lock);
}
+/*
+ * calculate new last_checkpoint
+ * for write through mode, returns log->next_checkpoint
+ * for write back, returns log_start of first sh in stripe_in_journal_list
+ */
+static sector_t r5c_calculate_new_cp(struct r5conf *conf)
+{
+ struct stripe_head *sh;
+ struct r5l_log *log = conf->log;
+ sector_t new_cp;
+ unsigned long flags;
+
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+ return log->next_checkpoint;
+
+ spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
+ if (list_empty(&conf->log->stripe_in_journal_list)) {
+ /* all stripes flushed */
+ spin_unlock(&log->stripe_in_journal_lock);
+ return log->next_checkpoint;
+ }
+ sh = list_first_entry(&conf->log->stripe_in_journal_list,
+ struct stripe_head, r5c);
+ new_cp = sh->log_start;
+ spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
+ return new_cp;
+}
+
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
+ struct r5conf *conf = log->rdev->mddev->private;
+
return r5l_ring_distance(log, log->last_checkpoint,
- log->next_checkpoint);
+ r5c_calculate_new_cp(conf));
}
static void r5l_run_no_mem_stripe(struct r5l_log *log)
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
struct r5l_log *log = io->log;
+ struct r5conf *conf = log->rdev->mddev->private;
unsigned long flags;
spin_lock_irqsave(&log->io_list_lock, flags);
return;
}
- if (r5l_reclaimable_space(log) > log->max_free_space)
+ if (r5l_reclaimable_space(log) > log->max_free_space ||
+ test_bit(R5C_LOG_TIGHT, &conf->cache_state))
r5l_wake_reclaim(log, 0);
spin_unlock_irqrestore(&log->io_list_lock, flags);
}
}
+/*
+ * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
+ * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
+ *
+ * must hold conf->device_lock
+ */
+static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+ BUG_ON(list_empty(&sh->lru));
+ BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+ BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
+
+ /*
+ * The stripe is not ON_RELEASE_LIST, so it is safe to call
+ * raid5_release_stripe() while holding conf->device_lock
+ */
+ BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
+ assert_spin_locked(&conf->device_lock);
+
+ list_del_init(&sh->lru);
+ atomic_inc(&sh->count);
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ atomic_inc(&conf->active_stripes);
+ r5c_make_stripe_write_out(sh);
+
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ raid5_release_stripe(sh);
+}
+
+/*
+ * if num == 0, flush all full stripes
+ * if num > 0, flush all full stripes. If less than num full stripes are
+ * flushed, flush some partial stripes until totally num stripes are
+ * flushed or there is no more cached stripes.
+ */
+void r5c_flush_cache(struct r5conf *conf, int num)
+{
+ int count;
+ struct stripe_head *sh, *next;
+
+ assert_spin_locked(&conf->device_lock);
+ if (!conf->log)
+ return;
+
+ count = 0;
+ list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
+ r5c_flush_stripe(conf, sh);
+ count++;
+ }
+
+ if (count >= num)
+ return;
+ list_for_each_entry_safe(sh, next,
+ &conf->r5c_partial_stripe_list, lru) {
+ r5c_flush_stripe(conf, sh);
+ if (++count >= num)
+ break;
+ }
+}
+
+static void r5c_do_reclaim(struct r5conf *conf)
+{
+ struct r5l_log *log = conf->log;
+ struct stripe_head *sh;
+ int count = 0;
+ unsigned long flags;
+ int total_cached;
+ int stripes_to_flush;
+
+ if (!r5c_is_writeback(log))
+ return;
+
+ total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
+ atomic_read(&conf->r5c_cached_full_stripes);
+
+ if (total_cached > conf->min_nr_stripes * 3 / 4 ||
+ atomic_read(&conf->empty_inactive_list_nr) > 0)
+ /*
+ * if stripe cache pressure high, flush all full stripes and
+ * some partial stripes
+ */
+ stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
+ else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
+ atomic_read(&conf->r5c_cached_full_stripes) >
+ R5C_FULL_STRIPE_FLUSH_BATCH)
+ /*
+ * if stripe cache pressure moderate, or if there is many full
+ * stripes,flush all full stripes
+ */
+ stripes_to_flush = 0;
+ else
+ /* no need to flush */
+ stripes_to_flush = -1;
+
+ if (stripes_to_flush >= 0) {
+ spin_lock_irqsave(&conf->device_lock, flags);
+ r5c_flush_cache(conf, stripes_to_flush);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+
+ /* if log space is tight, flush stripes on stripe_in_journal_list */
+ if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
+ spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
+ spin_lock(&conf->device_lock);
+ list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
+ /*
+ * stripes on stripe_in_journal_list could be in any
+ * state of the stripe_cache state machine. In this
+ * case, we only want to flush stripe on
+ * r5c_cached_full/partial_stripes. The following
+ * condition makes sure the stripe is on one of the
+ * two lists.
+ */
+ if (!list_empty(&sh->lru) &&
+ !test_bit(STRIPE_HANDLE, &sh->state) &&
+ atomic_read(&sh->count) == 0) {
+ r5c_flush_stripe(conf, sh);
+ }
+ if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
+ break;
+ }
+ spin_unlock(&conf->device_lock);
+ spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
+ }
+ md_wakeup_thread(conf->mddev->thread);
+}
+
static void r5l_do_reclaim(struct r5l_log *log)
{
+ struct r5conf *conf = log->rdev->mddev->private;
sector_t reclaim_target = xchg(&log->reclaim_target, 0);
sector_t reclaimable;
sector_t next_checkpoint;
- u64 next_cp_seq;
+ bool write_super;
spin_lock_irq(&log->io_list_lock);
+ write_super = r5l_reclaimable_space(log) > log->max_free_space ||
+ reclaim_target != 0 || !list_empty(&log->no_space_stripes);
/*
* move proper io_unit to reclaim list. We should not change the order.
* reclaimable/unreclaimable io_unit can be mixed in the list, we
log->io_list_lock);
}
- next_checkpoint = log->next_checkpoint;
- next_cp_seq = log->next_cp_seq;
+ next_checkpoint = r5c_calculate_new_cp(conf);
spin_unlock_irq(&log->io_list_lock);
BUG_ON(reclaimable < 0);
- if (reclaimable == 0)
+
+ if (reclaimable == 0 || !write_super)
return;
/*
mutex_lock(&log->io_mutex);
log->last_checkpoint = next_checkpoint;
- log->last_cp_seq = next_cp_seq;
+ r5c_update_log_state(log);
mutex_unlock(&log->io_mutex);
r5l_run_no_space_stripes(log);
if (!log)
return;
+ r5c_do_reclaim(conf);
r5l_do_reclaim(log);
}
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
unsigned long target;
unsigned long new = (unsigned long)space; /* overflow in theory */
+ if (!log)
+ return;
do {
target = log->reclaim_target;
if (new < target)
return;
log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
log->rdev->mddev, "reclaim");
+ log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
} else if (state == 1) {
/* make sure r5l_write_super_and_discard_space exits */
mddev = log->rdev->mddev;
wake_up(&mddev->sb_wait);
- r5l_wake_reclaim(log, -1L);
+ r5l_wake_reclaim(log, MaxSector);
md_unregister_thread(&log->reclaim_thread);
r5l_do_reclaim(log);
}
if (do_wakeup)
wake_up(&conf->wait_for_overlap);
+
+ if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+ return;
+
+ spin_lock_irq(&conf->log->stripe_in_journal_lock);
+ list_del_init(&sh->r5c);
+ spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+ sh->log_start = MaxSector;
+ atomic_dec(&conf->log->stripe_in_journal_count);
}
int
r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
struct stripe_head_state *s)
{
+ struct r5conf *conf = sh->raid_conf;
int pages = 0;
int reserve;
int i;
mutex_lock(&log->io_mutex);
/* meta + data */
reserve = (1 + pages) << (PAGE_SHIFT - 9);
- if (!r5l_has_free_space(log, reserve)) {
- spin_lock(&log->no_space_stripes_lock);
- list_add_tail(&sh->log_list, &log->no_space_stripes);
- spin_unlock(&log->no_space_stripes_lock);
- r5l_wake_reclaim(log, reserve);
+ if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+ sh->log_start == MaxSector)
+ r5l_add_no_space_stripe(log, sh);
+ else if (!r5l_has_free_space(log, reserve)) {
+ if (sh->log_start == log->last_checkpoint)
+ BUG();
+ else
+ r5l_add_no_space_stripe(log, sh);
} else {
ret = r5l_log_stripe(log, sh, pages, 0);
if (ret) {
return 0;
}
-
static int r5l_load_log(struct r5l_log *log)
{
struct md_rdev *rdev = log->rdev;
log->max_free_space = RECLAIM_MAX_FREE_SPACE;
log->last_checkpoint = cp;
log->next_checkpoint = cp;
+ mutex_lock(&log->io_mutex);
+ r5c_update_log_state(log);
+ mutex_unlock(&log->io_mutex);
__free_page(page);
log->rdev->mddev, "reclaim");
if (!log->reclaim_thread)
goto reclaim_thread;
+ log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
init_waitqueue_head(&log->iounit_wait);
INIT_LIST_HEAD(&log->no_mem_stripes);
spin_lock_init(&log->no_space_stripes_lock);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+ INIT_LIST_HEAD(&log->stripe_in_journal_list);
+ spin_lock_init(&log->stripe_in_journal_lock);
+ atomic_set(&log->stripe_in_journal_count, 0);
if (r5l_load_log(log))
goto error;