void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
void set_bdi_congested(struct backing_dev_info *bdi, int sync);
long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
{
typedef enum {
ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
+ ZONE_CONGESTED, /* zone has many dirty pages backed by
+ * a congested BDI; queried through
+ * zone_is_reclaim_congested()
+ */
} zone_flags_t;
static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
clear_bit(flag, &zone->flags);
}
+/*
+ * Report whether the ZONE_CONGESTED bit is currently set in @zone's flags.
+ * The result of test_bit() is handed back unchanged.
+ */
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+	const int congested = test_bit(ZONE_CONGESTED, &zone->flags);
+
+	return congested;
+}
+
static inline int zone_is_reclaim_locked(const struct zone *zone)
{
return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
TP_ARGS(usec_timeout, usec_delayed)
);
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+	/*
+	 * Emitted by wait_iff_congested() just before it returns, carrying
+	 * the requested timeout and the time spent in the call, both
+	 * converted from jiffies to microseconds.
+	 */
+ TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+ TP_ARGS(usec_timeout, usec_delayed)
+);
+
#endif /* _TRACE_WRITEBACK_H */
/* This part must be outside protection */
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+static atomic_t nr_bdi_congested[2];	/* congested-BDI count, indexed by sync:
+					 * incremented by set_bdi_congested()
+					 * when it newly sets the bit, and
+					 * decremented by clear_bdi_congested()
+					 * when it clears a bit that was set */
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
- clear_bit(bit, &bdi->state);
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
enum bdi_state bit;
bit = sync ? BDI_sync_congested : BDI_async_congested;
- set_bit(bit, &bdi->state);
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
}
EXPORT_SYMBOL(congestion_wait);
+/**
+ * wait_iff_congested - Sleep on the congestion queue only when both a backing_dev and the zone are congested
+ * @zone: zone whose ZONE_CONGESTED flag decides whether sleeping is warranted
+ * @sync: selects the congestion queue and BDI counter (SYNC or ASYNC IO)
+ * @timeout: upper bound on the sleep, in jiffies
+ *
+ * If at least one backing_dev is marked congested for @sync and @zone is
+ * flagged as reclaim-congested, the caller sleeps on the @sync congestion
+ * queue for at most @timeout jiffies, or until that queue is woken.
+ *
+ * In the absence of either condition only cond_resched() is called, so
+ * the processor is yielded if necessary but the congestion queue is not
+ * slept on.
+ *
+ * Returns the number of jiffies of @timeout still remaining: 0 when the
+ * full timeout elapsed, and a value equal to @timeout when the function
+ * did not sleep at all.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+	long remaining;
+	unsigned long entered = jiffies;
+	DEFINE_WAIT(waiter);
+	wait_queue_head_t *queue = &congestion_wqh[sync];
+
+	if (atomic_read(&nr_bdi_congested[sync]) != 0 &&
+	    zone_is_reclaim_congested(zone)) {
+		/* A BDI and this zone are congested: sleep until woken or timed out */
+		prepare_to_wait(queue, &waiter, TASK_UNINTERRUPTIBLE);
+		remaining = io_schedule_timeout(timeout);
+		finish_wait(queue, &waiter);
+	} else {
+		/* Nothing worth sleeping for: only give up the CPU if needed */
+		cond_resched();
+
+		/* Rescheduling may have taken time; charge it to the timeout */
+		remaining = timeout - (jiffies - entered);
+		if (remaining < 0)
+			remaining = 0;
+	}
+
+	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+					jiffies_to_usecs(jiffies - entered));
+
+	return remaining;
+}
+EXPORT_SYMBOL(wait_iff_congested);
preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
}
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
- disable_lumpy_reclaim_mode(sc);
+ if (!may_write_to_queue(mapping->backing_dev_info, sc))
return PAGE_KEEP;
- }
if (clear_page_dirty_for_io(page)) {
int res;
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
+ struct zone *zone,
struct scan_control *sc)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
cond_resched();
goto keep;
VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
}
if (PageDirty(page)) {
+ nr_dirty++;
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
/* Page is dirty, try to write it out here */
switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
+ nr_congested++;
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+	/*
+	 * Tag a zone as congested if all the dirty pages encountered were
+	 * backed by a congested BDI. In this case, reclaimers should just
+	 * back off and wait for congestion to clear because further reclaim
+	 * will encounter the same problem.
+	 *
+	 * At least one dirty page is required: when nr_dirty is 0 the two
+	 * counters are trivially equal, and a batch made up purely of clean
+	 * pages would otherwise mark the zone congested.
+	 */
+	if (nr_dirty && nr_dirty == nr_congested)
+		zone_set_flag(zone, ZONE_CONGESTED);
+
free_page_list(&free_pages);
list_splice(&ret_pages, page_list);
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, sc);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc);
/* Check if we should syncronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
set_lumpy_reclaim_mode(priority, sc, true);
- nr_reclaimed += shrink_page_list(&page_list, sc);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
local_irq_disable();
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
- priority < DEF_PRIORITY - 2)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ priority < DEF_PRIORITY - 2) {
+ struct zone *preferred_zone;
+
+ first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+ NULL, &preferred_zone);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+ }
}
out:
if (!zone_watermark_ok(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
+ } else {
+ /*
+ * If a zone reaches its high watermark,
+ * consider it to be no longer congested. It's
+ * possible there are dirty pages backed by
+ * congested BDIs but as pressure is relieved,
+ * speculatively avoid congestion waits
+ */
+ zone_clear_flag(zone, ZONE_CONGESTED);
}
}