may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+ /*
+ * If a page at the tail of the LRU is under writeback, there
+ * are three cases to consider.
+ *
+ * 1) If reclaim is encountering an excessive number of pages
+ * under writeback and this page is both under writeback and
+ * PageReclaim then it indicates that pages are being queued
+ * for IO but are being recycled through the LRU before the
+ * IO can complete. Waiting on the page itself risks an
+ * indefinite stall if it is impossible to writeback the
+ * page due to IO error or disconnected storage so instead
+ * block for HZ/10 or until some IO completes then clear the
+ * ZONE_WRITEBACK flag to recheck if the condition exists.
+ *
+ * 2) Global reclaim encounters a page, memcg encounters a
+ * page that is not marked for immediate reclaim or
+ * the caller does not have __GFP_IO. In this case mark
+ * the page for immediate reclaim and continue scanning.
+ *
+ * __GFP_IO is checked because a loop driver thread might
+ * enter reclaim, and deadlock if it waits on a page for
+ * which it is needed to do the write (loop masks off
+ * __GFP_IO|__GFP_FS for this reason); but more thought
+ * would probably show more reasons.
+ *
+ * Don't require __GFP_FS, since we're not going into the
+ * FS, just waiting on its writeback completion. Worryingly,
+ * ext4 gfs2 and xfs allocate pages with
+ * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+ * may_enter_fs here is liable to OOM on them.
+ *
+ * 3) memcg encounters a page that is already marked
+ * PageReclaim. memcg does not have any dirty pages
+ * throttling so we could easily OOM just because too many
+ * pages are in writeback and there is nothing else to
+ * reclaim. Wait for the writeback to complete.
+ */
if (PageWriteback(page)) {
- /*
- * memcg doesn't have any dirty pages throttling so we
- * could easily OOM just because too many pages are in
- * writeback and there is nothing else to reclaim.
- *
- * Check __GFP_IO, certainly because a loop driver
- * thread might enter reclaim, and deadlock if it waits
- * on a page for which it is needed to do the write
- * (loop masks off __GFP_IO|__GFP_FS for this reason);
- * but more thought would probably show more reasons.
- *
- * Don't require __GFP_FS, since we're not going into
- * the FS, just waiting on its writeback completion.
- * Worryingly, ext4 gfs2 and xfs allocate pages with
- * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
- * testing may_enter_fs here is liable to OOM on them.
- */
- if (global_reclaim(sc) ||
+ /* Case 1 above */
+ if (current_is_kswapd() &&
+ PageReclaim(page) &&
+ zone_is_reclaim_writeback(zone)) {
+ unlock_page(page);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ zone_clear_flag(zone, ZONE_WRITEBACK);
+ goto keep;
+
+ /* Case 2 above */
+ } else if (global_reclaim(sc) ||
!PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
/*
* This is slightly racy - end_page_writeback()
*/
SetPageReclaim(page);
nr_writeback++;
+
goto keep_locked;
+
+ /* Case 3 above */
+ } else {
+ wait_on_page_writeback(page);
}
- wait_on_page_writeback(page);
}
if (!force_reclaim)
* isolated page is PageWriteback
*/
if (nr_writeback && nr_writeback >=
- (nr_taken >> (DEF_PRIORITY - sc->priority)))
+ (nr_taken >> (DEF_PRIORITY - sc->priority))) {
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+ zone_set_flag(zone, ZONE_WRITEBACK);
+ }
/*
* Similarly, if many dirty pages are encountered that are not
* the high watermark.
*
* Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
*/
static bool kswapd_shrink_zone(struct zone *zone,
struct scan_control *sc,
if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
+ zone_clear_flag(zone, ZONE_WRITEBACK);
+
return sc->nr_scanned >= sc->nr_to_reclaim;
}