#include <linux/atomic.h>
#include <linux/math64.h>
#include <linux/crypto.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
#include <linux/cleancache.h>
#include <linux/frontswap.h>
}
#endif
+/* enable (or fix code) when Seth's patches are accepted upstream */
+#define zcache_writeback_enabled 0
+
static int zcache_enabled __read_mostly;
static int disable_cleancache __read_mostly;
static int disable_frontswap __read_mostly;
static unsigned long zcache_last_inactive_anon_pageframes;
static unsigned long zcache_eph_nonactive_puts_ignored;
static unsigned long zcache_pers_nonactive_puts_ignored;
+static unsigned long zcache_writtenback_pages;
+static long zcache_outstanding_writeback_pages;
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
zdfs64("eph_zbytes_max", S_IRUGO, root, &zcache_eph_zbytes_max);
zdfs64("pers_zbytes", S_IRUGO, root, &zcache_pers_zbytes);
zdfs64("pers_zbytes_max", S_IRUGO, root, &zcache_pers_zbytes_max);
+ zdfs("outstanding_writeback_pages", S_IRUGO, root,
+ &zcache_outstanding_writeback_pages);
+ zdfs("writtenback_pages", S_IRUGO, root, &zcache_writtenback_pages);
return 0;
}
#undef zdebugfs
pr_info("zcache: eph_zpages_max=%lu\n", zcache_eph_zpages_max);
pr_info("zcache: pers_zpages=%lu\n", zcache_pers_zpages);
pr_info("zcache: pers_zpages_max=%lu\n", zcache_pers_zpages_max);
+ pr_info("zcache: last_active_file_pageframes=%lu\n",
+ zcache_last_active_file_pageframes);
+ pr_info("zcache: last_inactive_file_pageframes=%lu\n",
+ zcache_last_inactive_file_pageframes);
+ pr_info("zcache: last_active_anon_pageframes=%lu\n",
+ zcache_last_active_anon_pageframes);
+ pr_info("zcache: last_inactive_anon_pageframes=%lu\n",
+ zcache_last_inactive_anon_pageframes);
+ pr_info("zcache: eph_nonactive_puts_ignored=%lu\n",
+ zcache_eph_nonactive_puts_ignored);
+ pr_info("zcache: pers_nonactive_puts_ignored=%lu\n",
+ zcache_pers_nonactive_puts_ignored);
pr_info("zcache: eph_zbytes=%llu\n",
(unsigned long long)zcache_eph_zbytes);
pr_info("zcache: eph_zbytes_max=%llu\n",
pr_info("zcache: pers_zbytes=%llu\n",
(unsigned long long)zcache_pers_zbytes);
pr_info("zcache: pers_zbytes_max=%llu\n",
- (unsigned long long)zcache_pers_zbytes_max);
+ (unsigned long long)zcache_pers_zbytes_max);
+ pr_info("zcache: outstanding_writeback_pages=%lu\n",
+ zcache_outstanding_writeback_pages);
+ pr_info("zcache: writtenback_pages=%lu\n", zcache_writtenback_pages);
}
#endif
return page;
}
-#ifdef FRONTSWAP_HAS_UNUSE
-static void zcache_unacct_page(void)
-{
- zcache_pageframes_freed =
- atomic_inc_return(&zcache_pageframes_freed_atomic);
-}
-#endif
-
static void zcache_free_page(struct page *page)
{
long curr_pageframes;
&zcache_eph_zbytes_atomic);
zcache_eph_zpages = atomic_sub_return(zpages,
&zcache_eph_zpages_atomic);
- zcache_evicted_eph_zpages++;
+ zcache_evicted_eph_zpages += zpages;
zcache_eph_pageframes =
atomic_dec_return(&zcache_eph_pageframes_atomic);
zcache_evicted_eph_pageframes++;
return page;
}
-#ifdef FRONTSWAP_HAS_UNUSE
+#ifdef CONFIG_ZCACHE_WRITEBACK
+
+static atomic_t zcache_outstanding_writeback_pages_atomic = ATOMIC_INIT(0);
+
static void unswiz(struct tmem_oid oid, u32 index,
unsigned *type, pgoff_t *offset);
/*
- * Choose an LRU persistent pageframe and attempt to "unuse" it by
- * calling frontswap_unuse on both zpages.
+ * Choose an LRU persistent pageframe and attempt to write it back to
+ * the backing swap disk by calling frontswap_writeback on both zpages.
*
* This is work-in-progress.
*/
-static int zcache_frontswap_unuse(void)
+static void zcache_end_swap_write(struct bio *bio, int err)
+{
+ end_swap_bio_write(bio, err);
+ zcache_outstanding_writeback_pages =
+ atomic_dec_return(&zcache_outstanding_writeback_pages_atomic);
+ zcache_writtenback_pages++;
+}
+
+/*
+ * zcache_get_swap_cache_page
+ *
+ * This is an adaption of read_swap_cache_async()
+ *
+ * If success, page is returned in retpage
+ * Returns 0 if page was already in the swap cache, page is not locked
+ * Returns 1 if the new page needs to be populated, page is locked
+ */
+static int zcache_get_swap_cache_page(int type, pgoff_t offset,
+ struct page *new_page)
+{
+ struct page *found_page;
+ swp_entry_t entry = swp_entry(type, offset);
+ int err;
+
+ BUG_ON(new_page == NULL);
+ do {
+ /*
+ * First check the swap cache. Since this is normally
+ * called after lookup_swap_cache() failed, re-calling
+ * that would confuse statistics.
+ */
+ found_page = find_get_page(&swapper_space, entry.val);
+ if (found_page)
+ return 0;
+
+ /*
+ * call radix_tree_preload() while we can wait.
+ */
+ err = radix_tree_preload(GFP_KERNEL);
+ if (err)
+ break;
+
+ /*
+ * Swap entry may have been freed since our caller observed it.
+ */
+ err = swapcache_prepare(entry);
+ if (err == -EEXIST) { /* seems racy */
+ radix_tree_preload_end();
+ continue;
+ }
+ if (err) { /* swp entry is obsolete ? */
+ radix_tree_preload_end();
+ break;
+ }
+
+ /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+ err = __add_to_swap_cache(new_page, entry);
+ if (likely(!err)) {
+ radix_tree_preload_end();
+ lru_cache_add_anon(new_page);
+ return 1;
+ }
+ radix_tree_preload_end();
+ ClearPageSwapBacked(new_page);
+ __clear_page_locked(new_page);
+ /*
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
+ */
+ swapcache_free(entry, NULL);
+ /* FIXME: is it possible to get here without err==-ENOMEM?
+ * If not, we can dispense with the do loop, use goto retry */
+ } while (err != -ENOMEM);
+
+ return -ENOMEM;
+}
+
+/*
+ * Given a frontswap zpage in zcache (identified by type/offset) and
+ * an empty page, put the page into the swap cache, use frontswap
+ * to get the page from zcache into the empty page, then give it
+ * to the swap subsystem to send to disk (carefully avoiding the
+ * possibility that frontswap might snatch it back).
+ * Returns < 0 if error, 0 if successful, and 1 if successful but
+ * the newpage passed in not needed and should be freed.
+ */
+static int zcache_frontswap_writeback_zpage(int type, pgoff_t offset,
+ struct page *newpage)
+{
+ struct page *page = newpage;
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ };
+
+ ret = zcache_get_swap_cache_page(type, offset, page);
+ if (ret < 0)
+ return ret;
+ else if (ret == 0) {
+ /* more uptodate page is already in swapcache */
+ __frontswap_invalidate_page(type, offset);
+ return 1;
+ }
+
+ BUG_ON(!frontswap_has_exclusive_gets); /* load must also invalidate */
+ /* FIXME: how is it possible to get here when page is unlocked? */
+ __frontswap_load(page);
+ SetPageUptodate(page); /* above does SetPageDirty, is that enough? */
+
+ /* start writeback */
+ SetPageReclaim(page);
+ /*
+ * Return value is ignored here because it doesn't change anything
+ * for us. Page is returned unlocked.
+ */
+ (void)__swap_writepage(page, &wbc, zcache_end_swap_write);
+ page_cache_release(page);
+ zcache_outstanding_writeback_pages =
+ atomic_inc_return(&zcache_outstanding_writeback_pages_atomic);
+
+ return 0;
+}
+
+/*
+ * The following is still a magic number... we want to allow forward progress
+ * for writeback because it clears out needed RAM when under pressure, but
+ * we don't want to allow writeback to absorb and queue too many GFP_KERNEL
+ * pages if the swap device is very slow.
+ */
+#define ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES 6400
+
+/*
+ * Try to allocate two free pages, first using a non-aggressive alloc,
+ * then by evicting zcache ephemeral (clean pagecache) pages, and last
+ * by aggressive GFP_KERNEL alloc. We allow zbud to choose a pageframe
+ * consisting of 1-2 zbuds/zpages, then call the writeback_zpage helper
+ * function above for each.
+ */
+static int zcache_frontswap_writeback(void)
{
struct tmem_handle th[2];
- int ret = -ENOMEM;
- int nzbuds, unuse_ret;
+ int ret = 0;
+ int nzbuds, writeback_ret;
unsigned type;
- struct page *newpage1 = NULL, *newpage2 = NULL;
+ struct page *znewpage1 = NULL, *znewpage2 = NULL;
struct page *evictpage1 = NULL, *evictpage2 = NULL;
+ struct page *newpage1 = NULL, *newpage2 = NULL;
+ struct page *page1 = NULL, *page2 = NULL;
pgoff_t offset;
- newpage1 = alloc_page(ZCACHE_GFP_MASK);
- newpage2 = alloc_page(ZCACHE_GFP_MASK);
- if (newpage1 == NULL)
+ znewpage1 = alloc_page(ZCACHE_GFP_MASK);
+ znewpage2 = alloc_page(ZCACHE_GFP_MASK);
+ if (znewpage1 == NULL)
evictpage1 = zcache_evict_eph_pageframe();
- if (newpage2 == NULL)
+ if (znewpage2 == NULL)
evictpage2 = zcache_evict_eph_pageframe();
- if (evictpage1 == NULL || evictpage2 == NULL)
+
+ if ((evictpage1 == NULL || evictpage2 == NULL) &&
+ atomic_read(&zcache_outstanding_writeback_pages_atomic) >
+ ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES) {
goto free_and_out;
- /* ok, we have two pages pre-allocated */
+ }
+ if (znewpage1 == NULL && evictpage1 == NULL)
+ newpage1 = alloc_page(GFP_KERNEL);
+ if (znewpage2 == NULL && evictpage2 == NULL)
+ newpage2 = alloc_page(GFP_KERNEL);
+ if (newpage1 == NULL || newpage2 == NULL)
+ goto free_and_out;
+
+ /* ok, we have two pageframes pre-allocated, get a pair of zbuds */
nzbuds = zbud_make_zombie_lru(&th[0], NULL, NULL, false);
if (nzbuds == 0) {
ret = -ENOENT;
goto free_and_out;
}
+
+ /* process the first zbud */
unswiz(th[0].oid, th[0].index, &type, &offset);
- unuse_ret = frontswap_unuse(type, offset,
- newpage1 != NULL ? newpage1 : evictpage1,
- ZCACHE_GFP_MASK);
- if (unuse_ret != 0)
+ page1 = (znewpage1 != NULL) ? znewpage1 :
+ ((newpage1 != NULL) ? newpage1 : evictpage1);
+ writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page1);
+ if (writeback_ret < 0) {
+ ret = -ENOMEM;
goto free_and_out;
- else if (evictpage1 != NULL)
- zcache_unacct_page();
- newpage1 = NULL;
- evictpage1 = NULL;
- if (nzbuds == 2) {
- unswiz(th[1].oid, th[1].index, &type, &offset);
- unuse_ret = frontswap_unuse(type, offset,
- newpage2 != NULL ? newpage2 : evictpage2,
- ZCACHE_GFP_MASK);
- if (unuse_ret != 0)
- goto free_and_out;
- else if (evictpage2 != NULL)
- zcache_unacct_page();
}
- ret = 0;
- goto out;
+ if (evictpage1 != NULL)
+ zcache_pageframes_freed =
+ atomic_inc_return(&zcache_pageframes_freed_atomic);
+ if (writeback_ret == 0) {
+ /* zcache_get_swap_cache_page will free, don't double free */
+ znewpage1 = NULL;
+ newpage1 = NULL;
+ evictpage1 = NULL;
+ }
+ if (nzbuds < 2)
+ goto free_and_out;
+
+ /* if there is a second zbud, process it */
+ unswiz(th[1].oid, th[1].index, &type, &offset);
+ page2 = (znewpage2 != NULL) ? znewpage2 :
+ ((newpage2 != NULL) ? newpage2 : evictpage2);
+ writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page2);
+ if (writeback_ret < 0) {
+ ret = -ENOMEM;
+ goto free_and_out;
+ }
+ if (evictpage2 != NULL)
+ zcache_pageframes_freed =
+ atomic_inc_return(&zcache_pageframes_freed_atomic);
+ if (writeback_ret == 0) {
+ znewpage2 = NULL;
+ newpage2 = NULL;
+ evictpage2 = NULL;
+ }
free_and_out:
+ if (znewpage1 != NULL)
+ page_cache_release(znewpage1);
+ if (znewpage2 != NULL)
+ page_cache_release(znewpage2);
if (newpage1 != NULL)
- __free_page(newpage1);
+ page_cache_release(newpage1);
if (newpage2 != NULL)
- __free_page(newpage2);
+ page_cache_release(newpage2);
if (evictpage1 != NULL)
zcache_free_page(evictpage1);
if (evictpage2 != NULL)
zcache_free_page(evictpage2);
-out:
return ret;
}
-#endif
+#endif /* CONFIG_ZCACHE_WRITEBACK */
/*
* When zcache is disabled ("frozen"), pools can be created and destroyed,
/*
* This zcache shrinker interface reduces the number of ephemeral pageframes
* used by zcache to approximately the same as the total number of LRU_FILE
- * pageframes in use.
+ * pageframes in use, and now also reduces the number of persistent pageframes
+ * used by zcache to approximately the same as the total number of LRU_ANON
+ * pageframes in use. FIXME POLICY: Probably the writeback should only occur
+ * if the eviction doesn't free enough pages.
*/
static int shrink_zcache_memory(struct shrinker *shrink,
struct shrink_control *sc)
int ret = -1;
int nr = sc->nr_to_scan;
int nr_evict = 0;
- int nr_unuse = 0;
+ int nr_writeback = 0;
struct page *page;
-#ifdef FRONTSWAP_HAS_UNUSE
- int unuse_ret;
-#endif
+ int file_pageframes_inuse, anon_pageframes_inuse;
if (nr <= 0)
goto skip_evict;
global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE);
zcache_last_inactive_file_pageframes =
global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE);
- nr_evict = zcache_eph_pageframes - zcache_last_active_file_pageframes +
- zcache_last_inactive_file_pageframes;
+ file_pageframes_inuse = zcache_last_active_file_pageframes +
+ zcache_last_inactive_file_pageframes;
+ if (zcache_eph_pageframes > file_pageframes_inuse)
+ nr_evict = zcache_eph_pageframes - file_pageframes_inuse;
+ else
+ nr_evict = 0;
while (nr_evict-- > 0) {
page = zcache_evict_eph_pageframe();
if (page == NULL)
global_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON);
zcache_last_inactive_anon_pageframes =
global_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON);
- nr_unuse = zcache_pers_pageframes - zcache_last_active_anon_pageframes +
- zcache_last_inactive_anon_pageframes;
-#ifdef FRONTSWAP_HAS_UNUSE
- /* rate limit for testing */
- if (nr_unuse > 32)
- nr_unuse = 32;
- while (nr_unuse-- > 0) {
- unuse_ret = zcache_frontswap_unuse();
- if (unuse_ret == -ENOMEM)
+ anon_pageframes_inuse = zcache_last_active_anon_pageframes +
+ zcache_last_inactive_anon_pageframes;
+ if (zcache_pers_pageframes > anon_pageframes_inuse)
+ nr_writeback = zcache_pers_pageframes - anon_pageframes_inuse;
+ else
+ nr_writeback = 0;
+ while (nr_writeback-- > 0) {
+#ifdef CONFIG_ZCACHE_WRITEBACK
+ int writeback_ret;
+ writeback_ret = zcache_frontswap_writeback();
+ if (writeback_ret == -ENOMEM)
+#endif
break;
}
-#endif
in_progress = false;
skip_evict:
int zcache_autocreate_pool(unsigned int cli_id, unsigned int pool_id, bool eph)
{
struct tmem_pool *pool;
- struct zcache_client *cli = NULL;
+ struct zcache_client *cli;
uint32_t flags = eph ? 0 : TMEM_POOL_PERSIST;
int ret = -1;
return oid;
}
-#ifdef FRONTSWAP_HAS_UNUSE
+#ifdef CONFIG_ZCACHE_WRITEBACK
static void unswiz(struct tmem_oid oid, u32 index,
unsigned *type, pgoff_t *offset)
{