8. LRU
Each memcg has its own private LRU. Now, its handling is under global
- VM's control (means that it's handled under global zone->lru_lock).
+ VM's control (meaning that it is handled under the global zone_lru_lock).
Almost all routines around memcg's LRU is called by global LRU's
- list management functions under zone->lru_lock().
+ list management functions under zone_lru_lock().
A special function is mem_cgroup_isolate_pages(). This scans
memcg's private LRU and call __isolate_lru_page() to extract a page
Other lock order is following:
PG_locked.
mm->page_table_lock
- zone->lru_lock
+ zone_lru_lock
lock_page_cgroup.
In many cases, just lock_page_cgroup() is called.
per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
- zone->lru_lock, it has no lock of its own.
+ zone_lru_lock, it has no lock of its own.
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
*/
union {
struct list_head lru; /* Pageout list, eg. active_list
- * protected by zone->lru_lock !
+ * protected by zone_lru_lock !
* Can be used as a generic list
* by the page owner.
*/
struct pglist_data;
/*
- * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
+ * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
* So add a wild amount of padding here to ensure that they fall into separate
* cachelines. There are very few zone structures in the machine, so space
* consumption is not a concern here.
/* Write-intensive fields used by page reclaim */
/* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
struct lruvec lruvec;
/*
/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
#endif
+ /* Write-intensive fields used by page reclaim */
+ ZONE_PADDING(_pad1_)
+ spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
+/*
+ * Return a pointer to the LRU spinlock used for @zone.
+ *
+ * The lock is not stored in the zone itself: it is the lru_lock member
+ * reached through zone->zone_pgdat, so two zones that point at the same
+ * zone_pgdat get the same lock back.
+ */
+static inline spinlock_t *zone_lru_lock(struct zone *zone)
+{
+ return &zone->zone_pgdat->lru_lock;
+}
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
* if contended.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&zone->lru_lock, flags,
+ && compact_unlock_should_abort(zone_lru_lock(zone), flags,
&locked, cc))
break;
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&zone->lru_lock,
+ spin_unlock_irqrestore(zone_lru_lock(zone),
flags);
locked = false;
}
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
- locked = compact_trylock_irqsave(&zone->lru_lock,
+ locked = compact_trylock_irqsave(zone_lru_lock(zone),
&flags, cc);
if (!locked)
break;
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
acct_isolated(zone, cc);
low_pfn = end_pfn;
if (locked)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
/*
* Update the pageblock-skip information and cached scanner pfn,
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
- * ->zone.lru_lock (follow_page->mark_page_accessed)
- * ->zone.lru_lock (check_pte_range->isolate_lru_page)
+ * ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
+ * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
spin_unlock(&head->mapping->tree_lock);
}
- spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head);
lru_add_drain();
/* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+ spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
if (mapping) {
void **pslot;
spin_unlock(&pgdata->split_queue_lock);
fail: if (mapping)
spin_unlock(&mapping->tree_lock);
- spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head);
ret = -EBUSY;
}
{
struct zone *zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
if (PageLRU(page)) {
struct lruvec *lruvec;
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock and migration entries setup in all page mappings.
+ * zone_lru_lock here, and migration entries are set up in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
* might otherwise copy PageMlocked to part of the tail pages before
* we clear it in the head page. It also stabilizes hpage_nr_pages().
*/
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
nr_pages = hpage_nr_pages(page);
if (!TestClearPageMlocked(page))
__mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
__munlock_isolated_page(page);
goto out;
}
__munlock_isolation_failed(page);
unlock_out:
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
out:
return nr_pages - 1;
pagevec_init(&pvec_putback, 0);
/* Phase 1: page isolation */
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
}
delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
init_waitqueue_head(&pgdat->kcompactd_wait);
#endif
pgdat_page_ext_init(pgdat);
+ spin_lock_init(&pgdat->lru_lock);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
+ zone->zone_pgdat = pgdat;
spin_lock_init(&zone->lock);
- spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
- zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
/* For bootup, initialized properly in watermark setup */
return NULL;
zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
return page;
}
* mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
struct lruvec *lruvec;
unsigned long flags;
- spin_lock_irqsave(&zone->lru_lock, flags);
+ spin_lock_irqsave(zone_lru_lock(zone), flags);
lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
}
mem_cgroup_uncharge(page);
}
if (pagezone != zone) {
if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
zone = pagezone;
- spin_lock_irqsave(&zone->lru_lock, flags);
+ spin_lock_irqsave(zone_lru_lock(zone), flags);
}
lruvec = mem_cgroup_page_lruvec(page, zone);
(*move_fn)(page, lruvec, arg);
}
if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
struct zone *zone = page_zone(page);
page = compound_head(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
#endif
struct zone *zone = page_zone(page);
struct lruvec *lruvec;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
lruvec = mem_cgroup_page_lruvec(page, zone);
ClearPageActive(page);
SetPageUnevictable(page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
/**
* same zone. The lock is held only if zone != NULL.
*/
if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
zone = NULL;
}
if (PageCompound(page)) {
if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
zone = NULL;
}
__put_compound_page(page);
if (pagezone != zone) {
if (zone)
- spin_unlock_irqrestore(&zone->lru_lock,
+ spin_unlock_irqrestore(zone_lru_lock(zone),
flags);
lock_batch = 0;
zone = pagezone;
- spin_lock_irqsave(&zone->lru_lock, flags);
+ spin_lock_irqsave(zone_lru_lock(zone), flags);
}
lruvec = mem_cgroup_page_lruvec(page, zone);
list_add(&page->lru, &pages_to_free);
}
if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
mem_cgroup_uncharge_list(&pages_to_free);
free_hot_cold_page_list(&pages_to_free, cold);
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 &&
- !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
+ !spin_is_locked(zone_lru_lock(lruvec_zone(lruvec))));
if (!list)
SetPageLRU(page_tail);
}
/*
- * zone->lru_lock is heavily contended. Some of the functions that
+ * zone_lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
struct zone *zone = page_zone(page);
struct lruvec *lruvec;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
lruvec = mem_cgroup_page_lruvec(page, zone);
if (PageLRU(page)) {
int lru = page_lru(page);
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
return ret;
}
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
continue;
}
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
} else
list_add(&page->lru, &pages_to_free);
}
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
else
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
if (nr_taken == 0)
return 0;
&nr_writeback, &nr_immediate,
false);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
if (global_reclaim(sc)) {
if (current_is_kswapd())
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
mem_cgroup_uncharge_list(&page_list);
free_hot_cold_page_list(&page_list, true);
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation. But if
+ * appropriate to hold zone_lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page. It's impossible to balance
+ * should drop zone_lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
} else
list_add(&page->lru, pages_to_free);
}
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
while (!list_empty(&l_hold)) {
cond_resched();
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
/*
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
mem_cgroup_uncharge_list(&l_hold);
free_hot_cold_page_list(&l_hold, true);
file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
fraction[0] = ap;
fraction[1] = fp;
pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
}
lruvec = mem_cgroup_page_lruvec(page, zone);
if (zone) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
}
#endif /* CONFIG_SHMEM */