unsigned long addr = rmap_item->address;
struct vm_area_struct *vma;
+ /*
+ * It is not an accident that whenever we want to break COW
+ * to undo, we also need to drop a reference to the anon_vma.
+ */
+ drop_anon_vma(rmap_item);
+
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
goto out;
return page;
}
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+ struct rmap_item *rmap_item;
+ struct hlist_node *hlist;
+ /*
+ * Tear down a stable node: walk every rmap_item hanging off
+ * stable_node->hlist, undo its share of the ksm_pages_sharing /
+ * ksm_pages_shared accounting, drop its anon_vma reference and
+ * clear the low bits of its address; then erase the node from
+ * root_stable_tree and free it.
+ *
+ * NOTE(review): the rmap_items are not unlinked with hlist_del()
+ * here; presumably clearing the low address bits is what stops
+ * later code from following their link to the freed node - confirm.
+ */
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ if (rmap_item->hlist.next)
+ ksm_pages_sharing--; /* not the last entry on the list */
+ else
+ ksm_pages_shared--; /* last entry: hit once per non-empty list */
+ drop_anon_vma(rmap_item);
+ rmap_item->address &= PAGE_MASK; /* keep only the page-aligned part */
+ cond_resched(); /* give the scheduler a chance between entries */
+ }
+
+ rb_erase(&stable_node->node, &root_stable_tree);
+ free_stable_node(stable_node); /* stable_node must not be used after this */
+}
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by ksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_ksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ *
+ * Contract for callers:
+ * - On success the page is returned with the reference taken here by
+ *   get_page_unless_zero(); the caller is responsible for put_page().
+ * - On NULL the stable_node has already been passed to
+ *   remove_node_from_stable_tree(), which erases and frees it: the
+ *   caller must not touch stable_node again.
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node)
+{
+ struct page *page;
+ void *expected_mapping;
+
+ page = stable_node->page;
+ expected_mapping = (void *)stable_node +
+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); /* the "key": node pointer plus flag bits */
+ rcu_read_lock();
+ if (page->mapping != expected_mapping) /* first peek, no reference held yet */
+ goto stale;
+ if (!get_page_unless_zero(page)) /* refuse a page whose count is already zero */
+ goto stale;
+ if (page->mapping != expected_mapping) { /* recheck now that we hold a reference */
+ put_page(page);
+ goto stale;
+ }
+ rcu_read_unlock();
+ return page;
+stale:
+ rcu_read_unlock();
+ remove_node_from_stable_tree(stable_node); /* frees stable_node */
+ return NULL;
+}
+
/*
* Removing rmap_item from stable or unstable tree.
* This function will clean the information from the stable/unstable tree.
struct page *page;
stable_node = rmap_item->head;
- page = stable_node->page;
- lock_page(page);
+ page = get_ksm_page(stable_node);
+ if (!page)
+ goto out;
+ lock_page(page);
hlist_del(&rmap_item->hlist);
- if (stable_node->hlist.first) {
- unlock_page(page);
- ksm_pages_sharing--;
- } else {
- set_page_stable_node(page, NULL);
- unlock_page(page);
- put_page(page);
+ unlock_page(page);
+ put_page(page);
- rb_erase(&stable_node->node, &root_stable_tree);
- free_stable_node(stable_node);
+ if (stable_node->hlist.first)
+ ksm_pages_sharing--;
+ else
ksm_pages_shared--;
- }
drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
ksm_pages_unshared--;
rmap_item->address &= PAGE_MASK;
}
-
+out:
cond_resched(); /* we're called from many long loops */
}
* If that fails, we have a ksm page with only one pte
* pointing to it: so break it.
*/
- if (err) {
- drop_anon_vma(rmap_item);
+ if (err)
break_cow(rmap_item);
- }
}
if (err) {
put_page(kpage);
}
while (node) {
+ struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(node, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node);
+ if (!tree_page)
+ return NULL;
- ret = memcmp_pages(page, stable_node->page);
+ ret = memcmp_pages(page, tree_page);
- if (ret < 0)
+ if (ret < 0) {
+ put_page(tree_page);
node = node->rb_left;
- else if (ret > 0)
+ } else if (ret > 0) {
+ put_page(tree_page);
node = node->rb_right;
- else {
- get_page(stable_node->page);
+ } else
return stable_node;
- }
}
return NULL;
struct stable_node *stable_node;
while (*new) {
+ struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node);
+ if (!tree_page)
+ return NULL;
- ret = memcmp_pages(kpage, stable_node->page);
+ ret = memcmp_pages(kpage, tree_page);
+ put_page(tree_page);
parent = *new;
if (ret < 0)
INIT_HLIST_HEAD(&stable_node->hlist);
- get_page(kpage);
stable_node->page = kpage;
set_page_stable_node(kpage, stable_node);
}
/*
- * A ksm page might have got here by fork, but its other
- * references have already been removed from the stable tree.
- * Or it might be left over from a break_ksm which failed
- * when the mem_cgroup had reached its limit: try again now.
- */
- if (PageKsm(page))
- break_cow(rmap_item);
-
- /*
- * In case the hash value of the page was changed from the last time we
- * have calculated it, this page to be changed frequely, therefore we
- * don't want to insert it to the unstable tree, and we don't want to
- * waste our time to search if there is something identical to it there.
+ * If the hash value of the page has changed from the last time
+ * we calculated it, this page is changing frequently: therefore we
+ * don't want to insert it in the unstable tree, and we don't want
+ * to waste our time searching for something identical to it there.
*/
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
* in which case we need to break_cow on both.
*/
if (!stable_node) {
- drop_anon_vma(tree_rmap_item);
break_cow(tree_rmap_item);
- drop_anon_vma(rmap_item);
break_cow(rmap_item);
}
}