inode: move to per-sb LRU locks
[GitHub/moto-9609/android_kernel_motorola_exynos9610.git] / fs / inode.c
index 0f7e88a7803f39e52b778eb30652654d7d61d2d1..0450e25aeda0887e78afbde55f0da9958c800ca1 100644 (file)
@@ -33,8 +33,8 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
- * inode_lru_lock protects:
- *   inode_lru, inode->i_lru
+ * inode->i_sb->s_inode_lru_lock protects:
+ *   inode->i_sb->s_inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
  * inode_wb_list_lock protects:
@@ -46,7 +46,7 @@
  *
  * inode_sb_list_lock
  *   inode->i_lock
- *     inode_lru_lock
+ *     inode->i_sb->s_inode_lru_lock
  *
  * inode_wb_list_lock
  *   inode->i_lock
@@ -64,9 +64,6 @@ static unsigned int i_hash_shift __read_mostly;
 static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
-static LIST_HEAD(inode_lru);
-static DEFINE_SPINLOCK(inode_lru_lock);
-
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
 
@@ -95,6 +92,7 @@ EXPORT_SYMBOL(empty_aops);
 struct inodes_stat_t inodes_stat;
 
 static DEFINE_PER_CPU(unsigned int, nr_inodes);
+static DEFINE_PER_CPU(unsigned int, nr_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
@@ -109,7 +107,11 @@ static int get_nr_inodes(void)
 
 static inline int get_nr_inodes_unused(void)
 {
-       return inodes_stat.nr_unused;
+       int i;
+       int sum = 0;    /* total of nr_unused over all possible CPUs */
+       for_each_possible_cpu(i)
+               sum += per_cpu(nr_unused, i);
+       return sum < 0 ? 0 : sum;       /* never report a negative total */
 }
 
 int get_nr_dirty_inodes(void)
@@ -127,6 +129,7 @@ int proc_nr_inodes(ctl_table *table, int write,
                   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        inodes_stat.nr_inodes = get_nr_inodes();
+       inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -337,22 +340,24 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
-       spin_lock(&inode_lru_lock);
+       spin_lock(&inode->i_sb->s_inode_lru_lock);
        if (list_empty(&inode->i_lru)) {
-               list_add(&inode->i_lru, &inode_lru);
-               inodes_stat.nr_unused++;
+               list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
+               inode->i_sb->s_nr_inodes_unused++;      /* per-sb count */
+               this_cpu_inc(nr_unused);        /* global per-cpu count */
        }
-       spin_unlock(&inode_lru_lock);
+       spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
-       spin_lock(&inode_lru_lock);
+       spin_lock(&inode->i_sb->s_inode_lru_lock);
        if (!list_empty(&inode->i_lru)) {
                list_del_init(&inode->i_lru);
-               inodes_stat.nr_unused--;
+               inode->i_sb->s_nr_inodes_unused--;      /* per-sb count */
+               this_cpu_dec(nr_unused);        /* global per-cpu count */
        }
-       spin_unlock(&inode_lru_lock);
+       spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /**
@@ -423,7 +428,14 @@ EXPORT_SYMBOL(remove_inode_hash);
 void end_writeback(struct inode *inode)
 {
        might_sleep();
+       /*
+        * We have to cycle tree_lock here because reclaim can still be in the
+        * process of removing the last page (in __delete_from_page_cache())
+        * and we must not free the mapping under it.
+        */
+       spin_lock_irq(&inode->i_data.tree_lock);
        BUG_ON(inode->i_data.nrpages);
+       spin_unlock_irq(&inode->i_data.tree_lock);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
@@ -601,7 +613,8 @@ static int can_unuse(struct inode *inode)
 
 /*
  * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside inode_lru_lock by dispose_list().
+ * temporary list and then are freed outside sb->s_inode_lru_lock by
+ * dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed.  If the inode has metadata buffers attached to
@@ -615,29 +628,28 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-static void prune_icache(int nr_to_scan)
+static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
 {
        LIST_HEAD(freeable);
        int nr_scanned;
        unsigned long reap = 0;
 
-       down_read(&iprune_sem);
-       spin_lock(&inode_lru_lock);
-       for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+       spin_lock(&sb->s_inode_lru_lock);
+       for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) {
                struct inode *inode;
 
-               if (list_empty(&inode_lru))
+               if (list_empty(&sb->s_inode_lru))
                        break;
 
-               inode = list_entry(inode_lru.prev, struct inode, i_lru);
+               inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
 
                /*
-                * we are inverting the inode_lru_lock/inode->i_lock here,
+                * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
                 * so use a trylock. If we fail to get the lock, just move the
                 * inode to the back of the list so we don't spin on it.
                 */
                if (!spin_trylock(&inode->i_lock)) {
-                       list_move(&inode->i_lru, &inode_lru);
+                       list_move(&inode->i_lru, &sb->s_inode_lru);
                        continue;
                }
 
@@ -649,28 +661,29 @@ static void prune_icache(int nr_to_scan)
                    (inode->i_state & ~I_REFERENCED)) {
                        list_del_init(&inode->i_lru);
                        spin_unlock(&inode->i_lock);
-                       inodes_stat.nr_unused--;
+                       sb->s_nr_inodes_unused--;
+                       this_cpu_dec(nr_unused);
                        continue;
                }
 
                /* recently referenced inodes get one more pass */
                if (inode->i_state & I_REFERENCED) {
                        inode->i_state &= ~I_REFERENCED;
-                       list_move(&inode->i_lru, &inode_lru);
+                       list_move(&inode->i_lru, &sb->s_inode_lru);
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                        __iget(inode);
                        spin_unlock(&inode->i_lock);
-                       spin_unlock(&inode_lru_lock);
+                       spin_unlock(&sb->s_inode_lru_lock);
                        if (remove_inode_buffers(inode))
                                reap += invalidate_mapping_pages(&inode->i_data,
                                                                0, -1);
                        iput(inode);
-                       spin_lock(&inode_lru_lock);
+                       spin_lock(&sb->s_inode_lru_lock);
 
-                       if (inode != list_entry(inode_lru.next,
+                       if (inode != list_entry(sb->s_inode_lru.next,
                                                struct inode, i_lru))
                                continue;       /* wrong inode or list_empty */
                        /* avoid lock inversions with trylock */
@@ -686,15 +699,83 @@ static void prune_icache(int nr_to_scan)
                spin_unlock(&inode->i_lock);
 
                list_move(&inode->i_lru, &freeable);
-               inodes_stat.nr_unused--;
+               sb->s_nr_inodes_unused--;
+               this_cpu_dec(nr_unused);
        }
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
        else
                __count_vm_events(PGINODESTEAL, reap);
-       spin_unlock(&inode_lru_lock);
+       spin_unlock(&sb->s_inode_lru_lock);
+       *nr_to_scan = nr_scanned;
 
        dispose_list(&freeable);
+}
+
+/*
+ * Scan about @count unused inodes, spread across all superblocks in
+ * proportion to each sb's share of the global unused-inode count.
+ * The global count is summed from the per-cpu counters, since
+ * inodes_stat.nr_unused is only refreshed by proc_nr_inodes().
+ */
+static void prune_icache(int count)
+{
+       struct super_block *sb, *p = NULL;
+       int w_count;
+       int unused = get_nr_inodes_unused();
+       int prune_ratio;
+       int pruned;
+
+       if (unused == 0 || count == 0)
+               return;
+       down_read(&iprune_sem);
+       if (count >= unused)
+               prune_ratio = 1;
+       else
+               prune_ratio = unused / count;
+       spin_lock(&sb_lock);
+       list_for_each_entry(sb, &super_blocks, s_list) {
+               if (list_empty(&sb->s_instances))
+                       continue;
+               if (sb->s_nr_inodes_unused == 0)
+                       continue;
+               sb->s_count++;
+               /*
+                * Reclaim the same fraction of unused inodes from each sb:
+                *   scan = count * (sb unused / total unused)
+                * computed as sb unused / prune_ratio to avoid overflow.
+                */
+               spin_unlock(&sb_lock);
+               if (prune_ratio != 1)
+                       w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
+               else
+                       w_count = sb->s_nr_inodes_unused;
+               pruned = w_count;
+               /*
+                * Take s_umount and check s_root so we do not race with
+                * generic_shutdown_super() and hold an inode reference
+                * while the filesystem is being unmounted.
+                */
+               if (down_read_trylock(&sb->s_umount)) {
+                       if ((sb->s_root != NULL) &&
+                           (!list_empty(&sb->s_inode_lru))) {
+                               shrink_icache_sb(sb, &w_count);
+                               pruned -= w_count;
+                       }
+                       up_read(&sb->s_umount);
+               }
+               spin_lock(&sb_lock);
+               if (p)
+                       __put_super(p);
+               count -= pruned;
+               p = sb;
+               /* more work left to do? */
+               if (count <= 0)
+                       break;
+       }
+       if (p)
+               __put_super(p);
+       spin_unlock(&sb_lock);
        up_read(&iprune_sem);
 }
 
@@ -1324,7 +1405,7 @@ static void iput_final(struct inode *inode)
 
        WARN_ON(inode->i_state & I_NEW);
 
-       if (op && op->drop_inode)
+       if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);