netfilter: conntrack: use a single hashtable for all namespaces
author Florian Westphal <fw@strlen.de>
Mon, 2 May 2016 16:39:55 +0000 (18:39 +0200)
committer Pablo Neira Ayuso <pablo@netfilter.org>
Thu, 5 May 2016 14:39:47 +0000 (16:39 +0200)
We already include the netns address in the hash and compare the netns
pointers during lookup, so even if namespaces have overlapping addresses,
entries will be spread across the table.

Assuming a table of 64k buckets, this change saves 0.5 MByte per namespace
on a 64-bit system.

The NAT bysrc and expectation hashes are still per namespace; those will
be changed soon, too.

A future patch will also make the conntrack object slab cache global again.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
include/net/netfilter/nf_conntrack_core.h
include/net/netns/conntrack.h
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_helper.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_standalone.c
net/netfilter/nf_nat_core.c
net/netfilter/nfnetlink_cttimeout.c

index 62e17d1319ff7423dbcf176815f04cecfcdf3371..3e2f3328945cca94b94411187cec58ee3cb4e9ae 100644 (file)
@@ -81,6 +81,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 
 #define CONNTRACK_LOCKS 1024
 
+extern struct hlist_nulls_head *nf_conntrack_hash;
 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
 void nf_conntrack_lock(spinlock_t *lock);
 
index b052785b1590731740e7e6f8cf8235ce8b910ed3..251c435ee330831b3f65610e0d3072c672fef223 100644 (file)
@@ -93,9 +93,7 @@ struct netns_ct {
        int                     sysctl_tstamp;
        int                     sysctl_checksum;
 
-       unsigned int            htable_size;
        struct kmem_cache       *nf_conntrack_cachep;
-       struct hlist_nulls_head *hash;
        struct hlist_head       *expect_hash;
        struct ct_pcpu __percpu *pcpu_lists;
        struct ip_conntrack_stat __percpu *stat;
index e3c46e8e276267b74b0482e85a208a70fa7f3d85..ae1a71a97132bd5a5a49996e95c81e7c08b8c60e 100644 (file)
@@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net)
 
        in->ctl_table[0].data = &nf_conntrack_max;
        in->ctl_table[1].data = &net->ct.count;
-       in->ctl_table[2].data = &net->ct.htable_size;
+       in->ctl_table[2].data = &nf_conntrack_htable_size;
        in->ctl_table[3].data = &net->ct.sysctl_checksum;
        in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
 #endif
index 171aba15c95218e8a603c51a8d6d042d158d1d91..f8fc7ab201c9ef3b85193be19fae129ae0a0163d 100644 (file)
@@ -31,15 +31,14 @@ struct ct_iter_state {
 
 static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 {
-       struct net *net = seq_file_net(seq);
        struct ct_iter_state *st = seq->private;
        struct hlist_nulls_node *n;
 
        for (st->bucket = 0;
-            st->bucket < net->ct.htable_size;
+            st->bucket < nf_conntrack_htable_size;
             st->bucket++) {
                n = rcu_dereference(
-                       hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+                       hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
                if (!is_a_nulls(n))
                        return n;
        }
@@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
                                      struct hlist_nulls_node *head)
 {
-       struct net *net = seq_file_net(seq);
        struct ct_iter_state *st = seq->private;
 
        head = rcu_dereference(hlist_nulls_next_rcu(head));
        while (is_a_nulls(head)) {
                if (likely(get_nulls_value(head) == st->bucket)) {
-                       if (++st->bucket >= net->ct.htable_size)
+                       if (++st->bucket >= nf_conntrack_htable_size)
                                return NULL;
                }
                head = rcu_dereference(
-                       hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+                       hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
        }
        return head;
 }
index ebafa7736f0a2065e6f51702ef5c7680ee0e8e93..4c906e73e872187a78a0b82e2c3d59201ab11e4f 100644 (file)
@@ -69,6 +69,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash);
+
 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
 static __read_mostly seqcount_t nf_conntrack_generation;
 static __read_mostly bool nf_conntrack_locks_all;
@@ -164,9 +167,9 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
                      tuple->dst.protonum));
 }
 
-static u32 hash_bucket(u32 hash, const struct net *net)
+static u32 scale_hash(u32 hash)
 {
-       return reciprocal_scale(hash, net->ct.htable_size);
+       return reciprocal_scale(hash, nf_conntrack_htable_size);
 }
 
 static u32 __hash_conntrack(const struct net *net,
@@ -179,7 +182,7 @@ static u32 __hash_conntrack(const struct net *net,
 static u32 hash_conntrack(const struct net *net,
                          const struct nf_conntrack_tuple *tuple)
 {
-       return __hash_conntrack(net, tuple, net->ct.htable_size);
+       return scale_hash(hash_conntrack_raw(tuple, net));
 }
 
 bool
@@ -478,8 +481,8 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 begin:
        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
-               bucket = hash_bucket(hash, net);
-               ct_hash = net->ct.hash;
+               bucket = scale_hash(hash);
+               ct_hash = nf_conntrack_hash;
        } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 
        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
@@ -543,12 +546,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
                                       unsigned int hash,
                                       unsigned int reply_hash)
 {
-       struct net *net = nf_ct_net(ct);
-
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                          &net->ct.hash[hash]);
+                          &nf_conntrack_hash[hash]);
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
-                          &net->ct.hash[reply_hash]);
+                          &nf_conntrack_hash[reply_hash]);
 }
 
 int
@@ -573,12 +574,12 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
        /* See if there's one in the list already, including reverse */
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                    zone, net))
                        goto out;
 
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                    zone, net))
                        goto out;
@@ -633,7 +634,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                /* reuse the hash saved before */
                hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-               hash = hash_bucket(hash, net);
+               hash = scale_hash(hash);
                reply_hash = hash_conntrack(net,
                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
@@ -663,12 +664,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                    zone, net))
                        goto out;
 
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                    zone, net))
                        goto out;
@@ -736,7 +737,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                hash = hash_conntrack(net, tuple);
-               ct_hash = net->ct.hash;
+               ct_hash = nf_conntrack_hash;
        } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 
        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
@@ -773,16 +774,16 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
        local_bh_disable();
 restart:
        sequence = read_seqcount_begin(&nf_conntrack_generation);
-       hash = hash_bucket(_hash, net);
-       for (; i < net->ct.htable_size; i++) {
+       hash = scale_hash(_hash);
+       for (; i < nf_conntrack_htable_size; i++) {
                lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
                nf_conntrack_lock(lockp);
                if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
                        spin_unlock(lockp);
                        goto restart;
                }
-               hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
-                                        hnnode) {
+               hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
+                                              hnnode) {
                        tmp = nf_ct_tuplehash_to_ctrack(h);
                        if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
                            !nf_ct_is_dying(tmp) &&
@@ -793,7 +794,7 @@ restart:
                        cnt++;
                }
 
-               hash = (hash + 1) % net->ct.htable_size;
+               hash = (hash + 1) % nf_conntrack_htable_size;
                spin_unlock(lockp);
 
                if (ct || cnt >= NF_CT_EVICTION_RANGE)
@@ -1376,12 +1377,12 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
        int cpu;
        spinlock_t *lockp;
 
-       for (; *bucket < net->ct.htable_size; (*bucket)++) {
+       for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
                lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
                local_bh_disable();
                nf_conntrack_lock(lockp);
-               if (*bucket < net->ct.htable_size) {
-                       hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+               if (*bucket < nf_conntrack_htable_size) {
+                       hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
                                if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
                                        continue;
                                ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1478,6 +1479,8 @@ void nf_conntrack_cleanup_end(void)
        while (untrack_refs() > 0)
                schedule();
 
+       nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
        nf_ct_extend_unregister(&nf_ct_zone_extend);
 #endif
@@ -1528,7 +1531,6 @@ i_see_dead_people:
        }
 
        list_for_each_entry(net, net_exit_list, exit_list) {
-               nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
                nf_conntrack_proto_pernet_fini(net);
                nf_conntrack_helper_pernet_fini(net);
                nf_conntrack_ecache_pernet_fini(net);
@@ -1599,10 +1601,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
         * though since that required taking the locks.
         */
 
-       for (i = 0; i < init_net.ct.htable_size; i++) {
-               while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
-                       h = hlist_nulls_entry(init_net.ct.hash[i].first,
-                                       struct nf_conntrack_tuple_hash, hnnode);
+       for (i = 0; i < nf_conntrack_htable_size; i++) {
+               while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+                       h = hlist_nulls_entry(nf_conntrack_hash[i].first,
+                                             struct nf_conntrack_tuple_hash, hnnode);
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        hlist_nulls_del_rcu(&h->hnnode);
                        bucket = __hash_conntrack(nf_ct_net(ct),
@@ -1610,11 +1612,11 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
                        hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
                }
        }
-       old_size = init_net.ct.htable_size;
-       old_hash = init_net.ct.hash;
+       old_size = nf_conntrack_htable_size;
+       old_hash = nf_conntrack_hash;
 
-       init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
-       init_net.ct.hash = hash;
+       nf_conntrack_hash = hash;
+       nf_conntrack_htable_size = hashsize;
 
        write_seqcount_end(&nf_conntrack_generation);
        nf_conntrack_all_unlock();
@@ -1670,6 +1672,11 @@ int nf_conntrack_init_start(void)
                 * entries. */
                max_factor = 4;
        }
+
+       nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
+       if (!nf_conntrack_hash)
+               return -ENOMEM;
+
        nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
        printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
@@ -1748,6 +1755,7 @@ err_tstamp:
 err_acct:
        nf_conntrack_expect_fini();
 err_expect:
+       nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
        return ret;
 }
 
@@ -1800,12 +1808,6 @@ int nf_conntrack_init_net(struct net *net)
                goto err_cache;
        }
 
-       net->ct.htable_size = nf_conntrack_htable_size;
-       net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
-       if (!net->ct.hash) {
-               printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
-               goto err_hash;
-       }
        ret = nf_conntrack_expect_pernet_init(net);
        if (ret < 0)
                goto err_expect;
@@ -1837,8 +1839,6 @@ err_tstamp:
 err_acct:
        nf_conntrack_expect_pernet_fini(net);
 err_expect:
-       nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
-err_hash:
        kmem_cache_destroy(net->ct.nf_conntrack_cachep);
 err_cache:
        kfree(net->ct.slabname);
index 498bf74f154d767c170c16869ee10ad106a49781..cb48e6adba2ca5cd9bf4de9b74fefc23fc8d153a 100644 (file)
@@ -424,10 +424,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
                spin_unlock_bh(&pcpu->lock);
        }
        local_bh_disable();
-       for (i = 0; i < net->ct.htable_size; i++) {
+       for (i = 0; i < nf_conntrack_htable_size; i++) {
                nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
-               if (i < net->ct.htable_size) {
-                       hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+               if (i < nf_conntrack_htable_size) {
+                       hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
                                unhelp(h, me);
                }
                spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
index f6bbcb23749ecd23c3055ed04a296697ba9d5986..e00f178c48b046f2828255427df7f3a1f837efee 100644 (file)
@@ -824,16 +824,16 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
        last = (struct nf_conn *)cb->args[1];
 
        local_bh_disable();
-       for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+       for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
 restart:
                lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
                nf_conntrack_lock(lockp);
-               if (cb->args[0] >= net->ct.htable_size) {
+               if (cb->args[0] >= nf_conntrack_htable_size) {
                        spin_unlock(lockp);
                        goto out;
                }
-               hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
-                                        hnnode) {
+               hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
+                                          hnnode) {
                        if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
                                continue;
                        ct = nf_ct_tuplehash_to_ctrack(h);
index 0f1a45bcacb2414a2292022d4180584481f3f698..f87e84ebcec3ac78cc4b941ec8828fab8c623f55 100644 (file)
@@ -54,14 +54,13 @@ struct ct_iter_state {
 
 static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 {
-       struct net *net = seq_file_net(seq);
        struct ct_iter_state *st = seq->private;
        struct hlist_nulls_node *n;
 
        for (st->bucket = 0;
-            st->bucket < net->ct.htable_size;
+            st->bucket < nf_conntrack_htable_size;
             st->bucket++) {
-               n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+               n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
                if (!is_a_nulls(n))
                        return n;
        }
@@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
                                      struct hlist_nulls_node *head)
 {
-       struct net *net = seq_file_net(seq);
        struct ct_iter_state *st = seq->private;
 
        head = rcu_dereference(hlist_nulls_next_rcu(head));
        while (is_a_nulls(head)) {
                if (likely(get_nulls_value(head) == st->bucket)) {
-                       if (++st->bucket >= net->ct.htable_size)
+                       if (++st->bucket >= nf_conntrack_htable_size)
                                return NULL;
                }
                head = rcu_dereference(
                                hlist_nulls_first_rcu(
-                                       &net->ct.hash[st->bucket]));
+                                       &nf_conntrack_hash[st->bucket]));
        }
        return head;
 }
@@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
        },
        {
                .procname       = "nf_conntrack_buckets",
-               .data           = &init_net.ct.htable_size,
+               .data           = &nf_conntrack_htable_size,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0444,
                .proc_handler   = proc_dointvec,
@@ -512,7 +510,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
                goto out_kmemdup;
 
        table[1].data = &net->ct.count;
-       table[2].data = &net->ct.htable_size;
        table[3].data = &net->ct.sysctl_checksum;
        table[4].data = &net->ct.sysctl_log_invalid;
 
index 3d522715a1675a7885cc16f7fe813c580f3a4db0..d74e7167499d868f6d87e91d1d048fdcebe36bb2 100644 (file)
@@ -824,7 +824,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
 static int __net_init nf_nat_net_init(struct net *net)
 {
        /* Leave them the same for the moment. */
-       net->ct.nat_htable_size = net->ct.htable_size;
+       net->ct.nat_htable_size = nf_conntrack_htable_size;
        net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
        if (!net->ct.nat_bysource)
                return -ENOMEM;
index 2671b9deb103735ff203999286a8cfbdde434f12..3c84f14326f56da253fc1dd55f58af1ebaba79e8 100644 (file)
@@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
        int i;
 
        local_bh_disable();
-       for (i = 0; i < net->ct.htable_size; i++) {
+       for (i = 0; i < nf_conntrack_htable_size; i++) {
                nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
-               if (i < net->ct.htable_size) {
-                       hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+               if (i < nf_conntrack_htable_size) {
+                       hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
                                untimeout(h, timeout);
                }
                spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);