netfilter: conntrack: spinlock per cpu to protect special lists.
author     Jesper Dangaard Brouer <brouer@redhat.com>
           Mon, 3 Mar 2014 13:45:20 +0000 (14:45 +0100)
committer  Pablo Neira Ayuso <pablo@netfilter.org>
           Fri, 7 Mar 2014 10:40:38 +0000 (11:40 +0100)
Use one spinlock per cpu to protect the dying/unconfirmed/template special
lists (these lists are now per cpu, a bit like the untracked ct).
Add a @cpu field to struct nf_conn, to make sure we hold the appropriate
spinlock at removal time.
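
The locking pattern, in short (a condensed sketch of the helpers this
patch adds below, not a verbatim copy): insertion records the current
cpu in the conntrack and takes that cpu's lock; removal re-derives the
lock from ct->cpu, since it may run on a different cpu:

	/* insertion side: caller has done local_bh_disable() */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->dying);
	spin_unlock(&pcpu->lock);

	/* removal side: ct->cpu names the lock, whatever cpu we run on */
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
	spin_lock(&pcpu->lock);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&pcpu->lock);

Walkers that must visit every entry (get_next_corpse, helper
unregister, the ctnetlink dying/unconfirmed dumps) now iterate
for_each_possible_cpu() and take each pcpu->lock in turn.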

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
include/net/netfilter/nf_conntrack.h
include/net/netns/conntrack.h
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_helper.c
net/netfilter/nf_conntrack_netlink.c

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e10d1faa6d096fc580cacf7902085b09beb18598..37252f71a38037d0e969699dc7376e6dcae1c5ec 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -82,7 +82,8 @@ struct nf_conn {
         */
        struct nf_conntrack ct_general;
 
-       spinlock_t lock;
+       spinlock_t      lock;
+       u16             cpu;
 
        /* XXX should I move this to the tail ? - Y.K */
        /* These are my tuples; original and reply */
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index fbcc7fa536dc4ab49440d4566e63fedf8b860d2a..c6a8994e992225a9d6b834c38132259447fc8b6a 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -62,6 +62,13 @@ struct nf_ip_net {
 #endif
 };
 
+struct ct_pcpu {
+       spinlock_t              lock;
+       struct hlist_nulls_head unconfirmed;
+       struct hlist_nulls_head dying;
+       struct hlist_nulls_head tmpl;
+};
+
 struct netns_ct {
        atomic_t                count;
        unsigned int            expect_count;
@@ -86,9 +93,7 @@ struct netns_ct {
        struct kmem_cache       *nf_conntrack_cachep;
        struct hlist_nulls_head *hash;
        struct hlist_head       *expect_hash;
-       struct hlist_nulls_head unconfirmed;
-       struct hlist_nulls_head dying;
-       struct hlist_nulls_head tmpl;
+       struct ct_pcpu __percpu *pcpu_lists;
        struct ip_conntrack_stat __percpu *stat;
        struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
        struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 965693eb1f0e665f97ce2dfd0af0777a5b3ab6eb..289b27901d8c906f652948eccdeda5fb962242af 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -192,6 +192,50 @@ clean_from_lists(struct nf_conn *ct)
        nf_ct_remove_expectations(ct);
 }
 
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_dying_list(struct nf_conn *ct)
+{
+       struct ct_pcpu *pcpu;
+
+       /* add this conntrack to the (per cpu) dying list */
+       ct->cpu = smp_processor_id();
+       pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+       spin_lock(&pcpu->lock);
+       hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+                            &pcpu->dying);
+       spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
+{
+       struct ct_pcpu *pcpu;
+
+       /* add this conntrack to the (per cpu) unconfirmed list */
+       ct->cpu = smp_processor_id();
+       pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+       spin_lock(&pcpu->lock);
+       hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+                            &pcpu->unconfirmed);
+       spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
+{
+       struct ct_pcpu *pcpu;
+
+       /* We overload first tuple to link into unconfirmed or dying list.*/
+       pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+       spin_lock(&pcpu->lock);
+       BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+       hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+       spin_unlock(&pcpu->lock);
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
@@ -220,9 +264,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
         * too. */
        nf_ct_remove_expectations(ct);
 
-       /* We overload first tuple to link into unconfirmed or dying list.*/
-       BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
-       hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+       nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
        NF_CT_STAT_INC(net, delete);
        spin_unlock_bh(&nf_conntrack_lock);
@@ -244,9 +286,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
         * Otherwise we can get spurious warnings. */
        NF_CT_STAT_INC(net, delete_list);
        clean_from_lists(ct);
-       /* add this conntrack to the dying list */
-       hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                            &net->ct.dying);
+       nf_ct_add_to_dying_list(ct);
        spin_unlock_bh(&nf_conntrack_lock);
 }
 
@@ -467,15 +507,22 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 /* deletion from this larval template list happens via nf_ct_put() */
 void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
 {
+       struct ct_pcpu *pcpu;
+
        __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
        __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
        nf_conntrack_get(&tmpl->ct_general);
 
-       spin_lock_bh(&nf_conntrack_lock);
+       /* add this conntrack to the (per cpu) tmpl list */
+       local_bh_disable();
+       tmpl->cpu = smp_processor_id();
+       pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
+
+       spin_lock(&pcpu->lock);
        /* Overload tuple linked list to put us in template list. */
        hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                                &net->ct.tmpl);
-       spin_unlock_bh(&nf_conntrack_lock);
+                                &pcpu->tmpl);
+       spin_unlock_bh(&pcpu->lock);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
 
@@ -546,8 +593,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
                    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
                        goto out;
 
-       /* Remove from unconfirmed list */
-       hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+       nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
@@ -879,10 +925,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 
        /* Now it is inserted into the unconfirmed list, bump refcount */
        nf_conntrack_get(&ct->ct_general);
-
-       /* Overload tuple linked list to put us in unconfirmed list. */
-       hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                      &net->ct.unconfirmed);
+       nf_ct_add_to_unconfirmed_list(ct);
 
        spin_unlock_bh(&nf_conntrack_lock);
 
@@ -1254,6 +1297,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;
+       int cpu;
 
        spin_lock_bh(&nf_conntrack_lock);
        for (; *bucket < net->ct.htable_size; (*bucket)++) {
@@ -1265,12 +1309,19 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
                                goto found;
                }
        }
-       hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
-               ct = nf_ct_tuplehash_to_ctrack(h);
-               if (iter(ct, data))
-                       set_bit(IPS_DYING_BIT, &ct->status);
-       }
        spin_unlock_bh(&nf_conntrack_lock);
+
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_bh(&pcpu->lock);
+               hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
+                       ct = nf_ct_tuplehash_to_ctrack(h);
+                       if (iter(ct, data))
+                               set_bit(IPS_DYING_BIT, &ct->status);
+               }
+               spin_unlock_bh(&pcpu->lock);
+       }
        return NULL;
 found:
        atomic_inc(&ct->ct_general.use);
@@ -1323,14 +1374,19 @@ static void nf_ct_release_dying_list(struct net *net)
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;
+       int cpu;
 
-       spin_lock_bh(&nf_conntrack_lock);
-       hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
-               ct = nf_ct_tuplehash_to_ctrack(h);
-               /* never fails to remove them, no listeners at this point */
-               nf_ct_kill(ct);
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_bh(&pcpu->lock);
+               hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+                       ct = nf_ct_tuplehash_to_ctrack(h);
+                       /* never fails to remove them, no listeners at this point */
+                       nf_ct_kill(ct);
+               }
+               spin_unlock_bh(&pcpu->lock);
        }
-       spin_unlock_bh(&nf_conntrack_lock);
 }
 
 static int untrack_refs(void)
@@ -1417,6 +1473,7 @@ i_see_dead_people:
                kmem_cache_destroy(net->ct.nf_conntrack_cachep);
                kfree(net->ct.slabname);
                free_percpu(net->ct.stat);
+               free_percpu(net->ct.pcpu_lists);
        }
 }
 
@@ -1629,37 +1686,43 @@ void nf_conntrack_init_end(void)
 
 int nf_conntrack_init_net(struct net *net)
 {
-       int ret;
+       int ret = -ENOMEM;
+       int cpu;
 
        atomic_set(&net->ct.count, 0);
-       INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
-       INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
-       INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL);
-       net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
-       if (!net->ct.stat) {
-               ret = -ENOMEM;
+
+       net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
+       if (!net->ct.pcpu_lists)
                goto err_stat;
+
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_init(&pcpu->lock);
+               INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
+               INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
+               INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
        }
 
+       net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+       if (!net->ct.stat)
+               goto err_pcpu_lists;
+
        net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
-       if (!net->ct.slabname) {
-               ret = -ENOMEM;
+       if (!net->ct.slabname)
                goto err_slabname;
-       }
 
        net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
                                                        sizeof(struct nf_conn), 0,
                                                        SLAB_DESTROY_BY_RCU, NULL);
        if (!net->ct.nf_conntrack_cachep) {
                printk(KERN_ERR "Unable to create nf_conn slab cache\n");
-               ret = -ENOMEM;
                goto err_cache;
        }
 
        net->ct.htable_size = nf_conntrack_htable_size;
        net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
        if (!net->ct.hash) {
-               ret = -ENOMEM;
                printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
                goto err_hash;
        }
@@ -1701,6 +1764,8 @@ err_cache:
        kfree(net->ct.slabname);
 err_slabname:
        free_percpu(net->ct.stat);
+err_pcpu_lists:
+       free_percpu(net->ct.pcpu_lists);
 err_stat:
        return ret;
 }
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 974a2a4adefa739c729692e0df3ad3142f83d0d4..27d9302c21915812022cdc5132b21f5ca2b181a4 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -396,6 +396,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
        const struct hlist_node *next;
        const struct hlist_nulls_node *nn;
        unsigned int i;
+       int cpu;
 
        /* Get rid of expectations */
        for (i = 0; i < nf_ct_expect_hsize; i++) {
@@ -414,8 +415,14 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
        }
 
        /* Get rid of expecteds, set helpers to NULL. */
-       hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
-               unhelp(h, me);
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_bh(&pcpu->lock);
+               hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
+                       unhelp(h, me);
+               spin_unlock_bh(&pcpu->lock);
+       }
        for (i = 0; i < net->ct.htable_size; i++) {
                hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
                        unhelp(h, me);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 47e9369997ef657d299885332dd9e2fa7d0c4f38..4ac8ce68bc16e56034d0d1a93a6af2892cf7188f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1137,50 +1137,65 @@ static int ctnetlink_done_list(struct netlink_callback *cb)
 }
 
 static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
-                   struct hlist_nulls_head *list)
+ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
 {
-       struct nf_conn *ct, *last;
+       struct nf_conn *ct, *last = NULL;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        u_int8_t l3proto = nfmsg->nfgen_family;
        int res;
+       int cpu;
+       struct hlist_nulls_head *list;
+       struct net *net = sock_net(skb->sk);
 
        if (cb->args[2])
                return 0;
 
-       spin_lock_bh(&nf_conntrack_lock);
-       last = (struct nf_conn *)cb->args[1];
-restart:
-       hlist_nulls_for_each_entry(h, n, list, hnnode) {
-               ct = nf_ct_tuplehash_to_ctrack(h);
-               if (l3proto && nf_ct_l3num(ct) != l3proto)
+       if (cb->args[0] == nr_cpu_ids)
+               return 0;
+
+       for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+               struct ct_pcpu *pcpu;
+
+               if (!cpu_possible(cpu))
                        continue;
-               if (cb->args[1]) {
-                       if (ct != last)
+
+               pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+               spin_lock_bh(&pcpu->lock);
+               last = (struct nf_conn *)cb->args[1];
+               list = dying ? &pcpu->dying : &pcpu->unconfirmed;
+restart:
+               hlist_nulls_for_each_entry(h, n, list, hnnode) {
+                       ct = nf_ct_tuplehash_to_ctrack(h);
+                       if (l3proto && nf_ct_l3num(ct) != l3proto)
                                continue;
-                       cb->args[1] = 0;
-               }
-               rcu_read_lock();
-               res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
-                                         cb->nlh->nlmsg_seq,
-                                         NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
-                                         ct);
-               rcu_read_unlock();
-               if (res < 0) {
-                       nf_conntrack_get(&ct->ct_general);
-                       cb->args[1] = (unsigned long)ct;
-                       goto out;
+                       if (cb->args[1]) {
+                               if (ct != last)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+                       rcu_read_lock();
+                       res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+                                                 cb->nlh->nlmsg_seq,
+                                                 NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                                 ct);
+                       rcu_read_unlock();
+                       if (res < 0) {
+                               nf_conntrack_get(&ct->ct_general);
+                               cb->args[1] = (unsigned long)ct;
+                               spin_unlock_bh(&pcpu->lock);
+                               goto out;
+                       }
                }
+               if (cb->args[1]) {
+                       cb->args[1] = 0;
+                       goto restart;
+               } else
+                       cb->args[2] = 1;
+               spin_unlock_bh(&pcpu->lock);
        }
-       if (cb->args[1]) {
-               cb->args[1] = 0;
-               goto restart;
-       } else
-               cb->args[2] = 1;
 out:
-       spin_unlock_bh(&nf_conntrack_lock);
        if (last)
                nf_ct_put(last);
 
@@ -1190,9 +1205,7 @@ out:
 static int
 ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 {
-       struct net *net = sock_net(skb->sk);
-
-       return ctnetlink_dump_list(skb, cb, &net->ct.dying);
+       return ctnetlink_dump_list(skb, cb, true);
 }
 
 static int
@@ -1214,9 +1227,7 @@ ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
 static int
 ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
 {
-       struct net *net = sock_net(skb->sk);
-
-       return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed);
+       return ctnetlink_dump_list(skb, cb, false);
 }
 
 static int