net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
author Eric Dumazet <dada1@cosmosbay.com>
Mon, 17 Nov 2008 03:40:17 +0000 (19:40 -0800)
committer David S. Miller <davem@davemloft.net>
Mon, 17 Nov 2008 03:40:17 +0000 (19:40 -0800)
RCU was added to UDP lookups, using a fast infrastructure:
- the sockets kmem_cache uses SLAB_DESTROY_BY_RCU and doesn't pay the
  price of call_rcu() at freeing time.
- hlist_nulls makes it possible to use only a few memory barriers.

This patch uses the same infrastructure for TCP/DCCP established
and timewait sockets.

Thanks to SLAB_DESTROY_BY_RCU, there is no slowdown for applications
using short-lived TCP connections. A follow-up patch converting
rwlocks to spinlocks will even speed up this case.

__inet_lookup_established() is pretty fast now that we don't have to
dirty a contended cache line (read_lock/read_unlock).

Only the established and timewait hash tables are converted to RCU
(the bind table and listen table still use traditional locking).

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
13 files changed:
include/net/inet_hashtables.h
include/net/inet_timewait_sock.h
net/core/sock.c
net/dccp/ipv4.c
net/dccp/ipv6.c
net/dccp/proto.c
net/ipv4/inet_diag.c
net/ipv4/inet_hashtables.c
net/ipv4/inet_timewait_sock.c
net/ipv4/tcp.c
net/ipv4/tcp_ipv4.c
net/ipv6/inet6_hashtables.c
net/ipv6/tcp_ipv6.c

index cb31fbf8ae2acdd6d091bca2f4d3b620dd5e18db..4818960451112cf34f20721e16e749baae7e5923 100644 (file)
@@ -41,8 +41,8 @@
  * I'll experiment with dynamic table growth later.
  */
 struct inet_ehash_bucket {
-       struct hlist_head chain;
-       struct hlist_head twchain;
+       struct hlist_nulls_head chain;
+       struct hlist_nulls_head twchain;
 };
 
 /* There are a few simple rules, which allow for local port reuse by
index 80e4977631b8b18ba67e44024895fee03adbe7f9..4b8ece22b8e94417879a6504e87afe5bd0ecc371 100644 (file)
@@ -110,7 +110,7 @@ struct inet_timewait_sock {
 #define tw_state               __tw_common.skc_state
 #define tw_reuse               __tw_common.skc_reuse
 #define tw_bound_dev_if                __tw_common.skc_bound_dev_if
-#define tw_node                        __tw_common.skc_node
+#define tw_node                        __tw_common.skc_nulls_node
 #define tw_bind_node           __tw_common.skc_bind_node
 #define tw_refcnt              __tw_common.skc_refcnt
 #define tw_hash                        __tw_common.skc_hash
@@ -137,10 +137,10 @@ struct inet_timewait_sock {
        struct hlist_node       tw_death_node;
 };
 
-static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
-                                     struct hlist_head *list)
+static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+                                     struct hlist_nulls_head *list)
 {
-       hlist_add_head(&tw->tw_node, list);
+       hlist_nulls_add_head_rcu(&tw->tw_node, list);
 }
 
 static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
@@ -175,7 +175,7 @@ static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
 }
 
 #define inet_twsk_for_each(tw, node, head) \
-       hlist_for_each_entry(tw, node, head, tw_node)
+       hlist_nulls_for_each_entry(tw, node, head, tw_node)
 
 #define inet_twsk_for_each_inmate(tw, node, jail) \
        hlist_for_each_entry(tw, node, jail, tw_death_node)
index ded1eb5d2fd44664eace3f26fa745f3c2b2584a2..38de9c3f563bbb3ef80d30a94833729c3192194d 100644 (file)
@@ -2082,7 +2082,9 @@ int proto_register(struct proto *prot, int alloc_slab)
                        prot->twsk_prot->twsk_slab =
                                kmem_cache_create(timewait_sock_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
-                                                 0, SLAB_HWCACHE_ALIGN,
+                                                 0,
+                                                 SLAB_HWCACHE_ALIGN |
+                                                       prot->slab_flags,
                                                  NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab_name;
index 528baa2e5be4caf3290a44e89cc1c122c7d05ba8..d1dd95289b8960bd1556e3cecbd025b4bb83ca8e 100644 (file)
@@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = {
        .orphan_count           = &dccp_orphan_count,
        .max_header             = MAX_DCCP_HEADER,
        .obj_size               = sizeof(struct dccp_sock),
+       .slab_flags             = SLAB_DESTROY_BY_RCU,
        .rsk_prot               = &dccp_request_sock_ops,
        .twsk_prot              = &dccp_timewait_sock_ops,
        .h.hashinfo             = &dccp_hashinfo,
index 4aa1148cdb200fd923384826bb24aa54626b7b8e..f033e845bb076aa2c911e7ccac4bcb8dc1f46856 100644 (file)
@@ -1140,6 +1140,7 @@ static struct proto dccp_v6_prot = {
        .orphan_count      = &dccp_orphan_count,
        .max_header        = MAX_DCCP_HEADER,
        .obj_size          = sizeof(struct dccp6_sock),
+       .slab_flags        = SLAB_DESTROY_BY_RCU,
        .rsk_prot          = &dccp6_request_sock_ops,
        .twsk_prot         = &dccp6_timewait_sock_ops,
        .h.hashinfo        = &dccp_hashinfo,
index 46cb3490d48e1710c175007981e94f5f7ae73f7d..1117d4d8c8f12034c48b938e52f3a4d6669d9642 100644 (file)
@@ -1090,8 +1090,8 @@ static int __init dccp_init(void)
        }
 
        for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
-               INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
-               INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
+               INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
+               INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
        }
 
        if (inet_ehash_locks_alloc(&dccp_hashinfo))
index 564230dabcb8a7ff172bed468a4257dd0a11616f..41b36720e977a74ea554dda2117c26932fce1a82 100644 (file)
@@ -778,18 +778,19 @@ skip_listen_ht:
                struct inet_ehash_bucket *head = &hashinfo->ehash[i];
                rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
                struct sock *sk;
-               struct hlist_node *node;
+               struct hlist_nulls_node *node;
 
                num = 0;
 
-               if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+               if (hlist_nulls_empty(&head->chain) &&
+                       hlist_nulls_empty(&head->twchain))
                        continue;
 
                if (i > s_i)
                        s_num = 0;
 
                read_lock_bh(lock);
-               sk_for_each(sk, node, &head->chain) {
+               sk_nulls_for_each(sk, node, &head->chain) {
                        struct inet_sock *inet = inet_sk(sk);
 
                        if (num < s_num)
index be41ebbec4ebd60a29e7a69cbea0f150843eea8d..fd269cfef0ec8ff215c73dca6328064776398113 100644 (file)
@@ -223,35 +223,65 @@ struct sock * __inet_lookup_established(struct net *net,
        INET_ADDR_COOKIE(acookie, saddr, daddr)
        const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
        struct sock *sk;
-       const struct hlist_node *node;
+       const struct hlist_nulls_node *node;
        /* Optimize here for direct hit, only listening connections can
         * have wildcards anyways.
         */
        unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
-       struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-       rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+       unsigned int slot = hash & (hashinfo->ehash_size - 1);
+       struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-       prefetch(head->chain.first);
-       read_lock(lock);
-       sk_for_each(sk, node, &head->chain) {
+       rcu_read_lock();
+begin:
+       sk_nulls_for_each_rcu(sk, node, &head->chain) {
                if (INET_MATCH(sk, net, hash, acookie,
-                                       saddr, daddr, ports, dif))
-                       goto hit; /* You sunk my battleship! */
+                                       saddr, daddr, ports, dif)) {
+                       if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+                               goto begintw;
+                       if (unlikely(!INET_MATCH(sk, net, hash, acookie,
+                               saddr, daddr, ports, dif))) {
+                               sock_put(sk);
+                               goto begin;
+                       }
+                       goto out;
+               }
        }
+       /*
+        * if the nulls value we got at the end of this lookup is
+        * not the expected one, we must restart lookup.
+        * We probably met an item that was moved to another chain.
+        */
+       if (get_nulls_value(node) != slot)
+               goto begin;
 
+begintw:
        /* Must check for a TIME_WAIT'er before going to listener hash. */
-       sk_for_each(sk, node, &head->twchain) {
+       sk_nulls_for_each_rcu(sk, node, &head->twchain) {
                if (INET_TW_MATCH(sk, net, hash, acookie,
-                                       saddr, daddr, ports, dif))
-                       goto hit;
+                                       saddr, daddr, ports, dif)) {
+                       if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+                               sk = NULL;
+                               goto out;
+                       }
+                       if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
+                                saddr, daddr, ports, dif))) {
+                               sock_put(sk);
+                               goto begintw;
+                       }
+                       goto out;
+               }
        }
+       /*
+        * if the nulls value we got at the end of this lookup is
+        * not the expected one, we must restart lookup.
+        * We probably met an item that was moved to another chain.
+        */
+       if (get_nulls_value(node) != slot)
+               goto begintw;
        sk = NULL;
 out:
-       read_unlock(lock);
+       rcu_read_unlock();
        return sk;
-hit:
-       sock_hold(sk);
-       goto out;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
@@ -272,14 +302,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
        struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
        rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
        struct sock *sk2;
-       const struct hlist_node *node;
+       const struct hlist_nulls_node *node;
        struct inet_timewait_sock *tw;
 
        prefetch(head->chain.first);
        write_lock(lock);
 
        /* Check TIME-WAIT sockets first. */
-       sk_for_each(sk2, node, &head->twchain) {
+       sk_nulls_for_each(sk2, node, &head->twchain) {
                tw = inet_twsk(sk2);
 
                if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +323,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
        tw = NULL;
 
        /* And established part... */
-       sk_for_each(sk2, node, &head->chain) {
+       sk_nulls_for_each(sk2, node, &head->chain) {
                if (INET_MATCH(sk2, net, hash, acookie,
                                        saddr, daddr, ports, dif))
                        goto not_unique;
@@ -306,7 +336,7 @@ unique:
        inet->sport = htons(lport);
        sk->sk_hash = hash;
        WARN_ON(!sk_unhashed(sk));
-       __sk_add_node(sk, &head->chain);
+       __sk_nulls_add_node_rcu(sk, &head->chain);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        write_unlock(lock);
 
@@ -338,7 +368,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 void __inet_hash_nolisten(struct sock *sk)
 {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-       struct hlist_head *list;
+       struct hlist_nulls_head *list;
        rwlock_t *lock;
        struct inet_ehash_bucket *head;
 
@@ -350,7 +380,7 @@ void __inet_hash_nolisten(struct sock *sk)
        lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
        write_lock(lock);
-       __sk_add_node(sk, list);
+       __sk_nulls_add_node_rcu(sk, list);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        write_unlock(lock);
 }
@@ -400,13 +430,15 @@ void inet_unhash(struct sock *sk)
                local_bh_disable();
                inet_listen_wlock(hashinfo);
                lock = &hashinfo->lhash_lock;
+               if (__sk_del_node_init(sk))
+                       sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
        } else {
                lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
                write_lock_bh(lock);
+               if (__sk_nulls_del_node_init_rcu(sk))
+                       sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
        }
 
-       if (__sk_del_node_init(sk))
-               sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
        write_unlock_bh(lock);
 out:
        if (sk->sk_state == TCP_LISTEN)
index 1c5fd38f8824a9a58cafc9220773b134c4a4b771..60689951ecdbd84d46ec4db4e8e5a72fe7e13401 100644 (file)
@@ -23,12 +23,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
        rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 
        write_lock(lock);
-       if (hlist_unhashed(&tw->tw_node)) {
+       if (hlist_nulls_unhashed(&tw->tw_node)) {
                write_unlock(lock);
                return;
        }
-       __hlist_del(&tw->tw_node);
-       sk_node_init(&tw->tw_node);
+       hlist_nulls_del_rcu(&tw->tw_node);
+       sk_nulls_node_init(&tw->tw_node);
        write_unlock(lock);
 
        /* Disassociate with bind bucket. */
@@ -92,13 +92,17 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
        write_lock(lock);
 
-       /* Step 2: Remove SK from established hash. */
-       if (__sk_del_node_init(sk))
-               sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-       /* Step 3: Hash TW into TIMEWAIT chain. */
-       inet_twsk_add_node(tw, &ehead->twchain);
+       /*
+        * Step 2: Hash TW into TIMEWAIT chain.
+        * Should be done before removing sk from established chain
+        * because readers are lockless and search established first.
+        */
        atomic_inc(&tw->tw_refcnt);
+       inet_twsk_add_node_rcu(tw, &ehead->twchain);
+
+       /* Step 3: Remove SK from established hash. */
+       if (__sk_nulls_del_node_init_rcu(sk))
+               sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
        write_unlock(lock);
 }
@@ -416,7 +420,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
 {
        struct inet_timewait_sock *tw;
        struct sock *sk;
-       struct hlist_node *node;
+       struct hlist_nulls_node *node;
        int h;
 
        local_bh_disable();
@@ -426,7 +430,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
                rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
 restart:
                write_lock(lock);
-               sk_for_each(sk, node, &head->twchain) {
+               sk_nulls_for_each(sk, node, &head->twchain) {
 
                        tw = inet_twsk(sk);
                        if (!net_eq(twsk_net(tw), net) ||
index f60a5917e54d9c733e53b9d1b650b139fcb61fc9..044224a341eb10b6cb476c2b0b43d130a4af5916 100644 (file)
@@ -2707,8 +2707,8 @@ void __init tcp_init(void)
                                        thash_entries ? 0 : 512 * 1024);
        tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
        for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
-               INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
-               INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
+               INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+               INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
        }
        if (inet_ehash_locks_alloc(&tcp_hashinfo))
                panic("TCP: failed to alloc ehash_locks");
index d49233f409b59b3508caf6cfeec911d19d672c1e..b2e3ab2287baa57ef883ba572cc76e29bb18f247 100644 (file)
@@ -1857,16 +1857,16 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
-static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
 {
-       return hlist_empty(head) ? NULL :
+       return hlist_nulls_empty(head) ? NULL :
                list_entry(head->first, struct inet_timewait_sock, tw_node);
 }
 
 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 {
-       return tw->tw_node.next ?
-               hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+       return !is_a_nulls(tw->tw_node.next) ?
+               hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
 }
 
 static void *listening_get_next(struct seq_file *seq, void *cur)
@@ -1954,8 +1954,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 
 static inline int empty_bucket(struct tcp_iter_state *st)
 {
-       return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
-               hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+       return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
+               hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
 static void *established_get_first(struct seq_file *seq)
@@ -1966,7 +1966,7 @@ static void *established_get_first(struct seq_file *seq)
 
        for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
                struct sock *sk;
-               struct hlist_node *node;
+               struct hlist_nulls_node *node;
                struct inet_timewait_sock *tw;
                rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
 
@@ -1975,7 +1975,7 @@ static void *established_get_first(struct seq_file *seq)
                        continue;
 
                read_lock_bh(lock);
-               sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+               sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
                        if (sk->sk_family != st->family ||
                            !net_eq(sock_net(sk), net)) {
                                continue;
@@ -2004,7 +2004,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 {
        struct sock *sk = cur;
        struct inet_timewait_sock *tw;
-       struct hlist_node *node;
+       struct hlist_nulls_node *node;
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);
 
@@ -2032,11 +2032,11 @@ get_tw:
                        return NULL;
 
                read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-               sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
+               sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
        } else
-               sk = sk_next(sk);
+               sk = sk_nulls_next(sk);
 
-       sk_for_each_from(sk, node) {
+       sk_nulls_for_each_from(sk, node) {
                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
                        goto found;
        }
@@ -2375,6 +2375,7 @@ struct proto tcp_prot = {
        .sysctl_rmem            = sysctl_tcp_rmem,
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp_sock),
+       .slab_flags             = SLAB_DESTROY_BY_RCU,
        .twsk_prot              = &tcp_timewait_sock_ops,
        .rsk_prot               = &tcp_request_sock_ops,
        .h.hashinfo             = &tcp_hashinfo,
index 1646a565825513421332b02411f00aa863f9c261..c1b4d401fd950f18762573f6537d634aa5fd4ced 100644 (file)
 void __inet6_hash(struct sock *sk)
 {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-       struct hlist_head *list;
        rwlock_t *lock;
 
        WARN_ON(!sk_unhashed(sk));
 
        if (sk->sk_state == TCP_LISTEN) {
+               struct hlist_head *list;
+
                list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
                lock = &hashinfo->lhash_lock;
                inet_listen_wlock(hashinfo);
+               __sk_add_node(sk, list);
        } else {
                unsigned int hash;
+               struct hlist_nulls_head *list;
+
                sk->sk_hash = hash = inet6_sk_ehashfn(sk);
                list = &inet_ehash_bucket(hashinfo, hash)->chain;
                lock = inet_ehash_lockp(hashinfo, hash);
                write_lock(lock);
+               __sk_nulls_add_node_rcu(sk, list);
        }
 
-       __sk_add_node(sk, list);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        write_unlock(lock);
 }
@@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net,
                                           const int dif)
 {
        struct sock *sk;
-       const struct hlist_node *node;
+       const struct hlist_nulls_node *node;
        const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
        /* Optimize here for direct hit, only listening connections can
         * have wildcards anyways.
         */
        unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
-       struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-       rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+       unsigned int slot = hash & (hashinfo->ehash_size - 1);
+       struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-       prefetch(head->chain.first);
-       read_lock(lock);
-       sk_for_each(sk, node, &head->chain) {
+
+       rcu_read_lock();
+begin:
+       sk_nulls_for_each_rcu(sk, node, &head->chain) {
                /* For IPV6 do the cheaper port and family tests first. */
-               if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif))
-                       goto hit; /* You sunk my battleship! */
+               if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                       if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+                               goto begintw;
+                       if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                               sock_put(sk);
+                               goto begin;
+                       }
+               goto out;
+               }
        }
+       if (get_nulls_value(node) != slot)
+               goto begin;
+
+begintw:
        /* Must check for a TIME_WAIT'er before going to listener hash. */
-       sk_for_each(sk, node, &head->twchain) {
-               if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif))
-                       goto hit;
+       sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+               if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                       if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+                               sk = NULL;
+                               goto out;
+                       }
+                       if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+                               sock_put(sk);
+                               goto begintw;
+                       }
+                       goto out;
+               }
        }
-       read_unlock(lock);
-       return NULL;
-
-hit:
-       sock_hold(sk);
-       read_unlock(lock);
+       if (get_nulls_value(node) != slot)
+               goto begintw;
+       sk = NULL;
+out:
+       rcu_read_unlock();
        return sk;
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
@@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
        struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
        rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
        struct sock *sk2;
-       const struct hlist_node *node;
+       const struct hlist_nulls_node *node;
        struct inet_timewait_sock *tw;
 
        prefetch(head->chain.first);
        write_lock(lock);
 
        /* Check TIME-WAIT sockets first. */
-       sk_for_each(sk2, node, &head->twchain) {
+       sk_nulls_for_each(sk2, node, &head->twchain) {
                tw = inet_twsk(sk2);
 
                if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) {
@@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
        tw = NULL;
 
        /* And established part... */
-       sk_for_each(sk2, node, &head->chain) {
+       sk_nulls_for_each(sk2, node, &head->chain) {
                if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif))
                        goto not_unique;
        }
@@ -203,7 +227,7 @@ unique:
        inet->num = lport;
        inet->sport = htons(lport);
        WARN_ON(!sk_unhashed(sk));
-       __sk_add_node(sk, &head->chain);
+       __sk_nulls_add_node_rcu(sk, &head->chain);
        sk->sk_hash = hash;
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        write_unlock(lock);
index 984276463a8ddf709e10c56c284ad85fc17e8fd1..b35787056313ffeb970efc1c1483ac0020b77ffb 100644 (file)
@@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = {
        .sysctl_rmem            = sysctl_tcp_rmem,
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp6_sock),
+       .slab_flags             = SLAB_DESTROY_BY_RCU,
        .twsk_prot              = &tcp6_timewait_sock_ops,
        .rsk_prot               = &tcp6_request_sock_ops,
        .h.hashinfo             = &tcp_hashinfo,