net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/types.h>
  97 #include <linux/socket.h>
  98 #include <linux/in.h>
  99 #include <linux/kernel.h>
 100 #include <linux/module.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/sched.h>
 104 #include <linux/timer.h>
 105 #include <linux/string.h>
 106 #include <linux/sockios.h>
 107 #include <linux/net.h>
 108 #include <linux/mm.h>
 109 #include <linux/slab.h>
 110 #include <linux/interrupt.h>
 111 #include <linux/poll.h>
 112 #include <linux/tcp.h>
 113 #include <linux/init.h>
 114 #include <linux/highmem.h>
 115 #include <linux/user_namespace.h>
 116 #include <linux/static_key.h>
 117 #include <linux/memcontrol.h>
 118 #include <linux/prefetch.h>
 119
 120 #include <asm/uaccess.h>
 121
 122 #include <linux/netdevice.h>
 123 #include <net/protocol.h>
 124 #include <linux/skbuff.h>
 125 #include <net/net_namespace.h>
 126 #include <net/request_sock.h>
 127 #include <net/sock.h>
 128 #include <linux/net_tstamp.h>
 129 #include <net/xfrm.h>
 130 #include <linux/ipsec.h>
 131 #include <net/cls_cgroup.h>
 132 #include <net/netprio_cgroup.h>
 133
 134 #include <linux/filter.h>
 135
 136 #include <trace/events/sock.h>
 137
 138 #ifdef CONFIG_INET
 139 #include <net/tcp.h>
 140 #endif
 141
/* Held by the memcg hooks in this file while they walk proto_list. */
static DEFINE_MUTEX(proto_list_mutex);
/* List of struct proto entries, linked through their ->node member. */
static LIST_HEAD(proto_list);
 144
 145 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 147 {
 148         struct proto *proto;
 149         int ret = 0;
 150
 151         mutex_lock(&proto_list_mutex);
 152         list_for_each_entry(proto, &proto_list, node) {
 153                 if (proto->init_cgroup) {
 154                         ret = proto->init_cgroup(memcg, ss);
 155                         if (ret)
 156                                 goto out;
 157                 }
 158         }
 159
 160         mutex_unlock(&proto_list_mutex);
 161         return ret;
 162 out:
 163         list_for_each_entry_continue_reverse(proto, &proto_list, node)
 164                 if (proto->destroy_cgroup)
 165                         proto->destroy_cgroup(memcg);
 166         mutex_unlock(&proto_list_mutex);
 167         return ret;
 168 }
 169
 170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 171 {
 172         struct proto *proto;
 173
 174         mutex_lock(&proto_list_mutex);
 175         list_for_each_entry_reverse(proto, &proto_list, node)
 176                 if (proto->destroy_cgroup)
 177                         proto->destroy_cgroup(memcg);
 178         mutex_unlock(&proto_list_mutex);
 179 }
 180 #endif
 181
/*
 * Each address family might have different locking rules, so we have
 * one lock class key and one slock class key per address family
 * (both arrays have AF_MAX entries):
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Static key exported to other code; it is only defined here and is not
 * referenced anywhere else in the visible part of this file.
 */
struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);
 191
 192 /*
 193  * Make lock validator output more readable. (we pre-construct these
 194  * strings build-time, so that runtime initialization of socket
 195  * locks is fast):
 196  */
 197 static const char *const af_family_key_strings[AF_MAX+1] = {
 198   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 199   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 200   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 201   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 202   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 203   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 204   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 205   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 206   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 207   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 208   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 209   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 210   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 211   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 212 };
/*
 * "slock-<family>" names, one per address family number in numeric
 * order, followed by a final "slock-AF_MAX" entry.  Slots 27 and 28 are
 * named by number only.
 * NOTE(review): presumably indexed by the socket's address family when
 * the lock class is set up -- the user of this table is not visible here.
 */
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
/*
 * "clock-<family>" names, one per address family number in numeric
 * order, followed by a final "clock-AF_MAX" entry.  Slots 27 and 28 are
 * named by number only.
 * NOTE(review): presumably these name the per-family sk_callback_lock
 * classes (af_callback_keys) -- the user of this table is not visible
 * here.
 */
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};
 245
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key
 * (one lock_class_key for each of the AF_MAX families):
 */
static struct lock_class_key af_callback_keys[AF_MAX];
 251
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits work out to _SK_MEM_PACKETS buffers of SKB_TRUESIZE(256)
 * bytes each.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters.
 *
 * The *_max values cap what SO_SNDBUF / SO_RCVBUF accept in
 * sock_setsockopt().  The *_default values start out equal to the
 * maxima and are not exported from here.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
 273
/*
 * With CONFIG_CGROUPS set, provide the net_cls / net_prio subsystem id
 * variables here whenever the corresponding CONFIG_NET_CLS_CGROUP /
 * CONFIG_NETPRIO_CGROUP symbol is not defined.  Both start out as -1.
 * NOTE(review): presumably -1 means "controller not registered" and a
 * modular controller fills in the real id later -- confirm against the
 * controller code.
 */
#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif
 284
 285 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 286 {
 287         struct timeval tv;
 288
 289         if (optlen < sizeof(tv))
 290                 return -EINVAL;
 291         if (copy_from_user(&tv, optval, sizeof(tv)))
 292                 return -EFAULT;
 293         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 294                 return -EDOM;
 295
 296         if (tv.tv_sec < 0) {
 297                 static int warned __read_mostly;
 298
 299                 *timeo_p = 0;
 300                 if (warned < 10 && net_ratelimit()) {
 301                         warned++;
 302                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 303                                 __func__, current->comm, task_pid_nr(current));
 304                 }
 305                 return 0;
 306         }
 307         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 308         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 309                 return 0;
 310         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 311                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 312         return 0;
 313 }
 314
 315 static void sock_warn_obsolete_bsdism(const char *name)
 316 {
 317         static int warned;
 318         static char warncomm[TASK_COMM_LEN];
 319         if (strcmp(warncomm, current->comm) && warned < 5) {
 320                 strcpy(warncomm,  current->comm);
 321                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 322                         warncomm, name);
 323                 warned++;
 324         }
 325 }
 326
 327 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 328
 329 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 330 {
 331         if (sk->sk_flags & flags) {
 332                 sk->sk_flags &= ~flags;
 333                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 334                         net_disable_timestamp();
 335         }
 336 }
 337
 338
 339 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 340 {
 341         int err;
 342         int skb_len;
 343         unsigned long flags;
 344         struct sk_buff_head *list = &sk->sk_receive_queue;
 345
 346         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 347                 atomic_inc(&sk->sk_drops);
 348                 trace_sock_rcvqueue_full(sk, skb);
 349                 return -ENOMEM;
 350         }
 351
 352         err = sk_filter(sk, skb);
 353         if (err)
 354                 return err;
 355
 356         if (!sk_rmem_schedule(sk, skb->truesize)) {
 357                 atomic_inc(&sk->sk_drops);
 358                 return -ENOBUFS;
 359         }
 360
 361         skb->dev = NULL;
 362         skb_set_owner_r(skb, sk);
 363
 364         /* Cache the SKB length before we tack it onto the receive
 365          * queue.  Once it is added it no longer belongs to us and
 366          * may be freed by other threads of control pulling packets
 367          * from the queue.
 368          */
 369         skb_len = skb->len;
 370
 371         /* we escape from rcu protected region, make sure we dont leak
 372          * a norefcounted dst
 373          */
 374         skb_dst_force(skb);
 375
 376         spin_lock_irqsave(&list->lock, flags);
 377         skb->dropcount = atomic_read(&sk->sk_drops);
 378         __skb_queue_tail(list, skb);
 379         spin_unlock_irqrestore(&list->lock, flags);
 380
 381         if (!sock_flag(sk, SOCK_DEAD))
 382                 sk->sk_data_ready(sk, skb_len);
 383         return 0;
 384 }
 385 EXPORT_SYMBOL(sock_queue_rcv_skb);
 386
 387 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 388 {
 389         int rc = NET_RX_SUCCESS;
 390
 391         if (sk_filter(sk, skb))
 392                 goto discard_and_relse;
 393
 394         skb->dev = NULL;
 395
 396         if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 397                 atomic_inc(&sk->sk_drops);
 398                 goto discard_and_relse;
 399         }
 400         if (nested)
 401                 bh_lock_sock_nested(sk);
 402         else
 403                 bh_lock_sock(sk);
 404         if (!sock_owned_by_user(sk)) {
 405                 /*
 406                  * trylock + unlock semantics:
 407                  */
 408                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 409
 410                 rc = sk_backlog_rcv(sk, skb);
 411
 412                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 413         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 414                 bh_unlock_sock(sk);
 415                 atomic_inc(&sk->sk_drops);
 416                 goto discard_and_relse;
 417         }
 418
 419         bh_unlock_sock(sk);
 420 out:
 421         sock_put(sk);
 422         return rc;
 423 discard_and_relse:
 424         kfree_skb(skb);
 425         goto out;
 426 }
 427 EXPORT_SYMBOL(sk_receive_skb);
 428
 429 void sk_reset_txq(struct sock *sk)
 430 {
 431         sk_tx_queue_clear(sk);
 432 }
 433 EXPORT_SYMBOL(sk_reset_txq);
 434
 435 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 436 {
 437         struct dst_entry *dst = __sk_dst_get(sk);
 438
 439         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 440                 sk_tx_queue_clear(sk);
 441                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 442                 dst_release(dst);
 443                 return NULL;
 444         }
 445
 446         return dst;
 447 }
 448 EXPORT_SYMBOL(__sk_dst_check);
 449
 450 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 451 {
 452         struct dst_entry *dst = sk_dst_get(sk);
 453
 454         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 455                 sk_dst_reset(sk);
 456                 dst_release(dst);
 457                 return NULL;
 458         }
 459
 460         return dst;
 461 }
 462 EXPORT_SYMBOL(sk_dst_check);
 463
 464 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 465 {
 466         int ret = -ENOPROTOOPT;
 467 #ifdef CONFIG_NETDEVICES
 468         struct net *net = sock_net(sk);
 469         char devname[IFNAMSIZ];
 470         int index;
 471
 472         /* Sorry... */
 473         ret = -EPERM;
 474         if (!capable(CAP_NET_RAW))
 475                 goto out;
 476
 477         ret = -EINVAL;
 478         if (optlen < 0)
 479                 goto out;
 480
 481         /* Bind this socket to a particular device like "eth0",
 482          * as specified in the passed interface name. If the
 483          * name is "" or the option length is zero the socket
 484          * is not bound.
 485          */
 486         if (optlen > IFNAMSIZ - 1)
 487                 optlen = IFNAMSIZ - 1;
 488         memset(devname, 0, sizeof(devname));
 489
 490         ret = -EFAULT;
 491         if (copy_from_user(devname, optval, optlen))
 492                 goto out;
 493
 494         index = 0;
 495         if (devname[0] != '\0') {
 496                 struct net_device *dev;
 497
 498                 rcu_read_lock();
 499                 dev = dev_get_by_name_rcu(net, devname);
 500                 if (dev)
 501                         index = dev->ifindex;
 502                 rcu_read_unlock();
 503                 ret = -ENODEV;
 504                 if (!dev)
 505                         goto out;
 506         }
 507
 508         lock_sock(sk);
 509         sk->sk_bound_dev_if = index;
 510         sk_dst_reset(sk);
 511         release_sock(sk);
 512
 513         ret = 0;
 514
 515 out:
 516 #endif
 517
 518         return ret;
 519 }
 520
 521 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 522 {
 523         if (valbool)
 524                 sock_set_flag(sk, bit);
 525         else
 526                 sock_reset_flag(sk, bit);
 527 }
 528
 529 /*
 530  *      This is meant for all protocols to use and covers goings on
 531  *      at the socket level. Everything here is generic.
 532  */
 533
 534 int sock_setsockopt(struct socket *sock, int level, int optname,
 535                     char __user *optval, unsigned int optlen)
 536 {
 537         struct sock *sk = sock->sk;
 538         int val;
 539         int valbool;
 540         struct linger ling;
 541         int ret = 0;
 542
 543         /*
 544          *      Options without arguments
 545          */
 546
 547         if (optname == SO_BINDTODEVICE)
 548                 return sock_bindtodevice(sk, optval, optlen);
 549
 550         if (optlen < sizeof(int))
 551                 return -EINVAL;
 552
 553         if (get_user(val, (int __user *)optval))
 554                 return -EFAULT;
 555
 556         valbool = val ? 1 : 0;
 557
 558         lock_sock(sk);
 559
 560         switch (optname) {
 561         case SO_DEBUG:
 562                 if (val && !capable(CAP_NET_ADMIN))
 563                         ret = -EACCES;
 564                 else
 565                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 566                 break;
 567         case SO_REUSEADDR:
 568                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 569                 break;
 570         case SO_TYPE:
 571         case SO_PROTOCOL:
 572         case SO_DOMAIN:
 573         case SO_ERROR:
 574                 ret = -ENOPROTOOPT;
 575                 break;
 576         case SO_DONTROUTE:
 577                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 578                 break;
 579         case SO_BROADCAST:
 580                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 581                 break;
 582         case SO_SNDBUF:
 583                 /* Don't error on this BSD doesn't and if you think
 584                  * about it this is right. Otherwise apps have to
 585                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 586                  * are treated in BSD as hints
 587                  */
 588                 val = min_t(u32, val, sysctl_wmem_max);
 589 set_sndbuf:
 590                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 591                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 592                 /* Wake up sending tasks if we upped the value. */
 593                 sk->sk_write_space(sk);
 594                 break;
 595
 596         case SO_SNDBUFFORCE:
 597                 if (!capable(CAP_NET_ADMIN)) {
 598                         ret = -EPERM;
 599                         break;
 600                 }
 601                 goto set_sndbuf;
 602
 603         case SO_RCVBUF:
 604                 /* Don't error on this BSD doesn't and if you think
 605                  * about it this is right. Otherwise apps have to
 606                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 607                  * are treated in BSD as hints
 608                  */
 609                 val = min_t(u32, val, sysctl_rmem_max);
 610 set_rcvbuf:
 611                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 612                 /*
 613                  * We double it on the way in to account for
 614                  * "struct sk_buff" etc. overhead.   Applications
 615                  * assume that the SO_RCVBUF setting they make will
 616                  * allow that much actual data to be received on that
 617                  * socket.
 618                  *
 619                  * Applications are unaware that "struct sk_buff" and
 620                  * other overheads allocate from the receive buffer
 621                  * during socket buffer allocation.
 622                  *
 623                  * And after considering the possible alternatives,
 624                  * returning the value we actually used in getsockopt
 625                  * is the most desirable behavior.
 626                  */
 627                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 628                 break;
 629
 630         case SO_RCVBUFFORCE:
 631                 if (!capable(CAP_NET_ADMIN)) {
 632                         ret = -EPERM;
 633                         break;
 634                 }
 635                 goto set_rcvbuf;
 636
 637         case SO_KEEPALIVE:
 638 #ifdef CONFIG_INET
 639                 if (sk->sk_protocol == IPPROTO_TCP)
 640                         tcp_set_keepalive(sk, valbool);
 641 #endif
 642                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 643                 break;
 644
 645         case SO_OOBINLINE:
 646                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 647                 break;
 648
 649         case SO_NO_CHECK:
 650                 sk->sk_no_check = valbool;
 651                 break;
 652
 653         case SO_PRIORITY:
 654                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 655                         sk->sk_priority = val;
 656                 else
 657                         ret = -EPERM;
 658                 break;
 659
 660         case SO_LINGER:
 661                 if (optlen < sizeof(ling)) {
 662                         ret = -EINVAL;  /* 1003.1g */
 663                         break;
 664                 }
 665                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 666                         ret = -EFAULT;
 667                         break;
 668                 }
 669                 if (!ling.l_onoff)
 670                         sock_reset_flag(sk, SOCK_LINGER);
 671                 else {
 672 #if (BITS_PER_LONG == 32)
 673                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 674                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 675                         else
 676 #endif
 677                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 678                         sock_set_flag(sk, SOCK_LINGER);
 679                 }
 680                 break;
 681
 682         case SO_BSDCOMPAT:
 683                 sock_warn_obsolete_bsdism("setsockopt");
 684                 break;
 685
 686         case SO_PASSCRED:
 687                 if (valbool)
 688                         set_bit(SOCK_PASSCRED, &sock->flags);
 689                 else
 690                         clear_bit(SOCK_PASSCRED, &sock->flags);
 691                 break;
 692
 693         case SO_TIMESTAMP:
 694         case SO_TIMESTAMPNS:
 695                 if (valbool)  {
 696                         if (optname == SO_TIMESTAMP)
 697                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 698                         else
 699                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 700                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 701                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 702                 } else {
 703                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 704                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 705                 }
 706                 break;
 707
 708         case SO_TIMESTAMPING:
 709                 if (val & ~SOF_TIMESTAMPING_MASK) {
 710                         ret = -EINVAL;
 711                         break;
 712                 }
 713                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 714                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 715                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 716                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 717                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 718                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 719                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 720                         sock_enable_timestamp(sk,
 721                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 722                 else
 723                         sock_disable_timestamp(sk,
 724                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 725                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 726                                   val & SOF_TIMESTAMPING_SOFTWARE);
 727                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 728                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 729                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 730                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 731                 break;
 732
 733         case SO_RCVLOWAT:
 734                 if (val < 0)
 735                         val = INT_MAX;
 736                 sk->sk_rcvlowat = val ? : 1;
 737                 break;
 738
 739         case SO_RCVTIMEO:
 740                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 741                 break;
 742
 743         case SO_SNDTIMEO:
 744                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 745                 break;
 746
 747         case SO_ATTACH_FILTER:
 748                 ret = -EINVAL;
 749                 if (optlen == sizeof(struct sock_fprog)) {
 750                         struct sock_fprog fprog;
 751
 752                         ret = -EFAULT;
 753                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 754                                 break;
 755
 756                         ret = sk_attach_filter(&fprog, sk);
 757                 }
 758                 break;
 759
 760         case SO_DETACH_FILTER:
 761                 ret = sk_detach_filter(sk);
 762                 break;
 763
 764         case SO_PASSSEC:
 765                 if (valbool)
 766                         set_bit(SOCK_PASSSEC, &sock->flags);
 767                 else
 768                         clear_bit(SOCK_PASSSEC, &sock->flags);
 769                 break;
 770         case SO_MARK:
 771                 if (!capable(CAP_NET_ADMIN))
 772                         ret = -EPERM;
 773                 else
 774                         sk->sk_mark = val;
 775                 break;
 776
 777                 /* We implement the SO_SNDLOWAT etc to
 778                    not be settable (1003.1g 5.3) */
 779         case SO_RXQ_OVFL:
 780                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 781                 break;
 782
 783         case SO_WIFI_STATUS:
 784                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 785                 break;
 786
 787         case SO_PEEK_OFF:
 788                 if (sock->ops->set_peek_off)
 789                         sock->ops->set_peek_off(sk, val);
 790                 else
 791                         ret = -EOPNOTSUPP;
 792                 break;
 793
 794         case SO_NOFCS:
 795                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 796                 break;
 797
 798         default:
 799                 ret = -ENOPROTOOPT;
 800                 break;
 801         }
 802         release_sock(sk);
 803         return ret;
 804 }
 805 EXPORT_SYMBOL(sock_setsockopt);
 806
 807
 808 void cred_to_ucred(struct pid *pid, const struct cred *cred,
 809                    struct ucred *ucred)
 810 {
 811         ucred->pid = pid_vnr(pid);
 812         ucred->uid = ucred->gid = -1;
 813         if (cred) {
 814                 struct user_namespace *current_ns = current_user_ns();
 815
 816                 ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
 817                 ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
 818         }
 819 }
 820 EXPORT_SYMBOL_GPL(cred_to_ucred);
 821
 822 int sock_getsockopt(struct socket *sock, int level, int optname,
 823                     char __user *optval, int __user *optlen)
 824 {
 825         struct sock *sk = sock->sk;
 826
 827         union {
 828                 int val;
 829                 struct linger ling;
 830                 struct timeval tm;
 831         } v;
 832
 833         int lv = sizeof(int);
 834         int len;
 835
 836         if (get_user(len, optlen))
 837                 return -EFAULT;
 838         if (len < 0)
 839                 return -EINVAL;
 840
 841         memset(&v, 0, sizeof(v));
 842
 843         switch (optname) {
 844         case SO_DEBUG:
 845                 v.val = sock_flag(sk, SOCK_DBG);
 846                 break;
 847
 848         case SO_DONTROUTE:
 849                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 850                 break;
 851
 852         case SO_BROADCAST:
 853                 v.val = sock_flag(sk, SOCK_BROADCAST);
 854                 break;
 855
 856         case SO_SNDBUF:
 857                 v.val = sk->sk_sndbuf;
 858                 break;
 859
 860         case SO_RCVBUF:
 861                 v.val = sk->sk_rcvbuf;
 862                 break;
 863
 864         case SO_REUSEADDR:
 865                 v.val = sk->sk_reuse;
 866                 break;
 867
 868         case SO_KEEPALIVE:
 869                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
 870                 break;
 871
 872         case SO_TYPE:
 873                 v.val = sk->sk_type;
 874                 break;
 875
 876         case SO_PROTOCOL:
 877                 v.val = sk->sk_protocol;
 878                 break;
 879
 880         case SO_DOMAIN:
 881                 v.val = sk->sk_family;
 882                 break;
 883
 884         case SO_ERROR:
 885                 v.val = -sock_error(sk);
 886                 if (v.val == 0)
 887                         v.val = xchg(&sk->sk_err_soft, 0);
 888                 break;
 889
 890         case SO_OOBINLINE:
 891                 v.val = sock_flag(sk, SOCK_URGINLINE);
 892                 break;
 893
 894         case SO_NO_CHECK:
 895                 v.val = sk->sk_no_check;
 896                 break;
 897
 898         case SO_PRIORITY:
 899                 v.val = sk->sk_priority;
 900                 break;
 901
 902         case SO_LINGER:
 903                 lv              = sizeof(v.ling);
 904                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
 905                 v.ling.l_linger = sk->sk_lingertime / HZ;
 906                 break;
 907
 908         case SO_BSDCOMPAT:
 909                 sock_warn_obsolete_bsdism("getsockopt");
 910                 break;
 911
 912         case SO_TIMESTAMP:
 913                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 914                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
 915                 break;
 916
 917         case SO_TIMESTAMPNS:
 918                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 919                 break;
 920
 921         case SO_TIMESTAMPING:
 922                 v.val = 0;
 923                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 924                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 925                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 926                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 927                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 928                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 929                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 930                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 931                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 932                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
 933                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 934                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 935                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 936                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 937                 break;
 938
 939         case SO_RCVTIMEO:
 940                 lv = sizeof(struct timeval);
 941                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 942                         v.tm.tv_sec = 0;
 943                         v.tm.tv_usec = 0;
 944                 } else {
 945                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 946                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 947                 }
 948                 break;
 949
 950         case SO_SNDTIMEO:
 951                 lv = sizeof(struct timeval);
 952                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 953                         v.tm.tv_sec = 0;
 954                         v.tm.tv_usec = 0;
 955                 } else {
 956                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 957                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 958                 }
 959                 break;
 960
 961         case SO_RCVLOWAT:
 962                 v.val = sk->sk_rcvlowat;
 963                 break;
 964
 965         case SO_SNDLOWAT:
 966                 v.val = 1;
 967                 break;
 968
 969         case SO_PASSCRED:
 970                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
 971                 break;
 972
 973         case SO_PEERCRED:
 974         {
 975                 struct ucred peercred;
 976                 if (len > sizeof(peercred))
 977                         len = sizeof(peercred);
 978                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
 979                 if (copy_to_user(optval, &peercred, len))
 980                         return -EFAULT;
 981                 goto lenout;
 982         }
 983
 984         case SO_PEERNAME:
 985         {
 986                 char address[128];
 987
 988                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 989                         return -ENOTCONN;
 990                 if (lv < len)
 991                         return -EINVAL;
 992                 if (copy_to_user(optval, address, len))
 993                         return -EFAULT;
 994                 goto lenout;
 995         }
 996
 997         /* Dubious BSD thing... Probably nobody even uses it, but
 998          * the UNIX standard wants it for whatever reason... -DaveM
 999          */
1000         case SO_ACCEPTCONN:
1001                 v.val = sk->sk_state == TCP_LISTEN;
1002                 break;
1003
1004         case SO_PASSSEC:
1005                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1006                 break;
1007
1008         case SO_PEERSEC:
1009                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1010
1011         case SO_MARK:
1012                 v.val = sk->sk_mark;
1013                 break;
1014
1015         case SO_RXQ_OVFL:
1016                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1017                 break;
1018
1019         case SO_WIFI_STATUS:
1020                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1021                 break;
1022
1023         case SO_PEEK_OFF:
1024                 if (!sock->ops->set_peek_off)
1025                         return -EOPNOTSUPP;
1026
1027                 v.val = sk->sk_peek_off;
1028                 break;
1029         case SO_NOFCS:
1030                 v.val = sock_flag(sk, SOCK_NOFCS);
1031                 break;
1032         default:
1033                 return -ENOPROTOOPT;
1034         }
1035
1036         if (len > lv)
1037                 len = lv;
1038         if (copy_to_user(optval, &v, len))
1039                 return -EFAULT;
1040 lenout:
1041         if (put_user(len, optlen))
1042                 return -EFAULT;
1043         return 0;
1044 }
1045
1046 /*
1047  * Initialize an sk_lock.
1048  *
1049  * (We also register the sk_lock with the lock validator.)
1050  */
1051 static inline void sock_lock_init(struct sock *sk)
1052 {
1053         sock_lock_init_class_and_name(sk,
1054                         af_family_slock_key_strings[sk->sk_family],
1055                         af_family_slock_keys + sk->sk_family,
1056                         af_family_key_strings[sk->sk_family],
1057                         af_family_keys + sk->sk_family);
1058 }
1059
1060 /*
1061  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1062  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1063  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1064  */
1065 static void sock_copy(struct sock *nsk, const struct sock *osk)
1066 {
1067 #ifdef CONFIG_SECURITY_NETWORK
1068         void *sptr = nsk->sk_security;
1069 #endif
1070         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1071
1072         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1073                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1074
1075 #ifdef CONFIG_SECURITY_NETWORK
1076         nsk->sk_security = sptr;
1077         security_sk_clone(osk, nsk);
1078 #endif
1079 }
1080
1081 /*
1082  * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1083  * un-modified. Special care is taken when initializing object to zero.
1084  */
1085 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1086 {
1087         if (offsetof(struct sock, sk_node.next) != 0)
1088                 memset(sk, 0, offsetof(struct sock, sk_node.next));
1089         memset(&sk->sk_node.pprev, 0,
1090                size - offsetof(struct sock, sk_node.pprev));
1091 }
1092
1093 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1094 {
1095         unsigned long nulls1, nulls2;
1096
1097         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1098         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1099         if (nulls1 > nulls2)
1100                 swap(nulls1, nulls2);
1101
1102         if (nulls1 != 0)
1103                 memset((char *)sk, 0, nulls1);
1104         memset((char *)sk + nulls1 + sizeof(void *), 0,
1105                nulls2 - nulls1 - sizeof(void *));
1106         memset((char *)sk + nulls2 + sizeof(void *), 0,
1107                size - nulls2 - sizeof(void *));
1108 }
1109 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1110
1111 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1112                 int family)
1113 {
1114         struct sock *sk;
1115         struct kmem_cache *slab;
1116
1117         slab = prot->slab;
1118         if (slab != NULL) {
1119                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1120                 if (!sk)
1121                         return sk;
1122                 if (priority & __GFP_ZERO) {
1123                         if (prot->clear_sk)
1124                                 prot->clear_sk(sk, prot->obj_size);
1125                         else
1126                                 sk_prot_clear_nulls(sk, prot->obj_size);
1127                 }
1128         } else
1129                 sk = kmalloc(prot->obj_size, priority);
1130
1131         if (sk != NULL) {
1132                 kmemcheck_annotate_bitfield(sk, flags);
1133
1134                 if (security_sk_alloc(sk, family, priority))
1135                         goto out_free;
1136
1137                 if (!try_module_get(prot->owner))
1138                         goto out_free_sec;
1139                 sk_tx_queue_clear(sk);
1140         }
1141
1142         return sk;
1143
1144 out_free_sec:
1145         security_sk_free(sk);
1146 out_free:
1147         if (slab != NULL)
1148                 kmem_cache_free(slab, sk);
1149         else
1150                 kfree(sk);
1151         return NULL;
1152 }
1153
1154 static void sk_prot_free(struct proto *prot, struct sock *sk)
1155 {
1156         struct kmem_cache *slab;
1157         struct module *owner;
1158
1159         owner = prot->owner;
1160         slab = prot->slab;
1161
1162         security_sk_free(sk);
1163         if (slab != NULL)
1164                 kmem_cache_free(slab, sk);
1165         else
1166                 kfree(sk);
1167         module_put(owner);
1168 }
1169
1170 #ifdef CONFIG_CGROUPS
1171 void sock_update_classid(struct sock *sk)
1172 {
1173         u32 classid;
1174
1175         rcu_read_lock();  /* doing current task, which cannot vanish. */
1176         classid = task_cls_classid(current);
1177         rcu_read_unlock();
1178         if (classid && classid != sk->sk_classid)
1179                 sk->sk_classid = classid;
1180 }
1181 EXPORT_SYMBOL(sock_update_classid);
1182
1183 void sock_update_netprioidx(struct sock *sk)
1184 {
1185         if (in_interrupt())
1186                 return;
1187
1188         sk->sk_cgrp_prioidx = task_netprioidx(current);
1189 }
1190 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1191 #endif
1192
1193 /**
1194  *      sk_alloc - All socket objects are allocated here
1195  *      @net: the applicable net namespace
1196  *      @family: protocol family
1197  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1198  *      @prot: struct proto associated with this new sock instance
1199  */
1200 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1201                       struct proto *prot)
1202 {
1203         struct sock *sk;
1204
1205         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1206         if (sk) {
1207                 sk->sk_family = family;
1208                 /*
1209                  * See comment in struct sock definition to understand
1210                  * why we need sk_prot_creator -acme
1211                  */
1212                 sk->sk_prot = sk->sk_prot_creator = prot;
1213                 sock_lock_init(sk);
1214                 sock_net_set(sk, get_net(net));
1215                 atomic_set(&sk->sk_wmem_alloc, 1);
1216
1217                 sock_update_classid(sk);
1218                 sock_update_netprioidx(sk);
1219         }
1220
1221         return sk;
1222 }
1223 EXPORT_SYMBOL(sk_alloc);
1224
1225 static void __sk_free(struct sock *sk)
1226 {
1227         struct sk_filter *filter;
1228
1229         if (sk->sk_destruct)
1230                 sk->sk_destruct(sk);
1231
1232         filter = rcu_dereference_check(sk->sk_filter,
1233                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1234         if (filter) {
1235                 sk_filter_uncharge(sk, filter);
1236                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1237         }
1238
1239         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1240
1241         if (atomic_read(&sk->sk_omem_alloc))
1242                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1243                          __func__, atomic_read(&sk->sk_omem_alloc));
1244
1245         if (sk->sk_peer_cred)
1246                 put_cred(sk->sk_peer_cred);
1247         put_pid(sk->sk_peer_pid);
1248         put_net(sock_net(sk));
1249         sk_prot_free(sk->sk_prot_creator, sk);
1250 }
1251
1252 void sk_free(struct sock *sk)
1253 {
1254         /*
1255          * We subtract one from sk_wmem_alloc and can know if
1256          * some packets are still in some tx queue.
1257          * If not null, sock_wfree() will call __sk_free(sk) later
1258          */
1259         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1260                 __sk_free(sk);
1261 }
1262 EXPORT_SYMBOL(sk_free);
1263
1264 /*
1265  * Last sock_put should drop reference to sk->sk_net. It has already
1266  * been dropped in sk_change_net. Taking reference to stopping namespace
1267  * is not an option.
1268  * Take reference to a socket to remove it from hash _alive_ and after that
1269  * destroy it in the context of init_net.
1270  */
1271 void sk_release_kernel(struct sock *sk)
1272 {
1273         if (sk == NULL || sk->sk_socket == NULL)
1274                 return;
1275
1276         sock_hold(sk);
1277         sock_release(sk->sk_socket);
1278         release_net(sock_net(sk));
1279         sock_net_set(sk, get_net(&init_net));
1280         sock_put(sk);
1281 }
1282 EXPORT_SYMBOL(sk_release_kernel);
1283
1284 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1285 {
1286         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1287                 sock_update_memcg(newsk);
1288 }
1289
1290 /**
1291  *      sk_clone_lock - clone a socket, and lock its clone
1292  *      @sk: the socket to clone
1293  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1294  *
1295  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1296  */
1297 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1298 {
1299         struct sock *newsk;
1300
1301         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1302         if (newsk != NULL) {
1303                 struct sk_filter *filter;
1304
1305                 sock_copy(newsk, sk);
1306
1307                 /* SANITY */
1308                 get_net(sock_net(newsk));
1309                 sk_node_init(&newsk->sk_node);
1310                 sock_lock_init(newsk);
1311                 bh_lock_sock(newsk);
1312                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1313                 newsk->sk_backlog.len = 0;
1314
1315                 atomic_set(&newsk->sk_rmem_alloc, 0);
1316                 /*
1317                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1318                  */
1319                 atomic_set(&newsk->sk_wmem_alloc, 1);
1320                 atomic_set(&newsk->sk_omem_alloc, 0);
1321                 skb_queue_head_init(&newsk->sk_receive_queue);
1322                 skb_queue_head_init(&newsk->sk_write_queue);
1323 #ifdef CONFIG_NET_DMA
1324                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1325 #endif
1326
1327                 spin_lock_init(&newsk->sk_dst_lock);
1328                 rwlock_init(&newsk->sk_callback_lock);
1329                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1330                                 af_callback_keys + newsk->sk_family,
1331                                 af_family_clock_key_strings[newsk->sk_family]);
1332
1333                 newsk->sk_dst_cache     = NULL;
1334                 newsk->sk_wmem_queued   = 0;
1335                 newsk->sk_forward_alloc = 0;
1336                 newsk->sk_send_head     = NULL;
1337                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1338
1339                 sock_reset_flag(newsk, SOCK_DONE);
1340                 skb_queue_head_init(&newsk->sk_error_queue);
1341
1342                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1343                 if (filter != NULL)
1344                         sk_filter_charge(newsk, filter);
1345
1346                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1347                         /* It is still raw copy of parent, so invalidate
1348                          * destructor and make plain sk_free() */
1349                         newsk->sk_destruct = NULL;
1350                         bh_unlock_sock(newsk);
1351                         sk_free(newsk);
1352                         newsk = NULL;
1353                         goto out;
1354                 }
1355
1356                 newsk->sk_err      = 0;
1357                 newsk->sk_priority = 0;
1358                 /*
1359                  * Before updating sk_refcnt, we must commit prior changes to memory
1360                  * (Documentation/RCU/rculist_nulls.txt for details)
1361                  */
1362                 smp_wmb();
1363                 atomic_set(&newsk->sk_refcnt, 2);
1364
1365                 /*
1366                  * Increment the counter in the same struct proto as the master
1367                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1368                  * is the same as sk->sk_prot->socks, as this field was copied
1369                  * with memcpy).
1370                  *
1371                  * This _changes_ the previous behaviour, where
1372                  * tcp_create_openreq_child always was incrementing the
1373                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1374                  * to be taken into account in all callers. -acme
1375                  */
1376                 sk_refcnt_debug_inc(newsk);
1377                 sk_set_socket(newsk, NULL);
1378                 newsk->sk_wq = NULL;
1379
1380                 sk_update_clone(sk, newsk);
1381
1382                 if (newsk->sk_prot->sockets_allocated)
1383                         sk_sockets_allocated_inc(newsk);
1384
1385                 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1386                         net_enable_timestamp();
1387         }
1388 out:
1389         return newsk;
1390 }
1391 EXPORT_SYMBOL_GPL(sk_clone_lock);
1392
1393 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1394 {
1395         __sk_dst_set(sk, dst);
1396         sk->sk_route_caps = dst->dev->features;
1397         if (sk->sk_route_caps & NETIF_F_GSO)
1398                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1399         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1400         if (sk_can_gso(sk)) {
1401                 if (dst->header_len) {
1402                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1403                 } else {
1404                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1405                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1406                 }
1407         }
1408 }
1409 EXPORT_SYMBOL_GPL(sk_setup_caps);
1410
1411 void __init sk_init(void)
1412 {
1413         if (totalram_pages <= 4096) {
1414                 sysctl_wmem_max = 32767;
1415                 sysctl_rmem_max = 32767;
1416                 sysctl_wmem_default = 32767;
1417                 sysctl_rmem_default = 32767;
1418         } else if (totalram_pages >= 131072) {
1419                 sysctl_wmem_max = 131071;
1420                 sysctl_rmem_max = 131071;
1421         }
1422 }
1423
1424 /*
1425  *      Simple resource managers for sockets.
1426  */
1427
1428
1429 /*
1430  * Write buffer destructor automatically called from kfree_skb.
1431  */
1432 void sock_wfree(struct sk_buff *skb)
1433 {
1434         struct sock *sk = skb->sk;
1435         unsigned int len = skb->truesize;
1436
1437         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1438                 /*
1439                  * Keep a reference on sk_wmem_alloc, this will be released
1440                  * after sk_write_space() call
1441                  */
1442                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1443                 sk->sk_write_space(sk);
1444                 len = 1;
1445         }
1446         /*
1447          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1448          * could not do because of in-flight packets
1449          */
1450         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1451                 __sk_free(sk);
1452 }
1453 EXPORT_SYMBOL(sock_wfree);
1454
1455 /*
1456  * Read buffer destructor automatically called from kfree_skb.
1457  */
1458 void sock_rfree(struct sk_buff *skb)
1459 {
1460         struct sock *sk = skb->sk;
1461         unsigned int len = skb->truesize;
1462
1463         atomic_sub(len, &sk->sk_rmem_alloc);
1464         sk_mem_uncharge(sk, len);
1465 }
1466 EXPORT_SYMBOL(sock_rfree);
1467
1468
1469 int sock_i_uid(struct sock *sk)
1470 {
1471         int uid;
1472
1473         read_lock_bh(&sk->sk_callback_lock);
1474         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1475         read_unlock_bh(&sk->sk_callback_lock);
1476         return uid;
1477 }
1478 EXPORT_SYMBOL(sock_i_uid);
1479
1480 unsigned long sock_i_ino(struct sock *sk)
1481 {
1482         unsigned long ino;
1483
1484         read_lock_bh(&sk->sk_callback_lock);
1485         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1486         read_unlock_bh(&sk->sk_callback_lock);
1487         return ino;
1488 }
1489 EXPORT_SYMBOL(sock_i_ino);
1490
1491 /*
1492  * Allocate a skb from the socket's send buffer.
1493  */
1494 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1495                              gfp_t priority)
1496 {
1497         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1498                 struct sk_buff *skb = alloc_skb(size, priority);
1499                 if (skb) {
1500                         skb_set_owner_w(skb, sk);
1501                         return skb;
1502                 }
1503         }
1504         return NULL;
1505 }
1506 EXPORT_SYMBOL(sock_wmalloc);
1507
1508 /*
1509  * Allocate a skb from the socket's receive buffer.
1510  */
1511 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1512                              gfp_t priority)
1513 {
1514         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1515                 struct sk_buff *skb = alloc_skb(size, priority);
1516                 if (skb) {
1517                         skb_set_owner_r(skb, sk);
1518                         return skb;
1519                 }
1520         }
1521         return NULL;
1522 }
1523
1524 /*
1525  * Allocate a memory block from the socket's option memory buffer.
1526  */
1527 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1528 {
1529         if ((unsigned int)size <= sysctl_optmem_max &&
1530             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1531                 void *mem;
1532                 /* First do the add, to avoid the race if kmalloc
1533                  * might sleep.
1534                  */
1535                 atomic_add(size, &sk->sk_omem_alloc);
1536                 mem = kmalloc(size, priority);
1537                 if (mem)
1538                         return mem;
1539                 atomic_sub(size, &sk->sk_omem_alloc);
1540         }
1541         return NULL;
1542 }
1543 EXPORT_SYMBOL(sock_kmalloc);
1544
1545 /*
1546  * Free an option memory block.
1547  */
1548 void sock_kfree_s(struct sock *sk, void *mem, int size)
1549 {
1550         kfree(mem);
1551         atomic_sub(size, &sk->sk_omem_alloc);
1552 }
1553 EXPORT_SYMBOL(sock_kfree_s);
1554
1555 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1556    I think, these locks should be removed for datagram sockets.
1557  */
1558 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1559 {
1560         DEFINE_WAIT(wait);
1561
1562         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1563         for (;;) {
1564                 if (!timeo)
1565                         break;
1566                 if (signal_pending(current))
1567                         break;
1568                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1569                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1570                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1571                         break;
1572                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1573                         break;
1574                 if (sk->sk_err)
1575                         break;
1576                 timeo = schedule_timeout(timeo);
1577         }
1578         finish_wait(sk_sleep(sk), &wait);
1579         return timeo;
1580 }
1581
1582
1583 /*
1584  *      Generic send/receive buffer handlers
1585  */
1586
1587 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1588                                      unsigned long data_len, int noblock,
1589                                      int *errcode)
1590 {
1591         struct sk_buff *skb;
1592         gfp_t gfp_mask;
1593         long timeo;
1594         int err;
1595
1596         gfp_mask = sk->sk_allocation;
1597         if (gfp_mask & __GFP_WAIT)
1598                 gfp_mask |= __GFP_REPEAT;
1599
1600         timeo = sock_sndtimeo(sk, noblock);
1601         while (1) {
1602                 err = sock_error(sk);
1603                 if (err != 0)
1604                         goto failure;
1605
1606                 err = -EPIPE;
1607                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1608                         goto failure;
1609
1610                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1611                         skb = alloc_skb(header_len, gfp_mask);
1612                         if (skb) {
1613                                 int npages;
1614                                 int i;
1615
1616                                 /* No pages, we're done... */
1617                                 if (!data_len)
1618                                         break;
1619
1620                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1621                                 skb->truesize += data_len;
1622                                 skb_shinfo(skb)->nr_frags = npages;
1623                                 for (i = 0; i < npages; i++) {
1624                                         struct page *page;
1625
1626                                         page = alloc_pages(sk->sk_allocation, 0);
1627                                         if (!page) {
1628                                                 err = -ENOBUFS;
1629                                                 skb_shinfo(skb)->nr_frags = i;
1630                                                 kfree_skb(skb);
1631                                                 goto failure;
1632                                         }
1633
1634                                         __skb_fill_page_desc(skb, i,
1635                                                         page, 0,
1636                                                         (data_len >= PAGE_SIZE ?
1637                                                          PAGE_SIZE :
1638                                                          data_len));
1639                                         data_len -= PAGE_SIZE;
1640                                 }
1641
1642                                 /* Full success... */
1643                                 break;
1644                         }
1645                         err = -ENOBUFS;
1646                         goto failure;
1647                 }
1648                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1649                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1650                 err = -EAGAIN;
1651                 if (!timeo)
1652                         goto failure;
1653                 if (signal_pending(current))
1654                         goto interrupted;
1655                 timeo = sock_wait_for_wmem(sk, timeo);
1656         }
1657
1658         skb_set_owner_w(skb, sk);
1659         return skb;
1660
1661 interrupted:
1662         err = sock_intr_errno(timeo);
1663 failure:
1664         *errcode = err;
1665         return NULL;
1666 }
1667 EXPORT_SYMBOL(sock_alloc_send_pskb);
1668
/*
 * Convenience wrapper around sock_alloc_send_pskb(): forwards @sk, @size,
 * @noblock and @errcode unchanged and passes 0 as the third argument
 * (presumably the paged data length, i.e. no page fragments are requested).
 * Returns whatever sock_alloc_send_pskb() returns.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        struct sk_buff *skb;

        skb = sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
        return skb;
}
EXPORT_SYMBOL(sock_alloc_send_skb);
1675
1676 static void __lock_sock(struct sock *sk)
1677         __releases(&sk->sk_lock.slock)
1678         __acquires(&sk->sk_lock.slock)
1679 {
1680         DEFINE_WAIT(wait);
1681
1682         for (;;) {
1683                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1684                                         TASK_UNINTERRUPTIBLE);
1685                 spin_unlock_bh(&sk->sk_lock.slock);
1686                 schedule();
1687                 spin_lock_bh(&sk->sk_lock.slock);
1688                 if (!sock_owned_by_user(sk))
1689                         break;
1690         }
1691         finish_wait(&sk->sk_lock.wq, &wait);
1692 }
1693
1694 static void __release_sock(struct sock *sk)
1695         __releases(&sk->sk_lock.slock)
1696         __acquires(&sk->sk_lock.slock)
1697 {
1698         struct sk_buff *skb = sk->sk_backlog.head;
1699
1700         do {
1701                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1702                 bh_unlock_sock(sk);
1703
1704                 do {
1705                         struct sk_buff *next = skb->next;
1706
1707                         prefetch(next);
1708                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1709                         skb->next = NULL;
1710                         sk_backlog_rcv(sk, skb);
1711
1712                         /*
1713                          * We are in process context here with softirqs
1714                          * disabled, use cond_resched_softirq() to preempt.
1715                          * This is safe to do because we've taken the backlog
1716                          * queue private:
1717                          */
1718                         cond_resched_softirq();
1719
1720                         skb = next;
1721                 } while (skb != NULL);
1722
1723                 bh_lock_sock(sk);
1724         } while ((skb = sk->sk_backlog.head) != NULL);
1725
1726         /*
1727          * Doing the zeroing here guarantee we can not loop forever
1728          * while a wild producer attempts to flood us.
1729          */
1730         sk->sk_backlog.len = 0;
1731 }
1732
1733 /**
1734  * sk_wait_data - wait for data to arrive at sk_receive_queue
1735  * @sk:    sock to wait on
1736  * @timeo: for how long
1737  *
1738  * Now socket state including sk->sk_err is changed only under lock,
1739  * hence we may omit checks after joining wait queue.
1740  * We check receive queue before schedule() only as optimization;
1741  * it is very likely that release_sock() added new data.
1742  */
1743 int sk_wait_data(struct sock *sk, long *timeo)
1744 {
1745         int rc;
1746         DEFINE_WAIT(wait);
1747
1748         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1749         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1750         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1751         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1752         finish_wait(sk_sleep(sk), &wait);
1753         return rc;
1754 }
1755 EXPORT_SYMBOL(sk_wait_data);
1756
1757 /**
1758  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1759  *      @sk: socket
1760  *      @size: memory size to allocate
1761  *      @kind: allocation type
1762  *
1763  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1764  *      rmem allocation. This function assumes that protocols which have
1765  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1766  */
1767 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1768 {
1769         struct proto *prot = sk->sk_prot;
1770         int amt = sk_mem_pages(size);
1771         long allocated;
1772         int parent_status = UNDER_LIMIT;
1773
1774         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1775
1776         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1777
1778         /* Under limit. */
1779         if (parent_status == UNDER_LIMIT &&
1780                         allocated <= sk_prot_mem_limits(sk, 0)) {
1781                 sk_leave_memory_pressure(sk);
1782                 return 1;
1783         }
1784
1785         /* Under pressure. (we or our parents) */
1786         if ((parent_status > SOFT_LIMIT) ||
1787                         allocated > sk_prot_mem_limits(sk, 1))
1788                 sk_enter_memory_pressure(sk);
1789
1790         /* Over hard limit (we or our parents) */
1791         if ((parent_status == OVER_LIMIT) ||
1792                         (allocated > sk_prot_mem_limits(sk, 2)))
1793                 goto suppress_allocation;
1794
1795         /* guarantee minimum buffer size under pressure */
1796         if (kind == SK_MEM_RECV) {
1797                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1798                         return 1;
1799
1800         } else { /* SK_MEM_SEND */
1801                 if (sk->sk_type == SOCK_STREAM) {
1802                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1803                                 return 1;
1804                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1805                            prot->sysctl_wmem[0])
1806                                 return 1;
1807         }
1808
1809         if (sk_has_memory_pressure(sk)) {
1810                 int alloc;
1811
1812                 if (!sk_under_memory_pressure(sk))
1813                         return 1;
1814                 alloc = sk_sockets_allocated_read_positive(sk);
1815                 if (sk_prot_mem_limits(sk, 2) > alloc *
1816                     sk_mem_pages(sk->sk_wmem_queued +
1817                                  atomic_read(&sk->sk_rmem_alloc) +
1818                                  sk->sk_forward_alloc))
1819                         return 1;
1820         }
1821
1822 suppress_allocation:
1823
1824         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1825                 sk_stream_moderate_sndbuf(sk);
1826
1827                 /* Fail only if socket is _under_ its sndbuf.
1828                  * In this case we cannot block, so that we have to fail.
1829                  */
1830                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1831                         return 1;
1832         }
1833
1834         trace_sock_exceed_buf_limit(sk, prot, allocated);
1835
1836         /* Alas. Undo changes. */
1837         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1838
1839         sk_memory_allocated_sub(sk, amt);
1840
1841         return 0;
1842 }
1843 EXPORT_SYMBOL(__sk_mem_schedule);
1844
1845 /**
1846  *      __sk_reclaim - reclaim memory_allocated
1847  *      @sk: socket
1848  */
1849 void __sk_mem_reclaim(struct sock *sk)
1850 {
1851         sk_memory_allocated_sub(sk,
1852                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1853         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1854
1855         if (sk_under_memory_pressure(sk) &&
1856             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1857                 sk_leave_memory_pressure(sk);
1858 }
1859 EXPORT_SYMBOL(__sk_mem_reclaim);
1860
1861
1862 /*
1863  * Set of default routines for initialising struct proto_ops when
1864  * the protocol does not support a particular function. In certain
1865  * cases where it makes no sense for a protocol to have a "do nothing"
1866  * function, some default processing is provided.
1867  */
1868
1869 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1870 {
1871         return -EOPNOTSUPP;
1872 }
1873 EXPORT_SYMBOL(sock_no_bind);
1874
1875 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1876                     int len, int flags)
1877 {
1878         return -EOPNOTSUPP;
1879 }
1880 EXPORT_SYMBOL(sock_no_connect);
1881
1882 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1883 {
1884         return -EOPNOTSUPP;
1885 }
1886 EXPORT_SYMBOL(sock_no_socketpair);
1887
1888 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1889 {
1890         return -EOPNOTSUPP;
1891 }
1892 EXPORT_SYMBOL(sock_no_accept);
1893
1894 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1895                     int *len, int peer)
1896 {
1897         return -EOPNOTSUPP;
1898 }
1899 EXPORT_SYMBOL(sock_no_getname);
1900
1901 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1902 {
1903         return 0;
1904 }
1905 EXPORT_SYMBOL(sock_no_poll);
1906
1907 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1908 {
1909         return -EOPNOTSUPP;
1910 }
1911 EXPORT_SYMBOL(sock_no_ioctl);
1912
1913 int sock_no_listen(struct socket *sock, int backlog)
1914 {
1915         return -EOPNOTSUPP;
1916 }
1917 EXPORT_SYMBOL(sock_no_listen);
1918
1919 int sock_no_shutdown(struct socket *sock, int how)
1920 {
1921         return -EOPNOTSUPP;
1922 }
1923 EXPORT_SYMBOL(sock_no_shutdown);
1924
1925 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1926                     char __user *optval, unsigned int optlen)
1927 {
1928         return -EOPNOTSUPP;
1929 }
1930 EXPORT_SYMBOL(sock_no_setsockopt);
1931
1932 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1933                     char __user *optval, int __user *optlen)
1934 {
1935         return -EOPNOTSUPP;
1936 }
1937 EXPORT_SYMBOL(sock_no_getsockopt);
1938
1939 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1940                     size_t len)
1941 {
1942         return -EOPNOTSUPP;
1943 }
1944 EXPORT_SYMBOL(sock_no_sendmsg);
1945
1946 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1947                     size_t len, int flags)
1948 {
1949         return -EOPNOTSUPP;
1950 }
1951 EXPORT_SYMBOL(sock_no_recvmsg);
1952
1953 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1954 {
1955         /* Mirror missing mmap method error code */
1956         return -ENODEV;
1957 }
1958 EXPORT_SYMBOL(sock_no_mmap);
1959
1960 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1961 {
1962         ssize_t res;
1963         struct msghdr msg = {.msg_flags = flags};
1964         struct kvec iov;
1965         char *kaddr = kmap(page);
1966         iov.iov_base = kaddr + offset;
1967         iov.iov_len = size;
1968         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1969         kunmap(page);
1970         return res;
1971 }
1972 EXPORT_SYMBOL(sock_no_sendpage);
1973
1974 /*
1975  *      Default Socket Callbacks
1976  */
1977
1978 static void sock_def_wakeup(struct sock *sk)
1979 {
1980         struct socket_wq *wq;
1981
1982         rcu_read_lock();
1983         wq = rcu_dereference(sk->sk_wq);
1984         if (wq_has_sleeper(wq))
1985                 wake_up_interruptible_all(&wq->wait);
1986         rcu_read_unlock();
1987 }
1988
1989 static void sock_def_error_report(struct sock *sk)
1990 {
1991         struct socket_wq *wq;
1992
1993         rcu_read_lock();
1994         wq = rcu_dereference(sk->sk_wq);
1995         if (wq_has_sleeper(wq))
1996                 wake_up_interruptible_poll(&wq->wait, POLLERR);
1997         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1998         rcu_read_unlock();
1999 }
2000
2001 static void sock_def_readable(struct sock *sk, int len)
2002 {
2003         struct socket_wq *wq;
2004
2005         rcu_read_lock();
2006         wq = rcu_dereference(sk->sk_wq);
2007         if (wq_has_sleeper(wq))
2008                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2009                                                 POLLRDNORM | POLLRDBAND);
2010         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2011         rcu_read_unlock();
2012 }
2013
2014 static void sock_def_write_space(struct sock *sk)
2015 {
2016         struct socket_wq *wq;
2017
2018         rcu_read_lock();
2019
2020         /* Do not wake up a writer until he can make "significant"
2021          * progress.  --DaveM
2022          */
2023         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2024                 wq = rcu_dereference(sk->sk_wq);
2025                 if (wq_has_sleeper(wq))
2026                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2027                                                 POLLWRNORM | POLLWRBAND);
2028
2029                 /* Should agree with poll, otherwise some programs break */
2030                 if (sock_writeable(sk))
2031                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2032         }
2033
2034         rcu_read_unlock();
2035 }
2036
2037 static void sock_def_destruct(struct sock *sk)
2038 {
2039         kfree(sk->sk_protinfo);
2040 }
2041
2042 void sk_send_sigurg(struct sock *sk)
2043 {
2044         if (sk->sk_socket && sk->sk_socket->file)
2045                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2046                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2047 }
2048 EXPORT_SYMBOL(sk_send_sigurg);
2049
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken when
 * mod_timer() returns zero (presumably meaning the timer was not already
 * pending, so each armed timer holds exactly one reference).
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2057
/*
 * Cancel @timer.  If it was pending and del_timer() reports success, the
 * socket reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!timer_pending(timer))
                return;

        if (del_timer(timer))
                __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2064
2065 void sock_init_data(struct socket *sock, struct sock *sk)
2066 {
2067         skb_queue_head_init(&sk->sk_receive_queue);
2068         skb_queue_head_init(&sk->sk_write_queue);
2069         skb_queue_head_init(&sk->sk_error_queue);
2070 #ifdef CONFIG_NET_DMA
2071         skb_queue_head_init(&sk->sk_async_wait_queue);
2072 #endif
2073
2074         sk->sk_send_head        =       NULL;
2075
2076         init_timer(&sk->sk_timer);
2077
2078         sk->sk_allocation       =       GFP_KERNEL;
2079         sk->sk_rcvbuf           =       sysctl_rmem_default;
2080         sk->sk_sndbuf           =       sysctl_wmem_default;
2081         sk->sk_state            =       TCP_CLOSE;
2082         sk_set_socket(sk, sock);
2083
2084         sock_set_flag(sk, SOCK_ZAPPED);
2085
2086         if (sock) {
2087                 sk->sk_type     =       sock->type;
2088                 sk->sk_wq       =       sock->wq;
2089                 sock->sk        =       sk;
2090         } else
2091                 sk->sk_wq       =       NULL;
2092
2093         spin_lock_init(&sk->sk_dst_lock);
2094         rwlock_init(&sk->sk_callback_lock);
2095         lockdep_set_class_and_name(&sk->sk_callback_lock,
2096                         af_callback_keys + sk->sk_family,
2097                         af_family_clock_key_strings[sk->sk_family]);
2098
2099         sk->sk_state_change     =       sock_def_wakeup;
2100         sk->sk_data_ready       =       sock_def_readable;
2101         sk->sk_write_space      =       sock_def_write_space;
2102         sk->sk_error_report     =       sock_def_error_report;
2103         sk->sk_destruct         =       sock_def_destruct;
2104
2105         sk->sk_sndmsg_page      =       NULL;
2106         sk->sk_sndmsg_off       =       0;
2107         sk->sk_peek_off         =       -1;
2108
2109         sk->sk_peer_pid         =       NULL;
2110         sk->sk_peer_cred        =       NULL;
2111         sk->sk_write_pending    =       0;
2112         sk->sk_rcvlowat         =       1;
2113         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2114         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2115
2116         sk->sk_stamp = ktime_set(-1L, 0);
2117
2118         /*
2119          * Before updating sk_refcnt, we must commit prior changes to memory
2120          * (Documentation/RCU/rculist_nulls.txt for details)
2121          */
2122         smp_wmb();
2123         atomic_set(&sk->sk_refcnt, 1);
2124         atomic_set(&sk->sk_drops, 0);
2125 }
2126 EXPORT_SYMBOL(sock_init_data);
2127
2128 void lock_sock_nested(struct sock *sk, int subclass)
2129 {
2130         might_sleep();
2131         spin_lock_bh(&sk->sk_lock.slock);
2132         if (sk->sk_lock.owned)
2133                 __lock_sock(sk);
2134         sk->sk_lock.owned = 1;
2135         spin_unlock(&sk->sk_lock.slock);
2136         /*
2137          * The sk_lock has mutex_lock() semantics here:
2138          */
2139         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2140         local_bh_enable();
2141 }
2142 EXPORT_SYMBOL(lock_sock_nested);
2143
2144 void release_sock(struct sock *sk)
2145 {
2146         /*
2147          * The sk_lock has mutex_unlock() semantics:
2148          */
2149         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2150
2151         spin_lock_bh(&sk->sk_lock.slock);
2152         if (sk->sk_backlog.tail)
2153                 __release_sock(sk);
2154         sk->sk_lock.owned = 0;
2155         if (waitqueue_active(&sk->sk_lock.wq))
2156                 wake_up(&sk->sk_lock.wq);
2157         spin_unlock_bh(&sk->sk_lock.slock);
2158 }
2159 EXPORT_SYMBOL(release_sock);
2160
2161 /**
2162  * lock_sock_fast - fast version of lock_sock
2163  * @sk: socket
2164  *
2165  * This version should be used for very small section, where process wont block
2166  * return false if fast path is taken
2167  *   sk_lock.slock locked, owned = 0, BH disabled
2168  * return true if slow path is taken
2169  *   sk_lock.slock unlocked, owned = 1, BH enabled
2170  */
2171 bool lock_sock_fast(struct sock *sk)
2172 {
2173         might_sleep();
2174         spin_lock_bh(&sk->sk_lock.slock);
2175
2176         if (!sk->sk_lock.owned)
2177                 /*
2178                  * Note : We must disable BH
2179                  */
2180                 return false;
2181
2182         __lock_sock(sk);
2183         sk->sk_lock.owned = 1;
2184         spin_unlock(&sk->sk_lock.slock);
2185         /*
2186          * The sk_lock has mutex_lock() semantics here:
2187          */
2188         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2189         local_bh_enable();
2190         return true;
2191 }
2192 EXPORT_SYMBOL(lock_sock_fast);
2193
2194 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2195 {
2196         struct timeval tv;
2197         if (!sock_flag(sk, SOCK_TIMESTAMP))
2198                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2199         tv = ktime_to_timeval(sk->sk_stamp);
2200         if (tv.tv_sec == -1)
2201                 return -ENOENT;
2202         if (tv.tv_sec == 0) {
2203                 sk->sk_stamp = ktime_get_real();
2204                 tv = ktime_to_timeval(sk->sk_stamp);
2205         }
2206         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2207 }
2208 EXPORT_SYMBOL(sock_get_timestamp);
2209
2210 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2211 {
2212         struct timespec ts;
2213         if (!sock_flag(sk, SOCK_TIMESTAMP))
2214                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2215         ts = ktime_to_timespec(sk->sk_stamp);
2216         if (ts.tv_sec == -1)
2217                 return -ENOENT;
2218         if (ts.tv_sec == 0) {
2219                 sk->sk_stamp = ktime_get_real();
2220                 ts = ktime_to_timespec(sk->sk_stamp);
2221         }
2222         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2223 }
2224 EXPORT_SYMBOL(sock_get_timestampns);
2225
2226 void sock_enable_timestamp(struct sock *sk, int flag)
2227 {
2228         if (!sock_flag(sk, flag)) {
2229                 unsigned long previous_flags = sk->sk_flags;
2230
2231                 sock_set_flag(sk, flag);
2232                 /*
2233                  * we just set one of the two flags which require net
2234                  * time stamping, but time stamping might have been on
2235                  * already because of the other one
2236                  */
2237                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2238                         net_enable_timestamp();
2239         }
2240 }
2241
2242 /*
2243  *      Get a socket option on an socket.
2244  *
2245  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2246  *      asynchronous errors should be reported by getsockopt. We assume
2247  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2248  */
2249 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2250                            char __user *optval, int __user *optlen)
2251 {
2252         struct sock *sk = sock->sk;
2253
2254         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2255 }
2256 EXPORT_SYMBOL(sock_common_getsockopt);
2257
2258 #ifdef CONFIG_COMPAT
2259 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2260                                   char __user *optval, int __user *optlen)
2261 {
2262         struct sock *sk = sock->sk;
2263
2264         if (sk->sk_prot->compat_getsockopt != NULL)
2265                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2266                                                       optval, optlen);
2267         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2268 }
2269 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2270 #endif
2271
2272 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2273                         struct msghdr *msg, size_t size, int flags)
2274 {
2275         struct sock *sk = sock->sk;
2276         int addr_len = 0;
2277         int err;
2278
2279         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2280                                    flags & ~MSG_DONTWAIT, &addr_len);
2281         if (err >= 0)
2282                 msg->msg_namelen = addr_len;
2283         return err;
2284 }
2285 EXPORT_SYMBOL(sock_common_recvmsg);
2286
2287 /*
2288  *      Set socket options on an inet socket.
2289  */
2290 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2291                            char __user *optval, unsigned int optlen)
2292 {
2293         struct sock *sk = sock->sk;
2294
2295         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2296 }
2297 EXPORT_SYMBOL(sock_common_setsockopt);
2298
2299 #ifdef CONFIG_COMPAT
2300 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2301                                   char __user *optval, unsigned int optlen)
2302 {
2303         struct sock *sk = sock->sk;
2304
2305         if (sk->sk_prot->compat_setsockopt != NULL)
2306                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2307                                                       optval, optlen);
2308         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2309 }
2310 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2311 #endif
2312
2313 void sk_common_release(struct sock *sk)
2314 {
2315         if (sk->sk_prot->destroy)
2316                 sk->sk_prot->destroy(sk);
2317
2318         /*
2319          * Observation: when sock_common_release is called, processes have
2320          * no access to socket. But net still has.
2321          * Step one, detach it from networking:
2322          *
2323          * A. Remove from hash tables.
2324          */
2325
2326         sk->sk_prot->unhash(sk);
2327
2328         /*
2329          * In this point socket cannot receive new packets, but it is possible
2330          * that some packets are in flight because some CPU runs receiver and
2331          * did hash table lookup before we unhashed socket. They will achieve
2332          * receive queue and will be purged by socket destructor.
2333          *
2334          * Also we still have packets pending on receive queue and probably,
2335          * our own packets waiting in device queues. sock_destroy will drain
2336          * receive queue, but transmitted packets will delay socket destruction
2337          * until the last reference will be released.
2338          */
2339
2340         sock_orphan(sk);
2341
2342         xfrm_sk_free_policy(sk);
2343
2344         sk_refcnt_debug_release(sk);
2345         sock_put(sk);
2346 }
2347 EXPORT_SYMBOL(sk_common_release);
2348
2349 #ifdef CONFIG_PROC_FS
2350 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2351 struct prot_inuse {
2352         int val[PROTO_INUSE_NR];
2353 };
2354
2355 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2356
2357 #ifdef CONFIG_NET_NS
2358 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2359 {
2360         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2361 }
2362 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2363
2364 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2365 {
2366         int cpu, idx = prot->inuse_idx;
2367         int res = 0;
2368
2369         for_each_possible_cpu(cpu)
2370                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2371
2372         return res >= 0 ? res : 0;
2373 }
2374 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2375
2376 static int __net_init sock_inuse_init_net(struct net *net)
2377 {
2378         net->core.inuse = alloc_percpu(struct prot_inuse);
2379         return net->core.inuse ? 0 : -ENOMEM;
2380 }
2381
2382 static void __net_exit sock_inuse_exit_net(struct net *net)
2383 {
2384         free_percpu(net->core.inuse);
2385 }
2386
2387 static struct pernet_operations net_inuse_ops = {
2388         .init = sock_inuse_init_net,
2389         .exit = sock_inuse_exit_net,
2390 };
2391
2392 static __init int net_inuse_init(void)
2393 {
2394         if (register_pernet_subsys(&net_inuse_ops))
2395                 panic("Cannot initialize net inuse counters");
2396
2397         return 0;
2398 }
2399
2400 core_initcall(net_inuse_init);
2401 #else
2402 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2403
2404 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2405 {
2406         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2407 }
2408 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2409
2410 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2411 {
2412         int cpu, idx = prot->inuse_idx;
2413         int res = 0;
2414
2415         for_each_possible_cpu(cpu)
2416                 res += per_cpu(prot_inuse, cpu).val[idx];
2417
2418         return res >= 0 ? res : 0;
2419 }
2420 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2421 #endif
2422
2423 static void assign_proto_idx(struct proto *prot)
2424 {
2425         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2426
2427         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2428                 pr_err("PROTO_INUSE_NR exhausted\n");
2429                 return;
2430         }
2431
2432         set_bit(prot->inuse_idx, proto_inuse_idx);
2433 }
2434
2435 static void release_proto_idx(struct proto *prot)
2436 {
2437         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2438                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2439 }
2440 #else
/* No in-use counters without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2444
/* No in-use counters without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
2448 #endif
2449
2450 int proto_register(struct proto *prot, int alloc_slab)
2451 {
2452         if (alloc_slab) {
2453                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2454                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2455                                         NULL);
2456
2457                 if (prot->slab == NULL) {
2458                         pr_crit("%s: Can't create sock SLAB cache!\n",
2459                                 prot->name);
2460                         goto out;
2461                 }
2462
2463                 if (prot->rsk_prot != NULL) {
2464                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2465                         if (prot->rsk_prot->slab_name == NULL)
2466                                 goto out_free_sock_slab;
2467
2468                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2469                                                                  prot->rsk_prot->obj_size, 0,
2470                                                                  SLAB_HWCACHE_ALIGN, NULL);
2471
2472                         if (prot->rsk_prot->slab == NULL) {
2473                                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2474                                         prot->name);
2475                                 goto out_free_request_sock_slab_name;
2476                         }
2477                 }
2478
2479                 if (prot->twsk_prot != NULL) {
2480                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2481
2482                         if (prot->twsk_prot->twsk_slab_name == NULL)
2483                                 goto out_free_request_sock_slab;
2484
2485                         prot->twsk_prot->twsk_slab =
2486                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2487                                                   prot->twsk_prot->twsk_obj_size,
2488                                                   0,
2489                                                   SLAB_HWCACHE_ALIGN |
2490                                                         prot->slab_flags,
2491                                                   NULL);
2492                         if (prot->twsk_prot->twsk_slab == NULL)
2493                                 goto out_free_timewait_sock_slab_name;
2494                 }
2495         }
2496
2497         mutex_lock(&proto_list_mutex);
2498         list_add(&prot->node, &proto_list);
2499         assign_proto_idx(prot);
2500         mutex_unlock(&proto_list_mutex);
2501         return 0;
2502
2503 out_free_timewait_sock_slab_name:
2504         kfree(prot->twsk_prot->twsk_slab_name);
2505 out_free_request_sock_slab:
2506         if (prot->rsk_prot && prot->rsk_prot->slab) {
2507                 kmem_cache_destroy(prot->rsk_prot->slab);
2508                 prot->rsk_prot->slab = NULL;
2509         }
2510 out_free_request_sock_slab_name:
2511         if (prot->rsk_prot)
2512                 kfree(prot->rsk_prot->slab_name);
2513 out_free_sock_slab:
2514         kmem_cache_destroy(prot->slab);
2515         prot->slab = NULL;
2516 out:
2517         return -ENOBUFS;
2518 }
2519 EXPORT_SYMBOL(proto_register);
2520
2521 void proto_unregister(struct proto *prot)
2522 {
2523         mutex_lock(&proto_list_mutex);
2524         release_proto_idx(prot);
2525         list_del(&prot->node);
2526         mutex_unlock(&proto_list_mutex);
2527
2528         if (prot->slab != NULL) {
2529                 kmem_cache_destroy(prot->slab);
2530                 prot->slab = NULL;
2531         }
2532
2533         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2534                 kmem_cache_destroy(prot->rsk_prot->slab);
2535                 kfree(prot->rsk_prot->slab_name);
2536                 prot->rsk_prot->slab = NULL;
2537         }
2538
2539         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2540                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2541                 kfree(prot->twsk_prot->twsk_slab_name);
2542                 prot->twsk_prot->twsk_slab = NULL;
2543         }
2544 }
2545 EXPORT_SYMBOL(proto_unregister);
2546
2547 #ifdef CONFIG_PROC_FS
2548 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2549         __acquires(proto_list_mutex)
2550 {
2551         mutex_lock(&proto_list_mutex);
2552         return seq_list_start_head(&proto_list, *pos);
2553 }
2554
2555 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2556 {
2557         return seq_list_next(v, &proto_list, pos);
2558 }
2559
2560 static void proto_seq_stop(struct seq_file *seq, void *v)
2561         __releases(proto_list_mutex)
2562 {
2563         mutex_unlock(&proto_list_mutex);
2564 }
2565
/* Return 'y' when @method points at something, 'n' when it is NULL. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2570 static long sock_prot_memory_allocated(struct proto *proto)
2571 {
2572         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2573 }
2574
2575 static char *sock_prot_memory_pressure(struct proto *proto)
2576 {
2577         return proto->memory_pressure != NULL ?
2578         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2579 }
2580
2581 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2582 {
2583
2584         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2585                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2586                    proto->name,
2587                    proto->obj_size,
2588                    sock_prot_inuse_get(seq_file_net(seq), proto),
2589                    sock_prot_memory_allocated(proto),
2590                    sock_prot_memory_pressure(proto),
2591                    proto->max_header,
2592                    proto->slab == NULL ? "no" : "yes",
2593                    module_name(proto->owner),
2594                    proto_method_implemented(proto->close),
2595                    proto_method_implemented(proto->connect),
2596                    proto_method_implemented(proto->disconnect),
2597                    proto_method_implemented(proto->accept),
2598                    proto_method_implemented(proto->ioctl),
2599                    proto_method_implemented(proto->init),
2600                    proto_method_implemented(proto->destroy),
2601                    proto_method_implemented(proto->shutdown),
2602                    proto_method_implemented(proto->setsockopt),
2603                    proto_method_implemented(proto->getsockopt),
2604                    proto_method_implemented(proto->sendmsg),
2605                    proto_method_implemented(proto->recvmsg),
2606                    proto_method_implemented(proto->sendpage),
2607                    proto_method_implemented(proto->bind),
2608                    proto_method_implemented(proto->backlog_rcv),
2609                    proto_method_implemented(proto->hash),
2610                    proto_method_implemented(proto->unhash),
2611                    proto_method_implemented(proto->get_port),
2612                    proto_method_implemented(proto->enter_memory_pressure));
2613 }
2614
2615 static int proto_seq_show(struct seq_file *seq, void *v)
2616 {
2617         if (v == &proto_list)
2618                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2619                            "protocol",
2620                            "size",
2621                            "sockets",
2622                            "memory",
2623                            "press",
2624                            "maxhdr",
2625                            "slab",
2626                            "module",
2627                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2628         else
2629                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2630         return 0;
2631 }
2632
2633 static const struct seq_operations proto_seq_ops = {
2634         .start  = proto_seq_start,
2635         .next   = proto_seq_next,
2636         .stop   = proto_seq_stop,
2637         .show   = proto_seq_show,
2638 };
2639
2640 static int proto_seq_open(struct inode *inode, struct file *file)
2641 {
2642         return seq_open_net(inode, file, &proto_seq_ops,
2643                             sizeof(struct seq_net_private));
2644 }
2645
2646 static const struct file_operations proto_seq_fops = {
2647         .owner          = THIS_MODULE,
2648         .open           = proto_seq_open,
2649         .read           = seq_read,
2650         .llseek         = seq_lseek,
2651         .release        = seq_release_net,
2652 };
2653
2654 static __net_init int proto_init_net(struct net *net)
2655 {
2656         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2657                 return -ENOMEM;
2658
2659         return 0;
2660 }
2661
2662 static __net_exit void proto_exit_net(struct net *net)
2663 {
2664         proc_net_remove(net, "protocols");
2665 }
2666
2667
2668 static __net_initdata struct pernet_operations proto_net_ops = {
2669         .init = proto_init_net,
2670         .exit = proto_exit_net,
2671 };
2672
2673 static int __init proto_init(void)
2674 {
2675         return register_pernet_subsys(&proto_net_ops);
2676 }
2677
2678 subsys_initcall(proto_init);
2679
2680 #endif /* PROC_FS */