/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(cgrp, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

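/*
 * Illustrative sketch (not part of this file): the conversion above rounds
 * the microsecond part up to whole jiffies.  Assuming HZ == 100, a userspace
 * request of 2.5 seconds, e.g.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * yields *timeo_p = 2*100 + (500000 + 9999)/10000 = 250 jiffies.
 */
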
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

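/*
 * Illustrative sketch (hypothetical caller, not from this file): a datagram
 * protocol's rcv path typically hands the skb over and frees it itself on
 * failure, since sock_queue_rcv_skb() does not consume the skb on error:
 *
 *	static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */
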
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
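
/*
 * Illustrative sketch (hypothetical userspace caller, not from this file):
 * because SO_RCVBUF doubles the requested value to cover sk_buff overhead,
 * reading the option back returns twice what was set, subject to
 * sysctl_rmem_max and SOCK_MIN_RCVBUF:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out == 131072, provided 65536 <= sysctl_rmem_max
 */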


void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = !!sock_flag(sk, SOCK_NOFCS);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 * nodes un-modified. Special care is taken when initializing object to zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	rcu_read_lock();  /* doing current task, which cannot vanish. */
	classid = task_cls_classid(current);
	rcu_read_unlock();
	if (classid && classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);

void sock_update_netprioidx(struct sock *sk)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(current);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
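
/*
 * Illustrative sketch (hypothetical protocol, not from this file): a
 * family's ->create() handler typically pairs sk_alloc() with
 * sock_init_data(), and releases the sock with sk_free() on error paths:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */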

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Last sock_put should drop reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking reference to stopping namespace
 * is not an option.
 * Take reference to a socket to remove it from hash _alive_ and after that
 * destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
		sock_update_memcg(newsk);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		sk_update_clone(sk, newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
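
/*
 * Illustrative sketch (hypothetical caller, not from this file): per the
 * kernel-doc above, the clone is returned locked by bh_lock_sock(), so the
 * caller must unlock it on every path:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		// ... protocol-specific setup ...
 *		bh_unlock_sock(newsk);
 *	}
 */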

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (totalram_pages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (totalram_pages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);
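
/*
 * Illustrative sketch (hypothetical caller, not from this file): option
 * memory must be released with the same size it was charged with, so the
 * sk_omem_alloc accounting balances:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */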

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					__skb_fill_page_desc(skb, i,
							page, 0,
							(data_len >= PAGE_SIZE ?
							 PAGE_SIZE :
							 data_len));
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
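
/*
 * Illustrative sketch (hypothetical caller, not from this file): a datagram
 * protocol's ->sendmsg() would typically allocate its buffer like this,
 * assuming "hlen" stands for its header size:
 *
 *	skb = sock_alloc_send_skb(sk, len + hlen,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 */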

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

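/*
 * Illustrative sketch (hypothetical caller, not from this file): a stream
 * protocol's ->recvmsg() typically loops on sk_wait_data() under the socket
 * lock until data arrives, the timeout expires or a signal fires:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */
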
3ab224be
HA
1754/**
1755 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1756 * @sk: socket
1757 * @size: memory size to allocate
1758 * @kind: allocation type
1759 *
1760 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1761 * rmem allocation. This function assumes that protocols which have
1762 * memory_pressure use sk_wmem_queued as write buffer accounting.
1763 */
1764int __sk_mem_schedule(struct sock *sk, int size, int kind)
1765{
1766 struct proto *prot = sk->sk_prot;
1767 int amt = sk_mem_pages(size);
8d987e5c 1768 long allocated;
e1aab161 1769 int parent_status = UNDER_LIMIT;
3ab224be
HA
1770
1771 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
180d8cd9 1772
e1aab161 1773 allocated = sk_memory_allocated_add(sk, amt, &parent_status);
3ab224be
HA
1774
1775 /* Under limit. */
e1aab161
GC
1776 if (parent_status == UNDER_LIMIT &&
1777 allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 1778 sk_leave_memory_pressure(sk);
3ab224be
HA
1779 return 1;
1780 }
1781
e1aab161
GC
1782 /* Under pressure. (we or our parents) */
1783 if ((parent_status > SOFT_LIMIT) ||
1784 allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 1785 sk_enter_memory_pressure(sk);
3ab224be 1786
e1aab161
GC
1787 /* Over hard limit (we or our parents) */
1788 if ((parent_status == OVER_LIMIT) ||
1789 (allocated > sk_prot_mem_limits(sk, 2)))
3ab224be
HA
1790 goto suppress_allocation;
1791
1792 /* guarantee minimum buffer size under pressure */
1793 if (kind == SK_MEM_RECV) {
1794 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1795 return 1;
180d8cd9 1796
3ab224be
HA
1797 } else { /* SK_MEM_SEND */
1798 if (sk->sk_type == SOCK_STREAM) {
1799 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1800 return 1;
1801 } else if (atomic_read(&sk->sk_wmem_alloc) <
1802 prot->sysctl_wmem[0])
1803 return 1;
1804 }
1805
180d8cd9 1806 if (sk_has_memory_pressure(sk)) {
1748376b
ED
1807 int alloc;
1808
180d8cd9 1809 if (!sk_under_memory_pressure(sk))
1748376b 1810 return 1;
180d8cd9
GC
1811 alloc = sk_sockets_allocated_read_positive(sk);
1812 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
1813 sk_mem_pages(sk->sk_wmem_queued +
1814 atomic_read(&sk->sk_rmem_alloc) +
1815 sk->sk_forward_alloc))
1816 return 1;
1817 }
1818
1819suppress_allocation:
1820
1821 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1822 sk_stream_moderate_sndbuf(sk);
1823
1824 /* Fail only if socket is _under_ its sndbuf.
1825 * In this case we cannot block, so that we have to fail.
1826 */
1827 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1828 return 1;
1829 }
1830
3847ce32
SM
1831 trace_sock_exceed_buf_limit(sk, prot, allocated);
1832
3ab224be
HA
1833 /* Alas. Undo changes. */
1834 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
180d8cd9 1835
0e90b31f 1836 sk_memory_allocated_sub(sk, amt);
180d8cd9 1837
3ab224be
HA
1838 return 0;
1839}
3ab224be
HA
1840EXPORT_SYMBOL(__sk_mem_schedule);
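
/*
 * Illustrative sketch, not part of this file: a hypothetical receive
 * path charges an skb against the socket before queueing it. The
 * sk_rmem_schedule()/skb_set_owner_r() wrappers around this accounting
 * live in include/net/sock.h; their exact signatures vary by kernel
 * version, so treat the calls below as an assumption.
 */
static int example_charge_and_queue(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >
	    (unsigned int)sk->sk_rcvbuf)
		return -ENOMEM;			/* over receive buffer */
	if (!sk_rmem_schedule(sk, skb->truesize))
		return -ENOBUFS;		/* global accounting refused */
	skb_set_owner_r(skb, sk);		/* charges rmem/forward_alloc */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, skb->len);	/* wake readers */
	return 0;
}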

/**
 * __sk_mem_reclaim - reclaim memory_allocated
 * @sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	sk_memory_allocated_sub(sk,
				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
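
/*
 * For reference, the usual entry point is the sk_mem_reclaim() inline in
 * include/net/sock.h, which (roughly, in this kernel generation) only
 * drops into __sk_mem_reclaim() once at least a whole quantum is cached:
 *
 *	static inline void sk_mem_reclaim(struct sock *sk)
 *	{
 *		if (!sk_has_account(sk))
 *			return;
 *		if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
 *			__sk_mem_reclaim(sk);
 *	}
 */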

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
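
/*
 * Illustrative sketch, not part of this file: a hypothetical proto_ops
 * table can plug the sock_no_* stubs into every operation the protocol
 * does not support. A real table must also set .family, .release and
 * the operations it does implement.
 */
static const struct proto_ops example_proto_ops = {
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= sock_no_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};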

/*
 * Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
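
/*
 * Illustrative sketch, not part of this file: sk_reset_timer() takes a
 * socket reference only when the timer was not already pending, and
 * sk_stop_timer() drops it only when a pending timer is actually
 * deleted. A hypothetical protocol timer therefore balances like this
 * (the old timer_list .data convention is assumed here):
 */
static void example_arm_timer(struct sock *sk, unsigned long delay)
{
	/* holds a reference via sock_hold() unless already pending */
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ... protocol work ... */
	sock_put(sk);	/* release the ref taken by sk_reset_timer() */
}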

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
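
/*
 * Illustrative sketch, not part of this file: a hypothetical protocol's
 * init path calls sock_init_data() and may then override the default
 * callbacks installed above (here only sk_data_ready, keeping the
 * sock_def_* defaults for everything else):
 */
static void example_data_ready(struct sock *sk, int len)
{
	/* protocol-specific wakeup; many in-tree users chain to a saved
	 * copy of the original callback here */
}

static void example_init_sock(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);		/* installs sock_def_* defaults */
	sk->sk_data_ready = example_data_ready;	/* then override as needed */
}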

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block. It returns false if the fast path was taken:
 * sk_lock.slock locked, owned = 0, BH disabled.
 * It returns true if the slow path was taken:
 * sk_lock.slock unlocked, owned = 1, BH enabled.
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
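
/*
 * Illustrative sketch, not part of this file: callers pair
 * lock_sock_fast() with unlock_sock_fast() (include/net/sock.h), passing
 * the return value back so the matching unlock takes the right path.
 * The function below is hypothetical.
 */
static int example_peek_len(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int len = skb_queue_empty(&sk->sk_receive_queue) ?
		  0 : skb_peek(&sk->sk_receive_queue)->len;

	unlock_sock_fast(sk, slow);
	return len;
}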

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
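
/*
 * Illustrative sketch, not part of this file: protocols typically expose
 * these helpers through their ioctl handler for SIOCGSTAMP and
 * SIOCGSTAMPNS (compare inet_ioctl()); the function below is
 * hypothetical.
 */
static int example_stamp_ioctl(struct sock *sk, unsigned int cmd,
			       unsigned long arg)
{
	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}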

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
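
/*
 * Illustrative sketch, not part of this file: address-family proto_ops
 * tables usually route the generic option and receive paths through the
 * sock_common_* helpers so the struct proto callbacks do the real work
 * (compare inet_stream_ops). The table below is hypothetical and
 * deliberately partial.
 */
static const struct proto_ops example_stream_ops = {
	.owner		= THIS_MODULE,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.recvmsg	= sock_common_recvmsg,
	/* .family, .release, .sendmsg, etc. supplied by the protocol */
};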

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network stack still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
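
/*
 * Illustrative sketch, not part of this file: protocols bump these
 * counters as sockets enter and leave their lookup tables, e.g. from
 * hypothetical hash/unhash callbacks (compare the inet hash code):
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the protocol's lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}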

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
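
/*
 * Illustrative sketch, not part of this file: a hypothetical protocol
 * module registers its struct proto (with a dedicated slab) on load and
 * unregisters it on unload. Only .name and .obj_size matter for the
 * slab; everything named example_* below is the module's own code.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* normally a larger private sock */
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);	/* 1 => allocate a slab */
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}

module_init(example_init);
module_exit(example_exit);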

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */