remove libdss from Makefile
[GitHub/moto-9609/android_kernel_motorola_exynos9610.git] / net / core / sock.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in the user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in all user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap over the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

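The three helpers above answer "did the socket's opener have this capability, and does the current task still have it?". A minimal, hypothetical sketch of how a protocol might gate a privileged per-socket knob on them (the handler name and the choice of option are illustrative, not part of sock.c):

/* Illustrative only: gate a privileged option on sk_net_capable(). */
static int myproto_set_priv_option(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;	/* opener and current task must both qualify */
	sk->sk_priority = val;
	return 0;
}
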
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x) \
  x "AF_UNSPEC",	x "AF_UNIX",		x "AF_INET", \
  x "AF_AX25",		x "AF_IPX",		x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE",		x "AF_ATMPVC", \
  x "AF_X25",		x "AF_INET6",		x "AF_ROSE", \
  x "AF_DECnet",	x "AF_NETBEUI",		x "AF_SECURITY", \
  x "AF_KEY",		x "AF_NETLINK",		x "AF_PACKET", \
  x "AF_ASH",		x "AF_ECONET",		x "AF_ATMSVC", \
  x "AF_RDS",		x "AF_SNA",		x "AF_IRDA", \
  x "AF_PPPOX",		x "AF_WANPIPE",		x "AF_LLC", \
  x "27",		x "28",			x "AF_CAN", \
  x "AF_TIPC",		x "AF_BLUETOOTH",	x "IUCV", \
  x "AF_RXRPC",		x "AF_ISDN",		x "AF_PHONET", \
  x "AF_IEEE802154",	x "AF_CAIF",		x "AF_ALG", \
  x "AF_NFC",		x "AF_VSOCK",		x "AF_KCM", \
  x "AF_QIPCRTR",	x "AF_SMC",		x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	"rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
	"rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
	"rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
	"rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
	"rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
	"rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
	"rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
	"rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
	"rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
	"rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
	"rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
	"rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
	"rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
	"rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
	"rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	"wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
	"wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
	"wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
	"wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
	"wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
	"wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
	"wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
	"wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
	"wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
	"wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
	"wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
	"wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
	"wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
	"wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
	"wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	"elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
	"elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
	"elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
	"elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
	"elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
	"elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
	"elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
	"elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
	"elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
	"elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
	"elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
	"elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
	"elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
	"elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
	"elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
};

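For illustration, each use of the macro expands to the full list of AF_* names with the given prefix glued on at compile time, so lockdep reports readable class names such as "sk_lock-AF_INET" with no runtime string work. A hypothetical expansion (not part of this file):

/* Illustrative only: _sock_locks("demo-") expands to the initializer
 *   "demo-AF_UNSPEC", "demo-AF_UNIX", ..., "demo-AF_MAX"
 */
static const char *const demo_key_strings[AF_MAX+1] = {
	_sock_locks("demo-")
};
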
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

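Typical users of this pair are kernel storage transports that must keep making progress under memory pressure (swap over a network block device or NFS, for instance). A hedged sketch of the expected call pattern; the transport functions here are hypothetical:

/* Hypothetical transport: use emergency reserves only while backing swap. */
static void my_transport_start_swap(struct sock *sk)
{
	sk_set_memalloc(sk);	/* rx/tx may now dip into reserves */
}

static void my_transport_stop_swap(struct sock *sk)
{
	sk_clear_memalloc(sk);	/* reclaim reserves, obey rmem limits again */
}
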
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

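Userspace reaches sock_set_timeout() through SO_RCVTIMEO/SO_SNDTIMEO with a struct timeval. A minimal userspace sketch (error handling elided, helper name mine):

/* Userspace sketch: set a 2.5 second receive timeout. */
#include <sys/socket.h>
#include <sys/time.h>

static int set_rcv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}

Per the code above, a zero timeval means "block indefinitely", a negative tv_sec is clamped to a zero timeout (with the rate-limited warning), and an out-of-range tv_usec yields -EDOM.
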
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the RCU-protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

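A datagram protocol's delivery path typically hands the skb to sock_queue_rcv_skb() and counts a drop on failure, since the callee does not free the skb on error. A hypothetical sketch of such a caller:

/* Hypothetical protocol rx handler feeding the socket receive queue. */
static int myproto_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sock_queue_rcv_skb(sk, skb) < 0) {
		kfree_skb(skb);	/* -ENOMEM, -ENOBUFS or a filter verdict */
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}
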
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

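This pair backs SO_BINDTODEVICE from userspace: binding requires CAP_NET_RAW in the socket's network namespace, and an empty name unbinds. A minimal userspace sketch (helper name mine):

/* Userspace sketch: bind a socket to "eth0" (needs CAP_NET_RAW). */
#include <string.h>
#include <sys/socket.h>

static int bind_to_eth0(int fd)
{
	static const char ifname[] = "eth0";

	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname));
}
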
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
			ret = -ENOTSUPP;
		else if (sk->sk_protocol != IPPROTO_TCP)
			ret = -ENOTSUPP;
		else if (sk->sk_state != TCP_CLOSE)
			ret = -EBUSY;
		else if (val < 0 || val > 1)
			ret = -EINVAL;
		else
			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
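One consequence worth seeing from userspace: the SO_SNDBUF/SO_RCVBUF request is first capped at sysctl_wmem_max/sysctl_rmem_max and then doubled to cover struct sk_buff overhead, so getsockopt() reads back roughly twice the requested value. A quick userspace sketch:

/* Userspace sketch: observe the RCVBUF doubling described above. */
#include <stdio.h>
#include <sys/socket.h>

static void show_rcvbuf(int fd)
{
	int req = 128 * 1024, eff = 0;
	socklen_t len = sizeof(eff);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
	printf("requested %d, effective %d\n", req, eff);	/* eff ~= 2 * req */
}
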


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

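As one concrete reader of this path, SO_PEERCRED on a connected AF_UNIX socket copies out the peer's pid/uid/gid via cred_to_ucred() above. A userspace sketch (helper name mine):

/* Userspace sketch: query peer credentials on a unix-domain socket. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/socket.h>

static int print_peer_cred(int unix_fd)
{
	struct ucred cred;
	socklen_t len = sizeof(cred);

	if (getsockopt(unix_fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) < 0)
		return -1;
	printf("peer pid=%d uid=%u gid=%u\n", cred.pid, cred.uid, cred.gid);
	return 0;
}
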
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt))
			get_net(net);
		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		newsk->sk_prot_creator = sk->sk_prot;

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		refcount_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache = NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
		atomic_set(&newsk->sk_zckey, 0);

		sock_reset_flag(newsk, SOCK_DONE);
		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		rcu_read_lock();
		filter = rcu_dereference(sk->sk_filter);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);
		RCU_INIT_POINTER(newsk->sk_filter, filter);
		rcu_read_unlock();

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

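The "caller must unlock" contract above means protocol code that fails after a successful clone still has to drop the BH lock; sk_free_unlock_clone() below exists for exactly that case. A hypothetical caller:

/* Hypothetical caller: sk_clone_lock() returns the clone locked. */
static struct sock *myproto_clone(const struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;	/* nothing is locked on allocation failure */
	/* ... protocol-specific fixups on the locked clone ... */
	bh_unlock_sock(newsk);
	return newsk;
}
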
94352d45
ACM
1752void sk_free_unlock_clone(struct sock *sk)
1753{
1754 /* It is still raw copy of parent, so invalidate
1755 * destructor and make plain sk_free() */
1756 sk->sk_destruct = NULL;
1757 bh_unlock_sock(sk);
1758 sk_free(sk);
1759}
1760EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1761
9958089a
AK
1762void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1763{
d6a4e26a
ED
1764 u32 max_segs = 1;
1765
6bd4f355 1766 sk_dst_set(sk, dst);
9958089a
AK
1767 sk->sk_route_caps = dst->dev->features;
1768 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1769 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1770 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1771 if (sk_can_gso(sk)) {
f70f250a 1772 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 1773 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1774 } else {
9958089a 1775 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a 1776 sk->sk_gso_max_size = dst->dev->gso_max_size;
d6a4e26a 1777 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
82cc1a7a 1778 }
9958089a 1779 }
d6a4e26a 1780 sk->sk_gso_max_segs = max_segs;
9958089a
AK
1781}
1782EXPORT_SYMBOL_GPL(sk_setup_caps);
1783
1da177e4
LT
1784/*
1785 * Simple resource managers for sockets.
1786 */
1787
1788
4ec93edb
YH
1789/*
1790 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1791 */
1792void sock_wfree(struct sk_buff *skb)
1793{
1794 struct sock *sk = skb->sk;
d99927f4 1795 unsigned int len = skb->truesize;
1da177e4 1796
d99927f4
ED
1797 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1798 /*
1799 * Keep a reference on sk_wmem_alloc, this will be released
1800 * after sk_write_space() call
1801 */
14afee4b 1802 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 1803 sk->sk_write_space(sk);
d99927f4
ED
1804 len = 1;
1805 }
2b85a34e 1806 /*
d99927f4
ED
1807 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1808 * could not do because of in-flight packets
2b85a34e 1809 */
14afee4b 1810 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1811 __sk_free(sk);
1da177e4 1812}
2a91525c 1813EXPORT_SYMBOL(sock_wfree);
1da177e4 1814
1d2077ac
ED
1815/* This variant of sock_wfree() is used by TCP,
1816 * since it sets SOCK_USE_WRITE_QUEUE.
1817 */
1818void __sock_wfree(struct sk_buff *skb)
1819{
1820 struct sock *sk = skb->sk;
1821
14afee4b 1822 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1823 __sk_free(sk);
1824}
1825
9e17f8a4
ED
1826void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1827{
1828 skb_orphan(skb);
1829 skb->sk = sk;
1830#ifdef CONFIG_INET
1831 if (unlikely(!sk_fullsock(sk))) {
1832 skb->destructor = sock_edemux;
1833 sock_hold(sk);
1834 return;
1835 }
1836#endif
1837 skb->destructor = sock_wfree;
1838 skb_set_hash_from_sk(skb, sk);
1839 /*
1840 * We used to take a refcount on sk, but the following operation
1841 * is enough to guarantee sk_free() won't free this sock until
1842 * all in-flight packets are completed
1843 */
14afee4b 1844 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1845}
1846EXPORT_SYMBOL(skb_set_owner_w);
1847
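/* Editorial example, not part of the original file: charging a new skb
 * to the socket's write budget. After skb_set_owner_w(), kfree_skb()
 * ends up in sock_wfree() above and returns skb->truesize to
 * sk_wmem_alloc. my_proto_alloc_tx_skb() is a hypothetical helper.
 */
static struct sk_buff *my_proto_alloc_tx_skb(struct sock *sk,
					     unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len, sk->sk_allocation);

	if (skb)
		skb_set_owner_w(skb, sk); /* adds truesize to sk_wmem_alloc */
	return skb;
}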
1d2077ac
ED
1848/* This helper is used by netem, as it can hold packets in its
1849 * delay queue. We want to allow the owner socket to send more
1850 * packets, as if they were already TX completed by a typical driver.
1851 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1852 * rely on it (sch_fq for example).
1d2077ac 1853 */
f2f872f9
ED
1854void skb_orphan_partial(struct sk_buff *skb)
1855{
f6ba8d33 1856 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
1857 return;
1858
f2f872f9
ED
1859 if (skb->destructor == sock_wfree
1860#ifdef CONFIG_INET
1861 || skb->destructor == tcp_wfree
1862#endif
1863 ) {
f6ba8d33
ED
1864 struct sock *sk = skb->sk;
1865
41c6d650 1866 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 1867 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
1868 skb->destructor = sock_efree;
1869 }
f2f872f9
ED
1870 } else {
1871 skb_orphan(skb);
1872 }
1873}
1874EXPORT_SYMBOL(skb_orphan_partial);
1875
4ec93edb
YH
1876/*
1877 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
1878 */
1879void sock_rfree(struct sk_buff *skb)
1880{
1881 struct sock *sk = skb->sk;
d361fd59 1882 unsigned int len = skb->truesize;
1da177e4 1883
d361fd59
ED
1884 atomic_sub(len, &sk->sk_rmem_alloc);
1885 sk_mem_uncharge(sk, len);
1da177e4 1886}
2a91525c 1887EXPORT_SYMBOL(sock_rfree);
1da177e4 1888
7768eed8
OH
1889/*
1890 * Buffer destructor for skbs that are not used directly in read or write
1891 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1892 */
62bccb8c
AD
1893void sock_efree(struct sk_buff *skb)
1894{
1895 sock_put(skb->sk);
1896}
1897EXPORT_SYMBOL(sock_efree);
1898
976d0201 1899kuid_t sock_i_uid(struct sock *sk)
1da177e4 1900{
976d0201 1901 kuid_t uid;
1da177e4 1902
f064af1e 1903 read_lock_bh(&sk->sk_callback_lock);
976d0201 1904 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 1905 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1906 return uid;
1907}
2a91525c 1908EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
1909
1910unsigned long sock_i_ino(struct sock *sk)
1911{
1912 unsigned long ino;
1913
f064af1e 1914 read_lock_bh(&sk->sk_callback_lock);
1da177e4 1915 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 1916 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1917 return ino;
1918}
2a91525c 1919EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
1920
1921/*
1922 * Allocate a skb from the socket's send buffer.
1923 */
86a76caf 1924struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 1925 gfp_t priority)
1da177e4 1926{
14afee4b 1927 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 1928 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
1929 if (skb) {
1930 skb_set_owner_w(skb, sk);
1931 return skb;
1932 }
1933 }
1934 return NULL;
1935}
2a91525c 1936EXPORT_SYMBOL(sock_wmalloc);
1da177e4 1937
98ba0bd5
WB
1938static void sock_ofree(struct sk_buff *skb)
1939{
1940 struct sock *sk = skb->sk;
1941
1942 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1943}
1944
1945struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1946 gfp_t priority)
1947{
1948 struct sk_buff *skb;
1949
1950 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1951 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1952 sysctl_optmem_max)
1953 return NULL;
1954
1955 skb = alloc_skb(size, priority);
1956 if (!skb)
1957 return NULL;
1958
1959 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1960 skb->sk = sk;
1961 skb->destructor = sock_ofree;
1962 return skb;
1963}
1964
4ec93edb 1965/*
1da177e4 1966 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 1967 */
dd0fc66f 1968void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 1969{
95c96174 1970 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
1971 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1972 void *mem;
1973 /* First do the add, to avoid a race if kmalloc
4ec93edb 1974 * might sleep.
1da177e4
LT
1975 */
1976 atomic_add(size, &sk->sk_omem_alloc);
1977 mem = kmalloc(size, priority);
1978 if (mem)
1979 return mem;
1980 atomic_sub(size, &sk->sk_omem_alloc);
1981 }
1982 return NULL;
1983}
2a91525c 1984EXPORT_SYMBOL(sock_kmalloc);
1da177e4 1985
79e88659
DB
1986/* Free an option memory block. Note, we actually want the inline
1987 * here as this allows gcc to detect the nullify and fold away the
1988 * condition entirely.
1da177e4 1989 */
79e88659
DB
1990static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1991 const bool nullify)
1da177e4 1992{
e53da5fb
DM
1993 if (WARN_ON_ONCE(!mem))
1994 return;
79e88659
DB
1995 if (nullify)
1996 kzfree(mem);
1997 else
1998 kfree(mem);
1da177e4
LT
1999 atomic_sub(size, &sk->sk_omem_alloc);
2000}
79e88659
DB
2001
2002void sock_kfree_s(struct sock *sk, void *mem, int size)
2003{
2004 __sock_kfree_s(sk, mem, size, false);
2005}
2a91525c 2006EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2007
79e88659
DB
2008void sock_kzfree_s(struct sock *sk, void *mem, int size)
2009{
2010 __sock_kfree_s(sk, mem, size, true);
2011}
2012EXPORT_SYMBOL(sock_kzfree_s);
2013
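/* Editorial example, not part of the original file: the usual
 * sock_kmalloc()/sock_kzfree_s() pairing for per-socket option state;
 * the size passed to the free side must match the allocation so
 * sk_omem_alloc balances. The zeroing variant suits key material
 * (compare tcp_md5_do_add()). my_proto_set_secret() is hypothetical.
 */
static int my_proto_set_secret(struct sock *sk, const u8 *key, int len)
{
	u8 *secret = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!secret)
		return -ENOMEM;
	memcpy(secret, key, len);
	/* ... use secret; on teardown, give the bytes back zeroed: */
	sock_kzfree_s(sk, secret, len);
	return 0;
}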
1da177e4
LT
2014/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2015 I think these locks should be removed for datagram sockets.
2016 */
2a91525c 2017static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2018{
2019 DEFINE_WAIT(wait);
2020
9cd3e072 2021 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2022 for (;;) {
2023 if (!timeo)
2024 break;
2025 if (signal_pending(current))
2026 break;
2027 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2028 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2029 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2030 break;
2031 if (sk->sk_shutdown & SEND_SHUTDOWN)
2032 break;
2033 if (sk->sk_err)
2034 break;
2035 timeo = schedule_timeout(timeo);
2036 }
aa395145 2037 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2038 return timeo;
2039}
2040
2041
2042/*
2043 * Generic send/receive buffer handlers
2044 */
2045
4cc7f68d
HX
2046struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2047 unsigned long data_len, int noblock,
28d64271 2048 int *errcode, int max_page_order)
1da177e4 2049{
2e4e4410 2050 struct sk_buff *skb;
1da177e4
LT
2051 long timeo;
2052 int err;
2053
1da177e4 2054 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2055 for (;;) {
1da177e4
LT
2056 err = sock_error(sk);
2057 if (err != 0)
2058 goto failure;
2059
2060 err = -EPIPE;
2061 if (sk->sk_shutdown & SEND_SHUTDOWN)
2062 goto failure;
2063
2e4e4410
ED
2064 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2065 break;
28d64271 2066
9cd3e072 2067 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2068 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2069 err = -EAGAIN;
2070 if (!timeo)
1da177e4 2071 goto failure;
2e4e4410
ED
2072 if (signal_pending(current))
2073 goto interrupted;
2074 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2075 }
2e4e4410
ED
2076 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2077 errcode, sk->sk_allocation);
2078 if (skb)
2079 skb_set_owner_w(skb, sk);
1da177e4
LT
2080 return skb;
2081
2082interrupted:
2083 err = sock_intr_errno(timeo);
2084failure:
2085 *errcode = err;
2086 return NULL;
2087}
4cc7f68d 2088EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2089
4ec93edb 2090struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2091 int noblock, int *errcode)
2092{
28d64271 2093 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2094}
2a91525c 2095EXPORT_SYMBOL(sock_alloc_send_skb);
1da177e4 2096
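/* Editorial example, not part of the original file: the canonical
 * sendmsg-side use of sock_alloc_send_skb(), which blocks (subject to
 * SO_SNDTIMEO) until write space is available. Queueing is elided;
 * compare unix_dgram_sendmsg(). my_proto_sendmsg() is hypothetical.
 */
static int my_proto_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len,
						  msg->msg_flags & MSG_DONTWAIT,
						  &err);

	if (!skb)
		return err;
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... hand skb to the transmit path ... */
	return len;
}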
39771b12
WB
2097int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2098 struct sockcm_cookie *sockc)
2099{
3dd17e63
SHY
2100 u32 tsflags;
2101
39771b12
WB
2102 switch (cmsg->cmsg_type) {
2103 case SO_MARK:
2104 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2105 return -EPERM;
2106 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2107 return -EINVAL;
2108 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2109 break;
3dd17e63
SHY
2110 case SO_TIMESTAMPING:
2111 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2112 return -EINVAL;
2113
2114 tsflags = *(u32 *)CMSG_DATA(cmsg);
2115 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2116 return -EINVAL;
2117
2118 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2119 sockc->tsflags |= tsflags;
2120 break;
779f1ede
SHY
2121 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2122 case SCM_RIGHTS:
2123 case SCM_CREDENTIALS:
2124 break;
39771b12
WB
2125 default:
2126 return -EINVAL;
2127 }
2128 return 0;
2129}
2130EXPORT_SYMBOL(__sock_cmsg_send);
2131
f28ea365
EJ
2132int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2133 struct sockcm_cookie *sockc)
2134{
2135 struct cmsghdr *cmsg;
39771b12 2136 int ret;
f28ea365
EJ
2137
2138 for_each_cmsghdr(cmsg, msg) {
2139 if (!CMSG_OK(msg, cmsg))
2140 return -EINVAL;
2141 if (cmsg->cmsg_level != SOL_SOCKET)
2142 continue;
39771b12
WB
2143 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2144 if (ret)
2145 return ret;
f28ea365
EJ
2146 }
2147 return 0;
2148}
2149EXPORT_SYMBOL(sock_cmsg_send);
2150
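/* Editorial example, not part of the original file: how a sendmsg
 * implementation typically consumes SOL_SOCKET control messages, seeding
 * the cookie from the socket defaults so cmsgs only override per-call
 * state (compare udp_sendmsg()). my_proto_apply_cmsgs() is hypothetical.
 */
static int my_proto_apply_cmsgs(struct sock *sk, struct msghdr *msg,
				struct sockcm_cookie *sockc)
{
	sockc->tsflags = sk->sk_tsflags;
	sockc->mark = sk->sk_mark;
	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}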
06044751
ED
2151static void sk_enter_memory_pressure(struct sock *sk)
2152{
2153 if (!sk->sk_prot->enter_memory_pressure)
2154 return;
2155
2156 sk->sk_prot->enter_memory_pressure(sk);
2157}
2158
2159static void sk_leave_memory_pressure(struct sock *sk)
2160{
2161 if (sk->sk_prot->leave_memory_pressure) {
2162 sk->sk_prot->leave_memory_pressure(sk);
2163 } else {
2164 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2165
2166 if (memory_pressure && *memory_pressure)
2167 *memory_pressure = 0;
2168 }
2169}
2170
5640f768
ED
2171/* On 32bit arches, an skb frag is limited to 2^15 */
2172#define SKB_FRAG_PAGE_ORDER get_order(32768)
2173
400dfd3a
ED
2174/**
2175 * skb_page_frag_refill - check that a page_frag contains enough room
2176 * @sz: minimum size of the fragment we want to get
2177 * @pfrag: pointer to page_frag
82d5e2b8 2178 * @gfp: priority for memory allocation
400dfd3a
ED
2179 *
2180 * Note: While this allocator tries to use high order pages, there is
2181 * no guarantee that allocations succeed. Therefore, @sz MUST be
2182 * less than or equal to PAGE_SIZE.
2183 */
d9b2938a 2184bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2185{
5640f768 2186 if (pfrag->page) {
fe896d18 2187 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2188 pfrag->offset = 0;
2189 return true;
2190 }
400dfd3a 2191 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2192 return true;
2193 put_page(pfrag->page);
2194 }
2195
d9b2938a
ED
2196 pfrag->offset = 0;
2197 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
2198 /* Avoid direct reclaim but allow kswapd to wake */
2199 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2200 __GFP_COMP | __GFP_NOWARN |
2201 __GFP_NORETRY,
d9b2938a 2202 SKB_FRAG_PAGE_ORDER);
5640f768 2203 if (likely(pfrag->page)) {
d9b2938a 2204 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2205 return true;
2206 }
d9b2938a
ED
2207 }
2208 pfrag->page = alloc_page(gfp);
2209 if (likely(pfrag->page)) {
2210 pfrag->size = PAGE_SIZE;
2211 return true;
2212 }
400dfd3a
ED
2213 return false;
2214}
2215EXPORT_SYMBOL(skb_page_frag_refill);
2216
2217bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2218{
2219 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2220 return true;
2221
5640f768
ED
2222 sk_enter_memory_pressure(sk);
2223 sk_stream_moderate_sndbuf(sk);
2224 return false;
2225}
2226EXPORT_SYMBOL(sk_page_frag_refill);
2227
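/* Editorial example, not part of the original file: the copy path these
 * refill helpers exist for, appending user data to the per-socket page
 * frag much as TCP does. Assumes the socket lock is held so sk->sk_frag
 * is stable. my_proto_copy_to_frag() is hypothetical.
 */
static int my_proto_copy_to_frag(struct sock *sk, struct iov_iter *from,
				 int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;
	/* ... attach [pfrag->page, pfrag->offset, copy] to an skb ... */
	pfrag->offset += copy;
	return copy;
}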
1da177e4 2228static void __lock_sock(struct sock *sk)
f39234d6
NK
2229 __releases(&sk->sk_lock.slock)
2230 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2231{
2232 DEFINE_WAIT(wait);
2233
e71a4783 2234 for (;;) {
1da177e4
LT
2235 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2236 TASK_UNINTERRUPTIBLE);
2237 spin_unlock_bh(&sk->sk_lock.slock);
2238 schedule();
2239 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2240 if (!sock_owned_by_user(sk))
1da177e4
LT
2241 break;
2242 }
2243 finish_wait(&sk->sk_lock.wq, &wait);
2244}
2245
e6ddc2c3 2246void __release_sock(struct sock *sk)
f39234d6
NK
2247 __releases(&sk->sk_lock.slock)
2248 __acquires(&sk->sk_lock.slock)
1da177e4 2249{
5413d1ba 2250 struct sk_buff *skb, *next;
1da177e4 2251
5413d1ba 2252 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2253 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2254
5413d1ba 2255 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2256
5413d1ba
ED
2257 do {
2258 next = skb->next;
e4cbb02a 2259 prefetch(next);
7fee226a 2260 WARN_ON_ONCE(skb_dst_is_noref(skb));
1da177e4 2261 skb->next = NULL;
c57943a1 2262 sk_backlog_rcv(sk, skb);
1da177e4 2263
5413d1ba 2264 cond_resched();
1da177e4
LT
2265
2266 skb = next;
2267 } while (skb != NULL);
2268
5413d1ba
ED
2269 spin_lock_bh(&sk->sk_lock.slock);
2270 }
8eae939f
ZY
2271
2272 /*
2273 * Doing the zeroing here guarantees we cannot loop forever
2274 * while a wild producer attempts to flood us.
2275 */
2276 sk->sk_backlog.len = 0;
1da177e4
LT
2277}
2278
d41a69f1
ED
2279void __sk_flush_backlog(struct sock *sk)
2280{
2281 spin_lock_bh(&sk->sk_lock.slock);
2282 __release_sock(sk);
2283 spin_unlock_bh(&sk->sk_lock.slock);
2284}
2285
1da177e4
LT
2286/**
2287 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2288 * @sk: sock to wait on
2289 * @timeo: for how long
dfbafc99 2290 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2291 *
2292 * Now socket state including sk->sk_err is changed only under the lock,
2293 * hence we may omit checks after joining the wait queue.
2294 * We check the receive queue before schedule() only as an optimization;
2295 * it is very likely that release_sock() added new data.
2296 */
dfbafc99 2297int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2298{
d9dc8b0f 2299 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2300 int rc;
1da177e4 2301
d9dc8b0f 2302 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2303 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2304 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2305 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2306 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2307 return rc;
2308}
1da177e4
LT
2309EXPORT_SYMBOL(sk_wait_data);
2310
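/* Editorial example, not part of the original file: the receive loop
 * shape sk_wait_data() is designed for; the socket lock must be held on
 * entry, and the last skb already seen is passed so wakeups for old data
 * do not count as progress. my_proto_wait_for_skb() is hypothetical.
 */
static struct sk_buff *my_proto_wait_for_skb(struct sock *sk, long *timeo)
{
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!*timeo || signal_pending(current))
			return NULL; /* caller maps to -EAGAIN/-EINTR */
		sk_wait_data(sk, timeo, NULL);
	}
	return skb;
}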
3ab224be 2311/**
f8c3bf00 2312 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2313 * @sk: socket
2314 * @size: memory size to allocate
f8c3bf00 2315 * @amt: pages to allocate
3ab224be
HA
2316 * @kind: allocation type
2317 *
f8c3bf00 2318 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2319 */
f8c3bf00 2320int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2321{
2322 struct proto *prot = sk->sk_prot;
f8c3bf00 2323 long allocated = sk_memory_allocated_add(sk, amt);
e805605c 2324
baac50bb
JW
2325 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2326 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
e805605c 2327 goto suppress_allocation;
3ab224be
HA
2328
2329 /* Under limit. */
e805605c 2330 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2331 sk_leave_memory_pressure(sk);
3ab224be
HA
2332 return 1;
2333 }
2334
e805605c
JW
2335 /* Under pressure. */
2336 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2337 sk_enter_memory_pressure(sk);
3ab224be 2338
e805605c
JW
2339 /* Over hard limit. */
2340 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2341 goto suppress_allocation;
2342
2343 /* guarantee minimum buffer size under pressure */
2344 if (kind == SK_MEM_RECV) {
2345 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2346 return 1;
180d8cd9 2347
3ab224be
HA
2348 } else { /* SK_MEM_SEND */
2349 if (sk->sk_type == SOCK_STREAM) {
2350 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2351 return 1;
14afee4b 2352 } else if (refcount_read(&sk->sk_wmem_alloc) <
3ab224be
HA
2353 prot->sysctl_wmem[0])
2354 return 1;
2355 }
2356
180d8cd9 2357 if (sk_has_memory_pressure(sk)) {
1748376b
ED
2358 int alloc;
2359
180d8cd9 2360 if (!sk_under_memory_pressure(sk))
1748376b 2361 return 1;
180d8cd9
GC
2362 alloc = sk_sockets_allocated_read_positive(sk);
2363 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2364 sk_mem_pages(sk->sk_wmem_queued +
2365 atomic_read(&sk->sk_rmem_alloc) +
2366 sk->sk_forward_alloc))
2367 return 1;
2368 }
2369
2370suppress_allocation:
2371
2372 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2373 sk_stream_moderate_sndbuf(sk);
2374
2375 /* Fail only if socket is _under_ its sndbuf.
2376 * In this case we cannot block, so we have to fail.
2377 */
2378 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2379 return 1;
2380 }
2381
3847ce32
SM
2382 trace_sock_exceed_buf_limit(sk, prot, allocated);
2383
0e90b31f 2384 sk_memory_allocated_sub(sk, amt);
180d8cd9 2385
baac50bb
JW
2386 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2387 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2388
3ab224be
HA
2389 return 0;
2390}
f8c3bf00
PA
2391EXPORT_SYMBOL(__sk_mem_raise_allocated);
2392
2393/**
2394 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2395 * @sk: socket
2396 * @size: memory size to allocate
2397 * @kind: allocation type
2398 *
2399 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2400 * rmem allocation. This function assumes that protocols which have
2401 * memory_pressure use sk_wmem_queued as write buffer accounting.
2402 */
2403int __sk_mem_schedule(struct sock *sk, int size, int kind)
2404{
2405 int ret, amt = sk_mem_pages(size);
2406
2407 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2408 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2409 if (!ret)
2410 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2411 return ret;
2412}
3ab224be
HA
2413EXPORT_SYMBOL(__sk_mem_schedule);
2414
2415/**
f8c3bf00 2416 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2417 * @sk: socket
f8c3bf00
PA
2418 * @amount: number of quanta
2419 *
2420 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2421 */
f8c3bf00 2422void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2423{
1a24e04e 2424 sk_memory_allocated_sub(sk, amount);
3ab224be 2425
baac50bb
JW
2426 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2427 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2428
180d8cd9
GC
2429 if (sk_under_memory_pressure(sk) &&
2430 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2431 sk_leave_memory_pressure(sk);
3ab224be 2432}
f8c3bf00
PA
2433EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2434
2435/**
2436 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2437 * @sk: socket
2438 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2439 */
2440void __sk_mem_reclaim(struct sock *sk, int amount)
2441{
2442 amount >>= SK_MEM_QUANTUM_SHIFT;
2443 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2444 __sk_mem_reduce_allocated(sk, amount);
2445}
3ab224be
HA
2446EXPORT_SYMBOL(__sk_mem_reclaim);
2447
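/* Editorial example, not part of the original file: the accounting
 * pattern the helpers above serve on the receive side: reserve
 * forward-alloc quanta before queueing, and let skb_set_owner_r() /
 * sock_rfree() return them on kfree_skb(). my_proto_queue_rcv() is
 * hypothetical.
 */
static int my_proto_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS; /* over the protocol memory limits */
	skb_set_owner_r(skb, sk); /* charges sk_rmem_alloc + fwd alloc */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}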
627d2d6b 2448int sk_set_peek_off(struct sock *sk, int val)
2449{
627d2d6b 2450 sk->sk_peek_off = val;
2451 return 0;
2452}
2453EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2454
1da177e4
LT
2455/*
2456 * Set of default routines for initialising struct proto_ops when
2457 * the protocol does not support a particular function. In certain
2458 * cases where it makes no sense for a protocol to have a "do nothing"
2459 * function, some default processing is provided.
2460 */
2461
2462int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2463{
2464 return -EOPNOTSUPP;
2465}
2a91525c 2466EXPORT_SYMBOL(sock_no_bind);
1da177e4 2467
4ec93edb 2468int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2469 int len, int flags)
2470{
2471 return -EOPNOTSUPP;
2472}
2a91525c 2473EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2474
2475int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2476{
2477 return -EOPNOTSUPP;
2478}
2a91525c 2479EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2480
cdfbabfb
DH
2481int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2482 bool kern)
1da177e4
LT
2483{
2484 return -EOPNOTSUPP;
2485}
2a91525c 2486EXPORT_SYMBOL(sock_no_accept);
1da177e4 2487
4ec93edb 2488int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2489 int *len, int peer)
2490{
2491 return -EOPNOTSUPP;
2492}
2a91525c 2493EXPORT_SYMBOL(sock_no_getname);
1da177e4 2494
2a91525c 2495unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1da177e4
LT
2496{
2497 return 0;
2498}
2a91525c 2499EXPORT_SYMBOL(sock_no_poll);
1da177e4
LT
2500
2501int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2502{
2503 return -EOPNOTSUPP;
2504}
2a91525c 2505EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2506
2507int sock_no_listen(struct socket *sock, int backlog)
2508{
2509 return -EOPNOTSUPP;
2510}
2a91525c 2511EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2512
2513int sock_no_shutdown(struct socket *sock, int how)
2514{
2515 return -EOPNOTSUPP;
2516}
2a91525c 2517EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2518
2519int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2520 char __user *optval, unsigned int optlen)
1da177e4
LT
2521{
2522 return -EOPNOTSUPP;
2523}
2a91525c 2524EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2525
2526int sock_no_getsockopt(struct socket *sock, int level, int optname,
2527 char __user *optval, int __user *optlen)
2528{
2529 return -EOPNOTSUPP;
2530}
2a91525c 2531EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2532
1b784140 2533int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2534{
2535 return -EOPNOTSUPP;
2536}
2a91525c 2537EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2538
306b13eb
TH
2539int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2540{
2541 return -EOPNOTSUPP;
2542}
2543EXPORT_SYMBOL(sock_no_sendmsg_locked);
2544
1b784140
YX
2545int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2546 int flags)
1da177e4
LT
2547{
2548 return -EOPNOTSUPP;
2549}
2a91525c 2550EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2551
2552int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2553{
2554 /* Mirror missing mmap method error code */
2555 return -ENODEV;
2556}
2a91525c 2557EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2558
2559ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2560{
2561 ssize_t res;
2562 struct msghdr msg = {.msg_flags = flags};
2563 struct kvec iov;
2564 char *kaddr = kmap(page);
2565 iov.iov_base = kaddr + offset;
2566 iov.iov_len = size;
2567 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2568 kunmap(page);
2569 return res;
2570}
2a91525c 2571EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2572
306b13eb
TH
2573ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2574 int offset, size_t size, int flags)
2575{
2576 ssize_t res;
2577 struct msghdr msg = {.msg_flags = flags};
2578 struct kvec iov;
2579 char *kaddr = kmap(page);
2580
2581 iov.iov_base = kaddr + offset;
2582 iov.iov_len = size;
2583 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2584 kunmap(page);
2585 return res;
2586}
2587EXPORT_SYMBOL(sock_no_sendpage_locked);
2588
1da177e4
LT
2589/*
2590 * Default Socket Callbacks
2591 */
2592
2593static void sock_def_wakeup(struct sock *sk)
2594{
43815482
ED
2595 struct socket_wq *wq;
2596
2597 rcu_read_lock();
2598 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2599 if (skwq_has_sleeper(wq))
43815482
ED
2600 wake_up_interruptible_all(&wq->wait);
2601 rcu_read_unlock();
1da177e4
LT
2602}
2603
2604static void sock_def_error_report(struct sock *sk)
2605{
43815482
ED
2606 struct socket_wq *wq;
2607
2608 rcu_read_lock();
2609 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2610 if (skwq_has_sleeper(wq))
43815482 2611 wake_up_interruptible_poll(&wq->wait, POLLERR);
8d8ad9d7 2612 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2613 rcu_read_unlock();
1da177e4
LT
2614}
2615
676d2369 2616static void sock_def_readable(struct sock *sk)
1da177e4 2617{
43815482
ED
2618 struct socket_wq *wq;
2619
2620 rcu_read_lock();
2621 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2622 if (skwq_has_sleeper(wq))
2c6607c6 2623 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
37e5540b 2624 POLLRDNORM | POLLRDBAND);
8d8ad9d7 2625 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2626 rcu_read_unlock();
1da177e4
LT
2627}
2628
2629static void sock_def_write_space(struct sock *sk)
2630{
43815482
ED
2631 struct socket_wq *wq;
2632
2633 rcu_read_lock();
1da177e4
LT
2634
2635 /* Do not wake up a writer until it can make "significant"
2636 * progress. --DaveM
2637 */
14afee4b 2638 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2639 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2640 if (skwq_has_sleeper(wq))
43815482 2641 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
37e5540b 2642 POLLWRNORM | POLLWRBAND);
1da177e4
LT
2643
2644 /* Should agree with poll, otherwise some programs break */
2645 if (sock_writeable(sk))
8d8ad9d7 2646 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2647 }
2648
43815482 2649 rcu_read_unlock();
1da177e4
LT
2650}
2651
2652static void sock_def_destruct(struct sock *sk)
2653{
1da177e4
LT
2654}
2655
2656void sk_send_sigurg(struct sock *sk)
2657{
2658 if (sk->sk_socket && sk->sk_socket->file)
2659 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2660 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2661}
2a91525c 2662EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2663
2664void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2665 unsigned long expires)
2666{
2667 if (!mod_timer(timer, expires))
2668 sock_hold(sk);
2669}
1da177e4
LT
2670EXPORT_SYMBOL(sk_reset_timer);
2671
2672void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2673{
25cc4ae9 2674 if (del_timer(timer))
1da177e4
LT
2675 __sock_put(sk);
2676}
1da177e4
LT
2677EXPORT_SYMBOL(sk_stop_timer);
2678
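/* Editorial example, not part of the original file: sk_reset_timer()
 * takes a socket reference when it arms a not-yet-pending timer, so the
 * expiry handler is expected to end with sock_put(), and sk_stop_timer()
 * drops the reference when it cancels a pending timer (compare
 * tcp_write_timer()). my_proto_arm_retransmit() is hypothetical.
 */
static void my_proto_arm_retransmit(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}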
2679void sock_init_data(struct socket *sock, struct sock *sk)
2680{
581319c5 2681 sk_init_common(sk);
1da177e4
LT
2682 sk->sk_send_head = NULL;
2683
2684 init_timer(&sk->sk_timer);
4ec93edb 2685
1da177e4
LT
2686 sk->sk_allocation = GFP_KERNEL;
2687 sk->sk_rcvbuf = sysctl_rmem_default;
2688 sk->sk_sndbuf = sysctl_wmem_default;
2689 sk->sk_state = TCP_CLOSE;
972692e0 2690 sk_set_socket(sk, sock);
1da177e4
LT
2691
2692 sock_set_flag(sk, SOCK_ZAPPED);
2693
e71a4783 2694 if (sock) {
1da177e4 2695 sk->sk_type = sock->type;
43815482 2696 sk->sk_wq = sock->wq;
1da177e4 2697 sock->sk = sk;
86741ec2
LC
2698 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2699 } else {
43815482 2700 sk->sk_wq = NULL;
86741ec2
LC
2701 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2702 }
1da177e4 2703
1da177e4 2704 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2705 if (sk->sk_kern_sock)
2706 lockdep_set_class_and_name(
2707 &sk->sk_callback_lock,
2708 af_kern_callback_keys + sk->sk_family,
2709 af_family_kern_clock_key_strings[sk->sk_family]);
2710 else
2711 lockdep_set_class_and_name(
2712 &sk->sk_callback_lock,
443aef0e
PZ
2713 af_callback_keys + sk->sk_family,
2714 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2715
2716 sk->sk_state_change = sock_def_wakeup;
2717 sk->sk_data_ready = sock_def_readable;
2718 sk->sk_write_space = sock_def_write_space;
2719 sk->sk_error_report = sock_def_error_report;
2720 sk->sk_destruct = sock_def_destruct;
2721
5640f768
ED
2722 sk->sk_frag.page = NULL;
2723 sk->sk_frag.offset = 0;
ef64a54f 2724 sk->sk_peek_off = -1;
1da177e4 2725
109f6e39
EB
2726 sk->sk_peer_pid = NULL;
2727 sk->sk_peer_cred = NULL;
1da177e4
LT
2728 sk->sk_write_pending = 0;
2729 sk->sk_rcvlowat = 1;
2730 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2731 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2732
6c7c98ba 2733 sk->sk_stamp = SK_DEFAULT_STAMP;
e5af70e9
DD
2734#if BITS_PER_LONG==32
2735 seqlock_init(&sk->sk_stamp_seq);
2736#endif
52267790 2737 atomic_set(&sk->sk_zckey, 0);
1da177e4 2738
e0d1095a 2739#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2740 sk->sk_napi_id = 0;
64b0dc51 2741 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2742#endif
2743
62748f32 2744 sk->sk_max_pacing_rate = ~0U;
7eec4174 2745 sk->sk_pacing_rate = ~0U;
70da268b 2746 sk->sk_incoming_cpu = -1;
4dc6dc71
ED
2747 /*
2748 * Before updating sk_refcnt, we must commit prior changes to memory
2749 * (Documentation/RCU/rculist_nulls.txt for details)
2750 */
2751 smp_wmb();
41c6d650 2752 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2753 atomic_set(&sk->sk_drops, 0);
1da177e4 2754}
2a91525c 2755EXPORT_SYMBOL(sock_init_data);
1da177e4 2756
b5606c2d 2757void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2758{
2759 might_sleep();
a5b5bb9a 2760 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2761 if (sk->sk_lock.owned)
1da177e4 2762 __lock_sock(sk);
d2e9117c 2763 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2764 spin_unlock(&sk->sk_lock.slock);
2765 /*
2766 * The sk_lock has mutex_lock() semantics here:
2767 */
fcc70d5f 2768 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2769 local_bh_enable();
1da177e4 2770}
fcc70d5f 2771EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2772
b5606c2d 2773void release_sock(struct sock *sk)
1da177e4 2774{
a5b5bb9a 2775 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2776 if (sk->sk_backlog.tail)
2777 __release_sock(sk);
46d3ceab 2778
c3f9b018
ED
2779 /* Warning : release_cb() might need to release sk ownership,
2780 * ie call sock_release_ownership(sk) before us.
2781 */
46d3ceab
ED
2782 if (sk->sk_prot->release_cb)
2783 sk->sk_prot->release_cb(sk);
2784
c3f9b018 2785 sock_release_ownership(sk);
a5b5bb9a
IM
2786 if (waitqueue_active(&sk->sk_lock.wq))
2787 wake_up(&sk->sk_lock.wq);
2788 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2789}
2790EXPORT_SYMBOL(release_sock);
2791
8a74ad60
ED
2792/**
2793 * lock_sock_fast - fast version of lock_sock
2794 * @sk: socket
2795 *
2796 * This version should be used for very small sections, where the process won't block.
d651983d
MCC
2797 * Return false if the fast path is taken:
2798 *
8a74ad60 2799 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2800 *
2801 * Return true if the slow path is taken:
2802 *
8a74ad60
ED
2803 * sk_lock.slock unlocked, owned = 1, BH enabled
2804 */
2805bool lock_sock_fast(struct sock *sk)
2806{
2807 might_sleep();
2808 spin_lock_bh(&sk->sk_lock.slock);
2809
2810 if (!sk->sk_lock.owned)
2811 /*
2812 * Note : We must disable BH
2813 */
2814 return false;
2815
2816 __lock_sock(sk);
2817 sk->sk_lock.owned = 1;
2818 spin_unlock(&sk->sk_lock.slock);
2819 /*
2820 * The sk_lock has mutex_lock() semantics here:
2821 */
2822 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2823 local_bh_enable();
2824 return true;
2825}
2826EXPORT_SYMBOL(lock_sock_fast);
2827
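/* Editorial example, not part of the original file: the intended
 * lock_sock_fast()/unlock_sock_fast() pairing for a tiny critical
 * section; the returned bool records whether the slow (owned) path was
 * taken and thus how the lock must be released.
 * my_proto_clear_error() is hypothetical.
 */
static void my_proto_clear_error(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	sk->sk_err = 0; /* short, non-blocking work only */
	unlock_sock_fast(sk, slow);
}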
1da177e4 2828int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2829{
b7aa0bf7 2830 struct timeval tv;
1da177e4 2831 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2832 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
b7aa0bf7
ED
2833 tv = ktime_to_timeval(sk->sk_stamp);
2834 if (tv.tv_sec == -1)
1da177e4 2835 return -ENOENT;
b7aa0bf7
ED
2836 if (tv.tv_sec == 0) {
2837 sk->sk_stamp = ktime_get_real();
2838 tv = ktime_to_timeval(sk->sk_stamp);
2839 }
2840 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2841}
1da177e4
LT
2842EXPORT_SYMBOL(sock_get_timestamp);
2843
ae40eb1e
ED
2844int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2845{
2846 struct timespec ts;
2847 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2848 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ae40eb1e
ED
2849 ts = ktime_to_timespec(sk->sk_stamp);
2850 if (ts.tv_sec == -1)
2851 return -ENOENT;
2852 if (ts.tv_sec == 0) {
2853 sk->sk_stamp = ktime_get_real();
2854 ts = ktime_to_timespec(sk->sk_stamp);
2855 }
2856 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2857}
2858EXPORT_SYMBOL(sock_get_timestampns);
2859
20d49473 2860void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 2861{
20d49473 2862 if (!sock_flag(sk, flag)) {
08e29af3
ED
2863 unsigned long previous_flags = sk->sk_flags;
2864
20d49473
PO
2865 sock_set_flag(sk, flag);
2866 /*
2867 * we just set one of the two flags which require net
2868 * time stamping, but time stamping might have been on
2869 * already because of the other one
2870 */
080a270f
HFS
2871 if (sock_needs_netstamp(sk) &&
2872 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 2873 net_enable_timestamp();
1da177e4
LT
2874 }
2875}
1da177e4 2876
cb820f8e
RC
2877int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2878 int level, int type)
2879{
2880 struct sock_exterr_skb *serr;
364a9e93 2881 struct sk_buff *skb;
cb820f8e
RC
2882 int copied, err;
2883
2884 err = -EAGAIN;
364a9e93 2885 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
2886 if (skb == NULL)
2887 goto out;
2888
2889 copied = skb->len;
2890 if (copied > len) {
2891 msg->msg_flags |= MSG_TRUNC;
2892 copied = len;
2893 }
51f3d02b 2894 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
2895 if (err)
2896 goto out_free_skb;
2897
2898 sock_recv_timestamp(msg, sk, skb);
2899
2900 serr = SKB_EXT_ERR(skb);
2901 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2902
2903 msg->msg_flags |= MSG_ERRQUEUE;
2904 err = copied;
2905
cb820f8e
RC
2906out_free_skb:
2907 kfree_skb(skb);
2908out:
2909 return err;
2910}
2911EXPORT_SYMBOL(sock_recv_errqueue);
2912
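/* Editorial example, not part of the original file: how a protocol's
 * recvmsg() typically branches to the error-queue helper when userspace
 * asks for it; the level/type values here mirror af_packet's use
 * (SOL_PACKET/PACKET_TX_TIMESTAMP). my_proto_recvmsg() is hypothetical.
 */
static int my_proto_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			    int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_PACKET,
					  PACKET_TX_TIMESTAMP);
	/* ... normal receive path ... */
	return -EOPNOTSUPP;
}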
1da177e4
LT
2913/*
2914 * Get a socket option on a socket.
2915 *
2916 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2917 * asynchronous errors should be reported by getsockopt. We assume
2918 * this means if you specify SO_ERROR (otherwise what's the point of it).
2919 */
2920int sock_common_getsockopt(struct socket *sock, int level, int optname,
2921 char __user *optval, int __user *optlen)
2922{
2923 struct sock *sk = sock->sk;
2924
2925 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2926}
1da177e4
LT
2927EXPORT_SYMBOL(sock_common_getsockopt);
2928
3fdadf7d 2929#ifdef CONFIG_COMPAT
543d9cfe
ACM
2930int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2931 char __user *optval, int __user *optlen)
3fdadf7d
DM
2932{
2933 struct sock *sk = sock->sk;
2934
1e51f951 2935 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
2936 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2937 optval, optlen);
3fdadf7d
DM
2938 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2939}
2940EXPORT_SYMBOL(compat_sock_common_getsockopt);
2941#endif
2942
1b784140
YX
2943int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2944 int flags)
1da177e4
LT
2945{
2946 struct sock *sk = sock->sk;
2947 int addr_len = 0;
2948 int err;
2949
1b784140 2950 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
2951 flags & ~MSG_DONTWAIT, &addr_len);
2952 if (err >= 0)
2953 msg->msg_namelen = addr_len;
2954 return err;
2955}
1da177e4
LT
2956EXPORT_SYMBOL(sock_common_recvmsg);
2957
2958/*
2959 * Set socket options on an inet socket.
2960 */
2961int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2962 char __user *optval, unsigned int optlen)
1da177e4
LT
2963{
2964 struct sock *sk = sock->sk;
2965
2966 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2967}
1da177e4
LT
2968EXPORT_SYMBOL(sock_common_setsockopt);
2969
3fdadf7d 2970#ifdef CONFIG_COMPAT
543d9cfe 2971int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2972 char __user *optval, unsigned int optlen)
3fdadf7d
DM
2973{
2974 struct sock *sk = sock->sk;
2975
543d9cfe
ACM
2976 if (sk->sk_prot->compat_setsockopt != NULL)
2977 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2978 optval, optlen);
3fdadf7d
DM
2979 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2980}
2981EXPORT_SYMBOL(compat_sock_common_setsockopt);
2982#endif
2983
1da177e4
LT
2984void sk_common_release(struct sock *sk)
2985{
2986 if (sk->sk_prot->destroy)
2987 sk->sk_prot->destroy(sk);
2988
2989 /*
2990 * Observation: when sock_common_release is called, processes have
2991 * no access to the socket. But the network stack still does.
2992 * Step one, detach it from networking:
2993 *
2994 * A. Remove from hash tables.
2995 */
2996
2997 sk->sk_prot->unhash(sk);
2998
2999 /*
3000 * At this point the socket cannot receive new packets, but it is possible
3001 * that some packets are in flight because some CPU runs the receiver and
3002 * did the hash table lookup before we unhashed the socket. They will reach
3003 * the receive queue and will be purged by the socket destructor.
3004 *
3005 * Also we still have packets pending on the receive queue and, probably,
3006 * our own packets waiting in device queues. sock_destroy will drain the
3007 * receive queue, but transmitted packets will delay socket destruction
3008 * until the last reference is released.
3009 */
3010
3011 sock_orphan(sk);
3012
3013 xfrm_sk_free_policy(sk);
3014
e6848976 3015 sk_refcnt_debug_release(sk);
5640f768 3016
1da177e4
LT
3017 sock_put(sk);
3018}
1da177e4
LT
3019EXPORT_SYMBOL(sk_common_release);
3020
a2d133b1
JH
3021void sk_get_meminfo(const struct sock *sk, u32 *mem)
3022{
3023 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3024
3025 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3026 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3027 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3028 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3029 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3030 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3031 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3032 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3033 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3034}
3035
13ff3d6f
PE
3036#ifdef CONFIG_PROC_FS
3037#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3038struct prot_inuse {
3039 int val[PROTO_INUSE_NR];
3040};
13ff3d6f
PE
3041
3042static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159
PE
3043
3044#ifdef CONFIG_NET_NS
3045void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3046{
d6d9ca0f 3047 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
70ee1159
PE
3048}
3049EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3050
3051int sock_prot_inuse_get(struct net *net, struct proto *prot)
3052{
3053 int cpu, idx = prot->inuse_idx;
3054 int res = 0;
3055
3056 for_each_possible_cpu(cpu)
3057 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3058
3059 return res >= 0 ? res : 0;
3060}
3061EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3062
2c8c1e72 3063static int __net_init sock_inuse_init_net(struct net *net)
70ee1159
PE
3064{
3065 net->core.inuse = alloc_percpu(struct prot_inuse);
3066 return net->core.inuse ? 0 : -ENOMEM;
3067}
3068
2c8c1e72 3069static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159
PE
3070{
3071 free_percpu(net->core.inuse);
3072}
3073
3074static struct pernet_operations net_inuse_ops = {
3075 .init = sock_inuse_init_net,
3076 .exit = sock_inuse_exit_net,
3077};
3078
3079static __init int net_inuse_init(void)
3080{
3081 if (register_pernet_subsys(&net_inuse_ops))
3082 panic("Cannot initialize net inuse counters");
3083
3084 return 0;
3085}
3086
3087core_initcall(net_inuse_init);
3088#else
1338d466
PE
3089static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3090
c29a0bc4 3091void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1338d466 3092{
d6d9ca0f 3093 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
1338d466
PE
3094}
3095EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3096
c29a0bc4 3097int sock_prot_inuse_get(struct net *net, struct proto *prot)
1338d466
PE
3098{
3099 int cpu, idx = prot->inuse_idx;
3100 int res = 0;
3101
3102 for_each_possible_cpu(cpu)
3103 res += per_cpu(prot_inuse, cpu).val[idx];
3104
3105 return res >= 0 ? res : 0;
3106}
3107EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
70ee1159 3108#endif
13ff3d6f
PE
3109
3110static void assign_proto_idx(struct proto *prot)
3111{
3112 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3113
3114 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3115 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3116 return;
3117 }
3118
3119 set_bit(prot->inuse_idx, proto_inuse_idx);
3120}
3121
3122static void release_proto_idx(struct proto *prot)
3123{
3124 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3125 clear_bit(prot->inuse_idx, proto_inuse_idx);
3126}
3127#else
3128static inline void assign_proto_idx(struct proto *prot)
3129{
3130}
3131
3132static inline void release_proto_idx(struct proto *prot)
3133{
3134}
3135#endif
3136
0159dfd3
ED
3137static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3138{
3139 if (!rsk_prot)
3140 return;
3141 kfree(rsk_prot->slab_name);
3142 rsk_prot->slab_name = NULL;
adf78eda
JL
3143 kmem_cache_destroy(rsk_prot->slab);
3144 rsk_prot->slab = NULL;
0159dfd3
ED
3145}
3146
3147static int req_prot_init(const struct proto *prot)
3148{
3149 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3150
3151 if (!rsk_prot)
3152 return 0;
3153
3154 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3155 prot->name);
3156 if (!rsk_prot->slab_name)
3157 return -ENOMEM;
3158
3159 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3160 rsk_prot->obj_size, 0,
e96f78ab 3161 prot->slab_flags, NULL);
0159dfd3
ED
3162
3163 if (!rsk_prot->slab) {
3164 pr_crit("%s: Can't create request sock SLAB cache!\n",
3165 prot->name);
3166 return -ENOMEM;
3167 }
3168 return 0;
3169}
3170
b733c007
PE
3171int proto_register(struct proto *prot, int alloc_slab)
3172{
1da177e4
LT
3173 if (alloc_slab) {
3174 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
271b72c7
ED
3175 SLAB_HWCACHE_ALIGN | prot->slab_flags,
3176 NULL);
1da177e4
LT
3177
3178 if (prot->slab == NULL) {
e005d193
JP
3179 pr_crit("%s: Can't create sock SLAB cache!\n",
3180 prot->name);
60e7663d 3181 goto out;
1da177e4 3182 }
2e6599cb 3183
0159dfd3
ED
3184 if (req_prot_init(prot))
3185 goto out_free_request_sock_slab;
8feaf0c0 3186
6d6ee43e 3187 if (prot->twsk_prot != NULL) {
faf23422 3188 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3189
7e56b5d6 3190 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3191 goto out_free_request_sock_slab;
3192
6d6ee43e 3193 prot->twsk_prot->twsk_slab =
7e56b5d6 3194 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3195 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3196 0,
52db70dc 3197 prot->slab_flags,
20c2df83 3198 NULL);
6d6ee43e 3199 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3200 goto out_free_timewait_sock_slab_name;
3201 }
1da177e4
LT
3202 }
3203
36b77a52 3204 mutex_lock(&proto_list_mutex);
1da177e4 3205 list_add(&prot->node, &proto_list);
13ff3d6f 3206 assign_proto_idx(prot);
36b77a52 3207 mutex_unlock(&proto_list_mutex);
b733c007
PE
3208 return 0;
3209
8feaf0c0 3210out_free_timewait_sock_slab_name:
7e56b5d6 3211 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3212out_free_request_sock_slab:
0159dfd3
ED
3213 req_prot_cleanup(prot->rsk_prot);
3214
2e6599cb
ACM
3215 kmem_cache_destroy(prot->slab);
3216 prot->slab = NULL;
b733c007
PE
3217out:
3218 return -ENOBUFS;
1da177e4 3219}
1da177e4
LT
3220EXPORT_SYMBOL(proto_register);
3221
3222void proto_unregister(struct proto *prot)
3223{
36b77a52 3224 mutex_lock(&proto_list_mutex);
13ff3d6f 3225 release_proto_idx(prot);
0a3f4358 3226 list_del(&prot->node);
36b77a52 3227 mutex_unlock(&proto_list_mutex);
1da177e4 3228
adf78eda
JL
3229 kmem_cache_destroy(prot->slab);
3230 prot->slab = NULL;
1da177e4 3231
0159dfd3 3232 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3233
6d6ee43e 3234 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3235 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3236 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3237 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3238 }
1da177e4 3239}
1da177e4
LT
3240EXPORT_SYMBOL(proto_unregister);
3241
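/* Editorial example, not part of the original file: the standard
 * module-side pairing of proto_register()/proto_unregister().
 * Passing alloc_slab=1 requests a dedicated kmem cache sized by
 * .obj_size. The my_proto object is hypothetical.
 */
static struct proto my_proto = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init my_proto_init(void)
{
	return proto_register(&my_proto, 1);
}

static void __exit my_proto_exit(void)
{
	proto_unregister(&my_proto);
}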
3242#ifdef CONFIG_PROC_FS
1da177e4 3243static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3244 __acquires(proto_list_mutex)
1da177e4 3245{
36b77a52 3246 mutex_lock(&proto_list_mutex);
60f0438a 3247 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3248}
3249
3250static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3251{
60f0438a 3252 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3253}
3254
3255static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3256 __releases(proto_list_mutex)
1da177e4 3257{
36b77a52 3258 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3259}
3260
3261static char proto_method_implemented(const void *method)
3262{
3263 return method == NULL ? 'n' : 'y';
3264}
180d8cd9
GC
3265static long sock_prot_memory_allocated(struct proto *proto)
3266{
cb75a36c 3267 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3268}
3269
3270static char *sock_prot_memory_pressure(struct proto *proto)
3271{
3272 return proto->memory_pressure != NULL ?
3273 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3274}
1da177e4
LT
3275
3276static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3277{
180d8cd9 3278
8d987e5c 3279 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3280 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3281 proto->name,
3282 proto->obj_size,
14e943db 3283 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3284 sock_prot_memory_allocated(proto),
3285 sock_prot_memory_pressure(proto),
1da177e4
LT
3286 proto->max_header,
3287 proto->slab == NULL ? "no" : "yes",
3288 module_name(proto->owner),
3289 proto_method_implemented(proto->close),
3290 proto_method_implemented(proto->connect),
3291 proto_method_implemented(proto->disconnect),
3292 proto_method_implemented(proto->accept),
3293 proto_method_implemented(proto->ioctl),
3294 proto_method_implemented(proto->init),
3295 proto_method_implemented(proto->destroy),
3296 proto_method_implemented(proto->shutdown),
3297 proto_method_implemented(proto->setsockopt),
3298 proto_method_implemented(proto->getsockopt),
3299 proto_method_implemented(proto->sendmsg),
3300 proto_method_implemented(proto->recvmsg),
3301 proto_method_implemented(proto->sendpage),
3302 proto_method_implemented(proto->bind),
3303 proto_method_implemented(proto->backlog_rcv),
3304 proto_method_implemented(proto->hash),
3305 proto_method_implemented(proto->unhash),
3306 proto_method_implemented(proto->get_port),
3307 proto_method_implemented(proto->enter_memory_pressure));
3308}
3309
3310static int proto_seq_show(struct seq_file *seq, void *v)
3311{
60f0438a 3312 if (v == &proto_list)
1da177e4
LT
3313 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3314 "protocol",
3315 "size",
3316 "sockets",
3317 "memory",
3318 "press",
3319 "maxhdr",
3320 "slab",
3321 "module",
3322 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3323 else
60f0438a 3324 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3325 return 0;
3326}
3327
f690808e 3328static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3329 .start = proto_seq_start,
3330 .next = proto_seq_next,
3331 .stop = proto_seq_stop,
3332 .show = proto_seq_show,
3333};
3334
3335static int proto_seq_open(struct inode *inode, struct file *file)
3336{
14e943db
ED
3337 return seq_open_net(inode, file, &proto_seq_ops,
3338 sizeof(struct seq_net_private));
1da177e4
LT
3339}
3340
9a32144e 3341static const struct file_operations proto_seq_fops = {
1da177e4
LT
3342 .owner = THIS_MODULE,
3343 .open = proto_seq_open,
3344 .read = seq_read,
3345 .llseek = seq_lseek,
14e943db
ED
3346 .release = seq_release_net,
3347};
3348
3349static __net_init int proto_init_net(struct net *net)
3350{
d4beaa66 3351 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
14e943db
ED
3352 return -ENOMEM;
3353
3354 return 0;
3355}
3356
3357static __net_exit void proto_exit_net(struct net *net)
3358{
ece31ffd 3359 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3360}
3361
3362
3363static __net_initdata struct pernet_operations proto_net_ops = {
3364 .init = proto_init_net,
3365 .exit = proto_exit_net,
1da177e4
LT
3366};
3367
3368static int __init proto_init(void)
3369{
14e943db 3370 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3371}
3372
3373subsys_initcall(proto_init);
3374
3375#endif /* PROC_FS */
7db6b048
SS
3376
3377#ifdef CONFIG_NET_RX_BUSY_POLL
3378bool sk_busy_loop_end(void *p, unsigned long start_time)
3379{
3380 struct sock *sk = p;
3381
3382 return !skb_queue_empty(&sk->sk_receive_queue) ||
3383 sk_busy_loop_timeout(sk, start_time);
3384}
3385EXPORT_SYMBOL(sk_busy_loop_end);
3386#endif /* CONFIG_NET_RX_BUSY_POLL */