inet: prevent leakage of uninitialized memory to user in recv syscalls
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
1da177e4
LT
91
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
2787b04b
PE
96#include "internal.h"
97
1da177e4
LT
98/*
99 Assumptions:
100 - if device has no dev->hard_header routine, it adds and removes ll header
101 inside itself. In this case ll header is invisible outside of device,
102 but higher levels still should reserve dev->hard_header_len.
103 Some devices are enough clever to reallocate skb, when header
104 will not fit to reserved space (tunnel), another ones are silly
105 (PPP).
106 - packet socket receives packets with pulled ll header,
107 so that SOCK_RAW should push it back.
108
109On receive:
110-----------
111
112Incoming, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> data
1da177e4
LT
115
116Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
117 mac_header -> ll header
118 data -> ll header
1da177e4
LT
119
120Incoming, dev->hard_header==NULL
b0e380b1
ACM
121 mac_header -> UNKNOWN position. It is very likely, that it points to ll
122 header. PPP makes it, that is wrong, because introduce
db0c58f9 123 assymetry between rx and tx paths.
b0e380b1 124 data -> data
1da177e4
LT
125
126Outgoing, dev->hard_header==NULL
b0e380b1
ACM
127 mac_header -> data. ll header is still not built!
128 data -> data
1da177e4
LT
129
130Resume
131 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
132
133
134On transmit:
135------------
136
137dev->hard_header != NULL
b0e380b1
ACM
138 mac_header -> ll header
139 data -> ll header
1da177e4
LT
140
141dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
142 mac_header -> data
143 data -> data
1da177e4
LT
144
145 We should set nh.raw on output to correct posistion,
146 packet classifier depends on it.
147 */
148
1da177e4
LT
149/* Private packet socket structures. */
150
0fb375fb
EB
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
40d4e3df 154struct packet_mreq_max {
0fb375fb
EB
155 int mr_ifindex;
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 159};
a2efcfa0 160
184f489e
DB
161union tpacket_uhdr {
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
165 void *raw;
166};
167
f6fb8f10 168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
169 int closing, int tx_ring);
170
f6fb8f10 171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
f6fb8f10 178#define PGV_FROM_VMALLOC 1
69e3c75f 179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f
JB
188struct packet_sock;
189static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
77f65ebd
WB
190static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 192
f6fb8f10 193static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
195 int status);
196static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 197static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *);
bc59ba39 201static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 202 struct packet_sock *, unsigned int status);
bc59ba39 203static int prb_queue_frozen(struct tpacket_kbdq_core *);
204static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
f6fb8f10 206static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 207static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
1da177e4
LT
216static void packet_flush_mclist(struct sock *sk);
217
ffbc6111
HX
218struct packet_skb_cb {
219 unsigned int origlen;
220 union {
221 struct sockaddr_pkt pkt;
222 struct sockaddr_ll ll;
223 } sa;
224};
225
226#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 227
bc59ba39 228#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 229#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 231#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 233#define GET_NEXT_PRB_BLK_NUM(x) \
234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
235 ((x)->kactive_blk_num+1) : 0)
236
dc99f600
DM
237static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238static void __fanout_link(struct sock *sk, struct packet_sock *po);
239
ce06b03e
DM
240/* register_prot_hook must be invoked with the po->bind_lock held,
241 * or from a context in which asynchronous accesses to the packet
242 * socket is not possible (packet_create()).
243 */
244static void register_prot_hook(struct sock *sk)
245{
246 struct packet_sock *po = pkt_sk(sk);
247 if (!po->running) {
dc99f600
DM
248 if (po->fanout)
249 __fanout_link(sk, po);
250 else
251 dev_add_pack(&po->prot_hook);
ce06b03e
DM
252 sock_hold(sk);
253 po->running = 1;
254 }
255}
256
257/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
258 * held. If the sync parameter is true, we will temporarily drop
259 * the po->bind_lock and do a synchronize_net to make sure no
260 * asynchronous packet processing paths still refer to the elements
261 * of po->prot_hook. If the sync parameter is false, it is the
262 * callers responsibility to take care of this.
263 */
264static void __unregister_prot_hook(struct sock *sk, bool sync)
265{
266 struct packet_sock *po = pkt_sk(sk);
267
268 po->running = 0;
dc99f600
DM
269 if (po->fanout)
270 __fanout_unlink(sk, po);
271 else
272 __dev_remove_pack(&po->prot_hook);
ce06b03e
DM
273 __sock_put(sk);
274
275 if (sync) {
276 spin_unlock(&po->bind_lock);
277 synchronize_net();
278 spin_lock(&po->bind_lock);
279 }
280}
281
282static void unregister_prot_hook(struct sock *sk, bool sync)
283{
284 struct packet_sock *po = pkt_sk(sk);
285
286 if (po->running)
287 __unregister_prot_hook(sk, sync);
288}
289
f6dafa95 290static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
291{
292 if (is_vmalloc_addr(addr))
293 return vmalloc_to_page(addr);
294 return virt_to_page(addr);
295}
296
69e3c75f 297static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 298{
184f489e 299 union tpacket_uhdr h;
1da177e4 300
69e3c75f 301 h.raw = frame;
bbd6ef87
PM
302 switch (po->tp_version) {
303 case TPACKET_V1:
69e3c75f 304 h.h1->tp_status = status;
0af55bb5 305 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
306 break;
307 case TPACKET_V2:
69e3c75f 308 h.h2->tp_status = status;
0af55bb5 309 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 310 break;
f6fb8f10 311 case TPACKET_V3:
69e3c75f 312 default:
f6fb8f10 313 WARN(1, "TPACKET version not supported.\n");
69e3c75f 314 BUG();
bbd6ef87 315 }
69e3c75f
JB
316
317 smp_wmb();
bbd6ef87
PM
318}
319
69e3c75f 320static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 321{
184f489e 322 union tpacket_uhdr h;
bbd6ef87 323
69e3c75f
JB
324 smp_rmb();
325
bbd6ef87
PM
326 h.raw = frame;
327 switch (po->tp_version) {
328 case TPACKET_V1:
0af55bb5 329 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 330 return h.h1->tp_status;
bbd6ef87 331 case TPACKET_V2:
0af55bb5 332 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 333 return h.h2->tp_status;
f6fb8f10 334 case TPACKET_V3:
69e3c75f 335 default:
f6fb8f10 336 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
337 BUG();
338 return 0;
bbd6ef87 339 }
1da177e4 340}
69e3c75f 341
b9c32fb2
DB
342static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
343 unsigned int flags)
7a51384c
DB
344{
345 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
346
347 if (shhwtstamps) {
348 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
349 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
b9c32fb2 350 return TP_STATUS_TS_SYS_HARDWARE;
7a51384c
DB
351 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
352 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
b9c32fb2 353 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
354 }
355
356 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 357 return TP_STATUS_TS_SOFTWARE;
7a51384c 358
b9c32fb2 359 return 0;
7a51384c
DB
360}
361
b9c32fb2
DB
362static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
363 struct sk_buff *skb)
2e31396f
WB
364{
365 union tpacket_uhdr h;
366 struct timespec ts;
b9c32fb2 367 __u32 ts_status;
2e31396f 368
b9c32fb2
DB
369 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
370 return 0;
2e31396f
WB
371
372 h.raw = frame;
373 switch (po->tp_version) {
374 case TPACKET_V1:
375 h.h1->tp_sec = ts.tv_sec;
376 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
377 break;
378 case TPACKET_V2:
379 h.h2->tp_sec = ts.tv_sec;
380 h.h2->tp_nsec = ts.tv_nsec;
381 break;
382 case TPACKET_V3:
383 default:
384 WARN(1, "TPACKET version not supported.\n");
385 BUG();
386 }
387
388 /* one flush is safe, as both fields always lie on the same cacheline */
389 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
390 smp_wmb();
b9c32fb2
DB
391
392 return ts_status;
2e31396f
WB
393}
394
69e3c75f
JB
395static void *packet_lookup_frame(struct packet_sock *po,
396 struct packet_ring_buffer *rb,
397 unsigned int position,
398 int status)
399{
400 unsigned int pg_vec_pos, frame_offset;
184f489e 401 union tpacket_uhdr h;
69e3c75f
JB
402
403 pg_vec_pos = position / rb->frames_per_block;
404 frame_offset = position % rb->frames_per_block;
405
0e3125c7
NH
406 h.raw = rb->pg_vec[pg_vec_pos].buffer +
407 (frame_offset * rb->frame_size);
69e3c75f
JB
408
409 if (status != __packet_get_status(po, h.raw))
410 return NULL;
411
412 return h.raw;
413}
414
eea49cc9 415static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
416 struct packet_ring_buffer *rb,
417 int status)
418{
419 return packet_lookup_frame(po, rb, rb->head, status);
420}
421
bc59ba39 422static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 423{
424 del_timer_sync(&pkc->retire_blk_timer);
425}
426
427static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
428 int tx_ring,
429 struct sk_buff_head *rb_queue)
430{
bc59ba39 431 struct tpacket_kbdq_core *pkc;
f6fb8f10 432
433 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
434
435 spin_lock(&rb_queue->lock);
436 pkc->delete_blk_timer = 1;
437 spin_unlock(&rb_queue->lock);
438
439 prb_del_retire_blk_timer(pkc);
440}
441
442static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 443 struct tpacket_kbdq_core *pkc,
f6fb8f10 444 void (*func) (unsigned long))
445{
446 init_timer(&pkc->retire_blk_timer);
447 pkc->retire_blk_timer.data = (long)po;
448 pkc->retire_blk_timer.function = func;
449 pkc->retire_blk_timer.expires = jiffies;
450}
451
452static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
453{
bc59ba39 454 struct tpacket_kbdq_core *pkc;
f6fb8f10 455
456 if (tx_ring)
457 BUG();
458
459 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
460 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
461}
462
463static int prb_calc_retire_blk_tmo(struct packet_sock *po,
464 int blk_size_in_bytes)
465{
466 struct net_device *dev;
467 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
468 struct ethtool_cmd ecmd;
469 int err;
e440cf2c 470 u32 speed;
f6fb8f10 471
4bc71cb9
JP
472 rtnl_lock();
473 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
474 if (unlikely(!dev)) {
475 rtnl_unlock();
f6fb8f10 476 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
477 }
478 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 479 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
480 rtnl_unlock();
481 if (!err) {
4bc71cb9
JP
482 /*
483 * If the link speed is so slow you don't really
484 * need to worry about perf anyways
485 */
e440cf2c 486 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 487 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 488 } else {
489 msec = 1;
490 div = speed / 1000;
f6fb8f10 491 }
492 }
493
494 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
495
496 if (div)
497 mbits /= div;
498
499 tmo = mbits * msec;
500
501 if (div)
502 return tmo+1;
503 return tmo;
504}
505
bc59ba39 506static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 507 union tpacket_req_u *req_u)
508{
509 p1->feature_req_word = req_u->req3.tp_feature_req_word;
510}
511
512static void init_prb_bdqc(struct packet_sock *po,
513 struct packet_ring_buffer *rb,
514 struct pgv *pg_vec,
515 union tpacket_req_u *req_u, int tx_ring)
516{
bc59ba39 517 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
518 struct tpacket_block_desc *pbd;
f6fb8f10 519
520 memset(p1, 0x0, sizeof(*p1));
521
522 p1->knxt_seq_num = 1;
523 p1->pkbdq = pg_vec;
bc59ba39 524 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 525 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 526 p1->kblk_size = req_u->req3.tp_block_size;
527 p1->knum_blocks = req_u->req3.tp_block_nr;
528 p1->hdrlen = po->tp_hdrlen;
529 p1->version = po->tp_version;
530 p1->last_kactive_blk_num = 0;
ee80fbf3 531 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 532 if (req_u->req3.tp_retire_blk_tov)
533 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
534 else
535 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
536 req_u->req3.tp_block_size);
537 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
538 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
539
540 prb_init_ft_ops(p1, req_u);
541 prb_setup_retire_blk_timer(po, tx_ring);
542 prb_open_block(p1, pbd);
543}
544
545/* Do NOT update the last_blk_num first.
546 * Assumes sk_buff_head lock is held.
547 */
bc59ba39 548static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 549{
550 mod_timer(&pkc->retire_blk_timer,
551 jiffies + pkc->tov_in_jiffies);
552 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
553}
554
555/*
556 * Timer logic:
557 * 1) We refresh the timer only when we open a block.
558 * By doing this we don't waste cycles refreshing the timer
559 * on packet-by-packet basis.
560 *
561 * With a 1MB block-size, on a 1Gbps line, it will take
562 * i) ~8 ms to fill a block + ii) memcpy etc.
563 * In this cut we are not accounting for the memcpy time.
564 *
565 * So, if the user sets the 'tmo' to 10ms then the timer
566 * will never fire while the block is still getting filled
567 * (which is what we want). However, the user could choose
568 * to close a block early and that's fine.
569 *
570 * But when the timer does fire, we check whether or not to refresh it.
571 * Since the tmo granularity is in msecs, it is not too expensive
572 * to refresh the timer, lets say every '8' msecs.
573 * Either the user can set the 'tmo' or we can derive it based on
574 * a) line-speed and b) block-size.
575 * prb_calc_retire_blk_tmo() calculates the tmo.
576 *
577 */
578static void prb_retire_rx_blk_timer_expired(unsigned long data)
579{
580 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 581 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 582 unsigned int frozen;
bc59ba39 583 struct tpacket_block_desc *pbd;
f6fb8f10 584
585 spin_lock(&po->sk.sk_receive_queue.lock);
586
587 frozen = prb_queue_frozen(pkc);
588 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
589
590 if (unlikely(pkc->delete_blk_timer))
591 goto out;
592
593 /* We only need to plug the race when the block is partially filled.
594 * tpacket_rcv:
595 * lock(); increment BLOCK_NUM_PKTS; unlock()
596 * copy_bits() is in progress ...
597 * timer fires on other cpu:
598 * we can't retire the current block because copy_bits
599 * is in progress.
600 *
601 */
602 if (BLOCK_NUM_PKTS(pbd)) {
603 while (atomic_read(&pkc->blk_fill_in_prog)) {
604 /* Waiting for skb_copy_bits to finish... */
605 cpu_relax();
606 }
607 }
608
609 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
610 if (!frozen) {
611 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
612 if (!prb_dispatch_next_block(pkc, po))
613 goto refresh_timer;
614 else
615 goto out;
616 } else {
617 /* Case 1. Queue was frozen because user-space was
618 * lagging behind.
619 */
620 if (prb_curr_blk_in_use(pkc, pbd)) {
621 /*
622 * Ok, user-space is still behind.
623 * So just refresh the timer.
624 */
625 goto refresh_timer;
626 } else {
627 /* Case 2. queue was frozen,user-space caught up,
628 * now the link went idle && the timer fired.
629 * We don't have a block to close.So we open this
630 * block and restart the timer.
631 * opening a block thaws the queue,restarts timer
632 * Thawing/timer-refresh is a side effect.
633 */
634 prb_open_block(pkc, pbd);
635 goto out;
636 }
637 }
638 }
639
640refresh_timer:
641 _prb_refresh_rx_retire_blk_timer(pkc);
642
643out:
644 spin_unlock(&po->sk.sk_receive_queue.lock);
645}
646
eea49cc9 647static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 648 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 649{
650 /* Flush everything minus the block header */
651
652#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
653 u8 *start, *end;
654
655 start = (u8 *)pbd1;
656
657 /* Skip the block header(we know header WILL fit in 4K) */
658 start += PAGE_SIZE;
659
660 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
661 for (; start < end; start += PAGE_SIZE)
662 flush_dcache_page(pgv_to_page(start));
663
664 smp_wmb();
665#endif
666
667 /* Now update the block status. */
668
669 BLOCK_STATUS(pbd1) = status;
670
671 /* Flush the block header */
672
673#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
674 start = (u8 *)pbd1;
675 flush_dcache_page(pgv_to_page(start));
676
677 smp_wmb();
678#endif
679}
680
681/*
682 * Side effect:
683 *
684 * 1) flush the block
685 * 2) Increment active_blk_num
686 *
687 * Note:We DONT refresh the timer on purpose.
688 * Because almost always the next block will be opened.
689 */
bc59ba39 690static void prb_close_block(struct tpacket_kbdq_core *pkc1,
691 struct tpacket_block_desc *pbd1,
f6fb8f10 692 struct packet_sock *po, unsigned int stat)
693{
694 __u32 status = TP_STATUS_USER | stat;
695
696 struct tpacket3_hdr *last_pkt;
bc59ba39 697 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 698
ee80fbf3 699 if (po->stats.stats3.tp_drops)
f6fb8f10 700 status |= TP_STATUS_LOSING;
701
702 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
703 last_pkt->tp_next_offset = 0;
704
705 /* Get the ts of the last pkt */
706 if (BLOCK_NUM_PKTS(pbd1)) {
707 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
708 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
709 } else {
710 /* Ok, we tmo'd - so get the current time */
711 struct timespec ts;
712 getnstimeofday(&ts);
713 h1->ts_last_pkt.ts_sec = ts.tv_sec;
714 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
715 }
716
717 smp_wmb();
718
719 /* Flush the block */
720 prb_flush_block(pkc1, pbd1, status);
721
722 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
723}
724
eea49cc9 725static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 726{
727 pkc->reset_pending_on_curr_blk = 0;
728}
729
730/*
731 * Side effect of opening a block:
732 *
733 * 1) prb_queue is thawed.
734 * 2) retire_blk_timer is refreshed.
735 *
736 */
bc59ba39 737static void prb_open_block(struct tpacket_kbdq_core *pkc1,
738 struct tpacket_block_desc *pbd1)
f6fb8f10 739{
740 struct timespec ts;
bc59ba39 741 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 742
743 smp_rmb();
744
8da3056c
DB
745 /* We could have just memset this but we will lose the
746 * flexibility of making the priv area sticky
747 */
f6fb8f10 748
8da3056c
DB
749 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
750 BLOCK_NUM_PKTS(pbd1) = 0;
751 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 752
8da3056c
DB
753 getnstimeofday(&ts);
754
755 h1->ts_first_pkt.ts_sec = ts.tv_sec;
756 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 757
8da3056c
DB
758 pkc1->pkblk_start = (char *)pbd1;
759 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
760
761 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
762 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
763
764 pbd1->version = pkc1->version;
765 pkc1->prev = pkc1->nxt_offset;
766 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
767
768 prb_thaw_queue(pkc1);
769 _prb_refresh_rx_retire_blk_timer(pkc1);
770
771 smp_wmb();
f6fb8f10 772}
773
774/*
775 * Queue freeze logic:
776 * 1) Assume tp_block_nr = 8 blocks.
777 * 2) At time 't0', user opens Rx ring.
778 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
779 * 4) user-space is either sleeping or processing block '0'.
780 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
781 * it will close block-7,loop around and try to fill block '0'.
782 * call-flow:
783 * __packet_lookup_frame_in_block
784 * prb_retire_current_block()
785 * prb_dispatch_next_block()
786 * |->(BLOCK_STATUS == USER) evaluates to true
787 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
788 * 6) Now there are two cases:
789 * 6.1) Link goes idle right after the queue is frozen.
790 * But remember, the last open_block() refreshed the timer.
791 * When this timer expires,it will refresh itself so that we can
792 * re-open block-0 in near future.
793 * 6.2) Link is busy and keeps on receiving packets. This is a simple
794 * case and __packet_lookup_frame_in_block will check if block-0
795 * is free and can now be re-used.
796 */
eea49cc9 797static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 798 struct packet_sock *po)
799{
800 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 801 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 802}
803
804#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
805
806/*
807 * If the next block is free then we will dispatch it
808 * and return a good offset.
809 * Else, we will freeze the queue.
810 * So, caller must check the return value.
811 */
bc59ba39 812static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 813 struct packet_sock *po)
814{
bc59ba39 815 struct tpacket_block_desc *pbd;
f6fb8f10 816
817 smp_rmb();
818
819 /* 1. Get current block num */
820 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
821
822 /* 2. If this block is currently in_use then freeze the queue */
823 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
824 prb_freeze_queue(pkc, po);
825 return NULL;
826 }
827
828 /*
829 * 3.
830 * open this block and return the offset where the first packet
831 * needs to get stored.
832 */
833 prb_open_block(pkc, pbd);
834 return (void *)pkc->nxt_offset;
835}
836
bc59ba39 837static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 838 struct packet_sock *po, unsigned int status)
839{
bc59ba39 840 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 841
842 /* retire/close the current block */
843 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
844 /*
845 * Plug the case where copy_bits() is in progress on
846 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
847 * have space to copy the pkt in the current block and
848 * called prb_retire_current_block()
849 *
850 * We don't need to worry about the TMO case because
851 * the timer-handler already handled this case.
852 */
853 if (!(status & TP_STATUS_BLK_TMO)) {
854 while (atomic_read(&pkc->blk_fill_in_prog)) {
855 /* Waiting for skb_copy_bits to finish... */
856 cpu_relax();
857 }
858 }
859 prb_close_block(pkc, pbd, po, status);
860 return;
861 }
f6fb8f10 862}
863
eea49cc9 864static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 865 struct tpacket_block_desc *pbd)
f6fb8f10 866{
867 return TP_STATUS_USER & BLOCK_STATUS(pbd);
868}
869
eea49cc9 870static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 871{
872 return pkc->reset_pending_on_curr_blk;
873}
874
eea49cc9 875static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 876{
bc59ba39 877 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 878 atomic_dec(&pkc->blk_fill_in_prog);
879}
880
eea49cc9 881static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 882 struct tpacket3_hdr *ppd)
883{
884 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
885}
886
eea49cc9 887static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 888 struct tpacket3_hdr *ppd)
889{
890 ppd->hv1.tp_rxhash = 0;
891}
892
eea49cc9 893static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 894 struct tpacket3_hdr *ppd)
895{
896 if (vlan_tx_tag_present(pkc->skb)) {
897 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
898 ppd->tp_status = TP_STATUS_VLAN_VALID;
899 } else {
9e67030a 900 ppd->hv1.tp_vlan_tci = 0;
901 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 902 }
903}
904
bc59ba39 905static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 906 struct tpacket3_hdr *ppd)
907{
908 prb_fill_vlan_info(pkc, ppd);
909
910 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
911 prb_fill_rxhash(pkc, ppd);
912 else
913 prb_clear_rxhash(pkc, ppd);
914}
915
eea49cc9 916static void prb_fill_curr_block(char *curr,
bc59ba39 917 struct tpacket_kbdq_core *pkc,
918 struct tpacket_block_desc *pbd,
f6fb8f10 919 unsigned int len)
920{
921 struct tpacket3_hdr *ppd;
922
923 ppd = (struct tpacket3_hdr *)curr;
924 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
925 pkc->prev = curr;
926 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
927 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
928 BLOCK_NUM_PKTS(pbd) += 1;
929 atomic_inc(&pkc->blk_fill_in_prog);
930 prb_run_all_ft_ops(pkc, ppd);
931}
932
933/* Assumes caller has the sk->rx_queue.lock */
934static void *__packet_lookup_frame_in_block(struct packet_sock *po,
935 struct sk_buff *skb,
936 int status,
937 unsigned int len
938 )
939{
bc59ba39 940 struct tpacket_kbdq_core *pkc;
941 struct tpacket_block_desc *pbd;
f6fb8f10 942 char *curr, *end;
943
e3192690 944 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 945 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
946
947 /* Queue is frozen when user space is lagging behind */
948 if (prb_queue_frozen(pkc)) {
949 /*
950 * Check if that last block which caused the queue to freeze,
951 * is still in_use by user-space.
952 */
953 if (prb_curr_blk_in_use(pkc, pbd)) {
954 /* Can't record this packet */
955 return NULL;
956 } else {
957 /*
958 * Ok, the block was released by user-space.
959 * Now let's open that block.
960 * opening a block also thaws the queue.
961 * Thawing is a side effect.
962 */
963 prb_open_block(pkc, pbd);
964 }
965 }
966
967 smp_mb();
968 curr = pkc->nxt_offset;
969 pkc->skb = skb;
e3192690 970 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 971
972 /* first try the current block */
973 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
974 prb_fill_curr_block(curr, pkc, pbd, len);
975 return (void *)curr;
976 }
977
978 /* Ok, close the current block */
979 prb_retire_current_block(pkc, po, 0);
980
981 /* Now, try to dispatch the next block */
982 curr = (char *)prb_dispatch_next_block(pkc, po);
983 if (curr) {
984 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
985 prb_fill_curr_block(curr, pkc, pbd, len);
986 return (void *)curr;
987 }
988
989 /*
990 * No free blocks are available.user_space hasn't caught up yet.
991 * Queue was just frozen and now this packet will get dropped.
992 */
993 return NULL;
994}
995
eea49cc9 996static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 997 struct sk_buff *skb,
998 int status, unsigned int len)
999{
1000 char *curr = NULL;
1001 switch (po->tp_version) {
1002 case TPACKET_V1:
1003 case TPACKET_V2:
1004 curr = packet_lookup_frame(po, &po->rx_ring,
1005 po->rx_ring.head, status);
1006 return curr;
1007 case TPACKET_V3:
1008 return __packet_lookup_frame_in_block(po, skb, status, len);
1009 default:
1010 WARN(1, "TPACKET version not supported\n");
1011 BUG();
99aa3473 1012 return NULL;
f6fb8f10 1013 }
1014}
1015
eea49cc9 1016static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1017 struct packet_ring_buffer *rb,
77f65ebd 1018 unsigned int idx,
f6fb8f10 1019 int status)
1020{
bc59ba39 1021 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1022 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1023
1024 if (status != BLOCK_STATUS(pbd))
1025 return NULL;
1026 return pbd;
1027}
1028
eea49cc9 1029static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1030{
1031 unsigned int prev;
1032 if (rb->prb_bdqc.kactive_blk_num)
1033 prev = rb->prb_bdqc.kactive_blk_num-1;
1034 else
1035 prev = rb->prb_bdqc.knum_blocks-1;
1036 return prev;
1037}
1038
1039/* Assumes caller has held the rx_queue.lock */
eea49cc9 1040static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1041 struct packet_ring_buffer *rb,
1042 int status)
1043{
1044 unsigned int previous = prb_previous_blk_num(rb);
1045 return prb_lookup_block(po, rb, previous, status);
1046}
1047
eea49cc9 1048static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1049 struct packet_ring_buffer *rb,
1050 int status)
1051{
1052 if (po->tp_version <= TPACKET_V2)
1053 return packet_previous_frame(po, rb, status);
1054
1055 return __prb_previous_block(po, rb, status);
1056}
1057
eea49cc9 1058static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1059 struct packet_ring_buffer *rb)
1060{
1061 switch (po->tp_version) {
1062 case TPACKET_V1:
1063 case TPACKET_V2:
1064 return packet_increment_head(rb);
1065 case TPACKET_V3:
1066 default:
1067 WARN(1, "TPACKET version not supported.\n");
1068 BUG();
1069 return;
1070 }
1071}
1072
eea49cc9 1073static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1074 struct packet_ring_buffer *rb,
1075 int status)
1076{
1077 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1078 return packet_lookup_frame(po, rb, previous, status);
1079}
1080
eea49cc9 1081static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1082{
1083 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1084}
1085
77f65ebd
WB
1086static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1087{
1088 struct sock *sk = &po->sk;
1089 bool has_room;
1090
1091 if (po->prot_hook.func != tpacket_rcv)
1092 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1093 <= sk->sk_rcvbuf;
1094
1095 spin_lock(&sk->sk_receive_queue.lock);
1096 if (po->tp_version == TPACKET_V3)
1097 has_room = prb_lookup_block(po, &po->rx_ring,
1098 po->rx_ring.prb_bdqc.kactive_blk_num,
1099 TP_STATUS_KERNEL);
1100 else
1101 has_room = packet_lookup_frame(po, &po->rx_ring,
1102 po->rx_ring.head,
1103 TP_STATUS_KERNEL);
1104 spin_unlock(&sk->sk_receive_queue.lock);
1105
1106 return has_room;
1107}
1108
1da177e4
LT
1109static void packet_sock_destruct(struct sock *sk)
1110{
ed85b565
RC
1111 skb_queue_purge(&sk->sk_error_queue);
1112
547b792c
IJ
1113 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1114 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1115
1116 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1117 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1118 return;
1119 }
1120
17ab56a2 1121 sk_refcnt_debug_dec(sk);
1da177e4
LT
1122}
1123
dc99f600
DM
1124static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1125{
1126 int x = atomic_read(&f->rr_cur) + 1;
1127
1128 if (x >= num)
1129 x = 0;
1130
1131 return x;
1132}
1133
77f65ebd
WB
1134static unsigned int fanout_demux_hash(struct packet_fanout *f,
1135 struct sk_buff *skb,
1136 unsigned int num)
dc99f600 1137{
77f65ebd 1138 return (((u64)skb->rxhash) * num) >> 32;
dc99f600
DM
1139}
1140
77f65ebd
WB
1141static unsigned int fanout_demux_lb(struct packet_fanout *f,
1142 struct sk_buff *skb,
1143 unsigned int num)
dc99f600
DM
1144{
1145 int cur, old;
1146
1147 cur = atomic_read(&f->rr_cur);
1148 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1149 fanout_rr_next(f, num))) != cur)
1150 cur = old;
77f65ebd
WB
1151 return cur;
1152}
1153
1154static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1155 struct sk_buff *skb,
1156 unsigned int num)
1157{
1158 return smp_processor_id() % num;
dc99f600
DM
1159}
1160
77f65ebd
WB
1161static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1162 struct sk_buff *skb,
1163 unsigned int idx, unsigned int skip,
1164 unsigned int num)
95ec3eb4 1165{
77f65ebd 1166 unsigned int i, j;
95ec3eb4 1167
77f65ebd
WB
1168 i = j = min_t(int, f->next[idx], num - 1);
1169 do {
1170 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1171 if (i != j)
1172 f->next[idx] = i;
1173 return i;
1174 }
1175 if (++i == num)
1176 i = 0;
1177 } while (i != j);
1178
1179 return idx;
1180}
1181
1182static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1183{
1184 return f->flags & (flag >> 8);
95ec3eb4
DM
1185}
1186
95ec3eb4
DM
1187static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1188 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1189{
1190 struct packet_fanout *f = pt->af_packet_priv;
1191 unsigned int num = f->num_members;
1192 struct packet_sock *po;
77f65ebd 1193 unsigned int idx;
dc99f600
DM
1194
1195 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1196 !num) {
1197 kfree_skb(skb);
1198 return 0;
1199 }
1200
95ec3eb4
DM
1201 switch (f->type) {
1202 case PACKET_FANOUT_HASH:
1203 default:
77f65ebd 1204 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1205 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1206 if (!skb)
1207 return 0;
1208 }
1209 skb_get_rxhash(skb);
77f65ebd 1210 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1211 break;
1212 case PACKET_FANOUT_LB:
77f65ebd 1213 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1214 break;
1215 case PACKET_FANOUT_CPU:
77f65ebd
WB
1216 idx = fanout_demux_cpu(f, skb, num);
1217 break;
1218 case PACKET_FANOUT_ROLLOVER:
1219 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1220 break;
dc99f600
DM
1221 }
1222
77f65ebd
WB
1223 po = pkt_sk(f->arr[idx]);
1224 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1225 unlikely(!packet_rcv_has_room(po, skb))) {
1226 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1227 po = pkt_sk(f->arr[idx]);
1228 }
dc99f600
DM
1229
1230 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1231}
1232
fff3321d
PE
1233DEFINE_MUTEX(fanout_mutex);
1234EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1235static LIST_HEAD(fanout_list);
1236
1237static void __fanout_link(struct sock *sk, struct packet_sock *po)
1238{
1239 struct packet_fanout *f = po->fanout;
1240
1241 spin_lock(&f->lock);
1242 f->arr[f->num_members] = sk;
1243 smp_wmb();
1244 f->num_members++;
1245 spin_unlock(&f->lock);
1246}
1247
1248static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1249{
1250 struct packet_fanout *f = po->fanout;
1251 int i;
1252
1253 spin_lock(&f->lock);
1254 for (i = 0; i < f->num_members; i++) {
1255 if (f->arr[i] == sk)
1256 break;
1257 }
1258 BUG_ON(i >= f->num_members);
1259 f->arr[i] = f->arr[f->num_members - 1];
1260 f->num_members--;
1261 spin_unlock(&f->lock);
1262}
1263
a0dfb263 1264static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
c0de08d0
EL
1265{
1266 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1267 return true;
1268
1269 return false;
1270}
1271
7736d33f 1272static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1273{
1274 struct packet_sock *po = pkt_sk(sk);
1275 struct packet_fanout *f, *match;
7736d33f 1276 u8 type = type_flags & 0xff;
77f65ebd 1277 u8 flags = type_flags >> 8;
dc99f600
DM
1278 int err;
1279
1280 switch (type) {
77f65ebd
WB
1281 case PACKET_FANOUT_ROLLOVER:
1282 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1283 return -EINVAL;
dc99f600
DM
1284 case PACKET_FANOUT_HASH:
1285 case PACKET_FANOUT_LB:
95ec3eb4 1286 case PACKET_FANOUT_CPU:
dc99f600
DM
1287 break;
1288 default:
1289 return -EINVAL;
1290 }
1291
1292 if (!po->running)
1293 return -EINVAL;
1294
1295 if (po->fanout)
1296 return -EALREADY;
1297
1298 mutex_lock(&fanout_mutex);
1299 match = NULL;
1300 list_for_each_entry(f, &fanout_list, list) {
1301 if (f->id == id &&
1302 read_pnet(&f->net) == sock_net(sk)) {
1303 match = f;
1304 break;
1305 }
1306 }
afe62c68 1307 err = -EINVAL;
77f65ebd 1308 if (match && match->flags != flags)
afe62c68 1309 goto out;
dc99f600 1310 if (!match) {
afe62c68 1311 err = -ENOMEM;
dc99f600 1312 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1313 if (!match)
1314 goto out;
1315 write_pnet(&match->net, sock_net(sk));
1316 match->id = id;
1317 match->type = type;
77f65ebd 1318 match->flags = flags;
afe62c68
ED
1319 atomic_set(&match->rr_cur, 0);
1320 INIT_LIST_HEAD(&match->list);
1321 spin_lock_init(&match->lock);
1322 atomic_set(&match->sk_ref, 0);
1323 match->prot_hook.type = po->prot_hook.type;
1324 match->prot_hook.dev = po->prot_hook.dev;
1325 match->prot_hook.func = packet_rcv_fanout;
1326 match->prot_hook.af_packet_priv = match;
c0de08d0 1327 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1328 dev_add_pack(&match->prot_hook);
1329 list_add(&match->list, &fanout_list);
dc99f600 1330 }
afe62c68
ED
1331 err = -EINVAL;
1332 if (match->type == type &&
1333 match->prot_hook.type == po->prot_hook.type &&
1334 match->prot_hook.dev == po->prot_hook.dev) {
1335 err = -ENOSPC;
1336 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1337 __dev_remove_pack(&po->prot_hook);
1338 po->fanout = match;
1339 atomic_inc(&match->sk_ref);
1340 __fanout_link(sk, po);
1341 err = 0;
dc99f600
DM
1342 }
1343 }
afe62c68 1344out:
dc99f600
DM
1345 mutex_unlock(&fanout_mutex);
1346 return err;
1347}
1348
1349static void fanout_release(struct sock *sk)
1350{
1351 struct packet_sock *po = pkt_sk(sk);
1352 struct packet_fanout *f;
1353
1354 f = po->fanout;
1355 if (!f)
1356 return;
1357
fff3321d 1358 mutex_lock(&fanout_mutex);
dc99f600
DM
1359 po->fanout = NULL;
1360
dc99f600
DM
1361 if (atomic_dec_and_test(&f->sk_ref)) {
1362 list_del(&f->list);
1363 dev_remove_pack(&f->prot_hook);
1364 kfree(f);
1365 }
1366 mutex_unlock(&fanout_mutex);
1367}
1da177e4 1368
90ddc4f0 1369static const struct proto_ops packet_ops;
1da177e4 1370
90ddc4f0 1371static const struct proto_ops packet_ops_spkt;
1da177e4 1372
40d4e3df
ED
1373static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1374 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1375{
1376 struct sock *sk;
1377 struct sockaddr_pkt *spkt;
1378
1379 /*
1380 * When we registered the protocol we saved the socket in the data
1381 * field for just this event.
1382 */
1383
1384 sk = pt->af_packet_priv;
1ce4f28b 1385
1da177e4
LT
1386 /*
1387 * Yank back the headers [hope the device set this
1388 * right or kerboom...]
1389 *
1390 * Incoming packets have ll header pulled,
1391 * push it back.
1392 *
98e399f8 1393 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1394 * so that this procedure is noop.
1395 */
1396
1397 if (skb->pkt_type == PACKET_LOOPBACK)
1398 goto out;
1399
09ad9bc7 1400 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1401 goto out;
1402
40d4e3df
ED
1403 skb = skb_share_check(skb, GFP_ATOMIC);
1404 if (skb == NULL)
1da177e4
LT
1405 goto oom;
1406
1407 /* drop any routing info */
adf30907 1408 skb_dst_drop(skb);
1da177e4 1409
84531c24
PO
1410 /* drop conntrack reference */
1411 nf_reset(skb);
1412
ffbc6111 1413 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1414
98e399f8 1415 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1416
1417 /*
1418 * The SOCK_PACKET socket receives _all_ frames.
1419 */
1420
1421 spkt->spkt_family = dev->type;
1422 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1423 spkt->spkt_protocol = skb->protocol;
1424
1425 /*
1426 * Charge the memory to the socket. This is done specifically
1427 * to prevent sockets using all the memory up.
1428 */
1429
40d4e3df 1430 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1431 return 0;
1432
1433out:
1434 kfree_skb(skb);
1435oom:
1436 return 0;
1437}
1438
1439
1440/*
1441 * Output a raw packet to a device layer. This bypasses all the other
1442 * protocol layers and you must therefore supply it with a complete frame
1443 */
1ce4f28b 1444
1da177e4
LT
1445static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1446 struct msghdr *msg, size_t len)
1447{
1448 struct sock *sk = sock->sk;
40d4e3df 1449 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1450 struct sk_buff *skb = NULL;
1da177e4 1451 struct net_device *dev;
40d4e3df 1452 __be16 proto = 0;
1da177e4 1453 int err;
3bdc0eba 1454 int extra_len = 0;
1ce4f28b 1455
1da177e4 1456 /*
1ce4f28b 1457 * Get and verify the address.
1da177e4
LT
1458 */
1459
40d4e3df 1460 if (saddr) {
1da177e4 1461 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1462 return -EINVAL;
1463 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1464 proto = saddr->spkt_protocol;
1465 } else
1466 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1467
1468 /*
1ce4f28b 1469 * Find the device first to size check it
1da177e4
LT
1470 */
1471
de74e92a 1472 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1473retry:
654d1f8a
ED
1474 rcu_read_lock();
1475 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1476 err = -ENODEV;
1477 if (dev == NULL)
1478 goto out_unlock;
1ce4f28b 1479
d5e76b0a
DM
1480 err = -ENETDOWN;
1481 if (!(dev->flags & IFF_UP))
1482 goto out_unlock;
1483
1da177e4 1484 /*
40d4e3df
ED
1485 * You may not queue a frame bigger than the mtu. This is the lowest level
1486 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1487 */
1ce4f28b 1488
3bdc0eba
BG
1489 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1490 if (!netif_supports_nofcs(dev)) {
1491 err = -EPROTONOSUPPORT;
1492 goto out_unlock;
1493 }
1494 extra_len = 4; /* We're doing our own CRC */
1495 }
1496
1da177e4 1497 err = -EMSGSIZE;
3bdc0eba 1498 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1499 goto out_unlock;
1500
1a35ca80
ED
1501 if (!skb) {
1502 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1503 int tlen = dev->needed_tailroom;
1a35ca80
ED
1504 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1505
1506 rcu_read_unlock();
4ce40912 1507 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1508 if (skb == NULL)
1509 return -ENOBUFS;
1510 /* FIXME: Save some space for broken drivers that write a hard
1511 * header at transmission time by themselves. PPP is the notable
1512 * one here. This should really be fixed at the driver level.
1513 */
1514 skb_reserve(skb, reserved);
1515 skb_reset_network_header(skb);
1516
1517 /* Try to align data part correctly */
1518 if (hhlen) {
1519 skb->data -= hhlen;
1520 skb->tail -= hhlen;
1521 if (len < hhlen)
1522 skb_reset_network_header(skb);
1523 }
1524 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1525 if (err)
1526 goto out_free;
1527 goto retry;
1da177e4
LT
1528 }
1529
3bdc0eba 1530 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1531 /* Earlier code assumed this would be a VLAN pkt,
1532 * double-check this now that we have the actual
1533 * packet in hand.
1534 */
1535 struct ethhdr *ehdr;
1536 skb_reset_mac_header(skb);
1537 ehdr = eth_hdr(skb);
1538 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1539 err = -EMSGSIZE;
1540 goto out_unlock;
1541 }
1542 }
1a35ca80 1543
1da177e4
LT
1544 skb->protocol = proto;
1545 skb->dev = dev;
1546 skb->priority = sk->sk_priority;
2d37a186 1547 skb->mark = sk->sk_mark;
bf84a010
DB
1548
1549 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1550
3bdc0eba
BG
1551 if (unlikely(extra_len == 4))
1552 skb->no_fcs = 1;
1553
40893fd0 1554 skb_probe_transport_header(skb, 0);
c1aad275 1555
1da177e4 1556 dev_queue_xmit(skb);
654d1f8a 1557 rcu_read_unlock();
40d4e3df 1558 return len;
1da177e4 1559
1da177e4 1560out_unlock:
654d1f8a 1561 rcu_read_unlock();
1a35ca80
ED
1562out_free:
1563 kfree_skb(skb);
1da177e4
LT
1564 return err;
1565}
1da177e4 1566
eea49cc9 1567static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1568 const struct sock *sk,
dbcb5855 1569 unsigned int res)
1da177e4
LT
1570{
1571 struct sk_filter *filter;
fda9ef5d 1572
80f8f102
ED
1573 rcu_read_lock();
1574 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1575 if (filter != NULL)
0a14842f 1576 res = SK_RUN_FILTER(filter, skb);
80f8f102 1577 rcu_read_unlock();
1da177e4 1578
dbcb5855 1579 return res;
1da177e4
LT
1580}
1581
1582/*
62ab0812
ED
1583 * This function makes lazy skb cloning in hope that most of packets
1584 * are discarded by BPF.
1585 *
1586 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1587 * and skb->cb are mangled. It works because (and until) packets
1588 * falling here are owned by current CPU. Output packets are cloned
1589 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1590 * sequencially, so that if we return skb to original state on exit,
1591 * we will not harm anyone.
1da177e4
LT
1592 */
1593
40d4e3df
ED
1594static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1595 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1596{
1597 struct sock *sk;
1598 struct sockaddr_ll *sll;
1599 struct packet_sock *po;
40d4e3df 1600 u8 *skb_head = skb->data;
1da177e4 1601 int skb_len = skb->len;
dbcb5855 1602 unsigned int snaplen, res;
1da177e4
LT
1603
1604 if (skb->pkt_type == PACKET_LOOPBACK)
1605 goto drop;
1606
1607 sk = pt->af_packet_priv;
1608 po = pkt_sk(sk);
1609
09ad9bc7 1610 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1611 goto drop;
1612
1da177e4
LT
1613 skb->dev = dev;
1614
3b04ddde 1615 if (dev->header_ops) {
1da177e4 1616 /* The device has an explicit notion of ll header,
62ab0812
ED
1617 * exported to higher levels.
1618 *
1619 * Otherwise, the device hides details of its frame
1620 * structure, so that corresponding packet head is
1621 * never delivered to user.
1da177e4
LT
1622 */
1623 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1624 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1625 else if (skb->pkt_type == PACKET_OUTGOING) {
1626 /* Special case: outgoing packets have ll header at head */
bbe735e4 1627 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1628 }
1629 }
1630
1631 snaplen = skb->len;
1632
dbcb5855
DM
1633 res = run_filter(skb, sk, snaplen);
1634 if (!res)
fda9ef5d 1635 goto drop_n_restore;
dbcb5855
DM
1636 if (snaplen > res)
1637 snaplen = res;
1da177e4 1638
0fd7bac6 1639 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1640 goto drop_n_acct;
1641
1642 if (skb_shared(skb)) {
1643 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1644 if (nskb == NULL)
1645 goto drop_n_acct;
1646
1647 if (skb_head != skb->data) {
1648 skb->data = skb_head;
1649 skb->len = skb_len;
1650 }
abc4e4fa 1651 consume_skb(skb);
1da177e4
LT
1652 skb = nskb;
1653 }
1654
ffbc6111
HX
1655 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1656 sizeof(skb->cb));
1657
1658 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1659 sll->sll_family = AF_PACKET;
1660 sll->sll_hatype = dev->type;
1661 sll->sll_protocol = skb->protocol;
1662 sll->sll_pkttype = skb->pkt_type;
8032b464 1663 if (unlikely(po->origdev))
80feaacb
PWJ
1664 sll->sll_ifindex = orig_dev->ifindex;
1665 else
1666 sll->sll_ifindex = dev->ifindex;
1da177e4 1667
b95cce35 1668 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1669
ffbc6111 1670 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1671
1da177e4
LT
1672 if (pskb_trim(skb, snaplen))
1673 goto drop_n_acct;
1674
1675 skb_set_owner_r(skb, sk);
1676 skb->dev = NULL;
adf30907 1677 skb_dst_drop(skb);
1da177e4 1678
84531c24
PO
1679 /* drop conntrack reference */
1680 nf_reset(skb);
1681
1da177e4 1682 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1683 po->stats.stats1.tp_packets++;
3b885787 1684 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1685 __skb_queue_tail(&sk->sk_receive_queue, skb);
1686 spin_unlock(&sk->sk_receive_queue.lock);
1687 sk->sk_data_ready(sk, skb->len);
1688 return 0;
1689
1690drop_n_acct:
7091fbd8 1691 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1692 po->stats.stats1.tp_drops++;
7091fbd8
WB
1693 atomic_inc(&sk->sk_drops);
1694 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1695
1696drop_n_restore:
1697 if (skb_head != skb->data && skb_shared(skb)) {
1698 skb->data = skb_head;
1699 skb->len = skb_len;
1700 }
1701drop:
ead2ceb0 1702 consume_skb(skb);
1da177e4
LT
1703 return 0;
1704}
1705
40d4e3df
ED
1706static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1707 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1708{
1709 struct sock *sk;
1710 struct packet_sock *po;
1711 struct sockaddr_ll *sll;
184f489e 1712 union tpacket_uhdr h;
40d4e3df 1713 u8 *skb_head = skb->data;
1da177e4 1714 int skb_len = skb->len;
dbcb5855 1715 unsigned int snaplen, res;
f6fb8f10 1716 unsigned long status = TP_STATUS_USER;
bbd6ef87 1717 unsigned short macoff, netoff, hdrlen;
1da177e4 1718 struct sk_buff *copy_skb = NULL;
bbd6ef87 1719 struct timespec ts;
b9c32fb2 1720 __u32 ts_status;
1da177e4
LT
1721
1722 if (skb->pkt_type == PACKET_LOOPBACK)
1723 goto drop;
1724
1725 sk = pt->af_packet_priv;
1726 po = pkt_sk(sk);
1727
09ad9bc7 1728 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1729 goto drop;
1730
3b04ddde 1731 if (dev->header_ops) {
1da177e4 1732 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1733 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1734 else if (skb->pkt_type == PACKET_OUTGOING) {
1735 /* Special case: outgoing packets have ll header at head */
bbe735e4 1736 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1737 }
1738 }
1739
8dc41944
HX
1740 if (skb->ip_summed == CHECKSUM_PARTIAL)
1741 status |= TP_STATUS_CSUMNOTREADY;
1742
1da177e4
LT
1743 snaplen = skb->len;
1744
dbcb5855
DM
1745 res = run_filter(skb, sk, snaplen);
1746 if (!res)
fda9ef5d 1747 goto drop_n_restore;
dbcb5855
DM
1748 if (snaplen > res)
1749 snaplen = res;
1da177e4
LT
1750
1751 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1752 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1753 po->tp_reserve;
1da177e4 1754 } else {
95c96174 1755 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1756 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1757 (maclen < 16 ? 16 : maclen)) +
1758 po->tp_reserve;
1da177e4
LT
1759 macoff = netoff - maclen;
1760 }
f6fb8f10 1761 if (po->tp_version <= TPACKET_V2) {
1762 if (macoff + snaplen > po->rx_ring.frame_size) {
1763 if (po->copy_thresh &&
0fd7bac6 1764 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1765 if (skb_shared(skb)) {
1766 copy_skb = skb_clone(skb, GFP_ATOMIC);
1767 } else {
1768 copy_skb = skb_get(skb);
1769 skb_head = skb->data;
1770 }
1771 if (copy_skb)
1772 skb_set_owner_r(copy_skb, sk);
1da177e4 1773 }
f6fb8f10 1774 snaplen = po->rx_ring.frame_size - macoff;
1775 if ((int)snaplen < 0)
1776 snaplen = 0;
1da177e4 1777 }
1da177e4 1778 }
1da177e4 1779 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1780 h.raw = packet_current_rx_frame(po, skb,
1781 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1782 if (!h.raw)
1da177e4 1783 goto ring_is_full;
f6fb8f10 1784 if (po->tp_version <= TPACKET_V2) {
1785 packet_increment_rx_head(po, &po->rx_ring);
1786 /*
1787 * LOSING will be reported till you read the stats,
1788 * because it's COR - Clear On Read.
1789 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1790 * at packet level.
1791 */
ee80fbf3 1792 if (po->stats.stats1.tp_drops)
f6fb8f10 1793 status |= TP_STATUS_LOSING;
1794 }
ee80fbf3 1795 po->stats.stats1.tp_packets++;
1da177e4
LT
1796 if (copy_skb) {
1797 status |= TP_STATUS_COPY;
1798 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1799 }
1da177e4
LT
1800 spin_unlock(&sk->sk_receive_queue.lock);
1801
bbd6ef87 1802 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1803
1804 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1805 getnstimeofday(&ts);
1da177e4 1806
b9c32fb2
DB
1807 status |= ts_status;
1808
bbd6ef87
PM
1809 switch (po->tp_version) {
1810 case TPACKET_V1:
1811 h.h1->tp_len = skb->len;
1812 h.h1->tp_snaplen = snaplen;
1813 h.h1->tp_mac = macoff;
1814 h.h1->tp_net = netoff;
4b457bdf
DB
1815 h.h1->tp_sec = ts.tv_sec;
1816 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1817 hdrlen = sizeof(*h.h1);
1818 break;
1819 case TPACKET_V2:
1820 h.h2->tp_len = skb->len;
1821 h.h2->tp_snaplen = snaplen;
1822 h.h2->tp_mac = macoff;
1823 h.h2->tp_net = netoff;
bbd6ef87
PM
1824 h.h2->tp_sec = ts.tv_sec;
1825 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1826 if (vlan_tx_tag_present(skb)) {
1827 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1828 status |= TP_STATUS_VLAN_VALID;
1829 } else {
1830 h.h2->tp_vlan_tci = 0;
1831 }
13fcb7bd 1832 h.h2->tp_padding = 0;
bbd6ef87
PM
1833 hdrlen = sizeof(*h.h2);
1834 break;
f6fb8f10 1835 case TPACKET_V3:
1836 /* tp_nxt_offset,vlan are already populated above.
1837 * So DONT clear those fields here
1838 */
1839 h.h3->tp_status |= status;
1840 h.h3->tp_len = skb->len;
1841 h.h3->tp_snaplen = snaplen;
1842 h.h3->tp_mac = macoff;
1843 h.h3->tp_net = netoff;
f6fb8f10 1844 h.h3->tp_sec = ts.tv_sec;
1845 h.h3->tp_nsec = ts.tv_nsec;
1846 hdrlen = sizeof(*h.h3);
1847 break;
bbd6ef87
PM
1848 default:
1849 BUG();
1850 }
1da177e4 1851
bbd6ef87 1852 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1853 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1854 sll->sll_family = AF_PACKET;
1855 sll->sll_hatype = dev->type;
1856 sll->sll_protocol = skb->protocol;
1857 sll->sll_pkttype = skb->pkt_type;
8032b464 1858 if (unlikely(po->origdev))
80feaacb
PWJ
1859 sll->sll_ifindex = orig_dev->ifindex;
1860 else
1861 sll->sll_ifindex = dev->ifindex;
1da177e4 1862
e16aa207 1863 smp_mb();
f6dafa95 1864#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1865 {
0af55bb5
CG
1866 u8 *start, *end;
1867
f6fb8f10 1868 if (po->tp_version <= TPACKET_V2) {
1869 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1870 + macoff + snaplen);
1871 for (start = h.raw; start < end; start += PAGE_SIZE)
1872 flush_dcache_page(pgv_to_page(start));
1873 }
cc9f01b2 1874 smp_wmb();
1da177e4 1875 }
f6dafa95 1876#endif
f6fb8f10 1877 if (po->tp_version <= TPACKET_V2)
1878 __packet_set_status(po, h.raw, status);
1879 else
1880 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1881
1882 sk->sk_data_ready(sk, 0);
1883
1884drop_n_restore:
1885 if (skb_head != skb->data && skb_shared(skb)) {
1886 skb->data = skb_head;
1887 skb->len = skb_len;
1888 }
1889drop:
1ce4f28b 1890 kfree_skb(skb);
1da177e4
LT
1891 return 0;
1892
1893ring_is_full:
ee80fbf3 1894 po->stats.stats1.tp_drops++;
1da177e4
LT
1895 spin_unlock(&sk->sk_receive_queue.lock);
1896
1897 sk->sk_data_ready(sk, 0);
acb5d75b 1898 kfree_skb(copy_skb);
1da177e4
LT
1899 goto drop_n_restore;
1900}
1901
69e3c75f
JB
1902static void tpacket_destruct_skb(struct sk_buff *skb)
1903{
1904 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1905 void *ph;
1da177e4 1906
69e3c75f 1907 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
1908 __u32 ts;
1909
69e3c75f 1910 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1911 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1912 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
1913
1914 ts = __packet_set_timestamp(po, ph, skb);
1915 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
1916 }
1917
1918 sock_wfree(skb);
1919}
1920
40d4e3df
ED
1921static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1922 void *frame, struct net_device *dev, int size_max,
ae641949 1923 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 1924{
184f489e 1925 union tpacket_uhdr ph;
69e3c75f
JB
1926 int to_write, offset, len, tp_len, nr_frags, len_max;
1927 struct socket *sock = po->sk.sk_socket;
1928 struct page *page;
1929 void *data;
1930 int err;
1931
1932 ph.raw = frame;
1933
1934 skb->protocol = proto;
1935 skb->dev = dev;
1936 skb->priority = po->sk.sk_priority;
2d37a186 1937 skb->mark = po->sk.sk_mark;
2e31396f 1938 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
1939 skb_shinfo(skb)->destructor_arg = ph.raw;
1940
1941 switch (po->tp_version) {
1942 case TPACKET_V2:
1943 tp_len = ph.h2->tp_len;
1944 break;
1945 default:
1946 tp_len = ph.h1->tp_len;
1947 break;
1948 }
1949 if (unlikely(tp_len > size_max)) {
40d4e3df 1950 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1951 return -EMSGSIZE;
1952 }
1953
ae641949 1954 skb_reserve(skb, hlen);
69e3c75f 1955 skb_reset_network_header(skb);
40893fd0 1956 skb_probe_transport_header(skb, 0);
c1aad275 1957
5920cd3a
PC
1958 if (po->tp_tx_has_off) {
1959 int off_min, off_max, off;
1960 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1961 off_max = po->tx_ring.frame_size - tp_len;
1962 if (sock->type == SOCK_DGRAM) {
1963 switch (po->tp_version) {
1964 case TPACKET_V2:
1965 off = ph.h2->tp_net;
1966 break;
1967 default:
1968 off = ph.h1->tp_net;
1969 break;
1970 }
1971 } else {
1972 switch (po->tp_version) {
1973 case TPACKET_V2:
1974 off = ph.h2->tp_mac;
1975 break;
1976 default:
1977 off = ph.h1->tp_mac;
1978 break;
1979 }
1980 }
1981 if (unlikely((off < off_min) || (off_max < off)))
1982 return -EINVAL;
1983 data = ph.raw + off;
1984 } else {
1985 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1986 }
69e3c75f
JB
1987 to_write = tp_len;
1988
1989 if (sock->type == SOCK_DGRAM) {
1990 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1991 NULL, tp_len);
1992 if (unlikely(err < 0))
1993 return -EINVAL;
40d4e3df 1994 } else if (dev->hard_header_len) {
69e3c75f
JB
1995 /* net device doesn't like empty head */
1996 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
1997 pr_err("packet size is too short (%d < %d)\n",
1998 tp_len, dev->hard_header_len);
69e3c75f
JB
1999 return -EINVAL;
2000 }
2001
2002 skb_push(skb, dev->hard_header_len);
2003 err = skb_store_bits(skb, 0, data,
2004 dev->hard_header_len);
2005 if (unlikely(err))
2006 return err;
2007
2008 data += dev->hard_header_len;
2009 to_write -= dev->hard_header_len;
2010 }
2011
69e3c75f
JB
2012 offset = offset_in_page(data);
2013 len_max = PAGE_SIZE - offset;
2014 len = ((to_write > len_max) ? len_max : to_write);
2015
2016 skb->data_len = to_write;
2017 skb->len += to_write;
2018 skb->truesize += to_write;
2019 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2020
2021 while (likely(to_write)) {
2022 nr_frags = skb_shinfo(skb)->nr_frags;
2023
2024 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2025 pr_err("Packet exceed the number of skb frags(%lu)\n",
2026 MAX_SKB_FRAGS);
69e3c75f
JB
2027 return -EFAULT;
2028 }
2029
0af55bb5
CG
2030 page = pgv_to_page(data);
2031 data += len;
69e3c75f
JB
2032 flush_dcache_page(page);
2033 get_page(page);
0af55bb5 2034 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2035 to_write -= len;
2036 offset = 0;
2037 len_max = PAGE_SIZE;
2038 len = ((to_write > len_max) ? len_max : to_write);
2039 }
2040
2041 return tp_len;
2042}
2043
2044static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2045{
69e3c75f
JB
2046 struct sk_buff *skb;
2047 struct net_device *dev;
2048 __be16 proto;
827d9780
BG
2049 bool need_rls_dev = false;
2050 int err, reserve = 0;
40d4e3df
ED
2051 void *ph;
2052 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2053 int tp_len, size_max;
2054 unsigned char *addr;
2055 int len_sum = 0;
9e67030a 2056 int status = TP_STATUS_AVAILABLE;
ae641949 2057 int hlen, tlen;
69e3c75f 2058
69e3c75f
JB
2059 mutex_lock(&po->pg_vec_lock);
2060
69e3c75f 2061 if (saddr == NULL) {
827d9780 2062 dev = po->prot_hook.dev;
69e3c75f
JB
2063 proto = po->num;
2064 addr = NULL;
2065 } else {
2066 err = -EINVAL;
2067 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2068 goto out;
2069 if (msg->msg_namelen < (saddr->sll_halen
2070 + offsetof(struct sockaddr_ll,
2071 sll_addr)))
2072 goto out;
69e3c75f
JB
2073 proto = saddr->sll_protocol;
2074 addr = saddr->sll_addr;
827d9780
BG
2075 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2076 need_rls_dev = true;
69e3c75f
JB
2077 }
2078
69e3c75f
JB
2079 err = -ENXIO;
2080 if (unlikely(dev == NULL))
2081 goto out;
2082
2083 reserve = dev->hard_header_len;
2084
2085 err = -ENETDOWN;
2086 if (unlikely(!(dev->flags & IFF_UP)))
2087 goto out_put;
2088
2089 size_max = po->tx_ring.frame_size
b5dd884e 2090 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2091
2092 if (size_max > dev->mtu + reserve)
2093 size_max = dev->mtu + reserve;
2094
2095 do {
2096 ph = packet_current_frame(po, &po->tx_ring,
2097 TP_STATUS_SEND_REQUEST);
2098
2099 if (unlikely(ph == NULL)) {
2100 schedule();
2101 continue;
2102 }
2103
2104 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2105 hlen = LL_RESERVED_SPACE(dev);
2106 tlen = dev->needed_tailroom;
69e3c75f 2107 skb = sock_alloc_send_skb(&po->sk,
ae641949 2108 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2109 0, &err);
2110
2111 if (unlikely(skb == NULL))
2112 goto out_status;
2113
2114 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2115 addr, hlen);
69e3c75f
JB
2116
2117 if (unlikely(tp_len < 0)) {
2118 if (po->tp_loss) {
2119 __packet_set_status(po, ph,
2120 TP_STATUS_AVAILABLE);
2121 packet_increment_head(&po->tx_ring);
2122 kfree_skb(skb);
2123 continue;
2124 } else {
2125 status = TP_STATUS_WRONG_FORMAT;
2126 err = tp_len;
2127 goto out_status;
2128 }
2129 }
2130
2131 skb->destructor = tpacket_destruct_skb;
2132 __packet_set_status(po, ph, TP_STATUS_SENDING);
2133 atomic_inc(&po->tx_ring.pending);
2134
2135 status = TP_STATUS_SEND_REQUEST;
2136 err = dev_queue_xmit(skb);
eb70df13
JP
2137 if (unlikely(err > 0)) {
2138 err = net_xmit_errno(err);
2139 if (err && __packet_get_status(po, ph) ==
2140 TP_STATUS_AVAILABLE) {
2141 /* skb was destructed already */
2142 skb = NULL;
2143 goto out_status;
2144 }
2145 /*
2146 * skb was dropped but not destructed yet;
2147 * let's treat it like congestion or err < 0
2148 */
2149 err = 0;
2150 }
69e3c75f
JB
2151 packet_increment_head(&po->tx_ring);
2152 len_sum += tp_len;
f64f9e71
JP
2153 } while (likely((ph != NULL) ||
2154 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2155 (atomic_read(&po->tx_ring.pending))))
2156 );
69e3c75f
JB
2157
2158 err = len_sum;
2159 goto out_put;
2160
69e3c75f
JB
2161out_status:
2162 __packet_set_status(po, ph, status);
2163 kfree_skb(skb);
2164out_put:
827d9780
BG
2165 if (need_rls_dev)
2166 dev_put(dev);
69e3c75f
JB
2167out:
2168 mutex_unlock(&po->pg_vec_lock);
2169 return err;
2170}
69e3c75f 2171
eea49cc9
OJ
2172static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2173 size_t reserve, size_t len,
2174 size_t linear, int noblock,
2175 int *err)
bfd5f4a3
SS
2176{
2177 struct sk_buff *skb;
2178
2179 /* Under a page? Don't bother with paged skb. */
2180 if (prepad + len < PAGE_SIZE || !linear)
2181 linear = len;
2182
2183 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2184 err);
2185 if (!skb)
2186 return NULL;
2187
2188 skb_reserve(skb, reserve);
2189 skb_put(skb, linear);
2190 skb->data_len = len - linear;
2191 skb->len += len - linear;
2192
2193 return skb;
2194}
2195
69e3c75f 2196static int packet_snd(struct socket *sock,
1da177e4
LT
2197 struct msghdr *msg, size_t len)
2198{
2199 struct sock *sk = sock->sk;
40d4e3df 2200 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2201 struct sk_buff *skb;
2202 struct net_device *dev;
0e11c91e 2203 __be16 proto;
827d9780 2204 bool need_rls_dev = false;
1da177e4 2205 unsigned char *addr;
827d9780 2206 int err, reserve = 0;
bfd5f4a3
SS
2207 struct virtio_net_hdr vnet_hdr = { 0 };
2208 int offset = 0;
2209 int vnet_hdr_len;
2210 struct packet_sock *po = pkt_sk(sk);
2211 unsigned short gso_type = 0;
ae641949 2212 int hlen, tlen;
3bdc0eba 2213 int extra_len = 0;
1da177e4
LT
2214
2215 /*
1ce4f28b 2216 * Get and verify the address.
1da177e4 2217 */
1ce4f28b 2218
1da177e4 2219 if (saddr == NULL) {
827d9780 2220 dev = po->prot_hook.dev;
1da177e4
LT
2221 proto = po->num;
2222 addr = NULL;
2223 } else {
2224 err = -EINVAL;
2225 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2226 goto out;
0fb375fb
EB
2227 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2228 goto out;
1da177e4
LT
2229 proto = saddr->sll_protocol;
2230 addr = saddr->sll_addr;
827d9780
BG
2231 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2232 need_rls_dev = true;
1da177e4
LT
2233 }
2234
1da177e4
LT
2235 err = -ENXIO;
2236 if (dev == NULL)
2237 goto out_unlock;
2238 if (sock->type == SOCK_RAW)
2239 reserve = dev->hard_header_len;
2240
d5e76b0a
DM
2241 err = -ENETDOWN;
2242 if (!(dev->flags & IFF_UP))
2243 goto out_unlock;
2244
bfd5f4a3
SS
2245 if (po->has_vnet_hdr) {
2246 vnet_hdr_len = sizeof(vnet_hdr);
2247
2248 err = -EINVAL;
2249 if (len < vnet_hdr_len)
2250 goto out_unlock;
2251
2252 len -= vnet_hdr_len;
2253
2254 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2255 vnet_hdr_len);
2256 if (err < 0)
2257 goto out_unlock;
2258
2259 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2260 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2261 vnet_hdr.hdr_len))
2262 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2263 vnet_hdr.csum_offset + 2;
2264
2265 err = -EINVAL;
2266 if (vnet_hdr.hdr_len > len)
2267 goto out_unlock;
2268
2269 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2270 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2271 case VIRTIO_NET_HDR_GSO_TCPV4:
2272 gso_type = SKB_GSO_TCPV4;
2273 break;
2274 case VIRTIO_NET_HDR_GSO_TCPV6:
2275 gso_type = SKB_GSO_TCPV6;
2276 break;
2277 case VIRTIO_NET_HDR_GSO_UDP:
2278 gso_type = SKB_GSO_UDP;
2279 break;
2280 default:
2281 goto out_unlock;
2282 }
2283
2284 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2285 gso_type |= SKB_GSO_TCP_ECN;
2286
2287 if (vnet_hdr.gso_size == 0)
2288 goto out_unlock;
2289
2290 }
2291 }
2292
3bdc0eba
BG
2293 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2294 if (!netif_supports_nofcs(dev)) {
2295 err = -EPROTONOSUPPORT;
2296 goto out_unlock;
2297 }
2298 extra_len = 4; /* We're doing our own CRC */
2299 }
2300
1da177e4 2301 err = -EMSGSIZE;
3bdc0eba 2302 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2303 goto out_unlock;
2304
bfd5f4a3 2305 err = -ENOBUFS;
ae641949
HX
2306 hlen = LL_RESERVED_SPACE(dev);
2307 tlen = dev->needed_tailroom;
2308 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2309 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2310 if (skb == NULL)
1da177e4
LT
2311 goto out_unlock;
2312
bfd5f4a3 2313 skb_set_network_header(skb, reserve);
1da177e4 2314
0c4e8581
SH
2315 err = -EINVAL;
2316 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2317 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2318 goto out_free;
1da177e4
LT
2319
2320 /* Returns -EFAULT on error */
bfd5f4a3 2321 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2322 if (err)
2323 goto out_free;
bf84a010
DB
2324
2325 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2326
3bdc0eba 2327 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2328 /* Earlier code assumed this would be a VLAN pkt,
2329 * double-check this now that we have the actual
2330 * packet in hand.
2331 */
2332 struct ethhdr *ehdr;
2333 skb_reset_mac_header(skb);
2334 ehdr = eth_hdr(skb);
2335 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2336 err = -EMSGSIZE;
2337 goto out_free;
2338 }
2339 }
2340
1da177e4
LT
2341 skb->protocol = proto;
2342 skb->dev = dev;
2343 skb->priority = sk->sk_priority;
2d37a186 2344 skb->mark = sk->sk_mark;
1da177e4 2345
bfd5f4a3
SS
2346 if (po->has_vnet_hdr) {
2347 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2348 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2349 vnet_hdr.csum_offset)) {
2350 err = -EINVAL;
2351 goto out_free;
2352 }
2353 }
2354
2355 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2356 skb_shinfo(skb)->gso_type = gso_type;
2357
2358 /* Header must be checked, and gso_segs computed. */
2359 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2360 skb_shinfo(skb)->gso_segs = 0;
2361
2362 len += vnet_hdr_len;
2363 }
2364
40893fd0 2365 skb_probe_transport_header(skb, reserve);
c1aad275 2366
3bdc0eba
BG
2367 if (unlikely(extra_len == 4))
2368 skb->no_fcs = 1;
2369
1da177e4
LT
2370 /*
2371 * Now send it
2372 */
2373
2374 err = dev_queue_xmit(skb);
2375 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2376 goto out_unlock;
2377
827d9780
BG
2378 if (need_rls_dev)
2379 dev_put(dev);
1da177e4 2380
40d4e3df 2381 return len;
1da177e4
LT
2382
2383out_free:
2384 kfree_skb(skb);
2385out_unlock:
827d9780 2386 if (dev && need_rls_dev)
1da177e4
LT
2387 dev_put(dev);
2388out:
2389 return err;
2390}
2391
69e3c75f
JB
2392static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2393 struct msghdr *msg, size_t len)
2394{
69e3c75f
JB
2395 struct sock *sk = sock->sk;
2396 struct packet_sock *po = pkt_sk(sk);
2397 if (po->tx_ring.pg_vec)
2398 return tpacket_snd(po, msg);
2399 else
69e3c75f
JB
2400 return packet_snd(sock, msg, len);
2401}
2402
1da177e4
LT
2403/*
2404 * Close a PACKET socket. This is fairly simple. We immediately go
2405 * to 'closed' state and remove our protocol entry in the device list.
2406 */
2407
2408static int packet_release(struct socket *sock)
2409{
2410 struct sock *sk = sock->sk;
2411 struct packet_sock *po;
d12d01d6 2412 struct net *net;
f6fb8f10 2413 union tpacket_req_u req_u;
1da177e4
LT
2414
2415 if (!sk)
2416 return 0;
2417
3b1e0a65 2418 net = sock_net(sk);
1da177e4
LT
2419 po = pkt_sk(sk);
2420
0fa7fa98 2421 mutex_lock(&net->packet.sklist_lock);
808f5114 2422 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2423 mutex_unlock(&net->packet.sklist_lock);
2424
2425 preempt_disable();
920de804 2426 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2427 preempt_enable();
1da177e4 2428
808f5114 2429 spin_lock(&po->bind_lock);
ce06b03e 2430 unregister_prot_hook(sk, false);
160ff18a
BG
2431 if (po->prot_hook.dev) {
2432 dev_put(po->prot_hook.dev);
2433 po->prot_hook.dev = NULL;
2434 }
808f5114 2435 spin_unlock(&po->bind_lock);
1da177e4 2436
1da177e4 2437 packet_flush_mclist(sk);
1da177e4 2438
9665d5d6
PS
2439 if (po->rx_ring.pg_vec) {
2440 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2441 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2442 }
69e3c75f 2443
9665d5d6
PS
2444 if (po->tx_ring.pg_vec) {
2445 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2446 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2447 }
1da177e4 2448
dc99f600
DM
2449 fanout_release(sk);
2450
808f5114 2451 synchronize_net();
1da177e4
LT
2452 /*
2453 * Now the socket is dead. No more input will appear.
2454 */
1da177e4
LT
2455 sock_orphan(sk);
2456 sock->sk = NULL;
2457
2458 /* Purge queues */
2459
2460 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2461 sk_refcnt_debug_release(sk);
1da177e4
LT
2462
2463 sock_put(sk);
2464 return 0;
2465}
2466
2467/*
2468 * Attach a packet hook.
2469 */
2470
0e11c91e 2471static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2472{
2473 struct packet_sock *po = pkt_sk(sk);
dc99f600 2474
aef950b4
WY
2475 if (po->fanout) {
2476 if (dev)
2477 dev_put(dev);
2478
dc99f600 2479 return -EINVAL;
aef950b4 2480 }
1da177e4
LT
2481
2482 lock_sock(sk);
2483
2484 spin_lock(&po->bind_lock);
ce06b03e 2485 unregister_prot_hook(sk, true);
1da177e4
LT
2486 po->num = protocol;
2487 po->prot_hook.type = protocol;
160ff18a
BG
2488 if (po->prot_hook.dev)
2489 dev_put(po->prot_hook.dev);
1da177e4
LT
2490 po->prot_hook.dev = dev;
2491
2492 po->ifindex = dev ? dev->ifindex : 0;
2493
2494 if (protocol == 0)
2495 goto out_unlock;
2496
be85d4ad 2497 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2498 register_prot_hook(sk);
be85d4ad
UT
2499 } else {
2500 sk->sk_err = ENETDOWN;
2501 if (!sock_flag(sk, SOCK_DEAD))
2502 sk->sk_error_report(sk);
1da177e4
LT
2503 }
2504
2505out_unlock:
2506 spin_unlock(&po->bind_lock);
2507 release_sock(sk);
2508 return 0;
2509}
2510
2511/*
2512 * Bind a packet socket to a device
2513 */
2514
40d4e3df
ED
2515static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2516 int addr_len)
1da177e4 2517{
40d4e3df 2518 struct sock *sk = sock->sk;
1da177e4
LT
2519 char name[15];
2520 struct net_device *dev;
2521 int err = -ENODEV;
1ce4f28b 2522
1da177e4
LT
2523 /*
2524 * Check legality
2525 */
1ce4f28b 2526
8ae55f04 2527 if (addr_len != sizeof(struct sockaddr))
1da177e4 2528 return -EINVAL;
40d4e3df 2529 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2530
3b1e0a65 2531 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2532 if (dev)
1da177e4 2533 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2534 return err;
2535}
1da177e4
LT
2536
2537static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2538{
40d4e3df
ED
2539 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2540 struct sock *sk = sock->sk;
1da177e4
LT
2541 struct net_device *dev = NULL;
2542 int err;
2543
2544
2545 /*
2546 * Check legality
2547 */
1ce4f28b 2548
1da177e4
LT
2549 if (addr_len < sizeof(struct sockaddr_ll))
2550 return -EINVAL;
2551 if (sll->sll_family != AF_PACKET)
2552 return -EINVAL;
2553
2554 if (sll->sll_ifindex) {
2555 err = -ENODEV;
3b1e0a65 2556 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2557 if (dev == NULL)
2558 goto out;
2559 }
2560 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2561
2562out:
2563 return err;
2564}
2565
2566static struct proto packet_proto = {
2567 .name = "PACKET",
2568 .owner = THIS_MODULE,
2569 .obj_size = sizeof(struct packet_sock),
2570};
2571
2572/*
1ce4f28b 2573 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2574 */
2575
3f378b68
EP
2576static int packet_create(struct net *net, struct socket *sock, int protocol,
2577 int kern)
1da177e4
LT
2578{
2579 struct sock *sk;
2580 struct packet_sock *po;
0e11c91e 2581 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2582 int err;
2583
df008c91 2584 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2585 return -EPERM;
be02097c
DM
2586 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2587 sock->type != SOCK_PACKET)
1da177e4
LT
2588 return -ESOCKTNOSUPPORT;
2589
2590 sock->state = SS_UNCONNECTED;
2591
2592 err = -ENOBUFS;
6257ff21 2593 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2594 if (sk == NULL)
2595 goto out;
2596
2597 sock->ops = &packet_ops;
1da177e4
LT
2598 if (sock->type == SOCK_PACKET)
2599 sock->ops = &packet_ops_spkt;
be02097c 2600
1da177e4
LT
2601 sock_init_data(sock, sk);
2602
2603 po = pkt_sk(sk);
2604 sk->sk_family = PF_PACKET;
0e11c91e 2605 po->num = proto;
1da177e4
LT
2606
2607 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2608 sk_refcnt_debug_inc(sk);
1da177e4
LT
2609
2610 /*
2611 * Attach a protocol block
2612 */
2613
2614 spin_lock_init(&po->bind_lock);
905db440 2615 mutex_init(&po->pg_vec_lock);
1da177e4 2616 po->prot_hook.func = packet_rcv;
be02097c 2617
1da177e4
LT
2618 if (sock->type == SOCK_PACKET)
2619 po->prot_hook.func = packet_rcv_spkt;
be02097c 2620
1da177e4
LT
2621 po->prot_hook.af_packet_priv = sk;
2622
0e11c91e
AV
2623 if (proto) {
2624 po->prot_hook.type = proto;
ce06b03e 2625 register_prot_hook(sk);
1da177e4
LT
2626 }
2627
0fa7fa98 2628 mutex_lock(&net->packet.sklist_lock);
808f5114 2629 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2630 mutex_unlock(&net->packet.sklist_lock);
2631
2632 preempt_disable();
3680453c 2633 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2634 preempt_enable();
808f5114 2635
40d4e3df 2636 return 0;
1da177e4
LT
2637out:
2638 return err;
2639}
2640
ed85b565
RC
2641static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2642{
2643 struct sock_exterr_skb *serr;
2644 struct sk_buff *skb, *skb2;
2645 int copied, err;
2646
2647 err = -EAGAIN;
2648 skb = skb_dequeue(&sk->sk_error_queue);
2649 if (skb == NULL)
2650 goto out;
2651
2652 copied = skb->len;
2653 if (copied > len) {
2654 msg->msg_flags |= MSG_TRUNC;
2655 copied = len;
2656 }
2657 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2658 if (err)
2659 goto out_free_skb;
2660
2661 sock_recv_timestamp(msg, sk, skb);
2662
2663 serr = SKB_EXT_ERR(skb);
2664 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2665 sizeof(serr->ee), &serr->ee);
2666
2667 msg->msg_flags |= MSG_ERRQUEUE;
2668 err = copied;
2669
2670 /* Reset and regenerate socket error */
2671 spin_lock_bh(&sk->sk_error_queue.lock);
2672 sk->sk_err = 0;
2673 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2674 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2675 spin_unlock_bh(&sk->sk_error_queue.lock);
2676 sk->sk_error_report(sk);
2677 } else
2678 spin_unlock_bh(&sk->sk_error_queue.lock);
2679
2680out_free_skb:
2681 kfree_skb(skb);
2682out:
2683 return err;
2684}
2685
1da177e4
LT
2686/*
2687 * Pull a packet from our receive queue and hand it to the user.
2688 * If necessary we block.
2689 */
2690
2691static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2692 struct msghdr *msg, size_t len, int flags)
2693{
2694 struct sock *sk = sock->sk;
2695 struct sk_buff *skb;
2696 int copied, err;
0fb375fb 2697 struct sockaddr_ll *sll;
bfd5f4a3 2698 int vnet_hdr_len = 0;
1da177e4
LT
2699
2700 err = -EINVAL;
ed85b565 2701 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2702 goto out;
2703
2704#if 0
2705 /* What error should we return now? EUNATTACH? */
2706 if (pkt_sk(sk)->ifindex < 0)
2707 return -ENODEV;
2708#endif
2709
ed85b565
RC
2710 if (flags & MSG_ERRQUEUE) {
2711 err = packet_recv_error(sk, msg, len);
2712 goto out;
2713 }
2714
1da177e4
LT
2715 /*
2716 * Call the generic datagram receiver. This handles all sorts
2717 * of horrible races and re-entrancy so we can forget about it
2718 * in the protocol layers.
2719 *
2720 * Now it will return ENETDOWN, if device have just gone down,
2721 * but then it will block.
2722 */
2723
40d4e3df 2724 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2725
2726 /*
1ce4f28b 2727 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2728 * handles the blocking we don't see and worry about blocking
2729 * retries.
2730 */
2731
8ae55f04 2732 if (skb == NULL)
1da177e4
LT
2733 goto out;
2734
bfd5f4a3
SS
2735 if (pkt_sk(sk)->has_vnet_hdr) {
2736 struct virtio_net_hdr vnet_hdr = { 0 };
2737
2738 err = -EINVAL;
2739 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2740 if (len < vnet_hdr_len)
bfd5f4a3
SS
2741 goto out_free;
2742
1f18b717
MK
2743 len -= vnet_hdr_len;
2744
bfd5f4a3
SS
2745 if (skb_is_gso(skb)) {
2746 struct skb_shared_info *sinfo = skb_shinfo(skb);
2747
2748 /* This is a hint as to how much should be linear. */
2749 vnet_hdr.hdr_len = skb_headlen(skb);
2750 vnet_hdr.gso_size = sinfo->gso_size;
2751 if (sinfo->gso_type & SKB_GSO_TCPV4)
2752 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2753 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2754 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2755 else if (sinfo->gso_type & SKB_GSO_UDP)
2756 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2757 else if (sinfo->gso_type & SKB_GSO_FCOE)
2758 goto out_free;
2759 else
2760 BUG();
2761 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2762 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2763 } else
2764 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2765
2766 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2767 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2768 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2769 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2770 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2771 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2772 } /* else everything is zero */
2773
2774 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2775 vnet_hdr_len);
2776 if (err < 0)
2777 goto out_free;
2778 }
2779
0fb375fb
EB
2780 /*
2781 * If the address length field is there to be filled in, we fill
2782 * it in now.
2783 */
2784
ffbc6111 2785 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2786 if (sock->type == SOCK_PACKET)
2787 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2788 else
2789 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2790
1da177e4
LT
2791 /*
2792 * You lose any data beyond the buffer you gave. If it worries a
2793 * user program they can ask the device for its MTU anyway.
2794 */
2795
2796 copied = skb->len;
40d4e3df
ED
2797 if (copied > len) {
2798 copied = len;
2799 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2800 }
2801
2802 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2803 if (err)
2804 goto out_free;
2805
3b885787 2806 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2807
2808 if (msg->msg_name)
ffbc6111
HX
2809 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2810 msg->msg_namelen);
1da177e4 2811
8dc41944 2812 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2813 struct tpacket_auxdata aux;
2814
2815 aux.tp_status = TP_STATUS_USER;
2816 if (skb->ip_summed == CHECKSUM_PARTIAL)
2817 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2818 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2819 aux.tp_snaplen = skb->len;
2820 aux.tp_mac = 0;
bbe735e4 2821 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2822 if (vlan_tx_tag_present(skb)) {
2823 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2824 aux.tp_status |= TP_STATUS_VLAN_VALID;
2825 } else {
2826 aux.tp_vlan_tci = 0;
2827 }
13fcb7bd 2828 aux.tp_padding = 0;
ffbc6111 2829 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2830 }
2831
1da177e4
LT
2832 /*
2833 * Free or return the buffer as appropriate. Again this
2834 * hides all the races and re-entrancy issues from us.
2835 */
bfd5f4a3 2836 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2837
2838out_free:
2839 skb_free_datagram(sk, skb);
2840out:
2841 return err;
2842}
2843
1da177e4
LT
2844static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2845 int *uaddr_len, int peer)
2846{
2847 struct net_device *dev;
2848 struct sock *sk = sock->sk;
2849
2850 if (peer)
2851 return -EOPNOTSUPP;
2852
2853 uaddr->sa_family = AF_PACKET;
2dc85bf3 2854 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2855 rcu_read_lock();
2856 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2857 if (dev)
2dc85bf3 2858 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2859 rcu_read_unlock();
1da177e4
LT
2860 *uaddr_len = sizeof(*uaddr);
2861
2862 return 0;
2863}
1da177e4
LT
2864
2865static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2866 int *uaddr_len, int peer)
2867{
2868 struct net_device *dev;
2869 struct sock *sk = sock->sk;
2870 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2871 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2872
2873 if (peer)
2874 return -EOPNOTSUPP;
2875
2876 sll->sll_family = AF_PACKET;
2877 sll->sll_ifindex = po->ifindex;
2878 sll->sll_protocol = po->num;
67286640 2879 sll->sll_pkttype = 0;
654d1f8a
ED
2880 rcu_read_lock();
2881 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2882 if (dev) {
2883 sll->sll_hatype = dev->type;
2884 sll->sll_halen = dev->addr_len;
2885 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2886 } else {
2887 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2888 sll->sll_halen = 0;
2889 }
654d1f8a 2890 rcu_read_unlock();
0fb375fb 2891 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2892
2893 return 0;
2894}
2895
2aeb0b88
WC
2896static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2897 int what)
1da177e4
LT
2898{
2899 switch (i->type) {
2900 case PACKET_MR_MULTICAST:
1162563f
JP
2901 if (i->alen != dev->addr_len)
2902 return -EINVAL;
1da177e4 2903 if (what > 0)
22bedad3 2904 return dev_mc_add(dev, i->addr);
1da177e4 2905 else
22bedad3 2906 return dev_mc_del(dev, i->addr);
1da177e4
LT
2907 break;
2908 case PACKET_MR_PROMISC:
2aeb0b88 2909 return dev_set_promiscuity(dev, what);
1da177e4
LT
2910 break;
2911 case PACKET_MR_ALLMULTI:
2aeb0b88 2912 return dev_set_allmulti(dev, what);
1da177e4 2913 break;
d95ed927 2914 case PACKET_MR_UNICAST:
1162563f
JP
2915 if (i->alen != dev->addr_len)
2916 return -EINVAL;
d95ed927 2917 if (what > 0)
a748ee24 2918 return dev_uc_add(dev, i->addr);
d95ed927 2919 else
a748ee24 2920 return dev_uc_del(dev, i->addr);
d95ed927 2921 break;
40d4e3df
ED
2922 default:
2923 break;
1da177e4 2924 }
2aeb0b88 2925 return 0;
1da177e4
LT
2926}
2927
2928static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2929{
40d4e3df 2930 for ( ; i; i = i->next) {
1da177e4
LT
2931 if (i->ifindex == dev->ifindex)
2932 packet_dev_mc(dev, i, what);
2933 }
2934}
2935
0fb375fb 2936static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2937{
2938 struct packet_sock *po = pkt_sk(sk);
2939 struct packet_mclist *ml, *i;
2940 struct net_device *dev;
2941 int err;
2942
2943 rtnl_lock();
2944
2945 err = -ENODEV;
3b1e0a65 2946 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2947 if (!dev)
2948 goto done;
2949
2950 err = -EINVAL;
1162563f 2951 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2952 goto done;
2953
2954 err = -ENOBUFS;
8b3a7005 2955 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2956 if (i == NULL)
2957 goto done;
2958
2959 err = 0;
2960 for (ml = po->mclist; ml; ml = ml->next) {
2961 if (ml->ifindex == mreq->mr_ifindex &&
2962 ml->type == mreq->mr_type &&
2963 ml->alen == mreq->mr_alen &&
2964 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2965 ml->count++;
2966 /* Free the new element ... */
2967 kfree(i);
2968 goto done;
2969 }
2970 }
2971
2972 i->type = mreq->mr_type;
2973 i->ifindex = mreq->mr_ifindex;
2974 i->alen = mreq->mr_alen;
2975 memcpy(i->addr, mreq->mr_address, i->alen);
2976 i->count = 1;
2977 i->next = po->mclist;
2978 po->mclist = i;
2aeb0b88
WC
2979 err = packet_dev_mc(dev, i, 1);
2980 if (err) {
2981 po->mclist = i->next;
2982 kfree(i);
2983 }
1da177e4
LT
2984
2985done:
2986 rtnl_unlock();
2987 return err;
2988}
2989
0fb375fb 2990static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2991{
2992 struct packet_mclist *ml, **mlp;
2993
2994 rtnl_lock();
2995
2996 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2997 if (ml->ifindex == mreq->mr_ifindex &&
2998 ml->type == mreq->mr_type &&
2999 ml->alen == mreq->mr_alen &&
3000 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3001 if (--ml->count == 0) {
3002 struct net_device *dev;
3003 *mlp = ml->next;
ad959e76
ED
3004 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3005 if (dev)
1da177e4 3006 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3007 kfree(ml);
3008 }
3009 rtnl_unlock();
3010 return 0;
3011 }
3012 }
3013 rtnl_unlock();
3014 return -EADDRNOTAVAIL;
3015}
3016
3017static void packet_flush_mclist(struct sock *sk)
3018{
3019 struct packet_sock *po = pkt_sk(sk);
3020 struct packet_mclist *ml;
3021
3022 if (!po->mclist)
3023 return;
3024
3025 rtnl_lock();
3026 while ((ml = po->mclist) != NULL) {
3027 struct net_device *dev;
3028
3029 po->mclist = ml->next;
ad959e76
ED
3030 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3031 if (dev != NULL)
1da177e4 3032 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3033 kfree(ml);
3034 }
3035 rtnl_unlock();
3036}
1da177e4
LT
3037
3038static int
b7058842 3039packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3040{
3041 struct sock *sk = sock->sk;
8dc41944 3042 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3043 int ret;
3044
3045 if (level != SOL_PACKET)
3046 return -ENOPROTOOPT;
3047
69e3c75f 3048 switch (optname) {
1ce4f28b 3049 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3050 case PACKET_DROP_MEMBERSHIP:
3051 {
0fb375fb
EB
3052 struct packet_mreq_max mreq;
3053 int len = optlen;
3054 memset(&mreq, 0, sizeof(mreq));
3055 if (len < sizeof(struct packet_mreq))
1da177e4 3056 return -EINVAL;
0fb375fb
EB
3057 if (len > sizeof(mreq))
3058 len = sizeof(mreq);
40d4e3df 3059 if (copy_from_user(&mreq, optval, len))
1da177e4 3060 return -EFAULT;
0fb375fb
EB
3061 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3062 return -EINVAL;
1da177e4
LT
3063 if (optname == PACKET_ADD_MEMBERSHIP)
3064 ret = packet_mc_add(sk, &mreq);
3065 else
3066 ret = packet_mc_drop(sk, &mreq);
3067 return ret;
3068 }
a2efcfa0 3069
1da177e4 3070 case PACKET_RX_RING:
69e3c75f 3071 case PACKET_TX_RING:
1da177e4 3072 {
f6fb8f10 3073 union tpacket_req_u req_u;
3074 int len;
1da177e4 3075
f6fb8f10 3076 switch (po->tp_version) {
3077 case TPACKET_V1:
3078 case TPACKET_V2:
3079 len = sizeof(req_u.req);
3080 break;
3081 case TPACKET_V3:
3082 default:
3083 len = sizeof(req_u.req3);
3084 break;
3085 }
3086 if (optlen < len)
1da177e4 3087 return -EINVAL;
bfd5f4a3
SS
3088 if (pkt_sk(sk)->has_vnet_hdr)
3089 return -EINVAL;
f6fb8f10 3090 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3091 return -EFAULT;
f6fb8f10 3092 return packet_set_ring(sk, &req_u, 0,
3093 optname == PACKET_TX_RING);
1da177e4
LT
3094 }
3095 case PACKET_COPY_THRESH:
3096 {
3097 int val;
3098
40d4e3df 3099 if (optlen != sizeof(val))
1da177e4 3100 return -EINVAL;
40d4e3df 3101 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3102 return -EFAULT;
3103
3104 pkt_sk(sk)->copy_thresh = val;
3105 return 0;
3106 }
bbd6ef87
PM
3107 case PACKET_VERSION:
3108 {
3109 int val;
3110
3111 if (optlen != sizeof(val))
3112 return -EINVAL;
69e3c75f 3113 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3114 return -EBUSY;
3115 if (copy_from_user(&val, optval, sizeof(val)))
3116 return -EFAULT;
3117 switch (val) {
3118 case TPACKET_V1:
3119 case TPACKET_V2:
f6fb8f10 3120 case TPACKET_V3:
bbd6ef87
PM
3121 po->tp_version = val;
3122 return 0;
3123 default:
3124 return -EINVAL;
3125 }
3126 }
8913336a
PM
3127 case PACKET_RESERVE:
3128 {
3129 unsigned int val;
3130
3131 if (optlen != sizeof(val))
3132 return -EINVAL;
69e3c75f 3133 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3134 return -EBUSY;
3135 if (copy_from_user(&val, optval, sizeof(val)))
3136 return -EFAULT;
3137 po->tp_reserve = val;
3138 return 0;
3139 }
69e3c75f
JB
3140 case PACKET_LOSS:
3141 {
3142 unsigned int val;
3143
3144 if (optlen != sizeof(val))
3145 return -EINVAL;
3146 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3147 return -EBUSY;
3148 if (copy_from_user(&val, optval, sizeof(val)))
3149 return -EFAULT;
3150 po->tp_loss = !!val;
3151 return 0;
3152 }
8dc41944
HX
3153 case PACKET_AUXDATA:
3154 {
3155 int val;
3156
3157 if (optlen < sizeof(val))
3158 return -EINVAL;
3159 if (copy_from_user(&val, optval, sizeof(val)))
3160 return -EFAULT;
3161
3162 po->auxdata = !!val;
3163 return 0;
3164 }
80feaacb
PWJ
3165 case PACKET_ORIGDEV:
3166 {
3167 int val;
3168
3169 if (optlen < sizeof(val))
3170 return -EINVAL;
3171 if (copy_from_user(&val, optval, sizeof(val)))
3172 return -EFAULT;
3173
3174 po->origdev = !!val;
3175 return 0;
3176 }
bfd5f4a3
SS
3177 case PACKET_VNET_HDR:
3178 {
3179 int val;
3180
3181 if (sock->type != SOCK_RAW)
3182 return -EINVAL;
3183 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3184 return -EBUSY;
3185 if (optlen < sizeof(val))
3186 return -EINVAL;
3187 if (copy_from_user(&val, optval, sizeof(val)))
3188 return -EFAULT;
3189
3190 po->has_vnet_hdr = !!val;
3191 return 0;
3192 }
614f60fa
SM
3193 case PACKET_TIMESTAMP:
3194 {
3195 int val;
3196
3197 if (optlen != sizeof(val))
3198 return -EINVAL;
3199 if (copy_from_user(&val, optval, sizeof(val)))
3200 return -EFAULT;
3201
3202 po->tp_tstamp = val;
3203 return 0;
3204 }
dc99f600
DM
3205 case PACKET_FANOUT:
3206 {
3207 int val;
3208
3209 if (optlen != sizeof(val))
3210 return -EINVAL;
3211 if (copy_from_user(&val, optval, sizeof(val)))
3212 return -EFAULT;
3213
3214 return fanout_add(sk, val & 0xffff, val >> 16);
3215 }
5920cd3a
PC
3216 case PACKET_TX_HAS_OFF:
3217 {
3218 unsigned int val;
3219
3220 if (optlen != sizeof(val))
3221 return -EINVAL;
3222 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3223 return -EBUSY;
3224 if (copy_from_user(&val, optval, sizeof(val)))
3225 return -EFAULT;
3226 po->tp_tx_has_off = !!val;
3227 return 0;
3228 }
1da177e4
LT
3229 default:
3230 return -ENOPROTOOPT;
3231 }
3232}
3233
3234static int packet_getsockopt(struct socket *sock, int level, int optname,
3235 char __user *optval, int __user *optlen)
3236{
3237 int len;
c06fff6e 3238 int val, lv = sizeof(val);
1da177e4
LT
3239 struct sock *sk = sock->sk;
3240 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3241 void *data = &val;
ee80fbf3 3242 union tpacket_stats_u st;
1da177e4
LT
3243
3244 if (level != SOL_PACKET)
3245 return -ENOPROTOOPT;
3246
8ae55f04
KK
3247 if (get_user(len, optlen))
3248 return -EFAULT;
1da177e4
LT
3249
3250 if (len < 0)
3251 return -EINVAL;
1ce4f28b 3252
69e3c75f 3253 switch (optname) {
1da177e4 3254 case PACKET_STATISTICS:
1da177e4 3255 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3256 memcpy(&st, &po->stats, sizeof(st));
3257 memset(&po->stats, 0, sizeof(po->stats));
3258 spin_unlock_bh(&sk->sk_receive_queue.lock);
3259
f6fb8f10 3260 if (po->tp_version == TPACKET_V3) {
c06fff6e 3261 lv = sizeof(struct tpacket_stats_v3);
fc26e4cf 3262 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3263 data = &st.stats3;
f6fb8f10 3264 } else {
c06fff6e 3265 lv = sizeof(struct tpacket_stats);
fc26e4cf 3266 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3267 data = &st.stats1;
f6fb8f10 3268 }
ee80fbf3 3269
8dc41944
HX
3270 break;
3271 case PACKET_AUXDATA:
8dc41944 3272 val = po->auxdata;
80feaacb
PWJ
3273 break;
3274 case PACKET_ORIGDEV:
80feaacb 3275 val = po->origdev;
bfd5f4a3
SS
3276 break;
3277 case PACKET_VNET_HDR:
bfd5f4a3 3278 val = po->has_vnet_hdr;
1da177e4 3279 break;
bbd6ef87 3280 case PACKET_VERSION:
bbd6ef87 3281 val = po->tp_version;
bbd6ef87
PM
3282 break;
3283 case PACKET_HDRLEN:
3284 if (len > sizeof(int))
3285 len = sizeof(int);
3286 if (copy_from_user(&val, optval, len))
3287 return -EFAULT;
3288 switch (val) {
3289 case TPACKET_V1:
3290 val = sizeof(struct tpacket_hdr);
3291 break;
3292 case TPACKET_V2:
3293 val = sizeof(struct tpacket2_hdr);
3294 break;
f6fb8f10 3295 case TPACKET_V3:
3296 val = sizeof(struct tpacket3_hdr);
3297 break;
bbd6ef87
PM
3298 default:
3299 return -EINVAL;
3300 }
bbd6ef87 3301 break;
8913336a 3302 case PACKET_RESERVE:
8913336a 3303 val = po->tp_reserve;
8913336a 3304 break;
69e3c75f 3305 case PACKET_LOSS:
69e3c75f 3306 val = po->tp_loss;
69e3c75f 3307 break;
614f60fa 3308 case PACKET_TIMESTAMP:
614f60fa 3309 val = po->tp_tstamp;
614f60fa 3310 break;
dc99f600 3311 case PACKET_FANOUT:
dc99f600
DM
3312 val = (po->fanout ?
3313 ((u32)po->fanout->id |
77f65ebd
WB
3314 ((u32)po->fanout->type << 16) |
3315 ((u32)po->fanout->flags << 24)) :
dc99f600 3316 0);
dc99f600 3317 break;
5920cd3a
PC
3318 case PACKET_TX_HAS_OFF:
3319 val = po->tp_tx_has_off;
3320 break;
1da177e4
LT
3321 default:
3322 return -ENOPROTOOPT;
3323 }
3324
c06fff6e
ED
3325 if (len > lv)
3326 len = lv;
8ae55f04
KK
3327 if (put_user(len, optlen))
3328 return -EFAULT;
8dc41944
HX
3329 if (copy_to_user(optval, data, len))
3330 return -EFAULT;
8ae55f04 3331 return 0;
1da177e4
LT
3332}
3333
3334
3335static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3336{
3337 struct sock *sk;
ad930650 3338 struct net_device *dev = data;
c346dca1 3339 struct net *net = dev_net(dev);
1da177e4 3340
808f5114 3341 rcu_read_lock();
b67bfe0d 3342 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3343 struct packet_sock *po = pkt_sk(sk);
3344
3345 switch (msg) {
3346 case NETDEV_UNREGISTER:
1da177e4
LT
3347 if (po->mclist)
3348 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3349 /* fallthrough */
3350
1da177e4
LT
3351 case NETDEV_DOWN:
3352 if (dev->ifindex == po->ifindex) {
3353 spin_lock(&po->bind_lock);
3354 if (po->running) {
ce06b03e 3355 __unregister_prot_hook(sk, false);
1da177e4
LT
3356 sk->sk_err = ENETDOWN;
3357 if (!sock_flag(sk, SOCK_DEAD))
3358 sk->sk_error_report(sk);
3359 }
3360 if (msg == NETDEV_UNREGISTER) {
3361 po->ifindex = -1;
160ff18a
BG
3362 if (po->prot_hook.dev)
3363 dev_put(po->prot_hook.dev);
1da177e4
LT
3364 po->prot_hook.dev = NULL;
3365 }
3366 spin_unlock(&po->bind_lock);
3367 }
3368 break;
3369 case NETDEV_UP:
808f5114 3370 if (dev->ifindex == po->ifindex) {
3371 spin_lock(&po->bind_lock);
ce06b03e
DM
3372 if (po->num)
3373 register_prot_hook(sk);
808f5114 3374 spin_unlock(&po->bind_lock);
1da177e4 3375 }
1da177e4
LT
3376 break;
3377 }
3378 }
808f5114 3379 rcu_read_unlock();
1da177e4
LT
3380 return NOTIFY_DONE;
3381}
3382
3383
3384static int packet_ioctl(struct socket *sock, unsigned int cmd,
3385 unsigned long arg)
3386{
3387 struct sock *sk = sock->sk;
3388
69e3c75f 3389 switch (cmd) {
40d4e3df
ED
3390 case SIOCOUTQ:
3391 {
3392 int amount = sk_wmem_alloc_get(sk);
31e6d363 3393
40d4e3df
ED
3394 return put_user(amount, (int __user *)arg);
3395 }
3396 case SIOCINQ:
3397 {
3398 struct sk_buff *skb;
3399 int amount = 0;
3400
3401 spin_lock_bh(&sk->sk_receive_queue.lock);
3402 skb = skb_peek(&sk->sk_receive_queue);
3403 if (skb)
3404 amount = skb->len;
3405 spin_unlock_bh(&sk->sk_receive_queue.lock);
3406 return put_user(amount, (int __user *)arg);
3407 }
3408 case SIOCGSTAMP:
3409 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3410 case SIOCGSTAMPNS:
3411 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3412
1da177e4 3413#ifdef CONFIG_INET
40d4e3df
ED
3414 case SIOCADDRT:
3415 case SIOCDELRT:
3416 case SIOCDARP:
3417 case SIOCGARP:
3418 case SIOCSARP:
3419 case SIOCGIFADDR:
3420 case SIOCSIFADDR:
3421 case SIOCGIFBRDADDR:
3422 case SIOCSIFBRDADDR:
3423 case SIOCGIFNETMASK:
3424 case SIOCSIFNETMASK:
3425 case SIOCGIFDSTADDR:
3426 case SIOCSIFDSTADDR:
3427 case SIOCSIFFLAGS:
40d4e3df 3428 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3429#endif
3430
40d4e3df
ED
3431 default:
3432 return -ENOIOCTLCMD;
1da177e4
LT
3433 }
3434 return 0;
3435}
3436
40d4e3df 3437static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3438 poll_table *wait)
3439{
3440 struct sock *sk = sock->sk;
3441 struct packet_sock *po = pkt_sk(sk);
3442 unsigned int mask = datagram_poll(file, sock, wait);
3443
3444 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3445 if (po->rx_ring.pg_vec) {
f6fb8f10 3446 if (!packet_previous_rx_frame(po, &po->rx_ring,
3447 TP_STATUS_KERNEL))
1da177e4
LT
3448 mask |= POLLIN | POLLRDNORM;
3449 }
3450 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3451 spin_lock_bh(&sk->sk_write_queue.lock);
3452 if (po->tx_ring.pg_vec) {
3453 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3454 mask |= POLLOUT | POLLWRNORM;
3455 }
3456 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3457 return mask;
3458}
3459
3460
3461/* Dirty? Well, I still did not learn better way to account
3462 * for user mmaps.
3463 */
3464
3465static void packet_mm_open(struct vm_area_struct *vma)
3466{
3467 struct file *file = vma->vm_file;
40d4e3df 3468 struct socket *sock = file->private_data;
1da177e4 3469 struct sock *sk = sock->sk;
1ce4f28b 3470
1da177e4
LT
3471 if (sk)
3472 atomic_inc(&pkt_sk(sk)->mapped);
3473}
3474
3475static void packet_mm_close(struct vm_area_struct *vma)
3476{
3477 struct file *file = vma->vm_file;
40d4e3df 3478 struct socket *sock = file->private_data;
1da177e4 3479 struct sock *sk = sock->sk;
1ce4f28b 3480
1da177e4
LT
3481 if (sk)
3482 atomic_dec(&pkt_sk(sk)->mapped);
3483}
3484
f0f37e2f 3485static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3486 .open = packet_mm_open,
3487 .close = packet_mm_close,
1da177e4
LT
3488};
3489
0e3125c7
NH
3490static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3491 unsigned int len)
1da177e4
LT
3492{
3493 int i;
3494
4ebf0ae2 3495 for (i = 0; i < len; i++) {
0e3125c7 3496 if (likely(pg_vec[i].buffer)) {
c56b4d90 3497 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3498 vfree(pg_vec[i].buffer);
3499 else
3500 free_pages((unsigned long)pg_vec[i].buffer,
3501 order);
3502 pg_vec[i].buffer = NULL;
3503 }
1da177e4
LT
3504 }
3505 kfree(pg_vec);
3506}
3507
eea49cc9 3508static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3509{
0e3125c7
NH
3510 char *buffer = NULL;
3511 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3512 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3513
3514 buffer = (char *) __get_free_pages(gfp_flags, order);
3515
3516 if (buffer)
3517 return buffer;
3518
3519 /*
3520 * __get_free_pages failed, fall back to vmalloc
3521 */
bbce5a59 3522 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3523
0e3125c7
NH
3524 if (buffer)
3525 return buffer;
3526
3527 /*
3528 * vmalloc failed, lets dig into swap here
3529 */
0e3125c7
NH
3530 gfp_flags &= ~__GFP_NORETRY;
3531 buffer = (char *)__get_free_pages(gfp_flags, order);
3532 if (buffer)
3533 return buffer;
3534
3535 /*
3536 * complete and utter failure
3537 */
3538 return NULL;
4ebf0ae2
DM
3539}
3540
0e3125c7 3541static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3542{
3543 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3544 struct pgv *pg_vec;
4ebf0ae2
DM
3545 int i;
3546
0e3125c7 3547 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3548 if (unlikely(!pg_vec))
3549 goto out;
3550
3551 for (i = 0; i < block_nr; i++) {
c56b4d90 3552 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3553 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3554 goto out_free_pgvec;
3555 }
3556
3557out:
3558 return pg_vec;
3559
3560out_free_pgvec:
3561 free_pg_vec(pg_vec, order, block_nr);
3562 pg_vec = NULL;
3563 goto out;
3564}
1da177e4 3565
f6fb8f10 3566static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3567 int closing, int tx_ring)
1da177e4 3568{
0e3125c7 3569 struct pgv *pg_vec = NULL;
1da177e4 3570 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3571 int was_running, order = 0;
69e3c75f
JB
3572 struct packet_ring_buffer *rb;
3573 struct sk_buff_head *rb_queue;
0e11c91e 3574 __be16 num;
f6fb8f10 3575 int err = -EINVAL;
3576 /* Added to avoid minimal code churn */
3577 struct tpacket_req *req = &req_u->req;
3578
3579 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3580 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3581 WARN(1, "Tx-ring is not supported.\n");
3582 goto out;
3583 }
1ce4f28b 3584
69e3c75f
JB
3585 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3586 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3587
69e3c75f
JB
3588 err = -EBUSY;
3589 if (!closing) {
3590 if (atomic_read(&po->mapped))
3591 goto out;
3592 if (atomic_read(&rb->pending))
3593 goto out;
3594 }
1da177e4 3595
69e3c75f
JB
3596 if (req->tp_block_nr) {
3597 /* Sanity tests and some calculations */
3598 err = -EBUSY;
3599 if (unlikely(rb->pg_vec))
3600 goto out;
1da177e4 3601
bbd6ef87
PM
3602 switch (po->tp_version) {
3603 case TPACKET_V1:
3604 po->tp_hdrlen = TPACKET_HDRLEN;
3605 break;
3606 case TPACKET_V2:
3607 po->tp_hdrlen = TPACKET2_HDRLEN;
3608 break;
f6fb8f10 3609 case TPACKET_V3:
3610 po->tp_hdrlen = TPACKET3_HDRLEN;
3611 break;
bbd6ef87
PM
3612 }
3613
69e3c75f 3614 err = -EINVAL;
4ebf0ae2 3615 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3616 goto out;
4ebf0ae2 3617 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3618 goto out;
8913336a 3619 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3620 po->tp_reserve))
3621 goto out;
4ebf0ae2 3622 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3623 goto out;
1da177e4 3624
69e3c75f
JB
3625 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3626 if (unlikely(rb->frames_per_block <= 0))
3627 goto out;
3628 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3629 req->tp_frame_nr))
3630 goto out;
1da177e4
LT
3631
3632 err = -ENOMEM;
4ebf0ae2
DM
3633 order = get_order(req->tp_block_size);
3634 pg_vec = alloc_pg_vec(req, order);
3635 if (unlikely(!pg_vec))
1da177e4 3636 goto out;
f6fb8f10 3637 switch (po->tp_version) {
3638 case TPACKET_V3:
3639 /* Transmit path is not supported. We checked
3640 * it above but just being paranoid
3641 */
3642 if (!tx_ring)
3643 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3644 break;
3645 default:
3646 break;
3647 }
69e3c75f
JB
3648 }
3649 /* Done */
3650 else {
3651 err = -EINVAL;
4ebf0ae2 3652 if (unlikely(req->tp_frame_nr))
69e3c75f 3653 goto out;
1da177e4
LT
3654 }
3655
3656 lock_sock(sk);
3657
3658 /* Detach socket from network */
3659 spin_lock(&po->bind_lock);
3660 was_running = po->running;
3661 num = po->num;
3662 if (was_running) {
1da177e4 3663 po->num = 0;
ce06b03e 3664 __unregister_prot_hook(sk, false);
1da177e4
LT
3665 }
3666 spin_unlock(&po->bind_lock);
1ce4f28b 3667
1da177e4
LT
3668 synchronize_net();
3669
3670 err = -EBUSY;
905db440 3671 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3672 if (closing || atomic_read(&po->mapped) == 0) {
3673 err = 0;
69e3c75f 3674 spin_lock_bh(&rb_queue->lock);
c053fd96 3675 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3676 rb->frame_max = (req->tp_frame_nr - 1);
3677 rb->head = 0;
3678 rb->frame_size = req->tp_frame_size;
3679 spin_unlock_bh(&rb_queue->lock);
3680
c053fd96
CG
3681 swap(rb->pg_vec_order, order);
3682 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3683
3684 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3685 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3686 tpacket_rcv : packet_rcv;
3687 skb_queue_purge(rb_queue);
1da177e4 3688 if (atomic_read(&po->mapped))
40d4e3df
ED
3689 pr_err("packet_mmap: vma is busy: %d\n",
3690 atomic_read(&po->mapped));
1da177e4 3691 }
905db440 3692 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3693
3694 spin_lock(&po->bind_lock);
ce06b03e 3695 if (was_running) {
1da177e4 3696 po->num = num;
ce06b03e 3697 register_prot_hook(sk);
1da177e4
LT
3698 }
3699 spin_unlock(&po->bind_lock);
f6fb8f10 3700 if (closing && (po->tp_version > TPACKET_V2)) {
3701 /* Because we don't support block-based V3 on tx-ring */
3702 if (!tx_ring)
3703 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3704 }
1da177e4
LT
3705 release_sock(sk);
3706
1da177e4
LT
3707 if (pg_vec)
3708 free_pg_vec(pg_vec, order, req->tp_block_nr);
3709out:
3710 return err;
3711}
3712
69e3c75f
JB
3713static int packet_mmap(struct file *file, struct socket *sock,
3714 struct vm_area_struct *vma)
1da177e4
LT
3715{
3716 struct sock *sk = sock->sk;
3717 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3718 unsigned long size, expected_size;
3719 struct packet_ring_buffer *rb;
1da177e4
LT
3720 unsigned long start;
3721 int err = -EINVAL;
3722 int i;
3723
3724 if (vma->vm_pgoff)
3725 return -EINVAL;
3726
905db440 3727 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3728
3729 expected_size = 0;
3730 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3731 if (rb->pg_vec) {
3732 expected_size += rb->pg_vec_len
3733 * rb->pg_vec_pages
3734 * PAGE_SIZE;
3735 }
3736 }
3737
3738 if (expected_size == 0)
1da177e4 3739 goto out;
69e3c75f
JB
3740
3741 size = vma->vm_end - vma->vm_start;
3742 if (size != expected_size)
1da177e4
LT
3743 goto out;
3744
1da177e4 3745 start = vma->vm_start;
69e3c75f
JB
3746 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3747 if (rb->pg_vec == NULL)
3748 continue;
3749
3750 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3751 struct page *page;
3752 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3753 int pg_num;
3754
c56b4d90
CG
3755 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3756 page = pgv_to_page(kaddr);
69e3c75f
JB
3757 err = vm_insert_page(vma, start, page);
3758 if (unlikely(err))
3759 goto out;
3760 start += PAGE_SIZE;
0e3125c7 3761 kaddr += PAGE_SIZE;
69e3c75f 3762 }
4ebf0ae2 3763 }
1da177e4 3764 }
69e3c75f 3765
4ebf0ae2 3766 atomic_inc(&po->mapped);
1da177e4
LT
3767 vma->vm_ops = &packet_mmap_ops;
3768 err = 0;
3769
3770out:
905db440 3771 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3772 return err;
3773}
1da177e4 3774
90ddc4f0 3775static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3776 .family = PF_PACKET,
3777 .owner = THIS_MODULE,
3778 .release = packet_release,
3779 .bind = packet_bind_spkt,
3780 .connect = sock_no_connect,
3781 .socketpair = sock_no_socketpair,
3782 .accept = sock_no_accept,
3783 .getname = packet_getname_spkt,
3784 .poll = datagram_poll,
3785 .ioctl = packet_ioctl,
3786 .listen = sock_no_listen,
3787 .shutdown = sock_no_shutdown,
3788 .setsockopt = sock_no_setsockopt,
3789 .getsockopt = sock_no_getsockopt,
3790 .sendmsg = packet_sendmsg_spkt,
3791 .recvmsg = packet_recvmsg,
3792 .mmap = sock_no_mmap,
3793 .sendpage = sock_no_sendpage,
3794};
1da177e4 3795
90ddc4f0 3796static const struct proto_ops packet_ops = {
1da177e4
LT
3797 .family = PF_PACKET,
3798 .owner = THIS_MODULE,
3799 .release = packet_release,
3800 .bind = packet_bind,
3801 .connect = sock_no_connect,
3802 .socketpair = sock_no_socketpair,
3803 .accept = sock_no_accept,
1ce4f28b 3804 .getname = packet_getname,
1da177e4
LT
3805 .poll = packet_poll,
3806 .ioctl = packet_ioctl,
3807 .listen = sock_no_listen,
3808 .shutdown = sock_no_shutdown,
3809 .setsockopt = packet_setsockopt,
3810 .getsockopt = packet_getsockopt,
3811 .sendmsg = packet_sendmsg,
3812 .recvmsg = packet_recvmsg,
3813 .mmap = packet_mmap,
3814 .sendpage = sock_no_sendpage,
3815};
3816
ec1b4cf7 3817static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3818 .family = PF_PACKET,
3819 .create = packet_create,
3820 .owner = THIS_MODULE,
3821};
3822
3823static struct notifier_block packet_netdev_notifier = {
40d4e3df 3824 .notifier_call = packet_notifier,
1da177e4
LT
3825};
3826
3827#ifdef CONFIG_PROC_FS
1da177e4
LT
3828
3829static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3830 __acquires(RCU)
1da177e4 3831{
e372c414 3832 struct net *net = seq_file_net(seq);
808f5114 3833
3834 rcu_read_lock();
3835 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3836}
3837
3838static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3839{
1bf40954 3840 struct net *net = seq_file_net(seq);
808f5114 3841 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3842}
3843
3844static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3845 __releases(RCU)
1da177e4 3846{
808f5114 3847 rcu_read_unlock();
1da177e4
LT
3848}
3849
1ce4f28b 3850static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3851{
3852 if (v == SEQ_START_TOKEN)
3853 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3854 else {
b7ceabd9 3855 struct sock *s = sk_entry(v);
1da177e4
LT
3856 const struct packet_sock *po = pkt_sk(s);
3857
3858 seq_printf(seq,
71338aa7 3859 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3860 s,
3861 atomic_read(&s->sk_refcnt),
3862 s->sk_type,
3863 ntohs(po->num),
3864 po->ifindex,
3865 po->running,
3866 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 3867 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 3868 sock_i_ino(s));
1da177e4
LT
3869 }
3870
3871 return 0;
3872}
3873
56b3d975 3874static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3875 .start = packet_seq_start,
3876 .next = packet_seq_next,
3877 .stop = packet_seq_stop,
3878 .show = packet_seq_show,
3879};
3880
3881static int packet_seq_open(struct inode *inode, struct file *file)
3882{
e372c414
DL
3883 return seq_open_net(inode, file, &packet_seq_ops,
3884 sizeof(struct seq_net_private));
1da177e4
LT
3885}
3886
da7071d7 3887static const struct file_operations packet_seq_fops = {
1da177e4
LT
3888 .owner = THIS_MODULE,
3889 .open = packet_seq_open,
3890 .read = seq_read,
3891 .llseek = seq_lseek,
e372c414 3892 .release = seq_release_net,
1da177e4
LT
3893};
3894
3895#endif
3896
2c8c1e72 3897static int __net_init packet_net_init(struct net *net)
d12d01d6 3898{
0fa7fa98 3899 mutex_init(&net->packet.sklist_lock);
2aaef4e4 3900 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 3901
d4beaa66 3902 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
3903 return -ENOMEM;
3904
3905 return 0;
3906}
3907
2c8c1e72 3908static void __net_exit packet_net_exit(struct net *net)
d12d01d6 3909{
ece31ffd 3910 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
3911}
3912
3913static struct pernet_operations packet_net_ops = {
3914 .init = packet_net_init,
3915 .exit = packet_net_exit,
3916};
3917
3918
1da177e4
LT
3919static void __exit packet_exit(void)
3920{
1da177e4 3921 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3922 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3923 sock_unregister(PF_PACKET);
3924 proto_unregister(&packet_proto);
3925}
3926
3927static int __init packet_init(void)
3928{
3929 int rc = proto_register(&packet_proto, 0);
3930
3931 if (rc != 0)
3932 goto out;
3933
3934 sock_register(&packet_family_ops);
d12d01d6 3935 register_pernet_subsys(&packet_net_ops);
1da177e4 3936 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3937out:
3938 return rc;
3939}
3940
3941module_init(packet_init);
3942module_exit(packet_exit);
3943MODULE_LICENSE("GPL");
3944MODULE_ALIAS_NETPROTO(PF_PACKET);