import PULS_20180308
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
1da177e4
LT
91
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
2787b04b
PE
96#include "internal.h"
97
1da177e4
LT
98/*
99 Assumptions:
100 - if device has no dev->hard_header routine, it adds and removes ll header
101 inside itself. In this case ll header is invisible outside of device,
102 but higher levels still should reserve dev->hard_header_len.
103 Some devices are enough clever to reallocate skb, when header
104 will not fit to reserved space (tunnel), another ones are silly
105 (PPP).
106 - packet socket receives packets with pulled ll header,
107 so that SOCK_RAW should push it back.
108
109On receive:
110-----------
111
112Incoming, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> data
1da177e4
LT
115
116Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
117 mac_header -> ll header
118 data -> ll header
1da177e4
LT
119
120Incoming, dev->hard_header==NULL
b0e380b1
ACM
121 mac_header -> UNKNOWN position. It is very likely, that it points to ll
122 header. PPP makes it, that is wrong, because introduce
db0c58f9 123 assymetry between rx and tx paths.
b0e380b1 124 data -> data
1da177e4
LT
125
126Outgoing, dev->hard_header==NULL
b0e380b1
ACM
127 mac_header -> data. ll header is still not built!
128 data -> data
1da177e4
LT
129
130Resume
131 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
132
133
134On transmit:
135------------
136
137dev->hard_header != NULL
b0e380b1
ACM
138 mac_header -> ll header
139 data -> ll header
1da177e4
LT
140
141dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
142 mac_header -> data
143 data -> data
1da177e4
LT
144
145 We should set nh.raw on output to correct posistion,
146 packet classifier depends on it.
147 */
148
1da177e4
LT
149/* Private packet socket structures. */
150
0fb375fb
EB
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
40d4e3df 154struct packet_mreq_max {
0fb375fb
EB
155 int mr_ifindex;
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 159};
a2efcfa0 160
184f489e
DB
161union tpacket_uhdr {
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
165 void *raw;
166};
167
f6fb8f10 168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
169 int closing, int tx_ring);
170
f6fb8f10 171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
f6fb8f10 178#define PGV_FROM_VMALLOC 1
69e3c75f 179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f
JB
188struct packet_sock;
189static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
77f65ebd
WB
190static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 192
f6fb8f10 193static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
195 int status);
196static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 197static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *);
bc59ba39 201static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 202 struct packet_sock *, unsigned int status);
bc59ba39 203static int prb_queue_frozen(struct tpacket_kbdq_core *);
204static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
f6fb8f10 206static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 207static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
1da177e4
LT
216static void packet_flush_mclist(struct sock *sk);
217
ffbc6111
HX
218struct packet_skb_cb {
219 unsigned int origlen;
220 union {
221 struct sockaddr_pkt pkt;
222 struct sockaddr_ll ll;
223 } sa;
224};
225
226#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 227
bc59ba39 228#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 229#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 231#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 233#define GET_NEXT_PRB_BLK_NUM(x) \
234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
235 ((x)->kactive_blk_num+1) : 0)
236
dc99f600
DM
237static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238static void __fanout_link(struct sock *sk, struct packet_sock *po);
239
c3ac8a13
DB
240static struct net_device *packet_cached_dev_get(struct packet_sock *po)
241{
242 struct net_device *dev;
243
244 rcu_read_lock();
245 dev = rcu_dereference(po->cached_dev);
246 if (likely(dev))
247 dev_hold(dev);
248 rcu_read_unlock();
249
250 return dev;
251}
252
253static void packet_cached_dev_assign(struct packet_sock *po,
254 struct net_device *dev)
255{
256 rcu_assign_pointer(po->cached_dev, dev);
257}
258
259static void packet_cached_dev_reset(struct packet_sock *po)
260{
261 RCU_INIT_POINTER(po->cached_dev, NULL);
262}
263
ce06b03e
DM
264/* register_prot_hook must be invoked with the po->bind_lock held,
265 * or from a context in which asynchronous accesses to the packet
266 * socket is not possible (packet_create()).
267 */
268static void register_prot_hook(struct sock *sk)
269{
270 struct packet_sock *po = pkt_sk(sk);
026bb405 271
ce06b03e 272 if (!po->running) {
c3ac8a13 273 if (po->fanout)
dc99f600 274 __fanout_link(sk, po);
c3ac8a13 275 else
dc99f600 276 dev_add_pack(&po->prot_hook);
026bb405 277
ce06b03e
DM
278 sock_hold(sk);
279 po->running = 1;
280 }
281}
282
283/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
284 * held. If the sync parameter is true, we will temporarily drop
285 * the po->bind_lock and do a synchronize_net to make sure no
286 * asynchronous packet processing paths still refer to the elements
287 * of po->prot_hook. If the sync parameter is false, it is the
288 * callers responsibility to take care of this.
289 */
290static void __unregister_prot_hook(struct sock *sk, bool sync)
291{
292 struct packet_sock *po = pkt_sk(sk);
293
294 po->running = 0;
c3ac8a13
DB
295
296 if (po->fanout)
dc99f600 297 __fanout_unlink(sk, po);
c3ac8a13 298 else
dc99f600 299 __dev_remove_pack(&po->prot_hook);
026bb405 300
ce06b03e
DM
301 __sock_put(sk);
302
303 if (sync) {
304 spin_unlock(&po->bind_lock);
305 synchronize_net();
306 spin_lock(&po->bind_lock);
307 }
308}
309
310static void unregister_prot_hook(struct sock *sk, bool sync)
311{
312 struct packet_sock *po = pkt_sk(sk);
313
314 if (po->running)
315 __unregister_prot_hook(sk, sync);
316}
317
f6dafa95 318static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
319{
320 if (is_vmalloc_addr(addr))
321 return vmalloc_to_page(addr);
322 return virt_to_page(addr);
323}
324
69e3c75f 325static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 326{
184f489e 327 union tpacket_uhdr h;
1da177e4 328
69e3c75f 329 h.raw = frame;
bbd6ef87
PM
330 switch (po->tp_version) {
331 case TPACKET_V1:
69e3c75f 332 h.h1->tp_status = status;
0af55bb5 333 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
334 break;
335 case TPACKET_V2:
69e3c75f 336 h.h2->tp_status = status;
0af55bb5 337 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 338 break;
f6fb8f10 339 case TPACKET_V3:
69e3c75f 340 default:
f6fb8f10 341 WARN(1, "TPACKET version not supported.\n");
69e3c75f 342 BUG();
bbd6ef87 343 }
69e3c75f
JB
344
345 smp_wmb();
bbd6ef87
PM
346}
347
69e3c75f 348static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 349{
184f489e 350 union tpacket_uhdr h;
bbd6ef87 351
69e3c75f
JB
352 smp_rmb();
353
bbd6ef87
PM
354 h.raw = frame;
355 switch (po->tp_version) {
356 case TPACKET_V1:
0af55bb5 357 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 358 return h.h1->tp_status;
bbd6ef87 359 case TPACKET_V2:
0af55bb5 360 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 361 return h.h2->tp_status;
f6fb8f10 362 case TPACKET_V3:
69e3c75f 363 default:
f6fb8f10 364 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
365 BUG();
366 return 0;
bbd6ef87 367 }
1da177e4 368}
69e3c75f 369
b9c32fb2
DB
370static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
371 unsigned int flags)
7a51384c
DB
372{
373 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
374
375 if (shhwtstamps) {
376 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
377 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
b9c32fb2 378 return TP_STATUS_TS_SYS_HARDWARE;
7a51384c
DB
379 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
380 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
b9c32fb2 381 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
382 }
383
384 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 385 return TP_STATUS_TS_SOFTWARE;
7a51384c 386
b9c32fb2 387 return 0;
7a51384c
DB
388}
389
b9c32fb2
DB
390static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
391 struct sk_buff *skb)
2e31396f
WB
392{
393 union tpacket_uhdr h;
394 struct timespec ts;
b9c32fb2 395 __u32 ts_status;
2e31396f 396
b9c32fb2
DB
397 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
398 return 0;
2e31396f
WB
399
400 h.raw = frame;
401 switch (po->tp_version) {
402 case TPACKET_V1:
403 h.h1->tp_sec = ts.tv_sec;
404 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
405 break;
406 case TPACKET_V2:
407 h.h2->tp_sec = ts.tv_sec;
408 h.h2->tp_nsec = ts.tv_nsec;
409 break;
410 case TPACKET_V3:
411 default:
412 WARN(1, "TPACKET version not supported.\n");
413 BUG();
414 }
415
416 /* one flush is safe, as both fields always lie on the same cacheline */
417 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
418 smp_wmb();
b9c32fb2
DB
419
420 return ts_status;
2e31396f
WB
421}
422
69e3c75f
JB
423static void *packet_lookup_frame(struct packet_sock *po,
424 struct packet_ring_buffer *rb,
425 unsigned int position,
426 int status)
427{
428 unsigned int pg_vec_pos, frame_offset;
184f489e 429 union tpacket_uhdr h;
69e3c75f
JB
430
431 pg_vec_pos = position / rb->frames_per_block;
432 frame_offset = position % rb->frames_per_block;
433
0e3125c7
NH
434 h.raw = rb->pg_vec[pg_vec_pos].buffer +
435 (frame_offset * rb->frame_size);
69e3c75f
JB
436
437 if (status != __packet_get_status(po, h.raw))
438 return NULL;
439
440 return h.raw;
441}
442
eea49cc9 443static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
444 struct packet_ring_buffer *rb,
445 int status)
446{
447 return packet_lookup_frame(po, rb, rb->head, status);
448}
449
bc59ba39 450static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 451{
452 del_timer_sync(&pkc->retire_blk_timer);
453}
454
455static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
456 int tx_ring,
457 struct sk_buff_head *rb_queue)
458{
bc59ba39 459 struct tpacket_kbdq_core *pkc;
f6fb8f10 460
461 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
462
5b9e9be7 463 spin_lock_bh(&rb_queue->lock);
f6fb8f10 464 pkc->delete_blk_timer = 1;
5b9e9be7 465 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 466
467 prb_del_retire_blk_timer(pkc);
468}
469
470static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 471 struct tpacket_kbdq_core *pkc,
f6fb8f10 472 void (*func) (unsigned long))
473{
474 init_timer(&pkc->retire_blk_timer);
475 pkc->retire_blk_timer.data = (long)po;
476 pkc->retire_blk_timer.function = func;
477 pkc->retire_blk_timer.expires = jiffies;
478}
479
480static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
481{
bc59ba39 482 struct tpacket_kbdq_core *pkc;
f6fb8f10 483
484 if (tx_ring)
485 BUG();
486
487 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
488 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
489}
490
491static int prb_calc_retire_blk_tmo(struct packet_sock *po,
492 int blk_size_in_bytes)
493{
494 struct net_device *dev;
495 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
496 struct ethtool_cmd ecmd;
497 int err;
e440cf2c 498 u32 speed;
f6fb8f10 499
4bc71cb9
JP
500 rtnl_lock();
501 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
502 if (unlikely(!dev)) {
503 rtnl_unlock();
f6fb8f10 504 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
505 }
506 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 507 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
508 rtnl_unlock();
509 if (!err) {
4bc71cb9
JP
510 /*
511 * If the link speed is so slow you don't really
512 * need to worry about perf anyways
513 */
e440cf2c 514 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 515 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 516 } else {
517 msec = 1;
518 div = speed / 1000;
f6fb8f10 519 }
520 }
521
522 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
523
524 if (div)
525 mbits /= div;
526
527 tmo = mbits * msec;
528
529 if (div)
530 return tmo+1;
531 return tmo;
532}
533
bc59ba39 534static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 535 union tpacket_req_u *req_u)
536{
537 p1->feature_req_word = req_u->req3.tp_feature_req_word;
538}
539
540static void init_prb_bdqc(struct packet_sock *po,
541 struct packet_ring_buffer *rb,
542 struct pgv *pg_vec,
543 union tpacket_req_u *req_u, int tx_ring)
544{
bc59ba39 545 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
546 struct tpacket_block_desc *pbd;
f6fb8f10 547
548 memset(p1, 0x0, sizeof(*p1));
549
550 p1->knxt_seq_num = 1;
551 p1->pkbdq = pg_vec;
bc59ba39 552 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 553 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 554 p1->kblk_size = req_u->req3.tp_block_size;
555 p1->knum_blocks = req_u->req3.tp_block_nr;
556 p1->hdrlen = po->tp_hdrlen;
557 p1->version = po->tp_version;
558 p1->last_kactive_blk_num = 0;
ee80fbf3 559 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 560 if (req_u->req3.tp_retire_blk_tov)
561 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
562 else
563 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
564 req_u->req3.tp_block_size);
565 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
566 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
567
568 prb_init_ft_ops(p1, req_u);
569 prb_setup_retire_blk_timer(po, tx_ring);
570 prb_open_block(p1, pbd);
571}
572
573/* Do NOT update the last_blk_num first.
574 * Assumes sk_buff_head lock is held.
575 */
bc59ba39 576static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 577{
578 mod_timer(&pkc->retire_blk_timer,
579 jiffies + pkc->tov_in_jiffies);
580 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
581}
582
583/*
584 * Timer logic:
585 * 1) We refresh the timer only when we open a block.
586 * By doing this we don't waste cycles refreshing the timer
587 * on packet-by-packet basis.
588 *
589 * With a 1MB block-size, on a 1Gbps line, it will take
590 * i) ~8 ms to fill a block + ii) memcpy etc.
591 * In this cut we are not accounting for the memcpy time.
592 *
593 * So, if the user sets the 'tmo' to 10ms then the timer
594 * will never fire while the block is still getting filled
595 * (which is what we want). However, the user could choose
596 * to close a block early and that's fine.
597 *
598 * But when the timer does fire, we check whether or not to refresh it.
599 * Since the tmo granularity is in msecs, it is not too expensive
600 * to refresh the timer, lets say every '8' msecs.
601 * Either the user can set the 'tmo' or we can derive it based on
602 * a) line-speed and b) block-size.
603 * prb_calc_retire_blk_tmo() calculates the tmo.
604 *
605 */
606static void prb_retire_rx_blk_timer_expired(unsigned long data)
607{
608 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 609 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 610 unsigned int frozen;
bc59ba39 611 struct tpacket_block_desc *pbd;
f6fb8f10 612
613 spin_lock(&po->sk.sk_receive_queue.lock);
614
615 frozen = prb_queue_frozen(pkc);
616 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
617
618 if (unlikely(pkc->delete_blk_timer))
619 goto out;
620
621 /* We only need to plug the race when the block is partially filled.
622 * tpacket_rcv:
623 * lock(); increment BLOCK_NUM_PKTS; unlock()
624 * copy_bits() is in progress ...
625 * timer fires on other cpu:
626 * we can't retire the current block because copy_bits
627 * is in progress.
628 *
629 */
630 if (BLOCK_NUM_PKTS(pbd)) {
631 while (atomic_read(&pkc->blk_fill_in_prog)) {
632 /* Waiting for skb_copy_bits to finish... */
633 cpu_relax();
634 }
635 }
636
637 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
638 if (!frozen) {
639 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
640 if (!prb_dispatch_next_block(pkc, po))
641 goto refresh_timer;
642 else
643 goto out;
644 } else {
645 /* Case 1. Queue was frozen because user-space was
646 * lagging behind.
647 */
648 if (prb_curr_blk_in_use(pkc, pbd)) {
649 /*
650 * Ok, user-space is still behind.
651 * So just refresh the timer.
652 */
653 goto refresh_timer;
654 } else {
655 /* Case 2. queue was frozen,user-space caught up,
656 * now the link went idle && the timer fired.
657 * We don't have a block to close.So we open this
658 * block and restart the timer.
659 * opening a block thaws the queue,restarts timer
660 * Thawing/timer-refresh is a side effect.
661 */
662 prb_open_block(pkc, pbd);
663 goto out;
664 }
665 }
666 }
667
668refresh_timer:
669 _prb_refresh_rx_retire_blk_timer(pkc);
670
671out:
672 spin_unlock(&po->sk.sk_receive_queue.lock);
673}
674
eea49cc9 675static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 676 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 677{
678 /* Flush everything minus the block header */
679
680#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
681 u8 *start, *end;
682
683 start = (u8 *)pbd1;
684
685 /* Skip the block header(we know header WILL fit in 4K) */
686 start += PAGE_SIZE;
687
688 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
689 for (; start < end; start += PAGE_SIZE)
690 flush_dcache_page(pgv_to_page(start));
691
692 smp_wmb();
693#endif
694
695 /* Now update the block status. */
696
697 BLOCK_STATUS(pbd1) = status;
698
699 /* Flush the block header */
700
701#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
702 start = (u8 *)pbd1;
703 flush_dcache_page(pgv_to_page(start));
704
705 smp_wmb();
706#endif
707}
708
709/*
710 * Side effect:
711 *
712 * 1) flush the block
713 * 2) Increment active_blk_num
714 *
715 * Note:We DONT refresh the timer on purpose.
716 * Because almost always the next block will be opened.
717 */
bc59ba39 718static void prb_close_block(struct tpacket_kbdq_core *pkc1,
719 struct tpacket_block_desc *pbd1,
f6fb8f10 720 struct packet_sock *po, unsigned int stat)
721{
722 __u32 status = TP_STATUS_USER | stat;
723
724 struct tpacket3_hdr *last_pkt;
bc59ba39 725 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 726
ee80fbf3 727 if (po->stats.stats3.tp_drops)
f6fb8f10 728 status |= TP_STATUS_LOSING;
729
730 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
731 last_pkt->tp_next_offset = 0;
732
733 /* Get the ts of the last pkt */
734 if (BLOCK_NUM_PKTS(pbd1)) {
735 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
736 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
737 } else {
738 /* Ok, we tmo'd - so get the current time */
739 struct timespec ts;
740 getnstimeofday(&ts);
741 h1->ts_last_pkt.ts_sec = ts.tv_sec;
742 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
743 }
744
745 smp_wmb();
746
747 /* Flush the block */
748 prb_flush_block(pkc1, pbd1, status);
749
750 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
751}
752
eea49cc9 753static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 754{
755 pkc->reset_pending_on_curr_blk = 0;
756}
757
758/*
759 * Side effect of opening a block:
760 *
761 * 1) prb_queue is thawed.
762 * 2) retire_blk_timer is refreshed.
763 *
764 */
bc59ba39 765static void prb_open_block(struct tpacket_kbdq_core *pkc1,
766 struct tpacket_block_desc *pbd1)
f6fb8f10 767{
768 struct timespec ts;
bc59ba39 769 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 770
771 smp_rmb();
772
8da3056c
DB
773 /* We could have just memset this but we will lose the
774 * flexibility of making the priv area sticky
775 */
f6fb8f10 776
8da3056c
DB
777 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
778 BLOCK_NUM_PKTS(pbd1) = 0;
779 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 780
8da3056c
DB
781 getnstimeofday(&ts);
782
783 h1->ts_first_pkt.ts_sec = ts.tv_sec;
784 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 785
8da3056c
DB
786 pkc1->pkblk_start = (char *)pbd1;
787 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
788
789 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
790 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
791
792 pbd1->version = pkc1->version;
793 pkc1->prev = pkc1->nxt_offset;
794 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
795
796 prb_thaw_queue(pkc1);
797 _prb_refresh_rx_retire_blk_timer(pkc1);
798
799 smp_wmb();
f6fb8f10 800}
801
802/*
803 * Queue freeze logic:
804 * 1) Assume tp_block_nr = 8 blocks.
805 * 2) At time 't0', user opens Rx ring.
806 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
807 * 4) user-space is either sleeping or processing block '0'.
808 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
809 * it will close block-7,loop around and try to fill block '0'.
810 * call-flow:
811 * __packet_lookup_frame_in_block
812 * prb_retire_current_block()
813 * prb_dispatch_next_block()
814 * |->(BLOCK_STATUS == USER) evaluates to true
815 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
816 * 6) Now there are two cases:
817 * 6.1) Link goes idle right after the queue is frozen.
818 * But remember, the last open_block() refreshed the timer.
819 * When this timer expires,it will refresh itself so that we can
820 * re-open block-0 in near future.
821 * 6.2) Link is busy and keeps on receiving packets. This is a simple
822 * case and __packet_lookup_frame_in_block will check if block-0
823 * is free and can now be re-used.
824 */
eea49cc9 825static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 826 struct packet_sock *po)
827{
828 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 829 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 830}
831
832#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
833
834/*
835 * If the next block is free then we will dispatch it
836 * and return a good offset.
837 * Else, we will freeze the queue.
838 * So, caller must check the return value.
839 */
bc59ba39 840static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 841 struct packet_sock *po)
842{
bc59ba39 843 struct tpacket_block_desc *pbd;
f6fb8f10 844
845 smp_rmb();
846
847 /* 1. Get current block num */
848 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
849
850 /* 2. If this block is currently in_use then freeze the queue */
851 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
852 prb_freeze_queue(pkc, po);
853 return NULL;
854 }
855
856 /*
857 * 3.
858 * open this block and return the offset where the first packet
859 * needs to get stored.
860 */
861 prb_open_block(pkc, pbd);
862 return (void *)pkc->nxt_offset;
863}
864
bc59ba39 865static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 866 struct packet_sock *po, unsigned int status)
867{
bc59ba39 868 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 869
870 /* retire/close the current block */
871 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
872 /*
873 * Plug the case where copy_bits() is in progress on
874 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
875 * have space to copy the pkt in the current block and
876 * called prb_retire_current_block()
877 *
878 * We don't need to worry about the TMO case because
879 * the timer-handler already handled this case.
880 */
881 if (!(status & TP_STATUS_BLK_TMO)) {
882 while (atomic_read(&pkc->blk_fill_in_prog)) {
883 /* Waiting for skb_copy_bits to finish... */
884 cpu_relax();
885 }
886 }
887 prb_close_block(pkc, pbd, po, status);
888 return;
889 }
f6fb8f10 890}
891
eea49cc9 892static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 893 struct tpacket_block_desc *pbd)
f6fb8f10 894{
895 return TP_STATUS_USER & BLOCK_STATUS(pbd);
896}
897
eea49cc9 898static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 899{
900 return pkc->reset_pending_on_curr_blk;
901}
902
eea49cc9 903static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 904{
bc59ba39 905 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 906 atomic_dec(&pkc->blk_fill_in_prog);
907}
908
eea49cc9 909static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 910 struct tpacket3_hdr *ppd)
911{
912 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
913}
914
eea49cc9 915static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 916 struct tpacket3_hdr *ppd)
917{
918 ppd->hv1.tp_rxhash = 0;
919}
920
eea49cc9 921static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 922 struct tpacket3_hdr *ppd)
923{
924 if (vlan_tx_tag_present(pkc->skb)) {
925 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
926 ppd->tp_status = TP_STATUS_VLAN_VALID;
927 } else {
9e67030a 928 ppd->hv1.tp_vlan_tci = 0;
929 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 930 }
931}
932
bc59ba39 933static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 934 struct tpacket3_hdr *ppd)
935{
936 prb_fill_vlan_info(pkc, ppd);
937
938 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
939 prb_fill_rxhash(pkc, ppd);
940 else
941 prb_clear_rxhash(pkc, ppd);
942}
943
eea49cc9 944static void prb_fill_curr_block(char *curr,
bc59ba39 945 struct tpacket_kbdq_core *pkc,
946 struct tpacket_block_desc *pbd,
f6fb8f10 947 unsigned int len)
948{
949 struct tpacket3_hdr *ppd;
950
951 ppd = (struct tpacket3_hdr *)curr;
952 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
953 pkc->prev = curr;
954 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
955 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
956 BLOCK_NUM_PKTS(pbd) += 1;
957 atomic_inc(&pkc->blk_fill_in_prog);
958 prb_run_all_ft_ops(pkc, ppd);
959}
960
961/* Assumes caller has the sk->rx_queue.lock */
962static void *__packet_lookup_frame_in_block(struct packet_sock *po,
963 struct sk_buff *skb,
964 int status,
965 unsigned int len
966 )
967{
bc59ba39 968 struct tpacket_kbdq_core *pkc;
969 struct tpacket_block_desc *pbd;
f6fb8f10 970 char *curr, *end;
971
e3192690 972 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 973 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
974
975 /* Queue is frozen when user space is lagging behind */
976 if (prb_queue_frozen(pkc)) {
977 /*
978 * Check if that last block which caused the queue to freeze,
979 * is still in_use by user-space.
980 */
981 if (prb_curr_blk_in_use(pkc, pbd)) {
982 /* Can't record this packet */
983 return NULL;
984 } else {
985 /*
986 * Ok, the block was released by user-space.
987 * Now let's open that block.
988 * opening a block also thaws the queue.
989 * Thawing is a side effect.
990 */
991 prb_open_block(pkc, pbd);
992 }
993 }
994
995 smp_mb();
996 curr = pkc->nxt_offset;
997 pkc->skb = skb;
e3192690 998 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 999
1000 /* first try the current block */
1001 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1002 prb_fill_curr_block(curr, pkc, pbd, len);
1003 return (void *)curr;
1004 }
1005
1006 /* Ok, close the current block */
1007 prb_retire_current_block(pkc, po, 0);
1008
1009 /* Now, try to dispatch the next block */
1010 curr = (char *)prb_dispatch_next_block(pkc, po);
1011 if (curr) {
1012 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1013 prb_fill_curr_block(curr, pkc, pbd, len);
1014 return (void *)curr;
1015 }
1016
1017 /*
1018 * No free blocks are available.user_space hasn't caught up yet.
1019 * Queue was just frozen and now this packet will get dropped.
1020 */
1021 return NULL;
1022}
1023
eea49cc9 1024static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1025 struct sk_buff *skb,
1026 int status, unsigned int len)
1027{
1028 char *curr = NULL;
1029 switch (po->tp_version) {
1030 case TPACKET_V1:
1031 case TPACKET_V2:
1032 curr = packet_lookup_frame(po, &po->rx_ring,
1033 po->rx_ring.head, status);
1034 return curr;
1035 case TPACKET_V3:
1036 return __packet_lookup_frame_in_block(po, skb, status, len);
1037 default:
1038 WARN(1, "TPACKET version not supported\n");
1039 BUG();
99aa3473 1040 return NULL;
f6fb8f10 1041 }
1042}
1043
eea49cc9 1044static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1045 struct packet_ring_buffer *rb,
77f65ebd 1046 unsigned int idx,
f6fb8f10 1047 int status)
1048{
bc59ba39 1049 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1050 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1051
1052 if (status != BLOCK_STATUS(pbd))
1053 return NULL;
1054 return pbd;
1055}
1056
eea49cc9 1057static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1058{
1059 unsigned int prev;
1060 if (rb->prb_bdqc.kactive_blk_num)
1061 prev = rb->prb_bdqc.kactive_blk_num-1;
1062 else
1063 prev = rb->prb_bdqc.knum_blocks-1;
1064 return prev;
1065}
1066
1067/* Assumes caller has held the rx_queue.lock */
eea49cc9 1068static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1069 struct packet_ring_buffer *rb,
1070 int status)
1071{
1072 unsigned int previous = prb_previous_blk_num(rb);
1073 return prb_lookup_block(po, rb, previous, status);
1074}
1075
eea49cc9 1076static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1077 struct packet_ring_buffer *rb,
1078 int status)
1079{
1080 if (po->tp_version <= TPACKET_V2)
1081 return packet_previous_frame(po, rb, status);
1082
1083 return __prb_previous_block(po, rb, status);
1084}
1085
eea49cc9 1086static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1087 struct packet_ring_buffer *rb)
1088{
1089 switch (po->tp_version) {
1090 case TPACKET_V1:
1091 case TPACKET_V2:
1092 return packet_increment_head(rb);
1093 case TPACKET_V3:
1094 default:
1095 WARN(1, "TPACKET version not supported.\n");
1096 BUG();
1097 return;
1098 }
1099}
1100
eea49cc9 1101static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1102 struct packet_ring_buffer *rb,
1103 int status)
1104{
1105 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1106 return packet_lookup_frame(po, rb, previous, status);
1107}
1108
eea49cc9 1109static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1110{
1111 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1112}
1113
77f65ebd
WB
1114static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1115{
1116 struct sock *sk = &po->sk;
1117 bool has_room;
1118
1119 if (po->prot_hook.func != tpacket_rcv)
1120 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1121 <= sk->sk_rcvbuf;
1122
1123 spin_lock(&sk->sk_receive_queue.lock);
1124 if (po->tp_version == TPACKET_V3)
1125 has_room = prb_lookup_block(po, &po->rx_ring,
1126 po->rx_ring.prb_bdqc.kactive_blk_num,
1127 TP_STATUS_KERNEL);
1128 else
1129 has_room = packet_lookup_frame(po, &po->rx_ring,
1130 po->rx_ring.head,
1131 TP_STATUS_KERNEL);
1132 spin_unlock(&sk->sk_receive_queue.lock);
1133
1134 return has_room;
1135}
1136
1da177e4
LT
1137static void packet_sock_destruct(struct sock *sk)
1138{
ed85b565
RC
1139 skb_queue_purge(&sk->sk_error_queue);
1140
547b792c
IJ
1141 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1142 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1143
1144 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1145 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1146 return;
1147 }
1148
17ab56a2 1149 sk_refcnt_debug_dec(sk);
1da177e4
LT
1150}
1151
dc99f600
DM
1152static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1153{
1154 int x = atomic_read(&f->rr_cur) + 1;
1155
1156 if (x >= num)
1157 x = 0;
1158
1159 return x;
1160}
1161
77f65ebd
WB
1162static unsigned int fanout_demux_hash(struct packet_fanout *f,
1163 struct sk_buff *skb,
1164 unsigned int num)
dc99f600 1165{
77f65ebd 1166 return (((u64)skb->rxhash) * num) >> 32;
dc99f600
DM
1167}
1168
77f65ebd
WB
1169static unsigned int fanout_demux_lb(struct packet_fanout *f,
1170 struct sk_buff *skb,
1171 unsigned int num)
dc99f600
DM
1172{
1173 int cur, old;
1174
1175 cur = atomic_read(&f->rr_cur);
1176 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1177 fanout_rr_next(f, num))) != cur)
1178 cur = old;
77f65ebd
WB
1179 return cur;
1180}
1181
1182static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1183 struct sk_buff *skb,
1184 unsigned int num)
1185{
1186 return smp_processor_id() % num;
dc99f600
DM
1187}
1188
77f65ebd
WB
1189static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1190 struct sk_buff *skb,
1191 unsigned int idx, unsigned int skip,
1192 unsigned int num)
95ec3eb4 1193{
77f65ebd 1194 unsigned int i, j;
95ec3eb4 1195
77f65ebd
WB
1196 i = j = min_t(int, f->next[idx], num - 1);
1197 do {
1198 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1199 if (i != j)
1200 f->next[idx] = i;
1201 return i;
1202 }
1203 if (++i == num)
1204 i = 0;
1205 } while (i != j);
1206
1207 return idx;
1208}
1209
1210static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1211{
1212 return f->flags & (flag >> 8);
95ec3eb4
DM
1213}
1214
95ec3eb4
DM
1215static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1216 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1217{
1218 struct packet_fanout *f = pt->af_packet_priv;
1219 unsigned int num = f->num_members;
1220 struct packet_sock *po;
77f65ebd 1221 unsigned int idx;
dc99f600
DM
1222
1223 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1224 !num) {
1225 kfree_skb(skb);
1226 return 0;
1227 }
1228
95ec3eb4
DM
1229 switch (f->type) {
1230 case PACKET_FANOUT_HASH:
1231 default:
77f65ebd 1232 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1233 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1234 if (!skb)
1235 return 0;
1236 }
1237 skb_get_rxhash(skb);
77f65ebd 1238 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1239 break;
1240 case PACKET_FANOUT_LB:
77f65ebd 1241 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1242 break;
1243 case PACKET_FANOUT_CPU:
77f65ebd
WB
1244 idx = fanout_demux_cpu(f, skb, num);
1245 break;
1246 case PACKET_FANOUT_ROLLOVER:
1247 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1248 break;
dc99f600
DM
1249 }
1250
77f65ebd
WB
1251 po = pkt_sk(f->arr[idx]);
1252 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1253 unlikely(!packet_rcv_has_room(po, skb))) {
1254 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1255 po = pkt_sk(f->arr[idx]);
1256 }
dc99f600
DM
1257
1258 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1259}
1260
fff3321d
PE
1261DEFINE_MUTEX(fanout_mutex);
1262EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1263static LIST_HEAD(fanout_list);
1264
1265static void __fanout_link(struct sock *sk, struct packet_sock *po)
1266{
1267 struct packet_fanout *f = po->fanout;
1268
1269 spin_lock(&f->lock);
1270 f->arr[f->num_members] = sk;
1271 smp_wmb();
1272 f->num_members++;
1273 spin_unlock(&f->lock);
1274}
1275
1276static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1277{
1278 struct packet_fanout *f = po->fanout;
1279 int i;
1280
1281 spin_lock(&f->lock);
1282 for (i = 0; i < f->num_members; i++) {
1283 if (f->arr[i] == sk)
1284 break;
1285 }
1286 BUG_ON(i >= f->num_members);
1287 f->arr[i] = f->arr[f->num_members - 1];
1288 f->num_members--;
1289 spin_unlock(&f->lock);
1290}
1291
a0dfb263 1292static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
c0de08d0
EL
1293{
1294 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1295 return true;
1296
1297 return false;
1298}
1299
7736d33f 1300static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1301{
1302 struct packet_sock *po = pkt_sk(sk);
1303 struct packet_fanout *f, *match;
7736d33f 1304 u8 type = type_flags & 0xff;
77f65ebd 1305 u8 flags = type_flags >> 8;
dc99f600
DM
1306 int err;
1307
1308 switch (type) {
77f65ebd
WB
1309 case PACKET_FANOUT_ROLLOVER:
1310 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1311 return -EINVAL;
dc99f600
DM
1312 case PACKET_FANOUT_HASH:
1313 case PACKET_FANOUT_LB:
95ec3eb4 1314 case PACKET_FANOUT_CPU:
dc99f600
DM
1315 break;
1316 default:
1317 return -EINVAL;
1318 }
1319
1320 if (!po->running)
1321 return -EINVAL;
1322
1323 if (po->fanout)
1324 return -EALREADY;
1325
1326 mutex_lock(&fanout_mutex);
1327 match = NULL;
1328 list_for_each_entry(f, &fanout_list, list) {
1329 if (f->id == id &&
1330 read_pnet(&f->net) == sock_net(sk)) {
1331 match = f;
1332 break;
1333 }
1334 }
afe62c68 1335 err = -EINVAL;
77f65ebd 1336 if (match && match->flags != flags)
afe62c68 1337 goto out;
dc99f600 1338 if (!match) {
afe62c68 1339 err = -ENOMEM;
dc99f600 1340 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1341 if (!match)
1342 goto out;
1343 write_pnet(&match->net, sock_net(sk));
1344 match->id = id;
1345 match->type = type;
77f65ebd 1346 match->flags = flags;
afe62c68
ED
1347 atomic_set(&match->rr_cur, 0);
1348 INIT_LIST_HEAD(&match->list);
1349 spin_lock_init(&match->lock);
1350 atomic_set(&match->sk_ref, 0);
1351 match->prot_hook.type = po->prot_hook.type;
1352 match->prot_hook.dev = po->prot_hook.dev;
1353 match->prot_hook.func = packet_rcv_fanout;
1354 match->prot_hook.af_packet_priv = match;
c0de08d0 1355 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1356 dev_add_pack(&match->prot_hook);
1357 list_add(&match->list, &fanout_list);
dc99f600 1358 }
afe62c68
ED
1359 err = -EINVAL;
1360 if (match->type == type &&
1361 match->prot_hook.type == po->prot_hook.type &&
1362 match->prot_hook.dev == po->prot_hook.dev) {
1363 err = -ENOSPC;
1364 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1365 __dev_remove_pack(&po->prot_hook);
1366 po->fanout = match;
1367 atomic_inc(&match->sk_ref);
1368 __fanout_link(sk, po);
1369 err = 0;
dc99f600
DM
1370 }
1371 }
afe62c68 1372out:
dc99f600
DM
1373 mutex_unlock(&fanout_mutex);
1374 return err;
1375}
1376
1377static void fanout_release(struct sock *sk)
1378{
1379 struct packet_sock *po = pkt_sk(sk);
1380 struct packet_fanout *f;
1381
1382 f = po->fanout;
1383 if (!f)
1384 return;
1385
fff3321d 1386 mutex_lock(&fanout_mutex);
dc99f600
DM
1387 po->fanout = NULL;
1388
dc99f600
DM
1389 if (atomic_dec_and_test(&f->sk_ref)) {
1390 list_del(&f->list);
1391 dev_remove_pack(&f->prot_hook);
1392 kfree(f);
1393 }
1394 mutex_unlock(&fanout_mutex);
1395}
1da177e4 1396
90ddc4f0 1397static const struct proto_ops packet_ops;
1da177e4 1398
90ddc4f0 1399static const struct proto_ops packet_ops_spkt;
1da177e4 1400
40d4e3df
ED
1401static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1402 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1403{
1404 struct sock *sk;
1405 struct sockaddr_pkt *spkt;
1406
1407 /*
1408 * When we registered the protocol we saved the socket in the data
1409 * field for just this event.
1410 */
1411
1412 sk = pt->af_packet_priv;
1ce4f28b 1413
1da177e4
LT
1414 /*
1415 * Yank back the headers [hope the device set this
1416 * right or kerboom...]
1417 *
1418 * Incoming packets have ll header pulled,
1419 * push it back.
1420 *
98e399f8 1421 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1422 * so that this procedure is noop.
1423 */
1424
1425 if (skb->pkt_type == PACKET_LOOPBACK)
1426 goto out;
1427
09ad9bc7 1428 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1429 goto out;
1430
40d4e3df
ED
1431 skb = skb_share_check(skb, GFP_ATOMIC);
1432 if (skb == NULL)
1da177e4
LT
1433 goto oom;
1434
1435 /* drop any routing info */
adf30907 1436 skb_dst_drop(skb);
1da177e4 1437
84531c24
PO
1438 /* drop conntrack reference */
1439 nf_reset(skb);
1440
ffbc6111 1441 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1442
98e399f8 1443 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1444
1445 /*
1446 * The SOCK_PACKET socket receives _all_ frames.
1447 */
1448
1449 spkt->spkt_family = dev->type;
1450 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1451 spkt->spkt_protocol = skb->protocol;
1452
1453 /*
1454 * Charge the memory to the socket. This is done specifically
1455 * to prevent sockets using all the memory up.
1456 */
1457
40d4e3df 1458 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1459 return 0;
1460
1461out:
1462 kfree_skb(skb);
1463oom:
1464 return 0;
1465}
1466
1467
1468/*
1469 * Output a raw packet to a device layer. This bypasses all the other
1470 * protocol layers and you must therefore supply it with a complete frame
1471 */
1ce4f28b 1472
1da177e4
LT
1473static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1474 struct msghdr *msg, size_t len)
1475{
1476 struct sock *sk = sock->sk;
40d4e3df 1477 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1478 struct sk_buff *skb = NULL;
1da177e4 1479 struct net_device *dev;
40d4e3df 1480 __be16 proto = 0;
1da177e4 1481 int err;
3bdc0eba 1482 int extra_len = 0;
1ce4f28b 1483
1da177e4 1484 /*
1ce4f28b 1485 * Get and verify the address.
1da177e4
LT
1486 */
1487
40d4e3df 1488 if (saddr) {
1da177e4 1489 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1490 return -EINVAL;
1491 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1492 proto = saddr->spkt_protocol;
1493 } else
1494 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1495
1496 /*
1ce4f28b 1497 * Find the device first to size check it
1da177e4
LT
1498 */
1499
de74e92a 1500 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1501retry:
654d1f8a
ED
1502 rcu_read_lock();
1503 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1504 err = -ENODEV;
1505 if (dev == NULL)
1506 goto out_unlock;
1ce4f28b 1507
d5e76b0a
DM
1508 err = -ENETDOWN;
1509 if (!(dev->flags & IFF_UP))
1510 goto out_unlock;
1511
1da177e4 1512 /*
40d4e3df
ED
1513 * You may not queue a frame bigger than the mtu. This is the lowest level
1514 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1515 */
1ce4f28b 1516
3bdc0eba
BG
1517 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1518 if (!netif_supports_nofcs(dev)) {
1519 err = -EPROTONOSUPPORT;
1520 goto out_unlock;
1521 }
1522 extra_len = 4; /* We're doing our own CRC */
1523 }
1524
1da177e4 1525 err = -EMSGSIZE;
3bdc0eba 1526 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1527 goto out_unlock;
1528
1a35ca80
ED
1529 if (!skb) {
1530 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1531 int tlen = dev->needed_tailroom;
1a35ca80
ED
1532 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1533
1534 rcu_read_unlock();
4ce40912 1535 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1536 if (skb == NULL)
1537 return -ENOBUFS;
1538 /* FIXME: Save some space for broken drivers that write a hard
1539 * header at transmission time by themselves. PPP is the notable
1540 * one here. This should really be fixed at the driver level.
1541 */
1542 skb_reserve(skb, reserved);
1543 skb_reset_network_header(skb);
1544
1545 /* Try to align data part correctly */
1546 if (hhlen) {
1547 skb->data -= hhlen;
1548 skb->tail -= hhlen;
1549 if (len < hhlen)
1550 skb_reset_network_header(skb);
1551 }
1552 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1553 if (err)
1554 goto out_free;
1555 goto retry;
1da177e4
LT
1556 }
1557
3bdc0eba 1558 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1559 /* Earlier code assumed this would be a VLAN pkt,
1560 * double-check this now that we have the actual
1561 * packet in hand.
1562 */
1563 struct ethhdr *ehdr;
1564 skb_reset_mac_header(skb);
1565 ehdr = eth_hdr(skb);
1566 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1567 err = -EMSGSIZE;
1568 goto out_unlock;
1569 }
1570 }
1a35ca80 1571
1da177e4
LT
1572 skb->protocol = proto;
1573 skb->dev = dev;
1574 skb->priority = sk->sk_priority;
2d37a186 1575 skb->mark = sk->sk_mark;
bf84a010
DB
1576
1577 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1578
3bdc0eba
BG
1579 if (unlikely(extra_len == 4))
1580 skb->no_fcs = 1;
1581
40893fd0 1582 skb_probe_transport_header(skb, 0);
c1aad275 1583
1da177e4 1584 dev_queue_xmit(skb);
654d1f8a 1585 rcu_read_unlock();
40d4e3df 1586 return len;
1da177e4 1587
1da177e4 1588out_unlock:
654d1f8a 1589 rcu_read_unlock();
1a35ca80
ED
1590out_free:
1591 kfree_skb(skb);
1da177e4
LT
1592 return err;
1593}
1da177e4 1594
eea49cc9 1595static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1596 const struct sock *sk,
dbcb5855 1597 unsigned int res)
1da177e4
LT
1598{
1599 struct sk_filter *filter;
fda9ef5d 1600
80f8f102
ED
1601 rcu_read_lock();
1602 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1603 if (filter != NULL)
0a14842f 1604 res = SK_RUN_FILTER(filter, skb);
80f8f102 1605 rcu_read_unlock();
1da177e4 1606
dbcb5855 1607 return res;
1da177e4
LT
1608}
1609
1610/*
62ab0812
ED
1611 * This function makes lazy skb cloning in hope that most of packets
1612 * are discarded by BPF.
1613 *
1614 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1615 * and skb->cb are mangled. It works because (and until) packets
1616 * falling here are owned by current CPU. Output packets are cloned
1617 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1618 * sequencially, so that if we return skb to original state on exit,
1619 * we will not harm anyone.
1da177e4
LT
1620 */
1621
40d4e3df
ED
1622static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1623 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1624{
1625 struct sock *sk;
1626 struct sockaddr_ll *sll;
1627 struct packet_sock *po;
40d4e3df 1628 u8 *skb_head = skb->data;
1da177e4 1629 int skb_len = skb->len;
dbcb5855 1630 unsigned int snaplen, res;
1da177e4
LT
1631
1632 if (skb->pkt_type == PACKET_LOOPBACK)
1633 goto drop;
1634
1635 sk = pt->af_packet_priv;
1636 po = pkt_sk(sk);
1637
09ad9bc7 1638 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1639 goto drop;
1640
1da177e4
LT
1641 skb->dev = dev;
1642
3b04ddde 1643 if (dev->header_ops) {
1da177e4 1644 /* The device has an explicit notion of ll header,
62ab0812
ED
1645 * exported to higher levels.
1646 *
1647 * Otherwise, the device hides details of its frame
1648 * structure, so that corresponding packet head is
1649 * never delivered to user.
1da177e4
LT
1650 */
1651 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1652 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1653 else if (skb->pkt_type == PACKET_OUTGOING) {
1654 /* Special case: outgoing packets have ll header at head */
bbe735e4 1655 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1656 }
1657 }
1658
1659 snaplen = skb->len;
1660
dbcb5855
DM
1661 res = run_filter(skb, sk, snaplen);
1662 if (!res)
fda9ef5d 1663 goto drop_n_restore;
dbcb5855
DM
1664 if (snaplen > res)
1665 snaplen = res;
1da177e4 1666
0fd7bac6 1667 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1668 goto drop_n_acct;
1669
1670 if (skb_shared(skb)) {
1671 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1672 if (nskb == NULL)
1673 goto drop_n_acct;
1674
1675 if (skb_head != skb->data) {
1676 skb->data = skb_head;
1677 skb->len = skb_len;
1678 }
abc4e4fa 1679 consume_skb(skb);
1da177e4
LT
1680 skb = nskb;
1681 }
1682
ffbc6111
HX
1683 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1684 sizeof(skb->cb));
1685
1686 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1687 sll->sll_family = AF_PACKET;
1688 sll->sll_hatype = dev->type;
1689 sll->sll_protocol = skb->protocol;
1690 sll->sll_pkttype = skb->pkt_type;
8032b464 1691 if (unlikely(po->origdev))
80feaacb
PWJ
1692 sll->sll_ifindex = orig_dev->ifindex;
1693 else
1694 sll->sll_ifindex = dev->ifindex;
1da177e4 1695
b95cce35 1696 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1697
ffbc6111 1698 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1699
1da177e4
LT
1700 if (pskb_trim(skb, snaplen))
1701 goto drop_n_acct;
1702
1703 skb_set_owner_r(skb, sk);
1704 skb->dev = NULL;
adf30907 1705 skb_dst_drop(skb);
1da177e4 1706
84531c24
PO
1707 /* drop conntrack reference */
1708 nf_reset(skb);
1709
1da177e4 1710 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1711 po->stats.stats1.tp_packets++;
3b885787 1712 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1713 __skb_queue_tail(&sk->sk_receive_queue, skb);
1714 spin_unlock(&sk->sk_receive_queue.lock);
1715 sk->sk_data_ready(sk, skb->len);
1716 return 0;
1717
1718drop_n_acct:
7091fbd8 1719 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1720 po->stats.stats1.tp_drops++;
7091fbd8
WB
1721 atomic_inc(&sk->sk_drops);
1722 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1723
1724drop_n_restore:
1725 if (skb_head != skb->data && skb_shared(skb)) {
1726 skb->data = skb_head;
1727 skb->len = skb_len;
1728 }
1729drop:
ead2ceb0 1730 consume_skb(skb);
1da177e4
LT
1731 return 0;
1732}
1733
40d4e3df
ED
1734static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1735 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1736{
1737 struct sock *sk;
1738 struct packet_sock *po;
1739 struct sockaddr_ll *sll;
184f489e 1740 union tpacket_uhdr h;
40d4e3df 1741 u8 *skb_head = skb->data;
1da177e4 1742 int skb_len = skb->len;
dbcb5855 1743 unsigned int snaplen, res;
f6fb8f10 1744 unsigned long status = TP_STATUS_USER;
bbd6ef87 1745 unsigned short macoff, netoff, hdrlen;
1da177e4 1746 struct sk_buff *copy_skb = NULL;
bbd6ef87 1747 struct timespec ts;
b9c32fb2 1748 __u32 ts_status;
1da177e4
LT
1749
1750 if (skb->pkt_type == PACKET_LOOPBACK)
1751 goto drop;
1752
1753 sk = pt->af_packet_priv;
1754 po = pkt_sk(sk);
1755
09ad9bc7 1756 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1757 goto drop;
1758
3b04ddde 1759 if (dev->header_ops) {
1da177e4 1760 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1761 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1762 else if (skb->pkt_type == PACKET_OUTGOING) {
1763 /* Special case: outgoing packets have ll header at head */
bbe735e4 1764 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1765 }
1766 }
1767
8dc41944
HX
1768 if (skb->ip_summed == CHECKSUM_PARTIAL)
1769 status |= TP_STATUS_CSUMNOTREADY;
1770
1da177e4
LT
1771 snaplen = skb->len;
1772
dbcb5855
DM
1773 res = run_filter(skb, sk, snaplen);
1774 if (!res)
fda9ef5d 1775 goto drop_n_restore;
dbcb5855
DM
1776 if (snaplen > res)
1777 snaplen = res;
1da177e4
LT
1778
1779 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1780 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1781 po->tp_reserve;
1da177e4 1782 } else {
95c96174 1783 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1784 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1785 (maclen < 16 ? 16 : maclen)) +
1786 po->tp_reserve;
1da177e4
LT
1787 macoff = netoff - maclen;
1788 }
f6fb8f10 1789 if (po->tp_version <= TPACKET_V2) {
1790 if (macoff + snaplen > po->rx_ring.frame_size) {
1791 if (po->copy_thresh &&
0fd7bac6 1792 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1793 if (skb_shared(skb)) {
1794 copy_skb = skb_clone(skb, GFP_ATOMIC);
1795 } else {
1796 copy_skb = skb_get(skb);
1797 skb_head = skb->data;
1798 }
1799 if (copy_skb)
1800 skb_set_owner_r(copy_skb, sk);
1da177e4 1801 }
f6fb8f10 1802 snaplen = po->rx_ring.frame_size - macoff;
1803 if ((int)snaplen < 0)
1804 snaplen = 0;
1da177e4 1805 }
1da177e4 1806 }
1da177e4 1807 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1808 h.raw = packet_current_rx_frame(po, skb,
1809 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1810 if (!h.raw)
1da177e4 1811 goto ring_is_full;
f6fb8f10 1812 if (po->tp_version <= TPACKET_V2) {
1813 packet_increment_rx_head(po, &po->rx_ring);
1814 /*
1815 * LOSING will be reported till you read the stats,
1816 * because it's COR - Clear On Read.
1817 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1818 * at packet level.
1819 */
ee80fbf3 1820 if (po->stats.stats1.tp_drops)
f6fb8f10 1821 status |= TP_STATUS_LOSING;
1822 }
ee80fbf3 1823 po->stats.stats1.tp_packets++;
1da177e4
LT
1824 if (copy_skb) {
1825 status |= TP_STATUS_COPY;
1826 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1827 }
1da177e4
LT
1828 spin_unlock(&sk->sk_receive_queue.lock);
1829
bbd6ef87 1830 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1831
1832 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1833 getnstimeofday(&ts);
1da177e4 1834
b9c32fb2
DB
1835 status |= ts_status;
1836
bbd6ef87
PM
1837 switch (po->tp_version) {
1838 case TPACKET_V1:
1839 h.h1->tp_len = skb->len;
1840 h.h1->tp_snaplen = snaplen;
1841 h.h1->tp_mac = macoff;
1842 h.h1->tp_net = netoff;
4b457bdf
DB
1843 h.h1->tp_sec = ts.tv_sec;
1844 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1845 hdrlen = sizeof(*h.h1);
1846 break;
1847 case TPACKET_V2:
1848 h.h2->tp_len = skb->len;
1849 h.h2->tp_snaplen = snaplen;
1850 h.h2->tp_mac = macoff;
1851 h.h2->tp_net = netoff;
bbd6ef87
PM
1852 h.h2->tp_sec = ts.tv_sec;
1853 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1854 if (vlan_tx_tag_present(skb)) {
1855 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1856 status |= TP_STATUS_VLAN_VALID;
1857 } else {
1858 h.h2->tp_vlan_tci = 0;
1859 }
13fcb7bd 1860 h.h2->tp_padding = 0;
bbd6ef87
PM
1861 hdrlen = sizeof(*h.h2);
1862 break;
f6fb8f10 1863 case TPACKET_V3:
 1864 /* tp_nxt_offset and vlan are already populated above,
 1865 * so don't clear those fields here.
1866 */
1867 h.h3->tp_status |= status;
1868 h.h3->tp_len = skb->len;
1869 h.h3->tp_snaplen = snaplen;
1870 h.h3->tp_mac = macoff;
1871 h.h3->tp_net = netoff;
f6fb8f10 1872 h.h3->tp_sec = ts.tv_sec;
1873 h.h3->tp_nsec = ts.tv_nsec;
1874 hdrlen = sizeof(*h.h3);
1875 break;
bbd6ef87
PM
1876 default:
1877 BUG();
1878 }
1da177e4 1879
bbd6ef87 1880 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1881 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1882 sll->sll_family = AF_PACKET;
1883 sll->sll_hatype = dev->type;
1884 sll->sll_protocol = skb->protocol;
1885 sll->sll_pkttype = skb->pkt_type;
8032b464 1886 if (unlikely(po->origdev))
80feaacb
PWJ
1887 sll->sll_ifindex = orig_dev->ifindex;
1888 else
1889 sll->sll_ifindex = dev->ifindex;
1da177e4 1890
e16aa207 1891 smp_mb();
f6dafa95 1892#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1893 {
0af55bb5
CG
1894 u8 *start, *end;
1895
f6fb8f10 1896 if (po->tp_version <= TPACKET_V2) {
1897 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1898 + macoff + snaplen);
1899 for (start = h.raw; start < end; start += PAGE_SIZE)
1900 flush_dcache_page(pgv_to_page(start));
1901 }
cc9f01b2 1902 smp_wmb();
1da177e4 1903 }
f6dafa95 1904#endif
f6fb8f10 1905 if (po->tp_version <= TPACKET_V2)
1906 __packet_set_status(po, h.raw, status);
1907 else
1908 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1909
1910 sk->sk_data_ready(sk, 0);
1911
1912drop_n_restore:
1913 if (skb_head != skb->data && skb_shared(skb)) {
1914 skb->data = skb_head;
1915 skb->len = skb_len;
1916 }
1917drop:
1ce4f28b 1918 kfree_skb(skb);
1da177e4
LT
1919 return 0;
1920
1921ring_is_full:
ee80fbf3 1922 po->stats.stats1.tp_drops++;
1da177e4
LT
1923 spin_unlock(&sk->sk_receive_queue.lock);
1924
1925 sk->sk_data_ready(sk, 0);
acb5d75b 1926 kfree_skb(copy_skb);
1da177e4
LT
1927 goto drop_n_restore;
1928}
1929
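/*
 * Userspace sketch (not part of this file): consuming one TPACKET_V2
 * RX-ring frame that tpacket_rcv() above has filled in.  Assumes the
 * ring was mmap()ed by the application, that tp_block_size is an exact
 * multiple of tp_frame_size (so frames are contiguous in the mapping),
 * and that "idx" and "frame_size" match the tpacket_req used at setup.
 *
 *	#include <linux/if_packet.h>
 *	#include <stdio.h>
 *
 *	static void read_one_frame(void *ring, unsigned int idx,
 *				   unsigned int frame_size)
 *	{
 *		struct tpacket2_hdr *hdr;
 *		unsigned char *l2;
 *
 *		hdr = (struct tpacket2_hdr *)((char *)ring + idx * frame_size);
 *		if (!(hdr->tp_status & TP_STATUS_USER))
 *			return;				/* kernel still owns it */
 *		if (hdr->tp_status & TP_STATUS_LOSING)
 *			fprintf(stderr, "drops seen, read PACKET_STATISTICS\n");
 *
 *		l2 = (unsigned char *)hdr + hdr->tp_mac;  /* link-layer header */
 *		printf("captured %u of %u bytes\n", hdr->tp_snaplen, hdr->tp_len);
 *		(void)l2;
 *
 *		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
 *	}
 */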
69e3c75f
JB
1930static void tpacket_destruct_skb(struct sk_buff *skb)
1931{
1932 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1933 void *ph;
1da177e4 1934
69e3c75f 1935 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
1936 __u32 ts;
1937
69e3c75f 1938 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1939 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1940 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
1941
1942 ts = __packet_set_timestamp(po, ph, skb);
1943 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
1944 }
1945
1946 sock_wfree(skb);
1947}
1948
40d4e3df
ED
1949static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1950 void *frame, struct net_device *dev, int size_max,
ae641949 1951 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 1952{
184f489e 1953 union tpacket_uhdr ph;
69e3c75f
JB
1954 int to_write, offset, len, tp_len, nr_frags, len_max;
1955 struct socket *sock = po->sk.sk_socket;
1956 struct page *page;
1957 void *data;
1958 int err;
1959
1960 ph.raw = frame;
1961
1962 skb->protocol = proto;
1963 skb->dev = dev;
1964 skb->priority = po->sk.sk_priority;
2d37a186 1965 skb->mark = po->sk.sk_mark;
2e31396f 1966 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
1967 skb_shinfo(skb)->destructor_arg = ph.raw;
1968
1969 switch (po->tp_version) {
1970 case TPACKET_V2:
1971 tp_len = ph.h2->tp_len;
1972 break;
1973 default:
1974 tp_len = ph.h1->tp_len;
1975 break;
1976 }
1977 if (unlikely(tp_len > size_max)) {
40d4e3df 1978 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1979 return -EMSGSIZE;
1980 }
1981
ae641949 1982 skb_reserve(skb, hlen);
69e3c75f 1983 skb_reset_network_header(skb);
40893fd0 1984 skb_probe_transport_header(skb, 0);
c1aad275 1985
5920cd3a
PC
1986 if (po->tp_tx_has_off) {
1987 int off_min, off_max, off;
1988 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1989 off_max = po->tx_ring.frame_size - tp_len;
1990 if (sock->type == SOCK_DGRAM) {
1991 switch (po->tp_version) {
1992 case TPACKET_V2:
1993 off = ph.h2->tp_net;
1994 break;
1995 default:
1996 off = ph.h1->tp_net;
1997 break;
1998 }
1999 } else {
2000 switch (po->tp_version) {
2001 case TPACKET_V2:
2002 off = ph.h2->tp_mac;
2003 break;
2004 default:
2005 off = ph.h1->tp_mac;
2006 break;
2007 }
2008 }
2009 if (unlikely((off < off_min) || (off_max < off)))
2010 return -EINVAL;
2011 data = ph.raw + off;
2012 } else {
2013 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2014 }
69e3c75f
JB
2015 to_write = tp_len;
2016
2017 if (sock->type == SOCK_DGRAM) {
2018 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2019 NULL, tp_len);
2020 if (unlikely(err < 0))
2021 return -EINVAL;
40d4e3df 2022 } else if (dev->hard_header_len) {
69e3c75f
JB
2023 /* net device doesn't like empty head */
2024 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2025 pr_err("packet size is too short (%d < %d)\n",
2026 tp_len, dev->hard_header_len);
69e3c75f
JB
2027 return -EINVAL;
2028 }
2029
2030 skb_push(skb, dev->hard_header_len);
2031 err = skb_store_bits(skb, 0, data,
2032 dev->hard_header_len);
2033 if (unlikely(err))
2034 return err;
2035
2036 data += dev->hard_header_len;
2037 to_write -= dev->hard_header_len;
2038 }
2039
69e3c75f
JB
2040 offset = offset_in_page(data);
2041 len_max = PAGE_SIZE - offset;
2042 len = ((to_write > len_max) ? len_max : to_write);
2043
2044 skb->data_len = to_write;
2045 skb->len += to_write;
2046 skb->truesize += to_write;
2047 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2048
2049 while (likely(to_write)) {
2050 nr_frags = skb_shinfo(skb)->nr_frags;
2051
2052 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2053 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2054 MAX_SKB_FRAGS);
69e3c75f
JB
2055 return -EFAULT;
2056 }
2057
0af55bb5
CG
2058 page = pgv_to_page(data);
2059 data += len;
69e3c75f
JB
2060 flush_dcache_page(page);
2061 get_page(page);
0af55bb5 2062 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2063 to_write -= len;
2064 offset = 0;
2065 len_max = PAGE_SIZE;
2066 len = ((to_write > len_max) ? len_max : to_write);
2067 }
2068
2069 return tp_len;
2070}
2071
2072static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2073{
69e3c75f
JB
2074 struct sk_buff *skb;
2075 struct net_device *dev;
2076 __be16 proto;
827d9780 2077 int err, reserve = 0;
40d4e3df
ED
2078 void *ph;
2079 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2080 int tp_len, size_max;
2081 unsigned char *addr;
2082 int len_sum = 0;
9e67030a 2083 int status = TP_STATUS_AVAILABLE;
ae641949 2084 int hlen, tlen;
69e3c75f 2085
69e3c75f
JB
2086 mutex_lock(&po->pg_vec_lock);
2087
c3ac8a13 2088 if (likely(saddr == NULL)) {
026bb405 2089 dev = packet_cached_dev_get(po);
69e3c75f
JB
2090 proto = po->num;
2091 addr = NULL;
2092 } else {
2093 err = -EINVAL;
2094 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2095 goto out;
2096 if (msg->msg_namelen < (saddr->sll_halen
2097 + offsetof(struct sockaddr_ll,
2098 sll_addr)))
2099 goto out;
69e3c75f
JB
2100 proto = saddr->sll_protocol;
2101 addr = saddr->sll_addr;
827d9780 2102 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2103 }
2104
69e3c75f
JB
2105 err = -ENXIO;
2106 if (unlikely(dev == NULL))
2107 goto out;
69e3c75f
JB
2108 err = -ENETDOWN;
2109 if (unlikely(!(dev->flags & IFF_UP)))
2110 goto out_put;
2111
026bb405
DB
2112 reserve = dev->hard_header_len;
2113
69e3c75f 2114 size_max = po->tx_ring.frame_size
b5dd884e 2115 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2116
2117 if (size_max > dev->mtu + reserve)
2118 size_max = dev->mtu + reserve;
2119
2120 do {
2121 ph = packet_current_frame(po, &po->tx_ring,
2122 TP_STATUS_SEND_REQUEST);
2123
2124 if (unlikely(ph == NULL)) {
2125 schedule();
2126 continue;
2127 }
2128
2129 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2130 hlen = LL_RESERVED_SPACE(dev);
2131 tlen = dev->needed_tailroom;
69e3c75f 2132 skb = sock_alloc_send_skb(&po->sk,
ae641949 2133 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2134 0, &err);
2135
2136 if (unlikely(skb == NULL))
2137 goto out_status;
2138
2139 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2140 addr, hlen);
69e3c75f
JB
2141
2142 if (unlikely(tp_len < 0)) {
2143 if (po->tp_loss) {
2144 __packet_set_status(po, ph,
2145 TP_STATUS_AVAILABLE);
2146 packet_increment_head(&po->tx_ring);
2147 kfree_skb(skb);
2148 continue;
2149 } else {
2150 status = TP_STATUS_WRONG_FORMAT;
2151 err = tp_len;
2152 goto out_status;
2153 }
2154 }
2155
2156 skb->destructor = tpacket_destruct_skb;
2157 __packet_set_status(po, ph, TP_STATUS_SENDING);
2158 atomic_inc(&po->tx_ring.pending);
2159
2160 status = TP_STATUS_SEND_REQUEST;
2161 err = dev_queue_xmit(skb);
eb70df13
JP
2162 if (unlikely(err > 0)) {
2163 err = net_xmit_errno(err);
2164 if (err && __packet_get_status(po, ph) ==
2165 TP_STATUS_AVAILABLE) {
2166 /* skb was destructed already */
2167 skb = NULL;
2168 goto out_status;
2169 }
2170 /*
2171 * skb was dropped but not destructed yet;
2172 * let's treat it like congestion or err < 0
2173 */
2174 err = 0;
2175 }
69e3c75f
JB
2176 packet_increment_head(&po->tx_ring);
2177 len_sum += tp_len;
f64f9e71
JP
2178 } while (likely((ph != NULL) ||
2179 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2180 (atomic_read(&po->tx_ring.pending))))
2181 );
69e3c75f
JB
2182
2183 err = len_sum;
2184 goto out_put;
2185
69e3c75f
JB
2186out_status:
2187 __packet_set_status(po, ph, status);
2188 kfree_skb(skb);
2189out_put:
026bb405 2190 dev_put(dev);
69e3c75f
JB
2191out:
2192 mutex_unlock(&po->pg_vec_lock);
2193 return err;
2194}
69e3c75f 2195
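/*
 * Userspace sketch (not part of this file): queueing one frame on a
 * previously configured and mmap()ed PACKET_TX_RING (TPACKET_V2), which
 * tpacket_snd() above then drains.  The data offset mirrors
 * tpacket_fill_skb() for the case where PACKET_TX_HAS_OFF is not in use;
 * "fd", "frame", "pkt" and "len" are assumed to come from the caller.
 *
 *	#include <linux/if_packet.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int send_one_frame(int fd, void *frame, const void *pkt,
 *				  unsigned int len)
 *	{
 *		struct tpacket2_hdr *hdr = frame;
 *		unsigned char *data = (unsigned char *)frame +
 *				      TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *		if (hdr->tp_status != TP_STATUS_AVAILABLE)
 *			return -1;		/* kernel still owns this slot */
 *		memcpy(data, pkt, len);
 *		hdr->tp_len = len;
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		/* kick the ring; tpacket_snd() walks all queued frames */
 *		return send(fd, NULL, 0, 0);
 *	}
 */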
eea49cc9
OJ
2196static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2197 size_t reserve, size_t len,
2198 size_t linear, int noblock,
2199 int *err)
bfd5f4a3
SS
2200{
2201 struct sk_buff *skb;
2202
2203 /* Under a page? Don't bother with paged skb. */
2204 if (prepad + len < PAGE_SIZE || !linear)
2205 linear = len;
2206
2207 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2208 err);
2209 if (!skb)
2210 return NULL;
2211
2212 skb_reserve(skb, reserve);
2213 skb_put(skb, linear);
2214 skb->data_len = len - linear;
2215 skb->len += len - linear;
2216
2217 return skb;
2218}
2219
69e3c75f 2220static int packet_snd(struct socket *sock,
1da177e4
LT
2221 struct msghdr *msg, size_t len)
2222{
2223 struct sock *sk = sock->sk;
40d4e3df 2224 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2225 struct sk_buff *skb;
2226 struct net_device *dev;
0e11c91e 2227 __be16 proto;
1da177e4 2228 unsigned char *addr;
827d9780 2229 int err, reserve = 0;
bfd5f4a3
SS
2230 struct virtio_net_hdr vnet_hdr = { 0 };
2231 int offset = 0;
2232 int vnet_hdr_len;
2233 struct packet_sock *po = pkt_sk(sk);
2234 unsigned short gso_type = 0;
ae641949 2235 int hlen, tlen;
3bdc0eba 2236 int extra_len = 0;
1da177e4
LT
2237
2238 /*
1ce4f28b 2239 * Get and verify the address.
1da177e4 2240 */
1ce4f28b 2241
c3ac8a13 2242 if (likely(saddr == NULL)) {
026bb405 2243 dev = packet_cached_dev_get(po);
1da177e4
LT
2244 proto = po->num;
2245 addr = NULL;
2246 } else {
2247 err = -EINVAL;
2248 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2249 goto out;
0fb375fb
EB
2250 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2251 goto out;
1da177e4
LT
2252 proto = saddr->sll_protocol;
2253 addr = saddr->sll_addr;
827d9780 2254 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2255 }
2256
1da177e4 2257 err = -ENXIO;
026bb405 2258 if (unlikely(dev == NULL))
1da177e4 2259 goto out_unlock;
d5e76b0a 2260 err = -ENETDOWN;
026bb405 2261 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2262 goto out_unlock;
2263
026bb405
DB
2264 if (sock->type == SOCK_RAW)
2265 reserve = dev->hard_header_len;
bfd5f4a3
SS
2266 if (po->has_vnet_hdr) {
2267 vnet_hdr_len = sizeof(vnet_hdr);
2268
2269 err = -EINVAL;
2270 if (len < vnet_hdr_len)
2271 goto out_unlock;
2272
2273 len -= vnet_hdr_len;
2274
2275 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2276 vnet_hdr_len);
2277 if (err < 0)
2278 goto out_unlock;
2279
2280 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2281 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2282 vnet_hdr.hdr_len))
2283 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2284 vnet_hdr.csum_offset + 2;
2285
2286 err = -EINVAL;
2287 if (vnet_hdr.hdr_len > len)
2288 goto out_unlock;
2289
2290 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2291 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2292 case VIRTIO_NET_HDR_GSO_TCPV4:
2293 gso_type = SKB_GSO_TCPV4;
2294 break;
2295 case VIRTIO_NET_HDR_GSO_TCPV6:
2296 gso_type = SKB_GSO_TCPV6;
2297 break;
2298 case VIRTIO_NET_HDR_GSO_UDP:
2299 gso_type = SKB_GSO_UDP;
2300 break;
2301 default:
2302 goto out_unlock;
2303 }
2304
2305 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2306 gso_type |= SKB_GSO_TCP_ECN;
2307
2308 if (vnet_hdr.gso_size == 0)
2309 goto out_unlock;
2310
2311 }
2312 }
2313
3bdc0eba
BG
2314 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2315 if (!netif_supports_nofcs(dev)) {
2316 err = -EPROTONOSUPPORT;
2317 goto out_unlock;
2318 }
2319 extra_len = 4; /* We're doing our own CRC */
2320 }
2321
1da177e4 2322 err = -EMSGSIZE;
3bdc0eba 2323 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2324 goto out_unlock;
2325
bfd5f4a3 2326 err = -ENOBUFS;
ae641949
HX
2327 hlen = LL_RESERVED_SPACE(dev);
2328 tlen = dev->needed_tailroom;
2329 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2330 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2331 if (skb == NULL)
1da177e4
LT
2332 goto out_unlock;
2333
bfd5f4a3 2334 skb_set_network_header(skb, reserve);
1da177e4 2335
0c4e8581
SH
2336 err = -EINVAL;
2337 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2338 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2339 goto out_free;
1da177e4
LT
2340
2341 /* Returns -EFAULT on error */
bfd5f4a3 2342 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2343 if (err)
2344 goto out_free;
bf84a010
DB
2345
2346 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2347
3bdc0eba 2348 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2349 /* Earlier code assumed this would be a VLAN pkt,
2350 * double-check this now that we have the actual
2351 * packet in hand.
2352 */
2353 struct ethhdr *ehdr;
2354 skb_reset_mac_header(skb);
2355 ehdr = eth_hdr(skb);
2356 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2357 err = -EMSGSIZE;
2358 goto out_free;
2359 }
2360 }
2361
1da177e4
LT
2362 skb->protocol = proto;
2363 skb->dev = dev;
2364 skb->priority = sk->sk_priority;
2d37a186 2365 skb->mark = sk->sk_mark;
1da177e4 2366
bfd5f4a3
SS
2367 if (po->has_vnet_hdr) {
2368 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2369 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2370 vnet_hdr.csum_offset)) {
2371 err = -EINVAL;
2372 goto out_free;
2373 }
2374 }
2375
2376 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2377 skb_shinfo(skb)->gso_type = gso_type;
2378
2379 /* Header must be checked, and gso_segs computed. */
2380 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2381 skb_shinfo(skb)->gso_segs = 0;
2382
2383 len += vnet_hdr_len;
2384 }
2385
40893fd0 2386 skb_probe_transport_header(skb, reserve);
c1aad275 2387
3bdc0eba
BG
2388 if (unlikely(extra_len == 4))
2389 skb->no_fcs = 1;
2390
1da177e4
LT
2391 /*
2392 * Now send it
2393 */
2394
2395 err = dev_queue_xmit(skb);
2396 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2397 goto out_unlock;
2398
026bb405 2399 dev_put(dev);
1da177e4 2400
40d4e3df 2401 return len;
1da177e4
LT
2402
2403out_free:
2404 kfree_skb(skb);
2405out_unlock:
026bb405 2406 if (dev)
1da177e4
LT
2407 dev_put(dev);
2408out:
2409 return err;
2410}
2411
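/*
 * Userspace sketch (not part of this file): transmitting a payload on a
 * SOCK_DGRAM packet socket, letting packet_snd()/dev_hard_header() above
 * build the link-layer header from sockaddr_ll.  The interface name
 * "eth0" and the ETH_P_IP payload type are illustrative assumptions.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/types.h>
 *
 *	static ssize_t send_payload(int fd, const void *buf, size_t len,
 *				    const unsigned char dest[ETH_ALEN])
 *	{
 *		struct sockaddr_ll ll;
 *
 *		memset(&ll, 0, sizeof(ll));
 *		ll.sll_family   = AF_PACKET;
 *		ll.sll_protocol = htons(ETH_P_IP);
 *		ll.sll_ifindex  = if_nametoindex("eth0");
 *		ll.sll_halen    = ETH_ALEN;
 *		memcpy(ll.sll_addr, dest, ETH_ALEN);
 *		return sendto(fd, buf, len, 0,
 *			      (struct sockaddr *)&ll, sizeof(ll));
 *	}
 */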
69e3c75f
JB
2412static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2413 struct msghdr *msg, size_t len)
2414{
69e3c75f
JB
2415 struct sock *sk = sock->sk;
2416 struct packet_sock *po = pkt_sk(sk);
2417 if (po->tx_ring.pg_vec)
2418 return tpacket_snd(po, msg);
2419 else
69e3c75f
JB
2420 return packet_snd(sock, msg, len);
2421}
2422
1da177e4
LT
2423/*
2424 * Close a PACKET socket. This is fairly simple. We immediately go
2425 * to 'closed' state and remove our protocol entry in the device list.
2426 */
2427
2428static int packet_release(struct socket *sock)
2429{
2430 struct sock *sk = sock->sk;
2431 struct packet_sock *po;
d12d01d6 2432 struct net *net;
f6fb8f10 2433 union tpacket_req_u req_u;
1da177e4
LT
2434
2435 if (!sk)
2436 return 0;
2437
3b1e0a65 2438 net = sock_net(sk);
1da177e4
LT
2439 po = pkt_sk(sk);
2440
0fa7fa98 2441 mutex_lock(&net->packet.sklist_lock);
808f5114 2442 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2443 mutex_unlock(&net->packet.sklist_lock);
2444
2445 preempt_disable();
920de804 2446 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2447 preempt_enable();
1da177e4 2448
808f5114 2449 spin_lock(&po->bind_lock);
ce06b03e 2450 unregister_prot_hook(sk, false);
c3ac8a13
DB
2451 packet_cached_dev_reset(po);
2452
160ff18a
BG
2453 if (po->prot_hook.dev) {
2454 dev_put(po->prot_hook.dev);
2455 po->prot_hook.dev = NULL;
2456 }
808f5114 2457 spin_unlock(&po->bind_lock);
1da177e4 2458
1da177e4 2459 packet_flush_mclist(sk);
1da177e4 2460
9665d5d6
PS
2461 if (po->rx_ring.pg_vec) {
2462 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2463 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2464 }
69e3c75f 2465
9665d5d6
PS
2466 if (po->tx_ring.pg_vec) {
2467 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2468 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2469 }
1da177e4 2470
dc99f600
DM
2471 fanout_release(sk);
2472
808f5114 2473 synchronize_net();
1da177e4
LT
2474 /*
2475 * Now the socket is dead. No more input will appear.
2476 */
1da177e4
LT
2477 sock_orphan(sk);
2478 sock->sk = NULL;
2479
2480 /* Purge queues */
2481
2482 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2483 sk_refcnt_debug_release(sk);
1da177e4
LT
2484
2485 sock_put(sk);
2486 return 0;
2487}
2488
2489/*
2490 * Attach a packet hook.
2491 */
2492
0e11c91e 2493static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2494{
2495 struct packet_sock *po = pkt_sk(sk);
dc99f600 2496
aef950b4
WY
2497 if (po->fanout) {
2498 if (dev)
2499 dev_put(dev);
2500
dc99f600 2501 return -EINVAL;
aef950b4 2502 }
1da177e4
LT
2503
2504 lock_sock(sk);
2505
2506 spin_lock(&po->bind_lock);
ce06b03e 2507 unregister_prot_hook(sk, true);
c3ac8a13 2508
1da177e4
LT
2509 po->num = protocol;
2510 po->prot_hook.type = protocol;
160ff18a
BG
2511 if (po->prot_hook.dev)
2512 dev_put(po->prot_hook.dev);
1da177e4 2513
c3ac8a13 2514 po->prot_hook.dev = dev;
1da177e4
LT
2515 po->ifindex = dev ? dev->ifindex : 0;
2516
c3ac8a13
DB
2517 packet_cached_dev_assign(po, dev);
2518
1da177e4
LT
2519 if (protocol == 0)
2520 goto out_unlock;
2521
be85d4ad 2522 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2523 register_prot_hook(sk);
be85d4ad
UT
2524 } else {
2525 sk->sk_err = ENETDOWN;
2526 if (!sock_flag(sk, SOCK_DEAD))
2527 sk->sk_error_report(sk);
1da177e4
LT
2528 }
2529
2530out_unlock:
2531 spin_unlock(&po->bind_lock);
2532 release_sock(sk);
2533 return 0;
2534}
2535
2536/*
2537 * Bind a packet socket to a device
2538 */
2539
40d4e3df
ED
2540static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2541 int addr_len)
1da177e4 2542{
40d4e3df 2543 struct sock *sk = sock->sk;
1da177e4
LT
2544 char name[15];
2545 struct net_device *dev;
2546 int err = -ENODEV;
1ce4f28b 2547
1da177e4
LT
2548 /*
2549 * Check legality
2550 */
1ce4f28b 2551
8ae55f04 2552 if (addr_len != sizeof(struct sockaddr))
1da177e4 2553 return -EINVAL;
40d4e3df 2554 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2555
3b1e0a65 2556 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2557 if (dev)
1da177e4 2558 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2559 return err;
2560}
1da177e4
LT
2561
2562static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2563{
40d4e3df
ED
2564 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2565 struct sock *sk = sock->sk;
1da177e4
LT
2566 struct net_device *dev = NULL;
2567 int err;
2568
2569
2570 /*
2571 * Check legality
2572 */
1ce4f28b 2573
1da177e4
LT
2574 if (addr_len < sizeof(struct sockaddr_ll))
2575 return -EINVAL;
2576 if (sll->sll_family != AF_PACKET)
2577 return -EINVAL;
2578
2579 if (sll->sll_ifindex) {
2580 err = -ENODEV;
3b1e0a65 2581 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2582 if (dev == NULL)
2583 goto out;
2584 }
2585 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2586
2587out:
2588 return err;
2589}
2590
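/*
 * Userspace sketch (not part of this file): binding a packet socket to a
 * device via sockaddr_ll, as handled by packet_bind() above.  A zero
 * sll_protocol keeps the protocol the socket was created with (see the
 * "sll_protocol ? : pkt_sk(sk)->num" above); ETH_P_ALL and the caller's
 * interface name are assumptions.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int bind_to_dev(int fd, const char *ifname)
 *	{
 *		struct sockaddr_ll ll;
 *
 *		memset(&ll, 0, sizeof(ll));
 *		ll.sll_family   = AF_PACKET;
 *		ll.sll_protocol = htons(ETH_P_ALL);
 *		ll.sll_ifindex  = if_nametoindex(ifname);
 *		return bind(fd, (struct sockaddr *)&ll, sizeof(ll));
 *	}
 */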
2591static struct proto packet_proto = {
2592 .name = "PACKET",
2593 .owner = THIS_MODULE,
2594 .obj_size = sizeof(struct packet_sock),
2595};
2596
2597/*
1ce4f28b 2598 * Create a packet socket (SOCK_RAW, SOCK_DGRAM or SOCK_PACKET).
1da177e4
LT
2599 */
2600
3f378b68
EP
2601static int packet_create(struct net *net, struct socket *sock, int protocol,
2602 int kern)
1da177e4
LT
2603{
2604 struct sock *sk;
2605 struct packet_sock *po;
0e11c91e 2606 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2607 int err;
2608
df008c91 2609 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2610 return -EPERM;
be02097c
DM
2611 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2612 sock->type != SOCK_PACKET)
1da177e4
LT
2613 return -ESOCKTNOSUPPORT;
2614
2615 sock->state = SS_UNCONNECTED;
2616
2617 err = -ENOBUFS;
6257ff21 2618 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2619 if (sk == NULL)
2620 goto out;
2621
2622 sock->ops = &packet_ops;
1da177e4
LT
2623 if (sock->type == SOCK_PACKET)
2624 sock->ops = &packet_ops_spkt;
be02097c 2625
1da177e4
LT
2626 sock_init_data(sock, sk);
2627
2628 po = pkt_sk(sk);
2629 sk->sk_family = PF_PACKET;
0e11c91e 2630 po->num = proto;
c3ac8a13
DB
2631
2632 packet_cached_dev_reset(po);
1da177e4
LT
2633
2634 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2635 sk_refcnt_debug_inc(sk);
1da177e4
LT
2636
2637 /*
2638 * Attach a protocol block
2639 */
2640
2641 spin_lock_init(&po->bind_lock);
905db440 2642 mutex_init(&po->pg_vec_lock);
1da177e4 2643 po->prot_hook.func = packet_rcv;
be02097c 2644
1da177e4
LT
2645 if (sock->type == SOCK_PACKET)
2646 po->prot_hook.func = packet_rcv_spkt;
be02097c 2647
1da177e4
LT
2648 po->prot_hook.af_packet_priv = sk;
2649
0e11c91e
AV
2650 if (proto) {
2651 po->prot_hook.type = proto;
ce06b03e 2652 register_prot_hook(sk);
1da177e4
LT
2653 }
2654
0fa7fa98 2655 mutex_lock(&net->packet.sklist_lock);
808f5114 2656 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2657 mutex_unlock(&net->packet.sklist_lock);
2658
2659 preempt_disable();
3680453c 2660 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2661 preempt_enable();
808f5114 2662
40d4e3df 2663 return 0;
1da177e4
LT
2664out:
2665 return err;
2666}
2667
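/*
 * Userspace sketch (not part of this file): creating the socket that
 * packet_create() above services.  CAP_NET_RAW is required, matching the
 * ns_capable() check; ETH_P_ALL is an illustrative protocol choice.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static int open_packet_socket(void)
 *	{
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *		if (fd < 0)
 *			perror("socket(AF_PACKET, SOCK_RAW)");
 *		return fd;
 *	}
 */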
ed85b565
RC
2668static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2669{
2670 struct sock_exterr_skb *serr;
2671 struct sk_buff *skb, *skb2;
2672 int copied, err;
2673
2674 err = -EAGAIN;
2675 skb = skb_dequeue(&sk->sk_error_queue);
2676 if (skb == NULL)
2677 goto out;
2678
2679 copied = skb->len;
2680 if (copied > len) {
2681 msg->msg_flags |= MSG_TRUNC;
2682 copied = len;
2683 }
2684 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2685 if (err)
2686 goto out_free_skb;
2687
2688 sock_recv_timestamp(msg, sk, skb);
2689
2690 serr = SKB_EXT_ERR(skb);
2691 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2692 sizeof(serr->ee), &serr->ee);
2693
2694 msg->msg_flags |= MSG_ERRQUEUE;
2695 err = copied;
2696
2697 /* Reset and regenerate socket error */
2698 spin_lock_bh(&sk->sk_error_queue.lock);
2699 sk->sk_err = 0;
2700 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2701 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2702 spin_unlock_bh(&sk->sk_error_queue.lock);
2703 sk->sk_error_report(sk);
2704 } else
2705 spin_unlock_bh(&sk->sk_error_queue.lock);
2706
2707out_free_skb:
2708 kfree_skb(skb);
2709out:
2710 return err;
2711}
2712
1da177e4
LT
2713/*
2714 * Pull a packet from our receive queue and hand it to the user.
2715 * If necessary we block.
2716 */
2717
2718static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2719 struct msghdr *msg, size_t len, int flags)
2720{
2721 struct sock *sk = sock->sk;
2722 struct sk_buff *skb;
2723 int copied, err;
bfd5f4a3 2724 int vnet_hdr_len = 0;
1da177e4
LT
2725
2726 err = -EINVAL;
ed85b565 2727 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2728 goto out;
2729
2730#if 0
2731 /* What error should we return now? EUNATTACH? */
2732 if (pkt_sk(sk)->ifindex < 0)
2733 return -ENODEV;
2734#endif
2735
ed85b565
RC
2736 if (flags & MSG_ERRQUEUE) {
2737 err = packet_recv_error(sk, msg, len);
2738 goto out;
2739 }
2740
1da177e4
LT
2741 /*
2742 * Call the generic datagram receiver. This handles all sorts
2743 * of horrible races and re-entrancy so we can forget about it
2744 * in the protocol layers.
2745 *
 2746 * Now it will return ENETDOWN if the device has just gone down,
2747 * but then it will block.
2748 */
2749
40d4e3df 2750 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2751
2752 /*
1ce4f28b 2753 * An error occurred, so return it. Because skb_recv_datagram()
1da177e4
LT
 2754 * handles the blocking, we don't have to see or worry about
 2755 * blocking retries.
2756 */
2757
8ae55f04 2758 if (skb == NULL)
1da177e4
LT
2759 goto out;
2760
bfd5f4a3
SS
2761 if (pkt_sk(sk)->has_vnet_hdr) {
2762 struct virtio_net_hdr vnet_hdr = { 0 };
2763
2764 err = -EINVAL;
2765 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2766 if (len < vnet_hdr_len)
bfd5f4a3
SS
2767 goto out_free;
2768
1f18b717
MK
2769 len -= vnet_hdr_len;
2770
bfd5f4a3
SS
2771 if (skb_is_gso(skb)) {
2772 struct skb_shared_info *sinfo = skb_shinfo(skb);
2773
2774 /* This is a hint as to how much should be linear. */
2775 vnet_hdr.hdr_len = skb_headlen(skb);
2776 vnet_hdr.gso_size = sinfo->gso_size;
2777 if (sinfo->gso_type & SKB_GSO_TCPV4)
2778 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2779 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2780 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2781 else if (sinfo->gso_type & SKB_GSO_UDP)
2782 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2783 else if (sinfo->gso_type & SKB_GSO_FCOE)
2784 goto out_free;
2785 else
2786 BUG();
2787 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2788 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2789 } else
2790 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2791
2792 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2793 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2794 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2795 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2796 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2797 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2798 } /* else everything is zero */
2799
2800 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2801 vnet_hdr_len);
2802 if (err < 0)
2803 goto out_free;
2804 }
2805
2f73d7fd
HFS
 2806 /* You lose any data beyond the buffer you gave. If that worries
 2807 * a user program, it can ask the device for its MTU
 2808 * anyway.
1da177e4 2809 */
1da177e4 2810 copied = skb->len;
40d4e3df
ED
2811 if (copied > len) {
2812 copied = len;
2813 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2814 }
2815
2816 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2817 if (err)
2818 goto out_free;
2819
3b885787 2820 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2821
2f73d7fd
HFS
2822 if (msg->msg_name) {
2823 /* If the address length field is there to be filled
2824 * in, we fill it in now.
2825 */
2826 if (sock->type == SOCK_PACKET) {
2827 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2828 } else {
2829 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2830 msg->msg_namelen = sll->sll_halen +
2831 offsetof(struct sockaddr_ll, sll_addr);
2832 }
ffbc6111
HX
2833 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2834 msg->msg_namelen);
2f73d7fd 2835 }
1da177e4 2836
8dc41944 2837 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2838 struct tpacket_auxdata aux;
2839
2840 aux.tp_status = TP_STATUS_USER;
2841 if (skb->ip_summed == CHECKSUM_PARTIAL)
2842 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2843 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2844 aux.tp_snaplen = skb->len;
2845 aux.tp_mac = 0;
bbe735e4 2846 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2847 if (vlan_tx_tag_present(skb)) {
2848 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2849 aux.tp_status |= TP_STATUS_VLAN_VALID;
2850 } else {
2851 aux.tp_vlan_tci = 0;
2852 }
13fcb7bd 2853 aux.tp_padding = 0;
ffbc6111 2854 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2855 }
2856
1da177e4
LT
2857 /*
2858 * Free or return the buffer as appropriate. Again this
2859 * hides all the races and re-entrancy issues from us.
2860 */
bfd5f4a3 2861 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2862
2863out_free:
2864 skb_free_datagram(sk, skb);
2865out:
2866 return err;
2867}
2868
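/*
 * Userspace sketch (not part of this file): receiving a packet together
 * with the PACKET_AUXDATA control message that packet_recvmsg() above
 * emits.  Assumes PACKET_AUXDATA was enabled with setsockopt() first.
 *
 *	#include <linux/if_packet.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	static ssize_t recv_with_aux(int fd, void *buf, size_t len)
 *	{
 *		union {
 *			struct cmsghdr align;
 *			char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *		} ctl;
 *		struct iovec iov = { .iov_base = buf, .iov_len = len };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = ctl.buf,
 *			.msg_controllen = sizeof(ctl.buf),
 *		};
 *		ssize_t n = recvmsg(fd, &msg, 0);
 *		struct cmsghdr *cm;
 *
 *		if (n < 0)
 *			return n;
 *		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
 *			if (cm->cmsg_level == SOL_PACKET &&
 *			    cm->cmsg_type == PACKET_AUXDATA) {
 *				struct tpacket_auxdata aux;
 *
 *				memcpy(&aux, CMSG_DATA(cm), sizeof(aux));
 *				/* aux.tp_len / tp_snaplen / tp_vlan_tci usable */
 *			}
 *		}
 *		return n;
 *	}
 */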
1da177e4
LT
2869static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2870 int *uaddr_len, int peer)
2871{
2872 struct net_device *dev;
2873 struct sock *sk = sock->sk;
2874
2875 if (peer)
2876 return -EOPNOTSUPP;
2877
2878 uaddr->sa_family = AF_PACKET;
2dc85bf3 2879 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2880 rcu_read_lock();
2881 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2882 if (dev)
2dc85bf3 2883 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2884 rcu_read_unlock();
1da177e4
LT
2885 *uaddr_len = sizeof(*uaddr);
2886
2887 return 0;
2888}
1da177e4
LT
2889
2890static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2891 int *uaddr_len, int peer)
2892{
2893 struct net_device *dev;
2894 struct sock *sk = sock->sk;
2895 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2896 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2897
2898 if (peer)
2899 return -EOPNOTSUPP;
2900
2901 sll->sll_family = AF_PACKET;
2902 sll->sll_ifindex = po->ifindex;
2903 sll->sll_protocol = po->num;
67286640 2904 sll->sll_pkttype = 0;
654d1f8a
ED
2905 rcu_read_lock();
2906 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2907 if (dev) {
2908 sll->sll_hatype = dev->type;
2909 sll->sll_halen = dev->addr_len;
2910 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2911 } else {
2912 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2913 sll->sll_halen = 0;
2914 }
654d1f8a 2915 rcu_read_unlock();
0fb375fb 2916 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2917
2918 return 0;
2919}
2920
2aeb0b88
WC
2921static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2922 int what)
1da177e4
LT
2923{
2924 switch (i->type) {
2925 case PACKET_MR_MULTICAST:
1162563f
JP
2926 if (i->alen != dev->addr_len)
2927 return -EINVAL;
1da177e4 2928 if (what > 0)
22bedad3 2929 return dev_mc_add(dev, i->addr);
1da177e4 2930 else
22bedad3 2931 return dev_mc_del(dev, i->addr);
1da177e4
LT
2932 break;
2933 case PACKET_MR_PROMISC:
2aeb0b88 2934 return dev_set_promiscuity(dev, what);
1da177e4
LT
2935 break;
2936 case PACKET_MR_ALLMULTI:
2aeb0b88 2937 return dev_set_allmulti(dev, what);
1da177e4 2938 break;
d95ed927 2939 case PACKET_MR_UNICAST:
1162563f
JP
2940 if (i->alen != dev->addr_len)
2941 return -EINVAL;
d95ed927 2942 if (what > 0)
a748ee24 2943 return dev_uc_add(dev, i->addr);
d95ed927 2944 else
a748ee24 2945 return dev_uc_del(dev, i->addr);
d95ed927 2946 break;
40d4e3df
ED
2947 default:
2948 break;
1da177e4 2949 }
2aeb0b88 2950 return 0;
1da177e4
LT
2951}
2952
2953static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2954{
40d4e3df 2955 for ( ; i; i = i->next) {
1da177e4
LT
2956 if (i->ifindex == dev->ifindex)
2957 packet_dev_mc(dev, i, what);
2958 }
2959}
2960
0fb375fb 2961static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2962{
2963 struct packet_sock *po = pkt_sk(sk);
2964 struct packet_mclist *ml, *i;
2965 struct net_device *dev;
2966 int err;
2967
2968 rtnl_lock();
2969
2970 err = -ENODEV;
3b1e0a65 2971 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2972 if (!dev)
2973 goto done;
2974
2975 err = -EINVAL;
1162563f 2976 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2977 goto done;
2978
2979 err = -ENOBUFS;
8b3a7005 2980 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2981 if (i == NULL)
2982 goto done;
2983
2984 err = 0;
2985 for (ml = po->mclist; ml; ml = ml->next) {
2986 if (ml->ifindex == mreq->mr_ifindex &&
2987 ml->type == mreq->mr_type &&
2988 ml->alen == mreq->mr_alen &&
2989 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2990 ml->count++;
2991 /* Free the new element ... */
2992 kfree(i);
2993 goto done;
2994 }
2995 }
2996
2997 i->type = mreq->mr_type;
2998 i->ifindex = mreq->mr_ifindex;
2999 i->alen = mreq->mr_alen;
3000 memcpy(i->addr, mreq->mr_address, i->alen);
3001 i->count = 1;
3002 i->next = po->mclist;
3003 po->mclist = i;
2aeb0b88
WC
3004 err = packet_dev_mc(dev, i, 1);
3005 if (err) {
3006 po->mclist = i->next;
3007 kfree(i);
3008 }
1da177e4
LT
3009
3010done:
3011 rtnl_unlock();
3012 return err;
3013}
3014
0fb375fb 3015static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3016{
3017 struct packet_mclist *ml, **mlp;
3018
3019 rtnl_lock();
3020
3021 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3022 if (ml->ifindex == mreq->mr_ifindex &&
3023 ml->type == mreq->mr_type &&
3024 ml->alen == mreq->mr_alen &&
3025 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3026 if (--ml->count == 0) {
3027 struct net_device *dev;
3028 *mlp = ml->next;
ad959e76
ED
3029 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3030 if (dev)
1da177e4 3031 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3032 kfree(ml);
3033 }
3034 rtnl_unlock();
3035 return 0;
3036 }
3037 }
3038 rtnl_unlock();
3039 return -EADDRNOTAVAIL;
3040}
3041
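/*
 * Userspace sketch (not part of this file): the membership request that
 * packet_mc_add()/packet_mc_drop() above serve.  Putting the interface
 * into promiscuous mode this way is reference-counted through
 * dev_set_promiscuity(); the interface name is supplied by the caller.
 *
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int enable_promisc(int fd, const char *ifname)
 *	{
 *		struct packet_mreq mreq;
 *
 *		memset(&mreq, 0, sizeof(mreq));
 *		mreq.mr_ifindex = if_nametoindex(ifname);
 *		mreq.mr_type    = PACKET_MR_PROMISC;
 *		return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *				  &mreq, sizeof(mreq));
 *	}
 */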
3042static void packet_flush_mclist(struct sock *sk)
3043{
3044 struct packet_sock *po = pkt_sk(sk);
3045 struct packet_mclist *ml;
3046
3047 if (!po->mclist)
3048 return;
3049
3050 rtnl_lock();
3051 while ((ml = po->mclist) != NULL) {
3052 struct net_device *dev;
3053
3054 po->mclist = ml->next;
ad959e76
ED
3055 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3056 if (dev != NULL)
1da177e4 3057 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3058 kfree(ml);
3059 }
3060 rtnl_unlock();
3061}
1da177e4
LT
3062
3063static int
b7058842 3064packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3065{
3066 struct sock *sk = sock->sk;
8dc41944 3067 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3068 int ret;
3069
3070 if (level != SOL_PACKET)
3071 return -ENOPROTOOPT;
3072
69e3c75f 3073 switch (optname) {
1ce4f28b 3074 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3075 case PACKET_DROP_MEMBERSHIP:
3076 {
0fb375fb
EB
3077 struct packet_mreq_max mreq;
3078 int len = optlen;
3079 memset(&mreq, 0, sizeof(mreq));
3080 if (len < sizeof(struct packet_mreq))
1da177e4 3081 return -EINVAL;
0fb375fb
EB
3082 if (len > sizeof(mreq))
3083 len = sizeof(mreq);
40d4e3df 3084 if (copy_from_user(&mreq, optval, len))
1da177e4 3085 return -EFAULT;
0fb375fb
EB
3086 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3087 return -EINVAL;
1da177e4
LT
3088 if (optname == PACKET_ADD_MEMBERSHIP)
3089 ret = packet_mc_add(sk, &mreq);
3090 else
3091 ret = packet_mc_drop(sk, &mreq);
3092 return ret;
3093 }
a2efcfa0 3094
1da177e4 3095 case PACKET_RX_RING:
69e3c75f 3096 case PACKET_TX_RING:
1da177e4 3097 {
f6fb8f10 3098 union tpacket_req_u req_u;
3099 int len;
1da177e4 3100
f6fb8f10 3101 switch (po->tp_version) {
3102 case TPACKET_V1:
3103 case TPACKET_V2:
3104 len = sizeof(req_u.req);
3105 break;
3106 case TPACKET_V3:
3107 default:
3108 len = sizeof(req_u.req3);
3109 break;
3110 }
3111 if (optlen < len)
1da177e4 3112 return -EINVAL;
bfd5f4a3
SS
3113 if (pkt_sk(sk)->has_vnet_hdr)
3114 return -EINVAL;
f6fb8f10 3115 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3116 return -EFAULT;
f6fb8f10 3117 return packet_set_ring(sk, &req_u, 0,
3118 optname == PACKET_TX_RING);
1da177e4
LT
3119 }
3120 case PACKET_COPY_THRESH:
3121 {
3122 int val;
3123
40d4e3df 3124 if (optlen != sizeof(val))
1da177e4 3125 return -EINVAL;
40d4e3df 3126 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3127 return -EFAULT;
3128
3129 pkt_sk(sk)->copy_thresh = val;
3130 return 0;
3131 }
bbd6ef87
PM
3132 case PACKET_VERSION:
3133 {
3134 int val;
3135
3136 if (optlen != sizeof(val))
3137 return -EINVAL;
bbd6ef87
PM
3138 if (copy_from_user(&val, optval, sizeof(val)))
3139 return -EFAULT;
3140 switch (val) {
3141 case TPACKET_V1:
3142 case TPACKET_V2:
f6fb8f10 3143 case TPACKET_V3:
4b9e9796 3144 break;
bbd6ef87
PM
3145 default:
3146 return -EINVAL;
3147 }
4b9e9796
S
3148 lock_sock(sk);
3149 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3150 ret = -EBUSY;
3151 } else {
3152 po->tp_version = val;
3153 ret = 0;
3154 }
3155 release_sock(sk);
3156 return ret;
bbd6ef87 3157 }
8913336a
PM
3158 case PACKET_RESERVE:
3159 {
3160 unsigned int val;
3161
3162 if (optlen != sizeof(val))
3163 return -EINVAL;
69e3c75f 3164 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3165 return -EBUSY;
3166 if (copy_from_user(&val, optval, sizeof(val)))
3167 return -EFAULT;
4b9e9796
S
3168 if (val > INT_MAX)
3169 return -EINVAL;
8913336a
PM
3170 po->tp_reserve = val;
3171 return 0;
3172 }
69e3c75f
JB
3173 case PACKET_LOSS:
3174 {
3175 unsigned int val;
3176
3177 if (optlen != sizeof(val))
3178 return -EINVAL;
3179 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3180 return -EBUSY;
3181 if (copy_from_user(&val, optval, sizeof(val)))
3182 return -EFAULT;
3183 po->tp_loss = !!val;
3184 return 0;
3185 }
8dc41944
HX
3186 case PACKET_AUXDATA:
3187 {
3188 int val;
3189
3190 if (optlen < sizeof(val))
3191 return -EINVAL;
3192 if (copy_from_user(&val, optval, sizeof(val)))
3193 return -EFAULT;
3194
3195 po->auxdata = !!val;
3196 return 0;
3197 }
80feaacb
PWJ
3198 case PACKET_ORIGDEV:
3199 {
3200 int val;
3201
3202 if (optlen < sizeof(val))
3203 return -EINVAL;
3204 if (copy_from_user(&val, optval, sizeof(val)))
3205 return -EFAULT;
3206
3207 po->origdev = !!val;
3208 return 0;
3209 }
bfd5f4a3
SS
3210 case PACKET_VNET_HDR:
3211 {
3212 int val;
3213
3214 if (sock->type != SOCK_RAW)
3215 return -EINVAL;
3216 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3217 return -EBUSY;
3218 if (optlen < sizeof(val))
3219 return -EINVAL;
3220 if (copy_from_user(&val, optval, sizeof(val)))
3221 return -EFAULT;
3222
3223 po->has_vnet_hdr = !!val;
3224 return 0;
3225 }
614f60fa
SM
3226 case PACKET_TIMESTAMP:
3227 {
3228 int val;
3229
3230 if (optlen != sizeof(val))
3231 return -EINVAL;
3232 if (copy_from_user(&val, optval, sizeof(val)))
3233 return -EFAULT;
3234
3235 po->tp_tstamp = val;
3236 return 0;
3237 }
dc99f600
DM
3238 case PACKET_FANOUT:
3239 {
3240 int val;
3241
3242 if (optlen != sizeof(val))
3243 return -EINVAL;
3244 if (copy_from_user(&val, optval, sizeof(val)))
3245 return -EFAULT;
3246
3247 return fanout_add(sk, val & 0xffff, val >> 16);
3248 }
5920cd3a
PC
3249 case PACKET_TX_HAS_OFF:
3250 {
3251 unsigned int val;
3252
3253 if (optlen != sizeof(val))
3254 return -EINVAL;
3255 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3256 return -EBUSY;
3257 if (copy_from_user(&val, optval, sizeof(val)))
3258 return -EFAULT;
3259 po->tp_tx_has_off = !!val;
3260 return 0;
3261 }
1da177e4
LT
3262 default:
3263 return -ENOPROTOOPT;
3264 }
3265}
3266
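/*
 * Userspace sketch (not part of this file): the option sequence
 * packet_setsockopt() above expects when configuring an RX ring.  A
 * non-default TPACKET version has to be selected before the ring is
 * created (PACKET_VERSION returns -EBUSY afterwards).  The sizes are
 * illustrative assumptions: block size a multiple of PAGE_SIZE, frame
 * size a multiple of TPACKET_ALIGNMENT, and frame_nr equal to frames
 * per block times block_nr, as checked in packet_set_ring().
 *
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *
 *	static int setup_rx_ring(int fd)
 *	{
 *		int ver = TPACKET_V2;
 *		struct tpacket_req req = {
 *			.tp_block_size	= 1 << 16,
 *			.tp_block_nr	= 64,
 *			.tp_frame_size	= 1 << 11,
 *			.tp_frame_nr	= ((1 << 16) / (1 << 11)) * 64,
 *		};
 *
 *		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
 *			       &ver, sizeof(ver)) < 0)
 *			return -1;
 *		return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
 *				  &req, sizeof(req));
 *	}
 */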
3267static int packet_getsockopt(struct socket *sock, int level, int optname,
3268 char __user *optval, int __user *optlen)
3269{
3270 int len;
c06fff6e 3271 int val, lv = sizeof(val);
1da177e4
LT
3272 struct sock *sk = sock->sk;
3273 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3274 void *data = &val;
ee80fbf3 3275 union tpacket_stats_u st;
1da177e4
LT
3276
3277 if (level != SOL_PACKET)
3278 return -ENOPROTOOPT;
3279
8ae55f04
KK
3280 if (get_user(len, optlen))
3281 return -EFAULT;
1da177e4
LT
3282
3283 if (len < 0)
3284 return -EINVAL;
1ce4f28b 3285
69e3c75f 3286 switch (optname) {
1da177e4 3287 case PACKET_STATISTICS:
1da177e4 3288 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3289 memcpy(&st, &po->stats, sizeof(st));
3290 memset(&po->stats, 0, sizeof(po->stats));
3291 spin_unlock_bh(&sk->sk_receive_queue.lock);
3292
f6fb8f10 3293 if (po->tp_version == TPACKET_V3) {
c06fff6e 3294 lv = sizeof(struct tpacket_stats_v3);
fc26e4cf 3295 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3296 data = &st.stats3;
f6fb8f10 3297 } else {
c06fff6e 3298 lv = sizeof(struct tpacket_stats);
fc26e4cf 3299 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3300 data = &st.stats1;
f6fb8f10 3301 }
ee80fbf3 3302
8dc41944
HX
3303 break;
3304 case PACKET_AUXDATA:
8dc41944 3305 val = po->auxdata;
80feaacb
PWJ
3306 break;
3307 case PACKET_ORIGDEV:
80feaacb 3308 val = po->origdev;
bfd5f4a3
SS
3309 break;
3310 case PACKET_VNET_HDR:
bfd5f4a3 3311 val = po->has_vnet_hdr;
1da177e4 3312 break;
bbd6ef87 3313 case PACKET_VERSION:
bbd6ef87 3314 val = po->tp_version;
bbd6ef87
PM
3315 break;
3316 case PACKET_HDRLEN:
3317 if (len > sizeof(int))
3318 len = sizeof(int);
3319 if (copy_from_user(&val, optval, len))
3320 return -EFAULT;
3321 switch (val) {
3322 case TPACKET_V1:
3323 val = sizeof(struct tpacket_hdr);
3324 break;
3325 case TPACKET_V2:
3326 val = sizeof(struct tpacket2_hdr);
3327 break;
f6fb8f10 3328 case TPACKET_V3:
3329 val = sizeof(struct tpacket3_hdr);
3330 break;
bbd6ef87
PM
3331 default:
3332 return -EINVAL;
3333 }
bbd6ef87 3334 break;
8913336a 3335 case PACKET_RESERVE:
8913336a 3336 val = po->tp_reserve;
8913336a 3337 break;
69e3c75f 3338 case PACKET_LOSS:
69e3c75f 3339 val = po->tp_loss;
69e3c75f 3340 break;
614f60fa 3341 case PACKET_TIMESTAMP:
614f60fa 3342 val = po->tp_tstamp;
614f60fa 3343 break;
dc99f600 3344 case PACKET_FANOUT:
dc99f600
DM
3345 val = (po->fanout ?
3346 ((u32)po->fanout->id |
77f65ebd
WB
3347 ((u32)po->fanout->type << 16) |
3348 ((u32)po->fanout->flags << 24)) :
dc99f600 3349 0);
dc99f600 3350 break;
5920cd3a
PC
3351 case PACKET_TX_HAS_OFF:
3352 val = po->tp_tx_has_off;
3353 break;
1da177e4
LT
3354 default:
3355 return -ENOPROTOOPT;
3356 }
3357
c06fff6e
ED
3358 if (len > lv)
3359 len = lv;
8ae55f04
KK
3360 if (put_user(len, optlen))
3361 return -EFAULT;
8dc41944
HX
3362 if (copy_to_user(optval, data, len))
3363 return -EFAULT;
8ae55f04 3364 return 0;
1da177e4
LT
3365}
3366
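/*
 * Userspace sketch (not part of this file): reading the counters served
 * by PACKET_STATISTICS above.  For V1/V2 the kernel clears the counters
 * on every read and folds tp_drops into tp_packets, as done in
 * packet_getsockopt().
 *
 *	#include <linux/if_packet.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static void dump_stats(int fd)
 *	{
 *		struct tpacket_stats st;
 *		socklen_t len = sizeof(st);
 *
 *		if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS,
 *			       &st, &len) == 0)
 *			printf("packets %u, drops %u\n",
 *			       st.tp_packets, st.tp_drops);
 *	}
 */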
3367
3368static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3369{
3370 struct sock *sk;
ad930650 3371 struct net_device *dev = data;
c346dca1 3372 struct net *net = dev_net(dev);
1da177e4 3373
808f5114 3374 rcu_read_lock();
b67bfe0d 3375 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3376 struct packet_sock *po = pkt_sk(sk);
3377
3378 switch (msg) {
3379 case NETDEV_UNREGISTER:
1da177e4
LT
3380 if (po->mclist)
3381 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3382 /* fallthrough */
3383
1da177e4
LT
3384 case NETDEV_DOWN:
3385 if (dev->ifindex == po->ifindex) {
3386 spin_lock(&po->bind_lock);
3387 if (po->running) {
ce06b03e 3388 __unregister_prot_hook(sk, false);
1da177e4
LT
3389 sk->sk_err = ENETDOWN;
3390 if (!sock_flag(sk, SOCK_DEAD))
3391 sk->sk_error_report(sk);
3392 }
3393 if (msg == NETDEV_UNREGISTER) {
c3ac8a13 3394 packet_cached_dev_reset(po);
1da177e4 3395 po->ifindex = -1;
160ff18a
BG
3396 if (po->prot_hook.dev)
3397 dev_put(po->prot_hook.dev);
1da177e4
LT
3398 po->prot_hook.dev = NULL;
3399 }
3400 spin_unlock(&po->bind_lock);
3401 }
3402 break;
3403 case NETDEV_UP:
808f5114 3404 if (dev->ifindex == po->ifindex) {
3405 spin_lock(&po->bind_lock);
ce06b03e
DM
3406 if (po->num)
3407 register_prot_hook(sk);
808f5114 3408 spin_unlock(&po->bind_lock);
1da177e4 3409 }
1da177e4
LT
3410 break;
3411 }
3412 }
808f5114 3413 rcu_read_unlock();
1da177e4
LT
3414 return NOTIFY_DONE;
3415}
3416
3417
3418static int packet_ioctl(struct socket *sock, unsigned int cmd,
3419 unsigned long arg)
3420{
3421 struct sock *sk = sock->sk;
3422
69e3c75f 3423 switch (cmd) {
40d4e3df
ED
3424 case SIOCOUTQ:
3425 {
3426 int amount = sk_wmem_alloc_get(sk);
31e6d363 3427
40d4e3df
ED
3428 return put_user(amount, (int __user *)arg);
3429 }
3430 case SIOCINQ:
3431 {
3432 struct sk_buff *skb;
3433 int amount = 0;
3434
3435 spin_lock_bh(&sk->sk_receive_queue.lock);
3436 skb = skb_peek(&sk->sk_receive_queue);
3437 if (skb)
3438 amount = skb->len;
3439 spin_unlock_bh(&sk->sk_receive_queue.lock);
3440 return put_user(amount, (int __user *)arg);
3441 }
3442 case SIOCGSTAMP:
3443 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3444 case SIOCGSTAMPNS:
3445 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3446
1da177e4 3447#ifdef CONFIG_INET
40d4e3df
ED
3448 case SIOCADDRT:
3449 case SIOCDELRT:
3450 case SIOCDARP:
3451 case SIOCGARP:
3452 case SIOCSARP:
3453 case SIOCGIFADDR:
3454 case SIOCSIFADDR:
3455 case SIOCGIFBRDADDR:
3456 case SIOCSIFBRDADDR:
3457 case SIOCGIFNETMASK:
3458 case SIOCSIFNETMASK:
3459 case SIOCGIFDSTADDR:
3460 case SIOCSIFDSTADDR:
3461 case SIOCSIFFLAGS:
40d4e3df 3462 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3463#endif
3464
40d4e3df
ED
3465 default:
3466 return -ENOIOCTLCMD;
1da177e4
LT
3467 }
3468 return 0;
3469}
3470
40d4e3df 3471static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3472 poll_table *wait)
3473{
3474 struct sock *sk = sock->sk;
3475 struct packet_sock *po = pkt_sk(sk);
3476 unsigned int mask = datagram_poll(file, sock, wait);
3477
3478 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3479 if (po->rx_ring.pg_vec) {
f6fb8f10 3480 if (!packet_previous_rx_frame(po, &po->rx_ring,
3481 TP_STATUS_KERNEL))
1da177e4
LT
3482 mask |= POLLIN | POLLRDNORM;
3483 }
3484 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3485 spin_lock_bh(&sk->sk_write_queue.lock);
3486 if (po->tx_ring.pg_vec) {
3487 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3488 mask |= POLLOUT | POLLWRNORM;
3489 }
3490 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3491 return mask;
3492}
3493
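/*
 * Userspace sketch (not part of this file): waiting on the ring with
 * poll().  packet_poll() above reports POLLIN | POLLRDNORM as soon as a
 * user-owned RX frame exists, and POLLOUT | POLLWRNORM when a TX-ring
 * slot is available.
 *
 *	#include <poll.h>
 *
 *	static int wait_for_frame(int fd)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *
 *		return poll(&pfd, 1, -1);	/* block until a frame is ready */
 *	}
 */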
3494
3495/* Dirty? Well, I still have not found a better way to account
3496 * for user mmaps.
3497 */
3498
3499static void packet_mm_open(struct vm_area_struct *vma)
3500{
3501 struct file *file = vma->vm_file;
40d4e3df 3502 struct socket *sock = file->private_data;
1da177e4 3503 struct sock *sk = sock->sk;
1ce4f28b 3504
1da177e4
LT
3505 if (sk)
3506 atomic_inc(&pkt_sk(sk)->mapped);
3507}
3508
3509static void packet_mm_close(struct vm_area_struct *vma)
3510{
3511 struct file *file = vma->vm_file;
40d4e3df 3512 struct socket *sock = file->private_data;
1da177e4 3513 struct sock *sk = sock->sk;
1ce4f28b 3514
1da177e4
LT
3515 if (sk)
3516 atomic_dec(&pkt_sk(sk)->mapped);
3517}
3518
f0f37e2f 3519static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3520 .open = packet_mm_open,
3521 .close = packet_mm_close,
1da177e4
LT
3522};
3523
0e3125c7
NH
3524static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3525 unsigned int len)
1da177e4
LT
3526{
3527 int i;
3528
4ebf0ae2 3529 for (i = 0; i < len; i++) {
0e3125c7 3530 if (likely(pg_vec[i].buffer)) {
c56b4d90 3531 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3532 vfree(pg_vec[i].buffer);
3533 else
3534 free_pages((unsigned long)pg_vec[i].buffer,
3535 order);
3536 pg_vec[i].buffer = NULL;
3537 }
1da177e4
LT
3538 }
3539 kfree(pg_vec);
3540}
3541
eea49cc9 3542static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3543{
0e3125c7
NH
3544 char *buffer = NULL;
3545 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3546 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3547
3548 buffer = (char *) __get_free_pages(gfp_flags, order);
3549
3550 if (buffer)
3551 return buffer;
3552
3553 /*
3554 * __get_free_pages failed, fall back to vmalloc
3555 */
bbce5a59 3556 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3557
0e3125c7
NH
3558 if (buffer)
3559 return buffer;
3560
3561 /*
 3562 * vmalloc failed, let's dig into swap here
3563 */
0e3125c7
NH
3564 gfp_flags &= ~__GFP_NORETRY;
3565 buffer = (char *)__get_free_pages(gfp_flags, order);
3566 if (buffer)
3567 return buffer;
3568
3569 /*
3570 * complete and utter failure
3571 */
3572 return NULL;
4ebf0ae2
DM
3573}
3574
0e3125c7 3575static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3576{
3577 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3578 struct pgv *pg_vec;
4ebf0ae2
DM
3579 int i;
3580
0e3125c7 3581 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3582 if (unlikely(!pg_vec))
3583 goto out;
3584
3585 for (i = 0; i < block_nr; i++) {
c56b4d90 3586 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3587 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3588 goto out_free_pgvec;
3589 }
3590
3591out:
3592 return pg_vec;
3593
3594out_free_pgvec:
3595 free_pg_vec(pg_vec, order, block_nr);
3596 pg_vec = NULL;
3597 goto out;
3598}
1da177e4 3599
f6fb8f10 3600static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3601 int closing, int tx_ring)
1da177e4 3602{
0e3125c7 3603 struct pgv *pg_vec = NULL;
1da177e4 3604 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3605 int was_running, order = 0;
69e3c75f
JB
3606 struct packet_ring_buffer *rb;
3607 struct sk_buff_head *rb_queue;
0e11c91e 3608 __be16 num;
f6fb8f10 3609 int err = -EINVAL;
3610 /* Added to avoid minimal code churn */
3611 struct tpacket_req *req = &req_u->req;
3612
4b9e9796 3613 lock_sock(sk);
f6fb8f10 3614 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3615 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3616 WARN(1, "Tx-ring is not supported.\n");
3617 goto out;
3618 }
1ce4f28b 3619
69e3c75f
JB
3620 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3621 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3622
69e3c75f
JB
3623 err = -EBUSY;
3624 if (!closing) {
3625 if (atomic_read(&po->mapped))
3626 goto out;
3627 if (atomic_read(&rb->pending))
3628 goto out;
3629 }
1da177e4 3630
69e3c75f
JB
3631 if (req->tp_block_nr) {
3632 /* Sanity tests and some calculations */
3633 err = -EBUSY;
3634 if (unlikely(rb->pg_vec))
3635 goto out;
1da177e4 3636
bbd6ef87
PM
3637 switch (po->tp_version) {
3638 case TPACKET_V1:
3639 po->tp_hdrlen = TPACKET_HDRLEN;
3640 break;
3641 case TPACKET_V2:
3642 po->tp_hdrlen = TPACKET2_HDRLEN;
3643 break;
f6fb8f10 3644 case TPACKET_V3:
3645 po->tp_hdrlen = TPACKET3_HDRLEN;
3646 break;
bbd6ef87
PM
3647 }
3648
69e3c75f 3649 err = -EINVAL;
4ebf0ae2 3650 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3651 goto out;
4ebf0ae2 3652 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3653 goto out;
4b9e9796
S
3654 if (po->tp_version >= TPACKET_V3 &&
3655 req->tp_block_size <=
3656 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
3657 goto out;
8913336a 3658 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3659 po->tp_reserve))
3660 goto out;
4ebf0ae2 3661 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3662 goto out;
1da177e4 3663
69e3c75f
JB
3664 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3665 if (unlikely(rb->frames_per_block <= 0))
3666 goto out;
4b9e9796
S
3667 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
3668 goto out;
69e3c75f
JB
3669 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3670 req->tp_frame_nr))
3671 goto out;
1da177e4
LT
3672
3673 err = -ENOMEM;
4ebf0ae2
DM
3674 order = get_order(req->tp_block_size);
3675 pg_vec = alloc_pg_vec(req, order);
3676 if (unlikely(!pg_vec))
1da177e4 3677 goto out;
f6fb8f10 3678 switch (po->tp_version) {
3679 case TPACKET_V3:
3680 /* Transmit path is not supported. We checked
3681 * it above but just being paranoid
3682 */
3683 if (!tx_ring)
3684 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3685 break;
3686 default:
3687 break;
3688 }
69e3c75f
JB
3689 }
3690 /* Done */
3691 else {
3692 err = -EINVAL;
4ebf0ae2 3693 if (unlikely(req->tp_frame_nr))
69e3c75f 3694 goto out;
1da177e4
LT
3695 }
3696
1da177e4
LT
3697
3698 /* Detach socket from network */
3699 spin_lock(&po->bind_lock);
3700 was_running = po->running;
3701 num = po->num;
3702 if (was_running) {
1da177e4 3703 po->num = 0;
ce06b03e 3704 __unregister_prot_hook(sk, false);
1da177e4
LT
3705 }
3706 spin_unlock(&po->bind_lock);
1ce4f28b 3707
1da177e4
LT
3708 synchronize_net();
3709
3710 err = -EBUSY;
905db440 3711 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3712 if (closing || atomic_read(&po->mapped) == 0) {
3713 err = 0;
69e3c75f 3714 spin_lock_bh(&rb_queue->lock);
c053fd96 3715 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3716 rb->frame_max = (req->tp_frame_nr - 1);
3717 rb->head = 0;
3718 rb->frame_size = req->tp_frame_size;
3719 spin_unlock_bh(&rb_queue->lock);
3720
c053fd96
CG
3721 swap(rb->pg_vec_order, order);
3722 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3723
3724 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3725 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3726 tpacket_rcv : packet_rcv;
3727 skb_queue_purge(rb_queue);
1da177e4 3728 if (atomic_read(&po->mapped))
40d4e3df
ED
3729 pr_err("packet_mmap: vma is busy: %d\n",
3730 atomic_read(&po->mapped));
1da177e4 3731 }
905db440 3732 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3733
3734 spin_lock(&po->bind_lock);
ce06b03e 3735 if (was_running) {
1da177e4 3736 po->num = num;
ce06b03e 3737 register_prot_hook(sk);
1da177e4
LT
3738 }
3739 spin_unlock(&po->bind_lock);
f6fb8f10 3740 if (closing && (po->tp_version > TPACKET_V2)) {
3741 /* Because we don't support block-based V3 on tx-ring */
3742 if (!tx_ring)
3743 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3744 }
1da177e4 3745
1da177e4
LT
3746 if (pg_vec)
3747 free_pg_vec(pg_vec, order, req->tp_block_nr);
3748out:
4b9e9796 3749 release_sock(sk);
1da177e4
LT
3750 return err;
3751}

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
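/*
 * Hedged userspace sketch (not part of this file): packet_mmap() above
 * insists on vm_pgoff == 0 and on a mapping length equal to the combined
 * size of the configured RX and TX rings, so userspace maps both rings
 * with a single mmap() call. The function name and parameters below are
 * assumptions for illustration.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <stddef.h>
#include <sys/mman.h>

static void *map_packet_rings(int fd, size_t rx_bytes, size_t tx_bytes)
{
	void *ring = mmap(NULL, rx_bytes + tx_bytes,
			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	return ring == MAP_FAILED ? NULL : ring;
}
#endif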

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
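/*
 * Hedged userspace sketch (not part of this file): creating a PF_PACKET
 * socket of type SOCK_RAW is what routes subsequent bind/sendmsg/recvmsg/
 * mmap calls through the packet_ops table above (legacy SOCK_PACKET
 * sockets use packet_ops_spkt instead). The function name is an assumption
 * for illustration.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>

static int open_packet_socket(void)
{
	/* ETH_P_ALL: deliver every protocol seen on the bound interface */
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}
#endif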

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};
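/*
 * Hedged userspace sketch (not part of this file): packet_net_init() above
 * registers the per-namespace "packet" proc entry, so the table produced by
 * packet_seq_show() can be read from /proc/net/packet. The function name is
 * an assumption for illustration.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <stdio.h>

static void dump_packet_sockets(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/packet", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* header row, then one line per packet socket */
	fclose(f);
}
#endif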


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);