/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnels); others are not (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

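/* Illustrative summary (assumes the V3 layout implied by the macros above):
 * a block begins with struct tpacket_block_desc, followed by the optional
 * per-block private area, both padded to V3_ALIGNMENT.  BLK_PLUS_PRIV() is
 * therefore the offset of the first packet in the block; prb_open_block()
 * stores it in BLOCK_O2FP() and uses it to seed nxt_offset.
 */
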
#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.   If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * callers responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps) {
		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
			return TP_STATUS_TS_SYS_HARDWARE;
		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
			return TP_STATUS_TS_RAW_HARDWARE;
	}

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}
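
/* The return value above doubles as a TP_STATUS_TS_* flag recording which
 * clock (system hardware, raw hardware or software) supplied the timestamp;
 * 0 means no usable timestamp, and the callers fall back to getnstimeofday().
 */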

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
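
/* Worked example (illustrative, based on the code above): with a 1 MiB block
 * on a 1 Gbps link, mbits = (1048576 * 8) / (1024 * 1024) = 8, div = 1 and
 * msec = 1, so the computed retire timeout is 8 + 1 = 9 ms -- slightly more
 * than the ~8 ms it takes line-rate traffic to fill the block.
 */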

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close, so we
				 * open this block and restart the timer.
				 * Opening a block thaws the queue and restarts
				 * the timer; thawing/timer-refresh is a side
				 * effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (vlan_tx_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
		ppd->tp_status = TP_STATUS_VLAN_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}
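
/* blk_fill_in_prog, incremented in prb_fill_curr_block() above, is dropped
 * again in prb_clear_blk_fill_status() once tpacket_rcv() has finished its
 * skb_copy_bits(); the retire-timer and prb_retire_current_block() paths
 * spin on this counter before closing a partially filled block.
 */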

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					po->rx_ring.prb_bdqc.kactive_blk_num,
					TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head,
					TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return (((u64)skb->rxhash) * num) >> 32;
}
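
/* The multiply-and-shift above maps the 32-bit rxhash roughly uniformly onto
 * [0, num), i.e. hash * num / 2^32, avoiding a division on the fast path.
 */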

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, unsigned int skip,
					  unsigned int num)
{
	unsigned int i, j;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}
		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = ACCESS_ONCE(f->num_members);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
			if (!skb)
				return 0;
		}
		skb_get_rxhash(skb);
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
		break;
	}

	po = pkt_sk(f->arr[idx]);
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
	    unlikely(!packet_rcv_has_room(po, skb))) {
		idx = fanout_demux_rollover(f, skb, idx, idx, num);
		po = pkt_sk(f->arr[idx]);
	}

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}
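
/* res enters as the full packet length and leaves as the number of bytes the
 * attached socket filter allows to be snapped (0 means drop); both callers
 * below use it to trim snaplen before queueing the packet.
 */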

/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequencially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}
1720
40d4e3df
ED
1721static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1722 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1723{
1724 struct sock *sk;
1725 struct packet_sock *po;
1726 struct sockaddr_ll *sll;
184f489e 1727 union tpacket_uhdr h;
40d4e3df 1728 u8 *skb_head = skb->data;
1da177e4 1729 int skb_len = skb->len;
dbcb5855 1730 unsigned int snaplen, res;
f6fb8f10 1731 unsigned long status = TP_STATUS_USER;
bbd6ef87 1732 unsigned short macoff, netoff, hdrlen;
1da177e4 1733 struct sk_buff *copy_skb = NULL;
bbd6ef87 1734 struct timespec ts;
b9c32fb2 1735 __u32 ts_status;
1da177e4
LT
1736
1737 if (skb->pkt_type == PACKET_LOOPBACK)
1738 goto drop;
1739
1740 sk = pt->af_packet_priv;
1741 po = pkt_sk(sk);
1742
09ad9bc7 1743 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1744 goto drop;
1745
3b04ddde 1746 if (dev->header_ops) {
1da177e4 1747 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1748 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1749 else if (skb->pkt_type == PACKET_OUTGOING) {
1750 /* Special case: outgoing packets have ll header at head */
bbe735e4 1751 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1752 }
1753 }
1754
8dc41944
HX
1755 if (skb->ip_summed == CHECKSUM_PARTIAL)
1756 status |= TP_STATUS_CSUMNOTREADY;
1757
1da177e4
LT
1758 snaplen = skb->len;
1759
dbcb5855
DM
1760 res = run_filter(skb, sk, snaplen);
1761 if (!res)
fda9ef5d 1762 goto drop_n_restore;
dbcb5855
DM
1763 if (snaplen > res)
1764 snaplen = res;
1da177e4
LT
1765
1766 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1767 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1768 po->tp_reserve;
1da177e4 1769 } else {
95c96174 1770 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1771 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1772 (maclen < 16 ? 16 : maclen)) +
1773 po->tp_reserve;
1da177e4
LT
1774 macoff = netoff - maclen;
1775 }
f6fb8f10 1776 if (po->tp_version <= TPACKET_V2) {
1777 if (macoff + snaplen > po->rx_ring.frame_size) {
1778 if (po->copy_thresh &&
0fd7bac6 1779 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1780 if (skb_shared(skb)) {
1781 copy_skb = skb_clone(skb, GFP_ATOMIC);
1782 } else {
1783 copy_skb = skb_get(skb);
1784 skb_head = skb->data;
1785 }
1786 if (copy_skb)
1787 skb_set_owner_r(copy_skb, sk);
1da177e4 1788 }
f6fb8f10 1789 snaplen = po->rx_ring.frame_size - macoff;
1790 if ((int)snaplen < 0)
1791 snaplen = 0;
1da177e4 1792 }
4035ed7b
ED
1793 } else if (unlikely(macoff + snaplen >
1794 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
1795 u32 nval;
1796
1797 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
1798 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
1799 snaplen, nval, macoff);
1800 snaplen = nval;
1801 if (unlikely((int)snaplen < 0)) {
1802 snaplen = 0;
1803 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
1804 }
1da177e4 1805 }
1da177e4 1806 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1807 h.raw = packet_current_rx_frame(po, skb,
1808 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1809 if (!h.raw)
1da177e4 1810 goto ring_is_full;
f6fb8f10 1811 if (po->tp_version <= TPACKET_V2) {
1812 packet_increment_rx_head(po, &po->rx_ring);
1813 /*
 1814	 * LOSING will be reported until the stats are read,
 1815	 * because the counter is COR - Clear On Read.
 1816	 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
 1817	 * at the packet level.
1818 */
ee80fbf3 1819 if (po->stats.stats1.tp_drops)
f6fb8f10 1820 status |= TP_STATUS_LOSING;
1821 }
ee80fbf3 1822 po->stats.stats1.tp_packets++;
1da177e4
LT
1823 if (copy_skb) {
1824 status |= TP_STATUS_COPY;
1825 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1826 }
1da177e4
LT
1827 spin_unlock(&sk->sk_receive_queue.lock);
1828
bbd6ef87 1829 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1830
1831 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1832 getnstimeofday(&ts);
1da177e4 1833
b9c32fb2
DB
1834 status |= ts_status;
1835
bbd6ef87
PM
1836 switch (po->tp_version) {
1837 case TPACKET_V1:
1838 h.h1->tp_len = skb->len;
1839 h.h1->tp_snaplen = snaplen;
1840 h.h1->tp_mac = macoff;
1841 h.h1->tp_net = netoff;
4b457bdf
DB
1842 h.h1->tp_sec = ts.tv_sec;
1843 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1844 hdrlen = sizeof(*h.h1);
1845 break;
1846 case TPACKET_V2:
1847 h.h2->tp_len = skb->len;
1848 h.h2->tp_snaplen = snaplen;
1849 h.h2->tp_mac = macoff;
1850 h.h2->tp_net = netoff;
bbd6ef87
PM
1851 h.h2->tp_sec = ts.tv_sec;
1852 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1853 if (vlan_tx_tag_present(skb)) {
1854 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1855 status |= TP_STATUS_VLAN_VALID;
1856 } else {
1857 h.h2->tp_vlan_tci = 0;
1858 }
13fcb7bd 1859 h.h2->tp_padding = 0;
bbd6ef87
PM
1860 hdrlen = sizeof(*h.h2);
1861 break;
f6fb8f10 1862 case TPACKET_V3:
 1863	 	/* tp_next_offset and vlan are already populated above,
 1864	 	 * so DON'T clear those fields here.
1865 */
1866 h.h3->tp_status |= status;
1867 h.h3->tp_len = skb->len;
1868 h.h3->tp_snaplen = snaplen;
1869 h.h3->tp_mac = macoff;
1870 h.h3->tp_net = netoff;
f6fb8f10 1871 h.h3->tp_sec = ts.tv_sec;
1872 h.h3->tp_nsec = ts.tv_nsec;
1873 hdrlen = sizeof(*h.h3);
1874 break;
bbd6ef87
PM
1875 default:
1876 BUG();
1877 }
1da177e4 1878
bbd6ef87 1879 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1880 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1881 sll->sll_family = AF_PACKET;
1882 sll->sll_hatype = dev->type;
1883 sll->sll_protocol = skb->protocol;
1884 sll->sll_pkttype = skb->pkt_type;
8032b464 1885 if (unlikely(po->origdev))
80feaacb
PWJ
1886 sll->sll_ifindex = orig_dev->ifindex;
1887 else
1888 sll->sll_ifindex = dev->ifindex;
1da177e4 1889
e16aa207 1890 smp_mb();
f6dafa95 1891#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1892 {
0af55bb5
CG
1893 u8 *start, *end;
1894
f6fb8f10 1895 if (po->tp_version <= TPACKET_V2) {
1896 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1897 + macoff + snaplen);
1898 for (start = h.raw; start < end; start += PAGE_SIZE)
1899 flush_dcache_page(pgv_to_page(start));
1900 }
cc9f01b2 1901 smp_wmb();
1da177e4 1902 }
f6dafa95 1903#endif
f6fb8f10 1904 if (po->tp_version <= TPACKET_V2)
1905 __packet_set_status(po, h.raw, status);
1906 else
1907 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1908
1909 sk->sk_data_ready(sk, 0);
1910
1911drop_n_restore:
1912 if (skb_head != skb->data && skb_shared(skb)) {
1913 skb->data = skb_head;
1914 skb->len = skb_len;
1915 }
1916drop:
1ce4f28b 1917 kfree_skb(skb);
1da177e4
LT
1918 return 0;
1919
1920ring_is_full:
ee80fbf3 1921 po->stats.stats1.tp_drops++;
1da177e4
LT
1922 spin_unlock(&sk->sk_receive_queue.lock);
1923
1924 sk->sk_data_ready(sk, 0);
acb5d75b 1925 kfree_skb(copy_skb);
1da177e4
LT
1926 goto drop_n_restore;
1927}
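/*
 * Illustrative userspace sketch (not part of this file; walk_rx_ring(),
 * fetch_stats() and handle_packet() are placeholder names): tpacket_rcv()
 * above copies each matching frame into the mmap()ed RX ring, flips the
 * slot status to TP_STATUS_USER and wakes the socket, so a reader only
 * needs to poll() and walk the ring. Something like the following consumes
 * a TPACKET_V2 ring, assuming tp_block_size is an exact multiple of
 * tp_frame_size so that frames sit frame_size apart:
 *
 *	#include <linux/if_packet.h>
 *	#include <poll.h>
 *
 *	void walk_rx_ring(int fd, void *ring, unsigned int frame_nr,
 *			  unsigned int frame_size)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *		unsigned int i = 0;
 *
 *		for (;;) {
 *			struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
 *				((char *)ring + i * frame_size);
 *
 *			if (!(hdr->tp_status & TP_STATUS_USER)) {
 *				poll(&pfd, 1, -1);	// wait for sk_data_ready
 *				continue;
 *			}
 *			if (hdr->tp_status & TP_STATUS_LOSING)
 *				fetch_stats(fd);	// read PACKET_STATISTICS to clear drops
 *
 *			handle_packet((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *
 *			hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 *			i = (i + 1) % frame_nr;
 *		}
 *	}
 */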
1928
69e3c75f
JB
1929static void tpacket_destruct_skb(struct sk_buff *skb)
1930{
1931 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1932 void *ph;
1da177e4 1933
69e3c75f 1934 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
1935 __u32 ts;
1936
69e3c75f 1937 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1938 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1939 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
1940
1941 ts = __packet_set_timestamp(po, ph, skb);
1942 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
1943 }
1944
1945 sock_wfree(skb);
1946}
1947
40d4e3df
ED
1948static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1949 void *frame, struct net_device *dev, int size_max,
ae641949 1950 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 1951{
184f489e 1952 union tpacket_uhdr ph;
69e3c75f
JB
1953 int to_write, offset, len, tp_len, nr_frags, len_max;
1954 struct socket *sock = po->sk.sk_socket;
1955 struct page *page;
1956 void *data;
1957 int err;
1958
1959 ph.raw = frame;
1960
1961 skb->protocol = proto;
1962 skb->dev = dev;
1963 skb->priority = po->sk.sk_priority;
2d37a186 1964 skb->mark = po->sk.sk_mark;
2e31396f 1965 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
1966 skb_shinfo(skb)->destructor_arg = ph.raw;
1967
1968 switch (po->tp_version) {
1969 case TPACKET_V2:
1970 tp_len = ph.h2->tp_len;
1971 break;
1972 default:
1973 tp_len = ph.h1->tp_len;
1974 break;
1975 }
1976 if (unlikely(tp_len > size_max)) {
40d4e3df 1977 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1978 return -EMSGSIZE;
1979 }
1980
ae641949 1981 skb_reserve(skb, hlen);
69e3c75f 1982 skb_reset_network_header(skb);
40893fd0 1983 skb_probe_transport_header(skb, 0);
c1aad275 1984
5920cd3a
PC
1985 if (po->tp_tx_has_off) {
1986 int off_min, off_max, off;
1987 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1988 off_max = po->tx_ring.frame_size - tp_len;
1989 if (sock->type == SOCK_DGRAM) {
1990 switch (po->tp_version) {
1991 case TPACKET_V2:
1992 off = ph.h2->tp_net;
1993 break;
1994 default:
1995 off = ph.h1->tp_net;
1996 break;
1997 }
1998 } else {
1999 switch (po->tp_version) {
2000 case TPACKET_V2:
2001 off = ph.h2->tp_mac;
2002 break;
2003 default:
2004 off = ph.h1->tp_mac;
2005 break;
2006 }
2007 }
2008 if (unlikely((off < off_min) || (off_max < off)))
2009 return -EINVAL;
2010 data = ph.raw + off;
2011 } else {
2012 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2013 }
69e3c75f
JB
2014 to_write = tp_len;
2015
2016 if (sock->type == SOCK_DGRAM) {
2017 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2018 NULL, tp_len);
2019 if (unlikely(err < 0))
2020 return -EINVAL;
40d4e3df 2021 } else if (dev->hard_header_len) {
69e3c75f
JB
2022 /* net device doesn't like empty head */
2023 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2024 pr_err("packet size is too short (%d < %d)\n",
2025 tp_len, dev->hard_header_len);
69e3c75f
JB
2026 return -EINVAL;
2027 }
2028
2029 skb_push(skb, dev->hard_header_len);
2030 err = skb_store_bits(skb, 0, data,
2031 dev->hard_header_len);
2032 if (unlikely(err))
2033 return err;
2034
2035 data += dev->hard_header_len;
2036 to_write -= dev->hard_header_len;
2037 }
2038
69e3c75f
JB
2039 offset = offset_in_page(data);
2040 len_max = PAGE_SIZE - offset;
2041 len = ((to_write > len_max) ? len_max : to_write);
2042
2043 skb->data_len = to_write;
2044 skb->len += to_write;
2045 skb->truesize += to_write;
2046 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2047
2048 while (likely(to_write)) {
2049 nr_frags = skb_shinfo(skb)->nr_frags;
2050
2051 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2052	 		pr_err("Packet exceeds the number of skb frags (%lu)\n",
2053 MAX_SKB_FRAGS);
69e3c75f
JB
2054 return -EFAULT;
2055 }
2056
0af55bb5
CG
2057 page = pgv_to_page(data);
2058 data += len;
69e3c75f
JB
2059 flush_dcache_page(page);
2060 get_page(page);
0af55bb5 2061 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2062 to_write -= len;
2063 offset = 0;
2064 len_max = PAGE_SIZE;
2065 len = ((to_write > len_max) ? len_max : to_write);
2066 }
2067
2068 return tp_len;
2069}
2070
2071static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2072{
69e3c75f
JB
2073 struct sk_buff *skb;
2074 struct net_device *dev;
2075 __be16 proto;
827d9780 2076 int err, reserve = 0;
40d4e3df
ED
2077 void *ph;
2078 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2079 int tp_len, size_max;
2080 unsigned char *addr;
2081 int len_sum = 0;
9e67030a 2082 int status = TP_STATUS_AVAILABLE;
ae641949 2083 int hlen, tlen;
69e3c75f 2084
69e3c75f
JB
2085 mutex_lock(&po->pg_vec_lock);
2086
c3ac8a13 2087 if (likely(saddr == NULL)) {
026bb405 2088 dev = packet_cached_dev_get(po);
69e3c75f
JB
2089 proto = po->num;
2090 addr = NULL;
2091 } else {
2092 err = -EINVAL;
2093 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2094 goto out;
2095 if (msg->msg_namelen < (saddr->sll_halen
2096 + offsetof(struct sockaddr_ll,
2097 sll_addr)))
2098 goto out;
69e3c75f
JB
2099 proto = saddr->sll_protocol;
2100 addr = saddr->sll_addr;
827d9780 2101 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2102 }
2103
69e3c75f
JB
2104 err = -ENXIO;
2105 if (unlikely(dev == NULL))
2106 goto out;
69e3c75f
JB
2107 err = -ENETDOWN;
2108 if (unlikely(!(dev->flags & IFF_UP)))
2109 goto out_put;
2110
026bb405
DB
2111 reserve = dev->hard_header_len;
2112
69e3c75f 2113 size_max = po->tx_ring.frame_size
b5dd884e 2114 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2115
2116 if (size_max > dev->mtu + reserve)
2117 size_max = dev->mtu + reserve;
2118
2119 do {
2120 ph = packet_current_frame(po, &po->tx_ring,
2121 TP_STATUS_SEND_REQUEST);
2122
2123 if (unlikely(ph == NULL)) {
2124 schedule();
2125 continue;
2126 }
2127
2128 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2129 hlen = LL_RESERVED_SPACE(dev);
2130 tlen = dev->needed_tailroom;
69e3c75f 2131 skb = sock_alloc_send_skb(&po->sk,
ae641949 2132 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2133 0, &err);
2134
2135 if (unlikely(skb == NULL))
2136 goto out_status;
2137
2138 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2139 addr, hlen);
69e3c75f
JB
2140
2141 if (unlikely(tp_len < 0)) {
2142 if (po->tp_loss) {
2143 __packet_set_status(po, ph,
2144 TP_STATUS_AVAILABLE);
2145 packet_increment_head(&po->tx_ring);
2146 kfree_skb(skb);
2147 continue;
2148 } else {
2149 status = TP_STATUS_WRONG_FORMAT;
2150 err = tp_len;
2151 goto out_status;
2152 }
2153 }
2154
2155 skb->destructor = tpacket_destruct_skb;
2156 __packet_set_status(po, ph, TP_STATUS_SENDING);
2157 atomic_inc(&po->tx_ring.pending);
2158
2159 status = TP_STATUS_SEND_REQUEST;
2160 err = dev_queue_xmit(skb);
eb70df13
JP
2161 if (unlikely(err > 0)) {
2162 err = net_xmit_errno(err);
2163 if (err && __packet_get_status(po, ph) ==
2164 TP_STATUS_AVAILABLE) {
2165 /* skb was destructed already */
2166 skb = NULL;
2167 goto out_status;
2168 }
2169 /*
2170 * skb was dropped but not destructed yet;
2171 * let's treat it like congestion or err < 0
2172 */
2173 err = 0;
2174 }
69e3c75f
JB
2175 packet_increment_head(&po->tx_ring);
2176 len_sum += tp_len;
f64f9e71
JP
2177 } while (likely((ph != NULL) ||
2178 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2179 (atomic_read(&po->tx_ring.pending))))
2180 );
69e3c75f
JB
2181
2182 err = len_sum;
2183 goto out_put;
2184
69e3c75f
JB
2185out_status:
2186 __packet_set_status(po, ph, status);
2187 kfree_skb(skb);
2188out_put:
026bb405 2189 dev_put(dev);
69e3c75f
JB
2190out:
2191 mutex_unlock(&po->pg_vec_lock);
2192 return err;
2193}
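/*
 * Illustrative userspace sketch (not part of this file; tx_ring, slot,
 * frame_size, pkt and pkt_len are assumed caller state): tpacket_snd()
 * above walks the TX ring and transmits every slot marked
 * TP_STATUS_SEND_REQUEST, so the sender only fills a frame, flips its
 * status and kicks the socket with send(). With TPACKET_V2 and
 * PACKET_TX_HAS_OFF left disabled, queueing one frame looks roughly like:
 *
 *	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
 *		((char *)tx_ring + slot * frame_size);
 *
 *	if (hdr->tp_status != TP_STATUS_AVAILABLE)
 *		return;				// slot still owned by the kernel
 *
 *	memcpy((char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
 *	       pkt, pkt_len);			// payload right after the header
 *	hdr->tp_len = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *
 *	send(fd, NULL, 0, 0);			// triggers tpacket_snd()
 */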
69e3c75f 2194
eea49cc9
OJ
2195static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2196 size_t reserve, size_t len,
2197 size_t linear, int noblock,
2198 int *err)
bfd5f4a3
SS
2199{
2200 struct sk_buff *skb;
2201
2202 /* Under a page? Don't bother with paged skb. */
2203 if (prepad + len < PAGE_SIZE || !linear)
2204 linear = len;
2205
2206 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2207 err);
2208 if (!skb)
2209 return NULL;
2210
2211 skb_reserve(skb, reserve);
2212 skb_put(skb, linear);
2213 skb->data_len = len - linear;
2214 skb->len += len - linear;
2215
2216 return skb;
2217}
2218
69e3c75f 2219static int packet_snd(struct socket *sock,
1da177e4
LT
2220 struct msghdr *msg, size_t len)
2221{
2222 struct sock *sk = sock->sk;
40d4e3df 2223 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2224 struct sk_buff *skb;
2225 struct net_device *dev;
0e11c91e 2226 __be16 proto;
1da177e4 2227 unsigned char *addr;
827d9780 2228 int err, reserve = 0;
bfd5f4a3
SS
2229 struct virtio_net_hdr vnet_hdr = { 0 };
2230 int offset = 0;
2231 int vnet_hdr_len;
2232 struct packet_sock *po = pkt_sk(sk);
2233 unsigned short gso_type = 0;
ae641949 2234 int hlen, tlen;
3bdc0eba 2235 int extra_len = 0;
1da177e4
LT
2236
2237 /*
1ce4f28b 2238 * Get and verify the address.
1da177e4 2239 */
1ce4f28b 2240
c3ac8a13 2241 if (likely(saddr == NULL)) {
026bb405 2242 dev = packet_cached_dev_get(po);
1da177e4
LT
2243 proto = po->num;
2244 addr = NULL;
2245 } else {
2246 err = -EINVAL;
2247 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2248 goto out;
0fb375fb
EB
2249 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2250 goto out;
1da177e4
LT
2251 proto = saddr->sll_protocol;
2252 addr = saddr->sll_addr;
827d9780 2253 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2254 }
2255
1da177e4 2256 err = -ENXIO;
026bb405 2257 if (unlikely(dev == NULL))
1da177e4 2258 goto out_unlock;
d5e76b0a 2259 err = -ENETDOWN;
026bb405 2260 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2261 goto out_unlock;
2262
026bb405
DB
2263 if (sock->type == SOCK_RAW)
2264 reserve = dev->hard_header_len;
bfd5f4a3
SS
2265 if (po->has_vnet_hdr) {
2266 vnet_hdr_len = sizeof(vnet_hdr);
2267
2268 err = -EINVAL;
2269 if (len < vnet_hdr_len)
2270 goto out_unlock;
2271
2272 len -= vnet_hdr_len;
2273
2274 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2275 vnet_hdr_len);
2276 if (err < 0)
2277 goto out_unlock;
2278
2279 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2280 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2281 vnet_hdr.hdr_len))
2282 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2283 vnet_hdr.csum_offset + 2;
2284
2285 err = -EINVAL;
2286 if (vnet_hdr.hdr_len > len)
2287 goto out_unlock;
2288
2289 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2290 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2291 case VIRTIO_NET_HDR_GSO_TCPV4:
2292 gso_type = SKB_GSO_TCPV4;
2293 break;
2294 case VIRTIO_NET_HDR_GSO_TCPV6:
2295 gso_type = SKB_GSO_TCPV6;
2296 break;
2297 case VIRTIO_NET_HDR_GSO_UDP:
2298 gso_type = SKB_GSO_UDP;
2299 break;
2300 default:
2301 goto out_unlock;
2302 }
2303
2304 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2305 gso_type |= SKB_GSO_TCP_ECN;
2306
2307 if (vnet_hdr.gso_size == 0)
2308 goto out_unlock;
2309
2310 }
2311 }
2312
3bdc0eba
BG
2313 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2314 if (!netif_supports_nofcs(dev)) {
2315 err = -EPROTONOSUPPORT;
2316 goto out_unlock;
2317 }
2318 extra_len = 4; /* We're doing our own CRC */
2319 }
2320
1da177e4 2321 err = -EMSGSIZE;
3bdc0eba 2322 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2323 goto out_unlock;
2324
bfd5f4a3 2325 err = -ENOBUFS;
ae641949
HX
2326 hlen = LL_RESERVED_SPACE(dev);
2327 tlen = dev->needed_tailroom;
2328 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2329 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2330 if (skb == NULL)
1da177e4
LT
2331 goto out_unlock;
2332
bfd5f4a3 2333 skb_set_network_header(skb, reserve);
1da177e4 2334
0c4e8581
SH
2335 err = -EINVAL;
2336 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2337 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2338 goto out_free;
1da177e4
LT
2339
2340 /* Returns -EFAULT on error */
bfd5f4a3 2341 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2342 if (err)
2343 goto out_free;
bf84a010
DB
2344
2345 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2346
3bdc0eba 2347 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2348 /* Earlier code assumed this would be a VLAN pkt,
2349 * double-check this now that we have the actual
2350 * packet in hand.
2351 */
2352 struct ethhdr *ehdr;
2353 skb_reset_mac_header(skb);
2354 ehdr = eth_hdr(skb);
2355 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2356 err = -EMSGSIZE;
2357 goto out_free;
2358 }
2359 }
2360
1da177e4
LT
2361 skb->protocol = proto;
2362 skb->dev = dev;
2363 skb->priority = sk->sk_priority;
2d37a186 2364 skb->mark = sk->sk_mark;
1da177e4 2365
bfd5f4a3
SS
2366 if (po->has_vnet_hdr) {
2367 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2368 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2369 vnet_hdr.csum_offset)) {
2370 err = -EINVAL;
2371 goto out_free;
2372 }
2373 }
2374
2375 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2376 skb_shinfo(skb)->gso_type = gso_type;
2377
2378 /* Header must be checked, and gso_segs computed. */
2379 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2380 skb_shinfo(skb)->gso_segs = 0;
2381
2382 len += vnet_hdr_len;
2383 }
2384
40893fd0 2385 skb_probe_transport_header(skb, reserve);
c1aad275 2386
3bdc0eba
BG
2387 if (unlikely(extra_len == 4))
2388 skb->no_fcs = 1;
2389
1da177e4
LT
2390 /*
2391 * Now send it
2392 */
2393
2394 err = dev_queue_xmit(skb);
2395 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2396 goto out_unlock;
2397
026bb405 2398 dev_put(dev);
1da177e4 2399
40d4e3df 2400 return len;
1da177e4
LT
2401
2402out_free:
2403 kfree_skb(skb);
2404out_unlock:
026bb405 2405 if (dev)
1da177e4
LT
2406 dev_put(dev);
2407out:
2408 return err;
2409}
2410
69e3c75f
JB
2411static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2412 struct msghdr *msg, size_t len)
2413{
69e3c75f
JB
2414 struct sock *sk = sock->sk;
2415 struct packet_sock *po = pkt_sk(sk);
2416 if (po->tx_ring.pg_vec)
2417 return tpacket_snd(po, msg);
2418 else
69e3c75f
JB
2419 return packet_snd(sock, msg, len);
2420}
2421
1da177e4
LT
2422/*
2423 * Close a PACKET socket. This is fairly simple. We immediately go
2424 * to 'closed' state and remove our protocol entry in the device list.
2425 */
2426
2427static int packet_release(struct socket *sock)
2428{
2429 struct sock *sk = sock->sk;
2430 struct packet_sock *po;
d12d01d6 2431 struct net *net;
f6fb8f10 2432 union tpacket_req_u req_u;
1da177e4
LT
2433
2434 if (!sk)
2435 return 0;
2436
3b1e0a65 2437 net = sock_net(sk);
1da177e4
LT
2438 po = pkt_sk(sk);
2439
0fa7fa98 2440 mutex_lock(&net->packet.sklist_lock);
808f5114 2441 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2442 mutex_unlock(&net->packet.sklist_lock);
2443
2444 preempt_disable();
920de804 2445 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2446 preempt_enable();
1da177e4 2447
808f5114 2448 spin_lock(&po->bind_lock);
ce06b03e 2449 unregister_prot_hook(sk, false);
c3ac8a13
DB
2450 packet_cached_dev_reset(po);
2451
160ff18a
BG
2452 if (po->prot_hook.dev) {
2453 dev_put(po->prot_hook.dev);
2454 po->prot_hook.dev = NULL;
2455 }
808f5114 2456 spin_unlock(&po->bind_lock);
1da177e4 2457
1da177e4 2458 packet_flush_mclist(sk);
1da177e4 2459
9665d5d6
PS
2460 if (po->rx_ring.pg_vec) {
2461 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2462 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2463 }
69e3c75f 2464
9665d5d6
PS
2465 if (po->tx_ring.pg_vec) {
2466 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2467 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2468 }
1da177e4 2469
dc99f600
DM
2470 fanout_release(sk);
2471
808f5114 2472 synchronize_net();
1da177e4
LT
2473 /*
2474 * Now the socket is dead. No more input will appear.
2475 */
1da177e4
LT
2476 sock_orphan(sk);
2477 sock->sk = NULL;
2478
2479 /* Purge queues */
2480
2481 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2482 sk_refcnt_debug_release(sk);
1da177e4
LT
2483
2484 sock_put(sk);
2485 return 0;
2486}
2487
2488/*
2489 * Attach a packet hook.
2490 */
2491
0e11c91e 2492static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2493{
2494 struct packet_sock *po = pkt_sk(sk);
dc99f600 2495
aef950b4
WY
2496 if (po->fanout) {
2497 if (dev)
2498 dev_put(dev);
2499
dc99f600 2500 return -EINVAL;
aef950b4 2501 }
1da177e4
LT
2502
2503 lock_sock(sk);
2504
2505 spin_lock(&po->bind_lock);
ce06b03e 2506 unregister_prot_hook(sk, true);
c3ac8a13 2507
1da177e4
LT
2508 po->num = protocol;
2509 po->prot_hook.type = protocol;
160ff18a
BG
2510 if (po->prot_hook.dev)
2511 dev_put(po->prot_hook.dev);
1da177e4 2512
c3ac8a13 2513 po->prot_hook.dev = dev;
1da177e4
LT
2514 po->ifindex = dev ? dev->ifindex : 0;
2515
c3ac8a13
DB
2516 packet_cached_dev_assign(po, dev);
2517
1da177e4
LT
2518 if (protocol == 0)
2519 goto out_unlock;
2520
be85d4ad 2521 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2522 register_prot_hook(sk);
be85d4ad
UT
2523 } else {
2524 sk->sk_err = ENETDOWN;
2525 if (!sock_flag(sk, SOCK_DEAD))
2526 sk->sk_error_report(sk);
1da177e4
LT
2527 }
2528
2529out_unlock:
2530 spin_unlock(&po->bind_lock);
2531 release_sock(sk);
2532 return 0;
2533}
2534
2535/*
2536 * Bind a packet socket to a device
2537 */
2538
40d4e3df
ED
2539static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2540 int addr_len)
1da177e4 2541{
40d4e3df 2542 struct sock *sk = sock->sk;
1da177e4
LT
2543 char name[15];
2544 struct net_device *dev;
2545 int err = -ENODEV;
1ce4f28b 2546
1da177e4
LT
2547 /*
2548 * Check legality
2549 */
1ce4f28b 2550
8ae55f04 2551 if (addr_len != sizeof(struct sockaddr))
1da177e4 2552 return -EINVAL;
40d4e3df 2553 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2554
3b1e0a65 2555 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2556 if (dev)
1da177e4 2557 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2558 return err;
2559}
1da177e4
LT
2560
2561static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2562{
40d4e3df
ED
2563 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2564 struct sock *sk = sock->sk;
1da177e4
LT
2565 struct net_device *dev = NULL;
2566 int err;
2567
2568
2569 /*
2570 * Check legality
2571 */
1ce4f28b 2572
1da177e4
LT
2573 if (addr_len < sizeof(struct sockaddr_ll))
2574 return -EINVAL;
2575 if (sll->sll_family != AF_PACKET)
2576 return -EINVAL;
2577
2578 if (sll->sll_ifindex) {
2579 err = -ENODEV;
3b1e0a65 2580 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2581 if (dev == NULL)
2582 goto out;
2583 }
2584 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2585
2586out:
2587 return err;
2588}
2589
2590static struct proto packet_proto = {
2591 .name = "PACKET",
2592 .owner = THIS_MODULE,
2593 .obj_size = sizeof(struct packet_sock),
2594};
2595
2596/*
1ce4f28b 2597 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2598 */
2599
3f378b68
EP
2600static int packet_create(struct net *net, struct socket *sock, int protocol,
2601 int kern)
1da177e4
LT
2602{
2603 struct sock *sk;
2604 struct packet_sock *po;
0e11c91e 2605 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2606 int err;
2607
df008c91 2608 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2609 return -EPERM;
be02097c
DM
2610 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2611 sock->type != SOCK_PACKET)
1da177e4
LT
2612 return -ESOCKTNOSUPPORT;
2613
2614 sock->state = SS_UNCONNECTED;
2615
2616 err = -ENOBUFS;
6257ff21 2617 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2618 if (sk == NULL)
2619 goto out;
2620
2621 sock->ops = &packet_ops;
1da177e4
LT
2622 if (sock->type == SOCK_PACKET)
2623 sock->ops = &packet_ops_spkt;
be02097c 2624
1da177e4
LT
2625 sock_init_data(sock, sk);
2626
2627 po = pkt_sk(sk);
2628 sk->sk_family = PF_PACKET;
0e11c91e 2629 po->num = proto;
c3ac8a13
DB
2630
2631 packet_cached_dev_reset(po);
1da177e4
LT
2632
2633 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2634 sk_refcnt_debug_inc(sk);
1da177e4
LT
2635
2636 /*
2637 * Attach a protocol block
2638 */
2639
2640 spin_lock_init(&po->bind_lock);
905db440 2641 mutex_init(&po->pg_vec_lock);
1da177e4 2642 po->prot_hook.func = packet_rcv;
be02097c 2643
1da177e4
LT
2644 if (sock->type == SOCK_PACKET)
2645 po->prot_hook.func = packet_rcv_spkt;
be02097c 2646
1da177e4
LT
2647 po->prot_hook.af_packet_priv = sk;
2648
0e11c91e
AV
2649 if (proto) {
2650 po->prot_hook.type = proto;
ce06b03e 2651 register_prot_hook(sk);
1da177e4
LT
2652 }
2653
0fa7fa98 2654 mutex_lock(&net->packet.sklist_lock);
808f5114 2655 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2656 mutex_unlock(&net->packet.sklist_lock);
2657
2658 preempt_disable();
3680453c 2659 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2660 preempt_enable();
808f5114 2661
40d4e3df 2662 return 0;
1da177e4
LT
2663out:
2664 return err;
2665}
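/*
 * Illustrative userspace sketch (not part of this file; "eth0" is an
 * assumption): packet_create() above is reached via socket(2) with
 * PF_PACKET and requires CAP_NET_RAW, and packet_bind() attaches the
 * protocol hook to a single interface through a struct sockaddr_ll:
 *
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>		// htons()
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>	// ETH_P_ALL
 *	#include <net/if.h>		// if_nametoindex()
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */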
2666
ed85b565
RC
2667static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2668{
2669 struct sock_exterr_skb *serr;
2670 struct sk_buff *skb, *skb2;
2671 int copied, err;
2672
2673 err = -EAGAIN;
2674 skb = skb_dequeue(&sk->sk_error_queue);
2675 if (skb == NULL)
2676 goto out;
2677
2678 copied = skb->len;
2679 if (copied > len) {
2680 msg->msg_flags |= MSG_TRUNC;
2681 copied = len;
2682 }
2683 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2684 if (err)
2685 goto out_free_skb;
2686
2687 sock_recv_timestamp(msg, sk, skb);
2688
2689 serr = SKB_EXT_ERR(skb);
2690 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2691 sizeof(serr->ee), &serr->ee);
2692
2693 msg->msg_flags |= MSG_ERRQUEUE;
2694 err = copied;
2695
2696 /* Reset and regenerate socket error */
2697 spin_lock_bh(&sk->sk_error_queue.lock);
2698 sk->sk_err = 0;
2699 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2700 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2701 spin_unlock_bh(&sk->sk_error_queue.lock);
2702 sk->sk_error_report(sk);
2703 } else
2704 spin_unlock_bh(&sk->sk_error_queue.lock);
2705
2706out_free_skb:
2707 kfree_skb(skb);
2708out:
2709 return err;
2710}
2711
1da177e4
LT
2712/*
2713 * Pull a packet from our receive queue and hand it to the user.
2714 * If necessary we block.
2715 */
2716
2717static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2718 struct msghdr *msg, size_t len, int flags)
2719{
2720 struct sock *sk = sock->sk;
2721 struct sk_buff *skb;
2722 int copied, err;
bfd5f4a3 2723 int vnet_hdr_len = 0;
1da177e4
LT
2724
2725 err = -EINVAL;
ed85b565 2726 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2727 goto out;
2728
2729#if 0
2730 /* What error should we return now? EUNATTACH? */
2731 if (pkt_sk(sk)->ifindex < 0)
2732 return -ENODEV;
2733#endif
2734
ed85b565
RC
2735 if (flags & MSG_ERRQUEUE) {
2736 err = packet_recv_error(sk, msg, len);
2737 goto out;
2738 }
2739
1da177e4
LT
2740 /*
2741 * Call the generic datagram receiver. This handles all sorts
2742 * of horrible races and re-entrancy so we can forget about it
2743 * in the protocol layers.
2744 *
 2745	 *	Now it will return ENETDOWN, if the device has just gone down,
2746 * but then it will block.
2747 */
2748
40d4e3df 2749 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2750
2751 /*
1ce4f28b 2752 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 2753	 *	handles the blocking, we don't have to see or worry about
 2754	 *	blocking retries.
2755 */
2756
8ae55f04 2757 if (skb == NULL)
1da177e4
LT
2758 goto out;
2759
bfd5f4a3
SS
2760 if (pkt_sk(sk)->has_vnet_hdr) {
2761 struct virtio_net_hdr vnet_hdr = { 0 };
2762
2763 err = -EINVAL;
2764 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2765 if (len < vnet_hdr_len)
bfd5f4a3
SS
2766 goto out_free;
2767
1f18b717
MK
2768 len -= vnet_hdr_len;
2769
bfd5f4a3
SS
2770 if (skb_is_gso(skb)) {
2771 struct skb_shared_info *sinfo = skb_shinfo(skb);
2772
2773 /* This is a hint as to how much should be linear. */
2774 vnet_hdr.hdr_len = skb_headlen(skb);
2775 vnet_hdr.gso_size = sinfo->gso_size;
2776 if (sinfo->gso_type & SKB_GSO_TCPV4)
2777 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2778 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2779 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2780 else if (sinfo->gso_type & SKB_GSO_UDP)
2781 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2782 else if (sinfo->gso_type & SKB_GSO_FCOE)
2783 goto out_free;
2784 else
2785 BUG();
2786 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2787 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2788 } else
2789 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2790
2791 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2792 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2793 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2794 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2795 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2796 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2797 } /* else everything is zero */
2798
2799 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2800 vnet_hdr_len);
2801 if (err < 0)
2802 goto out_free;
2803 }
2804
2f73d7fd
HFS
2805 /* You lose any data beyond the buffer you gave. If it worries
2806 * a user program they can ask the device for its MTU
2807 * anyway.
1da177e4 2808 */
1da177e4 2809 copied = skb->len;
40d4e3df
ED
2810 if (copied > len) {
2811 copied = len;
2812 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2813 }
2814
2815 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2816 if (err)
2817 goto out_free;
2818
3b885787 2819 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2820
2f73d7fd
HFS
2821 if (msg->msg_name) {
2822 /* If the address length field is there to be filled
2823 * in, we fill it in now.
2824 */
2825 if (sock->type == SOCK_PACKET) {
2826 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2827 } else {
2828 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2829 msg->msg_namelen = sll->sll_halen +
2830 offsetof(struct sockaddr_ll, sll_addr);
2831 }
ffbc6111
HX
2832 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2833 msg->msg_namelen);
2f73d7fd 2834 }
1da177e4 2835
8dc41944 2836 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2837 struct tpacket_auxdata aux;
2838
2839 aux.tp_status = TP_STATUS_USER;
2840 if (skb->ip_summed == CHECKSUM_PARTIAL)
2841 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2842 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2843 aux.tp_snaplen = skb->len;
2844 aux.tp_mac = 0;
bbe735e4 2845 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2846 if (vlan_tx_tag_present(skb)) {
2847 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2848 aux.tp_status |= TP_STATUS_VLAN_VALID;
2849 } else {
2850 aux.tp_vlan_tci = 0;
2851 }
13fcb7bd 2852 aux.tp_padding = 0;
ffbc6111 2853 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2854 }
2855
1da177e4
LT
2856 /*
2857 * Free or return the buffer as appropriate. Again this
2858 * hides all the races and re-entrancy issues from us.
2859 */
bfd5f4a3 2860 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2861
2862out_free:
2863 skb_free_datagram(sk, skb);
2864out:
2865 return err;
2866}
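/*
 * Illustrative userspace sketch (not part of this file; fd is assumed to
 * be a packet socket with the PACKET_AUXDATA option already enabled):
 * packet_recvmsg() above attaches a struct tpacket_auxdata control
 * message to each packet, which is how a non-ring reader learns the
 * original length, VLAN tag and checksum state:
 *
 *	char buf[2048];
 *	union {
 *		struct cmsghdr cm;
 *		char space[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	} ctrl;
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = &ctrl, .msg_controllen = sizeof(ctrl),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len, aux->tp_vlan_tci, aux->tp_status ...
 *		}
 *	}
 */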
2867
1da177e4
LT
2868static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2869 int *uaddr_len, int peer)
2870{
2871 struct net_device *dev;
2872 struct sock *sk = sock->sk;
2873
2874 if (peer)
2875 return -EOPNOTSUPP;
2876
2877 uaddr->sa_family = AF_PACKET;
2dc85bf3 2878 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2879 rcu_read_lock();
2880 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2881 if (dev)
2dc85bf3 2882 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2883 rcu_read_unlock();
1da177e4
LT
2884 *uaddr_len = sizeof(*uaddr);
2885
2886 return 0;
2887}
1da177e4
LT
2888
2889static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2890 int *uaddr_len, int peer)
2891{
2892 struct net_device *dev;
2893 struct sock *sk = sock->sk;
2894 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2895 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2896
2897 if (peer)
2898 return -EOPNOTSUPP;
2899
2900 sll->sll_family = AF_PACKET;
2901 sll->sll_ifindex = po->ifindex;
2902 sll->sll_protocol = po->num;
67286640 2903 sll->sll_pkttype = 0;
654d1f8a
ED
2904 rcu_read_lock();
2905 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2906 if (dev) {
2907 sll->sll_hatype = dev->type;
2908 sll->sll_halen = dev->addr_len;
2909 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2910 } else {
2911 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2912 sll->sll_halen = 0;
2913 }
654d1f8a 2914 rcu_read_unlock();
0fb375fb 2915 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2916
2917 return 0;
2918}
2919
2aeb0b88
WC
2920static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2921 int what)
1da177e4
LT
2922{
2923 switch (i->type) {
2924 case PACKET_MR_MULTICAST:
1162563f
JP
2925 if (i->alen != dev->addr_len)
2926 return -EINVAL;
1da177e4 2927 if (what > 0)
22bedad3 2928 return dev_mc_add(dev, i->addr);
1da177e4 2929 else
22bedad3 2930 return dev_mc_del(dev, i->addr);
1da177e4
LT
2931 break;
2932 case PACKET_MR_PROMISC:
2aeb0b88 2933 return dev_set_promiscuity(dev, what);
1da177e4
LT
2934 break;
2935 case PACKET_MR_ALLMULTI:
2aeb0b88 2936 return dev_set_allmulti(dev, what);
1da177e4 2937 break;
d95ed927 2938 case PACKET_MR_UNICAST:
1162563f
JP
2939 if (i->alen != dev->addr_len)
2940 return -EINVAL;
d95ed927 2941 if (what > 0)
a748ee24 2942 return dev_uc_add(dev, i->addr);
d95ed927 2943 else
a748ee24 2944 return dev_uc_del(dev, i->addr);
d95ed927 2945 break;
40d4e3df
ED
2946 default:
2947 break;
1da177e4 2948 }
2aeb0b88 2949 return 0;
1da177e4
LT
2950}
2951
2952static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2953{
40d4e3df 2954 for ( ; i; i = i->next) {
1da177e4
LT
2955 if (i->ifindex == dev->ifindex)
2956 packet_dev_mc(dev, i, what);
2957 }
2958}
2959
0fb375fb 2960static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2961{
2962 struct packet_sock *po = pkt_sk(sk);
2963 struct packet_mclist *ml, *i;
2964 struct net_device *dev;
2965 int err;
2966
2967 rtnl_lock();
2968
2969 err = -ENODEV;
3b1e0a65 2970 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2971 if (!dev)
2972 goto done;
2973
2974 err = -EINVAL;
1162563f 2975 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2976 goto done;
2977
2978 err = -ENOBUFS;
8b3a7005 2979 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2980 if (i == NULL)
2981 goto done;
2982
2983 err = 0;
2984 for (ml = po->mclist; ml; ml = ml->next) {
2985 if (ml->ifindex == mreq->mr_ifindex &&
2986 ml->type == mreq->mr_type &&
2987 ml->alen == mreq->mr_alen &&
2988 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2989 ml->count++;
2990 /* Free the new element ... */
2991 kfree(i);
2992 goto done;
2993 }
2994 }
2995
2996 i->type = mreq->mr_type;
2997 i->ifindex = mreq->mr_ifindex;
2998 i->alen = mreq->mr_alen;
2999 memcpy(i->addr, mreq->mr_address, i->alen);
13b89711 3000 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3001 i->count = 1;
3002 i->next = po->mclist;
3003 po->mclist = i;
2aeb0b88
WC
3004 err = packet_dev_mc(dev, i, 1);
3005 if (err) {
3006 po->mclist = i->next;
3007 kfree(i);
3008 }
1da177e4
LT
3009
3010done:
3011 rtnl_unlock();
3012 return err;
3013}
3014
0fb375fb 3015static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3016{
3017 struct packet_mclist *ml, **mlp;
3018
3019 rtnl_lock();
3020
3021 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3022 if (ml->ifindex == mreq->mr_ifindex &&
3023 ml->type == mreq->mr_type &&
3024 ml->alen == mreq->mr_alen &&
3025 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3026 if (--ml->count == 0) {
3027 struct net_device *dev;
3028 *mlp = ml->next;
ad959e76
ED
3029 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3030 if (dev)
1da177e4 3031 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3032 kfree(ml);
3033 }
3034 rtnl_unlock();
3035 return 0;
3036 }
3037 }
3038 rtnl_unlock();
3039 return -EADDRNOTAVAIL;
3040}
3041
3042static void packet_flush_mclist(struct sock *sk)
3043{
3044 struct packet_sock *po = pkt_sk(sk);
3045 struct packet_mclist *ml;
3046
3047 if (!po->mclist)
3048 return;
3049
3050 rtnl_lock();
3051 while ((ml = po->mclist) != NULL) {
3052 struct net_device *dev;
3053
3054 po->mclist = ml->next;
ad959e76
ED
3055 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3056 if (dev != NULL)
1da177e4 3057 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3058 kfree(ml);
3059 }
3060 rtnl_unlock();
3061}
1da177e4
LT
3062
3063static int
b7058842 3064packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3065{
3066 struct sock *sk = sock->sk;
8dc41944 3067 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3068 int ret;
3069
3070 if (level != SOL_PACKET)
3071 return -ENOPROTOOPT;
3072
69e3c75f 3073 switch (optname) {
1ce4f28b 3074 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3075 case PACKET_DROP_MEMBERSHIP:
3076 {
0fb375fb
EB
3077 struct packet_mreq_max mreq;
3078 int len = optlen;
3079 memset(&mreq, 0, sizeof(mreq));
3080 if (len < sizeof(struct packet_mreq))
1da177e4 3081 return -EINVAL;
0fb375fb
EB
3082 if (len > sizeof(mreq))
3083 len = sizeof(mreq);
40d4e3df 3084 if (copy_from_user(&mreq, optval, len))
1da177e4 3085 return -EFAULT;
0fb375fb
EB
3086 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3087 return -EINVAL;
1da177e4
LT
3088 if (optname == PACKET_ADD_MEMBERSHIP)
3089 ret = packet_mc_add(sk, &mreq);
3090 else
3091 ret = packet_mc_drop(sk, &mreq);
3092 return ret;
3093 }
a2efcfa0 3094
1da177e4 3095 case PACKET_RX_RING:
69e3c75f 3096 case PACKET_TX_RING:
1da177e4 3097 {
f6fb8f10 3098 union tpacket_req_u req_u;
3099 int len;
1da177e4 3100
f6fb8f10 3101 switch (po->tp_version) {
3102 case TPACKET_V1:
3103 case TPACKET_V2:
3104 len = sizeof(req_u.req);
3105 break;
3106 case TPACKET_V3:
3107 default:
3108 len = sizeof(req_u.req3);
3109 break;
3110 }
3111 if (optlen < len)
1da177e4 3112 return -EINVAL;
bfd5f4a3
SS
3113 if (pkt_sk(sk)->has_vnet_hdr)
3114 return -EINVAL;
f6fb8f10 3115 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3116 return -EFAULT;
f6fb8f10 3117 return packet_set_ring(sk, &req_u, 0,
3118 optname == PACKET_TX_RING);
1da177e4
LT
3119 }
3120 case PACKET_COPY_THRESH:
3121 {
3122 int val;
3123
40d4e3df 3124 if (optlen != sizeof(val))
1da177e4 3125 return -EINVAL;
40d4e3df 3126 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3127 return -EFAULT;
3128
3129 pkt_sk(sk)->copy_thresh = val;
3130 return 0;
3131 }
bbd6ef87
PM
3132 case PACKET_VERSION:
3133 {
3134 int val;
3135
3136 if (optlen != sizeof(val))
3137 return -EINVAL;
bbd6ef87
PM
3138 if (copy_from_user(&val, optval, sizeof(val)))
3139 return -EFAULT;
3140 switch (val) {
3141 case TPACKET_V1:
3142 case TPACKET_V2:
f6fb8f10 3143 case TPACKET_V3:
4b9e9796 3144 break;
bbd6ef87
PM
3145 default:
3146 return -EINVAL;
3147 }
4b9e9796
S
3148 lock_sock(sk);
3149 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3150 ret = -EBUSY;
3151 } else {
3152 po->tp_version = val;
3153 ret = 0;
3154 }
3155 release_sock(sk);
3156 return ret;
bbd6ef87 3157 }
8913336a
PM
3158 case PACKET_RESERVE:
3159 {
3160 unsigned int val;
3161
3162 if (optlen != sizeof(val))
3163 return -EINVAL;
69e3c75f 3164 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3165 return -EBUSY;
3166 if (copy_from_user(&val, optval, sizeof(val)))
3167 return -EFAULT;
4b9e9796
S
3168 if (val > INT_MAX)
3169 return -EINVAL;
8913336a
PM
3170 po->tp_reserve = val;
3171 return 0;
3172 }
69e3c75f
JB
3173 case PACKET_LOSS:
3174 {
3175 unsigned int val;
3176
3177 if (optlen != sizeof(val))
3178 return -EINVAL;
3179 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3180 return -EBUSY;
3181 if (copy_from_user(&val, optval, sizeof(val)))
3182 return -EFAULT;
3183 po->tp_loss = !!val;
3184 return 0;
3185 }
8dc41944
HX
3186 case PACKET_AUXDATA:
3187 {
3188 int val;
3189
3190 if (optlen < sizeof(val))
3191 return -EINVAL;
3192 if (copy_from_user(&val, optval, sizeof(val)))
3193 return -EFAULT;
3194
3195 po->auxdata = !!val;
3196 return 0;
3197 }
80feaacb
PWJ
3198 case PACKET_ORIGDEV:
3199 {
3200 int val;
3201
3202 if (optlen < sizeof(val))
3203 return -EINVAL;
3204 if (copy_from_user(&val, optval, sizeof(val)))
3205 return -EFAULT;
3206
3207 po->origdev = !!val;
3208 return 0;
3209 }
bfd5f4a3
SS
3210 case PACKET_VNET_HDR:
3211 {
3212 int val;
3213
3214 if (sock->type != SOCK_RAW)
3215 return -EINVAL;
3216 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3217 return -EBUSY;
3218 if (optlen < sizeof(val))
3219 return -EINVAL;
3220 if (copy_from_user(&val, optval, sizeof(val)))
3221 return -EFAULT;
3222
3223 po->has_vnet_hdr = !!val;
3224 return 0;
3225 }
614f60fa
SM
3226 case PACKET_TIMESTAMP:
3227 {
3228 int val;
3229
3230 if (optlen != sizeof(val))
3231 return -EINVAL;
3232 if (copy_from_user(&val, optval, sizeof(val)))
3233 return -EFAULT;
3234
3235 po->tp_tstamp = val;
3236 return 0;
3237 }
dc99f600
DM
3238 case PACKET_FANOUT:
3239 {
3240 int val;
3241
3242 if (optlen != sizeof(val))
3243 return -EINVAL;
3244 if (copy_from_user(&val, optval, sizeof(val)))
3245 return -EFAULT;
3246
3247 return fanout_add(sk, val & 0xffff, val >> 16);
3248 }
5920cd3a
PC
3249 case PACKET_TX_HAS_OFF:
3250 {
3251 unsigned int val;
3252
3253 if (optlen != sizeof(val))
3254 return -EINVAL;
3255 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3256 return -EBUSY;
3257 if (copy_from_user(&val, optval, sizeof(val)))
3258 return -EFAULT;
3259 po->tp_tx_has_off = !!val;
3260 return 0;
3261 }
1da177e4
LT
3262 default:
3263 return -ENOPROTOOPT;
3264 }
3265}
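/*
 * Illustrative userspace sketch (sizes are assumptions): the handlers
 * above enforce an ordering - PACKET_VERSION, PACKET_RESERVE and friends
 * return -EBUSY once a ring exists, so they must be set before
 * PACKET_RX_RING. A typical TPACKET_V2 setup therefore looks roughly
 * like:
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,	// multiple of PAGE_SIZE
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,	// multiple of TPACKET_ALIGNMENT
 *		.tp_frame_nr   = (1 << 16) / (1 << 11) * 64,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */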
3266
3267static int packet_getsockopt(struct socket *sock, int level, int optname,
3268 char __user *optval, int __user *optlen)
3269{
3270 int len;
c06fff6e 3271 int val, lv = sizeof(val);
1da177e4
LT
3272 struct sock *sk = sock->sk;
3273 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3274 void *data = &val;
ee80fbf3 3275 union tpacket_stats_u st;
1da177e4
LT
3276
3277 if (level != SOL_PACKET)
3278 return -ENOPROTOOPT;
3279
8ae55f04
KK
3280 if (get_user(len, optlen))
3281 return -EFAULT;
1da177e4
LT
3282
3283 if (len < 0)
3284 return -EINVAL;
1ce4f28b 3285
69e3c75f 3286 switch (optname) {
1da177e4 3287 case PACKET_STATISTICS:
1da177e4 3288 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3289 memcpy(&st, &po->stats, sizeof(st));
3290 memset(&po->stats, 0, sizeof(po->stats));
3291 spin_unlock_bh(&sk->sk_receive_queue.lock);
3292
f6fb8f10 3293 if (po->tp_version == TPACKET_V3) {
c06fff6e 3294 lv = sizeof(struct tpacket_stats_v3);
fc26e4cf 3295 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3296 data = &st.stats3;
f6fb8f10 3297 } else {
c06fff6e 3298 lv = sizeof(struct tpacket_stats);
fc26e4cf 3299 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3300 data = &st.stats1;
f6fb8f10 3301 }
ee80fbf3 3302
8dc41944
HX
3303 break;
3304 case PACKET_AUXDATA:
8dc41944 3305 val = po->auxdata;
80feaacb
PWJ
3306 break;
3307 case PACKET_ORIGDEV:
80feaacb 3308 val = po->origdev;
bfd5f4a3
SS
3309 break;
3310 case PACKET_VNET_HDR:
bfd5f4a3 3311 val = po->has_vnet_hdr;
1da177e4 3312 break;
bbd6ef87 3313 case PACKET_VERSION:
bbd6ef87 3314 val = po->tp_version;
bbd6ef87
PM
3315 break;
3316 case PACKET_HDRLEN:
3317 if (len > sizeof(int))
3318 len = sizeof(int);
3319 if (copy_from_user(&val, optval, len))
3320 return -EFAULT;
3321 switch (val) {
3322 case TPACKET_V1:
3323 val = sizeof(struct tpacket_hdr);
3324 break;
3325 case TPACKET_V2:
3326 val = sizeof(struct tpacket2_hdr);
3327 break;
f6fb8f10 3328 case TPACKET_V3:
3329 val = sizeof(struct tpacket3_hdr);
3330 break;
bbd6ef87
PM
3331 default:
3332 return -EINVAL;
3333 }
bbd6ef87 3334 break;
8913336a 3335 case PACKET_RESERVE:
8913336a 3336 val = po->tp_reserve;
8913336a 3337 break;
69e3c75f 3338 case PACKET_LOSS:
69e3c75f 3339 val = po->tp_loss;
69e3c75f 3340 break;
614f60fa 3341 case PACKET_TIMESTAMP:
614f60fa 3342 val = po->tp_tstamp;
614f60fa 3343 break;
dc99f600 3344 case PACKET_FANOUT:
dc99f600
DM
3345 val = (po->fanout ?
3346 ((u32)po->fanout->id |
77f65ebd
WB
3347 ((u32)po->fanout->type << 16) |
3348 ((u32)po->fanout->flags << 24)) :
dc99f600 3349 0);
dc99f600 3350 break;
5920cd3a
PC
3351 case PACKET_TX_HAS_OFF:
3352 val = po->tp_tx_has_off;
3353 break;
1da177e4
LT
3354 default:
3355 return -ENOPROTOOPT;
3356 }
3357
c06fff6e
ED
3358 if (len > lv)
3359 len = lv;
8ae55f04
KK
3360 if (put_user(len, optlen))
3361 return -EFAULT;
8dc41944
HX
3362 if (copy_to_user(optval, data, len))
3363 return -EFAULT;
8ae55f04 3364 return 0;
1da177e4
LT
3365}
3366
3367
3368static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3369{
3370 struct sock *sk;
ad930650 3371 struct net_device *dev = data;
c346dca1 3372 struct net *net = dev_net(dev);
1da177e4 3373
808f5114 3374 rcu_read_lock();
b67bfe0d 3375 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3376 struct packet_sock *po = pkt_sk(sk);
3377
3378 switch (msg) {
3379 case NETDEV_UNREGISTER:
1da177e4
LT
3380 if (po->mclist)
3381 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3382 /* fallthrough */
3383
1da177e4
LT
3384 case NETDEV_DOWN:
3385 if (dev->ifindex == po->ifindex) {
3386 spin_lock(&po->bind_lock);
3387 if (po->running) {
ce06b03e 3388 __unregister_prot_hook(sk, false);
1da177e4
LT
3389 sk->sk_err = ENETDOWN;
3390 if (!sock_flag(sk, SOCK_DEAD))
3391 sk->sk_error_report(sk);
3392 }
3393 if (msg == NETDEV_UNREGISTER) {
c3ac8a13 3394 packet_cached_dev_reset(po);
1da177e4 3395 po->ifindex = -1;
160ff18a
BG
3396 if (po->prot_hook.dev)
3397 dev_put(po->prot_hook.dev);
1da177e4
LT
3398 po->prot_hook.dev = NULL;
3399 }
3400 spin_unlock(&po->bind_lock);
3401 }
3402 break;
3403 case NETDEV_UP:
808f5114 3404 if (dev->ifindex == po->ifindex) {
3405 spin_lock(&po->bind_lock);
ce06b03e
DM
3406 if (po->num)
3407 register_prot_hook(sk);
808f5114 3408 spin_unlock(&po->bind_lock);
1da177e4 3409 }
1da177e4
LT
3410 break;
3411 }
3412 }
808f5114 3413 rcu_read_unlock();
1da177e4
LT
3414 return NOTIFY_DONE;
3415}
3416
3417
3418static int packet_ioctl(struct socket *sock, unsigned int cmd,
3419 unsigned long arg)
3420{
3421 struct sock *sk = sock->sk;
3422
69e3c75f 3423 switch (cmd) {
40d4e3df
ED
3424 case SIOCOUTQ:
3425 {
3426 int amount = sk_wmem_alloc_get(sk);
31e6d363 3427
40d4e3df
ED
3428 return put_user(amount, (int __user *)arg);
3429 }
3430 case SIOCINQ:
3431 {
3432 struct sk_buff *skb;
3433 int amount = 0;
3434
3435 spin_lock_bh(&sk->sk_receive_queue.lock);
3436 skb = skb_peek(&sk->sk_receive_queue);
3437 if (skb)
3438 amount = skb->len;
3439 spin_unlock_bh(&sk->sk_receive_queue.lock);
3440 return put_user(amount, (int __user *)arg);
3441 }
3442 case SIOCGSTAMP:
3443 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3444 case SIOCGSTAMPNS:
3445 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3446
1da177e4 3447#ifdef CONFIG_INET
40d4e3df
ED
3448 case SIOCADDRT:
3449 case SIOCDELRT:
3450 case SIOCDARP:
3451 case SIOCGARP:
3452 case SIOCSARP:
3453 case SIOCGIFADDR:
3454 case SIOCSIFADDR:
3455 case SIOCGIFBRDADDR:
3456 case SIOCSIFBRDADDR:
3457 case SIOCGIFNETMASK:
3458 case SIOCSIFNETMASK:
3459 case SIOCGIFDSTADDR:
3460 case SIOCSIFDSTADDR:
3461 case SIOCSIFFLAGS:
40d4e3df 3462 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3463#endif
3464
40d4e3df
ED
3465 default:
3466 return -ENOIOCTLCMD;
1da177e4
LT
3467 }
3468 return 0;
3469}
3470
40d4e3df 3471static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3472 poll_table *wait)
3473{
3474 struct sock *sk = sock->sk;
3475 struct packet_sock *po = pkt_sk(sk);
3476 unsigned int mask = datagram_poll(file, sock, wait);
3477
3478 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3479 if (po->rx_ring.pg_vec) {
f6fb8f10 3480 if (!packet_previous_rx_frame(po, &po->rx_ring,
3481 TP_STATUS_KERNEL))
1da177e4
LT
3482 mask |= POLLIN | POLLRDNORM;
3483 }
3484 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3485 spin_lock_bh(&sk->sk_write_queue.lock);
3486 if (po->tx_ring.pg_vec) {
3487 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3488 mask |= POLLOUT | POLLWRNORM;
3489 }
3490 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3491 return mask;
3492}
3493
3494
 3495/* Dirty? Well, I still have not found a better way to account
 3496 * for user mmaps.
3497 */
3498
3499static void packet_mm_open(struct vm_area_struct *vma)
3500{
3501 struct file *file = vma->vm_file;
40d4e3df 3502 struct socket *sock = file->private_data;
1da177e4 3503 struct sock *sk = sock->sk;
1ce4f28b 3504
1da177e4
LT
3505 if (sk)
3506 atomic_inc(&pkt_sk(sk)->mapped);
3507}
3508
3509static void packet_mm_close(struct vm_area_struct *vma)
3510{
3511 struct file *file = vma->vm_file;
40d4e3df 3512 struct socket *sock = file->private_data;
1da177e4 3513 struct sock *sk = sock->sk;
1ce4f28b 3514
1da177e4
LT
3515 if (sk)
3516 atomic_dec(&pkt_sk(sk)->mapped);
3517}
3518
f0f37e2f 3519static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3520 .open = packet_mm_open,
3521 .close = packet_mm_close,
1da177e4
LT
3522};
3523
0e3125c7
NH
3524static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3525 unsigned int len)
1da177e4
LT
3526{
3527 int i;
3528
4ebf0ae2 3529 for (i = 0; i < len; i++) {
0e3125c7 3530 if (likely(pg_vec[i].buffer)) {
c56b4d90 3531 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3532 vfree(pg_vec[i].buffer);
3533 else
3534 free_pages((unsigned long)pg_vec[i].buffer,
3535 order);
3536 pg_vec[i].buffer = NULL;
3537 }
1da177e4
LT
3538 }
3539 kfree(pg_vec);
3540}
3541
eea49cc9 3542static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3543{
0e3125c7
NH
3544 char *buffer = NULL;
3545 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3546 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3547
3548 buffer = (char *) __get_free_pages(gfp_flags, order);
3549
3550 if (buffer)
3551 return buffer;
3552
3553 /*
3554 * __get_free_pages failed, fall back to vmalloc
3555 */
bbce5a59 3556 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3557
0e3125c7
NH
3558 if (buffer)
3559 return buffer;
3560
3561 /*
 3562	 * vmalloc failed, let's dig into swap here
3563 */
0e3125c7
NH
3564 gfp_flags &= ~__GFP_NORETRY;
3565 buffer = (char *)__get_free_pages(gfp_flags, order);
3566 if (buffer)
3567 return buffer;
3568
3569 /*
3570 * complete and utter failure
3571 */
3572 return NULL;
4ebf0ae2
DM
3573}
3574
0e3125c7 3575static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3576{
3577 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3578 struct pgv *pg_vec;
4ebf0ae2
DM
3579 int i;
3580
0e3125c7 3581 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3582 if (unlikely(!pg_vec))
3583 goto out;
3584
3585 for (i = 0; i < block_nr; i++) {
c56b4d90 3586 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3587 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3588 goto out_free_pgvec;
3589 }
3590
3591out:
3592 return pg_vec;
3593
3594out_free_pgvec:
3595 free_pg_vec(pg_vec, order, block_nr);
3596 pg_vec = NULL;
3597 goto out;
3598}
1da177e4 3599
f6fb8f10 3600static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3601 int closing, int tx_ring)
1da177e4 3602{
0e3125c7 3603 struct pgv *pg_vec = NULL;
1da177e4 3604 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3605 int was_running, order = 0;
69e3c75f
JB
3606 struct packet_ring_buffer *rb;
3607 struct sk_buff_head *rb_queue;
0e11c91e 3608 __be16 num;
f6fb8f10 3609 int err = -EINVAL;
3610 /* Added to avoid minimal code churn */
3611 struct tpacket_req *req = &req_u->req;
3612
4b9e9796 3613 lock_sock(sk);
f6fb8f10 3614 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3615 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3616 WARN(1, "Tx-ring is not supported.\n");
3617 goto out;
3618 }
1ce4f28b 3619
69e3c75f
JB
3620 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3621 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3622
69e3c75f
JB
3623 err = -EBUSY;
3624 if (!closing) {
3625 if (atomic_read(&po->mapped))
3626 goto out;
3627 if (atomic_read(&rb->pending))
3628 goto out;
3629 }
1da177e4 3630
69e3c75f
JB
3631 if (req->tp_block_nr) {
3632 /* Sanity tests and some calculations */
3633 err = -EBUSY;
3634 if (unlikely(rb->pg_vec))
3635 goto out;
1da177e4 3636
bbd6ef87
PM
3637 switch (po->tp_version) {
3638 case TPACKET_V1:
3639 po->tp_hdrlen = TPACKET_HDRLEN;
3640 break;
3641 case TPACKET_V2:
3642 po->tp_hdrlen = TPACKET2_HDRLEN;
3643 break;
f6fb8f10 3644 case TPACKET_V3:
3645 po->tp_hdrlen = TPACKET3_HDRLEN;
3646 break;
bbd6ef87
PM
3647 }
3648
69e3c75f 3649 err = -EINVAL;
4ebf0ae2 3650 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3651 goto out;
4ebf0ae2 3652 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3653 goto out;
4b9e9796 3654 if (po->tp_version >= TPACKET_V3 &&
4035ed7b
ED
3655 (int)(req->tp_block_size -
3656 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
4b9e9796 3657 goto out;
8913336a 3658 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3659 po->tp_reserve))
3660 goto out;
4ebf0ae2 3661 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3662 goto out;
1da177e4 3663
69e3c75f
JB
3664 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3665 if (unlikely(rb->frames_per_block <= 0))
3666 goto out;
4b9e9796
S
3667 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
3668 goto out;
69e3c75f
JB
3669 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3670 req->tp_frame_nr))
3671 goto out;
1da177e4
LT
3672
3673 err = -ENOMEM;
4ebf0ae2
DM
3674 order = get_order(req->tp_block_size);
3675 pg_vec = alloc_pg_vec(req, order);
3676 if (unlikely(!pg_vec))
1da177e4 3677 goto out;
f6fb8f10 3678 switch (po->tp_version) {
3679 case TPACKET_V3:
3680 /* Transmit path is not supported. We checked
3681 * it above but just being paranoid
3682 */
3683 if (!tx_ring)
3684 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3685 break;
3686 default:
3687 break;
3688 }
69e3c75f
JB
3689 }
3690 /* Done */
3691 else {
3692 err = -EINVAL;
4ebf0ae2 3693 if (unlikely(req->tp_frame_nr))
69e3c75f 3694 goto out;
1da177e4
LT
3695 }
3696
1da177e4
LT
3697
3698 /* Detach socket from network */
3699 spin_lock(&po->bind_lock);
3700 was_running = po->running;
3701 num = po->num;
3702 if (was_running) {
1da177e4 3703 po->num = 0;
ce06b03e 3704 __unregister_prot_hook(sk, false);
1da177e4
LT
3705 }
3706 spin_unlock(&po->bind_lock);
1ce4f28b 3707
1da177e4
LT
3708 synchronize_net();
3709
3710 err = -EBUSY;
905db440 3711 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3712 if (closing || atomic_read(&po->mapped) == 0) {
3713 err = 0;
69e3c75f 3714 spin_lock_bh(&rb_queue->lock);
c053fd96 3715 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3716 rb->frame_max = (req->tp_frame_nr - 1);
3717 rb->head = 0;
3718 rb->frame_size = req->tp_frame_size;
3719 spin_unlock_bh(&rb_queue->lock);
3720
c053fd96
CG
3721 swap(rb->pg_vec_order, order);
3722 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3723
3724 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3725 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3726 tpacket_rcv : packet_rcv;
3727 skb_queue_purge(rb_queue);
1da177e4 3728 if (atomic_read(&po->mapped))
40d4e3df
ED
3729 pr_err("packet_mmap: vma is busy: %d\n",
3730 atomic_read(&po->mapped));
1da177e4 3731 }
905db440 3732 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3733
3734 spin_lock(&po->bind_lock);
ce06b03e 3735 if (was_running) {
1da177e4 3736 po->num = num;
ce06b03e 3737 register_prot_hook(sk);
1da177e4
LT
3738 }
3739 spin_unlock(&po->bind_lock);
f6fb8f10 3740 if (closing && (po->tp_version > TPACKET_V2)) {
3741 /* Because we don't support block-based V3 on tx-ring */
3742 if (!tx_ring)
3743 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3744 }
1da177e4 3745
1da177e4
LT
3746 if (pg_vec)
3747 free_pg_vec(pg_vec, order, req->tp_block_nr);
3748out:
4b9e9796 3749 release_sock(sk);
1da177e4
LT
3750 return err;
3751}
3752
69e3c75f
JB
3753static int packet_mmap(struct file *file, struct socket *sock,
3754 struct vm_area_struct *vma)
1da177e4
LT
3755{
3756 struct sock *sk = sock->sk;
3757 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3758 unsigned long size, expected_size;
3759 struct packet_ring_buffer *rb;
1da177e4
LT
3760 unsigned long start;
3761 int err = -EINVAL;
3762 int i;
3763
3764 if (vma->vm_pgoff)
3765 return -EINVAL;
3766
905db440 3767 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3768
3769 expected_size = 0;
3770 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3771 if (rb->pg_vec) {
3772 expected_size += rb->pg_vec_len
3773 * rb->pg_vec_pages
3774 * PAGE_SIZE;
3775 }
3776 }
3777
3778 if (expected_size == 0)
1da177e4 3779 goto out;
69e3c75f
JB
3780
3781 size = vma->vm_end - vma->vm_start;
3782 if (size != expected_size)
1da177e4
LT
3783 goto out;
3784
1da177e4 3785 start = vma->vm_start;
69e3c75f
JB
3786 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3787 if (rb->pg_vec == NULL)
3788 continue;
3789
3790 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3791 struct page *page;
3792 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3793 int pg_num;
3794
c56b4d90
CG
3795 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3796 page = pgv_to_page(kaddr);
69e3c75f
JB
3797 err = vm_insert_page(vma, start, page);
3798 if (unlikely(err))
3799 goto out;
3800 start += PAGE_SIZE;
0e3125c7 3801 kaddr += PAGE_SIZE;
69e3c75f 3802 }
4ebf0ae2 3803 }
1da177e4 3804 }
69e3c75f 3805
4ebf0ae2 3806 atomic_inc(&po->mapped);
1da177e4
LT
3807 vma->vm_ops = &packet_mmap_ops;
3808 err = 0;
3809
3810out:
905db440 3811 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3812 return err;
3813}
1da177e4 3814
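Again outside the file proper: the length userspace hands to mmap() must equal the combined size of whatever rx and tx rings were configured, and the offset must be zero, exactly as packet_mmap() verifies above. A hedged sketch mapping a single rx ring follows; the helper name is invented.

#include <stddef.h>
#include <sys/mman.h>

/* Hypothetical userspace helper: map an rx ring of block_nr blocks of
 * block_size bytes configured earlier with PACKET_RX_RING. */
static void *map_rx_ring(int fd, size_t block_size, unsigned int block_nr)
{
	size_t len = block_size * block_nr;
	void *ring;

	/* Offset must be zero and len must match the configured ring(s),
	 * or packet_mmap() returns -EINVAL. */
	ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return ring == MAP_FAILED ? NULL : ring;
}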
90ddc4f0 3815static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3816 .family = PF_PACKET,
3817 .owner = THIS_MODULE,
3818 .release = packet_release,
3819 .bind = packet_bind_spkt,
3820 .connect = sock_no_connect,
3821 .socketpair = sock_no_socketpair,
3822 .accept = sock_no_accept,
3823 .getname = packet_getname_spkt,
3824 .poll = datagram_poll,
3825 .ioctl = packet_ioctl,
3826 .listen = sock_no_listen,
3827 .shutdown = sock_no_shutdown,
3828 .setsockopt = sock_no_setsockopt,
3829 .getsockopt = sock_no_getsockopt,
3830 .sendmsg = packet_sendmsg_spkt,
3831 .recvmsg = packet_recvmsg,
3832 .mmap = sock_no_mmap,
3833 .sendpage = sock_no_sendpage,
3834};
1da177e4 3835
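The packet_ops_spkt table above serves the legacy SOCK_PACKET socket type, whose bind address is a plain struct sockaddr carrying the device name in sa_data (handled by packet_bind_spkt). A rough userspace sketch, with an invented helper name:

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>

/* Hypothetical helper: open a legacy SOCK_PACKET socket bound to one
 * interface by name. */
static int open_spkt_socket(const char *ifname)
{
	struct sockaddr sa;
	int fd;

	fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
	if (fd < 0)
		return -1;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_PACKET;
	strncpy(sa.sa_data, ifname, sizeof(sa.sa_data) - 1);
	if (bind(fd, &sa, sizeof(sa)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}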
90ddc4f0 3836static const struct proto_ops packet_ops = {
1da177e4
LT
3837 .family = PF_PACKET,
3838 .owner = THIS_MODULE,
3839 .release = packet_release,
3840 .bind = packet_bind,
3841 .connect = sock_no_connect,
3842 .socketpair = sock_no_socketpair,
3843 .accept = sock_no_accept,
1ce4f28b 3844 .getname = packet_getname,
1da177e4
LT
3845 .poll = packet_poll,
3846 .ioctl = packet_ioctl,
3847 .listen = sock_no_listen,
3848 .shutdown = sock_no_shutdown,
3849 .setsockopt = packet_setsockopt,
3850 .getsockopt = packet_getsockopt,
3851 .sendmsg = packet_sendmsg,
3852 .recvmsg = packet_recvmsg,
3853 .mmap = packet_mmap,
3854 .sendpage = sock_no_sendpage,
3855};
3856
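SOCK_RAW and SOCK_DGRAM packet sockets are wired to the packet_ops table above; attaching to an interface goes through packet_bind() with a struct sockaddr_ll. A minimal sketch under those assumptions, with a hypothetical helper name:

#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

/* Hypothetical helper: bind an AF_PACKET socket to a named interface
 * for all protocols. */
static int bind_packet_socket(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}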
ec1b4cf7 3857static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3858 .family = PF_PACKET,
3859 .create = packet_create,
3860 .owner = THIS_MODULE,
3861};
3862
3863static struct notifier_block packet_netdev_notifier = {
40d4e3df 3864 .notifier_call = packet_notifier,
1da177e4
LT
3865};
3866
3867#ifdef CONFIG_PROC_FS
1da177e4
LT
3868
3869static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3870 __acquires(RCU)
1da177e4 3871{
e372c414 3872 struct net *net = seq_file_net(seq);
808f5114 3873
3874 rcu_read_lock();
3875 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3876}
3877
3878static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3879{
1bf40954 3880 struct net *net = seq_file_net(seq);
808f5114 3881 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3882}
3883
3884static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3885 __releases(RCU)
1da177e4 3886{
808f5114 3887 rcu_read_unlock();
1da177e4
LT
3888}
3889
1ce4f28b 3890static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3891{
3892 if (v == SEQ_START_TOKEN)
3893 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3894 else {
b7ceabd9 3895 struct sock *s = sk_entry(v);
1da177e4
LT
3896 const struct packet_sock *po = pkt_sk(s);
3897
3898 seq_printf(seq,
71338aa7 3899 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3900 s,
3901 atomic_read(&s->sk_refcnt),
3902 s->sk_type,
3903 ntohs(po->num),
3904 po->ifindex,
3905 po->running,
3906 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 3907 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 3908 sock_i_ino(s));
1da177e4
LT
3909 }
3910
3911 return 0;
3912}
3913
56b3d975 3914static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3915 .start = packet_seq_start,
3916 .next = packet_seq_next,
3917 .stop = packet_seq_stop,
3918 .show = packet_seq_show,
3919};
3920
3921static int packet_seq_open(struct inode *inode, struct file *file)
3922{
e372c414
DL
3923 return seq_open_net(inode, file, &packet_seq_ops,
3924 sizeof(struct seq_net_private));
1da177e4
LT
3925}
3926
da7071d7 3927static const struct file_operations packet_seq_fops = {
1da177e4
LT
3928 .owner = THIS_MODULE,
3929 .open = packet_seq_open,
3930 .read = seq_read,
3931 .llseek = seq_lseek,
e372c414 3932 .release = seq_release_net,
1da177e4
LT
3933};
3934
3935#endif
3936
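The seq_file hooks above back the /proc/net/packet table rendered by packet_seq_show(). A small userspace sketch (hypothetical helper) that simply echoes that table:

#include <stdio.h>

/* Hypothetical helper: print the per-namespace packet socket table. */
static int dump_packet_sockets(void)
{
	FILE *f = fopen("/proc/net/packet", "r");
	char line[256];

	if (!f)
		return -1;
	/* First line is the header written by packet_seq_show():
	 * "sk RefCnt Type Proto Iface R Rmem User Inode" */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}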
2c8c1e72 3937static int __net_init packet_net_init(struct net *net)
d12d01d6 3938{
0fa7fa98 3939 mutex_init(&net->packet.sklist_lock);
2aaef4e4 3940 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 3941
d4beaa66 3942 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
3943 return -ENOMEM;
3944
3945 return 0;
3946}
3947
2c8c1e72 3948static void __net_exit packet_net_exit(struct net *net)
d12d01d6 3949{
ece31ffd 3950 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
3951}
3952
3953static struct pernet_operations packet_net_ops = {
3954 .init = packet_net_init,
3955 .exit = packet_net_exit,
3956};
3957
3958
1da177e4
LT
3959static void __exit packet_exit(void)
3960{
1da177e4 3961 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3962 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3963 sock_unregister(PF_PACKET);
3964 proto_unregister(&packet_proto);
3965}
3966
3967static int __init packet_init(void)
3968{
3969 int rc = proto_register(&packet_proto, 0);
3970
3971 if (rc != 0)
3972 goto out;
3973
3974 sock_register(&packet_family_ops);
d12d01d6 3975 register_pernet_subsys(&packet_net_ops);
1da177e4 3976 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3977out:
3978 return rc;
3979}
3980
3981module_init(packet_init);
3982module_exit(packet_exit);
3983MODULE_LICENSE("GPL");
3984MODULE_ALIAS_NETPROTO(PF_PACKET);