net/packet: check length in getsockopt() called with PACKET_HDRLEN
[GitHub/LineageOS/android_kernel_samsung_universal7580.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
1da177e4
LT
91
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
2787b04b
PE
96#include "internal.h"
97
1da177e4
LT
98/*
99 Assumptions:
100 - if device has no dev->hard_header routine, it adds and removes ll header
101 inside itself. In this case ll header is invisible outside of device,
102 but higher levels still should reserve dev->hard_header_len.
103 Some devices are enough clever to reallocate skb, when header
104 will not fit to reserved space (tunnel), another ones are silly
105 (PPP).
106 - packet socket receives packets with pulled ll header,
107 so that SOCK_RAW should push it back.
108
109On receive:
110-----------
111
112Incoming, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> data
1da177e4
LT
115
116Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
117 mac_header -> ll header
118 data -> ll header
1da177e4
LT
119
120Incoming, dev->hard_header==NULL
b0e380b1
ACM
121 mac_header -> UNKNOWN position. It is very likely, that it points to ll
122 header. PPP makes it, that is wrong, because introduce
db0c58f9 123 assymetry between rx and tx paths.
b0e380b1 124 data -> data
1da177e4
LT
125
126Outgoing, dev->hard_header==NULL
b0e380b1
ACM
127 mac_header -> data. ll header is still not built!
128 data -> data
1da177e4
LT
129
130Resume
131 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
132
133
134On transmit:
135------------
136
137dev->hard_header != NULL
b0e380b1
ACM
138 mac_header -> ll header
139 data -> ll header
1da177e4
LT
140
141dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
142 mac_header -> data
143 data -> data
1da177e4
LT
144
145 We should set nh.raw on output to correct posistion,
146 packet classifier depends on it.
147 */
148
1da177e4
LT
149/* Private packet socket structures. */
150
0fb375fb
EB
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
40d4e3df 154struct packet_mreq_max {
0fb375fb
EB
155 int mr_ifindex;
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 159};
a2efcfa0 160
184f489e
DB
161union tpacket_uhdr {
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
165 void *raw;
166};
167
f6fb8f10 168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
169 int closing, int tx_ring);
170
f6fb8f10 171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
f6fb8f10 178#define PGV_FROM_VMALLOC 1
69e3c75f 179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f
JB
188struct packet_sock;
189static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
77f65ebd
WB
190static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 192
f6fb8f10 193static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
195 int status);
196static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 197static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *);
bc59ba39 201static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 202 struct packet_sock *, unsigned int status);
bc59ba39 203static int prb_queue_frozen(struct tpacket_kbdq_core *);
204static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
f6fb8f10 206static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 207static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
1da177e4
LT
216static void packet_flush_mclist(struct sock *sk);
217
ffbc6111
HX
218struct packet_skb_cb {
219 unsigned int origlen;
220 union {
221 struct sockaddr_pkt pkt;
222 struct sockaddr_ll ll;
223 } sa;
224};
225
226#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 227
bc59ba39 228#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 229#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 231#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 233#define GET_NEXT_PRB_BLK_NUM(x) \
234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
235 ((x)->kactive_blk_num+1) : 0)
236
dc99f600
DM
237static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238static void __fanout_link(struct sock *sk, struct packet_sock *po);
239
c3ac8a13
DB
240static struct net_device *packet_cached_dev_get(struct packet_sock *po)
241{
242 struct net_device *dev;
243
244 rcu_read_lock();
245 dev = rcu_dereference(po->cached_dev);
246 if (likely(dev))
247 dev_hold(dev);
248 rcu_read_unlock();
249
250 return dev;
251}
252
253static void packet_cached_dev_assign(struct packet_sock *po,
254 struct net_device *dev)
255{
256 rcu_assign_pointer(po->cached_dev, dev);
257}
258
259static void packet_cached_dev_reset(struct packet_sock *po)
260{
261 RCU_INIT_POINTER(po->cached_dev, NULL);
262}
263
ce06b03e
DM
264/* register_prot_hook must be invoked with the po->bind_lock held,
265 * or from a context in which asynchronous accesses to the packet
266 * socket is not possible (packet_create()).
267 */
268static void register_prot_hook(struct sock *sk)
269{
270 struct packet_sock *po = pkt_sk(sk);
026bb405 271
ce06b03e 272 if (!po->running) {
c3ac8a13 273 if (po->fanout)
dc99f600 274 __fanout_link(sk, po);
c3ac8a13 275 else
dc99f600 276 dev_add_pack(&po->prot_hook);
026bb405 277
ce06b03e
DM
278 sock_hold(sk);
279 po->running = 1;
280 }
281}
282
283/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
284 * held. If the sync parameter is true, we will temporarily drop
285 * the po->bind_lock and do a synchronize_net to make sure no
286 * asynchronous packet processing paths still refer to the elements
287 * of po->prot_hook. If the sync parameter is false, it is the
288 * callers responsibility to take care of this.
289 */
290static void __unregister_prot_hook(struct sock *sk, bool sync)
291{
292 struct packet_sock *po = pkt_sk(sk);
293
294 po->running = 0;
c3ac8a13
DB
295
296 if (po->fanout)
dc99f600 297 __fanout_unlink(sk, po);
c3ac8a13 298 else
dc99f600 299 __dev_remove_pack(&po->prot_hook);
026bb405 300
ce06b03e
DM
301 __sock_put(sk);
302
303 if (sync) {
304 spin_unlock(&po->bind_lock);
305 synchronize_net();
306 spin_lock(&po->bind_lock);
307 }
308}
309
310static void unregister_prot_hook(struct sock *sk, bool sync)
311{
312 struct packet_sock *po = pkt_sk(sk);
313
314 if (po->running)
315 __unregister_prot_hook(sk, sync);
316}
317
f6dafa95 318static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
319{
320 if (is_vmalloc_addr(addr))
321 return vmalloc_to_page(addr);
322 return virt_to_page(addr);
323}
324
69e3c75f 325static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 326{
184f489e 327 union tpacket_uhdr h;
1da177e4 328
69e3c75f 329 h.raw = frame;
bbd6ef87
PM
330 switch (po->tp_version) {
331 case TPACKET_V1:
69e3c75f 332 h.h1->tp_status = status;
0af55bb5 333 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
334 break;
335 case TPACKET_V2:
69e3c75f 336 h.h2->tp_status = status;
0af55bb5 337 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 338 break;
f6fb8f10 339 case TPACKET_V3:
69e3c75f 340 default:
f6fb8f10 341 WARN(1, "TPACKET version not supported.\n");
69e3c75f 342 BUG();
bbd6ef87 343 }
69e3c75f
JB
344
345 smp_wmb();
bbd6ef87
PM
346}
347
69e3c75f 348static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 349{
184f489e 350 union tpacket_uhdr h;
bbd6ef87 351
69e3c75f
JB
352 smp_rmb();
353
bbd6ef87
PM
354 h.raw = frame;
355 switch (po->tp_version) {
356 case TPACKET_V1:
0af55bb5 357 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 358 return h.h1->tp_status;
bbd6ef87 359 case TPACKET_V2:
0af55bb5 360 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 361 return h.h2->tp_status;
f6fb8f10 362 case TPACKET_V3:
69e3c75f 363 default:
f6fb8f10 364 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
365 BUG();
366 return 0;
bbd6ef87 367 }
1da177e4 368}
69e3c75f 369
b9c32fb2
DB
370static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
371 unsigned int flags)
7a51384c
DB
372{
373 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
374
375 if (shhwtstamps) {
376 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
377 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
b9c32fb2 378 return TP_STATUS_TS_SYS_HARDWARE;
7a51384c
DB
379 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
380 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
b9c32fb2 381 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
382 }
383
384 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 385 return TP_STATUS_TS_SOFTWARE;
7a51384c 386
b9c32fb2 387 return 0;
7a51384c
DB
388}
389
b9c32fb2
DB
390static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
391 struct sk_buff *skb)
2e31396f
WB
392{
393 union tpacket_uhdr h;
394 struct timespec ts;
b9c32fb2 395 __u32 ts_status;
2e31396f 396
b9c32fb2
DB
397 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
398 return 0;
2e31396f
WB
399
400 h.raw = frame;
401 switch (po->tp_version) {
402 case TPACKET_V1:
403 h.h1->tp_sec = ts.tv_sec;
404 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
405 break;
406 case TPACKET_V2:
407 h.h2->tp_sec = ts.tv_sec;
408 h.h2->tp_nsec = ts.tv_nsec;
409 break;
410 case TPACKET_V3:
411 default:
412 WARN(1, "TPACKET version not supported.\n");
413 BUG();
414 }
415
416 /* one flush is safe, as both fields always lie on the same cacheline */
417 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
418 smp_wmb();
b9c32fb2
DB
419
420 return ts_status;
2e31396f
WB
421}
422
69e3c75f
JB
423static void *packet_lookup_frame(struct packet_sock *po,
424 struct packet_ring_buffer *rb,
425 unsigned int position,
426 int status)
427{
428 unsigned int pg_vec_pos, frame_offset;
184f489e 429 union tpacket_uhdr h;
69e3c75f
JB
430
431 pg_vec_pos = position / rb->frames_per_block;
432 frame_offset = position % rb->frames_per_block;
433
0e3125c7
NH
434 h.raw = rb->pg_vec[pg_vec_pos].buffer +
435 (frame_offset * rb->frame_size);
69e3c75f
JB
436
437 if (status != __packet_get_status(po, h.raw))
438 return NULL;
439
440 return h.raw;
441}
442
eea49cc9 443static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
444 struct packet_ring_buffer *rb,
445 int status)
446{
447 return packet_lookup_frame(po, rb, rb->head, status);
448}
449
bc59ba39 450static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 451{
452 del_timer_sync(&pkc->retire_blk_timer);
453}
454
455static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
456 int tx_ring,
457 struct sk_buff_head *rb_queue)
458{
bc59ba39 459 struct tpacket_kbdq_core *pkc;
f6fb8f10 460
461 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
462
5b9e9be7 463 spin_lock_bh(&rb_queue->lock);
f6fb8f10 464 pkc->delete_blk_timer = 1;
5b9e9be7 465 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 466
467 prb_del_retire_blk_timer(pkc);
468}
469
470static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 471 struct tpacket_kbdq_core *pkc,
f6fb8f10 472 void (*func) (unsigned long))
473{
474 init_timer(&pkc->retire_blk_timer);
475 pkc->retire_blk_timer.data = (long)po;
476 pkc->retire_blk_timer.function = func;
477 pkc->retire_blk_timer.expires = jiffies;
478}
479
480static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
481{
bc59ba39 482 struct tpacket_kbdq_core *pkc;
f6fb8f10 483
484 if (tx_ring)
485 BUG();
486
487 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
488 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
489}
490
491static int prb_calc_retire_blk_tmo(struct packet_sock *po,
492 int blk_size_in_bytes)
493{
494 struct net_device *dev;
495 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
496 struct ethtool_cmd ecmd;
497 int err;
e440cf2c 498 u32 speed;
f6fb8f10 499
4bc71cb9
JP
500 rtnl_lock();
501 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
502 if (unlikely(!dev)) {
503 rtnl_unlock();
f6fb8f10 504 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
505 }
506 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 507 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
508 rtnl_unlock();
509 if (!err) {
4bc71cb9
JP
510 /*
511 * If the link speed is so slow you don't really
512 * need to worry about perf anyways
513 */
e440cf2c 514 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 515 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 516 } else {
517 msec = 1;
518 div = speed / 1000;
f6fb8f10 519 }
520 }
521
522 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
523
524 if (div)
525 mbits /= div;
526
527 tmo = mbits * msec;
528
529 if (div)
530 return tmo+1;
531 return tmo;
532}
533
bc59ba39 534static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 535 union tpacket_req_u *req_u)
536{
537 p1->feature_req_word = req_u->req3.tp_feature_req_word;
538}
539
540static void init_prb_bdqc(struct packet_sock *po,
541 struct packet_ring_buffer *rb,
542 struct pgv *pg_vec,
543 union tpacket_req_u *req_u, int tx_ring)
544{
bc59ba39 545 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
546 struct tpacket_block_desc *pbd;
f6fb8f10 547
548 memset(p1, 0x0, sizeof(*p1));
549
550 p1->knxt_seq_num = 1;
551 p1->pkbdq = pg_vec;
bc59ba39 552 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 553 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 554 p1->kblk_size = req_u->req3.tp_block_size;
555 p1->knum_blocks = req_u->req3.tp_block_nr;
556 p1->hdrlen = po->tp_hdrlen;
557 p1->version = po->tp_version;
558 p1->last_kactive_blk_num = 0;
ee80fbf3 559 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 560 if (req_u->req3.tp_retire_blk_tov)
561 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
562 else
563 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
564 req_u->req3.tp_block_size);
565 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
566 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
567
4035ed7b 568 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 569 prb_init_ft_ops(p1, req_u);
570 prb_setup_retire_blk_timer(po, tx_ring);
571 prb_open_block(p1, pbd);
572}
573
574/* Do NOT update the last_blk_num first.
575 * Assumes sk_buff_head lock is held.
576 */
bc59ba39 577static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 578{
579 mod_timer(&pkc->retire_blk_timer,
580 jiffies + pkc->tov_in_jiffies);
581 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
582}
583
584/*
585 * Timer logic:
586 * 1) We refresh the timer only when we open a block.
587 * By doing this we don't waste cycles refreshing the timer
588 * on packet-by-packet basis.
589 *
590 * With a 1MB block-size, on a 1Gbps line, it will take
591 * i) ~8 ms to fill a block + ii) memcpy etc.
592 * In this cut we are not accounting for the memcpy time.
593 *
594 * So, if the user sets the 'tmo' to 10ms then the timer
595 * will never fire while the block is still getting filled
596 * (which is what we want). However, the user could choose
597 * to close a block early and that's fine.
598 *
599 * But when the timer does fire, we check whether or not to refresh it.
600 * Since the tmo granularity is in msecs, it is not too expensive
601 * to refresh the timer, lets say every '8' msecs.
602 * Either the user can set the 'tmo' or we can derive it based on
603 * a) line-speed and b) block-size.
604 * prb_calc_retire_blk_tmo() calculates the tmo.
605 *
606 */
607static void prb_retire_rx_blk_timer_expired(unsigned long data)
608{
609 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 610 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 611 unsigned int frozen;
bc59ba39 612 struct tpacket_block_desc *pbd;
f6fb8f10 613
614 spin_lock(&po->sk.sk_receive_queue.lock);
615
616 frozen = prb_queue_frozen(pkc);
617 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
618
619 if (unlikely(pkc->delete_blk_timer))
620 goto out;
621
622 /* We only need to plug the race when the block is partially filled.
623 * tpacket_rcv:
624 * lock(); increment BLOCK_NUM_PKTS; unlock()
625 * copy_bits() is in progress ...
626 * timer fires on other cpu:
627 * we can't retire the current block because copy_bits
628 * is in progress.
629 *
630 */
631 if (BLOCK_NUM_PKTS(pbd)) {
632 while (atomic_read(&pkc->blk_fill_in_prog)) {
633 /* Waiting for skb_copy_bits to finish... */
634 cpu_relax();
635 }
636 }
637
638 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
639 if (!frozen) {
640 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
641 if (!prb_dispatch_next_block(pkc, po))
642 goto refresh_timer;
643 else
644 goto out;
645 } else {
646 /* Case 1. Queue was frozen because user-space was
647 * lagging behind.
648 */
649 if (prb_curr_blk_in_use(pkc, pbd)) {
650 /*
651 * Ok, user-space is still behind.
652 * So just refresh the timer.
653 */
654 goto refresh_timer;
655 } else {
656 /* Case 2. queue was frozen,user-space caught up,
657 * now the link went idle && the timer fired.
658 * We don't have a block to close.So we open this
659 * block and restart the timer.
660 * opening a block thaws the queue,restarts timer
661 * Thawing/timer-refresh is a side effect.
662 */
663 prb_open_block(pkc, pbd);
664 goto out;
665 }
666 }
667 }
668
669refresh_timer:
670 _prb_refresh_rx_retire_blk_timer(pkc);
671
672out:
673 spin_unlock(&po->sk.sk_receive_queue.lock);
674}
675
eea49cc9 676static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 677 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 678{
679 /* Flush everything minus the block header */
680
681#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
682 u8 *start, *end;
683
684 start = (u8 *)pbd1;
685
686 /* Skip the block header(we know header WILL fit in 4K) */
687 start += PAGE_SIZE;
688
689 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
690 for (; start < end; start += PAGE_SIZE)
691 flush_dcache_page(pgv_to_page(start));
692
693 smp_wmb();
694#endif
695
696 /* Now update the block status. */
697
698 BLOCK_STATUS(pbd1) = status;
699
700 /* Flush the block header */
701
702#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
703 start = (u8 *)pbd1;
704 flush_dcache_page(pgv_to_page(start));
705
706 smp_wmb();
707#endif
708}
709
710/*
711 * Side effect:
712 *
713 * 1) flush the block
714 * 2) Increment active_blk_num
715 *
716 * Note:We DONT refresh the timer on purpose.
717 * Because almost always the next block will be opened.
718 */
bc59ba39 719static void prb_close_block(struct tpacket_kbdq_core *pkc1,
720 struct tpacket_block_desc *pbd1,
f6fb8f10 721 struct packet_sock *po, unsigned int stat)
722{
723 __u32 status = TP_STATUS_USER | stat;
724
725 struct tpacket3_hdr *last_pkt;
bc59ba39 726 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 727
ee80fbf3 728 if (po->stats.stats3.tp_drops)
f6fb8f10 729 status |= TP_STATUS_LOSING;
730
731 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
732 last_pkt->tp_next_offset = 0;
733
734 /* Get the ts of the last pkt */
735 if (BLOCK_NUM_PKTS(pbd1)) {
736 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
737 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
738 } else {
739 /* Ok, we tmo'd - so get the current time */
740 struct timespec ts;
741 getnstimeofday(&ts);
742 h1->ts_last_pkt.ts_sec = ts.tv_sec;
743 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
744 }
745
746 smp_wmb();
747
748 /* Flush the block */
749 prb_flush_block(pkc1, pbd1, status);
750
751 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
752}
753
eea49cc9 754static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 755{
756 pkc->reset_pending_on_curr_blk = 0;
757}
758
759/*
760 * Side effect of opening a block:
761 *
762 * 1) prb_queue is thawed.
763 * 2) retire_blk_timer is refreshed.
764 *
765 */
bc59ba39 766static void prb_open_block(struct tpacket_kbdq_core *pkc1,
767 struct tpacket_block_desc *pbd1)
f6fb8f10 768{
769 struct timespec ts;
bc59ba39 770 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 771
772 smp_rmb();
773
8da3056c
DB
774 /* We could have just memset this but we will lose the
775 * flexibility of making the priv area sticky
776 */
f6fb8f10 777
8da3056c
DB
778 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
779 BLOCK_NUM_PKTS(pbd1) = 0;
780 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 781
8da3056c
DB
782 getnstimeofday(&ts);
783
784 h1->ts_first_pkt.ts_sec = ts.tv_sec;
785 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 786
8da3056c
DB
787 pkc1->pkblk_start = (char *)pbd1;
788 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
789
790 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
791 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
792
793 pbd1->version = pkc1->version;
794 pkc1->prev = pkc1->nxt_offset;
795 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
796
797 prb_thaw_queue(pkc1);
798 _prb_refresh_rx_retire_blk_timer(pkc1);
799
800 smp_wmb();
f6fb8f10 801}
802
803/*
804 * Queue freeze logic:
805 * 1) Assume tp_block_nr = 8 blocks.
806 * 2) At time 't0', user opens Rx ring.
807 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
808 * 4) user-space is either sleeping or processing block '0'.
809 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
810 * it will close block-7,loop around and try to fill block '0'.
811 * call-flow:
812 * __packet_lookup_frame_in_block
813 * prb_retire_current_block()
814 * prb_dispatch_next_block()
815 * |->(BLOCK_STATUS == USER) evaluates to true
816 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
817 * 6) Now there are two cases:
818 * 6.1) Link goes idle right after the queue is frozen.
819 * But remember, the last open_block() refreshed the timer.
820 * When this timer expires,it will refresh itself so that we can
821 * re-open block-0 in near future.
822 * 6.2) Link is busy and keeps on receiving packets. This is a simple
823 * case and __packet_lookup_frame_in_block will check if block-0
824 * is free and can now be re-used.
825 */
eea49cc9 826static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 827 struct packet_sock *po)
828{
829 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 830 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 831}
832
833#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
834
835/*
836 * If the next block is free then we will dispatch it
837 * and return a good offset.
838 * Else, we will freeze the queue.
839 * So, caller must check the return value.
840 */
bc59ba39 841static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 842 struct packet_sock *po)
843{
bc59ba39 844 struct tpacket_block_desc *pbd;
f6fb8f10 845
846 smp_rmb();
847
848 /* 1. Get current block num */
849 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
850
851 /* 2. If this block is currently in_use then freeze the queue */
852 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
853 prb_freeze_queue(pkc, po);
854 return NULL;
855 }
856
857 /*
858 * 3.
859 * open this block and return the offset where the first packet
860 * needs to get stored.
861 */
862 prb_open_block(pkc, pbd);
863 return (void *)pkc->nxt_offset;
864}
865
bc59ba39 866static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 867 struct packet_sock *po, unsigned int status)
868{
bc59ba39 869 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 870
871 /* retire/close the current block */
872 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
873 /*
874 * Plug the case where copy_bits() is in progress on
875 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
876 * have space to copy the pkt in the current block and
877 * called prb_retire_current_block()
878 *
879 * We don't need to worry about the TMO case because
880 * the timer-handler already handled this case.
881 */
882 if (!(status & TP_STATUS_BLK_TMO)) {
883 while (atomic_read(&pkc->blk_fill_in_prog)) {
884 /* Waiting for skb_copy_bits to finish... */
885 cpu_relax();
886 }
887 }
888 prb_close_block(pkc, pbd, po, status);
889 return;
890 }
f6fb8f10 891}
892
eea49cc9 893static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 894 struct tpacket_block_desc *pbd)
f6fb8f10 895{
896 return TP_STATUS_USER & BLOCK_STATUS(pbd);
897}
898
eea49cc9 899static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 900{
901 return pkc->reset_pending_on_curr_blk;
902}
903
eea49cc9 904static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 905{
bc59ba39 906 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 907 atomic_dec(&pkc->blk_fill_in_prog);
908}
909
eea49cc9 910static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 911 struct tpacket3_hdr *ppd)
912{
913 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
914}
915
eea49cc9 916static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 917 struct tpacket3_hdr *ppd)
918{
919 ppd->hv1.tp_rxhash = 0;
920}
921
eea49cc9 922static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 923 struct tpacket3_hdr *ppd)
924{
925 if (vlan_tx_tag_present(pkc->skb)) {
926 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
927 ppd->tp_status = TP_STATUS_VLAN_VALID;
928 } else {
9e67030a 929 ppd->hv1.tp_vlan_tci = 0;
930 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 931 }
932}
933
bc59ba39 934static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 935 struct tpacket3_hdr *ppd)
936{
937 prb_fill_vlan_info(pkc, ppd);
938
939 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
940 prb_fill_rxhash(pkc, ppd);
941 else
942 prb_clear_rxhash(pkc, ppd);
943}
944
eea49cc9 945static void prb_fill_curr_block(char *curr,
bc59ba39 946 struct tpacket_kbdq_core *pkc,
947 struct tpacket_block_desc *pbd,
f6fb8f10 948 unsigned int len)
949{
950 struct tpacket3_hdr *ppd;
951
952 ppd = (struct tpacket3_hdr *)curr;
953 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
954 pkc->prev = curr;
955 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
956 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
957 BLOCK_NUM_PKTS(pbd) += 1;
958 atomic_inc(&pkc->blk_fill_in_prog);
959 prb_run_all_ft_ops(pkc, ppd);
960}
961
962/* Assumes caller has the sk->rx_queue.lock */
963static void *__packet_lookup_frame_in_block(struct packet_sock *po,
964 struct sk_buff *skb,
965 int status,
966 unsigned int len
967 )
968{
bc59ba39 969 struct tpacket_kbdq_core *pkc;
970 struct tpacket_block_desc *pbd;
f6fb8f10 971 char *curr, *end;
972
e3192690 973 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 974 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
975
976 /* Queue is frozen when user space is lagging behind */
977 if (prb_queue_frozen(pkc)) {
978 /*
979 * Check if that last block which caused the queue to freeze,
980 * is still in_use by user-space.
981 */
982 if (prb_curr_blk_in_use(pkc, pbd)) {
983 /* Can't record this packet */
984 return NULL;
985 } else {
986 /*
987 * Ok, the block was released by user-space.
988 * Now let's open that block.
989 * opening a block also thaws the queue.
990 * Thawing is a side effect.
991 */
992 prb_open_block(pkc, pbd);
993 }
994 }
995
996 smp_mb();
997 curr = pkc->nxt_offset;
998 pkc->skb = skb;
e3192690 999 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1000
1001 /* first try the current block */
1002 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1003 prb_fill_curr_block(curr, pkc, pbd, len);
1004 return (void *)curr;
1005 }
1006
1007 /* Ok, close the current block */
1008 prb_retire_current_block(pkc, po, 0);
1009
1010 /* Now, try to dispatch the next block */
1011 curr = (char *)prb_dispatch_next_block(pkc, po);
1012 if (curr) {
1013 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1014 prb_fill_curr_block(curr, pkc, pbd, len);
1015 return (void *)curr;
1016 }
1017
1018 /*
1019 * No free blocks are available.user_space hasn't caught up yet.
1020 * Queue was just frozen and now this packet will get dropped.
1021 */
1022 return NULL;
1023}
1024
eea49cc9 1025static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1026 struct sk_buff *skb,
1027 int status, unsigned int len)
1028{
1029 char *curr = NULL;
1030 switch (po->tp_version) {
1031 case TPACKET_V1:
1032 case TPACKET_V2:
1033 curr = packet_lookup_frame(po, &po->rx_ring,
1034 po->rx_ring.head, status);
1035 return curr;
1036 case TPACKET_V3:
1037 return __packet_lookup_frame_in_block(po, skb, status, len);
1038 default:
1039 WARN(1, "TPACKET version not supported\n");
1040 BUG();
99aa3473 1041 return NULL;
f6fb8f10 1042 }
1043}
1044
eea49cc9 1045static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1046 struct packet_ring_buffer *rb,
77f65ebd 1047 unsigned int idx,
f6fb8f10 1048 int status)
1049{
bc59ba39 1050 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1051 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1052
1053 if (status != BLOCK_STATUS(pbd))
1054 return NULL;
1055 return pbd;
1056}
1057
eea49cc9 1058static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1059{
1060 unsigned int prev;
1061 if (rb->prb_bdqc.kactive_blk_num)
1062 prev = rb->prb_bdqc.kactive_blk_num-1;
1063 else
1064 prev = rb->prb_bdqc.knum_blocks-1;
1065 return prev;
1066}
1067
1068/* Assumes caller has held the rx_queue.lock */
eea49cc9 1069static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1070 struct packet_ring_buffer *rb,
1071 int status)
1072{
1073 unsigned int previous = prb_previous_blk_num(rb);
1074 return prb_lookup_block(po, rb, previous, status);
1075}
1076
eea49cc9 1077static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1078 struct packet_ring_buffer *rb,
1079 int status)
1080{
1081 if (po->tp_version <= TPACKET_V2)
1082 return packet_previous_frame(po, rb, status);
1083
1084 return __prb_previous_block(po, rb, status);
1085}
1086
eea49cc9 1087static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1088 struct packet_ring_buffer *rb)
1089{
1090 switch (po->tp_version) {
1091 case TPACKET_V1:
1092 case TPACKET_V2:
1093 return packet_increment_head(rb);
1094 case TPACKET_V3:
1095 default:
1096 WARN(1, "TPACKET version not supported.\n");
1097 BUG();
1098 return;
1099 }
1100}
1101
eea49cc9 1102static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1103 struct packet_ring_buffer *rb,
1104 int status)
1105{
1106 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1107 return packet_lookup_frame(po, rb, previous, status);
1108}
1109
eea49cc9 1110static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1111{
1112 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1113}
1114
77f65ebd
WB
1115static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1116{
1117 struct sock *sk = &po->sk;
1118 bool has_room;
1119
1120 if (po->prot_hook.func != tpacket_rcv)
1121 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1122 <= sk->sk_rcvbuf;
1123
1124 spin_lock(&sk->sk_receive_queue.lock);
1125 if (po->tp_version == TPACKET_V3)
1126 has_room = prb_lookup_block(po, &po->rx_ring,
1127 po->rx_ring.prb_bdqc.kactive_blk_num,
1128 TP_STATUS_KERNEL);
1129 else
1130 has_room = packet_lookup_frame(po, &po->rx_ring,
1131 po->rx_ring.head,
1132 TP_STATUS_KERNEL);
1133 spin_unlock(&sk->sk_receive_queue.lock);
1134
1135 return has_room;
1136}
1137
1da177e4
LT
1138static void packet_sock_destruct(struct sock *sk)
1139{
ed85b565
RC
1140 skb_queue_purge(&sk->sk_error_queue);
1141
547b792c
IJ
1142 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1143 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1144
1145 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1146 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1147 return;
1148 }
1149
17ab56a2 1150 sk_refcnt_debug_dec(sk);
1da177e4
LT
1151}
1152
77f65ebd
WB
1153static unsigned int fanout_demux_hash(struct packet_fanout *f,
1154 struct sk_buff *skb,
1155 unsigned int num)
dc99f600 1156{
77f65ebd 1157 return (((u64)skb->rxhash) * num) >> 32;
dc99f600
DM
1158}
1159
77f65ebd
WB
1160static unsigned int fanout_demux_lb(struct packet_fanout *f,
1161 struct sk_buff *skb,
1162 unsigned int num)
dc99f600 1163{
1753823f 1164 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1165
1753823f 1166 return val % num;
77f65ebd
WB
1167}
1168
1169static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1170 struct sk_buff *skb,
1171 unsigned int num)
1172{
1173 return smp_processor_id() % num;
dc99f600
DM
1174}
1175
77f65ebd
WB
1176static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1177 struct sk_buff *skb,
1178 unsigned int idx, unsigned int skip,
1179 unsigned int num)
95ec3eb4 1180{
77f65ebd 1181 unsigned int i, j;
95ec3eb4 1182
77f65ebd
WB
1183 i = j = min_t(int, f->next[idx], num - 1);
1184 do {
1185 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1186 if (i != j)
1187 f->next[idx] = i;
1188 return i;
1189 }
1190 if (++i == num)
1191 i = 0;
1192 } while (i != j);
1193
1194 return idx;
1195}
1196
1197static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1198{
1199 return f->flags & (flag >> 8);
95ec3eb4
DM
1200}
1201
95ec3eb4
DM
1202static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1203 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1204{
1205 struct packet_fanout *f = pt->af_packet_priv;
10fb796a 1206 unsigned int num = ACCESS_ONCE(f->num_members);
dc99f600 1207 struct packet_sock *po;
77f65ebd 1208 unsigned int idx;
dc99f600
DM
1209
1210 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1211 !num) {
1212 kfree_skb(skb);
1213 return 0;
1214 }
1215
95ec3eb4
DM
1216 switch (f->type) {
1217 case PACKET_FANOUT_HASH:
1218 default:
77f65ebd 1219 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1220 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1221 if (!skb)
1222 return 0;
1223 }
1224 skb_get_rxhash(skb);
77f65ebd 1225 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1226 break;
1227 case PACKET_FANOUT_LB:
77f65ebd 1228 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1229 break;
1230 case PACKET_FANOUT_CPU:
77f65ebd
WB
1231 idx = fanout_demux_cpu(f, skb, num);
1232 break;
1233 case PACKET_FANOUT_ROLLOVER:
1234 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1235 break;
dc99f600
DM
1236 }
1237
77f65ebd
WB
1238 po = pkt_sk(f->arr[idx]);
1239 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1240 unlikely(!packet_rcv_has_room(po, skb))) {
1241 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1242 po = pkt_sk(f->arr[idx]);
1243 }
dc99f600
DM
1244
1245 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1246}
1247
fff3321d
PE
1248DEFINE_MUTEX(fanout_mutex);
1249EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1250static LIST_HEAD(fanout_list);
1251
1252static void __fanout_link(struct sock *sk, struct packet_sock *po)
1253{
1254 struct packet_fanout *f = po->fanout;
1255
1256 spin_lock(&f->lock);
1257 f->arr[f->num_members] = sk;
1258 smp_wmb();
1259 f->num_members++;
e4284052
AS
1260 if (f->num_members == 1)
1261 dev_add_pack(&f->prot_hook);
dc99f600
DM
1262 spin_unlock(&f->lock);
1263}
1264
1265static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1266{
1267 struct packet_fanout *f = po->fanout;
1268 int i;
1269
1270 spin_lock(&f->lock);
1271 for (i = 0; i < f->num_members; i++) {
1272 if (f->arr[i] == sk)
1273 break;
1274 }
1275 BUG_ON(i >= f->num_members);
1276 f->arr[i] = f->arr[f->num_members - 1];
1277 f->num_members--;
e4284052
AS
1278 if (f->num_members == 0)
1279 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1280 spin_unlock(&f->lock);
1281}
1282
a0dfb263 1283static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
c0de08d0
EL
1284{
1285 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1286 return true;
1287
1288 return false;
1289}
1290
7736d33f 1291static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1292{
1293 struct packet_sock *po = pkt_sk(sk);
1294 struct packet_fanout *f, *match;
7736d33f 1295 u8 type = type_flags & 0xff;
77f65ebd 1296 u8 flags = type_flags >> 8;
dc99f600
DM
1297 int err;
1298
1299 switch (type) {
77f65ebd
WB
1300 case PACKET_FANOUT_ROLLOVER:
1301 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1302 return -EINVAL;
dc99f600
DM
1303 case PACKET_FANOUT_HASH:
1304 case PACKET_FANOUT_LB:
95ec3eb4 1305 case PACKET_FANOUT_CPU:
dc99f600
DM
1306 break;
1307 default:
1308 return -EINVAL;
1309 }
1310
0e0321d2
ED
1311 mutex_lock(&fanout_mutex);
1312
c3b4c3a2
ED
1313 err = -EINVAL;
1314 if (!po->running)
1315 goto out;
1316
0e0321d2 1317 err = -EALREADY;
dc99f600 1318 if (po->fanout)
0e0321d2 1319 goto out;
dc99f600 1320
dc99f600
DM
1321 match = NULL;
1322 list_for_each_entry(f, &fanout_list, list) {
1323 if (f->id == id &&
1324 read_pnet(&f->net) == sock_net(sk)) {
1325 match = f;
1326 break;
1327 }
1328 }
afe62c68 1329 err = -EINVAL;
77f65ebd 1330 if (match && match->flags != flags)
afe62c68 1331 goto out;
dc99f600 1332 if (!match) {
afe62c68 1333 err = -ENOMEM;
dc99f600 1334 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1335 if (!match)
1336 goto out;
1337 write_pnet(&match->net, sock_net(sk));
1338 match->id = id;
1339 match->type = type;
77f65ebd 1340 match->flags = flags;
afe62c68
ED
1341 atomic_set(&match->rr_cur, 0);
1342 INIT_LIST_HEAD(&match->list);
1343 spin_lock_init(&match->lock);
1344 atomic_set(&match->sk_ref, 0);
1345 match->prot_hook.type = po->prot_hook.type;
1346 match->prot_hook.dev = po->prot_hook.dev;
1347 match->prot_hook.func = packet_rcv_fanout;
1348 match->prot_hook.af_packet_priv = match;
c0de08d0 1349 match->prot_hook.id_match = match_fanout_group;
afe62c68 1350 list_add(&match->list, &fanout_list);
dc99f600 1351 }
afe62c68 1352 err = -EINVAL;
e3f540a7
WB
1353
1354 spin_lock(&po->bind_lock);
1355 if (po->running &&
1356 match->type == type &&
afe62c68
ED
1357 match->prot_hook.type == po->prot_hook.type &&
1358 match->prot_hook.dev == po->prot_hook.dev) {
1359 err = -ENOSPC;
1360 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1361 __dev_remove_pack(&po->prot_hook);
1362 po->fanout = match;
1363 atomic_inc(&match->sk_ref);
1364 __fanout_link(sk, po);
1365 err = 0;
dc99f600
DM
1366 }
1367 }
e3f540a7
WB
1368 spin_unlock(&po->bind_lock);
1369
1370 if (err && !atomic_read(&match->sk_ref)) {
1371 list_del(&match->list);
1372 kfree(match);
1373 }
1374
afe62c68 1375out:
dc99f600
DM
1376 mutex_unlock(&fanout_mutex);
1377 return err;
1378}
1379
e4284052
AS
1380/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1381 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1382 * It is the responsibility of the caller to call fanout_release_data() and
1383 * free the returned packet_fanout (after synchronize_net())
1384 */
1385static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1386{
1387 struct packet_sock *po = pkt_sk(sk);
1388 struct packet_fanout *f;
1389
fff3321d 1390 mutex_lock(&fanout_mutex);
0e0321d2
ED
1391 f = po->fanout;
1392 if (f) {
1393 po->fanout = NULL;
dc99f600 1394
e4284052 1395 if (atomic_dec_and_test(&f->sk_ref))
0e0321d2 1396 list_del(&f->list);
e4284052
AS
1397 else
1398 f = NULL;
dc99f600
DM
1399 }
1400 mutex_unlock(&fanout_mutex);
e4284052
AS
1401
1402 return f;
dc99f600 1403}
1da177e4 1404
90ddc4f0 1405static const struct proto_ops packet_ops;
1da177e4 1406
90ddc4f0 1407static const struct proto_ops packet_ops_spkt;
1da177e4 1408
40d4e3df
ED
1409static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1410 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1411{
1412 struct sock *sk;
1413 struct sockaddr_pkt *spkt;
1414
1415 /*
1416 * When we registered the protocol we saved the socket in the data
1417 * field for just this event.
1418 */
1419
1420 sk = pt->af_packet_priv;
1ce4f28b 1421
1da177e4
LT
1422 /*
1423 * Yank back the headers [hope the device set this
1424 * right or kerboom...]
1425 *
1426 * Incoming packets have ll header pulled,
1427 * push it back.
1428 *
98e399f8 1429 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1430 * so that this procedure is noop.
1431 */
1432
1433 if (skb->pkt_type == PACKET_LOOPBACK)
1434 goto out;
1435
09ad9bc7 1436 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1437 goto out;
1438
40d4e3df
ED
1439 skb = skb_share_check(skb, GFP_ATOMIC);
1440 if (skb == NULL)
1da177e4
LT
1441 goto oom;
1442
1443 /* drop any routing info */
adf30907 1444 skb_dst_drop(skb);
1da177e4 1445
84531c24
PO
1446 /* drop conntrack reference */
1447 nf_reset(skb);
1448
ffbc6111 1449 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1450
98e399f8 1451 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1452
1453 /*
1454 * The SOCK_PACKET socket receives _all_ frames.
1455 */
1456
1457 spkt->spkt_family = dev->type;
1458 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1459 spkt->spkt_protocol = skb->protocol;
1460
1461 /*
1462 * Charge the memory to the socket. This is done specifically
1463 * to prevent sockets using all the memory up.
1464 */
1465
40d4e3df 1466 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1467 return 0;
1468
1469out:
1470 kfree_skb(skb);
1471oom:
1472 return 0;
1473}
1474
1475
1476/*
1477 * Output a raw packet to a device layer. This bypasses all the other
1478 * protocol layers and you must therefore supply it with a complete frame
1479 */
1ce4f28b 1480
1da177e4
LT
1481static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1482 struct msghdr *msg, size_t len)
1483{
1484 struct sock *sk = sock->sk;
40d4e3df 1485 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1486 struct sk_buff *skb = NULL;
1da177e4 1487 struct net_device *dev;
40d4e3df 1488 __be16 proto = 0;
1da177e4 1489 int err;
3bdc0eba 1490 int extra_len = 0;
1ce4f28b 1491
1da177e4 1492 /*
1ce4f28b 1493 * Get and verify the address.
1da177e4
LT
1494 */
1495
40d4e3df 1496 if (saddr) {
1da177e4 1497 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1498 return -EINVAL;
1499 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1500 proto = saddr->spkt_protocol;
1501 } else
1502 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1503
1504 /*
1ce4f28b 1505 * Find the device first to size check it
1da177e4
LT
1506 */
1507
de74e92a 1508 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1509retry:
654d1f8a
ED
1510 rcu_read_lock();
1511 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1512 err = -ENODEV;
1513 if (dev == NULL)
1514 goto out_unlock;
1ce4f28b 1515
d5e76b0a
DM
1516 err = -ENETDOWN;
1517 if (!(dev->flags & IFF_UP))
1518 goto out_unlock;
1519
1da177e4 1520 /*
40d4e3df
ED
1521 * You may not queue a frame bigger than the mtu. This is the lowest level
1522 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1523 */
1ce4f28b 1524
3bdc0eba
BG
1525 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1526 if (!netif_supports_nofcs(dev)) {
1527 err = -EPROTONOSUPPORT;
1528 goto out_unlock;
1529 }
1530 extra_len = 4; /* We're doing our own CRC */
1531 }
1532
1da177e4 1533 err = -EMSGSIZE;
3bdc0eba 1534 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1535 goto out_unlock;
1536
1a35ca80
ED
1537 if (!skb) {
1538 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1539 int tlen = dev->needed_tailroom;
1a35ca80
ED
1540 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1541
1542 rcu_read_unlock();
4ce40912 1543 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1544 if (skb == NULL)
1545 return -ENOBUFS;
1546 /* FIXME: Save some space for broken drivers that write a hard
1547 * header at transmission time by themselves. PPP is the notable
1548 * one here. This should really be fixed at the driver level.
1549 */
1550 skb_reserve(skb, reserved);
1551 skb_reset_network_header(skb);
1552
1553 /* Try to align data part correctly */
1554 if (hhlen) {
1555 skb->data -= hhlen;
1556 skb->tail -= hhlen;
1557 if (len < hhlen)
1558 skb_reset_network_header(skb);
1559 }
1560 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1561 if (err)
1562 goto out_free;
1563 goto retry;
1da177e4
LT
1564 }
1565
3bdc0eba 1566 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1567 /* Earlier code assumed this would be a VLAN pkt,
1568 * double-check this now that we have the actual
1569 * packet in hand.
1570 */
1571 struct ethhdr *ehdr;
1572 skb_reset_mac_header(skb);
1573 ehdr = eth_hdr(skb);
1574 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1575 err = -EMSGSIZE;
1576 goto out_unlock;
1577 }
1578 }
1a35ca80 1579
1da177e4
LT
1580 skb->protocol = proto;
1581 skb->dev = dev;
1582 skb->priority = sk->sk_priority;
2d37a186 1583 skb->mark = sk->sk_mark;
bf84a010
DB
1584
1585 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1586
3bdc0eba
BG
1587 if (unlikely(extra_len == 4))
1588 skb->no_fcs = 1;
1589
40893fd0 1590 skb_probe_transport_header(skb, 0);
c1aad275 1591
1da177e4 1592 dev_queue_xmit(skb);
654d1f8a 1593 rcu_read_unlock();
40d4e3df 1594 return len;
1da177e4 1595
1da177e4 1596out_unlock:
654d1f8a 1597 rcu_read_unlock();
1a35ca80
ED
1598out_free:
1599 kfree_skb(skb);
1da177e4
LT
1600 return err;
1601}
1da177e4 1602
eea49cc9 1603static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1604 const struct sock *sk,
dbcb5855 1605 unsigned int res)
1da177e4
LT
1606{
1607 struct sk_filter *filter;
fda9ef5d 1608
80f8f102
ED
1609 rcu_read_lock();
1610 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1611 if (filter != NULL)
0a14842f 1612 res = SK_RUN_FILTER(filter, skb);
80f8f102 1613 rcu_read_unlock();
1da177e4 1614
dbcb5855 1615 return res;
1da177e4
LT
1616}
1617
1618/*
62ab0812
ED
1619 * This function makes lazy skb cloning in hope that most of packets
1620 * are discarded by BPF.
1621 *
1622 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1623 * and skb->cb are mangled. It works because (and until) packets
1624 * falling here are owned by current CPU. Output packets are cloned
1625 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1626 * sequencially, so that if we return skb to original state on exit,
1627 * we will not harm anyone.
1da177e4
LT
1628 */
1629
40d4e3df
ED
1630static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1631 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1632{
1633 struct sock *sk;
1634 struct sockaddr_ll *sll;
1635 struct packet_sock *po;
40d4e3df 1636 u8 *skb_head = skb->data;
1da177e4 1637 int skb_len = skb->len;
dbcb5855 1638 unsigned int snaplen, res;
1da177e4
LT
1639
1640 if (skb->pkt_type == PACKET_LOOPBACK)
1641 goto drop;
1642
1643 sk = pt->af_packet_priv;
1644 po = pkt_sk(sk);
1645
09ad9bc7 1646 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1647 goto drop;
1648
1da177e4
LT
1649 skb->dev = dev;
1650
3b04ddde 1651 if (dev->header_ops) {
1da177e4 1652 /* The device has an explicit notion of ll header,
62ab0812
ED
1653 * exported to higher levels.
1654 *
1655 * Otherwise, the device hides details of its frame
1656 * structure, so that corresponding packet head is
1657 * never delivered to user.
1da177e4
LT
1658 */
1659 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1660 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1661 else if (skb->pkt_type == PACKET_OUTGOING) {
1662 /* Special case: outgoing packets have ll header at head */
bbe735e4 1663 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1664 }
1665 }
1666
1667 snaplen = skb->len;
1668
dbcb5855
DM
1669 res = run_filter(skb, sk, snaplen);
1670 if (!res)
fda9ef5d 1671 goto drop_n_restore;
dbcb5855
DM
1672 if (snaplen > res)
1673 snaplen = res;
1da177e4 1674
0fd7bac6 1675 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1676 goto drop_n_acct;
1677
1678 if (skb_shared(skb)) {
1679 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1680 if (nskb == NULL)
1681 goto drop_n_acct;
1682
1683 if (skb_head != skb->data) {
1684 skb->data = skb_head;
1685 skb->len = skb_len;
1686 }
abc4e4fa 1687 consume_skb(skb);
1da177e4
LT
1688 skb = nskb;
1689 }
1690
ffbc6111
HX
1691 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1692 sizeof(skb->cb));
1693
1694 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1695 sll->sll_family = AF_PACKET;
1696 sll->sll_hatype = dev->type;
1697 sll->sll_protocol = skb->protocol;
1698 sll->sll_pkttype = skb->pkt_type;
8032b464 1699 if (unlikely(po->origdev))
80feaacb
PWJ
1700 sll->sll_ifindex = orig_dev->ifindex;
1701 else
1702 sll->sll_ifindex = dev->ifindex;
1da177e4 1703
b95cce35 1704 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1705
ffbc6111 1706 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1707
1da177e4
LT
1708 if (pskb_trim(skb, snaplen))
1709 goto drop_n_acct;
1710
1711 skb_set_owner_r(skb, sk);
1712 skb->dev = NULL;
adf30907 1713 skb_dst_drop(skb);
1da177e4 1714
84531c24
PO
1715 /* drop conntrack reference */
1716 nf_reset(skb);
1717
1da177e4 1718 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1719 po->stats.stats1.tp_packets++;
3b885787 1720 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1721 __skb_queue_tail(&sk->sk_receive_queue, skb);
1722 spin_unlock(&sk->sk_receive_queue.lock);
1723 sk->sk_data_ready(sk, skb->len);
1724 return 0;
1725
1726drop_n_acct:
7091fbd8 1727 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1728 po->stats.stats1.tp_drops++;
7091fbd8
WB
1729 atomic_inc(&sk->sk_drops);
1730 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1731
1732drop_n_restore:
1733 if (skb_head != skb->data && skb_shared(skb)) {
1734 skb->data = skb_head;
1735 skb->len = skb_len;
1736 }
1737drop:
ead2ceb0 1738 consume_skb(skb);
1da177e4
LT
1739 return 0;
1740}
1741
40d4e3df
ED
1742static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1743 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1744{
1745 struct sock *sk;
1746 struct packet_sock *po;
1747 struct sockaddr_ll *sll;
184f489e 1748 union tpacket_uhdr h;
40d4e3df 1749 u8 *skb_head = skb->data;
1da177e4 1750 int skb_len = skb->len;
dbcb5855 1751 unsigned int snaplen, res;
f6fb8f10 1752 unsigned long status = TP_STATUS_USER;
bbd6ef87 1753 unsigned short macoff, netoff, hdrlen;
1da177e4 1754 struct sk_buff *copy_skb = NULL;
bbd6ef87 1755 struct timespec ts;
b9c32fb2 1756 __u32 ts_status;
1da177e4
LT
1757
1758 if (skb->pkt_type == PACKET_LOOPBACK)
1759 goto drop;
1760
1761 sk = pt->af_packet_priv;
1762 po = pkt_sk(sk);
1763
09ad9bc7 1764 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1765 goto drop;
1766
3b04ddde 1767 if (dev->header_ops) {
1da177e4 1768 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1769 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1770 else if (skb->pkt_type == PACKET_OUTGOING) {
1771 /* Special case: outgoing packets have ll header at head */
bbe735e4 1772 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1773 }
1774 }
1775
8dc41944
HX
1776 if (skb->ip_summed == CHECKSUM_PARTIAL)
1777 status |= TP_STATUS_CSUMNOTREADY;
1778
1da177e4
LT
1779 snaplen = skb->len;
1780
dbcb5855
DM
1781 res = run_filter(skb, sk, snaplen);
1782 if (!res)
fda9ef5d 1783 goto drop_n_restore;
dbcb5855
DM
1784 if (snaplen > res)
1785 snaplen = res;
1da177e4
LT
1786
1787 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1788 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1789 po->tp_reserve;
1da177e4 1790 } else {
95c96174 1791 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1792 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1793 (maclen < 16 ? 16 : maclen)) +
1794 po->tp_reserve;
1da177e4
LT
1795 macoff = netoff - maclen;
1796 }
f6fb8f10 1797 if (po->tp_version <= TPACKET_V2) {
1798 if (macoff + snaplen > po->rx_ring.frame_size) {
1799 if (po->copy_thresh &&
0fd7bac6 1800 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1801 if (skb_shared(skb)) {
1802 copy_skb = skb_clone(skb, GFP_ATOMIC);
1803 } else {
1804 copy_skb = skb_get(skb);
1805 skb_head = skb->data;
1806 }
1807 if (copy_skb)
1808 skb_set_owner_r(copy_skb, sk);
1da177e4 1809 }
f6fb8f10 1810 snaplen = po->rx_ring.frame_size - macoff;
1811 if ((int)snaplen < 0)
1812 snaplen = 0;
1da177e4 1813 }
4035ed7b
ED
1814 } else if (unlikely(macoff + snaplen >
1815 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
1816 u32 nval;
1817
1818 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
1819 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
1820 snaplen, nval, macoff);
1821 snaplen = nval;
1822 if (unlikely((int)snaplen < 0)) {
1823 snaplen = 0;
1824 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
1825 }
1da177e4 1826 }
1da177e4 1827 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1828 h.raw = packet_current_rx_frame(po, skb,
1829 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1830 if (!h.raw)
1da177e4 1831 goto ring_is_full;
f6fb8f10 1832 if (po->tp_version <= TPACKET_V2) {
1833 packet_increment_rx_head(po, &po->rx_ring);
1834 /*
1835 * LOSING will be reported till you read the stats,
1836 * because it's COR - Clear On Read.
1837 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1838 * at packet level.
1839 */
ee80fbf3 1840 if (po->stats.stats1.tp_drops)
f6fb8f10 1841 status |= TP_STATUS_LOSING;
1842 }
ee80fbf3 1843 po->stats.stats1.tp_packets++;
1da177e4
LT
1844 if (copy_skb) {
1845 status |= TP_STATUS_COPY;
1846 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1847 }
1da177e4
LT
1848 spin_unlock(&sk->sk_receive_queue.lock);
1849
bbd6ef87 1850 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1851
1852 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1853 getnstimeofday(&ts);
1da177e4 1854
b9c32fb2
DB
1855 status |= ts_status;
1856
bbd6ef87
PM
1857 switch (po->tp_version) {
1858 case TPACKET_V1:
1859 h.h1->tp_len = skb->len;
1860 h.h1->tp_snaplen = snaplen;
1861 h.h1->tp_mac = macoff;
1862 h.h1->tp_net = netoff;
4b457bdf
DB
1863 h.h1->tp_sec = ts.tv_sec;
1864 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1865 hdrlen = sizeof(*h.h1);
1866 break;
1867 case TPACKET_V2:
1868 h.h2->tp_len = skb->len;
1869 h.h2->tp_snaplen = snaplen;
1870 h.h2->tp_mac = macoff;
1871 h.h2->tp_net = netoff;
bbd6ef87
PM
1872 h.h2->tp_sec = ts.tv_sec;
1873 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1874 if (vlan_tx_tag_present(skb)) {
1875 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1876 status |= TP_STATUS_VLAN_VALID;
1877 } else {
1878 h.h2->tp_vlan_tci = 0;
1879 }
13fcb7bd 1880 h.h2->tp_padding = 0;
bbd6ef87
PM
1881 hdrlen = sizeof(*h.h2);
1882 break;
f6fb8f10 1883 case TPACKET_V3:
1884 /* tp_nxt_offset,vlan are already populated above.
1885 * So DONT clear those fields here
1886 */
1887 h.h3->tp_status |= status;
1888 h.h3->tp_len = skb->len;
1889 h.h3->tp_snaplen = snaplen;
1890 h.h3->tp_mac = macoff;
1891 h.h3->tp_net = netoff;
f6fb8f10 1892 h.h3->tp_sec = ts.tv_sec;
1893 h.h3->tp_nsec = ts.tv_nsec;
1894 hdrlen = sizeof(*h.h3);
1895 break;
bbd6ef87
PM
1896 default:
1897 BUG();
1898 }
1da177e4 1899
bbd6ef87 1900 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1901 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1902 sll->sll_family = AF_PACKET;
1903 sll->sll_hatype = dev->type;
1904 sll->sll_protocol = skb->protocol;
1905 sll->sll_pkttype = skb->pkt_type;
8032b464 1906 if (unlikely(po->origdev))
80feaacb
PWJ
1907 sll->sll_ifindex = orig_dev->ifindex;
1908 else
1909 sll->sll_ifindex = dev->ifindex;
1da177e4 1910
e16aa207 1911 smp_mb();
f6dafa95 1912#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1913 {
0af55bb5
CG
1914 u8 *start, *end;
1915
f6fb8f10 1916 if (po->tp_version <= TPACKET_V2) {
1917 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1918 + macoff + snaplen);
1919 for (start = h.raw; start < end; start += PAGE_SIZE)
1920 flush_dcache_page(pgv_to_page(start));
1921 }
cc9f01b2 1922 smp_wmb();
1da177e4 1923 }
f6dafa95 1924#endif
f6fb8f10 1925 if (po->tp_version <= TPACKET_V2)
1926 __packet_set_status(po, h.raw, status);
1927 else
1928 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1929
1930 sk->sk_data_ready(sk, 0);
1931
1932drop_n_restore:
1933 if (skb_head != skb->data && skb_shared(skb)) {
1934 skb->data = skb_head;
1935 skb->len = skb_len;
1936 }
1937drop:
1ce4f28b 1938 kfree_skb(skb);
1da177e4
LT
1939 return 0;
1940
1941ring_is_full:
ee80fbf3 1942 po->stats.stats1.tp_drops++;
1da177e4
LT
1943 spin_unlock(&sk->sk_receive_queue.lock);
1944
1945 sk->sk_data_ready(sk, 0);
acb5d75b 1946 kfree_skb(copy_skb);
1da177e4
LT
1947 goto drop_n_restore;
1948}
1949
69e3c75f
JB
1950static void tpacket_destruct_skb(struct sk_buff *skb)
1951{
1952 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1953 void *ph;
1da177e4 1954
69e3c75f 1955 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
1956 __u32 ts;
1957
69e3c75f 1958 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1959 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1960 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
1961
1962 ts = __packet_set_timestamp(po, ph, skb);
1963 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
1964 }
1965
1966 sock_wfree(skb);
1967}
1968
40d4e3df
ED
1969static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1970 void *frame, struct net_device *dev, int size_max,
ae641949 1971 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 1972{
184f489e 1973 union tpacket_uhdr ph;
69e3c75f
JB
1974 int to_write, offset, len, tp_len, nr_frags, len_max;
1975 struct socket *sock = po->sk.sk_socket;
1976 struct page *page;
1977 void *data;
1978 int err;
1979
1980 ph.raw = frame;
1981
1982 skb->protocol = proto;
1983 skb->dev = dev;
1984 skb->priority = po->sk.sk_priority;
2d37a186 1985 skb->mark = po->sk.sk_mark;
2e31396f 1986 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
1987 skb_shinfo(skb)->destructor_arg = ph.raw;
1988
1989 switch (po->tp_version) {
1990 case TPACKET_V2:
1991 tp_len = ph.h2->tp_len;
1992 break;
1993 default:
1994 tp_len = ph.h1->tp_len;
1995 break;
1996 }
1997 if (unlikely(tp_len > size_max)) {
40d4e3df 1998 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1999 return -EMSGSIZE;
2000 }
2001
ae641949 2002 skb_reserve(skb, hlen);
69e3c75f 2003 skb_reset_network_header(skb);
40893fd0 2004 skb_probe_transport_header(skb, 0);
c1aad275 2005
5920cd3a
PC
2006 if (po->tp_tx_has_off) {
2007 int off_min, off_max, off;
2008 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2009 off_max = po->tx_ring.frame_size - tp_len;
2010 if (sock->type == SOCK_DGRAM) {
2011 switch (po->tp_version) {
2012 case TPACKET_V2:
2013 off = ph.h2->tp_net;
2014 break;
2015 default:
2016 off = ph.h1->tp_net;
2017 break;
2018 }
2019 } else {
2020 switch (po->tp_version) {
2021 case TPACKET_V2:
2022 off = ph.h2->tp_mac;
2023 break;
2024 default:
2025 off = ph.h1->tp_mac;
2026 break;
2027 }
2028 }
2029 if (unlikely((off < off_min) || (off_max < off)))
2030 return -EINVAL;
2031 data = ph.raw + off;
2032 } else {
2033 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2034 }
69e3c75f
JB
2035 to_write = tp_len;
2036
2037 if (sock->type == SOCK_DGRAM) {
2038 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2039 NULL, tp_len);
2040 if (unlikely(err < 0))
2041 return -EINVAL;
40d4e3df 2042 } else if (dev->hard_header_len) {
69e3c75f
JB
2043 /* net device doesn't like empty head */
2044 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2045 pr_err("packet size is too short (%d < %d)\n",
2046 tp_len, dev->hard_header_len);
69e3c75f
JB
2047 return -EINVAL;
2048 }
2049
2050 skb_push(skb, dev->hard_header_len);
2051 err = skb_store_bits(skb, 0, data,
2052 dev->hard_header_len);
2053 if (unlikely(err))
2054 return err;
2055
2056 data += dev->hard_header_len;
2057 to_write -= dev->hard_header_len;
2058 }
2059
69e3c75f
JB
2060 offset = offset_in_page(data);
2061 len_max = PAGE_SIZE - offset;
2062 len = ((to_write > len_max) ? len_max : to_write);
2063
2064 skb->data_len = to_write;
2065 skb->len += to_write;
2066 skb->truesize += to_write;
2067 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2068
2069 while (likely(to_write)) {
2070 nr_frags = skb_shinfo(skb)->nr_frags;
2071
2072 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2073 pr_err("Packet exceed the number of skb frags(%lu)\n",
2074 MAX_SKB_FRAGS);
69e3c75f
JB
2075 return -EFAULT;
2076 }
2077
0af55bb5
CG
2078 page = pgv_to_page(data);
2079 data += len;
69e3c75f
JB
2080 flush_dcache_page(page);
2081 get_page(page);
0af55bb5 2082 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2083 to_write -= len;
2084 offset = 0;
2085 len_max = PAGE_SIZE;
2086 len = ((to_write > len_max) ? len_max : to_write);
2087 }
2088
2089 return tp_len;
2090}
2091
2092static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2093{
69e3c75f
JB
2094 struct sk_buff *skb;
2095 struct net_device *dev;
2096 __be16 proto;
827d9780 2097 int err, reserve = 0;
40d4e3df
ED
2098 void *ph;
2099 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2100 int tp_len, size_max;
2101 unsigned char *addr;
2102 int len_sum = 0;
9e67030a 2103 int status = TP_STATUS_AVAILABLE;
ae641949 2104 int hlen, tlen;
69e3c75f 2105
69e3c75f
JB
2106 mutex_lock(&po->pg_vec_lock);
2107
c3ac8a13 2108 if (likely(saddr == NULL)) {
026bb405 2109 dev = packet_cached_dev_get(po);
69e3c75f
JB
2110 proto = po->num;
2111 addr = NULL;
2112 } else {
2113 err = -EINVAL;
2114 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2115 goto out;
2116 if (msg->msg_namelen < (saddr->sll_halen
2117 + offsetof(struct sockaddr_ll,
2118 sll_addr)))
2119 goto out;
69e3c75f
JB
2120 proto = saddr->sll_protocol;
2121 addr = saddr->sll_addr;
827d9780 2122 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2123 }
2124
69e3c75f
JB
2125 err = -ENXIO;
2126 if (unlikely(dev == NULL))
2127 goto out;
69e3c75f
JB
2128 err = -ENETDOWN;
2129 if (unlikely(!(dev->flags & IFF_UP)))
2130 goto out_put;
2131
026bb405
DB
2132 reserve = dev->hard_header_len;
2133
69e3c75f 2134 size_max = po->tx_ring.frame_size
b5dd884e 2135 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2136
2137 if (size_max > dev->mtu + reserve)
2138 size_max = dev->mtu + reserve;
2139
2140 do {
2141 ph = packet_current_frame(po, &po->tx_ring,
2142 TP_STATUS_SEND_REQUEST);
2143
2144 if (unlikely(ph == NULL)) {
2145 schedule();
2146 continue;
2147 }
2148
2149 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2150 hlen = LL_RESERVED_SPACE(dev);
2151 tlen = dev->needed_tailroom;
69e3c75f 2152 skb = sock_alloc_send_skb(&po->sk,
ae641949 2153 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2154 0, &err);
2155
2156 if (unlikely(skb == NULL))
2157 goto out_status;
2158
2159 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2160 addr, hlen);
69e3c75f
JB
2161
2162 if (unlikely(tp_len < 0)) {
2163 if (po->tp_loss) {
2164 __packet_set_status(po, ph,
2165 TP_STATUS_AVAILABLE);
2166 packet_increment_head(&po->tx_ring);
2167 kfree_skb(skb);
2168 continue;
2169 } else {
2170 status = TP_STATUS_WRONG_FORMAT;
2171 err = tp_len;
2172 goto out_status;
2173 }
2174 }
2175
2176 skb->destructor = tpacket_destruct_skb;
2177 __packet_set_status(po, ph, TP_STATUS_SENDING);
2178 atomic_inc(&po->tx_ring.pending);
2179
2180 status = TP_STATUS_SEND_REQUEST;
2181 err = dev_queue_xmit(skb);
eb70df13
JP
2182 if (unlikely(err > 0)) {
2183 err = net_xmit_errno(err);
2184 if (err && __packet_get_status(po, ph) ==
2185 TP_STATUS_AVAILABLE) {
2186 /* skb was destructed already */
2187 skb = NULL;
2188 goto out_status;
2189 }
2190 /*
2191 * skb was dropped but not destructed yet;
2192 * let's treat it like congestion or err < 0
2193 */
2194 err = 0;
2195 }
69e3c75f
JB
2196 packet_increment_head(&po->tx_ring);
2197 len_sum += tp_len;
f64f9e71
JP
2198 } while (likely((ph != NULL) ||
2199 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2200 (atomic_read(&po->tx_ring.pending))))
2201 );
69e3c75f
JB
2202
2203 err = len_sum;
2204 goto out_put;
2205
69e3c75f
JB
2206out_status:
2207 __packet_set_status(po, ph, status);
2208 kfree_skb(skb);
2209out_put:
026bb405 2210 dev_put(dev);
69e3c75f
JB
2211out:
2212 mutex_unlock(&po->pg_vec_lock);
2213 return err;
2214}
69e3c75f 2215
eea49cc9
OJ
2216static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2217 size_t reserve, size_t len,
2218 size_t linear, int noblock,
2219 int *err)
bfd5f4a3
SS
2220{
2221 struct sk_buff *skb;
2222
2223 /* Under a page? Don't bother with paged skb. */
2224 if (prepad + len < PAGE_SIZE || !linear)
2225 linear = len;
2226
2227 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2228 err);
2229 if (!skb)
2230 return NULL;
2231
2232 skb_reserve(skb, reserve);
2233 skb_put(skb, linear);
2234 skb->data_len = len - linear;
2235 skb->len += len - linear;
2236
2237 return skb;
2238}
2239
69e3c75f 2240static int packet_snd(struct socket *sock,
1da177e4
LT
2241 struct msghdr *msg, size_t len)
2242{
2243 struct sock *sk = sock->sk;
40d4e3df 2244 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2245 struct sk_buff *skb;
2246 struct net_device *dev;
0e11c91e 2247 __be16 proto;
1da177e4 2248 unsigned char *addr;
827d9780 2249 int err, reserve = 0;
bfd5f4a3
SS
2250 struct virtio_net_hdr vnet_hdr = { 0 };
2251 int offset = 0;
2252 int vnet_hdr_len;
2253 struct packet_sock *po = pkt_sk(sk);
2254 unsigned short gso_type = 0;
73cd9b10 2255 int hlen, tlen, linear;
3bdc0eba 2256 int extra_len = 0;
1da177e4
LT
2257
2258 /*
1ce4f28b 2259 * Get and verify the address.
1da177e4 2260 */
1ce4f28b 2261
c3ac8a13 2262 if (likely(saddr == NULL)) {
026bb405 2263 dev = packet_cached_dev_get(po);
1da177e4
LT
2264 proto = po->num;
2265 addr = NULL;
2266 } else {
2267 err = -EINVAL;
2268 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2269 goto out;
0fb375fb
EB
2270 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2271 goto out;
1da177e4
LT
2272 proto = saddr->sll_protocol;
2273 addr = saddr->sll_addr;
827d9780 2274 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2275 }
2276
1da177e4 2277 err = -ENXIO;
026bb405 2278 if (unlikely(dev == NULL))
1da177e4 2279 goto out_unlock;
d5e76b0a 2280 err = -ENETDOWN;
026bb405 2281 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2282 goto out_unlock;
2283
026bb405
DB
2284 if (sock->type == SOCK_RAW)
2285 reserve = dev->hard_header_len;
bfd5f4a3
SS
2286 if (po->has_vnet_hdr) {
2287 vnet_hdr_len = sizeof(vnet_hdr);
2288
2289 err = -EINVAL;
2290 if (len < vnet_hdr_len)
2291 goto out_unlock;
2292
2293 len -= vnet_hdr_len;
2294
2295 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2296 vnet_hdr_len);
2297 if (err < 0)
2298 goto out_unlock;
2299
2300 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2301 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2302 vnet_hdr.hdr_len))
2303 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2304 vnet_hdr.csum_offset + 2;
2305
2306 err = -EINVAL;
2307 if (vnet_hdr.hdr_len > len)
2308 goto out_unlock;
2309
2310 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2311 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2312 case VIRTIO_NET_HDR_GSO_TCPV4:
2313 gso_type = SKB_GSO_TCPV4;
2314 break;
2315 case VIRTIO_NET_HDR_GSO_TCPV6:
2316 gso_type = SKB_GSO_TCPV6;
2317 break;
2318 case VIRTIO_NET_HDR_GSO_UDP:
2319 gso_type = SKB_GSO_UDP;
2320 break;
2321 default:
2322 goto out_unlock;
2323 }
2324
2325 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2326 gso_type |= SKB_GSO_TCP_ECN;
2327
2328 if (vnet_hdr.gso_size == 0)
2329 goto out_unlock;
2330
2331 }
2332 }
2333
3bdc0eba
BG
2334 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2335 if (!netif_supports_nofcs(dev)) {
2336 err = -EPROTONOSUPPORT;
2337 goto out_unlock;
2338 }
2339 extra_len = 4; /* We're doing our own CRC */
2340 }
2341
1da177e4 2342 err = -EMSGSIZE;
3bdc0eba 2343 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2344 goto out_unlock;
2345
bfd5f4a3 2346 err = -ENOBUFS;
ae641949
HX
2347 hlen = LL_RESERVED_SPACE(dev);
2348 tlen = dev->needed_tailroom;
73cd9b10
WB
2349 linear = vnet_hdr.hdr_len;
2350 linear = max(linear, min_t(int, len, dev->hard_header_len));
2351 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2352 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2353 if (skb == NULL)
1da177e4
LT
2354 goto out_unlock;
2355
bfd5f4a3 2356 skb_set_network_header(skb, reserve);
1da177e4 2357
0c4e8581
SH
2358 err = -EINVAL;
2359 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2360 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2361 goto out_free;
1da177e4
LT
2362
2363 /* Returns -EFAULT on error */
bfd5f4a3 2364 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2365 if (err)
2366 goto out_free;
bf84a010
DB
2367
2368 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2369
3bdc0eba 2370 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2371 /* Earlier code assumed this would be a VLAN pkt,
2372 * double-check this now that we have the actual
2373 * packet in hand.
2374 */
2375 struct ethhdr *ehdr;
2376 skb_reset_mac_header(skb);
2377 ehdr = eth_hdr(skb);
2378 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2379 err = -EMSGSIZE;
2380 goto out_free;
2381 }
2382 }
2383
1da177e4
LT
2384 skb->protocol = proto;
2385 skb->dev = dev;
2386 skb->priority = sk->sk_priority;
2d37a186 2387 skb->mark = sk->sk_mark;
1da177e4 2388
bfd5f4a3
SS
2389 if (po->has_vnet_hdr) {
2390 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2391 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2392 vnet_hdr.csum_offset)) {
2393 err = -EINVAL;
2394 goto out_free;
2395 }
2396 }
2397
2398 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2399 skb_shinfo(skb)->gso_type = gso_type;
2400
2401 /* Header must be checked, and gso_segs computed. */
2402 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2403 skb_shinfo(skb)->gso_segs = 0;
2404
2405 len += vnet_hdr_len;
2406 }
2407
40893fd0 2408 skb_probe_transport_header(skb, reserve);
c1aad275 2409
3bdc0eba
BG
2410 if (unlikely(extra_len == 4))
2411 skb->no_fcs = 1;
2412
1da177e4
LT
2413 /*
2414 * Now send it
2415 */
2416
2417 err = dev_queue_xmit(skb);
2418 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2419 goto out_unlock;
2420
026bb405 2421 dev_put(dev);
1da177e4 2422
40d4e3df 2423 return len;
1da177e4
LT
2424
2425out_free:
2426 kfree_skb(skb);
2427out_unlock:
026bb405 2428 if (dev)
1da177e4
LT
2429 dev_put(dev);
2430out:
2431 return err;
2432}
2433
69e3c75f
JB
2434static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2435 struct msghdr *msg, size_t len)
2436{
69e3c75f
JB
2437 struct sock *sk = sock->sk;
2438 struct packet_sock *po = pkt_sk(sk);
2439 if (po->tx_ring.pg_vec)
2440 return tpacket_snd(po, msg);
2441 else
69e3c75f
JB
2442 return packet_snd(sock, msg, len);
2443}
2444
1da177e4
LT
2445/*
2446 * Close a PACKET socket. This is fairly simple. We immediately go
2447 * to 'closed' state and remove our protocol entry in the device list.
2448 */
2449
2450static int packet_release(struct socket *sock)
2451{
2452 struct sock *sk = sock->sk;
2453 struct packet_sock *po;
e4284052 2454 struct packet_fanout *f;
d12d01d6 2455 struct net *net;
f6fb8f10 2456 union tpacket_req_u req_u;
1da177e4
LT
2457
2458 if (!sk)
2459 return 0;
2460
3b1e0a65 2461 net = sock_net(sk);
1da177e4
LT
2462 po = pkt_sk(sk);
2463
0fa7fa98 2464 mutex_lock(&net->packet.sklist_lock);
808f5114 2465 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2466 mutex_unlock(&net->packet.sklist_lock);
2467
2468 preempt_disable();
920de804 2469 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2470 preempt_enable();
1da177e4 2471
808f5114 2472 spin_lock(&po->bind_lock);
ce06b03e 2473 unregister_prot_hook(sk, false);
c3ac8a13
DB
2474 packet_cached_dev_reset(po);
2475
160ff18a
BG
2476 if (po->prot_hook.dev) {
2477 dev_put(po->prot_hook.dev);
2478 po->prot_hook.dev = NULL;
2479 }
808f5114 2480 spin_unlock(&po->bind_lock);
1da177e4 2481
1da177e4 2482 packet_flush_mclist(sk);
1da177e4 2483
9665d5d6
PS
2484 if (po->rx_ring.pg_vec) {
2485 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2486 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2487 }
69e3c75f 2488
9665d5d6
PS
2489 if (po->tx_ring.pg_vec) {
2490 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2491 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2492 }
1da177e4 2493
e4284052 2494 f = fanout_release(sk);
dc99f600 2495
808f5114 2496 synchronize_net();
e4284052
AS
2497
2498 if (f) {
2499 kfree(f);
2500 }
1da177e4
LT
2501 /*
2502 * Now the socket is dead. No more input will appear.
2503 */
1da177e4
LT
2504 sock_orphan(sk);
2505 sock->sk = NULL;
2506
2507 /* Purge queues */
2508
2509 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2510 sk_refcnt_debug_release(sk);
1da177e4
LT
2511
2512 sock_put(sk);
2513 return 0;
2514}
2515
2516/*
2517 * Attach a packet hook.
2518 */
2519
0e11c91e 2520static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2521{
2522 struct packet_sock *po = pkt_sk(sk);
671058d0
MW
2523 int ret = 0;
2524
2525 lock_sock(sk);
2526
2527 spin_lock(&po->bind_lock);
dc99f600 2528
aef950b4
WY
2529 if (po->fanout) {
2530 if (dev)
2531 dev_put(dev);
2532
671058d0
MW
2533 ret = -EINVAL;
2534 goto out_unlock;
aef950b4 2535 }
1da177e4 2536
ce06b03e 2537 unregister_prot_hook(sk, true);
c3ac8a13 2538
1da177e4
LT
2539 po->num = protocol;
2540 po->prot_hook.type = protocol;
160ff18a
BG
2541 if (po->prot_hook.dev)
2542 dev_put(po->prot_hook.dev);
1da177e4 2543
c3ac8a13 2544 po->prot_hook.dev = dev;
1da177e4
LT
2545 po->ifindex = dev ? dev->ifindex : 0;
2546
c3ac8a13
DB
2547 packet_cached_dev_assign(po, dev);
2548
1da177e4
LT
2549 if (protocol == 0)
2550 goto out_unlock;
2551
be85d4ad 2552 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2553 register_prot_hook(sk);
be85d4ad
UT
2554 } else {
2555 sk->sk_err = ENETDOWN;
2556 if (!sock_flag(sk, SOCK_DEAD))
2557 sk->sk_error_report(sk);
1da177e4
LT
2558 }
2559
2560out_unlock:
2561 spin_unlock(&po->bind_lock);
2562 release_sock(sk);
671058d0 2563 return ret;
1da177e4
LT
2564}
2565
2566/*
2567 * Bind a packet socket to a device
2568 */
2569
40d4e3df
ED
2570static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2571 int addr_len)
1da177e4 2572{
40d4e3df 2573 struct sock *sk = sock->sk;
d5fc9fd5 2574 char name[sizeof(uaddr->sa_data) + 1];
1da177e4
LT
2575 struct net_device *dev;
2576 int err = -ENODEV;
1ce4f28b 2577
1da177e4
LT
2578 /*
2579 * Check legality
2580 */
1ce4f28b 2581
8ae55f04 2582 if (addr_len != sizeof(struct sockaddr))
1da177e4 2583 return -EINVAL;
d5fc9fd5
AP
2584 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
2585 * zero-terminated.
2586 */
2587 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
2588 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 2589
3b1e0a65 2590 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2591 if (dev)
1da177e4 2592 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2593 return err;
2594}
1da177e4
LT
2595
2596static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2597{
40d4e3df
ED
2598 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2599 struct sock *sk = sock->sk;
1da177e4
LT
2600 struct net_device *dev = NULL;
2601 int err;
2602
2603
2604 /*
2605 * Check legality
2606 */
1ce4f28b 2607
1da177e4
LT
2608 if (addr_len < sizeof(struct sockaddr_ll))
2609 return -EINVAL;
2610 if (sll->sll_family != AF_PACKET)
2611 return -EINVAL;
2612
2613 if (sll->sll_ifindex) {
2614 err = -ENODEV;
3b1e0a65 2615 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2616 if (dev == NULL)
2617 goto out;
2618 }
2619 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2620
2621out:
2622 return err;
2623}
2624
2625static struct proto packet_proto = {
2626 .name = "PACKET",
2627 .owner = THIS_MODULE,
2628 .obj_size = sizeof(struct packet_sock),
2629};
2630
2631/*
1ce4f28b 2632 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2633 */
2634
3f378b68
EP
2635static int packet_create(struct net *net, struct socket *sock, int protocol,
2636 int kern)
1da177e4
LT
2637{
2638 struct sock *sk;
2639 struct packet_sock *po;
0e11c91e 2640 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2641 int err;
2642
df008c91 2643 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2644 return -EPERM;
be02097c
DM
2645 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2646 sock->type != SOCK_PACKET)
1da177e4
LT
2647 return -ESOCKTNOSUPPORT;
2648
2649 sock->state = SS_UNCONNECTED;
2650
2651 err = -ENOBUFS;
6257ff21 2652 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2653 if (sk == NULL)
2654 goto out;
2655
2656 sock->ops = &packet_ops;
1da177e4
LT
2657 if (sock->type == SOCK_PACKET)
2658 sock->ops = &packet_ops_spkt;
be02097c 2659
1da177e4
LT
2660 sock_init_data(sock, sk);
2661
2662 po = pkt_sk(sk);
2663 sk->sk_family = PF_PACKET;
0e11c91e 2664 po->num = proto;
c3ac8a13
DB
2665
2666 packet_cached_dev_reset(po);
1da177e4
LT
2667
2668 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2669 sk_refcnt_debug_inc(sk);
1da177e4
LT
2670
2671 /*
2672 * Attach a protocol block
2673 */
2674
2675 spin_lock_init(&po->bind_lock);
905db440 2676 mutex_init(&po->pg_vec_lock);
1da177e4 2677 po->prot_hook.func = packet_rcv;
be02097c 2678
1da177e4
LT
2679 if (sock->type == SOCK_PACKET)
2680 po->prot_hook.func = packet_rcv_spkt;
be02097c 2681
1da177e4
LT
2682 po->prot_hook.af_packet_priv = sk;
2683
0e11c91e
AV
2684 if (proto) {
2685 po->prot_hook.type = proto;
ce06b03e 2686 register_prot_hook(sk);
1da177e4
LT
2687 }
2688
0fa7fa98 2689 mutex_lock(&net->packet.sklist_lock);
808f5114 2690 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2691 mutex_unlock(&net->packet.sklist_lock);
2692
2693 preempt_disable();
3680453c 2694 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2695 preempt_enable();
808f5114 2696
40d4e3df 2697 return 0;
1da177e4
LT
2698out:
2699 return err;
2700}
2701
ed85b565
RC
2702static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2703{
2704 struct sock_exterr_skb *serr;
2705 struct sk_buff *skb, *skb2;
2706 int copied, err;
2707
2708 err = -EAGAIN;
2709 skb = skb_dequeue(&sk->sk_error_queue);
2710 if (skb == NULL)
2711 goto out;
2712
2713 copied = skb->len;
2714 if (copied > len) {
2715 msg->msg_flags |= MSG_TRUNC;
2716 copied = len;
2717 }
2718 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2719 if (err)
2720 goto out_free_skb;
2721
2722 sock_recv_timestamp(msg, sk, skb);
2723
2724 serr = SKB_EXT_ERR(skb);
2725 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2726 sizeof(serr->ee), &serr->ee);
2727
2728 msg->msg_flags |= MSG_ERRQUEUE;
2729 err = copied;
2730
2731 /* Reset and regenerate socket error */
2732 spin_lock_bh(&sk->sk_error_queue.lock);
2733 sk->sk_err = 0;
2734 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2735 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2736 spin_unlock_bh(&sk->sk_error_queue.lock);
2737 sk->sk_error_report(sk);
2738 } else
2739 spin_unlock_bh(&sk->sk_error_queue.lock);
2740
2741out_free_skb:
2742 kfree_skb(skb);
2743out:
2744 return err;
2745}
2746
1da177e4
LT
2747/*
2748 * Pull a packet from our receive queue and hand it to the user.
2749 * If necessary we block.
2750 */
2751
2752static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2753 struct msghdr *msg, size_t len, int flags)
2754{
2755 struct sock *sk = sock->sk;
2756 struct sk_buff *skb;
2757 int copied, err;
bfd5f4a3 2758 int vnet_hdr_len = 0;
1da177e4
LT
2759
2760 err = -EINVAL;
ed85b565 2761 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2762 goto out;
2763
2764#if 0
2765 /* What error should we return now? EUNATTACH? */
2766 if (pkt_sk(sk)->ifindex < 0)
2767 return -ENODEV;
2768#endif
2769
ed85b565
RC
2770 if (flags & MSG_ERRQUEUE) {
2771 err = packet_recv_error(sk, msg, len);
2772 goto out;
2773 }
2774
1da177e4
LT
2775 /*
2776 * Call the generic datagram receiver. This handles all sorts
2777 * of horrible races and re-entrancy so we can forget about it
2778 * in the protocol layers.
2779 *
2780 * Now it will return ENETDOWN, if device have just gone down,
2781 * but then it will block.
2782 */
2783
40d4e3df 2784 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2785
2786 /*
1ce4f28b 2787 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2788 * handles the blocking we don't see and worry about blocking
2789 * retries.
2790 */
2791
8ae55f04 2792 if (skb == NULL)
1da177e4
LT
2793 goto out;
2794
bfd5f4a3
SS
2795 if (pkt_sk(sk)->has_vnet_hdr) {
2796 struct virtio_net_hdr vnet_hdr = { 0 };
2797
2798 err = -EINVAL;
2799 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2800 if (len < vnet_hdr_len)
bfd5f4a3
SS
2801 goto out_free;
2802
1f18b717
MK
2803 len -= vnet_hdr_len;
2804
bfd5f4a3
SS
2805 if (skb_is_gso(skb)) {
2806 struct skb_shared_info *sinfo = skb_shinfo(skb);
2807
2808 /* This is a hint as to how much should be linear. */
2809 vnet_hdr.hdr_len = skb_headlen(skb);
2810 vnet_hdr.gso_size = sinfo->gso_size;
2811 if (sinfo->gso_type & SKB_GSO_TCPV4)
2812 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2813 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2814 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2815 else if (sinfo->gso_type & SKB_GSO_UDP)
2816 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2817 else if (sinfo->gso_type & SKB_GSO_FCOE)
2818 goto out_free;
2819 else
2820 BUG();
2821 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2822 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2823 } else
2824 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2825
2826 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2827 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2828 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2829 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2830 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2831 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2832 } /* else everything is zero */
2833
2834 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2835 vnet_hdr_len);
2836 if (err < 0)
2837 goto out_free;
2838 }
2839
2f73d7fd
HFS
2840 /* You lose any data beyond the buffer you gave. If it worries
2841 * a user program they can ask the device for its MTU
2842 * anyway.
1da177e4 2843 */
1da177e4 2844 copied = skb->len;
40d4e3df
ED
2845 if (copied > len) {
2846 copied = len;
2847 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2848 }
2849
2850 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2851 if (err)
2852 goto out_free;
2853
3b885787 2854 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2855
2f73d7fd
HFS
2856 if (msg->msg_name) {
2857 /* If the address length field is there to be filled
2858 * in, we fill it in now.
2859 */
2860 if (sock->type == SOCK_PACKET) {
2861 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2862 } else {
2863 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2864 msg->msg_namelen = sll->sll_halen +
2865 offsetof(struct sockaddr_ll, sll_addr);
2866 }
ffbc6111
HX
2867 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2868 msg->msg_namelen);
2f73d7fd 2869 }
1da177e4 2870
8dc41944 2871 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2872 struct tpacket_auxdata aux;
2873
2874 aux.tp_status = TP_STATUS_USER;
2875 if (skb->ip_summed == CHECKSUM_PARTIAL)
2876 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2877 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2878 aux.tp_snaplen = skb->len;
2879 aux.tp_mac = 0;
bbe735e4 2880 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2881 if (vlan_tx_tag_present(skb)) {
2882 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2883 aux.tp_status |= TP_STATUS_VLAN_VALID;
2884 } else {
2885 aux.tp_vlan_tci = 0;
2886 }
13fcb7bd 2887 aux.tp_padding = 0;
ffbc6111 2888 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2889 }
2890
1da177e4
LT
2891 /*
2892 * Free or return the buffer as appropriate. Again this
2893 * hides all the races and re-entrancy issues from us.
2894 */
bfd5f4a3 2895 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2896
2897out_free:
2898 skb_free_datagram(sk, skb);
2899out:
2900 return err;
2901}
2902
1da177e4
LT
2903static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2904 int *uaddr_len, int peer)
2905{
2906 struct net_device *dev;
2907 struct sock *sk = sock->sk;
2908
2909 if (peer)
2910 return -EOPNOTSUPP;
2911
2912 uaddr->sa_family = AF_PACKET;
2dc85bf3 2913 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2914 rcu_read_lock();
2915 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2916 if (dev)
2dc85bf3 2917 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2918 rcu_read_unlock();
1da177e4
LT
2919 *uaddr_len = sizeof(*uaddr);
2920
2921 return 0;
2922}
1da177e4
LT
2923
2924static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2925 int *uaddr_len, int peer)
2926{
2927 struct net_device *dev;
2928 struct sock *sk = sock->sk;
2929 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2930 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2931
2932 if (peer)
2933 return -EOPNOTSUPP;
2934
2935 sll->sll_family = AF_PACKET;
2936 sll->sll_ifindex = po->ifindex;
2937 sll->sll_protocol = po->num;
67286640 2938 sll->sll_pkttype = 0;
654d1f8a
ED
2939 rcu_read_lock();
2940 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2941 if (dev) {
2942 sll->sll_hatype = dev->type;
2943 sll->sll_halen = dev->addr_len;
2944 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2945 } else {
2946 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2947 sll->sll_halen = 0;
2948 }
654d1f8a 2949 rcu_read_unlock();
0fb375fb 2950 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2951
2952 return 0;
2953}
2954
2aeb0b88
WC
2955static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2956 int what)
1da177e4
LT
2957{
2958 switch (i->type) {
2959 case PACKET_MR_MULTICAST:
1162563f
JP
2960 if (i->alen != dev->addr_len)
2961 return -EINVAL;
1da177e4 2962 if (what > 0)
22bedad3 2963 return dev_mc_add(dev, i->addr);
1da177e4 2964 else
22bedad3 2965 return dev_mc_del(dev, i->addr);
1da177e4
LT
2966 break;
2967 case PACKET_MR_PROMISC:
2aeb0b88 2968 return dev_set_promiscuity(dev, what);
1da177e4
LT
2969 break;
2970 case PACKET_MR_ALLMULTI:
2aeb0b88 2971 return dev_set_allmulti(dev, what);
1da177e4 2972 break;
d95ed927 2973 case PACKET_MR_UNICAST:
1162563f
JP
2974 if (i->alen != dev->addr_len)
2975 return -EINVAL;
d95ed927 2976 if (what > 0)
a748ee24 2977 return dev_uc_add(dev, i->addr);
d95ed927 2978 else
a748ee24 2979 return dev_uc_del(dev, i->addr);
d95ed927 2980 break;
40d4e3df
ED
2981 default:
2982 break;
1da177e4 2983 }
2aeb0b88 2984 return 0;
1da177e4
LT
2985}
2986
2987static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2988{
40d4e3df 2989 for ( ; i; i = i->next) {
1da177e4
LT
2990 if (i->ifindex == dev->ifindex)
2991 packet_dev_mc(dev, i, what);
2992 }
2993}
2994
0fb375fb 2995static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2996{
2997 struct packet_sock *po = pkt_sk(sk);
2998 struct packet_mclist *ml, *i;
2999 struct net_device *dev;
3000 int err;
3001
3002 rtnl_lock();
3003
3004 err = -ENODEV;
3b1e0a65 3005 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3006 if (!dev)
3007 goto done;
3008
3009 err = -EINVAL;
1162563f 3010 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3011 goto done;
3012
3013 err = -ENOBUFS;
8b3a7005 3014 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3015 if (i == NULL)
3016 goto done;
3017
3018 err = 0;
3019 for (ml = po->mclist; ml; ml = ml->next) {
3020 if (ml->ifindex == mreq->mr_ifindex &&
3021 ml->type == mreq->mr_type &&
3022 ml->alen == mreq->mr_alen &&
3023 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3024 ml->count++;
3025 /* Free the new element ... */
3026 kfree(i);
3027 goto done;
3028 }
3029 }
3030
3031 i->type = mreq->mr_type;
3032 i->ifindex = mreq->mr_ifindex;
3033 i->alen = mreq->mr_alen;
3034 memcpy(i->addr, mreq->mr_address, i->alen);
edb5924a 3035 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3036 i->count = 1;
3037 i->next = po->mclist;
3038 po->mclist = i;
2aeb0b88
WC
3039 err = packet_dev_mc(dev, i, 1);
3040 if (err) {
3041 po->mclist = i->next;
3042 kfree(i);
3043 }
1da177e4
LT
3044
3045done:
3046 rtnl_unlock();
3047 return err;
3048}
3049
0fb375fb 3050static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3051{
3052 struct packet_mclist *ml, **mlp;
3053
3054 rtnl_lock();
3055
3056 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3057 if (ml->ifindex == mreq->mr_ifindex &&
3058 ml->type == mreq->mr_type &&
3059 ml->alen == mreq->mr_alen &&
3060 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3061 if (--ml->count == 0) {
3062 struct net_device *dev;
3063 *mlp = ml->next;
ad959e76
ED
3064 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3065 if (dev)
1da177e4 3066 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3067 kfree(ml);
3068 }
3069 rtnl_unlock();
3070 return 0;
3071 }
3072 }
3073 rtnl_unlock();
3074 return -EADDRNOTAVAIL;
3075}
3076
3077static void packet_flush_mclist(struct sock *sk)
3078{
3079 struct packet_sock *po = pkt_sk(sk);
3080 struct packet_mclist *ml;
3081
3082 if (!po->mclist)
3083 return;
3084
3085 rtnl_lock();
3086 while ((ml = po->mclist) != NULL) {
3087 struct net_device *dev;
3088
3089 po->mclist = ml->next;
ad959e76
ED
3090 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3091 if (dev != NULL)
1da177e4 3092 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3093 kfree(ml);
3094 }
3095 rtnl_unlock();
3096}
1da177e4
LT
3097
3098static int
b7058842 3099packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3100{
3101 struct sock *sk = sock->sk;
8dc41944 3102 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3103 int ret;
3104
3105 if (level != SOL_PACKET)
3106 return -ENOPROTOOPT;
3107
69e3c75f 3108 switch (optname) {
1ce4f28b 3109 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3110 case PACKET_DROP_MEMBERSHIP:
3111 {
0fb375fb
EB
3112 struct packet_mreq_max mreq;
3113 int len = optlen;
3114 memset(&mreq, 0, sizeof(mreq));
3115 if (len < sizeof(struct packet_mreq))
1da177e4 3116 return -EINVAL;
0fb375fb
EB
3117 if (len > sizeof(mreq))
3118 len = sizeof(mreq);
40d4e3df 3119 if (copy_from_user(&mreq, optval, len))
1da177e4 3120 return -EFAULT;
0fb375fb
EB
3121 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3122 return -EINVAL;
1da177e4
LT
3123 if (optname == PACKET_ADD_MEMBERSHIP)
3124 ret = packet_mc_add(sk, &mreq);
3125 else
3126 ret = packet_mc_drop(sk, &mreq);
3127 return ret;
3128 }
a2efcfa0 3129
1da177e4 3130 case PACKET_RX_RING:
69e3c75f 3131 case PACKET_TX_RING:
1da177e4 3132 {
f6fb8f10 3133 union tpacket_req_u req_u;
3134 int len;
1da177e4 3135
f6fb8f10 3136 switch (po->tp_version) {
3137 case TPACKET_V1:
3138 case TPACKET_V2:
3139 len = sizeof(req_u.req);
3140 break;
3141 case TPACKET_V3:
3142 default:
3143 len = sizeof(req_u.req3);
3144 break;
3145 }
3146 if (optlen < len)
1da177e4 3147 return -EINVAL;
bfd5f4a3
SS
3148 if (pkt_sk(sk)->has_vnet_hdr)
3149 return -EINVAL;
f6fb8f10 3150 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3151 return -EFAULT;
f6fb8f10 3152 return packet_set_ring(sk, &req_u, 0,
3153 optname == PACKET_TX_RING);
1da177e4
LT
3154 }
3155 case PACKET_COPY_THRESH:
3156 {
3157 int val;
3158
40d4e3df 3159 if (optlen != sizeof(val))
1da177e4 3160 return -EINVAL;
40d4e3df 3161 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3162 return -EFAULT;
3163
3164 pkt_sk(sk)->copy_thresh = val;
3165 return 0;
3166 }
bbd6ef87
PM
3167 case PACKET_VERSION:
3168 {
3169 int val;
3170
3171 if (optlen != sizeof(val))
3172 return -EINVAL;
bbd6ef87
PM
3173 if (copy_from_user(&val, optval, sizeof(val)))
3174 return -EFAULT;
3175 switch (val) {
3176 case TPACKET_V1:
3177 case TPACKET_V2:
f6fb8f10 3178 case TPACKET_V3:
3c2a0909 3179 break;
bbd6ef87
PM
3180 default:
3181 return -EINVAL;
3182 }
3c2a0909
S
3183 lock_sock(sk);
3184 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3185 ret = -EBUSY;
3186 } else {
3187 po->tp_version = val;
3188 ret = 0;
3189 }
3190 release_sock(sk);
3191 return ret;
bbd6ef87 3192 }
8913336a
PM
3193 case PACKET_RESERVE:
3194 {
3195 unsigned int val;
3196
3197 if (optlen != sizeof(val))
3198 return -EINVAL;
8913336a
PM
3199 if (copy_from_user(&val, optval, sizeof(val)))
3200 return -EFAULT;
2c6716c2
AK
3201 if (val > INT_MAX)
3202 return -EINVAL;
6785f852
MW
3203 lock_sock(sk);
3204 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3205 ret = -EBUSY;
3206 } else {
3207 po->tp_reserve = val;
3208 ret = 0;
3209 }
3210 release_sock(sk);
3211 return ret;
8913336a 3212 }
69e3c75f
JB
3213 case PACKET_LOSS:
3214 {
3215 unsigned int val;
3216
3217 if (optlen != sizeof(val))
3218 return -EINVAL;
3219 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3220 return -EBUSY;
3221 if (copy_from_user(&val, optval, sizeof(val)))
3222 return -EFAULT;
3223 po->tp_loss = !!val;
3224 return 0;
3225 }
8dc41944
HX
3226 case PACKET_AUXDATA:
3227 {
3228 int val;
3229
3230 if (optlen < sizeof(val))
3231 return -EINVAL;
3232 if (copy_from_user(&val, optval, sizeof(val)))
3233 return -EFAULT;
3234
3235 po->auxdata = !!val;
3236 return 0;
3237 }
80feaacb
PWJ
3238 case PACKET_ORIGDEV:
3239 {
3240 int val;
3241
3242 if (optlen < sizeof(val))
3243 return -EINVAL;
3244 if (copy_from_user(&val, optval, sizeof(val)))
3245 return -EFAULT;
3246
3247 po->origdev = !!val;
3248 return 0;
3249 }
bfd5f4a3
SS
3250 case PACKET_VNET_HDR:
3251 {
3252 int val;
3253
3254 if (sock->type != SOCK_RAW)
3255 return -EINVAL;
3256 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3257 return -EBUSY;
3258 if (optlen < sizeof(val))
3259 return -EINVAL;
3260 if (copy_from_user(&val, optval, sizeof(val)))
3261 return -EFAULT;
3262
3263 po->has_vnet_hdr = !!val;
3264 return 0;
3265 }
614f60fa
SM
3266 case PACKET_TIMESTAMP:
3267 {
3268 int val;
3269
3270 if (optlen != sizeof(val))
3271 return -EINVAL;
3272 if (copy_from_user(&val, optval, sizeof(val)))
3273 return -EFAULT;
3274
3275 po->tp_tstamp = val;
3276 return 0;
3277 }
dc99f600
DM
3278 case PACKET_FANOUT:
3279 {
3280 int val;
3281
3282 if (optlen != sizeof(val))
3283 return -EINVAL;
3284 if (copy_from_user(&val, optval, sizeof(val)))
3285 return -EFAULT;
3286
3287 return fanout_add(sk, val & 0xffff, val >> 16);
3288 }
5920cd3a
PC
3289 case PACKET_TX_HAS_OFF:
3290 {
3291 unsigned int val;
3292
3293 if (optlen != sizeof(val))
3294 return -EINVAL;
3295 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3296 return -EBUSY;
3297 if (copy_from_user(&val, optval, sizeof(val)))
3298 return -EFAULT;
3299 po->tp_tx_has_off = !!val;
3300 return 0;
3301 }
1da177e4
LT
3302 default:
3303 return -ENOPROTOOPT;
3304 }
3305}
3306
3307static int packet_getsockopt(struct socket *sock, int level, int optname,
3308 char __user *optval, int __user *optlen)
3309{
3310 int len;
c06fff6e 3311 int val, lv = sizeof(val);
1da177e4
LT
3312 struct sock *sk = sock->sk;
3313 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3314 void *data = &val;
ee80fbf3 3315 union tpacket_stats_u st;
1da177e4
LT
3316
3317 if (level != SOL_PACKET)
3318 return -ENOPROTOOPT;
3319
8ae55f04
KK
3320 if (get_user(len, optlen))
3321 return -EFAULT;
1da177e4
LT
3322
3323 if (len < 0)
3324 return -EINVAL;
1ce4f28b 3325
69e3c75f 3326 switch (optname) {
1da177e4 3327 case PACKET_STATISTICS:
1da177e4 3328 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3329 memcpy(&st, &po->stats, sizeof(st));
3330 memset(&po->stats, 0, sizeof(po->stats));
3331 spin_unlock_bh(&sk->sk_receive_queue.lock);
3332
f6fb8f10 3333 if (po->tp_version == TPACKET_V3) {
c06fff6e 3334 lv = sizeof(struct tpacket_stats_v3);
fc26e4cf 3335 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3336 data = &st.stats3;
f6fb8f10 3337 } else {
c06fff6e 3338 lv = sizeof(struct tpacket_stats);
fc26e4cf 3339 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3340 data = &st.stats1;
f6fb8f10 3341 }
ee80fbf3 3342
8dc41944
HX
3343 break;
3344 case PACKET_AUXDATA:
8dc41944 3345 val = po->auxdata;
80feaacb
PWJ
3346 break;
3347 case PACKET_ORIGDEV:
80feaacb 3348 val = po->origdev;
bfd5f4a3
SS
3349 break;
3350 case PACKET_VNET_HDR:
bfd5f4a3 3351 val = po->has_vnet_hdr;
1da177e4 3352 break;
bbd6ef87 3353 case PACKET_VERSION:
bbd6ef87 3354 val = po->tp_version;
bbd6ef87
PM
3355 break;
3356 case PACKET_HDRLEN:
3357 if (len > sizeof(int))
3358 len = sizeof(int);
2601fa66
AP
3359 if (len < sizeof(int))
3360 return -EINVAL;
bbd6ef87
PM
3361 if (copy_from_user(&val, optval, len))
3362 return -EFAULT;
3363 switch (val) {
3364 case TPACKET_V1:
3365 val = sizeof(struct tpacket_hdr);
3366 break;
3367 case TPACKET_V2:
3368 val = sizeof(struct tpacket2_hdr);
3369 break;
f6fb8f10 3370 case TPACKET_V3:
3371 val = sizeof(struct tpacket3_hdr);
3372 break;
bbd6ef87
PM
3373 default:
3374 return -EINVAL;
3375 }
bbd6ef87 3376 break;
8913336a 3377 case PACKET_RESERVE:
8913336a 3378 val = po->tp_reserve;
8913336a 3379 break;
69e3c75f 3380 case PACKET_LOSS:
69e3c75f 3381 val = po->tp_loss;
69e3c75f 3382 break;
614f60fa 3383 case PACKET_TIMESTAMP:
614f60fa 3384 val = po->tp_tstamp;
614f60fa 3385 break;
dc99f600 3386 case PACKET_FANOUT:
dc99f600
DM
3387 val = (po->fanout ?
3388 ((u32)po->fanout->id |
77f65ebd
WB
3389 ((u32)po->fanout->type << 16) |
3390 ((u32)po->fanout->flags << 24)) :
dc99f600 3391 0);
dc99f600 3392 break;
5920cd3a
PC
3393 case PACKET_TX_HAS_OFF:
3394 val = po->tp_tx_has_off;
3395 break;
1da177e4
LT
3396 default:
3397 return -ENOPROTOOPT;
3398 }
3399
c06fff6e
ED
3400 if (len > lv)
3401 len = lv;
8ae55f04
KK
3402 if (put_user(len, optlen))
3403 return -EFAULT;
8dc41944
HX
3404 if (copy_to_user(optval, data, len))
3405 return -EFAULT;
8ae55f04 3406 return 0;
1da177e4
LT
3407}
3408
3409
3410static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3411{
3412 struct sock *sk;
ad930650 3413 struct net_device *dev = data;
c346dca1 3414 struct net *net = dev_net(dev);
1da177e4 3415
808f5114 3416 rcu_read_lock();
b67bfe0d 3417 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3418 struct packet_sock *po = pkt_sk(sk);
3419
3420 switch (msg) {
3421 case NETDEV_UNREGISTER:
1da177e4
LT
3422 if (po->mclist)
3423 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3424 /* fallthrough */
3425
1da177e4
LT
3426 case NETDEV_DOWN:
3427 if (dev->ifindex == po->ifindex) {
3428 spin_lock(&po->bind_lock);
3429 if (po->running) {
ce06b03e 3430 __unregister_prot_hook(sk, false);
1da177e4
LT
3431 sk->sk_err = ENETDOWN;
3432 if (!sock_flag(sk, SOCK_DEAD))
3433 sk->sk_error_report(sk);
3434 }
3435 if (msg == NETDEV_UNREGISTER) {
c3ac8a13 3436 packet_cached_dev_reset(po);
1da177e4 3437 po->ifindex = -1;
160ff18a
BG
3438 if (po->prot_hook.dev)
3439 dev_put(po->prot_hook.dev);
1da177e4
LT
3440 po->prot_hook.dev = NULL;
3441 }
3442 spin_unlock(&po->bind_lock);
3443 }
3444 break;
3445 case NETDEV_UP:
808f5114 3446 if (dev->ifindex == po->ifindex) {
3447 spin_lock(&po->bind_lock);
ce06b03e
DM
3448 if (po->num)
3449 register_prot_hook(sk);
808f5114 3450 spin_unlock(&po->bind_lock);
1da177e4 3451 }
1da177e4
LT
3452 break;
3453 }
3454 }
808f5114 3455 rcu_read_unlock();
1da177e4
LT
3456 return NOTIFY_DONE;
3457}
3458
3459
3460static int packet_ioctl(struct socket *sock, unsigned int cmd,
3461 unsigned long arg)
3462{
3463 struct sock *sk = sock->sk;
3464
69e3c75f 3465 switch (cmd) {
40d4e3df
ED
3466 case SIOCOUTQ:
3467 {
3468 int amount = sk_wmem_alloc_get(sk);
31e6d363 3469
40d4e3df
ED
3470 return put_user(amount, (int __user *)arg);
3471 }
3472 case SIOCINQ:
3473 {
3474 struct sk_buff *skb;
3475 int amount = 0;
3476
3477 spin_lock_bh(&sk->sk_receive_queue.lock);
3478 skb = skb_peek(&sk->sk_receive_queue);
3479 if (skb)
3480 amount = skb->len;
3481 spin_unlock_bh(&sk->sk_receive_queue.lock);
3482 return put_user(amount, (int __user *)arg);
3483 }
3484 case SIOCGSTAMP:
3485 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3486 case SIOCGSTAMPNS:
3487 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3488
1da177e4 3489#ifdef CONFIG_INET
40d4e3df
ED
3490 case SIOCADDRT:
3491 case SIOCDELRT:
3492 case SIOCDARP:
3493 case SIOCGARP:
3494 case SIOCSARP:
3495 case SIOCGIFADDR:
3496 case SIOCSIFADDR:
3497 case SIOCGIFBRDADDR:
3498 case SIOCSIFBRDADDR:
3499 case SIOCGIFNETMASK:
3500 case SIOCSIFNETMASK:
3501 case SIOCGIFDSTADDR:
3502 case SIOCSIFDSTADDR:
3503 case SIOCSIFFLAGS:
40d4e3df 3504 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3505#endif
3506
40d4e3df
ED
3507 default:
3508 return -ENOIOCTLCMD;
1da177e4
LT
3509 }
3510 return 0;
3511}
3512
40d4e3df 3513static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3514 poll_table *wait)
3515{
3516 struct sock *sk = sock->sk;
3517 struct packet_sock *po = pkt_sk(sk);
3518 unsigned int mask = datagram_poll(file, sock, wait);
3519
3520 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3521 if (po->rx_ring.pg_vec) {
f6fb8f10 3522 if (!packet_previous_rx_frame(po, &po->rx_ring,
3523 TP_STATUS_KERNEL))
1da177e4
LT
3524 mask |= POLLIN | POLLRDNORM;
3525 }
3526 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3527 spin_lock_bh(&sk->sk_write_queue.lock);
3528 if (po->tx_ring.pg_vec) {
3529 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3530 mask |= POLLOUT | POLLWRNORM;
3531 }
3532 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3533 return mask;
3534}
3535
3536
3537/* Dirty? Well, I still did not learn better way to account
3538 * for user mmaps.
3539 */
3540
3541static void packet_mm_open(struct vm_area_struct *vma)
3542{
3543 struct file *file = vma->vm_file;
40d4e3df 3544 struct socket *sock = file->private_data;
1da177e4 3545 struct sock *sk = sock->sk;
1ce4f28b 3546
1da177e4
LT
3547 if (sk)
3548 atomic_inc(&pkt_sk(sk)->mapped);
3549}
3550
3551static void packet_mm_close(struct vm_area_struct *vma)
3552{
3553 struct file *file = vma->vm_file;
40d4e3df 3554 struct socket *sock = file->private_data;
1da177e4 3555 struct sock *sk = sock->sk;
1ce4f28b 3556
1da177e4
LT
3557 if (sk)
3558 atomic_dec(&pkt_sk(sk)->mapped);
3559}
3560
f0f37e2f 3561static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3562 .open = packet_mm_open,
3563 .close = packet_mm_close,
1da177e4
LT
3564};
3565
0e3125c7
NH
3566static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3567 unsigned int len)
1da177e4
LT
3568{
3569 int i;
3570
4ebf0ae2 3571 for (i = 0; i < len; i++) {
0e3125c7 3572 if (likely(pg_vec[i].buffer)) {
c56b4d90 3573 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3574 vfree(pg_vec[i].buffer);
3575 else
3576 free_pages((unsigned long)pg_vec[i].buffer,
3577 order);
3578 pg_vec[i].buffer = NULL;
3579 }
1da177e4
LT
3580 }
3581 kfree(pg_vec);
3582}
3583
eea49cc9 3584static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3585{
0e3125c7
NH
3586 char *buffer = NULL;
3587 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3588 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3589
3590 buffer = (char *) __get_free_pages(gfp_flags, order);
3591
3592 if (buffer)
3593 return buffer;
3594
3595 /*
3596 * __get_free_pages failed, fall back to vmalloc
3597 */
bbce5a59 3598 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3599
0e3125c7
NH
3600 if (buffer)
3601 return buffer;
3602
3603 /*
3604 * vmalloc failed, lets dig into swap here
3605 */
0e3125c7
NH
3606 gfp_flags &= ~__GFP_NORETRY;
3607 buffer = (char *)__get_free_pages(gfp_flags, order);
3608 if (buffer)
3609 return buffer;
3610
3611 /*
3612 * complete and utter failure
3613 */
3614 return NULL;
4ebf0ae2
DM
3615}
3616
0e3125c7 3617static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3618{
3619 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3620 struct pgv *pg_vec;
4ebf0ae2
DM
3621 int i;
3622
0e3125c7 3623 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3624 if (unlikely(!pg_vec))
3625 goto out;
3626
3627 for (i = 0; i < block_nr; i++) {
c56b4d90 3628 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3629 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3630 goto out_free_pgvec;
3631 }
3632
3633out:
3634 return pg_vec;
3635
3636out_free_pgvec:
3637 free_pg_vec(pg_vec, order, block_nr);
3638 pg_vec = NULL;
3639 goto out;
3640}
1da177e4 3641
f6fb8f10 3642static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3643 int closing, int tx_ring)
1da177e4 3644{
0e3125c7 3645 struct pgv *pg_vec = NULL;
1da177e4 3646 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3647 int was_running, order = 0;
69e3c75f
JB
3648 struct packet_ring_buffer *rb;
3649 struct sk_buff_head *rb_queue;
0e11c91e 3650 __be16 num;
f6fb8f10 3651 int err = -EINVAL;
3652 /* Added to avoid minimal code churn */
3653 struct tpacket_req *req = &req_u->req;
3654
3c2a0909 3655 lock_sock(sk);
f6fb8f10 3656 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3657 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3658 WARN(1, "Tx-ring is not supported.\n");
3659 goto out;
3660 }
1ce4f28b 3661
69e3c75f
JB
3662 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3663 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3664
69e3c75f
JB
3665 err = -EBUSY;
3666 if (!closing) {
3667 if (atomic_read(&po->mapped))
3668 goto out;
3669 if (atomic_read(&rb->pending))
3670 goto out;
3671 }
1da177e4 3672
69e3c75f
JB
3673 if (req->tp_block_nr) {
3674 /* Sanity tests and some calculations */
3675 err = -EBUSY;
3676 if (unlikely(rb->pg_vec))
3677 goto out;
1da177e4 3678
bbd6ef87
PM
3679 switch (po->tp_version) {
3680 case TPACKET_V1:
3681 po->tp_hdrlen = TPACKET_HDRLEN;
3682 break;
3683 case TPACKET_V2:
3684 po->tp_hdrlen = TPACKET2_HDRLEN;
3685 break;
f6fb8f10 3686 case TPACKET_V3:
3687 po->tp_hdrlen = TPACKET3_HDRLEN;
3688 break;
bbd6ef87
PM
3689 }
3690
69e3c75f 3691 err = -EINVAL;
4ebf0ae2 3692 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3693 goto out;
4ebf0ae2 3694 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3695 goto out;
4035ed7b 3696 if (po->tp_version >= TPACKET_V3 &&
fcd987f4
AK
3697 req->tp_block_size <=
3698 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
4035ed7b 3699 goto out;
8913336a 3700 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3701 po->tp_reserve))
3702 goto out;
4ebf0ae2 3703 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3704 goto out;
1da177e4 3705
69e3c75f
JB
3706 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3707 if (unlikely(rb->frames_per_block <= 0))
3708 goto out;
0f059fc2
AK
3709 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
3710 goto out;
69e3c75f
JB
3711 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3712 req->tp_frame_nr))
3713 goto out;
1da177e4
LT
3714
3715 err = -ENOMEM;
4ebf0ae2
DM
3716 order = get_order(req->tp_block_size);
3717 pg_vec = alloc_pg_vec(req, order);
3718 if (unlikely(!pg_vec))
1da177e4 3719 goto out;
f6fb8f10 3720 switch (po->tp_version) {
3721 case TPACKET_V3:
3722 /* Transmit path is not supported. We checked
3723 * it above but just being paranoid
3724 */
3725 if (!tx_ring)
3726 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
aeebef08 3727 break;
f6fb8f10 3728 default:
3729 break;
3730 }
69e3c75f
JB
3731 }
3732 /* Done */
3733 else {
3734 err = -EINVAL;
4ebf0ae2 3735 if (unlikely(req->tp_frame_nr))
69e3c75f 3736 goto out;
1da177e4
LT
3737 }
3738
1da177e4
LT
3739
3740 /* Detach socket from network */
3741 spin_lock(&po->bind_lock);
3742 was_running = po->running;
3743 num = po->num;
3744 if (was_running) {
1da177e4 3745 po->num = 0;
ce06b03e 3746 __unregister_prot_hook(sk, false);
1da177e4
LT
3747 }
3748 spin_unlock(&po->bind_lock);
1ce4f28b 3749
1da177e4
LT
3750 synchronize_net();
3751
3752 err = -EBUSY;
905db440 3753 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3754 if (closing || atomic_read(&po->mapped) == 0) {
3755 err = 0;
69e3c75f 3756 spin_lock_bh(&rb_queue->lock);
c053fd96 3757 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3758 rb->frame_max = (req->tp_frame_nr - 1);
3759 rb->head = 0;
3760 rb->frame_size = req->tp_frame_size;
3761 spin_unlock_bh(&rb_queue->lock);
3762
c053fd96
CG
3763 swap(rb->pg_vec_order, order);
3764 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3765
3766 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3767 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3768 tpacket_rcv : packet_rcv;
3769 skb_queue_purge(rb_queue);
1da177e4 3770 if (atomic_read(&po->mapped))
40d4e3df
ED
3771 pr_err("packet_mmap: vma is busy: %d\n",
3772 atomic_read(&po->mapped));
1da177e4 3773 }
905db440 3774 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3775
3776 spin_lock(&po->bind_lock);
ce06b03e 3777 if (was_running) {
1da177e4 3778 po->num = num;
ce06b03e 3779 register_prot_hook(sk);
1da177e4
LT
3780 }
3781 spin_unlock(&po->bind_lock);
f6fb8f10 3782 if (closing && (po->tp_version > TPACKET_V2)) {
3783 /* Because we don't support block-based V3 on tx-ring */
3784 if (!tx_ring)
3785 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3786 }
1da177e4 3787
1da177e4
LT
3788 if (pg_vec)
3789 free_pg_vec(pg_vec, order, req->tp_block_nr);
3790out:
3c2a0909 3791 release_sock(sk);
1da177e4
LT
3792 return err;
3793}
3794
69e3c75f
JB
3795static int packet_mmap(struct file *file, struct socket *sock,
3796 struct vm_area_struct *vma)
1da177e4
LT
3797{
3798 struct sock *sk = sock->sk;
3799 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3800 unsigned long size, expected_size;
3801 struct packet_ring_buffer *rb;
1da177e4
LT
3802 unsigned long start;
3803 int err = -EINVAL;
3804 int i;
3805
3806 if (vma->vm_pgoff)
3807 return -EINVAL;
3808
905db440 3809 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3810
3811 expected_size = 0;
3812 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3813 if (rb->pg_vec) {
3814 expected_size += rb->pg_vec_len
3815 * rb->pg_vec_pages
3816 * PAGE_SIZE;
3817 }
3818 }
3819
3820 if (expected_size == 0)
1da177e4 3821 goto out;
69e3c75f
JB
3822
3823 size = vma->vm_end - vma->vm_start;
3824 if (size != expected_size)
1da177e4
LT
3825 goto out;
3826
1da177e4 3827 start = vma->vm_start;
69e3c75f
JB
3828 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3829 if (rb->pg_vec == NULL)
3830 continue;
3831
3832 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3833 struct page *page;
3834 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3835 int pg_num;
3836
c56b4d90
CG
3837 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3838 page = pgv_to_page(kaddr);
69e3c75f
JB
3839 err = vm_insert_page(vma, start, page);
3840 if (unlikely(err))
3841 goto out;
3842 start += PAGE_SIZE;
0e3125c7 3843 kaddr += PAGE_SIZE;
69e3c75f 3844 }
4ebf0ae2 3845 }
1da177e4 3846 }
69e3c75f 3847
4ebf0ae2 3848 atomic_inc(&po->mapped);
1da177e4
LT
3849 vma->vm_ops = &packet_mmap_ops;
3850 err = 0;
3851
3852out:
905db440 3853 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3854 return err;
3855}
1da177e4 3856
90ddc4f0 3857static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3858 .family = PF_PACKET,
3859 .owner = THIS_MODULE,
3860 .release = packet_release,
3861 .bind = packet_bind_spkt,
3862 .connect = sock_no_connect,
3863 .socketpair = sock_no_socketpair,
3864 .accept = sock_no_accept,
3865 .getname = packet_getname_spkt,
3866 .poll = datagram_poll,
3867 .ioctl = packet_ioctl,
3868 .listen = sock_no_listen,
3869 .shutdown = sock_no_shutdown,
3870 .setsockopt = sock_no_setsockopt,
3871 .getsockopt = sock_no_getsockopt,
3872 .sendmsg = packet_sendmsg_spkt,
3873 .recvmsg = packet_recvmsg,
3874 .mmap = sock_no_mmap,
3875 .sendpage = sock_no_sendpage,
3876};
1da177e4 3877
90ddc4f0 3878static const struct proto_ops packet_ops = {
1da177e4
LT
3879 .family = PF_PACKET,
3880 .owner = THIS_MODULE,
3881 .release = packet_release,
3882 .bind = packet_bind,
3883 .connect = sock_no_connect,
3884 .socketpair = sock_no_socketpair,
3885 .accept = sock_no_accept,
1ce4f28b 3886 .getname = packet_getname,
1da177e4
LT
3887 .poll = packet_poll,
3888 .ioctl = packet_ioctl,
3889 .listen = sock_no_listen,
3890 .shutdown = sock_no_shutdown,
3891 .setsockopt = packet_setsockopt,
3892 .getsockopt = packet_getsockopt,
3893 .sendmsg = packet_sendmsg,
3894 .recvmsg = packet_recvmsg,
3895 .mmap = packet_mmap,
3896 .sendpage = sock_no_sendpage,
3897};
3898
ec1b4cf7 3899static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3900 .family = PF_PACKET,
3901 .create = packet_create,
3902 .owner = THIS_MODULE,
3903};
3904
3905static struct notifier_block packet_netdev_notifier = {
40d4e3df 3906 .notifier_call = packet_notifier,
1da177e4
LT
3907};
3908
3909#ifdef CONFIG_PROC_FS
1da177e4
LT
3910
3911static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3912 __acquires(RCU)
1da177e4 3913{
e372c414 3914 struct net *net = seq_file_net(seq);
808f5114 3915
3916 rcu_read_lock();
3917 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3918}
3919
3920static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3921{
1bf40954 3922 struct net *net = seq_file_net(seq);
808f5114 3923 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3924}
3925
3926static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3927 __releases(RCU)
1da177e4 3928{
808f5114 3929 rcu_read_unlock();
1da177e4
LT
3930}
3931
1ce4f28b 3932static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3933{
3934 if (v == SEQ_START_TOKEN)
3935 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3936 else {
b7ceabd9 3937 struct sock *s = sk_entry(v);
1da177e4
LT
3938 const struct packet_sock *po = pkt_sk(s);
3939
3940 seq_printf(seq,
71338aa7 3941 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3942 s,
3943 atomic_read(&s->sk_refcnt),
3944 s->sk_type,
3945 ntohs(po->num),
3946 po->ifindex,
3947 po->running,
3948 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 3949 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 3950 sock_i_ino(s));
1da177e4
LT
3951 }
3952
3953 return 0;
3954}
3955
56b3d975 3956static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3957 .start = packet_seq_start,
3958 .next = packet_seq_next,
3959 .stop = packet_seq_stop,
3960 .show = packet_seq_show,
3961};
3962
3963static int packet_seq_open(struct inode *inode, struct file *file)
3964{
e372c414
DL
3965 return seq_open_net(inode, file, &packet_seq_ops,
3966 sizeof(struct seq_net_private));
1da177e4
LT
3967}
3968
da7071d7 3969static const struct file_operations packet_seq_fops = {
1da177e4
LT
3970 .owner = THIS_MODULE,
3971 .open = packet_seq_open,
3972 .read = seq_read,
3973 .llseek = seq_lseek,
e372c414 3974 .release = seq_release_net,
1da177e4
LT
3975};
3976
3977#endif
3978
2c8c1e72 3979static int __net_init packet_net_init(struct net *net)
d12d01d6 3980{
0fa7fa98 3981 mutex_init(&net->packet.sklist_lock);
2aaef4e4 3982 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 3983
d4beaa66 3984 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
3985 return -ENOMEM;
3986
3987 return 0;
3988}
3989
2c8c1e72 3990static void __net_exit packet_net_exit(struct net *net)
d12d01d6 3991{
ece31ffd 3992 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
3993}
3994
3995static struct pernet_operations packet_net_ops = {
3996 .init = packet_net_init,
3997 .exit = packet_net_exit,
3998};
3999
4000
1da177e4
LT
4001static void __exit packet_exit(void)
4002{
1da177e4 4003 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4004 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4005 sock_unregister(PF_PACKET);
4006 proto_unregister(&packet_proto);
4007}
4008
4009static int __init packet_init(void)
4010{
4011 int rc = proto_register(&packet_proto, 0);
4012
4013 if (rc != 0)
4014 goto out;
4015
4016 sock_register(&packet_family_ops);
d12d01d6 4017 register_pernet_subsys(&packet_net_ops);
1da177e4 4018 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4019out:
4020 return rc;
4021}
4022
4023module_init(packet_init);
4024module_exit(packet_exit);
4025MODULE_LICENSE("GPL");
4026MODULE_ALIAS_NETPROTO(PF_PACKET);