net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov	:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski	:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb, when header
     will not fit to reserved space (tunnel), other ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
		 header.  PPP makes it this way, which is wrong, because it
		 introduces asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);


#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

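/* For illustration: ALIGN() rounds its argument up to the next multiple of
 * V3_ALIGNMENT, so a user-requested private area of, say, 13 bytes gives
 * BLK_PLUS_PRIV(13) == BLK_HDR_LEN + 16, keeping the first packet 8-byte
 * aligned behind the block header and the per-block private area.
 */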
/* kbdq - kernel block descriptor queue */
struct tpacket_kbdq_core {
	struct pgv	*pkbdq;
	unsigned int	feature_req_word;
	unsigned int	hdrlen;
	unsigned char	reset_pending_on_curr_blk;
	unsigned char	delete_blk_timer;
	unsigned short	kactive_blk_num;
	unsigned short	blk_sizeof_priv;

	/* last_kactive_blk_num:
	 * trick to see if user-space has caught up
	 * in order to avoid refreshing timer when every single pkt arrives.
	 */
	unsigned short	last_kactive_blk_num;

	char	*pkblk_start;
	char	*pkblk_end;
	int	kblk_size;
	unsigned int	knum_blocks;
	uint64_t	knxt_seq_num;
	char	*prev;
	char	*nxt_offset;
	struct sk_buff	*skb;

	atomic_t	blk_fill_in_prog;

	/* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV	(8)

	unsigned short	retire_blk_tov;
	unsigned short	version;
	unsigned long	tov_in_jiffies;

	/* timer to retire an outstanding block */
	struct timer_list retire_blk_timer;
};

#define PGV_FROM_VMALLOC 1
struct pgv {
	char *buffer;
};

struct packet_ring_buffer {
	struct pgv		*pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;

	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;

	struct tpacket_kbdq_core	prb_bdqc;
	atomic_t		pending;
};

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_fanout;
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct packet_fanout	*fanout;
	struct tpacket_stats	stats;
	union  tpacket_stats_u	stats_u;
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	int			copy_thresh;
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running:1,	/* prot_hook is attached*/
				auxdata:1,
				origdev:1,
				has_vnet_hdr:1;
	int			ifindex;	/* bound device */
	__be16			num;
	struct packet_mclist	*mclist;
	atomic_t		mapped;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_loss:1;
	unsigned int		tp_tstamp;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};

#define PACKET_FANOUT_MAX	256

struct packet_fanout {
#ifdef CONFIG_NET_NS
	struct net		*net;
#endif
	unsigned int		num_members;
	u16			id;
	u8			type;
	u8			defrag;
	atomic_t		rr_cur;
	struct list_head	list;
	struct sock		*arr[PACKET_FANOUT_MAX];
	spinlock_t		lock;
	atomic_t		sk_ref;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

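/* GET_NEXT_PRB_BLK_NUM() advances the active block index with wrap-around:
 * with knum_blocks == 8, block 7 is followed by block 0 again.
 */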
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;
	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);
	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;

	spin_lock(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;

	dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(dev == NULL))
		return DEFAULT_PRB_RETIRE_TOV;

	if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
		struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };

		if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
			switch (ecmd.speed) {
			case SPEED_10000:
				msec = 1;
				div = 10000/1000;
				break;
			case SPEED_1000:
				msec = 1;
				div = 1000/1000;
				break;
			/*
			 * If the link speed is so slow you don't really
			 * need to worry about perf anyways
			 */
			case SPEED_100:
			case SPEED_10:
			default:
				return DEFAULT_PRB_RETIRE_TOV;
			}
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = (char *)pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats_u.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the timer;
				 * thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static inline void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	if (po->stats.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static inline void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {

		/* We could have just memset this but we will lose the
		 * flexibility of making the priv area sticky
		 */
		BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
		BLOCK_NUM_PKTS(pbd1) = 0;
		BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		getnstimeofday(&ts);
		h1->ts_first_pkt.ts_sec = ts.tv_sec;
		h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
		pkc1->pkblk_start = (char *)pbd1;
		pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
		BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
		BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
		pbd1->version = pkc1->version;
		pkc1->prev = pkc1->nxt_offset;
		pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
		prb_thaw_queue(pkc1);
		_prb_refresh_rx_retire_blk_timer(pkc1);

		smp_wmb();

		return;
	}

	WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
		pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
	dump_stack();
	BUG();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static inline void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats_u.stats3.tp_freeze_q_cnt++;
}

877#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
878
879/*
880 * If the next block is free then we will dispatch it
881 * and return a good offset.
882 * Else, we will freeze the queue.
883 * So, caller must check the return value.
884 */
bc59ba39 885static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 886 struct packet_sock *po)
887{
bc59ba39 888 struct tpacket_block_desc *pbd;
f6fb8f10 889
890 smp_rmb();
891
892 /* 1. Get current block num */
893 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
894
895 /* 2. If this block is currently in_use then freeze the queue */
896 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
897 prb_freeze_queue(pkc, po);
898 return NULL;
899 }
900
901 /*
902 * 3.
903 * open this block and return the offset where the first packet
904 * needs to get stored.
905 */
906 prb_open_block(pkc, pbd);
907 return (void *)pkc->nxt_offset;
908}
909
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}

	WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
	dump_stack();
	BUG();
}

static inline int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static inline int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static inline void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static inline void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
}

static inline void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static inline void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (vlan_tx_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
		ppd->tp_status = TP_STATUS_VLAN_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static inline void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

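/* Note: the blk_fill_in_prog counter incremented in prb_fill_curr_block() is
 * dropped again by prb_clear_blk_fill_status() once tpacket_rcv() has
 * finished skb_copy_bits(); the retire paths above spin on this counter
 * before closing a partially filled block.
 */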
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *) ((char *)pbd + pkc->kblk_size);

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available.user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static inline void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return 0;
	}
}

static inline void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int previous,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static inline int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static inline void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static inline void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static inline void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static inline void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

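/* The V1/V2 ring head runs from 0 to frame_max inclusive before wrapping,
 * so the ring holds frame_max + 1 frames; packet_previous_frame() above
 * correspondingly steps back from 0 to frame_max.
 */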
static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	u32 idx, hash = skb->rxhash;

	idx = ((u64)hash * num) >> 32;

	return f->arr[idx];
}

static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return f->arr[cur];
}

static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	unsigned int cpu = smp_processor_id();

	return f->arr[cpu % num];
}

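/* fanout_demux_hash() maps the 32-bit rxhash onto [0, num) with a
 * multiply-and-shift, ((u64)hash * num) >> 32, instead of a modulo, so flows
 * are spread across the dense f->arr[] array without a division.
 */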
static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
{
#ifdef CONFIG_INET
	const struct iphdr *iph;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		return skb;

	iph = ip_hdr(skb);
	if (iph->ihl < 5 || iph->version != 4)
		return skb;
	if (!pskb_may_pull(skb, iph->ihl*4))
		return skb;
	iph = ip_hdr(skb);
	len = ntohs(iph->tot_len);
	if (skb->len < len || len < (iph->ihl * 4))
		return skb;

	if (ip_is_fragment(ip_hdr(skb))) {
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (pskb_trim_rcsum(skb, len))
				return skb;
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(skb, IP_DEFRAG_AF_PACKET))
				return NULL;
			skb->rxhash = 0;
		}
	}
#endif
	return skb;
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	struct sock *sk;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		if (f->defrag) {
			skb = fanout_check_defrag(skb);
			if (!skb)
				return 0;
		}
		skb_get_rxhash(skb);
		sk = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		sk = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		sk = fanout_demux_cpu(f, skb, num);
		break;
	}

	po = pkt_sk(sk);

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

static DEFINE_MUTEX(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

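/* __fanout_unlink() fills the vacated slot with the last member so that
 * f->arr[] stays dense and the demux functions can keep indexing it with
 * values in [0, num_members).
 */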
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
	int err;

	switch (type) {
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->defrag != defrag)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->defrag = defrag;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	po->fanout = NULL;

	mutex_lock(&fanout_mutex);
	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_unlock;

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static inline unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}

/*
 * This function does lazy skb cloning in hope that most packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */

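/* run_filter() above returns the snap length computed by the attached BPF
 * filter (0 means drop); the receive paths below truncate the captured data
 * to this length before queueing or copying it into the ring.
 */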
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}

40d4e3df
ED
1733static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1734 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1735{
1736 struct sock *sk;
1737 struct packet_sock *po;
1738 struct sockaddr_ll *sll;
bbd6ef87
PM
1739 union {
1740 struct tpacket_hdr *h1;
1741 struct tpacket2_hdr *h2;
f6fb8f10 1742 struct tpacket3_hdr *h3;
bbd6ef87
PM
1743 void *raw;
1744 } h;
40d4e3df 1745 u8 *skb_head = skb->data;
1da177e4 1746 int skb_len = skb->len;
dbcb5855 1747 unsigned int snaplen, res;
f6fb8f10 1748 unsigned long status = TP_STATUS_USER;
bbd6ef87 1749 unsigned short macoff, netoff, hdrlen;
1da177e4 1750 struct sk_buff *copy_skb = NULL;
b7aa0bf7 1751 struct timeval tv;
bbd6ef87 1752 struct timespec ts;
614f60fa 1753 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1da177e4
LT
1754
1755 if (skb->pkt_type == PACKET_LOOPBACK)
1756 goto drop;
1757
1758 sk = pt->af_packet_priv;
1759 po = pkt_sk(sk);
1760
09ad9bc7 1761 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1762 goto drop;
1763
3b04ddde 1764 if (dev->header_ops) {
1da177e4 1765 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1766 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1767 else if (skb->pkt_type == PACKET_OUTGOING) {
1768 /* Special case: outgoing packets have ll header at head */
bbe735e4 1769 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1770 }
1771 }
1772
8dc41944
HX
1773 if (skb->ip_summed == CHECKSUM_PARTIAL)
1774 status |= TP_STATUS_CSUMNOTREADY;
1775
1da177e4
LT
1776 snaplen = skb->len;
1777
dbcb5855
DM
1778 res = run_filter(skb, sk, snaplen);
1779 if (!res)
fda9ef5d 1780 goto drop_n_restore;
dbcb5855
DM
1781 if (snaplen > res)
1782 snaplen = res;
1da177e4
LT
1783
1784 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1785 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1786 po->tp_reserve;
1da177e4 1787 } else {
bbe735e4 1788 unsigned maclen = skb_network_offset(skb);
bbd6ef87 1789 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1790 (maclen < 16 ? 16 : maclen)) +
1791 po->tp_reserve;
1da177e4
LT
1792 macoff = netoff - maclen;
1793 }
f6fb8f10 1794 if (po->tp_version <= TPACKET_V2) {
1795 if (macoff + snaplen > po->rx_ring.frame_size) {
1796 if (po->copy_thresh &&
1797 atomic_read(&sk->sk_rmem_alloc) + skb->truesize
1798 < (unsigned)sk->sk_rcvbuf) {
1799 if (skb_shared(skb)) {
1800 copy_skb = skb_clone(skb, GFP_ATOMIC);
1801 } else {
1802 copy_skb = skb_get(skb);
1803 skb_head = skb->data;
1804 }
1805 if (copy_skb)
1806 skb_set_owner_r(copy_skb, sk);
1da177e4 1807 }
f6fb8f10 1808 snaplen = po->rx_ring.frame_size - macoff;
1809 if ((int)snaplen < 0)
1810 snaplen = 0;
1da177e4 1811 }
1da177e4 1812 }
1da177e4 1813 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1814 h.raw = packet_current_rx_frame(po, skb,
1815 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1816 if (!h.raw)
1da177e4 1817 goto ring_is_full;
f6fb8f10 1818 if (po->tp_version <= TPACKET_V2) {
1819 packet_increment_rx_head(po, &po->rx_ring);
1820 /*
1821 * LOSING will be reported till you read the stats,
1822 * because it's COR - Clear On Read.
1823 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1824 * at packet level.
1825 */
1826 if (po->stats.tp_drops)
1827 status |= TP_STATUS_LOSING;
1828 }
1da177e4
LT
1829 po->stats.tp_packets++;
1830 if (copy_skb) {
1831 status |= TP_STATUS_COPY;
1832 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1833 }
1da177e4
LT
1834 spin_unlock(&sk->sk_receive_queue.lock);
1835
bbd6ef87 1836 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 1837
bbd6ef87
PM
1838 switch (po->tp_version) {
1839 case TPACKET_V1:
1840 h.h1->tp_len = skb->len;
1841 h.h1->tp_snaplen = snaplen;
1842 h.h1->tp_mac = macoff;
1843 h.h1->tp_net = netoff;
614f60fa
SM
1844 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1845 && shhwtstamps->syststamp.tv64)
1846 tv = ktime_to_timeval(shhwtstamps->syststamp);
1847 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1848 && shhwtstamps->hwtstamp.tv64)
1849 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
1850 else if (skb->tstamp.tv64)
bbd6ef87
PM
1851 tv = ktime_to_timeval(skb->tstamp);
1852 else
1853 do_gettimeofday(&tv);
1854 h.h1->tp_sec = tv.tv_sec;
1855 h.h1->tp_usec = tv.tv_usec;
1856 hdrlen = sizeof(*h.h1);
1857 break;
1858 case TPACKET_V2:
1859 h.h2->tp_len = skb->len;
1860 h.h2->tp_snaplen = snaplen;
1861 h.h2->tp_mac = macoff;
1862 h.h2->tp_net = netoff;
614f60fa
SM
1863 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1864 && shhwtstamps->syststamp.tv64)
1865 ts = ktime_to_timespec(shhwtstamps->syststamp);
1866 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1867 && shhwtstamps->hwtstamp.tv64)
1868 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1869 else if (skb->tstamp.tv64)
bbd6ef87
PM
1870 ts = ktime_to_timespec(skb->tstamp);
1871 else
1872 getnstimeofday(&ts);
1873 h.h2->tp_sec = ts.tv_sec;
1874 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1875 if (vlan_tx_tag_present(skb)) {
1876 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1877 status |= TP_STATUS_VLAN_VALID;
1878 } else {
1879 h.h2->tp_vlan_tci = 0;
1880 }
13fcb7bd 1881 h.h2->tp_padding = 0;
bbd6ef87
PM
1882 hdrlen = sizeof(*h.h2);
1883 break;
f6fb8f10 1884 case TPACKET_V3:
 1885 /* tp_nxt_offset and the vlan fields are already populated above,
 1886 * so DON'T clear those fields here.
1887 */
1888 h.h3->tp_status |= status;
1889 h.h3->tp_len = skb->len;
1890 h.h3->tp_snaplen = snaplen;
1891 h.h3->tp_mac = macoff;
1892 h.h3->tp_net = netoff;
1893 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1894 && shhwtstamps->syststamp.tv64)
1895 ts = ktime_to_timespec(shhwtstamps->syststamp);
1896 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1897 && shhwtstamps->hwtstamp.tv64)
1898 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1899 else if (skb->tstamp.tv64)
1900 ts = ktime_to_timespec(skb->tstamp);
1901 else
1902 getnstimeofday(&ts);
1903 h.h3->tp_sec = ts.tv_sec;
1904 h.h3->tp_nsec = ts.tv_nsec;
1905 hdrlen = sizeof(*h.h3);
1906 break;
bbd6ef87
PM
1907 default:
1908 BUG();
1909 }
1da177e4 1910
bbd6ef87 1911 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1912 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1913 sll->sll_family = AF_PACKET;
1914 sll->sll_hatype = dev->type;
1915 sll->sll_protocol = skb->protocol;
1916 sll->sll_pkttype = skb->pkt_type;
8032b464 1917 if (unlikely(po->origdev))
80feaacb
PWJ
1918 sll->sll_ifindex = orig_dev->ifindex;
1919 else
1920 sll->sll_ifindex = dev->ifindex;
1da177e4 1921
e16aa207 1922 smp_mb();
f6dafa95 1923#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1924 {
0af55bb5
CG
1925 u8 *start, *end;
1926
f6fb8f10 1927 if (po->tp_version <= TPACKET_V2) {
1928 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1929 + macoff + snaplen);
1930 for (start = h.raw; start < end; start += PAGE_SIZE)
1931 flush_dcache_page(pgv_to_page(start));
1932 }
cc9f01b2 1933 smp_wmb();
1da177e4 1934 }
f6dafa95 1935#endif
f6fb8f10 1936 if (po->tp_version <= TPACKET_V2)
1937 __packet_set_status(po, h.raw, status);
1938 else
1939 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1940
1941 sk->sk_data_ready(sk, 0);
1942
1943drop_n_restore:
1944 if (skb_head != skb->data && skb_shared(skb)) {
1945 skb->data = skb_head;
1946 skb->len = skb_len;
1947 }
1948drop:
1ce4f28b 1949 kfree_skb(skb);
1da177e4
LT
1950 return 0;
1951
1952ring_is_full:
1953 po->stats.tp_drops++;
1954 spin_unlock(&sk->sk_receive_queue.lock);
1955
1956 sk->sk_data_ready(sk, 0);
acb5d75b 1957 kfree_skb(copy_skb);
1da177e4
LT
1958 goto drop_n_restore;
1959}
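
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * how a consumer typically drains the TPACKET_V2 RX ring that tpacket_rcv()
 * above fills in.  "ring", "frame_size" and "frame_nr" stand for the caller's
 * mmap()ed ring parameters; error handling is trimmed.
 */
#include <stdio.h>
#include <poll.h>
#include <linux/if_packet.h>

static void drain_rx_ring(int fd, char *ring,
			  unsigned int frame_size, unsigned int frame_nr)
{
	static unsigned int idx;	/* next frame slot to inspect */
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	poll(&pfd, 1, -1);		/* packet_poll() reports POLLIN */

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + (size_t)idx * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER))
			break;		/* slot still owned by the kernel */

		/* frame data starts hdr->tp_mac bytes into the slot */
		printf("frame %u: %u of %u bytes captured\n",
		       idx, hdr->tp_snaplen, hdr->tp_len);

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
		idx = (idx + 1) % frame_nr;
	}
}
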
1960
69e3c75f
JB
1961static void tpacket_destruct_skb(struct sk_buff *skb)
1962{
1963 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1964 void *ph;
1da177e4 1965
69e3c75f 1966 BUG_ON(skb == NULL);
1da177e4 1967
69e3c75f
JB
1968 if (likely(po->tx_ring.pg_vec)) {
1969 ph = skb_shinfo(skb)->destructor_arg;
1970 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1971 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1972 atomic_dec(&po->tx_ring.pending);
1973 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1974 }
1975
1976 sock_wfree(skb);
1977}
1978
40d4e3df
ED
1979static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1980 void *frame, struct net_device *dev, int size_max,
1981 __be16 proto, unsigned char *addr)
69e3c75f
JB
1982{
1983 union {
1984 struct tpacket_hdr *h1;
1985 struct tpacket2_hdr *h2;
1986 void *raw;
1987 } ph;
1988 int to_write, offset, len, tp_len, nr_frags, len_max;
1989 struct socket *sock = po->sk.sk_socket;
1990 struct page *page;
1991 void *data;
1992 int err;
1993
1994 ph.raw = frame;
1995
1996 skb->protocol = proto;
1997 skb->dev = dev;
1998 skb->priority = po->sk.sk_priority;
2d37a186 1999 skb->mark = po->sk.sk_mark;
69e3c75f
JB
2000 skb_shinfo(skb)->destructor_arg = ph.raw;
2001
2002 switch (po->tp_version) {
2003 case TPACKET_V2:
2004 tp_len = ph.h2->tp_len;
2005 break;
2006 default:
2007 tp_len = ph.h1->tp_len;
2008 break;
2009 }
2010 if (unlikely(tp_len > size_max)) {
40d4e3df 2011 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
2012 return -EMSGSIZE;
2013 }
2014
2015 skb_reserve(skb, LL_RESERVED_SPACE(dev));
2016 skb_reset_network_header(skb);
2017
2018 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2019 to_write = tp_len;
2020
2021 if (sock->type == SOCK_DGRAM) {
2022 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2023 NULL, tp_len);
2024 if (unlikely(err < 0))
2025 return -EINVAL;
40d4e3df 2026 } else if (dev->hard_header_len) {
69e3c75f
JB
2027 /* net device doesn't like empty head */
2028 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2029 pr_err("packet size is too short (%d < %d)\n",
2030 tp_len, dev->hard_header_len);
69e3c75f
JB
2031 return -EINVAL;
2032 }
2033
2034 skb_push(skb, dev->hard_header_len);
2035 err = skb_store_bits(skb, 0, data,
2036 dev->hard_header_len);
2037 if (unlikely(err))
2038 return err;
2039
2040 data += dev->hard_header_len;
2041 to_write -= dev->hard_header_len;
2042 }
2043
2044 err = -EFAULT;
69e3c75f
JB
2045 offset = offset_in_page(data);
2046 len_max = PAGE_SIZE - offset;
2047 len = ((to_write > len_max) ? len_max : to_write);
2048
2049 skb->data_len = to_write;
2050 skb->len += to_write;
2051 skb->truesize += to_write;
2052 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2053
2054 while (likely(to_write)) {
2055 nr_frags = skb_shinfo(skb)->nr_frags;
2056
2057 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2058 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2059 MAX_SKB_FRAGS);
69e3c75f
JB
2060 return -EFAULT;
2061 }
2062
0af55bb5
CG
2063 page = pgv_to_page(data);
2064 data += len;
69e3c75f
JB
2065 flush_dcache_page(page);
2066 get_page(page);
0af55bb5 2067 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2068 to_write -= len;
2069 offset = 0;
2070 len_max = PAGE_SIZE;
2071 len = ((to_write > len_max) ? len_max : to_write);
2072 }
2073
2074 return tp_len;
2075}
2076
2077static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2078{
69e3c75f
JB
2079 struct sk_buff *skb;
2080 struct net_device *dev;
2081 __be16 proto;
827d9780
BG
2082 bool need_rls_dev = false;
2083 int err, reserve = 0;
40d4e3df
ED
2084 void *ph;
2085 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2086 int tp_len, size_max;
2087 unsigned char *addr;
2088 int len_sum = 0;
2089 int status = 0;
2090
69e3c75f
JB
2091 mutex_lock(&po->pg_vec_lock);
2092
2093 err = -EBUSY;
2094 if (saddr == NULL) {
827d9780 2095 dev = po->prot_hook.dev;
69e3c75f
JB
2096 proto = po->num;
2097 addr = NULL;
2098 } else {
2099 err = -EINVAL;
2100 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2101 goto out;
2102 if (msg->msg_namelen < (saddr->sll_halen
2103 + offsetof(struct sockaddr_ll,
2104 sll_addr)))
2105 goto out;
69e3c75f
JB
2106 proto = saddr->sll_protocol;
2107 addr = saddr->sll_addr;
827d9780
BG
2108 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2109 need_rls_dev = true;
69e3c75f
JB
2110 }
2111
69e3c75f
JB
2112 err = -ENXIO;
2113 if (unlikely(dev == NULL))
2114 goto out;
2115
2116 reserve = dev->hard_header_len;
2117
2118 err = -ENETDOWN;
2119 if (unlikely(!(dev->flags & IFF_UP)))
2120 goto out_put;
2121
2122 size_max = po->tx_ring.frame_size
b5dd884e 2123 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2124
2125 if (size_max > dev->mtu + reserve)
2126 size_max = dev->mtu + reserve;
2127
2128 do {
2129 ph = packet_current_frame(po, &po->tx_ring,
2130 TP_STATUS_SEND_REQUEST);
2131
2132 if (unlikely(ph == NULL)) {
2133 schedule();
2134 continue;
2135 }
2136
2137 status = TP_STATUS_SEND_REQUEST;
2138 skb = sock_alloc_send_skb(&po->sk,
2139 LL_ALLOCATED_SPACE(dev)
2140 + sizeof(struct sockaddr_ll),
2141 0, &err);
2142
2143 if (unlikely(skb == NULL))
2144 goto out_status;
2145
2146 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2147 addr);
2148
2149 if (unlikely(tp_len < 0)) {
2150 if (po->tp_loss) {
2151 __packet_set_status(po, ph,
2152 TP_STATUS_AVAILABLE);
2153 packet_increment_head(&po->tx_ring);
2154 kfree_skb(skb);
2155 continue;
2156 } else {
2157 status = TP_STATUS_WRONG_FORMAT;
2158 err = tp_len;
2159 goto out_status;
2160 }
2161 }
2162
2163 skb->destructor = tpacket_destruct_skb;
2164 __packet_set_status(po, ph, TP_STATUS_SENDING);
2165 atomic_inc(&po->tx_ring.pending);
2166
2167 status = TP_STATUS_SEND_REQUEST;
2168 err = dev_queue_xmit(skb);
eb70df13
JP
2169 if (unlikely(err > 0)) {
2170 err = net_xmit_errno(err);
2171 if (err && __packet_get_status(po, ph) ==
2172 TP_STATUS_AVAILABLE) {
2173 /* skb was destructed already */
2174 skb = NULL;
2175 goto out_status;
2176 }
2177 /*
2178 * skb was dropped but not destructed yet;
2179 * let's treat it like congestion or err < 0
2180 */
2181 err = 0;
2182 }
69e3c75f
JB
2183 packet_increment_head(&po->tx_ring);
2184 len_sum += tp_len;
f64f9e71
JP
2185 } while (likely((ph != NULL) ||
2186 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2187 (atomic_read(&po->tx_ring.pending))))
2188 );
69e3c75f
JB
2189
2190 err = len_sum;
2191 goto out_put;
2192
69e3c75f
JB
2193out_status:
2194 __packet_set_status(po, ph, status);
2195 kfree_skb(skb);
2196out_put:
827d9780
BG
2197 if (need_rls_dev)
2198 dev_put(dev);
69e3c75f
JB
2199out:
2200 mutex_unlock(&po->pg_vec_lock);
2201 return err;
2202}
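
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * queueing one frame on a mapped TPACKET_V2 TX ring and kicking tpacket_snd()
 * above with a plain send().  Assumes the TX ring is already mapped at "ring"
 * and the socket is bound to an interface; "frame_size", "idx" and the
 * prebuilt Ethernet frame "pkt"/"pkt_len" are placeholders.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static ssize_t tx_one_frame(int fd, char *ring, unsigned int frame_size,
			    unsigned int idx, const void *pkt,
			    unsigned int pkt_len)
{
	struct tpacket2_hdr *hdr =
		(struct tpacket2_hdr *)(ring + (size_t)idx * frame_size);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;			/* slot still owned by the kernel */

	/* for SOCK_RAW the frame data follows the header at this offset,
	 * matching tpacket_fill_skb() above */
	memcpy((char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
	       pkt, pkt_len);
	hdr->tp_len = pkt_len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;	/* hand slot to the kernel */

	/* one send() flushes every SEND_REQUEST frame via tpacket_snd() */
	return send(fd, NULL, 0, 0);
}
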
69e3c75f 2203
bfd5f4a3
SS
2204static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2205 size_t reserve, size_t len,
2206 size_t linear, int noblock,
2207 int *err)
2208{
2209 struct sk_buff *skb;
2210
2211 /* Under a page? Don't bother with paged skb. */
2212 if (prepad + len < PAGE_SIZE || !linear)
2213 linear = len;
2214
2215 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2216 err);
2217 if (!skb)
2218 return NULL;
2219
2220 skb_reserve(skb, reserve);
2221 skb_put(skb, linear);
2222 skb->data_len = len - linear;
2223 skb->len += len - linear;
2224
2225 return skb;
2226}
2227
69e3c75f 2228static int packet_snd(struct socket *sock,
1da177e4
LT
2229 struct msghdr *msg, size_t len)
2230{
2231 struct sock *sk = sock->sk;
40d4e3df 2232 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2233 struct sk_buff *skb;
2234 struct net_device *dev;
0e11c91e 2235 __be16 proto;
827d9780 2236 bool need_rls_dev = false;
1da177e4 2237 unsigned char *addr;
827d9780 2238 int err, reserve = 0;
bfd5f4a3
SS
2239 struct virtio_net_hdr vnet_hdr = { 0 };
2240 int offset = 0;
2241 int vnet_hdr_len;
2242 struct packet_sock *po = pkt_sk(sk);
2243 unsigned short gso_type = 0;
1da177e4
LT
2244
2245 /*
1ce4f28b 2246 * Get and verify the address.
1da177e4 2247 */
1ce4f28b 2248
1da177e4 2249 if (saddr == NULL) {
827d9780 2250 dev = po->prot_hook.dev;
1da177e4
LT
2251 proto = po->num;
2252 addr = NULL;
2253 } else {
2254 err = -EINVAL;
2255 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2256 goto out;
0fb375fb
EB
2257 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2258 goto out;
1da177e4
LT
2259 proto = saddr->sll_protocol;
2260 addr = saddr->sll_addr;
827d9780
BG
2261 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2262 need_rls_dev = true;
1da177e4
LT
2263 }
2264
1da177e4
LT
2265 err = -ENXIO;
2266 if (dev == NULL)
2267 goto out_unlock;
2268 if (sock->type == SOCK_RAW)
2269 reserve = dev->hard_header_len;
2270
d5e76b0a
DM
2271 err = -ENETDOWN;
2272 if (!(dev->flags & IFF_UP))
2273 goto out_unlock;
2274
bfd5f4a3
SS
2275 if (po->has_vnet_hdr) {
2276 vnet_hdr_len = sizeof(vnet_hdr);
2277
2278 err = -EINVAL;
2279 if (len < vnet_hdr_len)
2280 goto out_unlock;
2281
2282 len -= vnet_hdr_len;
2283
2284 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2285 vnet_hdr_len);
2286 if (err < 0)
2287 goto out_unlock;
2288
2289 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2290 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2291 vnet_hdr.hdr_len))
2292 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2293 vnet_hdr.csum_offset + 2;
2294
2295 err = -EINVAL;
2296 if (vnet_hdr.hdr_len > len)
2297 goto out_unlock;
2298
2299 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2300 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2301 case VIRTIO_NET_HDR_GSO_TCPV4:
2302 gso_type = SKB_GSO_TCPV4;
2303 break;
2304 case VIRTIO_NET_HDR_GSO_TCPV6:
2305 gso_type = SKB_GSO_TCPV6;
2306 break;
2307 case VIRTIO_NET_HDR_GSO_UDP:
2308 gso_type = SKB_GSO_UDP;
2309 break;
2310 default:
2311 goto out_unlock;
2312 }
2313
2314 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2315 gso_type |= SKB_GSO_TCP_ECN;
2316
2317 if (vnet_hdr.gso_size == 0)
2318 goto out_unlock;
2319
2320 }
2321 }
2322
1da177e4 2323 err = -EMSGSIZE;
57f89bfa 2324 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
1da177e4
LT
2325 goto out_unlock;
2326
bfd5f4a3
SS
2327 err = -ENOBUFS;
2328 skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
2329 LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
2330 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2331 if (skb == NULL)
1da177e4
LT
2332 goto out_unlock;
2333
bfd5f4a3 2334 skb_set_network_header(skb, reserve);
1da177e4 2335
0c4e8581
SH
2336 err = -EINVAL;
2337 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2338 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2339 goto out_free;
1da177e4
LT
2340
2341 /* Returns -EFAULT on error */
bfd5f4a3 2342 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2343 if (err)
2344 goto out_free;
2244d07b 2345 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
2346 if (err < 0)
2347 goto out_free;
1da177e4 2348
57f89bfa
BG
2349 if (!gso_type && (len > dev->mtu + reserve)) {
2350 /* Earlier code assumed this would be a VLAN pkt,
2351 * double-check this now that we have the actual
2352 * packet in hand.
2353 */
2354 struct ethhdr *ehdr;
2355 skb_reset_mac_header(skb);
2356 ehdr = eth_hdr(skb);
2357 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2358 err = -EMSGSIZE;
2359 goto out_free;
2360 }
2361 }
2362
1da177e4
LT
2363 skb->protocol = proto;
2364 skb->dev = dev;
2365 skb->priority = sk->sk_priority;
2d37a186 2366 skb->mark = sk->sk_mark;
1da177e4 2367
bfd5f4a3
SS
2368 if (po->has_vnet_hdr) {
2369 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2370 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2371 vnet_hdr.csum_offset)) {
2372 err = -EINVAL;
2373 goto out_free;
2374 }
2375 }
2376
2377 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2378 skb_shinfo(skb)->gso_type = gso_type;
2379
2380 /* Header must be checked, and gso_segs computed. */
2381 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2382 skb_shinfo(skb)->gso_segs = 0;
2383
2384 len += vnet_hdr_len;
2385 }
2386
1da177e4
LT
2387 /*
2388 * Now send it
2389 */
2390
2391 err = dev_queue_xmit(skb);
2392 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2393 goto out_unlock;
2394
827d9780
BG
2395 if (need_rls_dev)
2396 dev_put(dev);
1da177e4 2397
40d4e3df 2398 return len;
1da177e4
LT
2399
2400out_free:
2401 kfree_skb(skb);
2402out_unlock:
827d9780 2403 if (dev && need_rls_dev)
1da177e4
LT
2404 dev_put(dev);
2405out:
2406 return err;
2407}
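
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * the non-ring transmit path handled by packet_snd() above - one raw Ethernet
 * frame sent with sendto().  "ifname", "frame"/"frame_len" and "dest_mac" are
 * placeholders supplied by the caller; assumes a socket created with
 * socket(AF_PACKET, SOCK_RAW, ...).
 */
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static ssize_t send_raw_frame(int fd, const char *ifname,
			      const void *frame, size_t frame_len,
			      const unsigned char dest_mac[ETH_ALEN])
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	/* informational for SOCK_RAW; the ethertype is in the frame itself */
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, dest_mac, ETH_ALEN);

	/* packet_snd() resolves the device from sll_ifindex */
	return sendto(fd, frame, frame_len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}
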
2408
69e3c75f
JB
2409static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2410 struct msghdr *msg, size_t len)
2411{
69e3c75f
JB
2412 struct sock *sk = sock->sk;
2413 struct packet_sock *po = pkt_sk(sk);
2414 if (po->tx_ring.pg_vec)
2415 return tpacket_snd(po, msg);
2416 else
69e3c75f
JB
2417 return packet_snd(sock, msg, len);
2418}
2419
1da177e4
LT
2420/*
2421 * Close a PACKET socket. This is fairly simple. We immediately go
2422 * to 'closed' state and remove our protocol entry in the device list.
2423 */
2424
2425static int packet_release(struct socket *sock)
2426{
2427 struct sock *sk = sock->sk;
2428 struct packet_sock *po;
d12d01d6 2429 struct net *net;
f6fb8f10 2430 union tpacket_req_u req_u;
1da177e4
LT
2431
2432 if (!sk)
2433 return 0;
2434
3b1e0a65 2435 net = sock_net(sk);
1da177e4
LT
2436 po = pkt_sk(sk);
2437
808f5114 2438 spin_lock_bh(&net->packet.sklist_lock);
2439 sk_del_node_init_rcu(sk);
920de804 2440 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 2441 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 2442
808f5114 2443 spin_lock(&po->bind_lock);
ce06b03e 2444 unregister_prot_hook(sk, false);
160ff18a
BG
2445 if (po->prot_hook.dev) {
2446 dev_put(po->prot_hook.dev);
2447 po->prot_hook.dev = NULL;
2448 }
808f5114 2449 spin_unlock(&po->bind_lock);
1da177e4 2450
1da177e4 2451 packet_flush_mclist(sk);
1da177e4 2452
f6fb8f10 2453 memset(&req_u, 0, sizeof(req_u));
69e3c75f
JB
2454
2455 if (po->rx_ring.pg_vec)
f6fb8f10 2456 packet_set_ring(sk, &req_u, 1, 0);
69e3c75f
JB
2457
2458 if (po->tx_ring.pg_vec)
f6fb8f10 2459 packet_set_ring(sk, &req_u, 1, 1);
1da177e4 2460
dc99f600
DM
2461 fanout_release(sk);
2462
808f5114 2463 synchronize_net();
1da177e4
LT
2464 /*
2465 * Now the socket is dead. No more input will appear.
2466 */
1da177e4
LT
2467 sock_orphan(sk);
2468 sock->sk = NULL;
2469
2470 /* Purge queues */
2471
2472 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2473 sk_refcnt_debug_release(sk);
1da177e4
LT
2474
2475 sock_put(sk);
2476 return 0;
2477}
2478
2479/*
2480 * Attach a packet hook.
2481 */
2482
0e11c91e 2483static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2484{
2485 struct packet_sock *po = pkt_sk(sk);
dc99f600
DM
2486
2487 if (po->fanout)
2488 return -EINVAL;
1da177e4
LT
2489
2490 lock_sock(sk);
2491
2492 spin_lock(&po->bind_lock);
ce06b03e 2493 unregister_prot_hook(sk, true);
1da177e4
LT
2494 po->num = protocol;
2495 po->prot_hook.type = protocol;
160ff18a
BG
2496 if (po->prot_hook.dev)
2497 dev_put(po->prot_hook.dev);
1da177e4
LT
2498 po->prot_hook.dev = dev;
2499
2500 po->ifindex = dev ? dev->ifindex : 0;
2501
2502 if (protocol == 0)
2503 goto out_unlock;
2504
be85d4ad 2505 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2506 register_prot_hook(sk);
be85d4ad
UT
2507 } else {
2508 sk->sk_err = ENETDOWN;
2509 if (!sock_flag(sk, SOCK_DEAD))
2510 sk->sk_error_report(sk);
1da177e4
LT
2511 }
2512
2513out_unlock:
2514 spin_unlock(&po->bind_lock);
2515 release_sock(sk);
2516 return 0;
2517}
2518
2519/*
2520 * Bind a packet socket to a device
2521 */
2522
40d4e3df
ED
2523static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2524 int addr_len)
1da177e4 2525{
40d4e3df 2526 struct sock *sk = sock->sk;
1da177e4
LT
2527 char name[15];
2528 struct net_device *dev;
2529 int err = -ENODEV;
1ce4f28b 2530
1da177e4
LT
2531 /*
2532 * Check legality
2533 */
1ce4f28b 2534
8ae55f04 2535 if (addr_len != sizeof(struct sockaddr))
1da177e4 2536 return -EINVAL;
40d4e3df 2537 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2538
3b1e0a65 2539 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2540 if (dev)
1da177e4 2541 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2542 return err;
2543}
1da177e4
LT
2544
2545static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2546{
40d4e3df
ED
2547 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2548 struct sock *sk = sock->sk;
1da177e4
LT
2549 struct net_device *dev = NULL;
2550 int err;
2551
2552
2553 /*
2554 * Check legality
2555 */
1ce4f28b 2556
1da177e4
LT
2557 if (addr_len < sizeof(struct sockaddr_ll))
2558 return -EINVAL;
2559 if (sll->sll_family != AF_PACKET)
2560 return -EINVAL;
2561
2562 if (sll->sll_ifindex) {
2563 err = -ENODEV;
3b1e0a65 2564 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2565 if (dev == NULL)
2566 goto out;
2567 }
2568 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2569
2570out:
2571 return err;
2572}
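
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * binding a PF_PACKET socket to one interface, which lands in packet_do_bind()
 * above.  "ifname" is a placeholder interface name.
 */
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static int bind_to_iface(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);	/* 0 would keep the current one */
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
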
2573
2574static struct proto packet_proto = {
2575 .name = "PACKET",
2576 .owner = THIS_MODULE,
2577 .obj_size = sizeof(struct packet_sock),
2578};
2579
2580/*
1ce4f28b 2581 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2582 */
2583
3f378b68
EP
2584static int packet_create(struct net *net, struct socket *sock, int protocol,
2585 int kern)
1da177e4
LT
2586{
2587 struct sock *sk;
2588 struct packet_sock *po;
0e11c91e 2589 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2590 int err;
2591
2592 if (!capable(CAP_NET_RAW))
2593 return -EPERM;
be02097c
DM
2594 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2595 sock->type != SOCK_PACKET)
1da177e4
LT
2596 return -ESOCKTNOSUPPORT;
2597
2598 sock->state = SS_UNCONNECTED;
2599
2600 err = -ENOBUFS;
6257ff21 2601 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2602 if (sk == NULL)
2603 goto out;
2604
2605 sock->ops = &packet_ops;
1da177e4
LT
2606 if (sock->type == SOCK_PACKET)
2607 sock->ops = &packet_ops_spkt;
be02097c 2608
1da177e4
LT
2609 sock_init_data(sock, sk);
2610
2611 po = pkt_sk(sk);
2612 sk->sk_family = PF_PACKET;
0e11c91e 2613 po->num = proto;
1da177e4
LT
2614
2615 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2616 sk_refcnt_debug_inc(sk);
1da177e4
LT
2617
2618 /*
2619 * Attach a protocol block
2620 */
2621
2622 spin_lock_init(&po->bind_lock);
905db440 2623 mutex_init(&po->pg_vec_lock);
1da177e4 2624 po->prot_hook.func = packet_rcv;
be02097c 2625
1da177e4
LT
2626 if (sock->type == SOCK_PACKET)
2627 po->prot_hook.func = packet_rcv_spkt;
be02097c 2628
1da177e4
LT
2629 po->prot_hook.af_packet_priv = sk;
2630
0e11c91e
AV
2631 if (proto) {
2632 po->prot_hook.type = proto;
ce06b03e 2633 register_prot_hook(sk);
1da177e4
LT
2634 }
2635
808f5114 2636 spin_lock_bh(&net->packet.sklist_lock);
2637 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 2638 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 2639 spin_unlock_bh(&net->packet.sklist_lock);
2640
40d4e3df 2641 return 0;
1da177e4
LT
2642out:
2643 return err;
2644}
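
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * creating the socket that packet_create() above services.  SOCK_RAW delivers
 * frames with the link-layer header; SOCK_DGRAM strips/builds it; SOCK_PACKET
 * is the obsolete compatibility variant.  Requires CAP_NET_RAW.
 */
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

static int open_capture_socket(void)
{
	/* ETH_P_ALL captures every protocol on the bound interface */
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}
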
2645
ed85b565
RC
2646static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2647{
2648 struct sock_exterr_skb *serr;
2649 struct sk_buff *skb, *skb2;
2650 int copied, err;
2651
2652 err = -EAGAIN;
2653 skb = skb_dequeue(&sk->sk_error_queue);
2654 if (skb == NULL)
2655 goto out;
2656
2657 copied = skb->len;
2658 if (copied > len) {
2659 msg->msg_flags |= MSG_TRUNC;
2660 copied = len;
2661 }
2662 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2663 if (err)
2664 goto out_free_skb;
2665
2666 sock_recv_timestamp(msg, sk, skb);
2667
2668 serr = SKB_EXT_ERR(skb);
2669 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2670 sizeof(serr->ee), &serr->ee);
2671
2672 msg->msg_flags |= MSG_ERRQUEUE;
2673 err = copied;
2674
2675 /* Reset and regenerate socket error */
2676 spin_lock_bh(&sk->sk_error_queue.lock);
2677 sk->sk_err = 0;
2678 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2679 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2680 spin_unlock_bh(&sk->sk_error_queue.lock);
2681 sk->sk_error_report(sk);
2682 } else
2683 spin_unlock_bh(&sk->sk_error_queue.lock);
2684
2685out_free_skb:
2686 kfree_skb(skb);
2687out:
2688 return err;
2689}
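
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * requesting transmit timestamps and draining the error queue serviced by
 * packet_recv_error() above.  The SO_TIMESTAMPING/SCM_TIMESTAMPING fallback
 * defines use the asm-generic value for older libc headers; error handling
 * is trimmed.
 */
#include <stdio.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/net_tstamp.h>

#ifndef SO_TIMESTAMPING
#define SO_TIMESTAMPING		37	/* asm-generic value; older headers */
#endif
#ifndef SCM_TIMESTAMPING
#define SCM_TIMESTAMPING	SO_TIMESTAMPING
#endif

static void enable_tx_timestamps(int fd)
{
	int flags = SOF_TIMESTAMPING_TX_HARDWARE | SOF_TIMESTAMPING_TX_SOFTWARE |
		    SOF_TIMESTAMPING_SOFTWARE  | SOF_TIMESTAMPING_RAW_HARDWARE;

	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
}

static void read_tx_timestamp(int fd)
{
	char data[256], ctrl[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_TIMESTAMPING) {
			/* ts[0] software timestamp, ts[2] raw hardware */
			struct timespec *ts = (struct timespec *)CMSG_DATA(cm);

			printf("tx ts: %lld.%09ld\n",
			       (long long)ts[0].tv_sec, ts[0].tv_nsec);
		}
	}
}
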
2690
1da177e4
LT
2691/*
2692 * Pull a packet from our receive queue and hand it to the user.
2693 * If necessary we block.
2694 */
2695
2696static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2697 struct msghdr *msg, size_t len, int flags)
2698{
2699 struct sock *sk = sock->sk;
2700 struct sk_buff *skb;
2701 int copied, err;
0fb375fb 2702 struct sockaddr_ll *sll;
bfd5f4a3 2703 int vnet_hdr_len = 0;
1da177e4
LT
2704
2705 err = -EINVAL;
ed85b565 2706 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2707 goto out;
2708
2709#if 0
2710 /* What error should we return now? EUNATTACH? */
2711 if (pkt_sk(sk)->ifindex < 0)
2712 return -ENODEV;
2713#endif
2714
ed85b565
RC
2715 if (flags & MSG_ERRQUEUE) {
2716 err = packet_recv_error(sk, msg, len);
2717 goto out;
2718 }
2719
1da177e4
LT
2720 /*
2721 * Call the generic datagram receiver. This handles all sorts
2722 * of horrible races and re-entrancy so we can forget about it
2723 * in the protocol layers.
2724 *
 2725 * Now it will return ENETDOWN, if the device has just gone down,
 2726 * but then it will block.
2727 */
2728
40d4e3df 2729 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2730
2731 /*
1ce4f28b 2732 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 2733 * handles the blocking, we don't need to see or worry about
 2734 * blocking retries.
2735 */
2736
8ae55f04 2737 if (skb == NULL)
1da177e4
LT
2738 goto out;
2739
bfd5f4a3
SS
2740 if (pkt_sk(sk)->has_vnet_hdr) {
2741 struct virtio_net_hdr vnet_hdr = { 0 };
2742
2743 err = -EINVAL;
2744 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2745 if (len < vnet_hdr_len)
bfd5f4a3
SS
2746 goto out_free;
2747
1f18b717
MK
2748 len -= vnet_hdr_len;
2749
bfd5f4a3
SS
2750 if (skb_is_gso(skb)) {
2751 struct skb_shared_info *sinfo = skb_shinfo(skb);
2752
2753 /* This is a hint as to how much should be linear. */
2754 vnet_hdr.hdr_len = skb_headlen(skb);
2755 vnet_hdr.gso_size = sinfo->gso_size;
2756 if (sinfo->gso_type & SKB_GSO_TCPV4)
2757 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2758 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2759 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2760 else if (sinfo->gso_type & SKB_GSO_UDP)
2761 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2762 else if (sinfo->gso_type & SKB_GSO_FCOE)
2763 goto out_free;
2764 else
2765 BUG();
2766 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2767 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2768 } else
2769 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2770
2771 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2772 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2773 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2774 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2775 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2776 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2777 } /* else everything is zero */
2778
2779 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2780 vnet_hdr_len);
2781 if (err < 0)
2782 goto out_free;
2783 }
2784
0fb375fb
EB
2785 /*
2786 * If the address length field is there to be filled in, we fill
2787 * it in now.
2788 */
2789
ffbc6111 2790 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2791 if (sock->type == SOCK_PACKET)
2792 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2793 else
2794 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2795
1da177e4
LT
2796 /*
2797 * You lose any data beyond the buffer you gave. If it worries a
 2798 * user program, it can ask the device for its MTU anyway.
2799 */
2800
2801 copied = skb->len;
40d4e3df
ED
2802 if (copied > len) {
2803 copied = len;
2804 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2805 }
2806
2807 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2808 if (err)
2809 goto out_free;
2810
3b885787 2811 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2812
2813 if (msg->msg_name)
ffbc6111
HX
2814 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2815 msg->msg_namelen);
1da177e4 2816
8dc41944 2817 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2818 struct tpacket_auxdata aux;
2819
2820 aux.tp_status = TP_STATUS_USER;
2821 if (skb->ip_summed == CHECKSUM_PARTIAL)
2822 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2823 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2824 aux.tp_snaplen = skb->len;
2825 aux.tp_mac = 0;
bbe735e4 2826 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2827 if (vlan_tx_tag_present(skb)) {
2828 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2829 aux.tp_status |= TP_STATUS_VLAN_VALID;
2830 } else {
2831 aux.tp_vlan_tci = 0;
2832 }
13fcb7bd 2833 aux.tp_padding = 0;
ffbc6111 2834 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2835 }
2836
1da177e4
LT
2837 /*
2838 * Free or return the buffer as appropriate. Again this
2839 * hides all the races and re-entrancy issues from us.
2840 */
bfd5f4a3 2841 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2842
2843out_free:
2844 skb_free_datagram(sk, skb);
2845out:
2846 return err;
2847}
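
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * receiving one packet together with the PACKET_AUXDATA control message that
 * packet_recvmsg() above fills in.  Assumes PACKET_AUXDATA was already
 * enabled with setsockopt(); error handling is trimmed.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static ssize_t recv_with_auxdata(int fd, void *buf, size_t len)
{
	char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;
	ssize_t n = recvmsg(fd, &msg, 0);

	if (n < 0)
		return n;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_PACKET &&
		    cm->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cm);

			printf("len %u snaplen %u vlan tci %u\n",
			       aux->tp_len, aux->tp_snaplen, aux->tp_vlan_tci);
		}
	}
	return n;
}
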
2848
1da177e4
LT
2849static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2850 int *uaddr_len, int peer)
2851{
2852 struct net_device *dev;
2853 struct sock *sk = sock->sk;
2854
2855 if (peer)
2856 return -EOPNOTSUPP;
2857
2858 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2859 rcu_read_lock();
2860 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2861 if (dev)
67286640 2862 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2863 else
1da177e4 2864 memset(uaddr->sa_data, 0, 14);
654d1f8a 2865 rcu_read_unlock();
1da177e4
LT
2866 *uaddr_len = sizeof(*uaddr);
2867
2868 return 0;
2869}
1da177e4
LT
2870
2871static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2872 int *uaddr_len, int peer)
2873{
2874 struct net_device *dev;
2875 struct sock *sk = sock->sk;
2876 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2877 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2878
2879 if (peer)
2880 return -EOPNOTSUPP;
2881
2882 sll->sll_family = AF_PACKET;
2883 sll->sll_ifindex = po->ifindex;
2884 sll->sll_protocol = po->num;
67286640 2885 sll->sll_pkttype = 0;
654d1f8a
ED
2886 rcu_read_lock();
2887 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2888 if (dev) {
2889 sll->sll_hatype = dev->type;
2890 sll->sll_halen = dev->addr_len;
2891 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2892 } else {
2893 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2894 sll->sll_halen = 0;
2895 }
654d1f8a 2896 rcu_read_unlock();
0fb375fb 2897 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2898
2899 return 0;
2900}
2901
2aeb0b88
WC
2902static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2903 int what)
1da177e4
LT
2904{
2905 switch (i->type) {
2906 case PACKET_MR_MULTICAST:
1162563f
JP
2907 if (i->alen != dev->addr_len)
2908 return -EINVAL;
1da177e4 2909 if (what > 0)
22bedad3 2910 return dev_mc_add(dev, i->addr);
1da177e4 2911 else
22bedad3 2912 return dev_mc_del(dev, i->addr);
1da177e4
LT
2913 break;
2914 case PACKET_MR_PROMISC:
2aeb0b88 2915 return dev_set_promiscuity(dev, what);
1da177e4
LT
2916 break;
2917 case PACKET_MR_ALLMULTI:
2aeb0b88 2918 return dev_set_allmulti(dev, what);
1da177e4 2919 break;
d95ed927 2920 case PACKET_MR_UNICAST:
1162563f
JP
2921 if (i->alen != dev->addr_len)
2922 return -EINVAL;
d95ed927 2923 if (what > 0)
a748ee24 2924 return dev_uc_add(dev, i->addr);
d95ed927 2925 else
a748ee24 2926 return dev_uc_del(dev, i->addr);
d95ed927 2927 break;
40d4e3df
ED
2928 default:
2929 break;
1da177e4 2930 }
2aeb0b88 2931 return 0;
1da177e4
LT
2932}
2933
2934static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2935{
40d4e3df 2936 for ( ; i; i = i->next) {
1da177e4
LT
2937 if (i->ifindex == dev->ifindex)
2938 packet_dev_mc(dev, i, what);
2939 }
2940}
2941
0fb375fb 2942static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2943{
2944 struct packet_sock *po = pkt_sk(sk);
2945 struct packet_mclist *ml, *i;
2946 struct net_device *dev;
2947 int err;
2948
2949 rtnl_lock();
2950
2951 err = -ENODEV;
3b1e0a65 2952 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2953 if (!dev)
2954 goto done;
2955
2956 err = -EINVAL;
1162563f 2957 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2958 goto done;
2959
2960 err = -ENOBUFS;
8b3a7005 2961 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2962 if (i == NULL)
2963 goto done;
2964
2965 err = 0;
2966 for (ml = po->mclist; ml; ml = ml->next) {
2967 if (ml->ifindex == mreq->mr_ifindex &&
2968 ml->type == mreq->mr_type &&
2969 ml->alen == mreq->mr_alen &&
2970 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2971 ml->count++;
2972 /* Free the new element ... */
2973 kfree(i);
2974 goto done;
2975 }
2976 }
2977
2978 i->type = mreq->mr_type;
2979 i->ifindex = mreq->mr_ifindex;
2980 i->alen = mreq->mr_alen;
2981 memcpy(i->addr, mreq->mr_address, i->alen);
2982 i->count = 1;
2983 i->next = po->mclist;
2984 po->mclist = i;
2aeb0b88
WC
2985 err = packet_dev_mc(dev, i, 1);
2986 if (err) {
2987 po->mclist = i->next;
2988 kfree(i);
2989 }
1da177e4
LT
2990
2991done:
2992 rtnl_unlock();
2993 return err;
2994}
2995
0fb375fb 2996static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2997{
2998 struct packet_mclist *ml, **mlp;
2999
3000 rtnl_lock();
3001
3002 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3003 if (ml->ifindex == mreq->mr_ifindex &&
3004 ml->type == mreq->mr_type &&
3005 ml->alen == mreq->mr_alen &&
3006 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3007 if (--ml->count == 0) {
3008 struct net_device *dev;
3009 *mlp = ml->next;
ad959e76
ED
3010 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3011 if (dev)
1da177e4 3012 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3013 kfree(ml);
3014 }
3015 rtnl_unlock();
3016 return 0;
3017 }
3018 }
3019 rtnl_unlock();
3020 return -EADDRNOTAVAIL;
3021}
3022
3023static void packet_flush_mclist(struct sock *sk)
3024{
3025 struct packet_sock *po = pkt_sk(sk);
3026 struct packet_mclist *ml;
3027
3028 if (!po->mclist)
3029 return;
3030
3031 rtnl_lock();
3032 while ((ml = po->mclist) != NULL) {
3033 struct net_device *dev;
3034
3035 po->mclist = ml->next;
ad959e76
ED
3036 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3037 if (dev != NULL)
1da177e4 3038 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3039 kfree(ml);
3040 }
3041 rtnl_unlock();
3042}
1da177e4
LT
3043
3044static int
b7058842 3045packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3046{
3047 struct sock *sk = sock->sk;
8dc41944 3048 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3049 int ret;
3050
3051 if (level != SOL_PACKET)
3052 return -ENOPROTOOPT;
3053
69e3c75f 3054 switch (optname) {
1ce4f28b 3055 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3056 case PACKET_DROP_MEMBERSHIP:
3057 {
0fb375fb
EB
3058 struct packet_mreq_max mreq;
3059 int len = optlen;
3060 memset(&mreq, 0, sizeof(mreq));
3061 if (len < sizeof(struct packet_mreq))
1da177e4 3062 return -EINVAL;
0fb375fb
EB
3063 if (len > sizeof(mreq))
3064 len = sizeof(mreq);
40d4e3df 3065 if (copy_from_user(&mreq, optval, len))
1da177e4 3066 return -EFAULT;
0fb375fb
EB
3067 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3068 return -EINVAL;
1da177e4
LT
3069 if (optname == PACKET_ADD_MEMBERSHIP)
3070 ret = packet_mc_add(sk, &mreq);
3071 else
3072 ret = packet_mc_drop(sk, &mreq);
3073 return ret;
3074 }
a2efcfa0 3075
1da177e4 3076 case PACKET_RX_RING:
69e3c75f 3077 case PACKET_TX_RING:
1da177e4 3078 {
f6fb8f10 3079 union tpacket_req_u req_u;
3080 int len;
1da177e4 3081
f6fb8f10 3082 switch (po->tp_version) {
3083 case TPACKET_V1:
3084 case TPACKET_V2:
3085 len = sizeof(req_u.req);
3086 break;
3087 case TPACKET_V3:
3088 default:
3089 len = sizeof(req_u.req3);
3090 break;
3091 }
3092 if (optlen < len)
1da177e4 3093 return -EINVAL;
bfd5f4a3
SS
3094 if (pkt_sk(sk)->has_vnet_hdr)
3095 return -EINVAL;
f6fb8f10 3096 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3097 return -EFAULT;
f6fb8f10 3098 return packet_set_ring(sk, &req_u, 0,
3099 optname == PACKET_TX_RING);
1da177e4
LT
3100 }
3101 case PACKET_COPY_THRESH:
3102 {
3103 int val;
3104
40d4e3df 3105 if (optlen != sizeof(val))
1da177e4 3106 return -EINVAL;
40d4e3df 3107 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3108 return -EFAULT;
3109
3110 pkt_sk(sk)->copy_thresh = val;
3111 return 0;
3112 }
bbd6ef87
PM
3113 case PACKET_VERSION:
3114 {
3115 int val;
3116
3117 if (optlen != sizeof(val))
3118 return -EINVAL;
69e3c75f 3119 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3120 return -EBUSY;
3121 if (copy_from_user(&val, optval, sizeof(val)))
3122 return -EFAULT;
3123 switch (val) {
3124 case TPACKET_V1:
3125 case TPACKET_V2:
f6fb8f10 3126 case TPACKET_V3:
bbd6ef87
PM
3127 po->tp_version = val;
3128 return 0;
3129 default:
3130 return -EINVAL;
3131 }
3132 }
8913336a
PM
3133 case PACKET_RESERVE:
3134 {
3135 unsigned int val;
3136
3137 if (optlen != sizeof(val))
3138 return -EINVAL;
69e3c75f 3139 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3140 return -EBUSY;
3141 if (copy_from_user(&val, optval, sizeof(val)))
3142 return -EFAULT;
3143 po->tp_reserve = val;
3144 return 0;
3145 }
69e3c75f
JB
3146 case PACKET_LOSS:
3147 {
3148 unsigned int val;
3149
3150 if (optlen != sizeof(val))
3151 return -EINVAL;
3152 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3153 return -EBUSY;
3154 if (copy_from_user(&val, optval, sizeof(val)))
3155 return -EFAULT;
3156 po->tp_loss = !!val;
3157 return 0;
3158 }
8dc41944
HX
3159 case PACKET_AUXDATA:
3160 {
3161 int val;
3162
3163 if (optlen < sizeof(val))
3164 return -EINVAL;
3165 if (copy_from_user(&val, optval, sizeof(val)))
3166 return -EFAULT;
3167
3168 po->auxdata = !!val;
3169 return 0;
3170 }
80feaacb
PWJ
3171 case PACKET_ORIGDEV:
3172 {
3173 int val;
3174
3175 if (optlen < sizeof(val))
3176 return -EINVAL;
3177 if (copy_from_user(&val, optval, sizeof(val)))
3178 return -EFAULT;
3179
3180 po->origdev = !!val;
3181 return 0;
3182 }
bfd5f4a3
SS
3183 case PACKET_VNET_HDR:
3184 {
3185 int val;
3186
3187 if (sock->type != SOCK_RAW)
3188 return -EINVAL;
3189 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3190 return -EBUSY;
3191 if (optlen < sizeof(val))
3192 return -EINVAL;
3193 if (copy_from_user(&val, optval, sizeof(val)))
3194 return -EFAULT;
3195
3196 po->has_vnet_hdr = !!val;
3197 return 0;
3198 }
614f60fa
SM
3199 case PACKET_TIMESTAMP:
3200 {
3201 int val;
3202
3203 if (optlen != sizeof(val))
3204 return -EINVAL;
3205 if (copy_from_user(&val, optval, sizeof(val)))
3206 return -EFAULT;
3207
3208 po->tp_tstamp = val;
3209 return 0;
3210 }
dc99f600
DM
3211 case PACKET_FANOUT:
3212 {
3213 int val;
3214
3215 if (optlen != sizeof(val))
3216 return -EINVAL;
3217 if (copy_from_user(&val, optval, sizeof(val)))
3218 return -EFAULT;
3219
3220 return fanout_add(sk, val & 0xffff, val >> 16);
3221 }
1da177e4
LT
3222 default:
3223 return -ENOPROTOOPT;
3224 }
3225}
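
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * the PACKET_ADD_MEMBERSHIP option handled above, used here to put one
 * interface into promiscuous mode for the lifetime of the socket.
 * "ifindex" is a placeholder supplied by the caller.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd, int ifindex)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}
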
3226
3227static int packet_getsockopt(struct socket *sock, int level, int optname,
3228 char __user *optval, int __user *optlen)
3229{
3230 int len;
8dc41944 3231 int val;
1da177e4
LT
3232 struct sock *sk = sock->sk;
3233 struct packet_sock *po = pkt_sk(sk);
8dc41944
HX
3234 void *data;
3235 struct tpacket_stats st;
f6fb8f10 3236 union tpacket_stats_u st_u;
1da177e4
LT
3237
3238 if (level != SOL_PACKET)
3239 return -ENOPROTOOPT;
3240
8ae55f04
KK
3241 if (get_user(len, optlen))
3242 return -EFAULT;
1da177e4
LT
3243
3244 if (len < 0)
3245 return -EINVAL;
1ce4f28b 3246
69e3c75f 3247 switch (optname) {
1da177e4 3248 case PACKET_STATISTICS:
f6fb8f10 3249 if (po->tp_version == TPACKET_V3) {
3250 len = sizeof(struct tpacket_stats_v3);
3251 } else {
3252 if (len > sizeof(struct tpacket_stats))
3253 len = sizeof(struct tpacket_stats);
3254 }
1da177e4 3255 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3256 if (po->tp_version == TPACKET_V3) {
3257 memcpy(&st_u.stats3, &po->stats,
3258 sizeof(struct tpacket_stats));
3259 st_u.stats3.tp_freeze_q_cnt =
3260 po->stats_u.stats3.tp_freeze_q_cnt;
3261 st_u.stats3.tp_packets += po->stats.tp_drops;
3262 data = &st_u.stats3;
3263 } else {
3264 st = po->stats;
3265 st.tp_packets += st.tp_drops;
3266 data = &st;
3267 }
1da177e4
LT
3268 memset(&po->stats, 0, sizeof(st));
3269 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3270 break;
3271 case PACKET_AUXDATA:
3272 if (len > sizeof(int))
3273 len = sizeof(int);
3274 val = po->auxdata;
3275
80feaacb
PWJ
3276 data = &val;
3277 break;
3278 case PACKET_ORIGDEV:
3279 if (len > sizeof(int))
3280 len = sizeof(int);
3281 val = po->origdev;
3282
bfd5f4a3
SS
3283 data = &val;
3284 break;
3285 case PACKET_VNET_HDR:
3286 if (len > sizeof(int))
3287 len = sizeof(int);
3288 val = po->has_vnet_hdr;
3289
8dc41944 3290 data = &val;
1da177e4 3291 break;
bbd6ef87
PM
3292 case PACKET_VERSION:
3293 if (len > sizeof(int))
3294 len = sizeof(int);
3295 val = po->tp_version;
3296 data = &val;
3297 break;
3298 case PACKET_HDRLEN:
3299 if (len > sizeof(int))
3300 len = sizeof(int);
3301 if (copy_from_user(&val, optval, len))
3302 return -EFAULT;
3303 switch (val) {
3304 case TPACKET_V1:
3305 val = sizeof(struct tpacket_hdr);
3306 break;
3307 case TPACKET_V2:
3308 val = sizeof(struct tpacket2_hdr);
3309 break;
f6fb8f10 3310 case TPACKET_V3:
3311 val = sizeof(struct tpacket3_hdr);
3312 break;
bbd6ef87
PM
3313 default:
3314 return -EINVAL;
3315 }
3316 data = &val;
3317 break;
8913336a
PM
3318 case PACKET_RESERVE:
3319 if (len > sizeof(unsigned int))
3320 len = sizeof(unsigned int);
3321 val = po->tp_reserve;
3322 data = &val;
3323 break;
69e3c75f
JB
3324 case PACKET_LOSS:
3325 if (len > sizeof(unsigned int))
3326 len = sizeof(unsigned int);
3327 val = po->tp_loss;
3328 data = &val;
3329 break;
614f60fa
SM
3330 case PACKET_TIMESTAMP:
3331 if (len > sizeof(int))
3332 len = sizeof(int);
3333 val = po->tp_tstamp;
3334 data = &val;
3335 break;
dc99f600
DM
3336 case PACKET_FANOUT:
3337 if (len > sizeof(int))
3338 len = sizeof(int);
3339 val = (po->fanout ?
3340 ((u32)po->fanout->id |
3341 ((u32)po->fanout->type << 16)) :
3342 0);
3343 data = &val;
3344 break;
1da177e4
LT
3345 default:
3346 return -ENOPROTOOPT;
3347 }
3348
8ae55f04
KK
3349 if (put_user(len, optlen))
3350 return -EFAULT;
8dc41944
HX
3351 if (copy_to_user(optval, data, len))
3352 return -EFAULT;
8ae55f04 3353 return 0;
1da177e4
LT
3354}
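
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * reading PACKET_STATISTICS from the getsockopt() path above.  The counters
 * are clear-on-read, so each call returns and resets the numbers accumulated
 * since the previous call.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}
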
3355
3356
3357static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3358{
3359 struct sock *sk;
3360 struct hlist_node *node;
ad930650 3361 struct net_device *dev = data;
c346dca1 3362 struct net *net = dev_net(dev);
1da177e4 3363
808f5114 3364 rcu_read_lock();
3365 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
3366 struct packet_sock *po = pkt_sk(sk);
3367
3368 switch (msg) {
3369 case NETDEV_UNREGISTER:
1da177e4
LT
3370 if (po->mclist)
3371 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3372 /* fallthrough */
3373
1da177e4
LT
3374 case NETDEV_DOWN:
3375 if (dev->ifindex == po->ifindex) {
3376 spin_lock(&po->bind_lock);
3377 if (po->running) {
ce06b03e 3378 __unregister_prot_hook(sk, false);
1da177e4
LT
3379 sk->sk_err = ENETDOWN;
3380 if (!sock_flag(sk, SOCK_DEAD))
3381 sk->sk_error_report(sk);
3382 }
3383 if (msg == NETDEV_UNREGISTER) {
3384 po->ifindex = -1;
160ff18a
BG
3385 if (po->prot_hook.dev)
3386 dev_put(po->prot_hook.dev);
1da177e4
LT
3387 po->prot_hook.dev = NULL;
3388 }
3389 spin_unlock(&po->bind_lock);
3390 }
3391 break;
3392 case NETDEV_UP:
808f5114 3393 if (dev->ifindex == po->ifindex) {
3394 spin_lock(&po->bind_lock);
ce06b03e
DM
3395 if (po->num)
3396 register_prot_hook(sk);
808f5114 3397 spin_unlock(&po->bind_lock);
1da177e4 3398 }
1da177e4
LT
3399 break;
3400 }
3401 }
808f5114 3402 rcu_read_unlock();
1da177e4
LT
3403 return NOTIFY_DONE;
3404}
3405
3406
3407static int packet_ioctl(struct socket *sock, unsigned int cmd,
3408 unsigned long arg)
3409{
3410 struct sock *sk = sock->sk;
3411
69e3c75f 3412 switch (cmd) {
40d4e3df
ED
3413 case SIOCOUTQ:
3414 {
3415 int amount = sk_wmem_alloc_get(sk);
31e6d363 3416
40d4e3df
ED
3417 return put_user(amount, (int __user *)arg);
3418 }
3419 case SIOCINQ:
3420 {
3421 struct sk_buff *skb;
3422 int amount = 0;
3423
3424 spin_lock_bh(&sk->sk_receive_queue.lock);
3425 skb = skb_peek(&sk->sk_receive_queue);
3426 if (skb)
3427 amount = skb->len;
3428 spin_unlock_bh(&sk->sk_receive_queue.lock);
3429 return put_user(amount, (int __user *)arg);
3430 }
3431 case SIOCGSTAMP:
3432 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3433 case SIOCGSTAMPNS:
3434 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3435
1da177e4 3436#ifdef CONFIG_INET
40d4e3df
ED
3437 case SIOCADDRT:
3438 case SIOCDELRT:
3439 case SIOCDARP:
3440 case SIOCGARP:
3441 case SIOCSARP:
3442 case SIOCGIFADDR:
3443 case SIOCSIFADDR:
3444 case SIOCGIFBRDADDR:
3445 case SIOCSIFBRDADDR:
3446 case SIOCGIFNETMASK:
3447 case SIOCSIFNETMASK:
3448 case SIOCGIFDSTADDR:
3449 case SIOCSIFDSTADDR:
3450 case SIOCSIFFLAGS:
40d4e3df 3451 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3452#endif
3453
40d4e3df
ED
3454 default:
3455 return -ENOIOCTLCMD;
1da177e4
LT
3456 }
3457 return 0;
3458}
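
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * the two most common ioctls handled above - SIOCINQ for the size of the next
 * queued packet and SIOCGSTAMP for the receive timestamp of the last packet
 * read.  Error handling is trimmed.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>

static void query_socket(int fd)
{
	int pending = 0;
	struct timeval tv;

	if (ioctl(fd, SIOCINQ, &pending) == 0)
		printf("next packet: %d bytes\n", pending);
	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
		printf("last rx at %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
}
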
3459
40d4e3df 3460static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3461 poll_table *wait)
3462{
3463 struct sock *sk = sock->sk;
3464 struct packet_sock *po = pkt_sk(sk);
3465 unsigned int mask = datagram_poll(file, sock, wait);
3466
3467 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3468 if (po->rx_ring.pg_vec) {
f6fb8f10 3469 if (!packet_previous_rx_frame(po, &po->rx_ring,
3470 TP_STATUS_KERNEL))
1da177e4
LT
3471 mask |= POLLIN | POLLRDNORM;
3472 }
3473 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3474 spin_lock_bh(&sk->sk_write_queue.lock);
3475 if (po->tx_ring.pg_vec) {
3476 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3477 mask |= POLLOUT | POLLWRNORM;
3478 }
3479 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3480 return mask;
3481}
3482
3483
 3484/* Dirty? Well, I still have not learned a better way to account
 3485 * for user mmaps.
3486 */
3487
3488static void packet_mm_open(struct vm_area_struct *vma)
3489{
3490 struct file *file = vma->vm_file;
40d4e3df 3491 struct socket *sock = file->private_data;
1da177e4 3492 struct sock *sk = sock->sk;
1ce4f28b 3493
1da177e4
LT
3494 if (sk)
3495 atomic_inc(&pkt_sk(sk)->mapped);
3496}
3497
3498static void packet_mm_close(struct vm_area_struct *vma)
3499{
3500 struct file *file = vma->vm_file;
40d4e3df 3501 struct socket *sock = file->private_data;
1da177e4 3502 struct sock *sk = sock->sk;
1ce4f28b 3503
1da177e4
LT
3504 if (sk)
3505 atomic_dec(&pkt_sk(sk)->mapped);
3506}
3507
f0f37e2f 3508static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3509 .open = packet_mm_open,
3510 .close = packet_mm_close,
1da177e4
LT
3511};
3512
0e3125c7
NH
3513static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3514 unsigned int len)
1da177e4
LT
3515{
3516 int i;
3517
4ebf0ae2 3518 for (i = 0; i < len; i++) {
0e3125c7 3519 if (likely(pg_vec[i].buffer)) {
c56b4d90 3520 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3521 vfree(pg_vec[i].buffer);
3522 else
3523 free_pages((unsigned long)pg_vec[i].buffer,
3524 order);
3525 pg_vec[i].buffer = NULL;
3526 }
1da177e4
LT
3527 }
3528 kfree(pg_vec);
3529}
3530
c56b4d90 3531static inline char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3532{
0e3125c7
NH
3533 char *buffer = NULL;
3534 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3535 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3536
3537 buffer = (char *) __get_free_pages(gfp_flags, order);
3538
3539 if (buffer)
3540 return buffer;
3541
3542 /*
3543 * __get_free_pages failed, fall back to vmalloc
3544 */
bbce5a59 3545 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3546
0e3125c7
NH
3547 if (buffer)
3548 return buffer;
3549
3550 /*
 3551 * vmalloc failed, let's dig into swap here
3552 */
0e3125c7
NH
3553 gfp_flags &= ~__GFP_NORETRY;
3554 buffer = (char *)__get_free_pages(gfp_flags, order);
3555 if (buffer)
3556 return buffer;
3557
3558 /*
3559 * complete and utter failure
3560 */
3561 return NULL;
4ebf0ae2
DM
3562}
3563
0e3125c7 3564static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3565{
3566 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3567 struct pgv *pg_vec;
4ebf0ae2
DM
3568 int i;
3569
0e3125c7 3570 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3571 if (unlikely(!pg_vec))
3572 goto out;
3573
3574 for (i = 0; i < block_nr; i++) {
c56b4d90 3575 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3576 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3577 goto out_free_pgvec;
3578 }
3579
3580out:
3581 return pg_vec;
3582
3583out_free_pgvec:
3584 free_pg_vec(pg_vec, order, block_nr);
3585 pg_vec = NULL;
3586 goto out;
3587}
1da177e4 3588
f6fb8f10 3589static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3590 int closing, int tx_ring)
1da177e4 3591{
0e3125c7 3592 struct pgv *pg_vec = NULL;
1da177e4 3593 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3594 int was_running, order = 0;
69e3c75f
JB
3595 struct packet_ring_buffer *rb;
3596 struct sk_buff_head *rb_queue;
0e11c91e 3597 __be16 num;
f6fb8f10 3598 int err = -EINVAL;
 3599 /* Added to keep code churn minimal */
3600 struct tpacket_req *req = &req_u->req;
3601
3602 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3603 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3604 WARN(1, "Tx-ring is not supported.\n");
3605 goto out;
3606 }
1ce4f28b 3607
69e3c75f
JB
3608 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3609 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3610
69e3c75f
JB
3611 err = -EBUSY;
3612 if (!closing) {
3613 if (atomic_read(&po->mapped))
3614 goto out;
3615 if (atomic_read(&rb->pending))
3616 goto out;
3617 }
1da177e4 3618
69e3c75f
JB
3619 if (req->tp_block_nr) {
3620 /* Sanity tests and some calculations */
3621 err = -EBUSY;
3622 if (unlikely(rb->pg_vec))
3623 goto out;
1da177e4 3624
bbd6ef87
PM
3625 switch (po->tp_version) {
3626 case TPACKET_V1:
3627 po->tp_hdrlen = TPACKET_HDRLEN;
3628 break;
3629 case TPACKET_V2:
3630 po->tp_hdrlen = TPACKET2_HDRLEN;
3631 break;
f6fb8f10 3632 case TPACKET_V3:
3633 po->tp_hdrlen = TPACKET3_HDRLEN;
3634 break;
bbd6ef87
PM
3635 }
3636
69e3c75f 3637 err = -EINVAL;
4ebf0ae2 3638 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3639 goto out;
4ebf0ae2 3640 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3641 goto out;
8913336a 3642 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3643 po->tp_reserve))
3644 goto out;
4ebf0ae2 3645 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3646 goto out;
1da177e4 3647
69e3c75f
JB
3648 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3649 if (unlikely(rb->frames_per_block <= 0))
3650 goto out;
3651 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3652 req->tp_frame_nr))
3653 goto out;
1da177e4
LT
3654
3655 err = -ENOMEM;
4ebf0ae2
DM
3656 order = get_order(req->tp_block_size);
3657 pg_vec = alloc_pg_vec(req, order);
3658 if (unlikely(!pg_vec))
1da177e4 3659 goto out;
f6fb8f10 3660 switch (po->tp_version) {
3661 case TPACKET_V3:
3662 /* Transmit path is not supported. We checked
 3663 * it above, but just being paranoid.
3664 */
3665 if (!tx_ring)
3666 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3667 break;
3668 default:
3669 break;
3670 }
69e3c75f
JB
3671 }
3672 /* Done */
3673 else {
3674 err = -EINVAL;
4ebf0ae2 3675 if (unlikely(req->tp_frame_nr))
69e3c75f 3676 goto out;
1da177e4
LT
3677 }
3678
3679 lock_sock(sk);
3680
3681 /* Detach socket from network */
3682 spin_lock(&po->bind_lock);
3683 was_running = po->running;
3684 num = po->num;
3685 if (was_running) {
1da177e4 3686 po->num = 0;
ce06b03e 3687 __unregister_prot_hook(sk, false);
1da177e4
LT
3688 }
3689 spin_unlock(&po->bind_lock);
1ce4f28b 3690
1da177e4
LT
3691 synchronize_net();
3692
3693 err = -EBUSY;
905db440 3694 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3695 if (closing || atomic_read(&po->mapped) == 0) {
3696 err = 0;
69e3c75f 3697 spin_lock_bh(&rb_queue->lock);
c053fd96 3698 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3699 rb->frame_max = (req->tp_frame_nr - 1);
3700 rb->head = 0;
3701 rb->frame_size = req->tp_frame_size;
3702 spin_unlock_bh(&rb_queue->lock);
3703
c053fd96
CG
3704 swap(rb->pg_vec_order, order);
3705 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3706
3707 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3708 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3709 tpacket_rcv : packet_rcv;
3710 skb_queue_purge(rb_queue);
1da177e4 3711 if (atomic_read(&po->mapped))
40d4e3df
ED
3712 pr_err("packet_mmap: vma is busy: %d\n",
3713 atomic_read(&po->mapped));
1da177e4 3714 }
905db440 3715 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3716
3717 spin_lock(&po->bind_lock);
ce06b03e 3718 if (was_running) {
1da177e4 3719 po->num = num;
ce06b03e 3720 register_prot_hook(sk);
1da177e4
LT
3721 }
3722 spin_unlock(&po->bind_lock);
f6fb8f10 3723 if (closing && (po->tp_version > TPACKET_V2)) {
3724 /* Because we don't support block-based V3 on tx-ring */
3725 if (!tx_ring)
3726 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3727 }
1da177e4
LT
3728 release_sock(sk);
3729
1da177e4
LT
3730 if (pg_vec)
3731 free_pg_vec(pg_vec, order, req->tp_block_nr);
3732out:
3733 return err;
3734}
3735
69e3c75f
JB
3736static int packet_mmap(struct file *file, struct socket *sock,
3737 struct vm_area_struct *vma)
1da177e4
LT
3738{
3739 struct sock *sk = sock->sk;
3740 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3741 unsigned long size, expected_size;
3742 struct packet_ring_buffer *rb;
1da177e4
LT
3743 unsigned long start;
3744 int err = -EINVAL;
3745 int i;
3746
3747 if (vma->vm_pgoff)
3748 return -EINVAL;
3749
905db440 3750 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3751
3752 expected_size = 0;
3753 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3754 if (rb->pg_vec) {
3755 expected_size += rb->pg_vec_len
3756 * rb->pg_vec_pages
3757 * PAGE_SIZE;
3758 }
3759 }
3760
3761 if (expected_size == 0)
1da177e4 3762 goto out;
69e3c75f
JB
3763
3764 size = vma->vm_end - vma->vm_start;
3765 if (size != expected_size)
1da177e4
LT
3766 goto out;
3767
1da177e4 3768 start = vma->vm_start;
69e3c75f
JB
3769 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3770 if (rb->pg_vec == NULL)
3771 continue;
3772
3773 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3774 struct page *page;
3775 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3776 int pg_num;
3777
c56b4d90
CG
3778 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3779 page = pgv_to_page(kaddr);
69e3c75f
JB
3780 err = vm_insert_page(vma, start, page);
3781 if (unlikely(err))
3782 goto out;
3783 start += PAGE_SIZE;
0e3125c7 3784 kaddr += PAGE_SIZE;
69e3c75f 3785 }
4ebf0ae2 3786 }
1da177e4 3787 }
69e3c75f 3788
4ebf0ae2 3789 atomic_inc(&po->mapped);
1da177e4
LT
3790 vma->vm_ops = &packet_mmap_ops;
3791 err = 0;
3792
3793out:
905db440 3794 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3795 return err;
3796}
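
/*
 * Illustrative userspace sketch (editor's addition, not part of af_packet.c):
 * the setup sequence that ends up in packet_set_ring() and packet_mmap()
 * above - pick a header version, size the RX ring, then map it.  The block
 * and frame sizes below are placeholder values chosen to satisfy the sanity
 * checks in packet_set_ring(); error handling is trimmed.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static char *setup_rx_ring(int fd, struct tpacket_req *req)
{
	int version = TPACKET_V2;
	void *ring;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return NULL;

	memset(req, 0, sizeof(*req));
	req->tp_block_size = 4096;	/* must be a multiple of PAGE_SIZE */
	req->tp_frame_size = 2048;	/* must be TPACKET_ALIGNMENT aligned */
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size)
			     * req->tp_block_nr;

	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)) < 0)
		return NULL;

	/* one mapping covers the whole ring; see packet_mmap() above */
	ring = mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return ring == MAP_FAILED ? NULL : ring;
}
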
1da177e4 3797
90ddc4f0 3798static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3799 .family = PF_PACKET,
3800 .owner = THIS_MODULE,
3801 .release = packet_release,
3802 .bind = packet_bind_spkt,
3803 .connect = sock_no_connect,
3804 .socketpair = sock_no_socketpair,
3805 .accept = sock_no_accept,
3806 .getname = packet_getname_spkt,
3807 .poll = datagram_poll,
3808 .ioctl = packet_ioctl,
3809 .listen = sock_no_listen,
3810 .shutdown = sock_no_shutdown,
3811 .setsockopt = sock_no_setsockopt,
3812 .getsockopt = sock_no_getsockopt,
3813 .sendmsg = packet_sendmsg_spkt,
3814 .recvmsg = packet_recvmsg,
3815 .mmap = sock_no_mmap,
3816 .sendpage = sock_no_sendpage,
3817};
1da177e4 3818
90ddc4f0 3819static const struct proto_ops packet_ops = {
1da177e4
LT
3820 .family = PF_PACKET,
3821 .owner = THIS_MODULE,
3822 .release = packet_release,
3823 .bind = packet_bind,
3824 .connect = sock_no_connect,
3825 .socketpair = sock_no_socketpair,
3826 .accept = sock_no_accept,
1ce4f28b 3827 .getname = packet_getname,
1da177e4
LT
3828 .poll = packet_poll,
3829 .ioctl = packet_ioctl,
3830 .listen = sock_no_listen,
3831 .shutdown = sock_no_shutdown,
3832 .setsockopt = packet_setsockopt,
3833 .getsockopt = packet_getsockopt,
3834 .sendmsg = packet_sendmsg,
3835 .recvmsg = packet_recvmsg,
3836 .mmap = packet_mmap,
3837 .sendpage = sock_no_sendpage,
3838};
3839
ec1b4cf7 3840static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3841 .family = PF_PACKET,
3842 .create = packet_create,
3843 .owner = THIS_MODULE,
3844};
3845
3846static struct notifier_block packet_netdev_notifier = {
40d4e3df 3847 .notifier_call = packet_notifier,
1da177e4
LT
3848};
3849
3850#ifdef CONFIG_PROC_FS
1da177e4
LT
3851
3852static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3853 __acquires(RCU)
1da177e4 3854{
e372c414 3855 struct net *net = seq_file_net(seq);
808f5114 3856
3857 rcu_read_lock();
3858 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3859}
3860
3861static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3862{
1bf40954 3863 struct net *net = seq_file_net(seq);
808f5114 3864 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3865}
3866
3867static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3868 __releases(RCU)
1da177e4 3869{
808f5114 3870 rcu_read_unlock();
1da177e4
LT
3871}
3872
1ce4f28b 3873static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3874{
3875 if (v == SEQ_START_TOKEN)
3876 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3877 else {
b7ceabd9 3878 struct sock *s = sk_entry(v);
1da177e4
LT
3879 const struct packet_sock *po = pkt_sk(s);
3880
3881 seq_printf(seq,
71338aa7 3882 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3883 s,
3884 atomic_read(&s->sk_refcnt),
3885 s->sk_type,
3886 ntohs(po->num),
3887 po->ifindex,
3888 po->running,
3889 atomic_read(&s->sk_rmem_alloc),
3890 sock_i_uid(s),
40d4e3df 3891 sock_i_ino(s));
1da177e4
LT
3892 }
3893
3894 return 0;
3895}
3896
56b3d975 3897static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3898 .start = packet_seq_start,
3899 .next = packet_seq_next,
3900 .stop = packet_seq_stop,
3901 .show = packet_seq_show,
3902};
3903
3904static int packet_seq_open(struct inode *inode, struct file *file)
3905{
e372c414
DL
3906 return seq_open_net(inode, file, &packet_seq_ops,
3907 sizeof(struct seq_net_private));
1da177e4
LT
3908}
3909
da7071d7 3910static const struct file_operations packet_seq_fops = {
1da177e4
LT
3911 .owner = THIS_MODULE,
3912 .open = packet_seq_open,
3913 .read = seq_read,
3914 .llseek = seq_lseek,
e372c414 3915 .release = seq_release_net,
1da177e4
LT
3916};
3917
3918#endif
3919
2c8c1e72 3920static int __net_init packet_net_init(struct net *net)
d12d01d6 3921{
808f5114 3922 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 3923 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
3924
3925 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
3926 return -ENOMEM;
3927
3928 return 0;
3929}
3930
2c8c1e72 3931static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
3932{
3933 proc_net_remove(net, "packet");
3934}
3935
3936static struct pernet_operations packet_net_ops = {
3937 .init = packet_net_init,
3938 .exit = packet_net_exit,
3939};
3940
3941
1da177e4
LT
3942static void __exit packet_exit(void)
3943{
1da177e4 3944 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3945 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3946 sock_unregister(PF_PACKET);
3947 proto_unregister(&packet_proto);
3948}
3949
3950static int __init packet_init(void)
3951{
3952 int rc = proto_register(&packet_proto, 0);
3953
3954 if (rc != 0)
3955 goto out;
3956
3957 sock_register(&packet_family_ops);
d12d01d6 3958 register_pernet_subsys(&packet_net_ops);
1da177e4 3959 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3960out:
3961 return rc;
3962}
3963
3964module_init(packet_init);
3965module_exit(packet_exit);
3966MODULE_LICENSE("GPL");
3967MODULE_ALIAS_NETPROTO(PF_PACKET);