net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb, when header
     will not fit to reserved space (tunnel), others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
                 header. PPP makes it so, which is wrong, because it
                 introduces asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to correct position,
   packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist *next;
	int ifindex;
	int count;
	unsigned short type;
	unsigned short alen;
	unsigned char addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int mr_ifindex;
	unsigned short mr_type;
	unsigned short mr_alen;
	unsigned char mr_address[MAX_ADDR_LEN];
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);


#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

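/* Resulting TPACKET_V3 block layout: the block descriptor sits at the start
 * of the block, the (aligned) per-block private area follows it at
 * BLK_HDR_LEN, and BLK_PLUS_PRIV() is the offset of the first packet.
 */
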
/* kbdq - kernel block descriptor queue */
struct tpacket_kbdq_core {
	struct pgv *pkbdq;
	unsigned int feature_req_word;
	unsigned int hdrlen;
	unsigned char reset_pending_on_curr_blk;
	unsigned char delete_blk_timer;
	unsigned short kactive_blk_num;
	unsigned short blk_sizeof_priv;

	/* last_kactive_blk_num:
	 * trick to see if user-space has caught up
	 * in order to avoid refreshing timer when every single pkt arrives.
	 */
	unsigned short last_kactive_blk_num;

	char *pkblk_start;
	char *pkblk_end;
	int kblk_size;
	unsigned int knum_blocks;
	uint64_t knxt_seq_num;
	char *prev;
	char *nxt_offset;
	struct sk_buff *skb;

	atomic_t blk_fill_in_prog;

	/* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV	(8)

	unsigned short retire_blk_tov;
	unsigned short version;
	unsigned long tov_in_jiffies;

	/* timer to retire an outstanding block */
	struct timer_list retire_blk_timer;
};

#define PGV_FROM_VMALLOC 1
struct pgv {
	char *buffer;
};

struct packet_ring_buffer {
	struct pgv *pg_vec;
	unsigned int head;
	unsigned int frames_per_block;
	unsigned int frame_size;
	unsigned int frame_max;

	unsigned int pg_vec_order;
	unsigned int pg_vec_pages;
	unsigned int pg_vec_len;

	struct tpacket_kbdq_core prb_bdqc;
	atomic_t pending;
};

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_fanout;
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock sk;
	struct packet_fanout *fanout;
	struct tpacket_stats stats;
	union tpacket_stats_u stats_u;
	struct packet_ring_buffer rx_ring;
	struct packet_ring_buffer tx_ring;
	int copy_thresh;
	spinlock_t bind_lock;
	struct mutex pg_vec_lock;
	unsigned int running:1,		/* prot_hook is attached */
		     auxdata:1,
		     origdev:1,
		     has_vnet_hdr:1;
	int ifindex;			/* bound device */
	__be16 num;
	struct packet_mclist *mclist;
	atomic_t mapped;
	enum tpacket_versions tp_version;
	unsigned int tp_hdrlen;
	unsigned int tp_reserve;
	unsigned int tp_loss:1;
	unsigned int tp_tstamp;
	struct packet_type prot_hook ____cacheline_aligned_in_smp;
};

#define PACKET_FANOUT_MAX	256

struct packet_fanout {
#ifdef CONFIG_NET_NS
	struct net *net;
#endif
	unsigned int num_members;
	u16 id;
	u8 type;
	u8 defrag;
	atomic_t rr_cur;
	struct list_head list;
	struct sock *arr[PACKET_FANOUT_MAX];
	spinlock_t lock;
	atomic_t sk_ref;
	struct packet_type prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

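/* Example: with knum_blocks == 8 the active block number advances
 * 0 -> 1 -> ... -> 7 and then wraps back to 0.
 */
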
static struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;
	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);
	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
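
/* The tp_status word accessed above is the ownership handshake for V1/V2
 * ring frames: the rx ring flips a frame from TP_STATUS_KERNEL to
 * TP_STATUS_USER once it is filled, and the tx path returns a sent frame
 * to TP_STATUS_AVAILABLE in tpacket_destruct_skb() below.
 */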

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;

	spin_lock(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
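
/* Worked example: a 1 MB block on a 1 Gbit/s link gives mbits = 8, div = 1
 * and msec = 1, so the computed retire timeout is 8 + 1 = 9 ms, roughly the
 * ~8 ms it takes to fill the block (see the timer comment below).
 */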

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats_u.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer. Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	if (po->stats.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {

		/* We could have just memset this but we will lose the
		 * flexibility of making the priv area sticky
		 */
		BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
		BLOCK_NUM_PKTS(pbd1) = 0;
		BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		getnstimeofday(&ts);
		h1->ts_first_pkt.ts_sec = ts.tv_sec;
		h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
		pkc1->pkblk_start = (char *)pbd1;
		pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
		BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
		pbd1->version = pkc1->version;
		pkc1->prev = pkc1->nxt_offset;
		pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
		prb_thaw_queue(pkc1);
		_prb_refresh_rx_retire_blk_timer(pkc1);

		smp_wmb();

		return;
	}

	WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
		pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
	dump_stack();
	BUG();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats_u.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}

	WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
	dump_stack();
	BUG();
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (vlan_tx_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
		ppd->tp_status = TP_STATUS_VLAN_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

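/* The blk_fill_in_prog count raised in prb_fill_curr_block() stays elevated
 * while tpacket_rcv() copies the packet into the block; it is dropped again
 * by prb_clear_blk_fill_status(), and the retire paths spin on it before
 * closing a partially filled block.
 */
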
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					struct sk_buff *skb,
					int status,
					unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					struct sk_buff *skb,
					int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return 0;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				struct packet_ring_buffer *rb,
				unsigned int previous,
				int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					struct packet_ring_buffer *rb,
					int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					struct packet_ring_buffer *rb,
					int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	u32 idx, hash = skb->rxhash;

	idx = ((u64)hash * num) >> 32;

	return f->arr[idx];
}
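
/* ((u64)hash * num) >> 32 maps the 32-bit rxhash onto [0, num) without a
 * modulo, so packets of one flow consistently hit the same fanout member.
 */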

static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return f->arr[cur];
}

static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	unsigned int cpu = smp_processor_id();

	return f->arr[cpu % num];
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	struct sock *sk;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		if (f->defrag) {
			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
			if (!skb)
				return 0;
		}
		skb_get_rxhash(skb);
		sk = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		sk = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		sk = fanout_demux_cpu(f, skb, num);
		break;
	}

	po = pkt_sk(sk);

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

static DEFINE_MUTEX(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
	int err;

	switch (type) {
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->defrag != defrag)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->defrag = defrag;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}

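/* fanout_add() is reached from setsockopt(SOL_PACKET, PACKET_FANOUT, ...)
 * (not shown in this excerpt); the group id travels in the low 16 bits of
 * the option value and the type/flags in the high 16 bits.
 */
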
static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	po->fanout = NULL;

	mutex_lock(&fanout_mutex);
	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 * When we registered the protocol we saved the socket in the data
	 * field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 * Yank back the headers [hope the device set this
	 * right or kerboom...]
	 *
	 * Incoming packets have ll header pulled,
	 * push it back.
	 *
	 * For outgoing ones skb->data == skb_mac_header(skb)
	 * so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 * The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 * Charge the memory to the socket. This is done specifically
	 * to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_unlock;

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static unsigned int run_filter(const struct sk_buff *skb,
			       const struct sock *sk,
			       unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}

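/* run_filter() returns the number of bytes to keep: 0 means the attached
 * socket filter rejected the packet, any other value caps the snap length.
 */
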
/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}

40d4e3df
ED
1705static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1706 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1707{
1708 struct sock *sk;
1709 struct packet_sock *po;
1710 struct sockaddr_ll *sll;
bbd6ef87
PM
1711 union {
1712 struct tpacket_hdr *h1;
1713 struct tpacket2_hdr *h2;
f6fb8f10 1714 struct tpacket3_hdr *h3;
bbd6ef87
PM
1715 void *raw;
1716 } h;
40d4e3df 1717 u8 *skb_head = skb->data;
1da177e4 1718 int skb_len = skb->len;
dbcb5855 1719 unsigned int snaplen, res;
f6fb8f10 1720 unsigned long status = TP_STATUS_USER;
bbd6ef87 1721 unsigned short macoff, netoff, hdrlen;
1da177e4 1722 struct sk_buff *copy_skb = NULL;
b7aa0bf7 1723 struct timeval tv;
bbd6ef87 1724 struct timespec ts;
614f60fa 1725 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1da177e4
LT
1726
1727 if (skb->pkt_type == PACKET_LOOPBACK)
1728 goto drop;
1729
1730 sk = pt->af_packet_priv;
1731 po = pkt_sk(sk);
1732
09ad9bc7 1733 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1734 goto drop;
1735
3b04ddde 1736 if (dev->header_ops) {
1da177e4 1737 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1738 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1739 else if (skb->pkt_type == PACKET_OUTGOING) {
1740 /* Special case: outgoing packets have ll header at head */
bbe735e4 1741 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1742 }
1743 }
1744
8dc41944
HX
1745 if (skb->ip_summed == CHECKSUM_PARTIAL)
1746 status |= TP_STATUS_CSUMNOTREADY;
1747
1da177e4
LT
1748 snaplen = skb->len;
1749
dbcb5855
DM
1750 res = run_filter(skb, sk, snaplen);
1751 if (!res)
fda9ef5d 1752 goto drop_n_restore;
dbcb5855
DM
1753 if (snaplen > res)
1754 snaplen = res;
1da177e4
LT
1755
1756 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1757 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1758 po->tp_reserve;
1da177e4 1759 } else {
95c96174 1760 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1761 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1762 (maclen < 16 ? 16 : maclen)) +
1763 po->tp_reserve;
1da177e4
LT
1764 macoff = netoff - maclen;
1765 }
f6fb8f10 1766 if (po->tp_version <= TPACKET_V2) {
1767 if (macoff + snaplen > po->rx_ring.frame_size) {
1768 if (po->copy_thresh &&
0fd7bac6 1769 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1770 if (skb_shared(skb)) {
1771 copy_skb = skb_clone(skb, GFP_ATOMIC);
1772 } else {
1773 copy_skb = skb_get(skb);
1774 skb_head = skb->data;
1775 }
1776 if (copy_skb)
1777 skb_set_owner_r(copy_skb, sk);
1da177e4 1778 }
f6fb8f10 1779 snaplen = po->rx_ring.frame_size - macoff;
1780 if ((int)snaplen < 0)
1781 snaplen = 0;
1da177e4 1782 }
1da177e4 1783 }
1da177e4 1784 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1785 h.raw = packet_current_rx_frame(po, skb,
1786 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1787 if (!h.raw)
1da177e4 1788 goto ring_is_full;
f6fb8f10 1789 if (po->tp_version <= TPACKET_V2) {
1790 packet_increment_rx_head(po, &po->rx_ring);
1791 /*
1792 * LOSING will be reported till you read the stats,
1793 * because it's COR - Clear On Read.
1794 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1795 * at packet level.
1796 */
1797 if (po->stats.tp_drops)
1798 status |= TP_STATUS_LOSING;
1799 }
1da177e4
LT
1800 po->stats.tp_packets++;
1801 if (copy_skb) {
1802 status |= TP_STATUS_COPY;
1803 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1804 }
1da177e4
LT
1805 spin_unlock(&sk->sk_receive_queue.lock);
1806
bbd6ef87 1807 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 1808
bbd6ef87
PM
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (vlan_tx_tag_present(skb)) {
			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
			status |= TP_STATUS_VLAN_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
		}
		h.h2->tp_padding = 0;
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_nxt_offset,vlan are already populated above.
		 * So DONT clear those fields here
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	smp_mb();
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	{
		u8 *start, *end;

		if (po->tp_version <= TPACKET_V2) {
			end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
				+ macoff + snaplen);
			for (start = h.raw; start < end; start += PAGE_SIZE)
				flush_dcache_page(pgv_to_page(start));
		}
		smp_wmb();
	}
#endif
	if (po->tp_version <= TPACKET_V2)
		__packet_set_status(po, h.raw, status);
	else
		prb_clear_blk_fill_status(&po->rx_ring);

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
1931
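/* skb destructor for the TX-ring path: mark the originating ring frame
 * TP_STATUS_AVAILABLE again and drop the pending count before the
 * normal sock_wfree() accounting runs.
 */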
1932static void tpacket_destruct_skb(struct sk_buff *skb)
1933{
1934 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1935 void *ph;
1da177e4 1936
69e3c75f
JB
1937 if (likely(po->tx_ring.pg_vec)) {
1938 ph = skb_shinfo(skb)->destructor_arg;
1939 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1940 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1941 atomic_dec(&po->tx_ring.pending);
1942 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1943 }
1944
1945 sock_wfree(skb);
1946}
1947
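/* Build an skb from one TX-ring frame: construct the link-layer header
 * (via dev_hard_header() for SOCK_DGRAM, or copied from the frame for
 * SOCK_RAW) and attach the rest of the frame as page fragments taken
 * directly from the ring's page vector.
 */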
1948static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1949 void *frame, struct net_device *dev, int size_max,
ae641949 1950 __be16 proto, unsigned char *addr, int hlen)
1951{
1952 union {
1953 struct tpacket_hdr *h1;
1954 struct tpacket2_hdr *h2;
1955 void *raw;
1956 } ph;
1957 int to_write, offset, len, tp_len, nr_frags, len_max;
1958 struct socket *sock = po->sk.sk_socket;
1959 struct page *page;
1960 void *data;
1961 int err;
1962
1963 ph.raw = frame;
1964
1965 skb->protocol = proto;
1966 skb->dev = dev;
1967 skb->priority = po->sk.sk_priority;
2d37a186 1968 skb->mark = po->sk.sk_mark;
69e3c75f
JB
1969 skb_shinfo(skb)->destructor_arg = ph.raw;
1970
1971 switch (po->tp_version) {
1972 case TPACKET_V2:
1973 tp_len = ph.h2->tp_len;
1974 break;
1975 default:
1976 tp_len = ph.h1->tp_len;
1977 break;
1978 }
1979 if (unlikely(tp_len > size_max)) {
40d4e3df 1980 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
1981 return -EMSGSIZE;
1982 }
1983
ae641949 1984 skb_reserve(skb, hlen);
69e3c75f
JB
1985 skb_reset_network_header(skb);
1986
1987 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1988 to_write = tp_len;
1989
1990 if (sock->type == SOCK_DGRAM) {
1991 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1992 NULL, tp_len);
1993 if (unlikely(err < 0))
1994 return -EINVAL;
40d4e3df 1995 } else if (dev->hard_header_len) {
69e3c75f
JB
1996 /* net device doesn't like empty head */
1997 if (unlikely(tp_len <= dev->hard_header_len)) {
1998 pr_err("packet size is too short (%d < %d)\n",
1999 tp_len, dev->hard_header_len);
2000 return -EINVAL;
2001 }
2002
2003 skb_push(skb, dev->hard_header_len);
2004 err = skb_store_bits(skb, 0, data,
2005 dev->hard_header_len);
2006 if (unlikely(err))
2007 return err;
2008
2009 data += dev->hard_header_len;
2010 to_write -= dev->hard_header_len;
2011 }
2012
2013 err = -EFAULT;
69e3c75f
JB
2014 offset = offset_in_page(data);
2015 len_max = PAGE_SIZE - offset;
2016 len = ((to_write > len_max) ? len_max : to_write);
2017
2018 skb->data_len = to_write;
2019 skb->len += to_write;
2020 skb->truesize += to_write;
2021 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2022
2023 while (likely(to_write)) {
2024 nr_frags = skb_shinfo(skb)->nr_frags;
2025
2026 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2027 pr_err("Packet exceed the number of skb frags(%lu)\n",
2028 MAX_SKB_FRAGS);
2029 return -EFAULT;
2030 }
2031
0af55bb5
CG
2032 page = pgv_to_page(data);
2033 data += len;
69e3c75f
JB
2034 flush_dcache_page(page);
2035 get_page(page);
0af55bb5 2036 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2037 to_write -= len;
2038 offset = 0;
2039 len_max = PAGE_SIZE;
2040 len = ((to_write > len_max) ? len_max : to_write);
2041 }
2042
2043 return tp_len;
2044}
2045
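/* TX-ring send loop: for every frame marked TP_STATUS_SEND_REQUEST,
 * build an skb with tpacket_fill_skb() and hand it to dev_queue_xmit();
 * the frame status is flipped back to TP_STATUS_AVAILABLE from the skb
 * destructor, and the loop exits once nothing is left pending (or
 * immediately when MSG_DONTWAIT is set).
 */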
2046static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2047{
69e3c75f
JB
2048 struct sk_buff *skb;
2049 struct net_device *dev;
2050 __be16 proto;
827d9780
BG
2051 bool need_rls_dev = false;
2052 int err, reserve = 0;
40d4e3df
ED
2053 void *ph;
2054 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2055 int tp_len, size_max;
2056 unsigned char *addr;
2057 int len_sum = 0;
2058 int status = 0;
ae641949 2059 int hlen, tlen;
69e3c75f 2060
69e3c75f
JB
2061 mutex_lock(&po->pg_vec_lock);
2062
2063 err = -EBUSY;
2064 if (saddr == NULL) {
827d9780 2065 dev = po->prot_hook.dev;
69e3c75f
JB
2066 proto = po->num;
2067 addr = NULL;
2068 } else {
2069 err = -EINVAL;
2070 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2071 goto out;
2072 if (msg->msg_namelen < (saddr->sll_halen
2073 + offsetof(struct sockaddr_ll,
2074 sll_addr)))
2075 goto out;
69e3c75f
JB
2076 proto = saddr->sll_protocol;
2077 addr = saddr->sll_addr;
827d9780
BG
2078 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2079 need_rls_dev = true;
69e3c75f
JB
2080 }
2081
69e3c75f
JB
2082 err = -ENXIO;
2083 if (unlikely(dev == NULL))
2084 goto out;
2085
2086 reserve = dev->hard_header_len;
2087
2088 err = -ENETDOWN;
2089 if (unlikely(!(dev->flags & IFF_UP)))
2090 goto out_put;
2091
2092 size_max = po->tx_ring.frame_size
b5dd884e 2093 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2094
2095 if (size_max > dev->mtu + reserve)
2096 size_max = dev->mtu + reserve;
2097
2098 do {
2099 ph = packet_current_frame(po, &po->tx_ring,
2100 TP_STATUS_SEND_REQUEST);
2101
2102 if (unlikely(ph == NULL)) {
2103 schedule();
2104 continue;
2105 }
2106
2107 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2108 hlen = LL_RESERVED_SPACE(dev);
2109 tlen = dev->needed_tailroom;
69e3c75f 2110 skb = sock_alloc_send_skb(&po->sk,
ae641949 2111 hlen + tlen + sizeof(struct sockaddr_ll),
2112 0, &err);
2113
2114 if (unlikely(skb == NULL))
2115 goto out_status;
2116
2117 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2118 addr, hlen);
69e3c75f
JB
2119
2120 if (unlikely(tp_len < 0)) {
2121 if (po->tp_loss) {
2122 __packet_set_status(po, ph,
2123 TP_STATUS_AVAILABLE);
2124 packet_increment_head(&po->tx_ring);
2125 kfree_skb(skb);
2126 continue;
2127 } else {
2128 status = TP_STATUS_WRONG_FORMAT;
2129 err = tp_len;
2130 goto out_status;
2131 }
2132 }
2133
2134 skb->destructor = tpacket_destruct_skb;
2135 __packet_set_status(po, ph, TP_STATUS_SENDING);
2136 atomic_inc(&po->tx_ring.pending);
2137
2138 status = TP_STATUS_SEND_REQUEST;
2139 err = dev_queue_xmit(skb);
eb70df13
JP
2140 if (unlikely(err > 0)) {
2141 err = net_xmit_errno(err);
2142 if (err && __packet_get_status(po, ph) ==
2143 TP_STATUS_AVAILABLE) {
2144 /* skb was destructed already */
2145 skb = NULL;
2146 goto out_status;
2147 }
2148 /*
2149 * skb was dropped but not destructed yet;
2150 * let's treat it like congestion or err < 0
2151 */
2152 err = 0;
2153 }
69e3c75f
JB
2154 packet_increment_head(&po->tx_ring);
2155 len_sum += tp_len;
f64f9e71
JP
2156 } while (likely((ph != NULL) ||
2157 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2158 (atomic_read(&po->tx_ring.pending))))
2159 );
69e3c75f
JB
2160
2161 err = len_sum;
2162 goto out_put;
2163
69e3c75f
JB
2164out_status:
2165 __packet_set_status(po, ph, status);
2166 kfree_skb(skb);
2167out_put:
2168 if (need_rls_dev)
2169 dev_put(dev);
2170out:
2171 mutex_unlock(&po->pg_vec_lock);
2172 return err;
2173}
69e3c75f 2174
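/* Allocate the skb used by packet_snd(): small packets are kept fully
 * linear, larger ones get "linear" bytes of linear data and the rest
 * as paged data via sock_alloc_send_pskb().
 */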
2175static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2176 size_t reserve, size_t len,
2177 size_t linear, int noblock,
2178 int *err)
2179{
2180 struct sk_buff *skb;
2181
2182 /* Under a page? Don't bother with paged skb. */
2183 if (prepad + len < PAGE_SIZE || !linear)
2184 linear = len;
2185
2186 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2187 err);
2188 if (!skb)
2189 return NULL;
2190
2191 skb_reserve(skb, reserve);
2192 skb_put(skb, linear);
2193 skb->data_len = len - linear;
2194 skb->len += len - linear;
2195
2196 return skb;
2197}
2198
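/* Plain (non-ring) transmit path: pick the output device from the bind
 * or the passed sockaddr_ll, optionally consume a leading virtio_net_hdr
 * for checksum/GSO offload, copy the payload from the iovec and pass
 * the skb to dev_queue_xmit().
 */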
69e3c75f 2199static int packet_snd(struct socket *sock,
2200 struct msghdr *msg, size_t len)
2201{
2202 struct sock *sk = sock->sk;
40d4e3df 2203 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2204 struct sk_buff *skb;
2205 struct net_device *dev;
0e11c91e 2206 __be16 proto;
827d9780 2207 bool need_rls_dev = false;
1da177e4 2208 unsigned char *addr;
827d9780 2209 int err, reserve = 0;
bfd5f4a3
SS
2210 struct virtio_net_hdr vnet_hdr = { 0 };
2211 int offset = 0;
2212 int vnet_hdr_len;
2213 struct packet_sock *po = pkt_sk(sk);
2214 unsigned short gso_type = 0;
ae641949 2215 int hlen, tlen;
3bdc0eba 2216 int extra_len = 0;
1da177e4
LT
2217
2218 /*
1ce4f28b 2219 * Get and verify the address.
1da177e4 2220 */
1ce4f28b 2221
1da177e4 2222 if (saddr == NULL) {
827d9780 2223 dev = po->prot_hook.dev;
1da177e4
LT
2224 proto = po->num;
2225 addr = NULL;
2226 } else {
2227 err = -EINVAL;
2228 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2229 goto out;
0fb375fb
EB
2230 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2231 goto out;
1da177e4
LT
2232 proto = saddr->sll_protocol;
2233 addr = saddr->sll_addr;
827d9780
BG
2234 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2235 need_rls_dev = true;
1da177e4
LT
2236 }
2237
1da177e4
LT
2238 err = -ENXIO;
2239 if (dev == NULL)
2240 goto out_unlock;
2241 if (sock->type == SOCK_RAW)
2242 reserve = dev->hard_header_len;
2243
d5e76b0a
DM
2244 err = -ENETDOWN;
2245 if (!(dev->flags & IFF_UP))
2246 goto out_unlock;
2247
bfd5f4a3
SS
2248 if (po->has_vnet_hdr) {
2249 vnet_hdr_len = sizeof(vnet_hdr);
2250
2251 err = -EINVAL;
2252 if (len < vnet_hdr_len)
2253 goto out_unlock;
2254
2255 len -= vnet_hdr_len;
2256
2257 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2258 vnet_hdr_len);
2259 if (err < 0)
2260 goto out_unlock;
2261
2262 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2263 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2264 vnet_hdr.hdr_len))
2265 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2266 vnet_hdr.csum_offset + 2;
2267
2268 err = -EINVAL;
2269 if (vnet_hdr.hdr_len > len)
2270 goto out_unlock;
2271
2272 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2273 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2274 case VIRTIO_NET_HDR_GSO_TCPV4:
2275 gso_type = SKB_GSO_TCPV4;
2276 break;
2277 case VIRTIO_NET_HDR_GSO_TCPV6:
2278 gso_type = SKB_GSO_TCPV6;
2279 break;
2280 case VIRTIO_NET_HDR_GSO_UDP:
2281 gso_type = SKB_GSO_UDP;
2282 break;
2283 default:
2284 goto out_unlock;
2285 }
2286
2287 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2288 gso_type |= SKB_GSO_TCP_ECN;
2289
2290 if (vnet_hdr.gso_size == 0)
2291 goto out_unlock;
2292
2293 }
2294 }
2295
3bdc0eba
BG
2296 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2297 if (!netif_supports_nofcs(dev)) {
2298 err = -EPROTONOSUPPORT;
2299 goto out_unlock;
2300 }
2301 extra_len = 4; /* We're doing our own CRC */
2302 }
2303
1da177e4 2304 err = -EMSGSIZE;
3bdc0eba 2305 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2306 goto out_unlock;
2307
bfd5f4a3 2308 err = -ENOBUFS;
ae641949
HX
2309 hlen = LL_RESERVED_SPACE(dev);
2310 tlen = dev->needed_tailroom;
2311 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2312 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2313 if (skb == NULL)
2314 goto out_unlock;
2315
bfd5f4a3 2316 skb_set_network_header(skb, reserve);
1da177e4 2317
0c4e8581
SH
2318 err = -EINVAL;
2319 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2320 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2321 goto out_free;
1da177e4
LT
2322
2323 /* Returns -EFAULT on error */
bfd5f4a3 2324 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2325 if (err)
2326 goto out_free;
2244d07b 2327 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
2328 if (err < 0)
2329 goto out_free;
1da177e4 2330
3bdc0eba 2331 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2332 /* Earlier code assumed this would be a VLAN pkt,
2333 * double-check this now that we have the actual
2334 * packet in hand.
2335 */
2336 struct ethhdr *ehdr;
2337 skb_reset_mac_header(skb);
2338 ehdr = eth_hdr(skb);
2339 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2340 err = -EMSGSIZE;
2341 goto out_free;
2342 }
2343 }
2344
1da177e4
LT
2345 skb->protocol = proto;
2346 skb->dev = dev;
2347 skb->priority = sk->sk_priority;
2d37a186 2348 skb->mark = sk->sk_mark;
1da177e4 2349
bfd5f4a3
SS
2350 if (po->has_vnet_hdr) {
2351 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2352 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2353 vnet_hdr.csum_offset)) {
2354 err = -EINVAL;
2355 goto out_free;
2356 }
2357 }
2358
2359 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2360 skb_shinfo(skb)->gso_type = gso_type;
2361
2362 /* Header must be checked, and gso_segs computed. */
2363 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2364 skb_shinfo(skb)->gso_segs = 0;
2365
2366 len += vnet_hdr_len;
2367 }
2368
3bdc0eba
BG
2369 if (unlikely(extra_len == 4))
2370 skb->no_fcs = 1;
2371
1da177e4
LT
2372 /*
2373 * Now send it
2374 */
2375
2376 err = dev_queue_xmit(skb);
2377 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2378 goto out_unlock;
2379
2380 if (need_rls_dev)
2381 dev_put(dev);
1da177e4 2382
40d4e3df 2383 return len;
1da177e4
LT
2384
2385out_free:
2386 kfree_skb(skb);
2387out_unlock:
827d9780 2388 if (dev && need_rls_dev)
2389 dev_put(dev);
2390out:
2391 return err;
2392}
2393
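/* sendmsg() entry point: use the TX ring when one is configured via
 * PACKET_TX_RING, otherwise fall back to the copying packet_snd() path.
 */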
2394static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2395 struct msghdr *msg, size_t len)
2396{
69e3c75f
JB
2397 struct sock *sk = sock->sk;
2398 struct packet_sock *po = pkt_sk(sk);
2399 if (po->tx_ring.pg_vec)
2400 return tpacket_snd(po, msg);
2401 else
2402 return packet_snd(sock, msg, len);
2403}
2404
1da177e4
LT
2405/*
2406 * Close a PACKET socket. This is fairly simple. We immediately go
2407 * to 'closed' state and remove our protocol entry in the device list.
2408 */
2409
2410static int packet_release(struct socket *sock)
2411{
2412 struct sock *sk = sock->sk;
2413 struct packet_sock *po;
d12d01d6 2414 struct net *net;
f6fb8f10 2415 union tpacket_req_u req_u;
1da177e4
LT
2416
2417 if (!sk)
2418 return 0;
2419
3b1e0a65 2420 net = sock_net(sk);
1da177e4
LT
2421 po = pkt_sk(sk);
2422
808f5114 2423 spin_lock_bh(&net->packet.sklist_lock);
2424 sk_del_node_init_rcu(sk);
920de804 2425 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 2426 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 2427
808f5114 2428 spin_lock(&po->bind_lock);
ce06b03e 2429 unregister_prot_hook(sk, false);
160ff18a
BG
2430 if (po->prot_hook.dev) {
2431 dev_put(po->prot_hook.dev);
2432 po->prot_hook.dev = NULL;
2433 }
808f5114 2434 spin_unlock(&po->bind_lock);
1da177e4 2435
1da177e4 2436 packet_flush_mclist(sk);
1da177e4 2437
f6fb8f10 2438 memset(&req_u, 0, sizeof(req_u));
69e3c75f
JB
2439
2440 if (po->rx_ring.pg_vec)
f6fb8f10 2441 packet_set_ring(sk, &req_u, 1, 0);
69e3c75f
JB
2442
2443 if (po->tx_ring.pg_vec)
f6fb8f10 2444 packet_set_ring(sk, &req_u, 1, 1);
1da177e4 2445
dc99f600
DM
2446 fanout_release(sk);
2447
808f5114 2448 synchronize_net();
1da177e4
LT
2449 /*
2450 * Now the socket is dead. No more input will appear.
2451 */
1da177e4
LT
2452 sock_orphan(sk);
2453 sock->sk = NULL;
2454
2455 /* Purge queues */
2456
2457 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2458 sk_refcnt_debug_release(sk);
1da177e4
LT
2459
2460 sock_put(sk);
2461 return 0;
2462}
2463
2464/*
2465 * Attach a packet hook.
2466 */
2467
0e11c91e 2468static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
2469{
2470 struct packet_sock *po = pkt_sk(sk);
dc99f600 2471
aef950b4
WY
2472 if (po->fanout) {
2473 if (dev)
2474 dev_put(dev);
2475
dc99f600 2476 return -EINVAL;
aef950b4 2477 }
1da177e4
LT
2478
2479 lock_sock(sk);
2480
2481 spin_lock(&po->bind_lock);
ce06b03e 2482 unregister_prot_hook(sk, true);
1da177e4
LT
2483 po->num = protocol;
2484 po->prot_hook.type = protocol;
160ff18a
BG
2485 if (po->prot_hook.dev)
2486 dev_put(po->prot_hook.dev);
1da177e4
LT
2487 po->prot_hook.dev = dev;
2488
2489 po->ifindex = dev ? dev->ifindex : 0;
2490
2491 if (protocol == 0)
2492 goto out_unlock;
2493
be85d4ad 2494 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2495 register_prot_hook(sk);
be85d4ad
UT
2496 } else {
2497 sk->sk_err = ENETDOWN;
2498 if (!sock_flag(sk, SOCK_DEAD))
2499 sk->sk_error_report(sk);
1da177e4
LT
2500 }
2501
2502out_unlock:
2503 spin_unlock(&po->bind_lock);
2504 release_sock(sk);
2505 return 0;
2506}
2507
2508/*
2509 * Bind a packet socket to a device
2510 */
2511
40d4e3df
ED
2512static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2513 int addr_len)
1da177e4 2514{
40d4e3df 2515 struct sock *sk = sock->sk;
1da177e4
LT
2516 char name[15];
2517 struct net_device *dev;
2518 int err = -ENODEV;
1ce4f28b 2519
1da177e4
LT
2520 /*
2521 * Check legality
2522 */
1ce4f28b 2523
8ae55f04 2524 if (addr_len != sizeof(struct sockaddr))
1da177e4 2525 return -EINVAL;
40d4e3df 2526 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2527
3b1e0a65 2528 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2529 if (dev)
1da177e4 2530 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2531 return err;
2532}
1da177e4
LT
2533
2534static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2535{
40d4e3df
ED
2536 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2537 struct sock *sk = sock->sk;
1da177e4
LT
2538 struct net_device *dev = NULL;
2539 int err;
2540
2541
2542 /*
2543 * Check legality
2544 */
1ce4f28b 2545
1da177e4
LT
2546 if (addr_len < sizeof(struct sockaddr_ll))
2547 return -EINVAL;
2548 if (sll->sll_family != AF_PACKET)
2549 return -EINVAL;
2550
2551 if (sll->sll_ifindex) {
2552 err = -ENODEV;
3b1e0a65 2553 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2554 if (dev == NULL)
2555 goto out;
2556 }
2557 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2558
2559out:
2560 return err;
2561}
2562
2563static struct proto packet_proto = {
2564 .name = "PACKET",
2565 .owner = THIS_MODULE,
2566 .obj_size = sizeof(struct packet_sock),
2567};
2568
2569/*
1ce4f28b 2570 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2571 */
2572
3f378b68
EP
2573static int packet_create(struct net *net, struct socket *sock, int protocol,
2574 int kern)
2575{
2576 struct sock *sk;
2577 struct packet_sock *po;
0e11c91e 2578 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2579 int err;
2580
2581 if (!capable(CAP_NET_RAW))
2582 return -EPERM;
be02097c
DM
2583 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2584 sock->type != SOCK_PACKET)
2585 return -ESOCKTNOSUPPORT;
2586
2587 sock->state = SS_UNCONNECTED;
2588
2589 err = -ENOBUFS;
6257ff21 2590 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2591 if (sk == NULL)
2592 goto out;
2593
2594 sock->ops = &packet_ops;
1da177e4
LT
2595 if (sock->type == SOCK_PACKET)
2596 sock->ops = &packet_ops_spkt;
be02097c 2597
1da177e4
LT
2598 sock_init_data(sock, sk);
2599
2600 po = pkt_sk(sk);
2601 sk->sk_family = PF_PACKET;
0e11c91e 2602 po->num = proto;
1da177e4
LT
2603
2604 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2605 sk_refcnt_debug_inc(sk);
1da177e4
LT
2606
2607 /*
2608 * Attach a protocol block
2609 */
2610
2611 spin_lock_init(&po->bind_lock);
905db440 2612 mutex_init(&po->pg_vec_lock);
1da177e4 2613 po->prot_hook.func = packet_rcv;
be02097c 2614
1da177e4
LT
2615 if (sock->type == SOCK_PACKET)
2616 po->prot_hook.func = packet_rcv_spkt;
be02097c 2617
1da177e4
LT
2618 po->prot_hook.af_packet_priv = sk;
2619
0e11c91e
AV
2620 if (proto) {
2621 po->prot_hook.type = proto;
ce06b03e 2622 register_prot_hook(sk);
1da177e4
LT
2623 }
2624
808f5114 2625 spin_lock_bh(&net->packet.sklist_lock);
2626 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 2627 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 2628 spin_unlock_bh(&net->packet.sklist_lock);
2629
40d4e3df 2630 return 0;
1da177e4
LT
2631out:
2632 return err;
2633}
2634
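/* MSG_ERRQUEUE receive: dequeue one error-queue skb (TX timestamps),
 * return it with a PACKET_TX_TIMESTAMP control message and re-arm
 * sk_err from the next queued error, if any.
 */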
2635static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2636{
2637 struct sock_exterr_skb *serr;
2638 struct sk_buff *skb, *skb2;
2639 int copied, err;
2640
2641 err = -EAGAIN;
2642 skb = skb_dequeue(&sk->sk_error_queue);
2643 if (skb == NULL)
2644 goto out;
2645
2646 copied = skb->len;
2647 if (copied > len) {
2648 msg->msg_flags |= MSG_TRUNC;
2649 copied = len;
2650 }
2651 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2652 if (err)
2653 goto out_free_skb;
2654
2655 sock_recv_timestamp(msg, sk, skb);
2656
2657 serr = SKB_EXT_ERR(skb);
2658 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2659 sizeof(serr->ee), &serr->ee);
2660
2661 msg->msg_flags |= MSG_ERRQUEUE;
2662 err = copied;
2663
2664 /* Reset and regenerate socket error */
2665 spin_lock_bh(&sk->sk_error_queue.lock);
2666 sk->sk_err = 0;
2667 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2668 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2669 spin_unlock_bh(&sk->sk_error_queue.lock);
2670 sk->sk_error_report(sk);
2671 } else
2672 spin_unlock_bh(&sk->sk_error_queue.lock);
2673
2674out_free_skb:
2675 kfree_skb(skb);
2676out:
2677 return err;
2678}
2679
1da177e4
LT
2680/*
2681 * Pull a packet from our receive queue and hand it to the user.
2682 * If necessary we block.
2683 */
2684
2685static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2686 struct msghdr *msg, size_t len, int flags)
2687{
2688 struct sock *sk = sock->sk;
2689 struct sk_buff *skb;
2690 int copied, err;
0fb375fb 2691 struct sockaddr_ll *sll;
bfd5f4a3 2692 int vnet_hdr_len = 0;
1da177e4
LT
2693
2694 err = -EINVAL;
ed85b565 2695 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2696 goto out;
2697
2698#if 0
2699 /* What error should we return now? EUNATTACH? */
2700 if (pkt_sk(sk)->ifindex < 0)
2701 return -ENODEV;
2702#endif
2703
ed85b565
RC
2704 if (flags & MSG_ERRQUEUE) {
2705 err = packet_recv_error(sk, msg, len);
2706 goto out;
2707 }
2708
1da177e4
LT
2709 /*
2710 * Call the generic datagram receiver. This handles all sorts
2711 * of horrible races and re-entrancy so we can forget about it
2712 * in the protocol layers.
2713 *
2714 * Now it will return ENETDOWN, if device have just gone down,
2715 * but then it will block.
2716 */
2717
40d4e3df 2718 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2719
2720 /*
1ce4f28b 2721 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2722 * handles the blocking we don't see and worry about blocking
2723 * retries.
2724 */
2725
8ae55f04 2726 if (skb == NULL)
2727 goto out;
2728
bfd5f4a3
SS
2729 if (pkt_sk(sk)->has_vnet_hdr) {
2730 struct virtio_net_hdr vnet_hdr = { 0 };
2731
2732 err = -EINVAL;
2733 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2734 if (len < vnet_hdr_len)
2735 goto out_free;
2736
1f18b717
MK
2737 len -= vnet_hdr_len;
2738
bfd5f4a3
SS
2739 if (skb_is_gso(skb)) {
2740 struct skb_shared_info *sinfo = skb_shinfo(skb);
2741
2742 /* This is a hint as to how much should be linear. */
2743 vnet_hdr.hdr_len = skb_headlen(skb);
2744 vnet_hdr.gso_size = sinfo->gso_size;
2745 if (sinfo->gso_type & SKB_GSO_TCPV4)
2746 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2747 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2748 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2749 else if (sinfo->gso_type & SKB_GSO_UDP)
2750 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2751 else if (sinfo->gso_type & SKB_GSO_FCOE)
2752 goto out_free;
2753 else
2754 BUG();
2755 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2756 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2757 } else
2758 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2759
2760 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2761 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2762 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2763 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2764 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2765 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2766 } /* else everything is zero */
2767
2768 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2769 vnet_hdr_len);
2770 if (err < 0)
2771 goto out_free;
2772 }
2773
0fb375fb
EB
2774 /*
2775 * If the address length field is there to be filled in, we fill
2776 * it in now.
2777 */
2778
ffbc6111 2779 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2780 if (sock->type == SOCK_PACKET)
2781 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2782 else
2783 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2784
1da177e4
LT
2785 /*
2786 * You lose any data beyond the buffer you gave. If it worries a
2787 * user program they can ask the device for its MTU anyway.
2788 */
2789
2790 copied = skb->len;
40d4e3df
ED
2791 if (copied > len) {
2792 copied = len;
2793 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2794 }
2795
2796 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2797 if (err)
2798 goto out_free;
2799
3b885787 2800 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2801
2802 if (msg->msg_name)
2803 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2804 msg->msg_namelen);
1da177e4 2805
8dc41944 2806 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2807 struct tpacket_auxdata aux;
2808
2809 aux.tp_status = TP_STATUS_USER;
2810 if (skb->ip_summed == CHECKSUM_PARTIAL)
2811 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2812 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2813 aux.tp_snaplen = skb->len;
2814 aux.tp_mac = 0;
bbe735e4 2815 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2816 if (vlan_tx_tag_present(skb)) {
2817 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2818 aux.tp_status |= TP_STATUS_VLAN_VALID;
2819 } else {
2820 aux.tp_vlan_tci = 0;
2821 }
13fcb7bd 2822 aux.tp_padding = 0;
ffbc6111 2823 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2824 }
2825
1da177e4
LT
2826 /*
2827 * Free or return the buffer as appropriate. Again this
2828 * hides all the races and re-entrancy issues from us.
2829 */
bfd5f4a3 2830 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2831
2832out_free:
2833 skb_free_datagram(sk, skb);
2834out:
2835 return err;
2836}
2837
1da177e4
LT
2838static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2839 int *uaddr_len, int peer)
2840{
2841 struct net_device *dev;
2842 struct sock *sk = sock->sk;
2843
2844 if (peer)
2845 return -EOPNOTSUPP;
2846
2847 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2848 rcu_read_lock();
2849 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2850 if (dev)
67286640 2851 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2852 else
1da177e4 2853 memset(uaddr->sa_data, 0, 14);
654d1f8a 2854 rcu_read_unlock();
1da177e4
LT
2855 *uaddr_len = sizeof(*uaddr);
2856
2857 return 0;
2858}
1da177e4
LT
2859
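/* getsockname() for SOCK_RAW/SOCK_DGRAM sockets: report the bound
 * ifindex and protocol, plus the device type and hardware address if
 * the interface still exists.
 */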
2860static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2861 int *uaddr_len, int peer)
2862{
2863 struct net_device *dev;
2864 struct sock *sk = sock->sk;
2865 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2866 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2867
2868 if (peer)
2869 return -EOPNOTSUPP;
2870
2871 sll->sll_family = AF_PACKET;
2872 sll->sll_ifindex = po->ifindex;
2873 sll->sll_protocol = po->num;
67286640 2874 sll->sll_pkttype = 0;
654d1f8a
ED
2875 rcu_read_lock();
2876 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2877 if (dev) {
2878 sll->sll_hatype = dev->type;
2879 sll->sll_halen = dev->addr_len;
2880 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2881 } else {
2882 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2883 sll->sll_halen = 0;
2884 }
654d1f8a 2885 rcu_read_unlock();
0fb375fb 2886 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2887
2888 return 0;
2889}
2890
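/* Apply one membership entry to a device: add or remove a multicast or
 * unicast address, or adjust the promiscuity/allmulti counters,
 * according to the entry type ("what" is +1 to join, -1 to leave).
 */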
2891static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2892 int what)
1da177e4
LT
2893{
2894 switch (i->type) {
2895 case PACKET_MR_MULTICAST:
1162563f
JP
2896 if (i->alen != dev->addr_len)
2897 return -EINVAL;
1da177e4 2898 if (what > 0)
22bedad3 2899 return dev_mc_add(dev, i->addr);
1da177e4 2900 else
22bedad3 2901 return dev_mc_del(dev, i->addr);
1da177e4
LT
2902 break;
2903 case PACKET_MR_PROMISC:
2aeb0b88 2904 return dev_set_promiscuity(dev, what);
1da177e4
LT
2905 break;
2906 case PACKET_MR_ALLMULTI:
2aeb0b88 2907 return dev_set_allmulti(dev, what);
1da177e4 2908 break;
d95ed927 2909 case PACKET_MR_UNICAST:
1162563f
JP
2910 if (i->alen != dev->addr_len)
2911 return -EINVAL;
d95ed927 2912 if (what > 0)
a748ee24 2913 return dev_uc_add(dev, i->addr);
d95ed927 2914 else
a748ee24 2915 return dev_uc_del(dev, i->addr);
d95ed927 2916 break;
40d4e3df
ED
2917 default:
2918 break;
1da177e4 2919 }
2aeb0b88 2920 return 0;
1da177e4
LT
2921}
2922
2923static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2924{
40d4e3df 2925 for ( ; i; i = i->next) {
1da177e4
LT
2926 if (i->ifindex == dev->ifindex)
2927 packet_dev_mc(dev, i, what);
2928 }
2929}
2930
0fb375fb 2931static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2932{
2933 struct packet_sock *po = pkt_sk(sk);
2934 struct packet_mclist *ml, *i;
2935 struct net_device *dev;
2936 int err;
2937
2938 rtnl_lock();
2939
2940 err = -ENODEV;
3b1e0a65 2941 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2942 if (!dev)
2943 goto done;
2944
2945 err = -EINVAL;
1162563f 2946 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2947 goto done;
2948
2949 err = -ENOBUFS;
8b3a7005 2950 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2951 if (i == NULL)
2952 goto done;
2953
2954 err = 0;
2955 for (ml = po->mclist; ml; ml = ml->next) {
2956 if (ml->ifindex == mreq->mr_ifindex &&
2957 ml->type == mreq->mr_type &&
2958 ml->alen == mreq->mr_alen &&
2959 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2960 ml->count++;
2961 /* Free the new element ... */
2962 kfree(i);
2963 goto done;
2964 }
2965 }
2966
2967 i->type = mreq->mr_type;
2968 i->ifindex = mreq->mr_ifindex;
2969 i->alen = mreq->mr_alen;
2970 memcpy(i->addr, mreq->mr_address, i->alen);
2971 i->count = 1;
2972 i->next = po->mclist;
2973 po->mclist = i;
2aeb0b88
WC
2974 err = packet_dev_mc(dev, i, 1);
2975 if (err) {
2976 po->mclist = i->next;
2977 kfree(i);
2978 }
1da177e4
LT
2979
2980done:
2981 rtnl_unlock();
2982 return err;
2983}
2984
0fb375fb 2985static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2986{
2987 struct packet_mclist *ml, **mlp;
2988
2989 rtnl_lock();
2990
2991 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2992 if (ml->ifindex == mreq->mr_ifindex &&
2993 ml->type == mreq->mr_type &&
2994 ml->alen == mreq->mr_alen &&
2995 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2996 if (--ml->count == 0) {
2997 struct net_device *dev;
2998 *mlp = ml->next;
ad959e76
ED
2999 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3000 if (dev)
1da177e4 3001 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3002 kfree(ml);
3003 }
3004 rtnl_unlock();
3005 return 0;
3006 }
3007 }
3008 rtnl_unlock();
3009 return -EADDRNOTAVAIL;
3010}
3011
3012static void packet_flush_mclist(struct sock *sk)
3013{
3014 struct packet_sock *po = pkt_sk(sk);
3015 struct packet_mclist *ml;
3016
3017 if (!po->mclist)
3018 return;
3019
3020 rtnl_lock();
3021 while ((ml = po->mclist) != NULL) {
3022 struct net_device *dev;
3023
3024 po->mclist = ml->next;
ad959e76
ED
3025 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3026 if (dev != NULL)
1da177e4 3027 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3028 kfree(ml);
3029 }
3030 rtnl_unlock();
3031}
3032
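/* setsockopt(SOL_PACKET): membership changes, ring setup through
 * packet_set_ring() for PACKET_RX_RING/PACKET_TX_RING, and per-socket
 * knobs such as tpacket version, reserve, loss mode, auxdata, origdev,
 * vnet header, timestamping and fanout.
 */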
3033static int
b7058842 3034packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3035{
3036 struct sock *sk = sock->sk;
8dc41944 3037 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3038 int ret;
3039
3040 if (level != SOL_PACKET)
3041 return -ENOPROTOOPT;
3042
69e3c75f 3043 switch (optname) {
1ce4f28b 3044 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3045 case PACKET_DROP_MEMBERSHIP:
3046 {
0fb375fb
EB
3047 struct packet_mreq_max mreq;
3048 int len = optlen;
3049 memset(&mreq, 0, sizeof(mreq));
3050 if (len < sizeof(struct packet_mreq))
1da177e4 3051 return -EINVAL;
0fb375fb
EB
3052 if (len > sizeof(mreq))
3053 len = sizeof(mreq);
40d4e3df 3054 if (copy_from_user(&mreq, optval, len))
1da177e4 3055 return -EFAULT;
0fb375fb
EB
3056 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3057 return -EINVAL;
1da177e4
LT
3058 if (optname == PACKET_ADD_MEMBERSHIP)
3059 ret = packet_mc_add(sk, &mreq);
3060 else
3061 ret = packet_mc_drop(sk, &mreq);
3062 return ret;
3063 }
a2efcfa0 3064
1da177e4 3065 case PACKET_RX_RING:
69e3c75f 3066 case PACKET_TX_RING:
1da177e4 3067 {
f6fb8f10 3068 union tpacket_req_u req_u;
3069 int len;
1da177e4 3070
f6fb8f10 3071 switch (po->tp_version) {
3072 case TPACKET_V1:
3073 case TPACKET_V2:
3074 len = sizeof(req_u.req);
3075 break;
3076 case TPACKET_V3:
3077 default:
3078 len = sizeof(req_u.req3);
3079 break;
3080 }
3081 if (optlen < len)
1da177e4 3082 return -EINVAL;
bfd5f4a3
SS
3083 if (pkt_sk(sk)->has_vnet_hdr)
3084 return -EINVAL;
f6fb8f10 3085 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3086 return -EFAULT;
f6fb8f10 3087 return packet_set_ring(sk, &req_u, 0,
3088 optname == PACKET_TX_RING);
1da177e4
LT
3089 }
3090 case PACKET_COPY_THRESH:
3091 {
3092 int val;
3093
40d4e3df 3094 if (optlen != sizeof(val))
1da177e4 3095 return -EINVAL;
40d4e3df 3096 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3097 return -EFAULT;
3098
3099 pkt_sk(sk)->copy_thresh = val;
3100 return 0;
3101 }
bbd6ef87
PM
3102 case PACKET_VERSION:
3103 {
3104 int val;
3105
3106 if (optlen != sizeof(val))
3107 return -EINVAL;
69e3c75f 3108 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3109 return -EBUSY;
3110 if (copy_from_user(&val, optval, sizeof(val)))
3111 return -EFAULT;
3112 switch (val) {
3113 case TPACKET_V1:
3114 case TPACKET_V2:
f6fb8f10 3115 case TPACKET_V3:
bbd6ef87
PM
3116 po->tp_version = val;
3117 return 0;
3118 default:
3119 return -EINVAL;
3120 }
3121 }
8913336a
PM
3122 case PACKET_RESERVE:
3123 {
3124 unsigned int val;
3125
3126 if (optlen != sizeof(val))
3127 return -EINVAL;
69e3c75f 3128 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3129 return -EBUSY;
3130 if (copy_from_user(&val, optval, sizeof(val)))
3131 return -EFAULT;
3132 po->tp_reserve = val;
3133 return 0;
3134 }
69e3c75f
JB
3135 case PACKET_LOSS:
3136 {
3137 unsigned int val;
3138
3139 if (optlen != sizeof(val))
3140 return -EINVAL;
3141 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3142 return -EBUSY;
3143 if (copy_from_user(&val, optval, sizeof(val)))
3144 return -EFAULT;
3145 po->tp_loss = !!val;
3146 return 0;
3147 }
8dc41944
HX
3148 case PACKET_AUXDATA:
3149 {
3150 int val;
3151
3152 if (optlen < sizeof(val))
3153 return -EINVAL;
3154 if (copy_from_user(&val, optval, sizeof(val)))
3155 return -EFAULT;
3156
3157 po->auxdata = !!val;
3158 return 0;
3159 }
80feaacb
PWJ
3160 case PACKET_ORIGDEV:
3161 {
3162 int val;
3163
3164 if (optlen < sizeof(val))
3165 return -EINVAL;
3166 if (copy_from_user(&val, optval, sizeof(val)))
3167 return -EFAULT;
3168
3169 po->origdev = !!val;
3170 return 0;
3171 }
bfd5f4a3
SS
3172 case PACKET_VNET_HDR:
3173 {
3174 int val;
3175
3176 if (sock->type != SOCK_RAW)
3177 return -EINVAL;
3178 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3179 return -EBUSY;
3180 if (optlen < sizeof(val))
3181 return -EINVAL;
3182 if (copy_from_user(&val, optval, sizeof(val)))
3183 return -EFAULT;
3184
3185 po->has_vnet_hdr = !!val;
3186 return 0;
3187 }
614f60fa
SM
3188 case PACKET_TIMESTAMP:
3189 {
3190 int val;
3191
3192 if (optlen != sizeof(val))
3193 return -EINVAL;
3194 if (copy_from_user(&val, optval, sizeof(val)))
3195 return -EFAULT;
3196
3197 po->tp_tstamp = val;
3198 return 0;
3199 }
dc99f600
DM
3200 case PACKET_FANOUT:
3201 {
3202 int val;
3203
3204 if (optlen != sizeof(val))
3205 return -EINVAL;
3206 if (copy_from_user(&val, optval, sizeof(val)))
3207 return -EFAULT;
3208
3209 return fanout_add(sk, val & 0xffff, val >> 16);
3210 }
1da177e4
LT
3211 default:
3212 return -ENOPROTOOPT;
3213 }
3214}
3215
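/* getsockopt(SOL_PACKET): read back the options above; the
 * PACKET_STATISTICS counters are cleared on read, and PACKET_HDRLEN
 * maps a tpacket version to its header length.
 */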
3216static int packet_getsockopt(struct socket *sock, int level, int optname,
3217 char __user *optval, int __user *optlen)
3218{
3219 int len;
c06fff6e 3220 int val, lv = sizeof(val);
1da177e4
LT
3221 struct sock *sk = sock->sk;
3222 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3223 void *data = &val;
8dc41944 3224 struct tpacket_stats st;
f6fb8f10 3225 union tpacket_stats_u st_u;
1da177e4
LT
3226
3227 if (level != SOL_PACKET)
3228 return -ENOPROTOOPT;
3229
8ae55f04
KK
3230 if (get_user(len, optlen))
3231 return -EFAULT;
1da177e4
LT
3232
3233 if (len < 0)
3234 return -EINVAL;
1ce4f28b 3235
69e3c75f 3236 switch (optname) {
1da177e4 3237 case PACKET_STATISTICS:
1da177e4 3238 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3239 if (po->tp_version == TPACKET_V3) {
c06fff6e 3240 lv = sizeof(struct tpacket_stats_v3);
f6fb8f10 3241 memcpy(&st_u.stats3, &po->stats,
c06fff6e 3242 sizeof(struct tpacket_stats));
f6fb8f10 3243 st_u.stats3.tp_freeze_q_cnt =
c06fff6e 3244 po->stats_u.stats3.tp_freeze_q_cnt;
f6fb8f10 3245 st_u.stats3.tp_packets += po->stats.tp_drops;
3246 data = &st_u.stats3;
3247 } else {
c06fff6e 3248 lv = sizeof(struct tpacket_stats);
f6fb8f10 3249 st = po->stats;
3250 st.tp_packets += st.tp_drops;
3251 data = &st;
3252 }
1da177e4
LT
3253 memset(&po->stats, 0, sizeof(st));
3254 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3255 break;
3256 case PACKET_AUXDATA:
8dc41944 3257 val = po->auxdata;
80feaacb
PWJ
3258 break;
3259 case PACKET_ORIGDEV:
80feaacb 3260 val = po->origdev;
bfd5f4a3
SS
3261 break;
3262 case PACKET_VNET_HDR:
bfd5f4a3 3263 val = po->has_vnet_hdr;
1da177e4 3264 break;
bbd6ef87 3265 case PACKET_VERSION:
bbd6ef87 3266 val = po->tp_version;
bbd6ef87
PM
3267 break;
3268 case PACKET_HDRLEN:
3269 if (len > sizeof(int))
3270 len = sizeof(int);
3271 if (copy_from_user(&val, optval, len))
3272 return -EFAULT;
3273 switch (val) {
3274 case TPACKET_V1:
3275 val = sizeof(struct tpacket_hdr);
3276 break;
3277 case TPACKET_V2:
3278 val = sizeof(struct tpacket2_hdr);
3279 break;
f6fb8f10 3280 case TPACKET_V3:
3281 val = sizeof(struct tpacket3_hdr);
3282 break;
bbd6ef87
PM
3283 default:
3284 return -EINVAL;
3285 }
bbd6ef87 3286 break;
8913336a 3287 case PACKET_RESERVE:
8913336a 3288 val = po->tp_reserve;
8913336a 3289 break;
69e3c75f 3290 case PACKET_LOSS:
69e3c75f 3291 val = po->tp_loss;
69e3c75f 3292 break;
614f60fa 3293 case PACKET_TIMESTAMP:
614f60fa 3294 val = po->tp_tstamp;
614f60fa 3295 break;
dc99f600 3296 case PACKET_FANOUT:
dc99f600
DM
3297 val = (po->fanout ?
3298 ((u32)po->fanout->id |
3299 ((u32)po->fanout->type << 16)) :
3300 0);
dc99f600 3301 break;
1da177e4
LT
3302 default:
3303 return -ENOPROTOOPT;
3304 }
3305
c06fff6e
ED
3306 if (len > lv)
3307 len = lv;
8ae55f04
KK
3308 if (put_user(len, optlen))
3309 return -EFAULT;
8dc41944
HX
3310 if (copy_to_user(optval, data, len))
3311 return -EFAULT;
8ae55f04 3312 return 0;
1da177e4
LT
3313}
3314
3315
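/* Netdevice notifier: on NETDEV_DOWN/NETDEV_UNREGISTER unhook sockets
 * bound to the device and flag ENETDOWN; on NETDEV_UP re-register the
 * protocol hook for sockets that were bound to it.
 */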
3316static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3317{
3318 struct sock *sk;
3319 struct hlist_node *node;
ad930650 3320 struct net_device *dev = data;
c346dca1 3321 struct net *net = dev_net(dev);
1da177e4 3322
808f5114 3323 rcu_read_lock();
3324 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
3325 struct packet_sock *po = pkt_sk(sk);
3326
3327 switch (msg) {
3328 case NETDEV_UNREGISTER:
1da177e4
LT
3329 if (po->mclist)
3330 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3331 /* fallthrough */
3332
1da177e4
LT
3333 case NETDEV_DOWN:
3334 if (dev->ifindex == po->ifindex) {
3335 spin_lock(&po->bind_lock);
3336 if (po->running) {
ce06b03e 3337 __unregister_prot_hook(sk, false);
1da177e4
LT
3338 sk->sk_err = ENETDOWN;
3339 if (!sock_flag(sk, SOCK_DEAD))
3340 sk->sk_error_report(sk);
3341 }
3342 if (msg == NETDEV_UNREGISTER) {
3343 po->ifindex = -1;
160ff18a
BG
3344 if (po->prot_hook.dev)
3345 dev_put(po->prot_hook.dev);
1da177e4
LT
3346 po->prot_hook.dev = NULL;
3347 }
3348 spin_unlock(&po->bind_lock);
3349 }
3350 break;
3351 case NETDEV_UP:
808f5114 3352 if (dev->ifindex == po->ifindex) {
3353 spin_lock(&po->bind_lock);
ce06b03e
DM
3354 if (po->num)
3355 register_prot_hook(sk);
808f5114 3356 spin_unlock(&po->bind_lock);
1da177e4 3357 }
1da177e4
LT
3358 break;
3359 }
3360 }
808f5114 3361 rcu_read_unlock();
1da177e4
LT
3362 return NOTIFY_DONE;
3363}
3364
3365
3366static int packet_ioctl(struct socket *sock, unsigned int cmd,
3367 unsigned long arg)
3368{
3369 struct sock *sk = sock->sk;
3370
69e3c75f 3371 switch (cmd) {
40d4e3df
ED
3372 case SIOCOUTQ:
3373 {
3374 int amount = sk_wmem_alloc_get(sk);
31e6d363 3375
40d4e3df
ED
3376 return put_user(amount, (int __user *)arg);
3377 }
3378 case SIOCINQ:
3379 {
3380 struct sk_buff *skb;
3381 int amount = 0;
3382
3383 spin_lock_bh(&sk->sk_receive_queue.lock);
3384 skb = skb_peek(&sk->sk_receive_queue);
3385 if (skb)
3386 amount = skb->len;
3387 spin_unlock_bh(&sk->sk_receive_queue.lock);
3388 return put_user(amount, (int __user *)arg);
3389 }
3390 case SIOCGSTAMP:
3391 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3392 case SIOCGSTAMPNS:
3393 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3394
1da177e4 3395#ifdef CONFIG_INET
40d4e3df
ED
3396 case SIOCADDRT:
3397 case SIOCDELRT:
3398 case SIOCDARP:
3399 case SIOCGARP:
3400 case SIOCSARP:
3401 case SIOCGIFADDR:
3402 case SIOCSIFADDR:
3403 case SIOCGIFBRDADDR:
3404 case SIOCSIFBRDADDR:
3405 case SIOCGIFNETMASK:
3406 case SIOCSIFNETMASK:
3407 case SIOCGIFDSTADDR:
3408 case SIOCSIFDSTADDR:
3409 case SIOCSIFFLAGS:
40d4e3df 3410 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3411#endif
3412
40d4e3df
ED
3413 default:
3414 return -ENOIOCTLCMD;
1da177e4
LT
3415 }
3416 return 0;
3417}
3418
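/* poll(): in addition to the normal datagram semantics, report POLLIN
 * when the RX ring holds a frame no longer owned by the kernel and
 * POLLOUT when the current TX-ring frame is TP_STATUS_AVAILABLE.
 */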
40d4e3df 3419static unsigned int packet_poll(struct file *file, struct socket *sock,
3420 poll_table *wait)
3421{
3422 struct sock *sk = sock->sk;
3423 struct packet_sock *po = pkt_sk(sk);
3424 unsigned int mask = datagram_poll(file, sock, wait);
3425
3426 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3427 if (po->rx_ring.pg_vec) {
f6fb8f10 3428 if (!packet_previous_rx_frame(po, &po->rx_ring,
3429 TP_STATUS_KERNEL))
3430 mask |= POLLIN | POLLRDNORM;
3431 }
3432 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3433 spin_lock_bh(&sk->sk_write_queue.lock);
3434 if (po->tx_ring.pg_vec) {
3435 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3436 mask |= POLLOUT | POLLWRNORM;
3437 }
3438 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3439 return mask;
3440}
3441
3442
3443/* Dirty? Well, I still did not learn better way to account
3444 * for user mmaps.
3445 */
3446
3447static void packet_mm_open(struct vm_area_struct *vma)
3448{
3449 struct file *file = vma->vm_file;
40d4e3df 3450 struct socket *sock = file->private_data;
1da177e4 3451 struct sock *sk = sock->sk;
1ce4f28b 3452
1da177e4
LT
3453 if (sk)
3454 atomic_inc(&pkt_sk(sk)->mapped);
3455}
3456
3457static void packet_mm_close(struct vm_area_struct *vma)
3458{
3459 struct file *file = vma->vm_file;
40d4e3df 3460 struct socket *sock = file->private_data;
1da177e4 3461 struct sock *sk = sock->sk;
1ce4f28b 3462
1da177e4
LT
3463 if (sk)
3464 atomic_dec(&pkt_sk(sk)->mapped);
3465}
3466
f0f37e2f 3467static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3468 .open = packet_mm_open,
3469 .close = packet_mm_close,
1da177e4
LT
3470};
3471
0e3125c7
NH
3472static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3473 unsigned int len)
1da177e4
LT
3474{
3475 int i;
3476
4ebf0ae2 3477 for (i = 0; i < len; i++) {
0e3125c7 3478 if (likely(pg_vec[i].buffer)) {
c56b4d90 3479 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3480 vfree(pg_vec[i].buffer);
3481 else
3482 free_pages((unsigned long)pg_vec[i].buffer,
3483 order);
3484 pg_vec[i].buffer = NULL;
3485 }
1da177e4
LT
3486 }
3487 kfree(pg_vec);
3488}
3489
eea49cc9 3490static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3491{
0e3125c7
NH
3492 char *buffer = NULL;
3493 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3494 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3495
3496 buffer = (char *) __get_free_pages(gfp_flags, order);
3497
3498 if (buffer)
3499 return buffer;
3500
3501 /*
3502 * __get_free_pages failed, fall back to vmalloc
3503 */
bbce5a59 3504 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3505
0e3125c7
NH
3506 if (buffer)
3507 return buffer;
3508
3509 /*
3510 * vmalloc failed, lets dig into swap here
3511 */
0e3125c7
NH
3512 gfp_flags &= ~__GFP_NORETRY;
3513 buffer = (char *)__get_free_pages(gfp_flags, order);
3514 if (buffer)
3515 return buffer;
3516
3517 /*
3518 * complete and utter failure
3519 */
3520 return NULL;
4ebf0ae2
DM
3521}
3522
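/* Allocate the ring's page vector: one buffer per block, each obtained
 * through alloc_one_pg_vec_page(), which tries physically contiguous
 * pages first and falls back to vzalloc().
 */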
0e3125c7 3523static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3524{
3525 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3526 struct pgv *pg_vec;
4ebf0ae2
DM
3527 int i;
3528
0e3125c7 3529 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3530 if (unlikely(!pg_vec))
3531 goto out;
3532
3533 for (i = 0; i < block_nr; i++) {
c56b4d90 3534 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3535 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3536 goto out_free_pgvec;
3537 }
3538
3539out:
3540 return pg_vec;
3541
3542out_free_pgvec:
3543 free_pg_vec(pg_vec, order, block_nr);
3544 pg_vec = NULL;
3545 goto out;
3546}
1da177e4 3547
f6fb8f10 3548static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3549 int closing, int tx_ring)
1da177e4 3550{
0e3125c7 3551 struct pgv *pg_vec = NULL;
1da177e4 3552 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3553 int was_running, order = 0;
69e3c75f
JB
3554 struct packet_ring_buffer *rb;
3555 struct sk_buff_head *rb_queue;
0e11c91e 3556 __be16 num;
f6fb8f10 3557 int err = -EINVAL;
3558 /* Added to avoid minimal code churn */
3559 struct tpacket_req *req = &req_u->req;
3560
3561 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3562 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3563 WARN(1, "Tx-ring is not supported.\n");
3564 goto out;
3565 }
1ce4f28b 3566
69e3c75f
JB
3567 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3568 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3569
69e3c75f
JB
3570 err = -EBUSY;
3571 if (!closing) {
3572 if (atomic_read(&po->mapped))
3573 goto out;
3574 if (atomic_read(&rb->pending))
3575 goto out;
3576 }
1da177e4 3577
69e3c75f
JB
3578 if (req->tp_block_nr) {
3579 /* Sanity tests and some calculations */
3580 err = -EBUSY;
3581 if (unlikely(rb->pg_vec))
3582 goto out;
1da177e4 3583
bbd6ef87
PM
3584 switch (po->tp_version) {
3585 case TPACKET_V1:
3586 po->tp_hdrlen = TPACKET_HDRLEN;
3587 break;
3588 case TPACKET_V2:
3589 po->tp_hdrlen = TPACKET2_HDRLEN;
3590 break;
f6fb8f10 3591 case TPACKET_V3:
3592 po->tp_hdrlen = TPACKET3_HDRLEN;
3593 break;
bbd6ef87
PM
3594 }
3595
69e3c75f 3596 err = -EINVAL;
4ebf0ae2 3597 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3598 goto out;
4ebf0ae2 3599 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3600 goto out;
8913336a 3601 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
3602 po->tp_reserve))
3603 goto out;
4ebf0ae2 3604 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3605 goto out;
1da177e4 3606
69e3c75f
JB
3607 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3608 if (unlikely(rb->frames_per_block <= 0))
3609 goto out;
3610 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3611 req->tp_frame_nr))
3612 goto out;
1da177e4
LT
3613
3614 err = -ENOMEM;
4ebf0ae2
DM
3615 order = get_order(req->tp_block_size);
3616 pg_vec = alloc_pg_vec(req, order);
3617 if (unlikely(!pg_vec))
1da177e4 3618 goto out;
f6fb8f10 3619 switch (po->tp_version) {
3620 case TPACKET_V3:
3621 /* Transmit path is not supported. We checked
3622 * it above but just being paranoid
3623 */
3624 if (!tx_ring)
3625 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3626 break;
3627 default:
3628 break;
3629 }
69e3c75f
JB
3630 }
3631 /* Done */
3632 else {
3633 err = -EINVAL;
4ebf0ae2 3634 if (unlikely(req->tp_frame_nr))
69e3c75f 3635 goto out;
1da177e4
LT
3636 }
3637
3638 lock_sock(sk);
3639
3640 /* Detach socket from network */
3641 spin_lock(&po->bind_lock);
3642 was_running = po->running;
3643 num = po->num;
3644 if (was_running) {
1da177e4 3645 po->num = 0;
ce06b03e 3646 __unregister_prot_hook(sk, false);
1da177e4
LT
3647 }
3648 spin_unlock(&po->bind_lock);
1ce4f28b 3649
1da177e4
LT
3650 synchronize_net();
3651
3652 err = -EBUSY;
905db440 3653 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3654 if (closing || atomic_read(&po->mapped) == 0) {
3655 err = 0;
69e3c75f 3656 spin_lock_bh(&rb_queue->lock);
c053fd96 3657 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3658 rb->frame_max = (req->tp_frame_nr - 1);
3659 rb->head = 0;
3660 rb->frame_size = req->tp_frame_size;
3661 spin_unlock_bh(&rb_queue->lock);
3662
c053fd96
CG
3663 swap(rb->pg_vec_order, order);
3664 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3665
3666 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3667 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3668 tpacket_rcv : packet_rcv;
3669 skb_queue_purge(rb_queue);
1da177e4 3670 if (atomic_read(&po->mapped))
40d4e3df
ED
3671 pr_err("packet_mmap: vma is busy: %d\n",
3672 atomic_read(&po->mapped));
1da177e4 3673 }
905db440 3674 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3675
3676 spin_lock(&po->bind_lock);
ce06b03e 3677 if (was_running) {
1da177e4 3678 po->num = num;
ce06b03e 3679 register_prot_hook(sk);
1da177e4
LT
3680 }
3681 spin_unlock(&po->bind_lock);
f6fb8f10 3682 if (closing && (po->tp_version > TPACKET_V2)) {
3683 /* Because we don't support block-based V3 on tx-ring */
3684 if (!tx_ring)
3685 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3686 }
1da177e4
LT
3687 release_sock(sk);
3688
1da177e4
LT
3689 if (pg_vec)
3690 free_pg_vec(pg_vec, order, req->tp_block_nr);
3691out:
3692 return err;
3693}
3694
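/* mmap() of the RX and/or TX ring: the requested size must match the
 * configured blocks exactly; pages are installed one at a time with
 * vm_insert_page(), and po->mapped guards packet_set_ring() against
 * swapping buffers out from under an active mapping.
 */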
3695static int packet_mmap(struct file *file, struct socket *sock,
3696 struct vm_area_struct *vma)
3697{
3698 struct sock *sk = sock->sk;
3699 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3700 unsigned long size, expected_size;
3701 struct packet_ring_buffer *rb;
1da177e4
LT
3702 unsigned long start;
3703 int err = -EINVAL;
3704 int i;
3705
3706 if (vma->vm_pgoff)
3707 return -EINVAL;
3708
905db440 3709 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3710
3711 expected_size = 0;
3712 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3713 if (rb->pg_vec) {
3714 expected_size += rb->pg_vec_len
3715 * rb->pg_vec_pages
3716 * PAGE_SIZE;
3717 }
3718 }
3719
3720 if (expected_size == 0)
1da177e4 3721 goto out;
69e3c75f
JB
3722
3723 size = vma->vm_end - vma->vm_start;
3724 if (size != expected_size)
1da177e4
LT
3725 goto out;
3726
1da177e4 3727 start = vma->vm_start;
69e3c75f
JB
3728 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3729 if (rb->pg_vec == NULL)
3730 continue;
3731
3732 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3733 struct page *page;
3734 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3735 int pg_num;
3736
c56b4d90
CG
3737 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3738 page = pgv_to_page(kaddr);
69e3c75f
JB
3739 err = vm_insert_page(vma, start, page);
3740 if (unlikely(err))
3741 goto out;
3742 start += PAGE_SIZE;
0e3125c7 3743 kaddr += PAGE_SIZE;
69e3c75f 3744 }
4ebf0ae2 3745 }
1da177e4 3746 }
69e3c75f 3747
4ebf0ae2 3748 atomic_inc(&po->mapped);
1da177e4
LT
3749 vma->vm_ops = &packet_mmap_ops;
3750 err = 0;
3751
3752out:
905db440 3753 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3754 return err;
3755}
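/* A minimal user-space sketch (not part of this file) of the ring ABI
 * served by packet_set_ring() and packet_mmap() above. Sizes are
 * hypothetical, headers and error handling are omitted:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct tpacket2_hdr *hdr = ring;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *
 * hdr now points at the first filled frame of block 0; after processing
 * it, user space hands the slot back with hdr->tp_status = TP_STATUS_KERNEL.
 */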
1da177e4 3756
90ddc4f0 3757static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3758 .family = PF_PACKET,
3759 .owner = THIS_MODULE,
3760 .release = packet_release,
3761 .bind = packet_bind_spkt,
3762 .connect = sock_no_connect,
3763 .socketpair = sock_no_socketpair,
3764 .accept = sock_no_accept,
3765 .getname = packet_getname_spkt,
3766 .poll = datagram_poll,
3767 .ioctl = packet_ioctl,
3768 .listen = sock_no_listen,
3769 .shutdown = sock_no_shutdown,
3770 .setsockopt = sock_no_setsockopt,
3771 .getsockopt = sock_no_getsockopt,
3772 .sendmsg = packet_sendmsg_spkt,
3773 .recvmsg = packet_recvmsg,
3774 .mmap = sock_no_mmap,
3775 .sendpage = sock_no_sendpage,
3776};
1da177e4 3777
90ddc4f0 3778static const struct proto_ops packet_ops = {
1da177e4
LT
3779 .family = PF_PACKET,
3780 .owner = THIS_MODULE,
3781 .release = packet_release,
3782 .bind = packet_bind,
3783 .connect = sock_no_connect,
3784 .socketpair = sock_no_socketpair,
3785 .accept = sock_no_accept,
1ce4f28b 3786 .getname = packet_getname,
1da177e4
LT
3787 .poll = packet_poll,
3788 .ioctl = packet_ioctl,
3789 .listen = sock_no_listen,
3790 .shutdown = sock_no_shutdown,
3791 .setsockopt = packet_setsockopt,
3792 .getsockopt = packet_getsockopt,
3793 .sendmsg = packet_sendmsg,
3794 .recvmsg = packet_recvmsg,
3795 .mmap = packet_mmap,
3796 .sendpage = sock_no_sendpage,
3797};
3798
ec1b4cf7 3799static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3800 .family = PF_PACKET,
3801 .create = packet_create,
3802 .owner = THIS_MODULE,
3803};
3804
3805static struct notifier_block packet_netdev_notifier = {
40d4e3df 3806 .notifier_call = packet_notifier,
1da177e4
LT
3807};
3808
3809#ifdef CONFIG_PROC_FS
1da177e4
LT
3810
3811static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3812 __acquires(RCU)
1da177e4 3813{
e372c414 3814 struct net *net = seq_file_net(seq);
808f5114 3815
3816 rcu_read_lock();
3817 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3818}
3819
3820static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3821{
1bf40954 3822 struct net *net = seq_file_net(seq);
808f5114 3823 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3824}
3825
3826static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3827 __releases(RCU)
1da177e4 3828{
808f5114 3829 rcu_read_unlock();
1da177e4
LT
3830}
3831
1ce4f28b 3832static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3833{
3834 if (v == SEQ_START_TOKEN)
3835 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3836 else {
b7ceabd9 3837 struct sock *s = sk_entry(v);
1da177e4
LT
3838 const struct packet_sock *po = pkt_sk(s);
3839
3840 seq_printf(seq,
71338aa7 3841 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3842 s,
3843 atomic_read(&s->sk_refcnt),
3844 s->sk_type,
3845 ntohs(po->num),
3846 po->ifindex,
3847 po->running,
3848 atomic_read(&s->sk_rmem_alloc),
3849 sock_i_uid(s),
40d4e3df 3850 sock_i_ino(s));
1da177e4
LT
3851 }
3852
3853 return 0;
3854}
3855
56b3d975 3856static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3857 .start = packet_seq_start,
3858 .next = packet_seq_next,
3859 .stop = packet_seq_stop,
3860 .show = packet_seq_show,
3861};
3862
3863static int packet_seq_open(struct inode *inode, struct file *file)
3864{
e372c414
DL
3865 return seq_open_net(inode, file, &packet_seq_ops,
3866 sizeof(struct seq_net_private));
1da177e4
LT
3867}
3868
da7071d7 3869static const struct file_operations packet_seq_fops = {
1da177e4
LT
3870 .owner = THIS_MODULE,
3871 .open = packet_seq_open,
3872 .read = seq_read,
3873 .llseek = seq_lseek,
e372c414 3874 .release = seq_release_net,
1da177e4
LT
3875};
3876
3877#endif
3878
2c8c1e72 3879static int __net_init packet_net_init(struct net *net)
d12d01d6 3880{
808f5114 3881 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 3882 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
3883
3884 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
3885 return -ENOMEM;
3886
3887 return 0;
3888}
3889
2c8c1e72 3890static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
3891{
3892 proc_net_remove(net, "packet");
3893}
3894
3895static struct pernet_operations packet_net_ops = {
3896 .init = packet_net_init,
3897 .exit = packet_net_exit,
3898};
3899
3900
1da177e4
LT
3901static void __exit packet_exit(void)
3902{
1da177e4 3903 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3904 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3905 sock_unregister(PF_PACKET);
3906 proto_unregister(&packet_proto);
3907}
3908
3909static int __init packet_init(void)
3910{
3911 int rc = proto_register(&packet_proto, 0);
3912
3913 if (rc != 0)
3914 goto out;
3915
3916 sock_register(&packet_family_ops);
d12d01d6 3917 register_pernet_subsys(&packet_net_ops);
1da177e4 3918 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3919out:
3920 return rc;
3921}
3922
3923module_init(packet_init);
3924module_exit(packet_exit);
3925MODULE_LICENSE("GPL");
3926MODULE_ALIAS_NETPROTO(PF_PACKET);