ipv4: Add ip_defrag() agent IP_DEFRAG_AF_PACKET.
[GitHub/mt8127/android_kernel_alcatel_ttab.git] net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header itself. In this case the ll header is invisible outside of the
     device, but higher levels should still reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are silly
     (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It very likely points to the ll header.
                 PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
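
/* Illustrative sketch (not part of this file): how the two socket types see
 * the ll header from user space, per the notes above.  A SOCK_RAW packet
 * socket delivers frames starting at the link-layer header, while SOCK_DGRAM
 * delivers payload with the ll header already pulled.  Hypothetical usage,
 * assuming an ethernet device whose ifindex is `idx`:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = idx,
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *	// read() now returns whole frames, ethernet header included;
 *	// with SOCK_DGRAM the same read would start at the network header.
 */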

/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist *next;
	int ifindex;
	int count;
	unsigned short type;
	unsigned short alen;
	unsigned char addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int mr_ifindex;
	unsigned short mr_type;
	unsigned short mr_alen;
	unsigned char mr_address[MAX_ADDR_LEN];
};

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
			   int closing, int tx_ring);

struct pgv {
	char *buffer;
};

struct packet_ring_buffer {
	struct pgv *pg_vec;
	unsigned int head;
	unsigned int frames_per_block;
	unsigned int frame_size;
	unsigned int frame_max;

	unsigned int pg_vec_order;
	unsigned int pg_vec_pages;
	unsigned int pg_vec_len;

	atomic_t pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

struct packet_fanout;
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock sk;
	struct packet_fanout *fanout;
	struct tpacket_stats stats;
	struct packet_ring_buffer rx_ring;
	struct packet_ring_buffer tx_ring;
	int copy_thresh;
	spinlock_t bind_lock;
	struct mutex pg_vec_lock;
	unsigned int running:1,		/* prot_hook is attached */
		     auxdata:1,
		     origdev:1,
		     has_vnet_hdr:1;
	int ifindex;			/* bound device */
	__be16 num;
	struct packet_mclist *mclist;
	atomic_t mapped;
	enum tpacket_versions tp_version;
	unsigned int tp_hdrlen;
	unsigned int tp_reserve;
	unsigned int tp_loss:1;
	unsigned int tp_tstamp;
	struct packet_type prot_hook ____cacheline_aligned_in_smp;
};

#define PACKET_FANOUT_MAX	256

struct packet_fanout {
#ifdef CONFIG_NET_NS
	struct net *net;
#endif
	unsigned int num_members;
	u16 id;
	u8 type;
	u8 pad;
	atomic_t rr_cur;
	struct list_head list;
	struct sock *arr[PACKET_FANOUT_MAX];
	spinlock_t lock;
	atomic_t sk_ref;
	struct packet_type prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net() to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;
	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);
	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
		return 0;
	}
}

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
					  struct packet_ring_buffer *rb,
					  int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

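/* Note on the index computation below: for a 32-bit rxhash h and a group of
 * num sockets, ((u64)h * num) >> 32 equals floor(h * num / 2^32), which always
 * lands in [0, num) and spreads hash values near-uniformly across the group
 * without needing a division.
 */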
static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	u32 idx, hash = skb->rxhash;

	idx = ((u64)hash * num) >> 32;

	return f->arr[idx];
}

static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return f->arr[cur];
}

static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
				  struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	struct sock *sk;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	skb_get_rxhash(skb);

	sk = fanout_demux_hash(f, skb, num);
	po = pkt_sk(sk);

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
				struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	struct sock *sk;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	sk = fanout_demux_lb(f, skb, num);
	po = pkt_sk(sk);

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

static DEFINE_MUTEX(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static int fanout_add(struct sock *sk, u16 id, u8 type)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	int err;

	switch (type) {
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	if (!match) {
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (match) {
			write_pnet(&match->net, sock_net(sk));
			match->id = id;
			match->type = type;
			atomic_set(&match->rr_cur, 0);
			INIT_LIST_HEAD(&match->list);
			spin_lock_init(&match->lock);
			atomic_set(&match->sk_ref, 0);
			match->prot_hook.type = po->prot_hook.type;
			match->prot_hook.dev = po->prot_hook.dev;
			switch (type) {
			case PACKET_FANOUT_HASH:
				match->prot_hook.func = packet_rcv_fanout_hash;
				break;
			case PACKET_FANOUT_LB:
				match->prot_hook.func = packet_rcv_fanout_lb;
				break;
			}
			match->prot_hook.af_packet_priv = match;
			dev_add_pack(&match->prot_hook);
			list_add(&match->list, &fanout_list);
		}
	}
	err = -ENOMEM;
	if (match) {
		err = -EINVAL;
		if (match->type == type &&
		    match->prot_hook.type == po->prot_hook.type &&
		    match->prot_hook.dev == po->prot_hook.dev) {
			err = -ENOSPC;
			if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
				__dev_remove_pack(&po->prot_hook);
				po->fanout = match;
				atomic_inc(&match->sk_ref);
				__fanout_link(sk, po);
				err = 0;
			}
		}
	}
	mutex_unlock(&fanout_mutex);
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	po->fanout = NULL;

	mutex_lock(&fanout_mutex);
	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}
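
/* Illustrative sketch (not part of this file): what fanout_add() services
 * from user space.  The PACKET_FANOUT setsockopt (handled later in this file)
 * packs the 16-bit group id into the low word of the option value and the
 * demux mode into the high word; each member socket must already be bound
 * and running.  Hypothetical usage, spreading flows across two sockets:
 *
 *	int fd1 = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int fd2 = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// ... bind both to the same device/protocol ...
 *	int arg = 42 | (PACKET_FANOUT_HASH << 16);   // group 42, rxhash demux
 *
 *	setsockopt(fd1, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *	setsockopt(fd2, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *	// packets of one flow now consistently land on the same socket
 */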

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so this procedure is a no-op.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 *	You may not queue a frame bigger than the mtu. This is the lowest level
	 *	raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_unlock;

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static inline unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}

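/* Illustrative sketch (not part of this file): the filter consulted above is
 * installed from user space with SO_ATTACH_FILTER.  A minimal, hypothetical
 * classic-BPF program that accepts every packet (snapped to 64KB):
 *
 *	#include <linux/filter.h>
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),	// accept, 65535 bytes
 *	};
 *	struct sock_fprog prog = {
 *		.len    = 1,
 *		.filter = code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * run_filter()'s return value becomes the snaplen: 0 drops the packet, a
 * smaller value truncates it.
 */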

/*
 * This function does lazy skb cloning, in the hope that most packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on
 * exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that the corresponding packet head is
		 * never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			 po->tp_reserve;
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->rx_ring.frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	if (!h.raw)
		goto ring_is_full;
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (vlan_tx_tag_present(skb)) {
			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
			status |= TP_STATUS_VLAN_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
		}
		h.h2->tp_padding = 0;
		hdrlen = sizeof(*h.h2);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	__packet_set_status(po, h.raw, status);
	smp_mb();
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	{
		u8 *start, *end;

		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
		for (start = h.raw; start < end; start += PAGE_SIZE)
			flush_dcache_page(pgv_to_page(start));
	}
#endif

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
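
/* Illustrative sketch (not part of this file): the consumer side of the ring
 * that tpacket_rcv() fills.  A hypothetical TPACKET_V2 user-space reader,
 * with blocksiz/nblocks/framesiz/frames chosen by the caller and frame_at()
 * a hypothetical helper that indexes frames in the mapping:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = blocksiz,
 *		.tp_block_nr   = nblocks,
 *		.tp_frame_size = framesiz,
 *		.tp_frame_nr   = frames,
 *	};
 *	int v = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	struct tpacket2_hdr *hdr = frame_at(ring, i);
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel
 *	// ... consume hdr->tp_mac / hdr->tp_snaplen ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 */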

static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	BUG_ON(skb == NULL);

	if (likely(po->tx_ring.pg_vec)) {
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
	}

	sock_wfree(skb);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
			    void *frame, struct net_device *dev, int size_max,
			    __be16 proto, unsigned char *addr)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				      NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);
			return -EINVAL;
		}

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				     dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	err = -EFAULT;
	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += len;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, nr_frags, page, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	bool need_rls_dev = false;
	int err, reserve = 0;
	void *ph;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = 0;

	mutex_lock(&po->pg_vec_lock);

	err = -EBUSY;
	if (saddr == NULL) {
		dev = po->prot_hook.dev;
		proto = po->num;
		addr = NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						   sll_addr)))
			goto out;
		proto = saddr->sll_protocol;
		addr = saddr->sll_addr;
		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
		need_rls_dev = true;
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;

	reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	size_max = po->tx_ring.frame_size
		   - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {
			schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		skb = sock_alloc_send_skb(&po->sk,
					  LL_ALLOCATED_SPACE(dev)
					  + sizeof(struct sockaddr_ll),
					  0, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
					  addr);

		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						    TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
			((!(msg->msg_flags & MSG_DONTWAIT)) &&
			 (atomic_read(&po->tx_ring.pending))))
		);

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	if (need_rls_dev)
		dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
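
/* Illustrative sketch (not part of this file): the producer side of the TX
 * ring drained by tpacket_snd().  Hypothetical user-space usage after a
 * PACKET_TX_RING setsockopt and mmap() analogous to the RX case; frame_at()
 * is again a hypothetical helper, and the data offset assumes TPACKET_V2:
 *
 *	struct tpacket2_hdr *hdr = frame_at(ring, i);
 *	if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *		memcpy((u8 *)hdr + TPACKET2_HDRLEN -
 *		       sizeof(struct sockaddr_ll), pkt, pktlen);
 *		hdr->tp_len = pktlen;
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		send(fd, NULL, 0, 0);		// kick the kernel
 *	}
 */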

static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					       size_t reserve, size_t len,
					       size_t linear, int noblock,
					       int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

static int packet_snd(struct socket *sock,
		      struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	bool need_rls_dev = false;
	unsigned char *addr;
	int err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	int vnet_hdr_len;
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		dev = po->prot_hook.dev;
		proto = po->num;
		addr = NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto = saddr->sll_protocol;
		addr = saddr->sll_addr;
		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
		need_rls_dev = true;
	}

	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto out_unlock;

		len -= vnet_hdr_len;

		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
				       vnet_hdr_len);
		if (err < 0)
			goto out_unlock;

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
		     vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
					   vnet_hdr.csum_offset + 2;

		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto out_unlock;

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
				break;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
				break;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;
				break;
			default:
				goto out_unlock;
			}

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)
				goto out_unlock;

		}
	}

	err = -EMSGSIZE;
	if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
		goto out_unlock;

	err = -ENOBUFS;
	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_set_network_header(skb, reserve);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM &&
	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
	if (err)
		goto out_free;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_free;

	if (!gso_type && (len > dev->mtu + reserve)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_free;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
						  vnet_hdr.csum_offset)) {
				err = -EINVAL;
				goto out_free;
			}
		}

		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;
	}

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	if (need_rls_dev)
		dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev && need_rls_dev)
		dev_put(dev);
out:
	return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	struct tpacket_req req;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	spin_lock_bh(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	spin_unlock_bh(&net->packet.sklist_lock);

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, false);
	if (po->prot_hook.dev) {
		dev_put(po->prot_hook.dev);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	memset(&req, 0, sizeof(req));

	if (po->rx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 0);

	if (po->tx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 1);

	fanout_release(sk);

	synchronize_net();
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->fanout)
		return -EINVAL;

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, true);
	po->num = protocol;
	po->prot_hook.type = protocol;
	if (po->prot_hook.dev)
		dev_put(po->prot_hook.dev);
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev)
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;

	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);

out:
	return err;
}

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		register_prot_hook(sk);
	}

	spin_lock_bh(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	sock_prot_inuse_add(net, &packet_proto, 1);
	spin_unlock_bh(&net->packet.sklist_lock);

	return 0;
out:
	return err;
}

static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb, *skb2;
	int copied, err;

	err = -EAGAIN;
	skb = skb_dequeue(&sk->sk_error_queue);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
		 sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

	/* Reset and regenerate socket error */
	spin_lock_bh(&sk->sk_error_queue.lock);
	sk->sk_err = 0;
	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
		spin_unlock_bh(&sk->sk_error_queue.lock);
		sk->sk_error_report(sk);
	} else
		spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
1854
1da177e4
LT
1855/*
1856 * Pull a packet from our receive queue and hand it to the user.
1857 * If necessary we block.
1858 */
1859
1860static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1861 struct msghdr *msg, size_t len, int flags)
1862{
1863 struct sock *sk = sock->sk;
1864 struct sk_buff *skb;
1865 int copied, err;
0fb375fb 1866 struct sockaddr_ll *sll;
bfd5f4a3 1867 int vnet_hdr_len = 0;
1da177e4
LT
1868
1869 err = -EINVAL;
ed85b565 1870 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
1871 goto out;
1872
1873#if 0
1874 /* What error should we return now? EUNATTACH? */
1875 if (pkt_sk(sk)->ifindex < 0)
1876 return -ENODEV;
1877#endif
1878
ed85b565
RC
1879 if (flags & MSG_ERRQUEUE) {
1880 err = packet_recv_error(sk, msg, len);
1881 goto out;
1882 }
1883
1da177e4
LT
1884 /*
1885 * Call the generic datagram receiver. This handles all sorts
1886 * of horrible races and re-entrancy so we can forget about it
1887 * in the protocol layers.
1888 *
1889 * Now it will return ENETDOWN, if device have just gone down,
1890 * but then it will block.
1891 */
1892
40d4e3df 1893 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
1894
1895 /*
1ce4f28b 1896 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
1897 * handles the blocking we don't see and worry about blocking
1898 * retries.
1899 */
1900
	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)
			goto out_free;

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
			vnet_hdr.csum_offset = skb->csum_offset;
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
				     vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/*
	 *	If the address length field is there to be filled in, we
	 *	fill it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries
	 *	a user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (vlan_tx_tag_present(skb)) {
			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
			aux.tp_status |= TP_STATUS_VLAN_VALID;
		} else {
			aux.tp_vlan_tci = 0;
		}
		aux.tp_padding = 0;
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
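
/*
 * Userspace sketch (not part of this file): the receive path above seen
 * from the other side. Assumes a bound PF_PACKET/SOCK_RAW socket "fd"
 * with PACKET_AUXDATA enabled. MSG_TRUNC in msg.msg_flags afterwards
 * means the frame was longer than buf, and the SOL_PACKET cmsg carries
 * the tpacket_auxdata (original tp_len, VLAN tag, checksum status)
 * that packet_recvmsg() filled in.
 *
 *	char buf[2048];
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct sockaddr_ll from;
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_name = &from, .msg_namelen = sizeof(from),
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	ssize_t n = recvmsg(fd, &msg, 0);
 */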

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strncpy(uaddr->sa_data, dev->name, 14);
	else
		memset(uaddr->sa_data, 0, 14);
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}
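
/*
 * Userspace sketch (not part of this file; "fd" is an assumed bound
 * PF_PACKET socket): reading back the binding that packet_getname()
 * reports. The returned length covers only the sll_addr bytes actually
 * used, i.e. offsetof(struct sockaddr_ll, sll_addr) + sll_halen.
 *
 *	struct sockaddr_ll sll;
 *	socklen_t alen = sizeof(sll);
 *	if (getsockname(fd, (struct sockaddr *)&sll, &alen) == 0)
 *		printf("ifindex %d halen %u\n",
 *		       sll.sll_ifindex, sll.sll_halen);
 */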

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
		break;
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
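
/*
 * Userspace sketch (not part of this file; "fd" and "ifindex" are
 * assumed): the add/drop pair above is driven via setsockopt().
 * PACKET_MR_PROMISC needs no address, so mr_alen stays zero; the
 * reference counting in packet_mc_add() makes repeated adds of the
 * same entry stack rather than duplicate.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */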

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	default:
		return -ENOPROTOOPT;
	}
}
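
/*
 * Userspace sketch (not part of this file; "fd" and "group_id" are
 * assumed): PACKET_FANOUT packs the group id in the low 16 bits and
 * the mode in the high 16 bits, which is exactly how fanout_add()
 * above unpacks it.
 *
 *	int val = (PACKET_FANOUT_HASH << 16) | (group_id & 0xffff);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 */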

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
	case PACKET_VNET_HDR:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->has_vnet_hdr;

		data = &val;
		break;
	case PACKET_VERSION:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_version;
		data = &val;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		default:
			return -EINVAL;
		}
		data = &val;
		break;
	case PACKET_RESERVE:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_reserve;
		data = &val;
		break;
	case PACKET_LOSS:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_loss;
		data = &val;
		break;
	case PACKET_TIMESTAMP:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_tstamp;
		data = &val;
		break;
	case PACKET_FANOUT:
		if (len > sizeof(int))
			len = sizeof(int);
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16)) :
		       0);
		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
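
/*
 * Userspace sketch (not part of this file; "fd" is an assumed packet
 * socket): PACKET_STATISTICS is read-and-reset, as the memset() under
 * the receive queue lock above shows, so each call returns the counts
 * accumulated since the previous call. Note tp_packets already
 * includes tp_drops on return.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		printf("packets %u drops %u\n", st.tp_packets, st.tp_drops);
 */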


static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, node, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
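
/*
 * Userspace sketch (not part of this file; "fd" is an assumed packet
 * socket): for packet sockets SIOCINQ reports the length of the frame
 * at the head of the receive queue, not the total queued bytes, so it
 * can size a buffer for the next read.
 *
 *	int next_len = 0;
 *	ioctl(fd, SIOCINQ, &next_len);
 */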

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
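
/*
 * Userspace sketch (not part of this file; "ring", "frame_idx" and
 * "frame_size" are assumed, and consume_frame() is a placeholder for
 * the application's handler): with an RX ring mapped, packet_poll()
 * above signals POLLIN when the next slot has left TP_STATUS_KERNEL.
 * A reader checks the status word first and only falls back to poll()
 * when it has caught up; writing TP_STATUS_KERNEL back hands the slot
 * to the kernel again.
 *
 *	struct tpacket_hdr *hdr = (struct tpacket_hdr *)
 *			((char *)ring + frame_idx * frame_size);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	consume_frame(hdr);
 *	hdr->tp_status = TP_STATUS_KERNEL;
 */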


/* Dirty? Well, I still did not learn a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer = NULL;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);

	if (buffer)
		return buffer;

	/*
	 * __get_free_pages failed, fall back to vmalloc
	 */
	buffer = vzalloc((1 << order) * PAGE_SIZE);

	if (buffer)
		return buffer;

	/*
	 * vmalloc failed, let's dig into swap here
	 */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *)__get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/*
	 * complete and utter failure
	 */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
			   int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
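
/*
 * Userspace sketch (not part of this file; "fd" is an assumed packet
 * socket): a tpacket_req that passes the sanity checks above.
 * tp_block_size must be a multiple of the page size, tp_frame_size
 * must be TPACKET_ALIGNMENT-aligned and at least tp_hdrlen +
 * tp_reserve, and tp_frame_nr must equal frames_per_block *
 * tp_block_nr exactly.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = (4096 / 2048) * 64,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */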

static int packet_mmap(struct file *file, struct socket *sock,
		       struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
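
/*
 * Userspace sketch (not part of this file; "fd", "req_rx" and "req_tx"
 * are assumed): both rings are mapped with a single mmap() whose
 * length must equal the combined ring size, rx ring first, as the
 * expected_size check above enforces.
 *
 *	size_t rx_sz = req_rx.tp_block_size * req_rx.tp_block_nr;
 *	size_t tx_sz = req_tx.tp_block_size * req_tx.tp_block_nr;
 *	void *ring = mmap(NULL, rx_sz + tx_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */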

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif
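
/*
 * The table above is exported as /proc/net/packet, one line per packet
 * socket in the reader's network namespace, e.g. (values illustrative
 * only):
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	ffff8800368add00 3      3    0003   2     1 0      0      7663
 */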

static int __net_init packet_net_init(struct net *net)
{
	spin_lock_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);