ipv4: Kill RTO_CONN.
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
1da177e4
LT
43 *
44 * This program is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU General Public License
46 * as published by the Free Software Foundation; either version
47 * 2 of the License, or (at your option) any later version.
48 *
49 */
1ce4f28b 50
1da177e4 51#include <linux/types.h>
1da177e4 52#include <linux/mm.h>
4fc268d2 53#include <linux/capability.h>
1da177e4
LT
54#include <linux/fcntl.h>
55#include <linux/socket.h>
56#include <linux/in.h>
57#include <linux/inet.h>
58#include <linux/netdevice.h>
59#include <linux/if_packet.h>
60#include <linux/wireless.h>
ffbc6111 61#include <linux/kernel.h>
1da177e4 62#include <linux/kmod.h>
5a0e3ad6 63#include <linux/slab.h>
0e3125c7 64#include <linux/vmalloc.h>
457c4cbc 65#include <net/net_namespace.h>
1da177e4
LT
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <linux/skbuff.h>
69#include <net/sock.h>
70#include <linux/errno.h>
71#include <linux/timer.h>
72#include <asm/system.h>
73#include <asm/uaccess.h>
74#include <asm/ioctls.h>
75#include <asm/page.h>
a1f8e7f7 76#include <asm/cacheflush.h>
1da177e4
LT
77#include <asm/io.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80#include <linux/poll.h>
81#include <linux/module.h>
82#include <linux/init.h>
905db440 83#include <linux/mutex.h>
05423b24 84#include <linux/if_vlan.h>
bfd5f4a3 85#include <linux/virtio_net.h>
ed85b565 86#include <linux/errqueue.h>
614f60fa 87#include <linux/net_tstamp.h>
1da177e4
LT
88
89#ifdef CONFIG_INET
90#include <net/inet_common.h>
91#endif
92
1da177e4
LT
93/*
94 Assumptions:
95 - if device has no dev->hard_header routine, it adds and removes ll header
96 inside itself. In this case ll header is invisible outside of device,
97 but higher levels still should reserve dev->hard_header_len.
98 Some devices are clever enough to reallocate skb, when header
99 will not fit to reserved space (tunnel), another ones are silly
100 (PPP).
101 - packet socket receives packets with pulled ll header,
102 so that SOCK_RAW should push it back.
103
104On receive:
105-----------
106
107Incoming, dev->hard_header!=NULL
b0e380b1
ACM
108 mac_header -> ll header
109 data -> data
1da177e4
LT
110
111Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
112 mac_header -> ll header
113 data -> ll header
1da177e4
LT
114
115Incoming, dev->hard_header==NULL
b0e380b1
ACM
116 mac_header -> UNKNOWN position. It is very likely, that it points to ll
117 header. PPP does this, which is wrong, because it introduces
db0c58f9 118 asymmetry between rx and tx paths.
b0e380b1 119 data -> data
1da177e4
LT
120
121Outgoing, dev->hard_header==NULL
b0e380b1
ACM
122 mac_header -> data. ll header is still not built!
123 data -> data
1da177e4
LT
124
125Resume
126 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
127
128
129On transmit:
130------------
131
132dev->hard_header != NULL
b0e380b1
ACM
133 mac_header -> ll header
134 data -> ll header
1da177e4
LT
135
136dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
137 mac_header -> data
138 data -> data
1da177e4
LT
139
140 We should set nh.raw on output to correct position,
141 packet classifier depends on it.
142 */
143
1da177e4
LT
144/* Private packet socket structures. */
145
40d4e3df 146struct packet_mclist {
1da177e4
LT
147 struct packet_mclist *next;
148 int ifindex;
149 int count;
150 unsigned short type;
151 unsigned short alen;
0fb375fb
EB
152 unsigned char addr[MAX_ADDR_LEN];
153};
154/* identical to struct packet_mreq except it has
155 * a longer address field.
156 */
40d4e3df 157struct packet_mreq_max {
0fb375fb
EB
158 int mr_ifindex;
159 unsigned short mr_type;
160 unsigned short mr_alen;
161 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 162};
a2efcfa0 163
69e3c75f
JB
164static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
165 int closing, int tx_ring);
166
0e3125c7
NH
/*
 * One block of ring memory.  The buffer may live in either the linear
 * mapping or vmalloc space; see pgv_to_page().
 */
struct pgv {
	char *buffer;
};
170
69e3c75f 171struct packet_ring_buffer {
0e3125c7 172 struct pgv *pg_vec;
69e3c75f
JB
173 unsigned int head;
174 unsigned int frames_per_block;
175 unsigned int frame_size;
176 unsigned int frame_max;
177
178 unsigned int pg_vec_order;
179 unsigned int pg_vec_pages;
180 unsigned int pg_vec_len;
181
182 atomic_t pending;
183};
184
185struct packet_sock;
186static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
1da177e4
LT
187
188static void packet_flush_mclist(struct sock *sk);
189
190struct packet_sock {
191 /* struct sock has to be the first member of packet_sock */
192 struct sock sk;
193 struct tpacket_stats stats;
69e3c75f
JB
194 struct packet_ring_buffer rx_ring;
195 struct packet_ring_buffer tx_ring;
1da177e4 196 int copy_thresh;
1da177e4 197 spinlock_t bind_lock;
905db440 198 struct mutex pg_vec_lock;
8dc41944 199 unsigned int running:1, /* prot_hook is attached*/
80feaacb 200 auxdata:1,
bfd5f4a3
SS
201 origdev:1,
202 has_vnet_hdr:1;
1da177e4 203 int ifindex; /* bound device */
0e11c91e 204 __be16 num;
1da177e4 205 struct packet_mclist *mclist;
1da177e4 206 atomic_t mapped;
bbd6ef87
PM
207 enum tpacket_versions tp_version;
208 unsigned int tp_hdrlen;
8913336a 209 unsigned int tp_reserve;
69e3c75f 210 unsigned int tp_loss:1;
614f60fa 211 unsigned int tp_tstamp;
94b05952 212 struct packet_type prot_hook ____cacheline_aligned_in_smp;
1da177e4
LT
213};
214
ffbc6111
HX
215struct packet_skb_cb {
216 unsigned int origlen;
217 union {
218 struct sockaddr_pkt pkt;
219 struct sockaddr_ll ll;
220 } sa;
221};
222
223#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 224
f6dafa95 225static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
226{
227 if (is_vmalloc_addr(addr))
228 return vmalloc_to_page(addr);
229 return virt_to_page(addr);
230}
231
69e3c75f 232static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 233{
bbd6ef87
PM
234 union {
235 struct tpacket_hdr *h1;
236 struct tpacket2_hdr *h2;
237 void *raw;
238 } h;
1da177e4 239
69e3c75f 240 h.raw = frame;
bbd6ef87
PM
241 switch (po->tp_version) {
242 case TPACKET_V1:
69e3c75f 243 h.h1->tp_status = status;
0af55bb5 244 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
245 break;
246 case TPACKET_V2:
69e3c75f 247 h.h2->tp_status = status;
0af55bb5 248 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 249 break;
69e3c75f 250 default:
40d4e3df 251 pr_err("TPACKET version not supported\n");
69e3c75f 252 BUG();
bbd6ef87 253 }
69e3c75f
JB
254
255 smp_wmb();
bbd6ef87
PM
256}
257
69e3c75f 258static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87
PM
259{
260 union {
261 struct tpacket_hdr *h1;
262 struct tpacket2_hdr *h2;
263 void *raw;
264 } h;
265
69e3c75f
JB
266 smp_rmb();
267
bbd6ef87
PM
268 h.raw = frame;
269 switch (po->tp_version) {
270 case TPACKET_V1:
0af55bb5 271 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 272 return h.h1->tp_status;
bbd6ef87 273 case TPACKET_V2:
0af55bb5 274 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f
JB
275 return h.h2->tp_status;
276 default:
40d4e3df 277 pr_err("TPACKET version not supported\n");
69e3c75f
JB
278 BUG();
279 return 0;
bbd6ef87 280 }
1da177e4 281}
69e3c75f
JB
282
283static void *packet_lookup_frame(struct packet_sock *po,
284 struct packet_ring_buffer *rb,
285 unsigned int position,
286 int status)
287{
288 unsigned int pg_vec_pos, frame_offset;
289 union {
290 struct tpacket_hdr *h1;
291 struct tpacket2_hdr *h2;
292 void *raw;
293 } h;
294
295 pg_vec_pos = position / rb->frames_per_block;
296 frame_offset = position % rb->frames_per_block;
297
0e3125c7
NH
298 h.raw = rb->pg_vec[pg_vec_pos].buffer +
299 (frame_offset * rb->frame_size);
69e3c75f
JB
300
301 if (status != __packet_get_status(po, h.raw))
302 return NULL;
303
304 return h.raw;
305}
306
307static inline void *packet_current_frame(struct packet_sock *po,
308 struct packet_ring_buffer *rb,
309 int status)
310{
311 return packet_lookup_frame(po, rb, rb->head, status);
312}
313
314static inline void *packet_previous_frame(struct packet_sock *po,
315 struct packet_ring_buffer *rb,
316 int status)
317{
318 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
319 return packet_lookup_frame(po, rb, previous, status);
320}
321
322static inline void packet_increment_head(struct packet_ring_buffer *buff)
323{
324 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
325}
326
1da177e4
LT
/*
 * Convert a generic socket to its packet_sock container.  This is a plain
 * cast, valid because struct sock is the first member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
331
332static void packet_sock_destruct(struct sock *sk)
333{
ed85b565
RC
334 skb_queue_purge(&sk->sk_error_queue);
335
547b792c
IJ
336 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
337 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
338
339 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 340 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
341 return;
342 }
343
17ab56a2 344 sk_refcnt_debug_dec(sk);
1da177e4
LT
345}
346
347
90ddc4f0 348static const struct proto_ops packet_ops;
1da177e4 349
90ddc4f0 350static const struct proto_ops packet_ops_spkt;
1da177e4 351
40d4e3df
ED
352static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
353 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
354{
355 struct sock *sk;
356 struct sockaddr_pkt *spkt;
357
358 /*
359 * When we registered the protocol we saved the socket in the data
360 * field for just this event.
361 */
362
363 sk = pt->af_packet_priv;
1ce4f28b 364
1da177e4
LT
365 /*
366 * Yank back the headers [hope the device set this
367 * right or kerboom...]
368 *
369 * Incoming packets have ll header pulled,
370 * push it back.
371 *
98e399f8 372 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
373 * so that this procedure is noop.
374 */
375
376 if (skb->pkt_type == PACKET_LOOPBACK)
377 goto out;
378
09ad9bc7 379 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
380 goto out;
381
40d4e3df
ED
382 skb = skb_share_check(skb, GFP_ATOMIC);
383 if (skb == NULL)
1da177e4
LT
384 goto oom;
385
386 /* drop any routing info */
adf30907 387 skb_dst_drop(skb);
1da177e4 388
84531c24
PO
389 /* drop conntrack reference */
390 nf_reset(skb);
391
ffbc6111 392 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 393
98e399f8 394 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
395
396 /*
397 * The SOCK_PACKET socket receives _all_ frames.
398 */
399
400 spkt->spkt_family = dev->type;
401 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
402 spkt->spkt_protocol = skb->protocol;
403
404 /*
405 * Charge the memory to the socket. This is done specifically
406 * to prevent sockets using all the memory up.
407 */
408
40d4e3df 409 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
410 return 0;
411
412out:
413 kfree_skb(skb);
414oom:
415 return 0;
416}
417
418
419/*
420 * Output a raw packet to a device layer. This bypasses all the other
421 * protocol layers and you must therefore supply it with a complete frame
422 */
1ce4f28b 423
1da177e4
LT
424static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
425 struct msghdr *msg, size_t len)
426{
427 struct sock *sk = sock->sk;
40d4e3df 428 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 429 struct sk_buff *skb = NULL;
1da177e4 430 struct net_device *dev;
40d4e3df 431 __be16 proto = 0;
1da177e4 432 int err;
1ce4f28b 433
1da177e4 434 /*
1ce4f28b 435 * Get and verify the address.
1da177e4
LT
436 */
437
40d4e3df 438 if (saddr) {
1da177e4 439 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
440 return -EINVAL;
441 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
442 proto = saddr->spkt_protocol;
443 } else
444 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
445
446 /*
1ce4f28b 447 * Find the device first to size check it
1da177e4
LT
448 */
449
450 saddr->spkt_device[13] = 0;
1a35ca80 451retry:
654d1f8a
ED
452 rcu_read_lock();
453 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
454 err = -ENODEV;
455 if (dev == NULL)
456 goto out_unlock;
1ce4f28b 457
d5e76b0a
DM
458 err = -ENETDOWN;
459 if (!(dev->flags & IFF_UP))
460 goto out_unlock;
461
1da177e4 462 /*
40d4e3df
ED
463 * You may not queue a frame bigger than the mtu. This is the lowest level
464 * raw protocol and you must do your own fragmentation at this level.
1da177e4 465 */
1ce4f28b 466
1da177e4 467 err = -EMSGSIZE;
57f89bfa 468 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
1da177e4
LT
469 goto out_unlock;
470
1a35ca80
ED
471 if (!skb) {
472 size_t reserved = LL_RESERVED_SPACE(dev);
473 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
474
475 rcu_read_unlock();
476 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
477 if (skb == NULL)
478 return -ENOBUFS;
479 /* FIXME: Save some space for broken drivers that write a hard
480 * header at transmission time by themselves. PPP is the notable
481 * one here. This should really be fixed at the driver level.
482 */
483 skb_reserve(skb, reserved);
484 skb_reset_network_header(skb);
485
486 /* Try to align data part correctly */
487 if (hhlen) {
488 skb->data -= hhlen;
489 skb->tail -= hhlen;
490 if (len < hhlen)
491 skb_reset_network_header(skb);
492 }
493 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
494 if (err)
495 goto out_free;
496 goto retry;
1da177e4
LT
497 }
498
57f89bfa
BG
499 if (len > (dev->mtu + dev->hard_header_len)) {
500 /* Earlier code assumed this would be a VLAN pkt,
501 * double-check this now that we have the actual
502 * packet in hand.
503 */
504 struct ethhdr *ehdr;
505 skb_reset_mac_header(skb);
506 ehdr = eth_hdr(skb);
507 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
508 err = -EMSGSIZE;
509 goto out_unlock;
510 }
511 }
1a35ca80 512
1da177e4
LT
513 skb->protocol = proto;
514 skb->dev = dev;
515 skb->priority = sk->sk_priority;
2d37a186 516 skb->mark = sk->sk_mark;
2244d07b 517 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
518 if (err < 0)
519 goto out_unlock;
1da177e4
LT
520
521 dev_queue_xmit(skb);
654d1f8a 522 rcu_read_unlock();
40d4e3df 523 return len;
1da177e4 524
1da177e4 525out_unlock:
654d1f8a 526 rcu_read_unlock();
1a35ca80
ED
527out_free:
528 kfree_skb(skb);
1da177e4
LT
529 return err;
530}
1da177e4 531
62ab0812
ED
532static inline unsigned int run_filter(const struct sk_buff *skb,
533 const struct sock *sk,
dbcb5855 534 unsigned int res)
1da177e4
LT
535{
536 struct sk_filter *filter;
fda9ef5d 537
80f8f102
ED
538 rcu_read_lock();
539 filter = rcu_dereference(sk->sk_filter);
dbcb5855 540 if (filter != NULL)
93aaae2e 541 res = sk_run_filter(skb, filter->insns);
80f8f102 542 rcu_read_unlock();
1da177e4 543
dbcb5855 544 return res;
1da177e4
LT
545}
546
547/*
62ab0812
ED
548 * This function makes lazy skb cloning in hope that most of packets
549 * are discarded by BPF.
550 *
551 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
552 * and skb->cb are mangled. It works because (and until) packets
553 * falling here are owned by current CPU. Output packets are cloned
554 * by dev_queue_xmit_nit(), input packets are processed by net_bh
555 * sequencially, so that if we return skb to original state on exit,
556 * we will not harm anyone.
1da177e4
LT
557 */
558
40d4e3df
ED
559static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
560 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
561{
562 struct sock *sk;
563 struct sockaddr_ll *sll;
564 struct packet_sock *po;
40d4e3df 565 u8 *skb_head = skb->data;
1da177e4 566 int skb_len = skb->len;
dbcb5855 567 unsigned int snaplen, res;
1da177e4
LT
568
569 if (skb->pkt_type == PACKET_LOOPBACK)
570 goto drop;
571
572 sk = pt->af_packet_priv;
573 po = pkt_sk(sk);
574
09ad9bc7 575 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
576 goto drop;
577
1da177e4
LT
578 skb->dev = dev;
579
3b04ddde 580 if (dev->header_ops) {
1da177e4 581 /* The device has an explicit notion of ll header,
62ab0812
ED
582 * exported to higher levels.
583 *
584 * Otherwise, the device hides details of its frame
585 * structure, so that corresponding packet head is
586 * never delivered to user.
1da177e4
LT
587 */
588 if (sk->sk_type != SOCK_DGRAM)
98e399f8 589 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
590 else if (skb->pkt_type == PACKET_OUTGOING) {
591 /* Special case: outgoing packets have ll header at head */
bbe735e4 592 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
593 }
594 }
595
596 snaplen = skb->len;
597
dbcb5855
DM
598 res = run_filter(skb, sk, snaplen);
599 if (!res)
fda9ef5d 600 goto drop_n_restore;
dbcb5855
DM
601 if (snaplen > res)
602 snaplen = res;
1da177e4
LT
603
604 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
605 (unsigned)sk->sk_rcvbuf)
606 goto drop_n_acct;
607
608 if (skb_shared(skb)) {
609 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
610 if (nskb == NULL)
611 goto drop_n_acct;
612
613 if (skb_head != skb->data) {
614 skb->data = skb_head;
615 skb->len = skb_len;
616 }
617 kfree_skb(skb);
618 skb = nskb;
619 }
620
ffbc6111
HX
621 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
622 sizeof(skb->cb));
623
624 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
625 sll->sll_family = AF_PACKET;
626 sll->sll_hatype = dev->type;
627 sll->sll_protocol = skb->protocol;
628 sll->sll_pkttype = skb->pkt_type;
8032b464 629 if (unlikely(po->origdev))
80feaacb
PWJ
630 sll->sll_ifindex = orig_dev->ifindex;
631 else
632 sll->sll_ifindex = dev->ifindex;
1da177e4 633
b95cce35 634 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 635
ffbc6111 636 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 637
1da177e4
LT
638 if (pskb_trim(skb, snaplen))
639 goto drop_n_acct;
640
641 skb_set_owner_r(skb, sk);
642 skb->dev = NULL;
adf30907 643 skb_dst_drop(skb);
1da177e4 644
84531c24
PO
645 /* drop conntrack reference */
646 nf_reset(skb);
647
1da177e4
LT
648 spin_lock(&sk->sk_receive_queue.lock);
649 po->stats.tp_packets++;
3b885787 650 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
651 __skb_queue_tail(&sk->sk_receive_queue, skb);
652 spin_unlock(&sk->sk_receive_queue.lock);
653 sk->sk_data_ready(sk, skb->len);
654 return 0;
655
656drop_n_acct:
3b885787 657 po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
1da177e4
LT
658
659drop_n_restore:
660 if (skb_head != skb->data && skb_shared(skb)) {
661 skb->data = skb_head;
662 skb->len = skb_len;
663 }
664drop:
ead2ceb0 665 consume_skb(skb);
1da177e4
LT
666 return 0;
667}
668
40d4e3df
ED
669static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
670 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
671{
672 struct sock *sk;
673 struct packet_sock *po;
674 struct sockaddr_ll *sll;
bbd6ef87
PM
675 union {
676 struct tpacket_hdr *h1;
677 struct tpacket2_hdr *h2;
678 void *raw;
679 } h;
40d4e3df 680 u8 *skb_head = skb->data;
1da177e4 681 int skb_len = skb->len;
dbcb5855 682 unsigned int snaplen, res;
1da177e4 683 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
bbd6ef87 684 unsigned short macoff, netoff, hdrlen;
1da177e4 685 struct sk_buff *copy_skb = NULL;
b7aa0bf7 686 struct timeval tv;
bbd6ef87 687 struct timespec ts;
614f60fa 688 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1da177e4
LT
689
690 if (skb->pkt_type == PACKET_LOOPBACK)
691 goto drop;
692
693 sk = pt->af_packet_priv;
694 po = pkt_sk(sk);
695
09ad9bc7 696 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
697 goto drop;
698
3b04ddde 699 if (dev->header_ops) {
1da177e4 700 if (sk->sk_type != SOCK_DGRAM)
98e399f8 701 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
702 else if (skb->pkt_type == PACKET_OUTGOING) {
703 /* Special case: outgoing packets have ll header at head */
bbe735e4 704 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
705 }
706 }
707
8dc41944
HX
708 if (skb->ip_summed == CHECKSUM_PARTIAL)
709 status |= TP_STATUS_CSUMNOTREADY;
710
1da177e4
LT
711 snaplen = skb->len;
712
dbcb5855
DM
713 res = run_filter(skb, sk, snaplen);
714 if (!res)
fda9ef5d 715 goto drop_n_restore;
dbcb5855
DM
716 if (snaplen > res)
717 snaplen = res;
1da177e4
LT
718
719 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
720 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
721 po->tp_reserve;
1da177e4 722 } else {
bbe735e4 723 unsigned maclen = skb_network_offset(skb);
bbd6ef87 724 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
725 (maclen < 16 ? 16 : maclen)) +
726 po->tp_reserve;
1da177e4
LT
727 macoff = netoff - maclen;
728 }
729
69e3c75f 730 if (macoff + snaplen > po->rx_ring.frame_size) {
1da177e4
LT
731 if (po->copy_thresh &&
732 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
733 (unsigned)sk->sk_rcvbuf) {
734 if (skb_shared(skb)) {
735 copy_skb = skb_clone(skb, GFP_ATOMIC);
736 } else {
737 copy_skb = skb_get(skb);
738 skb_head = skb->data;
739 }
740 if (copy_skb)
741 skb_set_owner_r(copy_skb, sk);
742 }
69e3c75f 743 snaplen = po->rx_ring.frame_size - macoff;
1da177e4
LT
744 if ((int)snaplen < 0)
745 snaplen = 0;
746 }
1da177e4
LT
747
748 spin_lock(&sk->sk_receive_queue.lock);
69e3c75f 749 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
bbd6ef87 750 if (!h.raw)
1da177e4 751 goto ring_is_full;
69e3c75f 752 packet_increment_head(&po->rx_ring);
1da177e4
LT
753 po->stats.tp_packets++;
754 if (copy_skb) {
755 status |= TP_STATUS_COPY;
756 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
757 }
758 if (!po->stats.tp_drops)
759 status &= ~TP_STATUS_LOSING;
760 spin_unlock(&sk->sk_receive_queue.lock);
761
bbd6ef87 762 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 763
bbd6ef87
PM
764 switch (po->tp_version) {
765 case TPACKET_V1:
766 h.h1->tp_len = skb->len;
767 h.h1->tp_snaplen = snaplen;
768 h.h1->tp_mac = macoff;
769 h.h1->tp_net = netoff;
614f60fa
SM
770 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
771 && shhwtstamps->syststamp.tv64)
772 tv = ktime_to_timeval(shhwtstamps->syststamp);
773 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
774 && shhwtstamps->hwtstamp.tv64)
775 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
776 else if (skb->tstamp.tv64)
bbd6ef87
PM
777 tv = ktime_to_timeval(skb->tstamp);
778 else
779 do_gettimeofday(&tv);
780 h.h1->tp_sec = tv.tv_sec;
781 h.h1->tp_usec = tv.tv_usec;
782 hdrlen = sizeof(*h.h1);
783 break;
784 case TPACKET_V2:
785 h.h2->tp_len = skb->len;
786 h.h2->tp_snaplen = snaplen;
787 h.h2->tp_mac = macoff;
788 h.h2->tp_net = netoff;
614f60fa
SM
789 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
790 && shhwtstamps->syststamp.tv64)
791 ts = ktime_to_timespec(shhwtstamps->syststamp);
792 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
793 && shhwtstamps->hwtstamp.tv64)
794 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
795 else if (skb->tstamp.tv64)
bbd6ef87
PM
796 ts = ktime_to_timespec(skb->tstamp);
797 else
798 getnstimeofday(&ts);
799 h.h2->tp_sec = ts.tv_sec;
800 h.h2->tp_nsec = ts.tv_nsec;
05423b24 801 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
bbd6ef87
PM
802 hdrlen = sizeof(*h.h2);
803 break;
804 default:
805 BUG();
806 }
1da177e4 807
bbd6ef87 808 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 809 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
810 sll->sll_family = AF_PACKET;
811 sll->sll_hatype = dev->type;
812 sll->sll_protocol = skb->protocol;
813 sll->sll_pkttype = skb->pkt_type;
8032b464 814 if (unlikely(po->origdev))
80feaacb
PWJ
815 sll->sll_ifindex = orig_dev->ifindex;
816 else
817 sll->sll_ifindex = dev->ifindex;
1da177e4 818
bbd6ef87 819 __packet_set_status(po, h.raw, status);
e16aa207 820 smp_mb();
f6dafa95 821#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 822 {
0af55bb5
CG
823 u8 *start, *end;
824
825 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
826 for (start = h.raw; start < end; start += PAGE_SIZE)
827 flush_dcache_page(pgv_to_page(start));
1da177e4 828 }
f6dafa95 829#endif
1da177e4
LT
830
831 sk->sk_data_ready(sk, 0);
832
833drop_n_restore:
834 if (skb_head != skb->data && skb_shared(skb)) {
835 skb->data = skb_head;
836 skb->len = skb_len;
837 }
838drop:
1ce4f28b 839 kfree_skb(skb);
1da177e4
LT
840 return 0;
841
842ring_is_full:
843 po->stats.tp_drops++;
844 spin_unlock(&sk->sk_receive_queue.lock);
845
846 sk->sk_data_ready(sk, 0);
acb5d75b 847 kfree_skb(copy_skb);
1da177e4
LT
848 goto drop_n_restore;
849}
850
69e3c75f
JB
851static void tpacket_destruct_skb(struct sk_buff *skb)
852{
853 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 854 void *ph;
1da177e4 855
69e3c75f 856 BUG_ON(skb == NULL);
1da177e4 857
69e3c75f
JB
858 if (likely(po->tx_ring.pg_vec)) {
859 ph = skb_shinfo(skb)->destructor_arg;
860 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
861 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
862 atomic_dec(&po->tx_ring.pending);
863 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
864 }
865
866 sock_wfree(skb);
867}
868
40d4e3df
ED
869static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
870 void *frame, struct net_device *dev, int size_max,
871 __be16 proto, unsigned char *addr)
69e3c75f
JB
872{
873 union {
874 struct tpacket_hdr *h1;
875 struct tpacket2_hdr *h2;
876 void *raw;
877 } ph;
878 int to_write, offset, len, tp_len, nr_frags, len_max;
879 struct socket *sock = po->sk.sk_socket;
880 struct page *page;
881 void *data;
882 int err;
883
884 ph.raw = frame;
885
886 skb->protocol = proto;
887 skb->dev = dev;
888 skb->priority = po->sk.sk_priority;
2d37a186 889 skb->mark = po->sk.sk_mark;
69e3c75f
JB
890 skb_shinfo(skb)->destructor_arg = ph.raw;
891
892 switch (po->tp_version) {
893 case TPACKET_V2:
894 tp_len = ph.h2->tp_len;
895 break;
896 default:
897 tp_len = ph.h1->tp_len;
898 break;
899 }
900 if (unlikely(tp_len > size_max)) {
40d4e3df 901 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
902 return -EMSGSIZE;
903 }
904
905 skb_reserve(skb, LL_RESERVED_SPACE(dev));
906 skb_reset_network_header(skb);
907
908 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
909 to_write = tp_len;
910
911 if (sock->type == SOCK_DGRAM) {
912 err = dev_hard_header(skb, dev, ntohs(proto), addr,
913 NULL, tp_len);
914 if (unlikely(err < 0))
915 return -EINVAL;
40d4e3df 916 } else if (dev->hard_header_len) {
69e3c75f
JB
917 /* net device doesn't like empty head */
918 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
919 pr_err("packet size is too short (%d < %d)\n",
920 tp_len, dev->hard_header_len);
69e3c75f
JB
921 return -EINVAL;
922 }
923
924 skb_push(skb, dev->hard_header_len);
925 err = skb_store_bits(skb, 0, data,
926 dev->hard_header_len);
927 if (unlikely(err))
928 return err;
929
930 data += dev->hard_header_len;
931 to_write -= dev->hard_header_len;
932 }
933
934 err = -EFAULT;
69e3c75f
JB
935 offset = offset_in_page(data);
936 len_max = PAGE_SIZE - offset;
937 len = ((to_write > len_max) ? len_max : to_write);
938
939 skb->data_len = to_write;
940 skb->len += to_write;
941 skb->truesize += to_write;
942 atomic_add(to_write, &po->sk.sk_wmem_alloc);
943
944 while (likely(to_write)) {
945 nr_frags = skb_shinfo(skb)->nr_frags;
946
947 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
948 pr_err("Packet exceed the number of skb frags(%lu)\n",
949 MAX_SKB_FRAGS);
69e3c75f
JB
950 return -EFAULT;
951 }
952
0af55bb5
CG
953 page = pgv_to_page(data);
954 data += len;
69e3c75f
JB
955 flush_dcache_page(page);
956 get_page(page);
0af55bb5 957 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
958 to_write -= len;
959 offset = 0;
960 len_max = PAGE_SIZE;
961 len = ((to_write > len_max) ? len_max : to_write);
962 }
963
964 return tp_len;
965}
966
967static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
968{
69e3c75f
JB
969 struct sk_buff *skb;
970 struct net_device *dev;
971 __be16 proto;
972 int ifindex, err, reserve = 0;
40d4e3df
ED
973 void *ph;
974 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
975 int tp_len, size_max;
976 unsigned char *addr;
977 int len_sum = 0;
978 int status = 0;
979
69e3c75f
JB
980 mutex_lock(&po->pg_vec_lock);
981
982 err = -EBUSY;
983 if (saddr == NULL) {
984 ifindex = po->ifindex;
985 proto = po->num;
986 addr = NULL;
987 } else {
988 err = -EINVAL;
989 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
990 goto out;
991 if (msg->msg_namelen < (saddr->sll_halen
992 + offsetof(struct sockaddr_ll,
993 sll_addr)))
994 goto out;
995 ifindex = saddr->sll_ifindex;
996 proto = saddr->sll_protocol;
997 addr = saddr->sll_addr;
998 }
999
1000 dev = dev_get_by_index(sock_net(&po->sk), ifindex);
1001 err = -ENXIO;
1002 if (unlikely(dev == NULL))
1003 goto out;
1004
1005 reserve = dev->hard_header_len;
1006
1007 err = -ENETDOWN;
1008 if (unlikely(!(dev->flags & IFF_UP)))
1009 goto out_put;
1010
1011 size_max = po->tx_ring.frame_size
b5dd884e 1012 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
1013
1014 if (size_max > dev->mtu + reserve)
1015 size_max = dev->mtu + reserve;
1016
1017 do {
1018 ph = packet_current_frame(po, &po->tx_ring,
1019 TP_STATUS_SEND_REQUEST);
1020
1021 if (unlikely(ph == NULL)) {
1022 schedule();
1023 continue;
1024 }
1025
1026 status = TP_STATUS_SEND_REQUEST;
1027 skb = sock_alloc_send_skb(&po->sk,
1028 LL_ALLOCATED_SPACE(dev)
1029 + sizeof(struct sockaddr_ll),
1030 0, &err);
1031
1032 if (unlikely(skb == NULL))
1033 goto out_status;
1034
1035 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1036 addr);
1037
1038 if (unlikely(tp_len < 0)) {
1039 if (po->tp_loss) {
1040 __packet_set_status(po, ph,
1041 TP_STATUS_AVAILABLE);
1042 packet_increment_head(&po->tx_ring);
1043 kfree_skb(skb);
1044 continue;
1045 } else {
1046 status = TP_STATUS_WRONG_FORMAT;
1047 err = tp_len;
1048 goto out_status;
1049 }
1050 }
1051
1052 skb->destructor = tpacket_destruct_skb;
1053 __packet_set_status(po, ph, TP_STATUS_SENDING);
1054 atomic_inc(&po->tx_ring.pending);
1055
1056 status = TP_STATUS_SEND_REQUEST;
1057 err = dev_queue_xmit(skb);
eb70df13
JP
1058 if (unlikely(err > 0)) {
1059 err = net_xmit_errno(err);
1060 if (err && __packet_get_status(po, ph) ==
1061 TP_STATUS_AVAILABLE) {
1062 /* skb was destructed already */
1063 skb = NULL;
1064 goto out_status;
1065 }
1066 /*
1067 * skb was dropped but not destructed yet;
1068 * let's treat it like congestion or err < 0
1069 */
1070 err = 0;
1071 }
69e3c75f
JB
1072 packet_increment_head(&po->tx_ring);
1073 len_sum += tp_len;
f64f9e71
JP
1074 } while (likely((ph != NULL) ||
1075 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1076 (atomic_read(&po->tx_ring.pending))))
1077 );
69e3c75f
JB
1078
1079 err = len_sum;
1080 goto out_put;
1081
69e3c75f
JB
1082out_status:
1083 __packet_set_status(po, ph, status);
1084 kfree_skb(skb);
1085out_put:
1086 dev_put(dev);
1087out:
1088 mutex_unlock(&po->pg_vec_lock);
1089 return err;
1090}
69e3c75f 1091
bfd5f4a3
SS
1092static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1093 size_t reserve, size_t len,
1094 size_t linear, int noblock,
1095 int *err)
1096{
1097 struct sk_buff *skb;
1098
1099 /* Under a page? Don't bother with paged skb. */
1100 if (prepad + len < PAGE_SIZE || !linear)
1101 linear = len;
1102
1103 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1104 err);
1105 if (!skb)
1106 return NULL;
1107
1108 skb_reserve(skb, reserve);
1109 skb_put(skb, linear);
1110 skb->data_len = len - linear;
1111 skb->len += len - linear;
1112
1113 return skb;
1114}
1115
69e3c75f 1116static int packet_snd(struct socket *sock,
1da177e4
LT
1117 struct msghdr *msg, size_t len)
1118{
1119 struct sock *sk = sock->sk;
40d4e3df 1120 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
1121 struct sk_buff *skb;
1122 struct net_device *dev;
0e11c91e 1123 __be16 proto;
1da177e4
LT
1124 unsigned char *addr;
1125 int ifindex, err, reserve = 0;
bfd5f4a3
SS
1126 struct virtio_net_hdr vnet_hdr = { 0 };
1127 int offset = 0;
1128 int vnet_hdr_len;
1129 struct packet_sock *po = pkt_sk(sk);
1130 unsigned short gso_type = 0;
1da177e4
LT
1131
1132 /*
1ce4f28b 1133 * Get and verify the address.
1da177e4 1134 */
1ce4f28b 1135
1da177e4 1136 if (saddr == NULL) {
1da177e4
LT
1137 ifindex = po->ifindex;
1138 proto = po->num;
1139 addr = NULL;
1140 } else {
1141 err = -EINVAL;
1142 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1143 goto out;
0fb375fb
EB
1144 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1145 goto out;
1da177e4
LT
1146 ifindex = saddr->sll_ifindex;
1147 proto = saddr->sll_protocol;
1148 addr = saddr->sll_addr;
1149 }
1150
1151
3b1e0a65 1152 dev = dev_get_by_index(sock_net(sk), ifindex);
1da177e4
LT
1153 err = -ENXIO;
1154 if (dev == NULL)
1155 goto out_unlock;
1156 if (sock->type == SOCK_RAW)
1157 reserve = dev->hard_header_len;
1158
d5e76b0a
DM
1159 err = -ENETDOWN;
1160 if (!(dev->flags & IFF_UP))
1161 goto out_unlock;
1162
bfd5f4a3
SS
1163 if (po->has_vnet_hdr) {
1164 vnet_hdr_len = sizeof(vnet_hdr);
1165
1166 err = -EINVAL;
1167 if (len < vnet_hdr_len)
1168 goto out_unlock;
1169
1170 len -= vnet_hdr_len;
1171
1172 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1173 vnet_hdr_len);
1174 if (err < 0)
1175 goto out_unlock;
1176
1177 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1178 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1179 vnet_hdr.hdr_len))
1180 vnet_hdr.hdr_len = vnet_hdr.csum_start +
1181 vnet_hdr.csum_offset + 2;
1182
1183 err = -EINVAL;
1184 if (vnet_hdr.hdr_len > len)
1185 goto out_unlock;
1186
1187 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1188 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1189 case VIRTIO_NET_HDR_GSO_TCPV4:
1190 gso_type = SKB_GSO_TCPV4;
1191 break;
1192 case VIRTIO_NET_HDR_GSO_TCPV6:
1193 gso_type = SKB_GSO_TCPV6;
1194 break;
1195 case VIRTIO_NET_HDR_GSO_UDP:
1196 gso_type = SKB_GSO_UDP;
1197 break;
1198 default:
1199 goto out_unlock;
1200 }
1201
1202 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1203 gso_type |= SKB_GSO_TCP_ECN;
1204
1205 if (vnet_hdr.gso_size == 0)
1206 goto out_unlock;
1207
1208 }
1209 }
1210
1da177e4 1211 err = -EMSGSIZE;
57f89bfa 1212 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
1da177e4
LT
1213 goto out_unlock;
1214
bfd5f4a3
SS
1215 err = -ENOBUFS;
1216 skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1217 LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1218 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 1219 if (skb == NULL)
1da177e4
LT
1220 goto out_unlock;
1221
bfd5f4a3 1222 skb_set_network_header(skb, reserve);
1da177e4 1223
0c4e8581
SH
1224 err = -EINVAL;
1225 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 1226 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 1227 goto out_free;
1da177e4
LT
1228
1229 /* Returns -EFAULT on error */
bfd5f4a3 1230 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
1231 if (err)
1232 goto out_free;
2244d07b 1233 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
1234 if (err < 0)
1235 goto out_free;
1da177e4 1236
57f89bfa
BG
1237 if (!gso_type && (len > dev->mtu + reserve)) {
1238 /* Earlier code assumed this would be a VLAN pkt,
1239 * double-check this now that we have the actual
1240 * packet in hand.
1241 */
1242 struct ethhdr *ehdr;
1243 skb_reset_mac_header(skb);
1244 ehdr = eth_hdr(skb);
1245 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1246 err = -EMSGSIZE;
1247 goto out_free;
1248 }
1249 }
1250
1da177e4
LT
1251 skb->protocol = proto;
1252 skb->dev = dev;
1253 skb->priority = sk->sk_priority;
2d37a186 1254 skb->mark = sk->sk_mark;
1da177e4 1255
bfd5f4a3
SS
1256 if (po->has_vnet_hdr) {
1257 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1258 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1259 vnet_hdr.csum_offset)) {
1260 err = -EINVAL;
1261 goto out_free;
1262 }
1263 }
1264
1265 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1266 skb_shinfo(skb)->gso_type = gso_type;
1267
1268 /* Header must be checked, and gso_segs computed. */
1269 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1270 skb_shinfo(skb)->gso_segs = 0;
1271
1272 len += vnet_hdr_len;
1273 }
1274
1da177e4
LT
1275 /*
1276 * Now send it
1277 */
1278
1279 err = dev_queue_xmit(skb);
1280 if (err > 0 && (err = net_xmit_errno(err)) != 0)
1281 goto out_unlock;
1282
1283 dev_put(dev);
1284
40d4e3df 1285 return len;
1da177e4
LT
1286
1287out_free:
1288 kfree_skb(skb);
1289out_unlock:
1290 if (dev)
1291 dev_put(dev);
1292out:
1293 return err;
1294}
1295
69e3c75f
JB
1296static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1297 struct msghdr *msg, size_t len)
1298{
69e3c75f
JB
1299 struct sock *sk = sock->sk;
1300 struct packet_sock *po = pkt_sk(sk);
1301 if (po->tx_ring.pg_vec)
1302 return tpacket_snd(po, msg);
1303 else
69e3c75f
JB
1304 return packet_snd(sock, msg, len);
1305}
1306
1da177e4
LT
1307/*
1308 * Close a PACKET socket. This is fairly simple. We immediately go
1309 * to 'closed' state and remove our protocol entry in the device list.
1310 */
1311
1312static int packet_release(struct socket *sock)
1313{
1314 struct sock *sk = sock->sk;
1315 struct packet_sock *po;
d12d01d6 1316 struct net *net;
69e3c75f 1317 struct tpacket_req req;
1da177e4
LT
1318
1319 if (!sk)
1320 return 0;
1321
3b1e0a65 1322 net = sock_net(sk);
1da177e4
LT
1323 po = pkt_sk(sk);
1324
808f5114 1325 spin_lock_bh(&net->packet.sklist_lock);
1326 sk_del_node_init_rcu(sk);
920de804 1327 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 1328 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 1329
808f5114 1330 spin_lock(&po->bind_lock);
1da177e4
LT
1331 if (po->running) {
1332 /*
808f5114 1333 * Remove from protocol table
1da177e4 1334 */
1da177e4
LT
1335 po->running = 0;
1336 po->num = 0;
808f5114 1337 __dev_remove_pack(&po->prot_hook);
1da177e4
LT
1338 __sock_put(sk);
1339 }
808f5114 1340 spin_unlock(&po->bind_lock);
1da177e4 1341
1da177e4 1342 packet_flush_mclist(sk);
1da177e4 1343
69e3c75f
JB
1344 memset(&req, 0, sizeof(req));
1345
1346 if (po->rx_ring.pg_vec)
1347 packet_set_ring(sk, &req, 1, 0);
1348
1349 if (po->tx_ring.pg_vec)
1350 packet_set_ring(sk, &req, 1, 1);
1da177e4 1351
808f5114 1352 synchronize_net();
1da177e4
LT
1353 /*
1354 * Now the socket is dead. No more input will appear.
1355 */
1da177e4
LT
1356 sock_orphan(sk);
1357 sock->sk = NULL;
1358
1359 /* Purge queues */
1360
1361 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 1362 sk_refcnt_debug_release(sk);
1da177e4
LT
1363
1364 sock_put(sk);
1365 return 0;
1366}
1367
1368/*
1369 * Attach a packet hook.
1370 */
1371
0e11c91e 1372static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
1373{
1374 struct packet_sock *po = pkt_sk(sk);
1375 /*
1376 * Detach an existing hook if present.
1377 */
1378
1379 lock_sock(sk);
1380
1381 spin_lock(&po->bind_lock);
1382 if (po->running) {
1383 __sock_put(sk);
1384 po->running = 0;
1385 po->num = 0;
1386 spin_unlock(&po->bind_lock);
1387 dev_remove_pack(&po->prot_hook);
1388 spin_lock(&po->bind_lock);
1389 }
1390
1391 po->num = protocol;
1392 po->prot_hook.type = protocol;
1393 po->prot_hook.dev = dev;
1394
1395 po->ifindex = dev ? dev->ifindex : 0;
1396
1397 if (protocol == 0)
1398 goto out_unlock;
1399
be85d4ad 1400 if (!dev || (dev->flags & IFF_UP)) {
1da177e4
LT
1401 dev_add_pack(&po->prot_hook);
1402 sock_hold(sk);
1403 po->running = 1;
be85d4ad
UT
1404 } else {
1405 sk->sk_err = ENETDOWN;
1406 if (!sock_flag(sk, SOCK_DEAD))
1407 sk->sk_error_report(sk);
1da177e4
LT
1408 }
1409
1410out_unlock:
1411 spin_unlock(&po->bind_lock);
1412 release_sock(sk);
1413 return 0;
1414}
1415
1416/*
1417 * Bind a packet socket to a device
1418 */
1419
40d4e3df
ED
1420static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1421 int addr_len)
1da177e4 1422{
40d4e3df 1423 struct sock *sk = sock->sk;
1da177e4
LT
1424 char name[15];
1425 struct net_device *dev;
1426 int err = -ENODEV;
1ce4f28b 1427
1da177e4
LT
1428 /*
1429 * Check legality
1430 */
1ce4f28b 1431
8ae55f04 1432 if (addr_len != sizeof(struct sockaddr))
1da177e4 1433 return -EINVAL;
40d4e3df 1434 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 1435
3b1e0a65 1436 dev = dev_get_by_name(sock_net(sk), name);
1da177e4
LT
1437 if (dev) {
1438 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1439 dev_put(dev);
1440 }
1441 return err;
1442}
1da177e4
LT
1443
1444static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1445{
40d4e3df
ED
1446 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1447 struct sock *sk = sock->sk;
1da177e4
LT
1448 struct net_device *dev = NULL;
1449 int err;
1450
1451
1452 /*
1453 * Check legality
1454 */
1ce4f28b 1455
1da177e4
LT
1456 if (addr_len < sizeof(struct sockaddr_ll))
1457 return -EINVAL;
1458 if (sll->sll_family != AF_PACKET)
1459 return -EINVAL;
1460
1461 if (sll->sll_ifindex) {
1462 err = -ENODEV;
3b1e0a65 1463 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
1464 if (dev == NULL)
1465 goto out;
1466 }
1467 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1468 if (dev)
1469 dev_put(dev);
1470
1471out:
1472 return err;
1473}
1474
1475static struct proto packet_proto = {
1476 .name = "PACKET",
1477 .owner = THIS_MODULE,
1478 .obj_size = sizeof(struct packet_sock),
1479};
1480
1481/*
1ce4f28b 1482 * Create a packet of type SOCK_PACKET.
1da177e4
LT
1483 */
1484
3f378b68
EP
1485static int packet_create(struct net *net, struct socket *sock, int protocol,
1486 int kern)
1da177e4
LT
1487{
1488 struct sock *sk;
1489 struct packet_sock *po;
0e11c91e 1490 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
1491 int err;
1492
1493 if (!capable(CAP_NET_RAW))
1494 return -EPERM;
be02097c
DM
1495 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1496 sock->type != SOCK_PACKET)
1da177e4
LT
1497 return -ESOCKTNOSUPPORT;
1498
1499 sock->state = SS_UNCONNECTED;
1500
1501 err = -ENOBUFS;
6257ff21 1502 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
1503 if (sk == NULL)
1504 goto out;
1505
1506 sock->ops = &packet_ops;
1da177e4
LT
1507 if (sock->type == SOCK_PACKET)
1508 sock->ops = &packet_ops_spkt;
be02097c 1509
1da177e4
LT
1510 sock_init_data(sock, sk);
1511
1512 po = pkt_sk(sk);
1513 sk->sk_family = PF_PACKET;
0e11c91e 1514 po->num = proto;
1da177e4
LT
1515
1516 sk->sk_destruct = packet_sock_destruct;
17ab56a2 1517 sk_refcnt_debug_inc(sk);
1da177e4
LT
1518
1519 /*
1520 * Attach a protocol block
1521 */
1522
1523 spin_lock_init(&po->bind_lock);
905db440 1524 mutex_init(&po->pg_vec_lock);
1da177e4 1525 po->prot_hook.func = packet_rcv;
be02097c 1526
1da177e4
LT
1527 if (sock->type == SOCK_PACKET)
1528 po->prot_hook.func = packet_rcv_spkt;
be02097c 1529
1da177e4
LT
1530 po->prot_hook.af_packet_priv = sk;
1531
0e11c91e
AV
1532 if (proto) {
1533 po->prot_hook.type = proto;
1da177e4
LT
1534 dev_add_pack(&po->prot_hook);
1535 sock_hold(sk);
1536 po->running = 1;
1537 }
1538
808f5114 1539 spin_lock_bh(&net->packet.sklist_lock);
1540 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 1541 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 1542 spin_unlock_bh(&net->packet.sklist_lock);
1543
40d4e3df 1544 return 0;
1da177e4
LT
1545out:
1546 return err;
1547}
1548
ed85b565
RC
1549static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1550{
1551 struct sock_exterr_skb *serr;
1552 struct sk_buff *skb, *skb2;
1553 int copied, err;
1554
1555 err = -EAGAIN;
1556 skb = skb_dequeue(&sk->sk_error_queue);
1557 if (skb == NULL)
1558 goto out;
1559
1560 copied = skb->len;
1561 if (copied > len) {
1562 msg->msg_flags |= MSG_TRUNC;
1563 copied = len;
1564 }
1565 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1566 if (err)
1567 goto out_free_skb;
1568
1569 sock_recv_timestamp(msg, sk, skb);
1570
1571 serr = SKB_EXT_ERR(skb);
1572 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1573 sizeof(serr->ee), &serr->ee);
1574
1575 msg->msg_flags |= MSG_ERRQUEUE;
1576 err = copied;
1577
1578 /* Reset and regenerate socket error */
1579 spin_lock_bh(&sk->sk_error_queue.lock);
1580 sk->sk_err = 0;
1581 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1582 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1583 spin_unlock_bh(&sk->sk_error_queue.lock);
1584 sk->sk_error_report(sk);
1585 } else
1586 spin_unlock_bh(&sk->sk_error_queue.lock);
1587
1588out_free_skb:
1589 kfree_skb(skb);
1590out:
1591 return err;
1592}
1593
1da177e4
LT
1594/*
1595 * Pull a packet from our receive queue and hand it to the user.
1596 * If necessary we block.
1597 */
1598
1599static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1600 struct msghdr *msg, size_t len, int flags)
1601{
1602 struct sock *sk = sock->sk;
1603 struct sk_buff *skb;
1604 int copied, err;
0fb375fb 1605 struct sockaddr_ll *sll;
bfd5f4a3 1606 int vnet_hdr_len = 0;
1da177e4
LT
1607
1608 err = -EINVAL;
ed85b565 1609 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
1610 goto out;
1611
1612#if 0
1613 /* What error should we return now? EUNATTACH? */
1614 if (pkt_sk(sk)->ifindex < 0)
1615 return -ENODEV;
1616#endif
1617
ed85b565
RC
1618 if (flags & MSG_ERRQUEUE) {
1619 err = packet_recv_error(sk, msg, len);
1620 goto out;
1621 }
1622
1da177e4
LT
1623 /*
1624 * Call the generic datagram receiver. This handles all sorts
1625 * of horrible races and re-entrancy so we can forget about it
1626 * in the protocol layers.
1627 *
1628 * Now it will return ENETDOWN, if device have just gone down,
1629 * but then it will block.
1630 */
1631
40d4e3df 1632 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
1633
1634 /*
1ce4f28b 1635 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
1636 * handles the blocking we don't see and worry about blocking
1637 * retries.
1638 */
1639
8ae55f04 1640 if (skb == NULL)
1da177e4
LT
1641 goto out;
1642
bfd5f4a3
SS
1643 if (pkt_sk(sk)->has_vnet_hdr) {
1644 struct virtio_net_hdr vnet_hdr = { 0 };
1645
1646 err = -EINVAL;
1647 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 1648 if (len < vnet_hdr_len)
bfd5f4a3
SS
1649 goto out_free;
1650
1f18b717
MK
1651 len -= vnet_hdr_len;
1652
bfd5f4a3
SS
1653 if (skb_is_gso(skb)) {
1654 struct skb_shared_info *sinfo = skb_shinfo(skb);
1655
1656 /* This is a hint as to how much should be linear. */
1657 vnet_hdr.hdr_len = skb_headlen(skb);
1658 vnet_hdr.gso_size = sinfo->gso_size;
1659 if (sinfo->gso_type & SKB_GSO_TCPV4)
1660 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1661 else if (sinfo->gso_type & SKB_GSO_TCPV6)
1662 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1663 else if (sinfo->gso_type & SKB_GSO_UDP)
1664 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1665 else if (sinfo->gso_type & SKB_GSO_FCOE)
1666 goto out_free;
1667 else
1668 BUG();
1669 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1670 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1671 } else
1672 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1673
1674 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1675 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 1676 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3
SS
1677 vnet_hdr.csum_offset = skb->csum_offset;
1678 } /* else everything is zero */
1679
1680 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1681 vnet_hdr_len);
1682 if (err < 0)
1683 goto out_free;
1684 }
1685
0fb375fb
EB
1686 /*
1687 * If the address length field is there to be filled in, we fill
1688 * it in now.
1689 */
1690
ffbc6111 1691 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
1692 if (sock->type == SOCK_PACKET)
1693 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1694 else
1695 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1696
1da177e4
LT
1697 /*
1698 * You lose any data beyond the buffer you gave. If it worries a
1699 * user program they can ask the device for its MTU anyway.
1700 */
1701
1702 copied = skb->len;
40d4e3df
ED
1703 if (copied > len) {
1704 copied = len;
1705 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
1706 }
1707
1708 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1709 if (err)
1710 goto out_free;
1711
3b885787 1712 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
1713
1714 if (msg->msg_name)
ffbc6111
HX
1715 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1716 msg->msg_namelen);
1da177e4 1717
8dc41944 1718 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
1719 struct tpacket_auxdata aux;
1720
1721 aux.tp_status = TP_STATUS_USER;
1722 if (skb->ip_summed == CHECKSUM_PARTIAL)
1723 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1724 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1725 aux.tp_snaplen = skb->len;
1726 aux.tp_mac = 0;
bbe735e4 1727 aux.tp_net = skb_network_offset(skb);
05423b24 1728 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
ffbc6111
HX
1729
1730 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
1731 }
1732
1da177e4
LT
1733 /*
1734 * Free or return the buffer as appropriate. Again this
1735 * hides all the races and re-entrancy issues from us.
1736 */
bfd5f4a3 1737 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
1738
1739out_free:
1740 skb_free_datagram(sk, skb);
1741out:
1742 return err;
1743}
1744
1da177e4
LT
1745static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1746 int *uaddr_len, int peer)
1747{
1748 struct net_device *dev;
1749 struct sock *sk = sock->sk;
1750
1751 if (peer)
1752 return -EOPNOTSUPP;
1753
1754 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
1755 rcu_read_lock();
1756 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1757 if (dev)
67286640 1758 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 1759 else
1da177e4 1760 memset(uaddr->sa_data, 0, 14);
654d1f8a 1761 rcu_read_unlock();
1da177e4
LT
1762 *uaddr_len = sizeof(*uaddr);
1763
1764 return 0;
1765}
1da177e4
LT
1766
1767static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1768 int *uaddr_len, int peer)
1769{
1770 struct net_device *dev;
1771 struct sock *sk = sock->sk;
1772 struct packet_sock *po = pkt_sk(sk);
13cfa97b 1773 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
1774
1775 if (peer)
1776 return -EOPNOTSUPP;
1777
1778 sll->sll_family = AF_PACKET;
1779 sll->sll_ifindex = po->ifindex;
1780 sll->sll_protocol = po->num;
67286640 1781 sll->sll_pkttype = 0;
654d1f8a
ED
1782 rcu_read_lock();
1783 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
1784 if (dev) {
1785 sll->sll_hatype = dev->type;
1786 sll->sll_halen = dev->addr_len;
1787 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
1788 } else {
1789 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
1790 sll->sll_halen = 0;
1791 }
654d1f8a 1792 rcu_read_unlock();
0fb375fb 1793 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
1794
1795 return 0;
1796}
1797
2aeb0b88
WC
1798static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1799 int what)
1da177e4
LT
1800{
1801 switch (i->type) {
1802 case PACKET_MR_MULTICAST:
1162563f
JP
1803 if (i->alen != dev->addr_len)
1804 return -EINVAL;
1da177e4 1805 if (what > 0)
22bedad3 1806 return dev_mc_add(dev, i->addr);
1da177e4 1807 else
22bedad3 1808 return dev_mc_del(dev, i->addr);
1da177e4
LT
1809 break;
1810 case PACKET_MR_PROMISC:
2aeb0b88 1811 return dev_set_promiscuity(dev, what);
1da177e4
LT
1812 break;
1813 case PACKET_MR_ALLMULTI:
2aeb0b88 1814 return dev_set_allmulti(dev, what);
1da177e4 1815 break;
d95ed927 1816 case PACKET_MR_UNICAST:
1162563f
JP
1817 if (i->alen != dev->addr_len)
1818 return -EINVAL;
d95ed927 1819 if (what > 0)
a748ee24 1820 return dev_uc_add(dev, i->addr);
d95ed927 1821 else
a748ee24 1822 return dev_uc_del(dev, i->addr);
d95ed927 1823 break;
40d4e3df
ED
1824 default:
1825 break;
1da177e4 1826 }
2aeb0b88 1827 return 0;
1da177e4
LT
1828}
1829
1830static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1831{
40d4e3df 1832 for ( ; i; i = i->next) {
1da177e4
LT
1833 if (i->ifindex == dev->ifindex)
1834 packet_dev_mc(dev, i, what);
1835 }
1836}
1837
0fb375fb 1838static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
1839{
1840 struct packet_sock *po = pkt_sk(sk);
1841 struct packet_mclist *ml, *i;
1842 struct net_device *dev;
1843 int err;
1844
1845 rtnl_lock();
1846
1847 err = -ENODEV;
3b1e0a65 1848 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
1849 if (!dev)
1850 goto done;
1851
1852 err = -EINVAL;
1162563f 1853 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
1854 goto done;
1855
1856 err = -ENOBUFS;
8b3a7005 1857 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
1858 if (i == NULL)
1859 goto done;
1860
1861 err = 0;
1862 for (ml = po->mclist; ml; ml = ml->next) {
1863 if (ml->ifindex == mreq->mr_ifindex &&
1864 ml->type == mreq->mr_type &&
1865 ml->alen == mreq->mr_alen &&
1866 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1867 ml->count++;
1868 /* Free the new element ... */
1869 kfree(i);
1870 goto done;
1871 }
1872 }
1873
1874 i->type = mreq->mr_type;
1875 i->ifindex = mreq->mr_ifindex;
1876 i->alen = mreq->mr_alen;
1877 memcpy(i->addr, mreq->mr_address, i->alen);
1878 i->count = 1;
1879 i->next = po->mclist;
1880 po->mclist = i;
2aeb0b88
WC
1881 err = packet_dev_mc(dev, i, 1);
1882 if (err) {
1883 po->mclist = i->next;
1884 kfree(i);
1885 }
1da177e4
LT
1886
1887done:
1888 rtnl_unlock();
1889 return err;
1890}
1891
0fb375fb 1892static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
1893{
1894 struct packet_mclist *ml, **mlp;
1895
1896 rtnl_lock();
1897
1898 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1899 if (ml->ifindex == mreq->mr_ifindex &&
1900 ml->type == mreq->mr_type &&
1901 ml->alen == mreq->mr_alen &&
1902 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1903 if (--ml->count == 0) {
1904 struct net_device *dev;
1905 *mlp = ml->next;
ad959e76
ED
1906 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1907 if (dev)
1da177e4 1908 packet_dev_mc(dev, ml, -1);
1da177e4
LT
1909 kfree(ml);
1910 }
1911 rtnl_unlock();
1912 return 0;
1913 }
1914 }
1915 rtnl_unlock();
1916 return -EADDRNOTAVAIL;
1917}
1918
1919static void packet_flush_mclist(struct sock *sk)
1920{
1921 struct packet_sock *po = pkt_sk(sk);
1922 struct packet_mclist *ml;
1923
1924 if (!po->mclist)
1925 return;
1926
1927 rtnl_lock();
1928 while ((ml = po->mclist) != NULL) {
1929 struct net_device *dev;
1930
1931 po->mclist = ml->next;
ad959e76
ED
1932 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1933 if (dev != NULL)
1da177e4 1934 packet_dev_mc(dev, ml, -1);
1da177e4
LT
1935 kfree(ml);
1936 }
1937 rtnl_unlock();
1938}
1da177e4
LT
1939
1940static int
b7058842 1941packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
1942{
1943 struct sock *sk = sock->sk;
8dc41944 1944 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
1945 int ret;
1946
1947 if (level != SOL_PACKET)
1948 return -ENOPROTOOPT;
1949
69e3c75f 1950 switch (optname) {
1ce4f28b 1951 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
1952 case PACKET_DROP_MEMBERSHIP:
1953 {
0fb375fb
EB
1954 struct packet_mreq_max mreq;
1955 int len = optlen;
1956 memset(&mreq, 0, sizeof(mreq));
1957 if (len < sizeof(struct packet_mreq))
1da177e4 1958 return -EINVAL;
0fb375fb
EB
1959 if (len > sizeof(mreq))
1960 len = sizeof(mreq);
40d4e3df 1961 if (copy_from_user(&mreq, optval, len))
1da177e4 1962 return -EFAULT;
0fb375fb
EB
1963 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1964 return -EINVAL;
1da177e4
LT
1965 if (optname == PACKET_ADD_MEMBERSHIP)
1966 ret = packet_mc_add(sk, &mreq);
1967 else
1968 ret = packet_mc_drop(sk, &mreq);
1969 return ret;
1970 }
a2efcfa0 1971
1da177e4 1972 case PACKET_RX_RING:
69e3c75f 1973 case PACKET_TX_RING:
1da177e4
LT
1974 {
1975 struct tpacket_req req;
1976
40d4e3df 1977 if (optlen < sizeof(req))
1da177e4 1978 return -EINVAL;
bfd5f4a3
SS
1979 if (pkt_sk(sk)->has_vnet_hdr)
1980 return -EINVAL;
40d4e3df 1981 if (copy_from_user(&req, optval, sizeof(req)))
1da177e4 1982 return -EFAULT;
69e3c75f 1983 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1da177e4
LT
1984 }
1985 case PACKET_COPY_THRESH:
1986 {
1987 int val;
1988
40d4e3df 1989 if (optlen != sizeof(val))
1da177e4 1990 return -EINVAL;
40d4e3df 1991 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
1992 return -EFAULT;
1993
1994 pkt_sk(sk)->copy_thresh = val;
1995 return 0;
1996 }
bbd6ef87
PM
1997 case PACKET_VERSION:
1998 {
1999 int val;
2000
2001 if (optlen != sizeof(val))
2002 return -EINVAL;
69e3c75f 2003 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
2004 return -EBUSY;
2005 if (copy_from_user(&val, optval, sizeof(val)))
2006 return -EFAULT;
2007 switch (val) {
2008 case TPACKET_V1:
2009 case TPACKET_V2:
2010 po->tp_version = val;
2011 return 0;
2012 default:
2013 return -EINVAL;
2014 }
2015 }
8913336a
PM
2016 case PACKET_RESERVE:
2017 {
2018 unsigned int val;
2019
2020 if (optlen != sizeof(val))
2021 return -EINVAL;
69e3c75f 2022 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
2023 return -EBUSY;
2024 if (copy_from_user(&val, optval, sizeof(val)))
2025 return -EFAULT;
2026 po->tp_reserve = val;
2027 return 0;
2028 }
69e3c75f
JB
2029 case PACKET_LOSS:
2030 {
2031 unsigned int val;
2032
2033 if (optlen != sizeof(val))
2034 return -EINVAL;
2035 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2036 return -EBUSY;
2037 if (copy_from_user(&val, optval, sizeof(val)))
2038 return -EFAULT;
2039 po->tp_loss = !!val;
2040 return 0;
2041 }
8dc41944
HX
2042 case PACKET_AUXDATA:
2043 {
2044 int val;
2045
2046 if (optlen < sizeof(val))
2047 return -EINVAL;
2048 if (copy_from_user(&val, optval, sizeof(val)))
2049 return -EFAULT;
2050
2051 po->auxdata = !!val;
2052 return 0;
2053 }
80feaacb
PWJ
2054 case PACKET_ORIGDEV:
2055 {
2056 int val;
2057
2058 if (optlen < sizeof(val))
2059 return -EINVAL;
2060 if (copy_from_user(&val, optval, sizeof(val)))
2061 return -EFAULT;
2062
2063 po->origdev = !!val;
2064 return 0;
2065 }
bfd5f4a3
SS
2066 case PACKET_VNET_HDR:
2067 {
2068 int val;
2069
2070 if (sock->type != SOCK_RAW)
2071 return -EINVAL;
2072 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2073 return -EBUSY;
2074 if (optlen < sizeof(val))
2075 return -EINVAL;
2076 if (copy_from_user(&val, optval, sizeof(val)))
2077 return -EFAULT;
2078
2079 po->has_vnet_hdr = !!val;
2080 return 0;
2081 }
614f60fa
SM
2082 case PACKET_TIMESTAMP:
2083 {
2084 int val;
2085
2086 if (optlen != sizeof(val))
2087 return -EINVAL;
2088 if (copy_from_user(&val, optval, sizeof(val)))
2089 return -EFAULT;
2090
2091 po->tp_tstamp = val;
2092 return 0;
2093 }
1da177e4
LT
2094 default:
2095 return -ENOPROTOOPT;
2096 }
2097}
2098
2099static int packet_getsockopt(struct socket *sock, int level, int optname,
2100 char __user *optval, int __user *optlen)
2101{
2102 int len;
8dc41944 2103 int val;
1da177e4
LT
2104 struct sock *sk = sock->sk;
2105 struct packet_sock *po = pkt_sk(sk);
8dc41944
HX
2106 void *data;
2107 struct tpacket_stats st;
1da177e4
LT
2108
2109 if (level != SOL_PACKET)
2110 return -ENOPROTOOPT;
2111
8ae55f04
KK
2112 if (get_user(len, optlen))
2113 return -EFAULT;
1da177e4
LT
2114
2115 if (len < 0)
2116 return -EINVAL;
1ce4f28b 2117
69e3c75f 2118 switch (optname) {
1da177e4 2119 case PACKET_STATISTICS:
1da177e4
LT
2120 if (len > sizeof(struct tpacket_stats))
2121 len = sizeof(struct tpacket_stats);
2122 spin_lock_bh(&sk->sk_receive_queue.lock);
2123 st = po->stats;
2124 memset(&po->stats, 0, sizeof(st));
2125 spin_unlock_bh(&sk->sk_receive_queue.lock);
2126 st.tp_packets += st.tp_drops;
2127
8dc41944
HX
2128 data = &st;
2129 break;
2130 case PACKET_AUXDATA:
2131 if (len > sizeof(int))
2132 len = sizeof(int);
2133 val = po->auxdata;
2134
80feaacb
PWJ
2135 data = &val;
2136 break;
2137 case PACKET_ORIGDEV:
2138 if (len > sizeof(int))
2139 len = sizeof(int);
2140 val = po->origdev;
2141
bfd5f4a3
SS
2142 data = &val;
2143 break;
2144 case PACKET_VNET_HDR:
2145 if (len > sizeof(int))
2146 len = sizeof(int);
2147 val = po->has_vnet_hdr;
2148
8dc41944 2149 data = &val;
1da177e4 2150 break;
bbd6ef87
PM
2151 case PACKET_VERSION:
2152 if (len > sizeof(int))
2153 len = sizeof(int);
2154 val = po->tp_version;
2155 data = &val;
2156 break;
2157 case PACKET_HDRLEN:
2158 if (len > sizeof(int))
2159 len = sizeof(int);
2160 if (copy_from_user(&val, optval, len))
2161 return -EFAULT;
2162 switch (val) {
2163 case TPACKET_V1:
2164 val = sizeof(struct tpacket_hdr);
2165 break;
2166 case TPACKET_V2:
2167 val = sizeof(struct tpacket2_hdr);
2168 break;
2169 default:
2170 return -EINVAL;
2171 }
2172 data = &val;
2173 break;
8913336a
PM
2174 case PACKET_RESERVE:
2175 if (len > sizeof(unsigned int))
2176 len = sizeof(unsigned int);
2177 val = po->tp_reserve;
2178 data = &val;
2179 break;
69e3c75f
JB
2180 case PACKET_LOSS:
2181 if (len > sizeof(unsigned int))
2182 len = sizeof(unsigned int);
2183 val = po->tp_loss;
2184 data = &val;
2185 break;
614f60fa
SM
2186 case PACKET_TIMESTAMP:
2187 if (len > sizeof(int))
2188 len = sizeof(int);
2189 val = po->tp_tstamp;
2190 data = &val;
2191 break;
1da177e4
LT
2192 default:
2193 return -ENOPROTOOPT;
2194 }
2195
8ae55f04
KK
2196 if (put_user(len, optlen))
2197 return -EFAULT;
8dc41944
HX
2198 if (copy_to_user(optval, data, len))
2199 return -EFAULT;
8ae55f04 2200 return 0;
1da177e4
LT
2201}
2202
2203
2204static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2205{
2206 struct sock *sk;
2207 struct hlist_node *node;
ad930650 2208 struct net_device *dev = data;
c346dca1 2209 struct net *net = dev_net(dev);
1da177e4 2210
808f5114 2211 rcu_read_lock();
2212 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
2213 struct packet_sock *po = pkt_sk(sk);
2214
2215 switch (msg) {
2216 case NETDEV_UNREGISTER:
1da177e4
LT
2217 if (po->mclist)
2218 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
2219 /* fallthrough */
2220
1da177e4
LT
2221 case NETDEV_DOWN:
2222 if (dev->ifindex == po->ifindex) {
2223 spin_lock(&po->bind_lock);
2224 if (po->running) {
2225 __dev_remove_pack(&po->prot_hook);
2226 __sock_put(sk);
2227 po->running = 0;
2228 sk->sk_err = ENETDOWN;
2229 if (!sock_flag(sk, SOCK_DEAD))
2230 sk->sk_error_report(sk);
2231 }
2232 if (msg == NETDEV_UNREGISTER) {
2233 po->ifindex = -1;
2234 po->prot_hook.dev = NULL;
2235 }
2236 spin_unlock(&po->bind_lock);
2237 }
2238 break;
2239 case NETDEV_UP:
808f5114 2240 if (dev->ifindex == po->ifindex) {
2241 spin_lock(&po->bind_lock);
2242 if (po->num && !po->running) {
2243 dev_add_pack(&po->prot_hook);
2244 sock_hold(sk);
2245 po->running = 1;
2246 }
2247 spin_unlock(&po->bind_lock);
1da177e4 2248 }
1da177e4
LT
2249 break;
2250 }
2251 }
808f5114 2252 rcu_read_unlock();
1da177e4
LT
2253 return NOTIFY_DONE;
2254}
2255
2256
2257static int packet_ioctl(struct socket *sock, unsigned int cmd,
2258 unsigned long arg)
2259{
2260 struct sock *sk = sock->sk;
2261
69e3c75f 2262 switch (cmd) {
40d4e3df
ED
2263 case SIOCOUTQ:
2264 {
2265 int amount = sk_wmem_alloc_get(sk);
31e6d363 2266
40d4e3df
ED
2267 return put_user(amount, (int __user *)arg);
2268 }
2269 case SIOCINQ:
2270 {
2271 struct sk_buff *skb;
2272 int amount = 0;
2273
2274 spin_lock_bh(&sk->sk_receive_queue.lock);
2275 skb = skb_peek(&sk->sk_receive_queue);
2276 if (skb)
2277 amount = skb->len;
2278 spin_unlock_bh(&sk->sk_receive_queue.lock);
2279 return put_user(amount, (int __user *)arg);
2280 }
2281 case SIOCGSTAMP:
2282 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2283 case SIOCGSTAMPNS:
2284 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 2285
1da177e4 2286#ifdef CONFIG_INET
40d4e3df
ED
2287 case SIOCADDRT:
2288 case SIOCDELRT:
2289 case SIOCDARP:
2290 case SIOCGARP:
2291 case SIOCSARP:
2292 case SIOCGIFADDR:
2293 case SIOCSIFADDR:
2294 case SIOCGIFBRDADDR:
2295 case SIOCSIFBRDADDR:
2296 case SIOCGIFNETMASK:
2297 case SIOCSIFNETMASK:
2298 case SIOCGIFDSTADDR:
2299 case SIOCSIFDSTADDR:
2300 case SIOCSIFFLAGS:
40d4e3df 2301 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
2302#endif
2303
40d4e3df
ED
2304 default:
2305 return -ENOIOCTLCMD;
1da177e4
LT
2306 }
2307 return 0;
2308}
2309
40d4e3df 2310static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
2311 poll_table *wait)
2312{
2313 struct sock *sk = sock->sk;
2314 struct packet_sock *po = pkt_sk(sk);
2315 unsigned int mask = datagram_poll(file, sock, wait);
2316
2317 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
2318 if (po->rx_ring.pg_vec) {
2319 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
1da177e4
LT
2320 mask |= POLLIN | POLLRDNORM;
2321 }
2322 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
2323 spin_lock_bh(&sk->sk_write_queue.lock);
2324 if (po->tx_ring.pg_vec) {
2325 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2326 mask |= POLLOUT | POLLWRNORM;
2327 }
2328 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
2329 return mask;
2330}
2331
2332
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */
2336
2337static void packet_mm_open(struct vm_area_struct *vma)
2338{
2339 struct file *file = vma->vm_file;
40d4e3df 2340 struct socket *sock = file->private_data;
1da177e4 2341 struct sock *sk = sock->sk;
1ce4f28b 2342
1da177e4
LT
2343 if (sk)
2344 atomic_inc(&pkt_sk(sk)->mapped);
2345}
2346
2347static void packet_mm_close(struct vm_area_struct *vma)
2348{
2349 struct file *file = vma->vm_file;
40d4e3df 2350 struct socket *sock = file->private_data;
1da177e4 2351 struct sock *sk = sock->sk;
1ce4f28b 2352
1da177e4
LT
2353 if (sk)
2354 atomic_dec(&pkt_sk(sk)->mapped);
2355}
2356
f0f37e2f 2357static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
2358 .open = packet_mm_open,
2359 .close = packet_mm_close,
1da177e4
LT
2360};
2361
0e3125c7
NH
2362static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
2363 unsigned int len)
1da177e4
LT
2364{
2365 int i;
2366
4ebf0ae2 2367 for (i = 0; i < len; i++) {
0e3125c7 2368 if (likely(pg_vec[i].buffer)) {
c56b4d90 2369 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
2370 vfree(pg_vec[i].buffer);
2371 else
2372 free_pages((unsigned long)pg_vec[i].buffer,
2373 order);
2374 pg_vec[i].buffer = NULL;
2375 }
1da177e4
LT
2376 }
2377 kfree(pg_vec);
2378}
2379
c56b4d90 2380static inline char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 2381{
0e3125c7
NH
2382 char *buffer = NULL;
2383 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
2384 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
2385
2386 buffer = (char *) __get_free_pages(gfp_flags, order);
2387
2388 if (buffer)
2389 return buffer;
2390
2391 /*
2392 * __get_free_pages failed, fall back to vmalloc
2393 */
bbce5a59 2394 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 2395
0e3125c7
NH
2396 if (buffer)
2397 return buffer;
2398
2399 /*
2400 * vmalloc failed, lets dig into swap here
2401 */
0e3125c7
NH
2402 gfp_flags &= ~__GFP_NORETRY;
2403 buffer = (char *)__get_free_pages(gfp_flags, order);
2404 if (buffer)
2405 return buffer;
2406
2407 /*
2408 * complete and utter failure
2409 */
2410 return NULL;
4ebf0ae2
DM
2411}
2412
0e3125c7 2413static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
2414{
2415 unsigned int block_nr = req->tp_block_nr;
0e3125c7 2416 struct pgv *pg_vec;
4ebf0ae2
DM
2417 int i;
2418
0e3125c7 2419 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
2420 if (unlikely(!pg_vec))
2421 goto out;
2422
2423 for (i = 0; i < block_nr; i++) {
c56b4d90 2424 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 2425 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
2426 goto out_free_pgvec;
2427 }
2428
2429out:
2430 return pg_vec;
2431
2432out_free_pgvec:
2433 free_pg_vec(pg_vec, order, block_nr);
2434 pg_vec = NULL;
2435 goto out;
2436}
1da177e4 2437
69e3c75f
JB
2438static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2439 int closing, int tx_ring)
1da177e4 2440{
0e3125c7 2441 struct pgv *pg_vec = NULL;
1da177e4 2442 struct packet_sock *po = pkt_sk(sk);
0e11c91e 2443 int was_running, order = 0;
69e3c75f
JB
2444 struct packet_ring_buffer *rb;
2445 struct sk_buff_head *rb_queue;
0e11c91e 2446 __be16 num;
69e3c75f 2447 int err;
1ce4f28b 2448
69e3c75f
JB
2449 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2450 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 2451
69e3c75f
JB
2452 err = -EBUSY;
2453 if (!closing) {
2454 if (atomic_read(&po->mapped))
2455 goto out;
2456 if (atomic_read(&rb->pending))
2457 goto out;
2458 }
1da177e4 2459
69e3c75f
JB
2460 if (req->tp_block_nr) {
2461 /* Sanity tests and some calculations */
2462 err = -EBUSY;
2463 if (unlikely(rb->pg_vec))
2464 goto out;
1da177e4 2465
bbd6ef87
PM
2466 switch (po->tp_version) {
2467 case TPACKET_V1:
2468 po->tp_hdrlen = TPACKET_HDRLEN;
2469 break;
2470 case TPACKET_V2:
2471 po->tp_hdrlen = TPACKET2_HDRLEN;
2472 break;
2473 }
2474
69e3c75f 2475 err = -EINVAL;
4ebf0ae2 2476 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 2477 goto out;
4ebf0ae2 2478 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 2479 goto out;
8913336a 2480 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
2481 po->tp_reserve))
2482 goto out;
4ebf0ae2 2483 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 2484 goto out;
1da177e4 2485
69e3c75f
JB
2486 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2487 if (unlikely(rb->frames_per_block <= 0))
2488 goto out;
2489 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2490 req->tp_frame_nr))
2491 goto out;
1da177e4
LT
2492
2493 err = -ENOMEM;
4ebf0ae2
DM
2494 order = get_order(req->tp_block_size);
2495 pg_vec = alloc_pg_vec(req, order);
2496 if (unlikely(!pg_vec))
1da177e4 2497 goto out;
69e3c75f
JB
2498 }
2499 /* Done */
2500 else {
2501 err = -EINVAL;
4ebf0ae2 2502 if (unlikely(req->tp_frame_nr))
69e3c75f 2503 goto out;
1da177e4
LT
2504 }
2505
2506 lock_sock(sk);
2507
2508 /* Detach socket from network */
2509 spin_lock(&po->bind_lock);
2510 was_running = po->running;
2511 num = po->num;
2512 if (was_running) {
2513 __dev_remove_pack(&po->prot_hook);
2514 po->num = 0;
2515 po->running = 0;
2516 __sock_put(sk);
2517 }
2518 spin_unlock(&po->bind_lock);
1ce4f28b 2519
1da177e4
LT
2520 synchronize_net();
2521
2522 err = -EBUSY;
905db440 2523 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
2524 if (closing || atomic_read(&po->mapped) == 0) {
2525 err = 0;
69e3c75f 2526 spin_lock_bh(&rb_queue->lock);
c053fd96 2527 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
2528 rb->frame_max = (req->tp_frame_nr - 1);
2529 rb->head = 0;
2530 rb->frame_size = req->tp_frame_size;
2531 spin_unlock_bh(&rb_queue->lock);
2532
c053fd96
CG
2533 swap(rb->pg_vec_order, order);
2534 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
2535
2536 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2537 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2538 tpacket_rcv : packet_rcv;
2539 skb_queue_purge(rb_queue);
1da177e4 2540 if (atomic_read(&po->mapped))
40d4e3df
ED
2541 pr_err("packet_mmap: vma is busy: %d\n",
2542 atomic_read(&po->mapped));
1da177e4 2543 }
905db440 2544 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
2545
2546 spin_lock(&po->bind_lock);
2547 if (was_running && !po->running) {
2548 sock_hold(sk);
2549 po->running = 1;
2550 po->num = num;
2551 dev_add_pack(&po->prot_hook);
2552 }
2553 spin_unlock(&po->bind_lock);
2554
2555 release_sock(sk);
2556
1da177e4
LT
2557 if (pg_vec)
2558 free_pg_vec(pg_vec, order, req->tp_block_nr);
2559out:
2560 return err;
2561}
2562
69e3c75f
JB
2563static int packet_mmap(struct file *file, struct socket *sock,
2564 struct vm_area_struct *vma)
1da177e4
LT
2565{
2566 struct sock *sk = sock->sk;
2567 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
2568 unsigned long size, expected_size;
2569 struct packet_ring_buffer *rb;
1da177e4
LT
2570 unsigned long start;
2571 int err = -EINVAL;
2572 int i;
2573
2574 if (vma->vm_pgoff)
2575 return -EINVAL;
2576
905db440 2577 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
2578
2579 expected_size = 0;
2580 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2581 if (rb->pg_vec) {
2582 expected_size += rb->pg_vec_len
2583 * rb->pg_vec_pages
2584 * PAGE_SIZE;
2585 }
2586 }
2587
2588 if (expected_size == 0)
1da177e4 2589 goto out;
69e3c75f
JB
2590
2591 size = vma->vm_end - vma->vm_start;
2592 if (size != expected_size)
1da177e4
LT
2593 goto out;
2594
1da177e4 2595 start = vma->vm_start;
69e3c75f
JB
2596 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2597 if (rb->pg_vec == NULL)
2598 continue;
2599
2600 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
2601 struct page *page;
2602 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
2603 int pg_num;
2604
c56b4d90
CG
2605 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
2606 page = pgv_to_page(kaddr);
69e3c75f
JB
2607 err = vm_insert_page(vma, start, page);
2608 if (unlikely(err))
2609 goto out;
2610 start += PAGE_SIZE;
0e3125c7 2611 kaddr += PAGE_SIZE;
69e3c75f 2612 }
4ebf0ae2 2613 }
1da177e4 2614 }
69e3c75f 2615
4ebf0ae2 2616 atomic_inc(&po->mapped);
1da177e4
LT
2617 vma->vm_ops = &packet_mmap_ops;
2618 err = 0;
2619
2620out:
905db440 2621 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
2622 return err;
2623}
1da177e4 2624
90ddc4f0 2625static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
2626 .family = PF_PACKET,
2627 .owner = THIS_MODULE,
2628 .release = packet_release,
2629 .bind = packet_bind_spkt,
2630 .connect = sock_no_connect,
2631 .socketpair = sock_no_socketpair,
2632 .accept = sock_no_accept,
2633 .getname = packet_getname_spkt,
2634 .poll = datagram_poll,
2635 .ioctl = packet_ioctl,
2636 .listen = sock_no_listen,
2637 .shutdown = sock_no_shutdown,
2638 .setsockopt = sock_no_setsockopt,
2639 .getsockopt = sock_no_getsockopt,
2640 .sendmsg = packet_sendmsg_spkt,
2641 .recvmsg = packet_recvmsg,
2642 .mmap = sock_no_mmap,
2643 .sendpage = sock_no_sendpage,
2644};
1da177e4 2645
90ddc4f0 2646static const struct proto_ops packet_ops = {
1da177e4
LT
2647 .family = PF_PACKET,
2648 .owner = THIS_MODULE,
2649 .release = packet_release,
2650 .bind = packet_bind,
2651 .connect = sock_no_connect,
2652 .socketpair = sock_no_socketpair,
2653 .accept = sock_no_accept,
1ce4f28b 2654 .getname = packet_getname,
1da177e4
LT
2655 .poll = packet_poll,
2656 .ioctl = packet_ioctl,
2657 .listen = sock_no_listen,
2658 .shutdown = sock_no_shutdown,
2659 .setsockopt = packet_setsockopt,
2660 .getsockopt = packet_getsockopt,
2661 .sendmsg = packet_sendmsg,
2662 .recvmsg = packet_recvmsg,
2663 .mmap = packet_mmap,
2664 .sendpage = sock_no_sendpage,
2665};
2666
ec1b4cf7 2667static const struct net_proto_family packet_family_ops = {
1da177e4
LT
2668 .family = PF_PACKET,
2669 .create = packet_create,
2670 .owner = THIS_MODULE,
2671};
2672
2673static struct notifier_block packet_netdev_notifier = {
40d4e3df 2674 .notifier_call = packet_notifier,
1da177e4
LT
2675};
2676
2677#ifdef CONFIG_PROC_FS
1da177e4
LT
2678
2679static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 2680 __acquires(RCU)
1da177e4 2681{
e372c414 2682 struct net *net = seq_file_net(seq);
808f5114 2683
2684 rcu_read_lock();
2685 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
2686}
2687
2688static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2689{
1bf40954 2690 struct net *net = seq_file_net(seq);
808f5114 2691 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
2692}
2693
2694static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 2695 __releases(RCU)
1da177e4 2696{
808f5114 2697 rcu_read_unlock();
1da177e4
LT
2698}
2699
1ce4f28b 2700static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
2701{
2702 if (v == SEQ_START_TOKEN)
2703 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
2704 else {
b7ceabd9 2705 struct sock *s = sk_entry(v);
1da177e4
LT
2706 const struct packet_sock *po = pkt_sk(s);
2707
2708 seq_printf(seq,
2709 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
2710 s,
2711 atomic_read(&s->sk_refcnt),
2712 s->sk_type,
2713 ntohs(po->num),
2714 po->ifindex,
2715 po->running,
2716 atomic_read(&s->sk_rmem_alloc),
2717 sock_i_uid(s),
40d4e3df 2718 sock_i_ino(s));
1da177e4
LT
2719 }
2720
2721 return 0;
2722}
2723
56b3d975 2724static const struct seq_operations packet_seq_ops = {
1da177e4
LT
2725 .start = packet_seq_start,
2726 .next = packet_seq_next,
2727 .stop = packet_seq_stop,
2728 .show = packet_seq_show,
2729};
2730
2731static int packet_seq_open(struct inode *inode, struct file *file)
2732{
e372c414
DL
2733 return seq_open_net(inode, file, &packet_seq_ops,
2734 sizeof(struct seq_net_private));
1da177e4
LT
2735}
2736
da7071d7 2737static const struct file_operations packet_seq_fops = {
1da177e4
LT
2738 .owner = THIS_MODULE,
2739 .open = packet_seq_open,
2740 .read = seq_read,
2741 .llseek = seq_lseek,
e372c414 2742 .release = seq_release_net,
1da177e4
LT
2743};
2744
2745#endif
2746
2c8c1e72 2747static int __net_init packet_net_init(struct net *net)
d12d01d6 2748{
808f5114 2749 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 2750 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
2751
2752 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2753 return -ENOMEM;
2754
2755 return 0;
2756}
2757
2c8c1e72 2758static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
2759{
2760 proc_net_remove(net, "packet");
2761}
2762
2763static struct pernet_operations packet_net_ops = {
2764 .init = packet_net_init,
2765 .exit = packet_net_exit,
2766};
2767
2768
1da177e4
LT
2769static void __exit packet_exit(void)
2770{
1da177e4 2771 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 2772 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
2773 sock_unregister(PF_PACKET);
2774 proto_unregister(&packet_proto);
2775}
2776
2777static int __init packet_init(void)
2778{
2779 int rc = proto_register(&packet_proto, 0);
2780
2781 if (rc != 0)
2782 goto out;
2783
2784 sock_register(&packet_family_ops);
d12d01d6 2785 register_pernet_subsys(&packet_net_ops);
1da177e4 2786 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
2787out:
2788 return rc;
2789}
2790
2791module_init(packet_init);
2792module_exit(packet_exit);
2793MODULE_LICENSE("GPL");
2794MODULE_ALIAS_NETPROTO(PF_PACKET);