/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/cacheflush.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#include <net/inet_common.h>
#include <linux/bpf.h>
#include <net/compat.h>
/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel); others are not.
   - A packet socket receives packets with the ll header already pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header

Outgoing, dev->hard_header != NULL
   mac_header -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between the rx and tx paths.

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!

Resume:
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.

On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
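/*
 * Illustrative userspace sketch (not part of this file): the rx rules above
 * are what make the two socket types below differ.  With SOCK_RAW the buffer
 * returned by recv() starts at the link-layer (Ethernet) header; with
 * SOCK_DGRAM the kernel has already pulled that header and only the payload
 * is returned.  Error handling is omitted and CAP_NET_RAW is required.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <net/ethernet.h>
 *	#include <arpa/inet.h>
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgrm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	char buf[2048];
 *	ssize_t n = recv(raw, buf, sizeof(buf), 0);
 *	// buf[0..13] is the Ethernet header here; on 'dgrm' it would not be.
 */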
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
    unsigned short  mr_type;
    unsigned short  mr_alen;
    unsigned char   mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
    struct tpacket_hdr  *h1;
    struct tpacket2_hdr *h2;
    struct tpacket3_hdr *h3;
    void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        int closing, int tx_ring);

#define V3_ALIGNMENT    (8)

#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
    (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)     ((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)   ((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)       ((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)        ((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)       ((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)     ((x)->offset_to_priv)
#define BLOCK_PRIV(x)       ((void *)((char *)(x) + BLOCK_O2PRIV(x)))

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
               struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
        struct packet_ring_buffer *rb,
        int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
            struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
        struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
        struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
        struct tpacket_kbdq_core *,
        void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
        struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
        struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);

struct packet_skb_cb {
    union {
        struct sockaddr_pkt pkt;
        struct {
            /* Trick: alias skb original length with
             * ll.sll_family and ll.protocol in order
             * to save room.
             */
            unsigned int origlen;
            struct sockaddr_ll ll;
        };
    } sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)    ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid) \
    ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)   \
    ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
    (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
    ((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);
static int packet_direct_xmit(struct sk_buff *skb)
{
    struct net_device *dev = skb->dev;
    struct sk_buff *orig_skb = skb;
    struct netdev_queue *txq;
    int ret = NETDEV_TX_BUSY;

    if (unlikely(!netif_running(dev) ||
             !netif_carrier_ok(dev)))
        goto drop;

    skb = validate_xmit_skb_list(skb, dev);
    if (skb != orig_skb)
        goto drop;

    packet_pick_tx_queue(dev, skb);
    txq = skb_get_tx_queue(dev, skb);

    local_bh_disable();

    HARD_TX_LOCK(dev, txq, smp_processor_id());
    if (!netif_xmit_frozen_or_drv_stopped(txq))
        ret = netdev_start_xmit(skb, dev, txq, false);
    HARD_TX_UNLOCK(dev, txq);

    local_bh_enable();

    if (!dev_xmit_complete(ret))
        kfree_skb(skb);

    return ret;
drop:
    atomic_long_inc(&dev->tx_dropped);
    kfree_skb_list(skb);
    return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
    struct net_device *dev;

    rcu_read_lock();
    dev = rcu_dereference(po->cached_dev);
    if (likely(dev))
        dev_hold(dev);
    rcu_read_unlock();

    return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
                     struct net_device *dev)
{
    rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
    RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
    return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
    return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
    const struct net_device_ops *ops = dev->netdev_ops;
    u16 queue_index;

    if (ops->ndo_select_queue) {
        queue_index = ops->ndo_select_queue(dev, skb, NULL,
                            __packet_pick_tx_queue);
        queue_index = netdev_cap_txqueue(dev, queue_index);
    } else {
        queue_index = __packet_pick_tx_queue(dev, skb);
    }

    skb_set_queue_mapping(skb, queue_index);
}
/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
    struct packet_sock *po = pkt_sk(sk);

    if (!po->running) {
        if (po->fanout)
            __fanout_link(sk, po);
        else
            dev_add_pack(&po->prot_hook);

        sock_hold(sk);
        po->running = 1;
    }
}

static void register_prot_hook(struct sock *sk)
{
    lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
    __register_prot_hook(sk);
}

/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
    struct packet_sock *po = pkt_sk(sk);

    lockdep_assert_held_once(&po->bind_lock);

    po->running = 0;

    if (po->fanout)
        __fanout_unlink(sk, po);
    else
        __dev_remove_pack(&po->prot_hook);

    __sock_put(sk);

    if (sync) {
        spin_unlock(&po->bind_lock);
        synchronize_net();
        spin_lock(&po->bind_lock);
    }
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
    struct packet_sock *po = pkt_sk(sk);

    if (po->running)
        __unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
    if (is_vmalloc_addr(addr))
        return vmalloc_to_page(addr);
    return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
    union tpacket_uhdr h;

    h.raw = frame;
    switch (po->tp_version) {
    case TPACKET_V1:
        h.h1->tp_status = status;
        flush_dcache_page(pgv_to_page(&h.h1->tp_status));
        break;
    case TPACKET_V2:
        h.h2->tp_status = status;
        flush_dcache_page(pgv_to_page(&h.h2->tp_status));
        break;
    case TPACKET_V3:
        h.h3->tp_status = status;
        flush_dcache_page(pgv_to_page(&h.h3->tp_status));
        break;
    default:
        WARN(1, "TPACKET version not supported.\n");
        BUG();
    }

    smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
    union tpacket_uhdr h;

    smp_rmb();

    h.raw = frame;
    switch (po->tp_version) {
    case TPACKET_V1:
        flush_dcache_page(pgv_to_page(&h.h1->tp_status));
        return h.h1->tp_status;
    case TPACKET_V2:
        flush_dcache_page(pgv_to_page(&h.h2->tp_status));
        return h.h2->tp_status;
    case TPACKET_V3:
        flush_dcache_page(pgv_to_page(&h.h3->tp_status));
        return h.h3->tp_status;
    default:
        WARN(1, "TPACKET version not supported.\n");
        BUG();
        return 0;
    }
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
                   unsigned int flags)
{
    struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

    if (shhwtstamps &&
        (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
        ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
        return TP_STATUS_TS_RAW_HARDWARE;

    if (ktime_to_timespec_cond(skb->tstamp, ts))
        return TP_STATUS_TS_SOFTWARE;

    return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
                    struct sk_buff *skb)
{
    union tpacket_uhdr h;
    struct timespec ts;
    __u32 ts_status;

    if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
        return 0;

    h.raw = frame;
    switch (po->tp_version) {
    case TPACKET_V1:
        h.h1->tp_sec = ts.tv_sec;
        h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
        break;
    case TPACKET_V2:
        h.h2->tp_sec = ts.tv_sec;
        h.h2->tp_nsec = ts.tv_nsec;
        break;
    case TPACKET_V3:
        h.h3->tp_sec = ts.tv_sec;
        h.h3->tp_nsec = ts.tv_nsec;
        break;
    default:
        WARN(1, "TPACKET version not supported.\n");
        BUG();
    }

    /* one flush is safe, as both fields always lie on the same cacheline */
    flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
    smp_wmb();

    return ts_status;
}
static void *packet_lookup_frame(struct packet_sock *po,
        struct packet_ring_buffer *rb,
        unsigned int position,
        int status)
{
    unsigned int pg_vec_pos, frame_offset;
    union tpacket_uhdr h;

    pg_vec_pos = position / rb->frames_per_block;
    frame_offset = position % rb->frames_per_block;

    h.raw = rb->pg_vec[pg_vec_pos].buffer +
        (frame_offset * rb->frame_size);

    if (status != __packet_get_status(po, h.raw))
        return NULL;

    return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
        struct packet_ring_buffer *rb,
        int status)
{
    return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
    del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
        struct sk_buff_head *rb_queue)
{
    struct tpacket_kbdq_core *pkc;

    pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

    spin_lock_bh(&rb_queue->lock);
    pkc->delete_blk_timer = 1;
    spin_unlock_bh(&rb_queue->lock);

    prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
        struct tpacket_kbdq_core *pkc,
        void (*func) (unsigned long))
{
    init_timer(&pkc->retire_blk_timer);
    pkc->retire_blk_timer.data = (long)po;
    pkc->retire_blk_timer.function = func;
    pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
    struct tpacket_kbdq_core *pkc;

    pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
    prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
                int blk_size_in_bytes)
{
    struct net_device *dev;
    unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
    struct ethtool_link_ksettings ecmd;
    int err;

    rtnl_lock();
    dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
    if (unlikely(!dev)) {
        rtnl_unlock();
        return DEFAULT_PRB_RETIRE_TOV;
    }
    err = __ethtool_get_link_ksettings(dev, &ecmd);
    rtnl_unlock();
    if (!err) {
        /*
         * If the link speed is so slow you don't really
         * need to worry about perf anyways
         */
        if (ecmd.base.speed < SPEED_1000 ||
            ecmd.base.speed == SPEED_UNKNOWN) {
            return DEFAULT_PRB_RETIRE_TOV;
        } else {
            msec = 1;
            div = ecmd.base.speed / 1000;
        }
    }

    mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

    if (div)
        mbits /= div;

    tmo = mbits * msec;

    if (div)
        return tmo + 1;
    return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
            union tpacket_req_u *req_u)
{
    p1->feature_req_word = req_u->req3.tp_feature_req_word;
}
static void init_prb_bdqc(struct packet_sock *po,
            struct packet_ring_buffer *rb,
            struct pgv *pg_vec,
            union tpacket_req_u *req_u)
{
    struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
    struct tpacket_block_desc *pbd;

    memset(p1, 0x0, sizeof(*p1));

    p1->knxt_seq_num = 1;
    p1->pkbdq = pg_vec;
    pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
    p1->pkblk_start = pg_vec[0].buffer;
    p1->kblk_size = req_u->req3.tp_block_size;
    p1->knum_blocks = req_u->req3.tp_block_nr;
    p1->hdrlen = po->tp_hdrlen;
    p1->version = po->tp_version;
    p1->last_kactive_blk_num = 0;
    po->stats.stats3.tp_freeze_q_cnt = 0;
    if (req_u->req3.tp_retire_blk_tov)
        p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
    else
        p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
                        req_u->req3.tp_block_size);
    p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
    p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

    p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
    prb_init_ft_ops(p1, req_u);
    prb_setup_retire_blk_timer(po);
    prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
    mod_timer(&pkc->retire_blk_timer,
            jiffies + pkc->tov_in_jiffies);
    pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
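/*
 * Illustrative userspace sketch (not part of this file): the 'tmo' discussed
 * above is what a TPACKET_V3 user supplies in tp_retire_blk_tov when setting
 * up the RX ring; a value of 0 lets prb_calc_retire_blk_tmo() derive it from
 * link speed and block size.  The sizes below are example choices and error
 * handling is omitted.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <arpa/inet.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int ver = TPACKET_V3;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 *	struct tpacket_req3 req = {
 *		.tp_block_size     = 1 << 20,	// 1 MB blocks, as in the text
 *		.tp_block_nr       = 64,
 *		.tp_frame_size     = 2048,
 *		.tp_frame_nr       = (1 << 20) / 2048 * 64,
 *		.tp_retire_blk_tov = 10,	// close a block after 10 ms at the latest
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */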
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
    struct packet_sock *po = (struct packet_sock *)data;
    struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
    unsigned int frozen;
    struct tpacket_block_desc *pbd;

    spin_lock(&po->sk.sk_receive_queue.lock);

    frozen = prb_queue_frozen(pkc);
    pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

    if (unlikely(pkc->delete_blk_timer))
        goto out;

    /* We only need to plug the race when the block is partially filled.
     * tpacket_rcv:
     *      lock(); increment BLOCK_NUM_PKTS; unlock()
     *      copy_bits() is in progress ...
     *      timer fires on other cpu:
     *      we can't retire the current block because copy_bits
     *      is in progress.
     */
    if (BLOCK_NUM_PKTS(pbd)) {
        while (atomic_read(&pkc->blk_fill_in_prog)) {
            /* Waiting for skb_copy_bits to finish... */
            cpu_relax();
        }
    }

    if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
        if (!frozen) {
            if (!BLOCK_NUM_PKTS(pbd)) {
                /* An empty block. Just refresh the timer. */
                goto refresh_timer;
            }
            prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
            if (!prb_dispatch_next_block(pkc, po))
                goto refresh_timer;
            else
                goto out;
        } else {
            /* Case 1. Queue was frozen because user-space was
             *         lagging behind.
             */
            if (prb_curr_blk_in_use(pbd)) {
                /*
                 * Ok, user-space is still behind.
                 * So just refresh the timer.
                 */
                goto refresh_timer;
            } else {
                /* Case 2. Queue was frozen, user-space caught up,
                 * now the link went idle && the timer fired.
                 * We don't have a block to close. So we open this
                 * block and restart the timer.
                 * Opening a block thaws the queue and restarts the timer.
                 * Thawing/timer-refresh is a side effect.
                 */
                prb_open_block(pkc, pbd);
                goto out;
            }
        }
    }

refresh_timer:
    _prb_refresh_rx_retire_blk_timer(pkc);

out:
    spin_unlock(&po->sk.sk_receive_queue.lock);
}
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
        struct tpacket_block_desc *pbd1, __u32 status)
{
    /* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
    u8 *start, *end;

    start = (u8 *)pbd1;

    /* Skip the block header (we know the header WILL fit in 4K) */
    start += PAGE_SIZE;

    end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
    for (; start < end; start += PAGE_SIZE)
        flush_dcache_page(pgv_to_page(start));

    smp_wmb();
#endif

    /* Now update the block status. */

    BLOCK_STATUS(pbd1) = status;

    /* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
    start = (u8 *)pbd1;
    flush_dcache_page(pgv_to_page(start));

    smp_wmb();
#endif
}

/*
 * Side effect of closing a block:
 *
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose.
 *	 Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
        struct tpacket_block_desc *pbd1,
        struct packet_sock *po, unsigned int stat)
{
    __u32 status = TP_STATUS_USER | stat;

    struct tpacket3_hdr *last_pkt;
    struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
    struct sock *sk = &po->sk;

    if (po->stats.stats3.tp_drops)
        status |= TP_STATUS_LOSING;

    last_pkt = (struct tpacket3_hdr *)pkc1->prev;
    last_pkt->tp_next_offset = 0;

    /* Get the ts of the last pkt */
    if (BLOCK_NUM_PKTS(pbd1)) {
        h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
        h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
    } else {
        /* Ok, we tmo'd - so get the current time.
         *
         * It shouldn't really happen as we don't close empty
         * blocks. See prb_retire_rx_blk_timer_expired().
         */
        struct timespec ts;
        getnstimeofday(&ts);
        h1->ts_last_pkt.ts_sec = ts.tv_sec;
        h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
    }

    smp_wmb();

    /* Flush the block */
    prb_flush_block(pkc1, pbd1, status);

    sk->sk_data_ready(sk);

    pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
    pkc->reset_pending_on_curr_blk = 0;
}
/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
    struct tpacket_block_desc *pbd1)
{
    struct timespec ts;
    struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

    smp_rmb();

    /* We could have just memset this but we will lose the
     * flexibility of making the priv area sticky
     */

    BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
    BLOCK_NUM_PKTS(pbd1) = 0;
    BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

    getnstimeofday(&ts);

    h1->ts_first_pkt.ts_sec = ts.tv_sec;
    h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

    pkc1->pkblk_start = (char *)pbd1;
    pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

    BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
    BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

    pbd1->version = pkc1->version;
    pkc1->prev = pkc1->nxt_offset;
    pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

    prb_thaw_queue(pkc1);
    _prb_refresh_rx_retire_blk_timer(pkc1);

    smp_wmb();
}
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens the Rx ring.
 * 3) Some time past 't0', the kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *      __packet_lookup_frame_in_block
 *        prb_retire_current_block()
 *        prb_dispatch_next_block()
 *          |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) The link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) The link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
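/*
 * Illustrative userspace sketch (not part of this file): "user-space caught
 * up" above means the consumer handed a block back by rewriting its status.
 * Assuming 'block' points at a mmap()ed tpacket_block_desc that the kernel
 * has marked TP_STATUS_USER, a consumer loop looks roughly like this:
 *
 *	struct tpacket_block_desc *block = ...;	// from the mmap()ed ring
 *	if (block->hdr.bh1.block_status & TP_STATUS_USER) {
 *		struct tpacket3_hdr *ppd = (struct tpacket3_hdr *)
 *			((char *)block + block->hdr.bh1.offset_to_first_pkt);
 *		for (unsigned int i = 0; i < block->hdr.bh1.num_pkts; i++) {
 *			// consume ppd->tp_snaplen bytes at (char *)ppd + ppd->tp_mac
 *			ppd = (struct tpacket3_hdr *)
 *				((char *)ppd + ppd->tp_next_offset);
 *		}
 *		block->hdr.bh1.block_status = TP_STATUS_KERNEL;	// lets rx re-use it
 *	}
 */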
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
                  struct packet_sock *po)
{
    pkc->reset_pending_on_curr_blk = 1;
    po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
        struct packet_sock *po)
{
    struct tpacket_block_desc *pbd;

    smp_rmb();

    /* 1. Get current block num */
    pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

    /* 2. If this block is currently in_use then freeze the queue */
    if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
        prb_freeze_queue(pkc, po);
        return NULL;
    }

    /* 3. Otherwise,
     * open this block and return the offset where the first packet
     * needs to get stored.
     */
    prb_open_block(pkc, pbd);
    return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
        struct packet_sock *po, unsigned int status)
{
    struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

    /* retire/close the current block */
    if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
        /*
         * Plug the case where copy_bits() is in progress on
         * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
         * have space to copy the pkt in the current block and
         * called prb_retire_current_block()
         *
         * We don't need to worry about the TMO case because
         * the timer-handler already handled this case.
         */
        if (!(status & TP_STATUS_BLK_TMO)) {
            while (atomic_read(&pkc->blk_fill_in_prog)) {
                /* Waiting for skb_copy_bits to finish... */
                cpu_relax();
            }
        }
        prb_close_block(pkc, pbd, po, status);
        return;
    }
}

static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
    return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
    return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
    struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
    atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
            struct tpacket3_hdr *ppd)
{
    ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
            struct tpacket3_hdr *ppd)
{
    ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
            struct tpacket3_hdr *ppd)
{
    if (skb_vlan_tag_present(pkc->skb)) {
        ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
        ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
        ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
    } else {
        ppd->hv1.tp_vlan_tci = 0;
        ppd->hv1.tp_vlan_tpid = 0;
        ppd->tp_status = TP_STATUS_AVAILABLE;
    }
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
            struct tpacket3_hdr *ppd)
{
    ppd->hv1.tp_padding = 0;
    prb_fill_vlan_info(pkc, ppd);

    if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
        prb_fill_rxhash(pkc, ppd);
    else
        prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
                struct tpacket_kbdq_core *pkc,
                struct tpacket_block_desc *pbd,
                unsigned int len)
{
    struct tpacket3_hdr *ppd;

    ppd = (struct tpacket3_hdr *)curr;
    ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
    pkc->prev = curr;
    pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
    BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
    BLOCK_NUM_PKTS(pbd) += 1;
    atomic_inc(&pkc->blk_fill_in_prog);
    prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
                        struct sk_buff *skb,
                        int status,
                        unsigned int len)
{
    struct tpacket_kbdq_core *pkc;
    struct tpacket_block_desc *pbd;
    char *curr, *end;

    pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
    pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

    /* Queue is frozen when user space is lagging behind */
    if (prb_queue_frozen(pkc)) {
        /*
         * Check if that last block which caused the queue to freeze
         * is still in_use by user-space.
         */
        if (prb_curr_blk_in_use(pbd)) {
            /* Can't record this packet */
            return NULL;
        } else {
            /*
             * Ok, the block was released by user-space.
             * Now let's open that block.
             * Opening a block also thaws the queue.
             * Thawing is a side effect.
             */
            prb_open_block(pkc, pbd);
        }
    }

    smp_mb();
    curr = pkc->nxt_offset;
    pkc->skb = skb;
    end = (char *)pbd + pkc->kblk_size;

    /* first try the current block */
    if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
        prb_fill_curr_block(curr, pkc, pbd, len);
        return (void *)curr;
    }

    /* Ok, close the current block */
    prb_retire_current_block(pkc, po, 0);

    /* Now, try to dispatch the next block */
    curr = (char *)prb_dispatch_next_block(pkc, po);
    if (curr) {
        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
        prb_fill_curr_block(curr, pkc, pbd, len);
        return (void *)curr;
    }

    /*
     * No free blocks are available. user_space hasn't caught up yet.
     * Queue was just frozen and now this packet will get dropped.
     */
    return NULL;
}
static void *packet_current_rx_frame(struct packet_sock *po,
                        struct sk_buff *skb,
                        int status, unsigned int len)
{
    char *curr = NULL;
    switch (po->tp_version) {
    case TPACKET_V1:
    case TPACKET_V2:
        curr = packet_lookup_frame(po, &po->rx_ring,
                    po->rx_ring.head, status);
        return curr;
    case TPACKET_V3:
        return __packet_lookup_frame_in_block(po, skb, status, len);
    default:
        WARN(1, "TPACKET version not supported\n");
        BUG();
        return NULL;
    }
}

static void *prb_lookup_block(struct packet_sock *po,
                     struct packet_ring_buffer *rb,
                     unsigned int idx,
                     int status)
{
    struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
    struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

    if (status != BLOCK_STATUS(pbd))
        return NULL;
    return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
    unsigned int prev;

    if (rb->prb_bdqc.kactive_blk_num)
        prev = rb->prb_bdqc.kactive_blk_num-1;
    else
        prev = rb->prb_bdqc.knum_blocks-1;
    return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
                     struct packet_ring_buffer *rb,
                     int status)
{
    unsigned int previous = prb_previous_blk_num(rb);
    return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
                     struct packet_ring_buffer *rb,
                     int status)
{
    if (po->tp_version <= TPACKET_V2)
        return packet_previous_frame(po, rb, status);

    return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
                     struct packet_ring_buffer *rb)
{
    switch (po->tp_version) {
    case TPACKET_V1:
    case TPACKET_V2:
        return packet_increment_head(rb);
    case TPACKET_V3:
    default:
        WARN(1, "TPACKET version not supported.\n");
        BUG();
        return;
    }
}

static void *packet_previous_frame(struct packet_sock *po,
        struct packet_ring_buffer *rb,
        int status)
{
    unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
    return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
    buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
    this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
    this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
    unsigned int refcnt = 0;
    int cpu;

    /* We don't use pending refcount in rx_ring. */
    if (rb->pending_refcnt == NULL)
        return 0;

    for_each_possible_cpu(cpu)
        refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

    return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
    po->rx_ring.pending_refcnt = NULL;

    po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
    if (unlikely(po->tx_ring.pending_refcnt == NULL))
        return -ENOBUFS;

    return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
    free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF    2
#define ROOM_NONE   0x0
#define ROOM_LOW    0x1
#define ROOM_NORMAL 0x2
static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
    int idx, len;

    len = po->rx_ring.frame_max + 1;
    idx = po->rx_ring.head;
    if (pow_off)
        idx += len >> pow_off;
    if (idx >= len)
        idx -= len;
    return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
    int idx, len;

    len = po->rx_ring.prb_bdqc.knum_blocks;
    idx = po->rx_ring.prb_bdqc.kactive_blk_num;
    if (pow_off)
        idx += len >> pow_off;
    if (idx >= len)
        idx -= len;
    return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
    struct sock *sk = &po->sk;
    int ret = ROOM_NONE;

    if (po->prot_hook.func != tpacket_rcv) {
        int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
                      - (skb ? skb->truesize : 0);
        if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
            return ROOM_NORMAL;
        else if (avail > 0)
            return ROOM_LOW;
        else
            return ROOM_NONE;
    }

    if (po->tp_version == TPACKET_V3) {
        if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
            ret = ROOM_NORMAL;
        else if (__tpacket_v3_has_room(po, 0))
            ret = ROOM_LOW;
    } else {
        if (__tpacket_has_room(po, ROOM_POW_OFF))
            ret = ROOM_NORMAL;
        else if (__tpacket_has_room(po, 0))
            ret = ROOM_LOW;
    }

    return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
    int ret;
    bool has_room;

    spin_lock_bh(&po->sk.sk_receive_queue.lock);
    ret = __packet_rcv_has_room(po, skb);
    has_room = ret == ROOM_NORMAL;
    if (po->pressure == has_room)
        po->pressure = !has_room;
    spin_unlock_bh(&po->sk.sk_receive_queue.lock);

    return ret;
}
static void packet_sock_destruct(struct sock *sk)
{
    skb_queue_purge(&sk->sk_error_queue);

    WARN_ON(atomic_read(&sk->sk_rmem_alloc));
    WARN_ON(refcount_read(&sk->sk_wmem_alloc));

    if (!sock_flag(sk, SOCK_DEAD)) {
        pr_err("Attempt to release alive packet socket: %p\n", sk);
        return;
    }

    sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
    u32 rxhash;
    int i, count = 0;

    rxhash = skb_get_hash(skb);
    for (i = 0; i < ROLLOVER_HLEN; i++)
        if (po->rollover->history[i] == rxhash)
            count++;

    po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
    return count > (ROLLOVER_HLEN >> 1);
}
static unsigned int fanout_demux_hash(struct packet_fanout *f,
                      struct sk_buff *skb,
                      unsigned int num)
{
    return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
                    struct sk_buff *skb,
                    unsigned int num)
{
    unsigned int val = atomic_inc_return(&f->rr_cur);

    return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
                     struct sk_buff *skb,
                     unsigned int num)
{
    return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
                     struct sk_buff *skb,
                     unsigned int num)
{
    return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
                      struct sk_buff *skb,
                      unsigned int idx, bool try_self,
                      unsigned int num)
{
    struct packet_sock *po, *po_next, *po_skip = NULL;
    unsigned int i, j, room = ROOM_NONE;

    po = pkt_sk(f->arr[idx]);

    if (try_self) {
        room = packet_rcv_has_room(po, skb);
        if (room == ROOM_NORMAL ||
            (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
            return idx;
        po_skip = po;
    }

    i = j = min_t(int, po->rollover->sock, num - 1);
    do {
        po_next = pkt_sk(f->arr[i]);
        if (po_next != po_skip && !po_next->pressure &&
            packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
            if (i != j)
                po->rollover->sock = i;
            atomic_long_inc(&po->rollover->num);
            if (room == ROOM_LOW)
                atomic_long_inc(&po->rollover->num_huge);
            return i;
        }

        if (++i == num)
            i = 0;
    } while (i != j);

    atomic_long_inc(&po->rollover->num_failed);
    return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
                    struct sk_buff *skb,
                    unsigned int num)
{
    return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
                     struct sk_buff *skb,
                     unsigned int num)
{
    struct bpf_prog *prog;
    unsigned int ret = 0;

    rcu_read_lock();
    prog = rcu_dereference(f->bpf_prog);
    if (prog)
        ret = bpf_prog_run_clear_cb(prog, skb) % num;
    rcu_read_unlock();

    return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
    return f->flags & (flag >> 8);
}
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
                 struct packet_type *pt, struct net_device *orig_dev)
{
    struct packet_fanout *f = pt->af_packet_priv;
    unsigned int num = READ_ONCE(f->num_members);
    struct net *net = read_pnet(&f->net);
    struct packet_sock *po;
    unsigned int idx;

    if (!net_eq(dev_net(dev), net) || !num) {
        kfree_skb(skb);
        return 0;
    }

    if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
        skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
        if (!skb)
            return 0;
    }
    switch (f->type) {
    case PACKET_FANOUT_HASH:
    default:
        idx = fanout_demux_hash(f, skb, num);
        break;
    case PACKET_FANOUT_LB:
        idx = fanout_demux_lb(f, skb, num);
        break;
    case PACKET_FANOUT_CPU:
        idx = fanout_demux_cpu(f, skb, num);
        break;
    case PACKET_FANOUT_RND:
        idx = fanout_demux_rnd(f, skb, num);
        break;
    case PACKET_FANOUT_QM:
        idx = fanout_demux_qm(f, skb, num);
        break;
    case PACKET_FANOUT_ROLLOVER:
        idx = fanout_demux_rollover(f, skb, 0, false, num);
        break;
    case PACKET_FANOUT_CBPF:
    case PACKET_FANOUT_EBPF:
        idx = fanout_demux_bpf(f, skb, num);
        break;
    }

    if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
        idx = fanout_demux_rollover(f, skb, idx, true, num);

    po = pkt_sk(f->arr[idx]);
    return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static u16 fanout_next_id;
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
    struct packet_fanout *f = po->fanout;

    spin_lock(&f->lock);
    f->arr[f->num_members] = sk;
    smp_wmb();
    f->num_members++;
    if (f->num_members == 1)
        dev_add_pack(&f->prot_hook);
    spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
    struct packet_fanout *f = po->fanout;
    int i;

    spin_lock(&f->lock);
    for (i = 0; i < f->num_members; i++) {
        if (f->arr[i] == sk)
            break;
    }
    BUG_ON(i >= f->num_members);
    f->arr[i] = f->arr[f->num_members - 1];
    f->num_members--;
    if (f->num_members == 0)
        __dev_remove_pack(&f->prot_hook);
    spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
    if (sk->sk_family != PF_PACKET)
        return false;

    return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}
static void fanout_init_data(struct packet_fanout *f)
{
    switch (f->type) {
    case PACKET_FANOUT_LB:
        atomic_set(&f->rr_cur, 0);
        break;
    case PACKET_FANOUT_CBPF:
    case PACKET_FANOUT_EBPF:
        RCU_INIT_POINTER(f->bpf_prog, NULL);
        break;
    }
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
    struct bpf_prog *old;

    spin_lock(&f->lock);
    old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
    rcu_assign_pointer(f->bpf_prog, new);
    spin_unlock(&f->lock);

    if (old) {
        synchronize_net();
        bpf_prog_destroy(old);
    }
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
                unsigned int len)
{
    struct bpf_prog *new;
    struct sock_fprog fprog;
    int ret;

    if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
        return -EPERM;
    if (len != sizeof(fprog))
        return -EINVAL;
    if (copy_from_user(&fprog, data, len))
        return -EFAULT;

    ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
    if (ret)
        return ret;

    __fanout_set_data_bpf(po->fanout, new);
    return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
                unsigned int len)
{
    struct bpf_prog *new;
    u32 fd;

    if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
        return -EPERM;
    if (len != sizeof(fd))
        return -EINVAL;
    if (copy_from_user(&fd, data, len))
        return -EFAULT;

    new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
    if (IS_ERR(new))
        return PTR_ERR(new);

    __fanout_set_data_bpf(po->fanout, new);
    return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
               unsigned int len)
{
    switch (po->fanout->type) {
    case PACKET_FANOUT_CBPF:
        return fanout_set_data_cbpf(po, data, len);
    case PACKET_FANOUT_EBPF:
        return fanout_set_data_ebpf(po, data, len);
    default:
        return -EINVAL;
    }
}

static void fanout_release_data(struct packet_fanout *f)
{
    switch (f->type) {
    case PACKET_FANOUT_CBPF:
    case PACKET_FANOUT_EBPF:
        __fanout_set_data_bpf(f, NULL);
    }
}

static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
    struct packet_fanout *f;

    list_for_each_entry(f, &fanout_list, list) {
        if (f->id == candidate_id &&
            read_pnet(&f->net) == sock_net(sk)) {
            return false;
        }
    }
    return true;
}

static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
    u16 id = fanout_next_id;

    do {
        if (__fanout_id_is_free(sk, id)) {
            *new_id = id;
            fanout_next_id = id + 1;
            return true;
        }

        id++;
    } while (id != fanout_next_id);

    return false;
}
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
    struct packet_rollover *rollover = NULL;
    struct packet_sock *po = pkt_sk(sk);
    struct packet_fanout *f, *match;
    u8 type = type_flags & 0xff;
    u8 flags = type_flags >> 8;
    int err;

    switch (type) {
    case PACKET_FANOUT_ROLLOVER:
        if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
            return -EINVAL;
    case PACKET_FANOUT_HASH:
    case PACKET_FANOUT_LB:
    case PACKET_FANOUT_CPU:
    case PACKET_FANOUT_RND:
    case PACKET_FANOUT_QM:
    case PACKET_FANOUT_CBPF:
    case PACKET_FANOUT_EBPF:
        break;
    default:
        return -EINVAL;
    }

    mutex_lock(&fanout_mutex);

    err = -EALREADY;
    if (po->fanout)
        goto out;

    if (type == PACKET_FANOUT_ROLLOVER ||
        (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
        err = -ENOMEM;
        rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
        if (!rollover)
            goto out;
        atomic_long_set(&rollover->num, 0);
        atomic_long_set(&rollover->num_huge, 0);
        atomic_long_set(&rollover->num_failed, 0);
    }

    if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
        err = -EINVAL;
        if (id != 0)
            goto out;
        if (!fanout_find_new_id(sk, &id)) {
            err = -ENOMEM;
            goto out;
        }
        /* ephemeral flag for the first socket in the group: drop it */
        flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
    }

    match = NULL;
    list_for_each_entry(f, &fanout_list, list) {
        if (f->id == id &&
            read_pnet(&f->net) == sock_net(sk)) {
            match = f;
            break;
        }
    }
    err = -EINVAL;
    if (match && match->flags != flags)
        goto out;
    if (!match) {
        err = -ENOMEM;
        match = kzalloc(sizeof(*match), GFP_KERNEL);
        if (!match)
            goto out;
        write_pnet(&match->net, sock_net(sk));
        match->id = id;
        match->type = type;
        match->flags = flags;
        INIT_LIST_HEAD(&match->list);
        spin_lock_init(&match->lock);
        refcount_set(&match->sk_ref, 0);
        fanout_init_data(match);
        match->prot_hook.type = po->prot_hook.type;
        match->prot_hook.dev = po->prot_hook.dev;
        match->prot_hook.func = packet_rcv_fanout;
        match->prot_hook.af_packet_priv = match;
        match->prot_hook.id_match = match_fanout_group;
        list_add(&match->list, &fanout_list);
    }
    err = -EINVAL;

    spin_lock(&po->bind_lock);
    if (po->running &&
        match->type == type &&
        match->prot_hook.type == po->prot_hook.type &&
        match->prot_hook.dev == po->prot_hook.dev) {
        err = -ENOSPC;
        if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
            __dev_remove_pack(&po->prot_hook);
            po->fanout = match;
            po->rollover = rollover;
            rollover = NULL;
            refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
            __fanout_link(sk, po);
            err = 0;
        }
    }
    spin_unlock(&po->bind_lock);

    if (err && !refcount_read(&match->sk_ref)) {
        list_del(&match->list);
        kfree(match);
    }

out:
    kfree(rollover);
    mutex_unlock(&fanout_mutex);
    return err;
}
/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net()).
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
    struct packet_sock *po = pkt_sk(sk);
    struct packet_fanout *f;

    mutex_lock(&fanout_mutex);
    f = po->fanout;
    if (f) {
        po->fanout = NULL;

        if (refcount_dec_and_test(&f->sk_ref))
            list_del(&f->list);
        else
            f = NULL;
    }
    mutex_unlock(&fanout_mutex);

    return f;
}

static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
                      struct sk_buff *skb)
{
    /* Earlier code assumed this would be a VLAN pkt, double-check
     * this now that we have the actual packet in hand. We can only
     * do this check on Ethernet devices.
     */
    if (unlikely(dev->type != ARPHRD_ETHER))
        return false;

    skb_reset_mac_header(skb);
    return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
               struct packet_type *pt, struct net_device *orig_dev)
{
    struct sock *sk;
    struct sockaddr_pkt *spkt;

    /*
     *	When we registered the protocol we saved the socket in the data
     *	field for just this event.
     */

    sk = pt->af_packet_priv;

    /*
     *	Yank back the headers [hope the device set this
     *	right or kerboom...]
     *
     *	Incoming packets have the ll header pulled,
     *	push it back.
     *
     *	For outgoing ones skb->data == skb_mac_header(skb)
     *	so that this procedure is a noop.
     */

    if (skb->pkt_type == PACKET_LOOPBACK)
        goto out;

    if (!net_eq(dev_net(dev), sock_net(sk)))
        goto out;

    skb = skb_share_check(skb, GFP_ATOMIC);
    if (skb == NULL)
        goto oom;

    /* drop any routing info */
    skb_dst_drop(skb);

    /* drop conntrack reference */
    nf_reset(skb);

    spkt = &PACKET_SKB_CB(skb)->sa.pkt;

    skb_push(skb, skb->data - skb_mac_header(skb));

    /*
     *	The SOCK_PACKET socket receives _all_ frames.
     */

    spkt->spkt_family = dev->type;
    strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
    spkt->spkt_protocol = skb->protocol;

    /*
     *	Charge the memory to the socket. This is done specifically
     *	to prevent sockets using all the memory up.
     */

    if (sock_queue_rcv_skb(sk, skb) == 0)
        return 0;

out:
    kfree_skb(skb);
oom:
    return 0;
}

/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame.
 */
static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
                   size_t len)
{
    struct sock *sk = sock->sk;
    DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
    struct sk_buff *skb = NULL;
    struct net_device *dev;
    struct sockcm_cookie sockc;
    __be16 proto = 0;
    int err;
    int extra_len = 0;

    /*
     *	Get and verify the address.
     */

    if (saddr) {
        if (msg->msg_namelen < sizeof(struct sockaddr))
            return -EINVAL;
        if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
            proto = saddr->spkt_protocol;
    } else
        return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

    /*
     *	Find the device first to size check it
     */

    saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
    rcu_read_lock();
    dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
    err = -ENODEV;
    if (dev == NULL)
        goto out_unlock;

    err = -ENETDOWN;
    if (!(dev->flags & IFF_UP))
        goto out_unlock;

    /*
     * You may not queue a frame bigger than the mtu. This is the lowest level
     * raw protocol and you must do your own fragmentation at this level.
     */

    if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
        if (!netif_supports_nofcs(dev)) {
            err = -EPROTONOSUPPORT;
            goto out_unlock;
        }
        extra_len = 4; /* We're doing our own CRC */
    }

    err = -EMSGSIZE;
    if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
        goto out_unlock;

    if (!skb) {
        size_t reserved = LL_RESERVED_SPACE(dev);
        int tlen = dev->needed_tailroom;
        unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

        rcu_read_unlock();
        skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
        if (skb == NULL)
            return -ENOBUFS;
        /* FIXME: Save some space for broken drivers that write a hard
         * header at transmission time by themselves. PPP is the notable
         * one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, reserved);
        skb_reset_network_header(skb);

        /* Try to align data part correctly */
        if (hhlen) {
            skb->data -= hhlen;
            skb->tail -= hhlen;
            if (len < hhlen)
                skb_reset_network_header(skb);
        }
        err = memcpy_from_msg(skb_put(skb, len), msg, len);
        if (err)
            goto out_free;
        goto retry;
    }

    if (!dev_validate_header(dev, skb->data, len)) {
        err = -EINVAL;
        goto out_unlock;
    }
    if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
        !packet_extra_vlan_len_allowed(dev, skb)) {
        err = -EMSGSIZE;
        goto out_unlock;
    }

    sockc.tsflags = sk->sk_tsflags;
    if (msg->msg_controllen) {
        err = sock_cmsg_send(sk, msg, &sockc);
        if (unlikely(err))
            goto out_unlock;
    }

    skb->protocol = proto;
    skb->dev = dev;
    skb->priority = sk->sk_priority;
    skb->mark = sk->sk_mark;

    sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);

    if (unlikely(extra_len == 4))
        skb->no_fcs = 1;

    skb_probe_transport_header(skb, 0);

    dev_queue_xmit(skb);
    rcu_read_unlock();
    return len;

out_unlock:
    rcu_read_unlock();
out_free:
    kfree_skb(skb);
    return err;
}
*skb
,
2026 const struct sock
*sk
,
2029 struct sk_filter
*filter
;
2032 filter
= rcu_dereference(sk
->sk_filter
);
2034 res
= bpf_prog_run_clear_cb(filter
->prog
, skb
);
2040 static int packet_rcv_vnet(struct msghdr
*msg
, const struct sk_buff
*skb
,
2043 struct virtio_net_hdr vnet_hdr
;
2045 if (*len
< sizeof(vnet_hdr
))
2047 *len
-= sizeof(vnet_hdr
);
2049 if (virtio_net_hdr_from_skb(skb
, &vnet_hdr
, vio_le(), true, 0))
2052 return memcpy_to_msg(msg
, (void *)&vnet_hdr
, sizeof(vnet_hdr
));
/*
 * This function does lazy skb cloning in the hope that most of the packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on exit,
 * we will not harm anyone.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
              struct packet_type *pt, struct net_device *orig_dev)
{
    struct sock *sk;
    struct sockaddr_ll *sll;
    struct packet_sock *po;
    u8 *skb_head = skb->data;
    int skb_len = skb->len;
    unsigned int snaplen, res;
    bool is_drop_n_account = false;

    if (skb->pkt_type == PACKET_LOOPBACK)
        goto drop;

    sk = pt->af_packet_priv;
    po = pkt_sk(sk);

    if (!net_eq(dev_net(dev), sock_net(sk)))
        goto drop;

    skb->dev = dev;

    if (dev->header_ops) {
        /* The device has an explicit notion of ll header,
         * exported to higher levels.
         *
         * Otherwise, the device hides details of its frame
         * structure, so that corresponding packet head is
         * never delivered to user.
         */
        if (sk->sk_type != SOCK_DGRAM)
            skb_push(skb, skb->data - skb_mac_header(skb));
        else if (skb->pkt_type == PACKET_OUTGOING) {
            /* Special case: outgoing packets have ll header at head */
            skb_pull(skb, skb_network_offset(skb));
        }
    }

    snaplen = skb->len;

    res = run_filter(skb, sk, snaplen);
    if (!res)
        goto drop_n_restore;
    if (snaplen > res)
        snaplen = res;

    if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
        goto drop_n_acct;

    if (skb_shared(skb)) {
        struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
        if (nskb == NULL)
            goto drop_n_acct;

        if (skb_head != skb->data) {
            skb->data = skb_head;
            skb->len = skb_len;
        }
        consume_skb(skb);
        skb = nskb;
    }

    sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);

    sll = &PACKET_SKB_CB(skb)->sa.ll;
    sll->sll_hatype = dev->type;
    sll->sll_pkttype = skb->pkt_type;
    if (unlikely(po->origdev))
        sll->sll_ifindex = orig_dev->ifindex;
    else
        sll->sll_ifindex = dev->ifindex;

    sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

    /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
     * Use their space for storing the original skb length.
     */
    PACKET_SKB_CB(skb)->sa.origlen = skb->len;

    if (pskb_trim(skb, snaplen))
        goto drop_n_acct;

    skb_set_owner_r(skb, sk);
    skb->dev = NULL;
    skb_dst_drop(skb);

    /* drop conntrack reference */
    nf_reset(skb);

    spin_lock(&sk->sk_receive_queue.lock);
    po->stats.stats1.tp_packets++;
    sock_skb_set_dropcount(sk, skb);
    __skb_queue_tail(&sk->sk_receive_queue, skb);
    spin_unlock(&sk->sk_receive_queue.lock);
    sk->sk_data_ready(sk);
    return 0;

drop_n_acct:
    is_drop_n_account = true;
    spin_lock(&sk->sk_receive_queue.lock);
    po->stats.stats1.tp_drops++;
    atomic_inc(&sk->sk_drops);
    spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
    if (skb_head != skb->data && skb_shared(skb)) {
        skb->data = skb_head;
        skb->len = skb_len;
    }
drop:
    if (!is_drop_n_account)
        consume_skb(skb);
    else
        kfree_skb(skb);
    return 0;
}
*skb
, struct net_device
*dev
,
2185 struct packet_type
*pt
, struct net_device
*orig_dev
)
2188 struct packet_sock
*po
;
2189 struct sockaddr_ll
*sll
;
2190 union tpacket_uhdr h
;
2191 u8
*skb_head
= skb
->data
;
2192 int skb_len
= skb
->len
;
2193 unsigned int snaplen
, res
;
2194 unsigned long status
= TP_STATUS_USER
;
2195 unsigned short macoff
, netoff
, hdrlen
;
2196 struct sk_buff
*copy_skb
= NULL
;
2199 bool is_drop_n_account
= false;
2200 bool do_vnet
= false;
2202 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2203 * We may add members to them until current aligned size without forcing
2204 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2206 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h2
)) != 32);
2207 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h3
)) != 48);
2209 if (skb
->pkt_type
== PACKET_LOOPBACK
)
2212 sk
= pt
->af_packet_priv
;
2215 if (!net_eq(dev_net(dev
), sock_net(sk
)))
2218 if (dev
->header_ops
) {
2219 if (sk
->sk_type
!= SOCK_DGRAM
)
2220 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2221 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
2222 /* Special case: outgoing packets have ll header at head */
2223 skb_pull(skb
, skb_network_offset(skb
));
2229 res
= run_filter(skb
, sk
, snaplen
);
2231 goto drop_n_restore
;
2233 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
2234 status
|= TP_STATUS_CSUMNOTREADY
;
2235 else if (skb
->pkt_type
!= PACKET_OUTGOING
&&
2236 (skb
->ip_summed
== CHECKSUM_COMPLETE
||
2237 skb_csum_unnecessary(skb
)))
2238 status
|= TP_STATUS_CSUM_VALID
;
2243 if (sk
->sk_type
== SOCK_DGRAM
) {
2244 macoff
= netoff
= TPACKET_ALIGN(po
->tp_hdrlen
) + 16 +
2247 unsigned int maclen
= skb_network_offset(skb
);
2248 netoff
= TPACKET_ALIGN(po
->tp_hdrlen
+
2249 (maclen
< 16 ? 16 : maclen
)) +
2251 if (po
->has_vnet_hdr
) {
2252 netoff
+= sizeof(struct virtio_net_hdr
);
2255 macoff
= netoff
- maclen
;
2257 if (po
->tp_version
<= TPACKET_V2
) {
2258 if (macoff
+ snaplen
> po
->rx_ring
.frame_size
) {
2259 if (po
->copy_thresh
&&
2260 atomic_read(&sk
->sk_rmem_alloc
) < sk
->sk_rcvbuf
) {
2261 if (skb_shared(skb
)) {
2262 copy_skb
= skb_clone(skb
, GFP_ATOMIC
);
2264 copy_skb
= skb_get(skb
);
2265 skb_head
= skb
->data
;
2268 skb_set_owner_r(copy_skb
, sk
);
2270 snaplen
= po
->rx_ring
.frame_size
- macoff
;
2271 if ((int)snaplen
< 0) {
2276 } else if (unlikely(macoff
+ snaplen
>
2277 GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
)) {
2280 nval
= GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
- macoff
;
2281 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2282 snaplen
, nval
, macoff
);
2284 if (unlikely((int)snaplen
< 0)) {
2286 macoff
= GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
;
2290 spin_lock(&sk
->sk_receive_queue
.lock
);
2291 h
.raw
= packet_current_rx_frame(po
, skb
,
2292 TP_STATUS_KERNEL
, (macoff
+snaplen
));
2294 goto drop_n_account
;
2295 if (po
->tp_version
<= TPACKET_V2
) {
2296 packet_increment_rx_head(po
, &po
->rx_ring
);
2298 * LOSING will be reported till you read the stats,
2299 * because it's COR - Clear On Read.
2300 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2303 if (po
->stats
.stats1
.tp_drops
)
2304 status
|= TP_STATUS_LOSING
;
2308 virtio_net_hdr_from_skb(skb
, h
.raw
+ macoff
-
2309 sizeof(struct virtio_net_hdr
),
2311 goto drop_n_account
;
2313 po
->stats
.stats1
.tp_packets
++;
2315 status
|= TP_STATUS_COPY
;
2316 __skb_queue_tail(&sk
->sk_receive_queue
, copy_skb
);
2318 spin_unlock(&sk
->sk_receive_queue
.lock
);
2320 skb_copy_bits(skb
, 0, h
.raw
+ macoff
, snaplen
);
2322 if (!(ts_status
= tpacket_get_timestamp(skb
, &ts
, po
->tp_tstamp
)))
2323 getnstimeofday(&ts
);
2325 status
|= ts_status
;
2327 switch (po
->tp_version
) {
2329 h
.h1
->tp_len
= skb
->len
;
2330 h
.h1
->tp_snaplen
= snaplen
;
2331 h
.h1
->tp_mac
= macoff
;
2332 h
.h1
->tp_net
= netoff
;
2333 h
.h1
->tp_sec
= ts
.tv_sec
;
2334 h
.h1
->tp_usec
= ts
.tv_nsec
/ NSEC_PER_USEC
;
2335 hdrlen
= sizeof(*h
.h1
);
2338 h
.h2
->tp_len
= skb
->len
;
2339 h
.h2
->tp_snaplen
= snaplen
;
2340 h
.h2
->tp_mac
= macoff
;
2341 h
.h2
->tp_net
= netoff
;
2342 h
.h2
->tp_sec
= ts
.tv_sec
;
2343 h
.h2
->tp_nsec
= ts
.tv_nsec
;
2344 if (skb_vlan_tag_present(skb
)) {
2345 h
.h2
->tp_vlan_tci
= skb_vlan_tag_get(skb
);
2346 h
.h2
->tp_vlan_tpid
= ntohs(skb
->vlan_proto
);
2347 status
|= TP_STATUS_VLAN_VALID
| TP_STATUS_VLAN_TPID_VALID
;
2349 h
.h2
->tp_vlan_tci
= 0;
2350 h
.h2
->tp_vlan_tpid
= 0;
2352 memset(h
.h2
->tp_padding
, 0, sizeof(h
.h2
->tp_padding
));
2353 hdrlen
= sizeof(*h
.h2
);
2356 /* tp_nxt_offset,vlan are already populated above.
2357 * So DONT clear those fields here
2359 h
.h3
->tp_status
|= status
;
2360 h
.h3
->tp_len
= skb
->len
;
2361 h
.h3
->tp_snaplen
= snaplen
;
2362 h
.h3
->tp_mac
= macoff
;
2363 h
.h3
->tp_net
= netoff
;
2364 h
.h3
->tp_sec
= ts
.tv_sec
;
2365 h
.h3
->tp_nsec
= ts
.tv_nsec
;
2366 memset(h
.h3
->tp_padding
, 0, sizeof(h
.h3
->tp_padding
));
2367 hdrlen
= sizeof(*h
.h3
);
2373 sll
= h
.raw
+ TPACKET_ALIGN(hdrlen
);
2374 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
2375 sll
->sll_family
= AF_PACKET
;
2376 sll
->sll_hatype
= dev
->type
;
2377 sll
->sll_protocol
= skb
->protocol
;
2378 sll
->sll_pkttype
= skb
->pkt_type
;
2379 if (unlikely(po
->origdev
))
2380 sll
->sll_ifindex
= orig_dev
->ifindex
;
2382 sll
->sll_ifindex
= dev
->ifindex
;
2386 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2387 if (po
->tp_version
<= TPACKET_V2
) {
2390 end
= (u8
*) PAGE_ALIGN((unsigned long) h
.raw
+
2393 for (start
= h
.raw
; start
< end
; start
+= PAGE_SIZE
)
2394 flush_dcache_page(pgv_to_page(start
));
2399 if (po
->tp_version
<= TPACKET_V2
) {
2400 __packet_set_status(po
, h
.raw
, status
);
2401 sk
->sk_data_ready(sk
);
2403 prb_clear_blk_fill_status(&po
->rx_ring
);
2407 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
2408 skb
->data
= skb_head
;
2412 if (!is_drop_n_account
)
2419 is_drop_n_account
= true;
2420 po
->stats
.stats1
.tp_drops
++;
2421 spin_unlock(&sk
->sk_receive_queue
.lock
);
2423 sk
->sk_data_ready(sk
);
2424 kfree_skb(copy_skb
);
2425 goto drop_n_restore
;
2428 static void tpacket_destruct_skb(struct sk_buff
*skb
)
2430 struct packet_sock
*po
= pkt_sk(skb
->sk
);
2432 if (likely(po
->tx_ring
.pg_vec
)) {
2436 ph
= skb_zcopy_get_nouarg(skb
);
2437 packet_dec_pending(&po
->tx_ring
);
2439 ts
= __packet_set_timestamp(po
, ph
, skb
);
2440 __packet_set_status(po
, ph
, TP_STATUS_AVAILABLE
| ts
);
2446 static void tpacket_set_protocol(const struct net_device
*dev
,
2447 struct sk_buff
*skb
)
2449 if (dev
->type
== ARPHRD_ETHER
) {
2450 skb_reset_mac_header(skb
);
2451 skb
->protocol
= eth_hdr(skb
)->h_proto
;
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
    if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
        (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
         __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
         __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
        vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
             __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
             __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);

    if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
        return -EINVAL;

    return 0;
}

static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
                 struct virtio_net_hdr *vnet_hdr)
{
    if (*len < sizeof(*vnet_hdr))
        return -EINVAL;
    *len -= sizeof(*vnet_hdr);

    if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
        return -EFAULT;

    return __packet_snd_vnet_parse(vnet_hdr, *len);
}
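
/* tpacket_fill_skb() below builds an skb directly over a TX ring frame: the
 * link-layer header (if any) is copied into the linear area, and the
 * remaining payload is attached page by page from the ring buffer, so no
 * extra copy of the packet data is made.
 */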
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
        void *frame, struct net_device *dev, void *data, int tp_len,
        __be16 proto, unsigned char *addr, int hlen, int copylen,
        const struct sockcm_cookie *sockc)
{
    union tpacket_uhdr ph;
    int to_write, offset, len, nr_frags, len_max;
    struct socket *sock = po->sk.sk_socket;
    struct page *page;
    int err;

    ph.raw = frame;

    skb->protocol = proto;
    skb->dev = dev;
    skb->priority = po->sk.sk_priority;
    skb->mark = po->sk.sk_mark;
    sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
    skb_zcopy_set_nouarg(skb, ph.raw);

    skb_reserve(skb, hlen);
    skb_reset_network_header(skb);

    to_write = tp_len;

    if (sock->type == SOCK_DGRAM) {
        err = dev_hard_header(skb, dev, ntohs(proto), addr,
                      NULL, tp_len);
        if (unlikely(err < 0))
            return -EINVAL;
    } else if (copylen) {
        int hdrlen = min_t(int, copylen, tp_len);

        skb_push(skb, dev->hard_header_len);
        skb_put(skb, copylen - dev->hard_header_len);
        err = skb_store_bits(skb, 0, data, hdrlen);
        if (unlikely(err))
            return err;
        if (!dev_validate_header(dev, skb->data, hdrlen))
            return -EINVAL;
        if (!skb->protocol)
            tpacket_set_protocol(dev, skb);

        data += hdrlen;
        to_write -= hdrlen;
    }

    offset = offset_in_page(data);
    len_max = PAGE_SIZE - offset;
    len = ((to_write > len_max) ? len_max : to_write);

    skb->data_len = to_write;
    skb->len += to_write;
    skb->truesize += to_write;
    refcount_add(to_write, &po->sk.sk_wmem_alloc);

    while (likely(to_write)) {
        nr_frags = skb_shinfo(skb)->nr_frags;

        if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
            pr_err("Packet exceed the number of skb frags(%lu)\n",
                   MAX_SKB_FRAGS);
            return -EFAULT;
        }

        page = pgv_to_page(data);
        data += len;
        flush_dcache_page(page);
        get_page(page);
        skb_fill_page_desc(skb, nr_frags, page, offset, len);
        to_write -= len;
        offset = 0;
        len_max = PAGE_SIZE;
        len = ((to_write > len_max) ? len_max : to_write);
    }

    skb_probe_transport_header(skb, 0);

    return tp_len;
}
static int tpacket_parse_header(struct packet_sock *po, void *frame,
                int size_max, void **data)
{
    union tpacket_uhdr ph;
    int tp_len, off;

    ph.raw = frame;

    switch (po->tp_version) {
    case TPACKET_V3:
        if (ph.h3->tp_next_offset != 0) {
            pr_warn_once("variable sized slot not supported");
            return -EINVAL;
        }
        tp_len = ph.h3->tp_len;
        break;
    case TPACKET_V2:
        tp_len = ph.h2->tp_len;
        break;
    default:
        tp_len = ph.h1->tp_len;
        break;
    }
    if (unlikely(tp_len > size_max)) {
        pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
        return -EMSGSIZE;
    }

    if (unlikely(po->tp_tx_has_off)) {
        int off_min, off_max;

        off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
        off_max = po->tx_ring.frame_size - tp_len;
        if (po->sk.sk_type == SOCK_DGRAM) {
            switch (po->tp_version) {
            case TPACKET_V3:
                off = ph.h3->tp_net;
                break;
            case TPACKET_V2:
                off = ph.h2->tp_net;
                break;
            default:
                off = ph.h1->tp_net;
                break;
            }
        } else {
            switch (po->tp_version) {
            case TPACKET_V3:
                off = ph.h3->tp_mac;
                break;
            case TPACKET_V2:
                off = ph.h2->tp_mac;
                break;
            default:
                off = ph.h1->tp_mac;
                break;
            }
        }
        if (unlikely((off < off_min) || (off_max < off)))
            return -EINVAL;
    } else {
        off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
    }

    *data = frame + off;
    return tp_len;
}
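
/* tpacket_snd() below is the sendmsg() path used once a TX ring is attached:
 * it walks the ring, turning every frame marked TP_STATUS_SEND_REQUEST into
 * an skb via tpacket_fill_skb() and transmitting it through po->xmit,
 * updating the frame status as each frame completes or fails.
 */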
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
    struct sk_buff *skb;
    struct net_device *dev;
    struct virtio_net_hdr *vnet_hdr = NULL;
    struct sockcm_cookie sockc;
    __be16 proto;
    int err, reserve = 0;
    void *ph;
    DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
    bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
    int tp_len, size_max;
    unsigned char *addr;
    void *data;
    int len_sum = 0;
    int status = TP_STATUS_AVAILABLE;
    int hlen, tlen, copylen = 0;

    mutex_lock(&po->pg_vec_lock);

    if (likely(saddr == NULL)) {
        dev = packet_cached_dev_get(po);
        proto = po->num;
        addr = NULL;
    } else {
        err = -EINVAL;
        if (msg->msg_namelen < sizeof(struct sockaddr_ll))
            goto out;
        if (msg->msg_namelen < (saddr->sll_halen
                    + offsetof(struct sockaddr_ll,
                           sll_addr)))
            goto out;
        proto = saddr->sll_protocol;
        addr = saddr->sll_halen ? saddr->sll_addr : NULL;
        dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
        if (addr && dev && saddr->sll_halen < dev->addr_len)
            goto out_put;
    }

    err = -ENXIO;
    if (unlikely(dev == NULL))
        goto out;
    err = -ENETDOWN;
    if (unlikely(!(dev->flags & IFF_UP)))
        goto out_put;

    sockc.tsflags = po->sk.sk_tsflags;
    if (msg->msg_controllen) {
        err = sock_cmsg_send(&po->sk, msg, &sockc);
        if (unlikely(err))
            goto out_put;
    }

    if (po->sk.sk_socket->type == SOCK_RAW)
        reserve = dev->hard_header_len;
    size_max = po->tx_ring.frame_size
        - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

    if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
        size_max = dev->mtu + reserve + VLAN_HLEN;

    do {
        ph = packet_current_frame(po, &po->tx_ring,
                      TP_STATUS_SEND_REQUEST);
        if (unlikely(ph == NULL)) {
            if (need_wait && need_resched())
                schedule();
            continue;
        }

        skb = NULL;
        tp_len = tpacket_parse_header(po, ph, size_max, &data);
        if (tp_len < 0)
            goto tpacket_error;

        status = TP_STATUS_SEND_REQUEST;
        hlen = LL_RESERVED_SPACE(dev);
        tlen = dev->needed_tailroom;
        if (po->has_vnet_hdr) {
            vnet_hdr = data;
            data += sizeof(*vnet_hdr);
            tp_len -= sizeof(*vnet_hdr);
            if (tp_len < 0 ||
                __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
                tp_len = -EINVAL;
                goto tpacket_error;
            }
            copylen = __virtio16_to_cpu(vio_le(),
                            vnet_hdr->hdr_len);
        }
        copylen = max_t(int, copylen, dev->hard_header_len);
        skb = sock_alloc_send_skb(&po->sk,
                hlen + tlen + sizeof(struct sockaddr_ll) +
                (copylen - dev->hard_header_len),
                !need_wait, &err);

        if (unlikely(skb == NULL)) {
            /* we assume the socket was initially writeable ... */
            if (likely(len_sum > 0))
                err = len_sum;
            goto out_status;
        }
        tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
                      addr, hlen, copylen, &sockc);
        if (likely(tp_len >= 0) &&
            tp_len > dev->mtu + reserve &&
            !po->has_vnet_hdr &&
            !packet_extra_vlan_len_allowed(dev, skb))
            tp_len = -EMSGSIZE;

        if (unlikely(tp_len < 0)) {
tpacket_error:
            if (po->tp_loss) {
                __packet_set_status(po, ph,
                        TP_STATUS_AVAILABLE);
                packet_increment_head(&po->tx_ring);
                kfree_skb(skb);
                continue;
            } else {
                status = TP_STATUS_WRONG_FORMAT;
                err = tp_len;
                goto out_status;
            }
        }

        if (po->has_vnet_hdr) {
            if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
                tp_len = -EINVAL;
                goto tpacket_error;
            }
            virtio_net_hdr_set_proto(skb, vnet_hdr);
        }

        skb->destructor = tpacket_destruct_skb;
        __packet_set_status(po, ph, TP_STATUS_SENDING);
        packet_inc_pending(&po->tx_ring);

        status = TP_STATUS_SEND_REQUEST;
        err = po->xmit(skb);
        if (unlikely(err > 0)) {
            err = net_xmit_errno(err);
            if (err && __packet_get_status(po, ph) ==
                   TP_STATUS_AVAILABLE) {
                /* skb was destructed already */
                skb = NULL;
                goto out_status;
            }
            /*
             * skb was dropped but not destructed yet;
             * let's treat it like congestion or err < 0
             */
            err = 0;
        }
        packet_increment_head(&po->tx_ring);
        len_sum += tp_len;
    } while (likely((ph != NULL) ||
        /* Note: packet_read_pending() might be slow if we have
         * to call it as it's per_cpu variable, but in fast-path
         * we already short-circuit the loop with the first
         * condition, and luckily don't have to go that path
         * anymore.
         */
         (need_wait && packet_read_pending(&po->tx_ring))));

    err = len_sum;
    goto out_put;

out_status:
    __packet_set_status(po, ph, status);
    kfree_skb(skb);
out_put:
    dev_put(dev);
out:
    mutex_unlock(&po->pg_vec_lock);
    return err;
}
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
                    size_t reserve, size_t len,
                    size_t linear, int noblock,
                    int *err)
{
    struct sk_buff *skb;

    /* Under a page? Don't bother with paged skb. */
    if (prepad + len < PAGE_SIZE || !linear)
        linear = len;

    skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
                   err, 0);
    if (!skb)
        return NULL;

    skb_reserve(skb, reserve);
    skb_put(skb, linear);
    skb->data_len = len - linear;
    skb->len += len - linear;

    return skb;
}
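
/* packet_snd() below is the ordinary (non-ring) transmit path: the frame is
 * copied from the msghdr iterator into a freshly allocated skb, an optional
 * virtio_net header is parsed first, and the result is handed to po->xmit.
 */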
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
    struct sock *sk = sock->sk;
    DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
    struct sk_buff *skb;
    struct net_device *dev;
    __be16 proto;
    unsigned char *addr;
    int err, reserve = 0;
    struct sockcm_cookie sockc;
    struct virtio_net_hdr vnet_hdr = { 0 };
    int offset = 0;
    struct packet_sock *po = pkt_sk(sk);
    bool has_vnet_hdr = false;
    int hlen, tlen, linear;
    int extra_len = 0;

    /*
     *	Get and verify the address.
     */

    if (likely(saddr == NULL)) {
        dev = packet_cached_dev_get(po);
        proto = po->num;
        addr = NULL;
    } else {
        err = -EINVAL;
        if (msg->msg_namelen < sizeof(struct sockaddr_ll))
            goto out;
        if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
            goto out;
        proto = saddr->sll_protocol;
        addr = saddr->sll_halen ? saddr->sll_addr : NULL;
        dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
        if (addr && dev && saddr->sll_halen < dev->addr_len)
            goto out_unlock;
    }

    err = -ENXIO;
    if (unlikely(dev == NULL))
        goto out_unlock;
    err = -ENETDOWN;
    if (unlikely(!(dev->flags & IFF_UP)))
        goto out_unlock;

    sockc.tsflags = sk->sk_tsflags;
    sockc.mark = sk->sk_mark;
    if (msg->msg_controllen) {
        err = sock_cmsg_send(sk, msg, &sockc);
        if (unlikely(err))
            goto out_unlock;
    }

    if (sock->type == SOCK_RAW)
        reserve = dev->hard_header_len;
    if (po->has_vnet_hdr) {
        err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
        if (err)
            goto out_unlock;
        has_vnet_hdr = true;
    }

    if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
        if (!netif_supports_nofcs(dev)) {
            err = -EPROTONOSUPPORT;
            goto out_unlock;
        }
        extra_len = 4; /* We're doing our own CRC */
    }

    err = -EMSGSIZE;
    if (!vnet_hdr.gso_type &&
        (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
        goto out_unlock;

    err = -ENOBUFS;
    hlen = LL_RESERVED_SPACE(dev);
    tlen = dev->needed_tailroom;
    linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
    linear = max(linear, min_t(int, len, dev->hard_header_len));
    skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
                   msg->msg_flags & MSG_DONTWAIT, &err);
    if (skb == NULL)
        goto out_unlock;

    skb_reset_network_header(skb);

    err = -EINVAL;
    if (sock->type == SOCK_DGRAM) {
        offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
        if (unlikely(offset < 0))
            goto out_free;
    } else if (reserve) {
        skb_reserve(skb, -reserve);
        skb_reset_network_header(skb);
    }

    /* Returns -EFAULT on error */
    err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
    if (err)
        goto out_free;

    if (sock->type == SOCK_RAW &&
        !dev_validate_header(dev, skb->data, len)) {
        err = -EINVAL;
        goto out_free;
    }

    sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);

    if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
        !packet_extra_vlan_len_allowed(dev, skb)) {
        err = -EMSGSIZE;
        goto out_free;
    }

    skb->protocol = proto;
    skb->dev = dev;
    skb->priority = sk->sk_priority;
    skb->mark = sockc.mark;

    if (has_vnet_hdr) {
        err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
        if (err)
            goto out_free;
        len += sizeof(vnet_hdr);
        virtio_net_hdr_set_proto(skb, &vnet_hdr);
    }

    skb_probe_transport_header(skb, reserve);

    if (unlikely(extra_len == 4))
        skb->no_fcs = 1;

    err = po->xmit(skb);
    if (err > 0 && (err = net_xmit_errno(err)) != 0)
        goto out_unlock;

    dev_put(dev);

    return len;

out_free:
    kfree_skb(skb);
out_unlock:
    if (dev)
        dev_put(dev);
out:
    return err;
}
static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);

    if (po->tx_ring.pg_vec)
        return tpacket_snd(po, msg);

    return packet_snd(sock, msg, len);
}
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
    struct sock *sk = sock->sk;
    struct packet_sock *po;
    struct packet_fanout *f;
    struct net *net;
    union tpacket_req_u req_u;

    if (!sk)
        return 0;

    net = sock_net(sk);
    po = pkt_sk(sk);

    mutex_lock(&net->packet.sklist_lock);
    sk_del_node_init_rcu(sk);
    mutex_unlock(&net->packet.sklist_lock);

    sock_prot_inuse_add(net, sk->sk_prot, -1);

    spin_lock(&po->bind_lock);
    unregister_prot_hook(sk, false);
    packet_cached_dev_reset(po);

    if (po->prot_hook.dev) {
        dev_put(po->prot_hook.dev);
        po->prot_hook.dev = NULL;
    }
    spin_unlock(&po->bind_lock);

    packet_flush_mclist(sk);

    if (po->rx_ring.pg_vec) {
        memset(&req_u, 0, sizeof(req_u));
        packet_set_ring(sk, &req_u, 1, 0);
    }

    if (po->tx_ring.pg_vec) {
        memset(&req_u, 0, sizeof(req_u));
        packet_set_ring(sk, &req_u, 1, 1);
    }

    f = fanout_release(sk);

    synchronize_net();

    if (f) {
        kfree(po->rollover);
        fanout_release_data(f);
        kfree(f);
    }

    /*
     *	Now the socket is dead. No more input will appear.
     */
    sock_orphan(sk);
    sock->sk = NULL;

    /* Purge queues */

    skb_queue_purge(&sk->sk_receive_queue);
    packet_free_pending(po);
    sk_refcnt_debug_release(sk);

    sock_put(sk);
    return 0;
}
/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
              __be16 proto)
{
    struct packet_sock *po = pkt_sk(sk);
    struct net_device *dev_curr;
    __be16 proto_curr;
    bool need_rehook;
    struct net_device *dev = NULL;
    int ret = 0;
    bool unlisted = false;

    spin_lock(&po->bind_lock);
    rcu_read_lock();

    if (name) {
        dev = dev_get_by_name_rcu(sock_net(sk), name);
        if (!dev) {
            ret = -ENODEV;
            goto out_unlock;
        }
    } else if (ifindex) {
        dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
        if (!dev) {
            ret = -ENODEV;
            goto out_unlock;
        }
    }

    if (dev)
        dev_hold(dev);

    proto_curr = po->prot_hook.type;
    dev_curr = po->prot_hook.dev;

    need_rehook = proto_curr != proto || dev_curr != dev;

    if (need_rehook) {
        if (po->running) {
            rcu_read_unlock();
            /* prevents packet_notifier() from calling
             * register_prot_hook()
             */
            po->num = 0;
            __unregister_prot_hook(sk, true);
            rcu_read_lock();
            dev_curr = po->prot_hook.dev;
            if (dev)
                unlisted = !dev_get_by_index_rcu(sock_net(sk),
                                 dev->ifindex);
        }

        BUG_ON(po->running);
        po->num = proto;
        po->prot_hook.type = proto;

        if (unlikely(unlisted)) {
            dev_put(dev);
            po->prot_hook.dev = NULL;
            po->ifindex = -1;
            packet_cached_dev_reset(po);
        } else {
            po->prot_hook.dev = dev;
            po->ifindex = dev ? dev->ifindex : 0;
            packet_cached_dev_assign(po, dev);
        }
    }
    if (dev_curr)
        dev_put(dev_curr);

    if (proto == 0 || !need_rehook)
        goto out_unlock;

    if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
        register_prot_hook(sk);
    } else {
        sk->sk_err = ENETDOWN;
        if (!sock_flag(sk, SOCK_DEAD))
            sk->sk_error_report(sk);
    }

out_unlock:
    rcu_read_unlock();
    spin_unlock(&po->bind_lock);
    return ret;
}
/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                int addr_len)
{
    struct sock *sk = sock->sk;
    char name[sizeof(uaddr->sa_data) + 1];

    if (addr_len != sizeof(struct sockaddr))
        return -EINVAL;
    /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
     * zero-terminated.
     */
    memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
    name[sizeof(uaddr->sa_data)] = 0;

    return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
    struct sock *sk = sock->sk;

    if (addr_len < sizeof(struct sockaddr_ll))
        return -EINVAL;
    if (sll->sll_family != AF_PACKET)
        return -EINVAL;

    return packet_do_bind(sk, NULL, sll->sll_ifindex,
                  sll->sll_protocol ? : pkt_sk(sk)->num);
}
static struct proto packet_proto = {
    .name	  = "PACKET",
    .owner	  = THIS_MODULE,
    .obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
             int kern)
{
    struct sock *sk;
    struct packet_sock *po;
    __be16 proto = (__force __be16)protocol; /* weird, but documented */
    int err;

    if (!ns_capable(net->user_ns, CAP_NET_RAW))
        return -EPERM;
    if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
        sock->type != SOCK_PACKET)
        return -ESOCKTNOSUPPORT;

    sock->state = SS_UNCONNECTED;

    err = -ENOBUFS;
    sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
    if (sk == NULL)
        goto out;

    sock->ops = &packet_ops;
    if (sock->type == SOCK_PACKET)
        sock->ops = &packet_ops_spkt;

    sock_init_data(sock, sk);

    po = pkt_sk(sk);
    sk->sk_family = PF_PACKET;
    po->num = proto;
    po->xmit = dev_queue_xmit;

    err = packet_alloc_pending(po);
    if (err)
        goto out2;

    packet_cached_dev_reset(po);

    sk->sk_destruct = packet_sock_destruct;
    sk_refcnt_debug_inc(sk);

    /*
     *	Attach a protocol block
     */

    spin_lock_init(&po->bind_lock);
    mutex_init(&po->pg_vec_lock);
    po->rollover = NULL;
    po->prot_hook.func = packet_rcv;

    if (sock->type == SOCK_PACKET)
        po->prot_hook.func = packet_rcv_spkt;

    po->prot_hook.af_packet_priv = sk;

    if (proto) {
        po->prot_hook.type = proto;
        __register_prot_hook(sk);
    }

    mutex_lock(&net->packet.sklist_lock);
    sk_add_node_rcu(sk, &net->packet.sklist);
    mutex_unlock(&net->packet.sklist_lock);

    sock_prot_inuse_add(net, &packet_proto, 1);

    return 0;
out2:
    sk_free(sk);
out:
    return err;
}
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
              int flags)
{
    struct sock *sk = sock->sk;
    struct sk_buff *skb;
    int copied, err;
    int vnet_hdr_len = 0;
    unsigned int origlen = 0;

    err = -EINVAL;
    if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
        goto out;

#if 0
    /* What error should we return now? EUNATTACH? */
    if (pkt_sk(sk)->ifindex < 0)
        return -ENODEV;
#endif

    if (flags & MSG_ERRQUEUE) {
        err = sock_recv_errqueue(sk, msg, len,
                     SOL_PACKET, PACKET_TX_TIMESTAMP);
        goto out;
    }

    /*
     *	Call the generic datagram receiver. This handles all sorts
     *	of horrible races and re-entrancy so we can forget about it
     *	in the protocol layers.
     *
     *	Now it will return ENETDOWN, if device have just gone down,
     *	but then it will block.
     */

    skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

    /*
     *	An error occurred so return it. Because skb_recv_datagram()
     *	handles the blocking we don't see and worry about blocking
     *	retries.
     */

    if (skb == NULL)
        goto out;

    if (pkt_sk(sk)->pressure)
        packet_rcv_has_room(pkt_sk(sk), NULL);

    if (pkt_sk(sk)->has_vnet_hdr) {
        err = packet_rcv_vnet(msg, skb, &len);
        if (err)
            goto out_free;
        vnet_hdr_len = sizeof(struct virtio_net_hdr);
    }

    /* You lose any data beyond the buffer you gave. If it worries
     * a user program they can ask the device for its MTU
     * anyway.
     */
    copied = skb->len;
    if (copied > len) {
        copied = len;
        msg->msg_flags |= MSG_TRUNC;
    }

    err = skb_copy_datagram_msg(skb, 0, msg, copied);
    if (err)
        goto out_free;

    if (sock->type != SOCK_PACKET) {
        struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

        /* Original length was stored in sockaddr_ll fields */
        origlen = PACKET_SKB_CB(skb)->sa.origlen;
        sll->sll_family = AF_PACKET;
        sll->sll_protocol = skb->protocol;
    }

    sock_recv_ts_and_drops(msg, sk, skb);

    if (msg->msg_name) {
        /* If the address length field is there to be filled
         * in, we fill it in now.
         */
        if (sock->type == SOCK_PACKET) {
            __sockaddr_check_size(sizeof(struct sockaddr_pkt));
            msg->msg_namelen = sizeof(struct sockaddr_pkt);
        } else {
            struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

            msg->msg_namelen = sll->sll_halen +
                offsetof(struct sockaddr_ll, sll_addr);
        }
        memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
               msg->msg_namelen);
    }

    if (pkt_sk(sk)->auxdata) {
        struct tpacket_auxdata aux;

        aux.tp_status = TP_STATUS_USER;
        if (skb->ip_summed == CHECKSUM_PARTIAL)
            aux.tp_status |= TP_STATUS_CSUMNOTREADY;
        else if (skb->pkt_type != PACKET_OUTGOING &&
             (skb->ip_summed == CHECKSUM_COMPLETE ||
              skb_csum_unnecessary(skb)))
            aux.tp_status |= TP_STATUS_CSUM_VALID;

        aux.tp_len = origlen;
        aux.tp_snaplen = skb->len;
        aux.tp_mac = 0;
        aux.tp_net = skb_network_offset(skb);
        if (skb_vlan_tag_present(skb)) {
            aux.tp_vlan_tci = skb_vlan_tag_get(skb);
            aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
            aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
        } else {
            aux.tp_vlan_tci = 0;
            aux.tp_vlan_tpid = 0;
        }
        put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
    }

    /*
     *	Free or return the buffer as appropriate. Again this
     *	hides all the races and re-entrancy issues from us.
     */
    err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
    skb_free_datagram(sk, skb);
out:
    return err;
}
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                   int *uaddr_len, int peer)
{
    struct net_device *dev;
    struct sock *sk = sock->sk;

    if (peer)
        return -EOPNOTSUPP;

    uaddr->sa_family = AF_PACKET;
    memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
    rcu_read_lock();
    dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
    if (dev)
        strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
    rcu_read_unlock();
    *uaddr_len = sizeof(*uaddr);

    return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
              int *uaddr_len, int peer)
{
    struct net_device *dev;
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

    if (peer)
        return -EOPNOTSUPP;

    sll->sll_family = AF_PACKET;
    sll->sll_ifindex = po->ifindex;
    sll->sll_protocol = po->num;
    sll->sll_pkttype = 0;
    rcu_read_lock();
    dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
    if (dev) {
        sll->sll_hatype = dev->type;
        sll->sll_halen = dev->addr_len;
        memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
    } else {
        sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
        sll->sll_halen = 0;
    }
    rcu_read_unlock();
    *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

    return 0;
}
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
             int what)
{
    switch (i->type) {
    case PACKET_MR_MULTICAST:
        if (i->alen != dev->addr_len)
            return -EINVAL;
        if (what > 0)
            return dev_mc_add(dev, i->addr);
        else
            return dev_mc_del(dev, i->addr);
        break;
    case PACKET_MR_PROMISC:
        return dev_set_promiscuity(dev, what);
    case PACKET_MR_ALLMULTI:
        return dev_set_allmulti(dev, what);
    case PACKET_MR_UNICAST:
        if (i->alen != dev->addr_len)
            return -EINVAL;
        if (what > 0)
            return dev_uc_add(dev, i->addr);
        else
            return dev_uc_del(dev, i->addr);
        break;
    default:
        break;
    }
    return 0;
}

static void packet_dev_mclist_delete(struct net_device *dev,
                     struct packet_mclist **mlp)
{
    struct packet_mclist *ml;

    while ((ml = *mlp) != NULL) {
        if (ml->ifindex == dev->ifindex) {
            packet_dev_mc(dev, ml, -1);
            *mlp = ml->next;
            kfree(ml);
        } else
            mlp = &ml->next;
    }
}
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
    struct packet_sock *po = pkt_sk(sk);
    struct packet_mclist *ml, *i;
    struct net_device *dev;
    int err;

    rtnl_lock();

    err = -ENODEV;
    dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
    if (!dev)
        goto done;

    err = -EINVAL;
    if (mreq->mr_alen > dev->addr_len)
        goto done;

    err = -ENOBUFS;
    i = kmalloc(sizeof(*i), GFP_KERNEL);
    if (i == NULL)
        goto done;

    err = 0;
    for (ml = po->mclist; ml; ml = ml->next) {
        if (ml->ifindex == mreq->mr_ifindex &&
            ml->type == mreq->mr_type &&
            ml->alen == mreq->mr_alen &&
            memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
            ml->count++;
            /* Free the new element ... */
            kfree(i);
            goto done;
        }
    }

    i->type = mreq->mr_type;
    i->ifindex = mreq->mr_ifindex;
    i->alen = mreq->mr_alen;
    memcpy(i->addr, mreq->mr_address, i->alen);
    memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
    i->count = 1;
    i->next = po->mclist;
    po->mclist = i;
    err = packet_dev_mc(dev, i, 1);
    if (err) {
        po->mclist = i->next;
        kfree(i);
    }

done:
    rtnl_unlock();
    return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
    struct packet_mclist *ml, **mlp;

    rtnl_lock();

    for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
        if (ml->ifindex == mreq->mr_ifindex &&
            ml->type == mreq->mr_type &&
            ml->alen == mreq->mr_alen &&
            memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
            if (--ml->count == 0) {
                struct net_device *dev;

                *mlp = ml->next;
                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                if (dev)
                    packet_dev_mc(dev, ml, -1);
                kfree(ml);
            }
            break;
        }
    }

    rtnl_unlock();
    return 0;
}

static void packet_flush_mclist(struct sock *sk)
{
    struct packet_sock *po = pkt_sk(sk);
    struct packet_mclist *ml;

    if (!po->mclist)
        return;

    rtnl_lock();
    while ((ml = po->mclist) != NULL) {
        struct net_device *dev;

        po->mclist = ml->next;
        dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
        if (dev != NULL)
            packet_dev_mc(dev, ml, -1);
        kfree(ml);
    }
    rtnl_unlock();
}
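
/* Socket option handling follows.  The ring-related options
 * (PACKET_RX_RING, PACKET_TX_RING, PACKET_VERSION, PACKET_RESERVE,
 * PACKET_VNET_HDR, PACKET_TX_HAS_OFF) may only be changed while no ring is
 * attached; the remaining options toggle per-socket flags or counters.
 */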
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);

    if (level != SOL_PACKET)
        return -ENOPROTOOPT;

    switch (optname) {
    case PACKET_ADD_MEMBERSHIP:
    case PACKET_DROP_MEMBERSHIP:
    {
        struct packet_mreq_max mreq;
        int len = optlen;
        int ret;

        memset(&mreq, 0, sizeof(mreq));
        if (len < sizeof(struct packet_mreq))
            return -EINVAL;
        if (len > sizeof(mreq))
            len = sizeof(mreq);
        if (copy_from_user(&mreq, optval, len))
            return -EFAULT;
        if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
            return -EINVAL;
        if (optname == PACKET_ADD_MEMBERSHIP)
            ret = packet_mc_add(sk, &mreq);
        else
            ret = packet_mc_drop(sk, &mreq);
        return ret;
    }

    case PACKET_RX_RING:
    case PACKET_TX_RING:
    {
        union tpacket_req_u req_u;
        int len, ret;

        switch (po->tp_version) {
        case TPACKET_V1:
        case TPACKET_V2:
            len = sizeof(req_u.req);
            break;
        case TPACKET_V3:
        default:
            len = sizeof(req_u.req3);
            break;
        }
        if (optlen < len)
            return -EINVAL;
        if (copy_from_user(&req_u.req, optval, len))
            return -EFAULT;
        ret = packet_set_ring(sk, &req_u, 0,
                      optname == PACKET_TX_RING);
        return ret;
    }
    case PACKET_COPY_THRESH:
    {
        int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        pkt_sk(sk)->copy_thresh = val;
        return 0;
    }
    case PACKET_VERSION:
    {
        int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;
        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            return -EBUSY;
        } else {
            po->tp_version = val;
            return 0;
        }
    }
    case PACKET_RESERVE:
    {
        unsigned int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;
        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            return -EBUSY;
        } else {
            po->tp_reserve = val;
            return 0;
        }
    }
    case PACKET_LOSS:
    {
        unsigned int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            return -EBUSY;
        } else {
            po->tp_loss = !!val;
            return 0;
        }
    }
    case PACKET_AUXDATA:
    {
        int val;

        if (optlen < sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        po->auxdata = !!val;
        return 0;
    }
    case PACKET_ORIGDEV:
    {
        int val;

        if (optlen < sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        po->origdev = !!val;
        return 0;
    }
    case PACKET_VNET_HDR:
    {
        int val;

        if (sock->type != SOCK_RAW)
            return -EINVAL;
        if (optlen < sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            return -EBUSY;
        } else {
            po->has_vnet_hdr = !!val;
            return 0;
        }
    }
    case PACKET_TIMESTAMP:
    {
        int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        po->tp_tstamp = val;
        return 0;
    }
    case PACKET_FANOUT:
    {
        int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        return fanout_add(sk, val & 0xffff, val >> 16);
    }
    case PACKET_FANOUT_DATA:
    {
        if (!po->fanout)
            return -EINVAL;

        return fanout_set_data(po, optval, optlen);
    }
    case PACKET_TX_HAS_OFF:
    {
        unsigned int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            return -EBUSY;
        } else {
            po->tp_tx_has_off = !!val;
            return 0;
        }
    }
    case PACKET_QDISC_BYPASS:
    {
        int val;

        if (optlen != sizeof(val))
            return -EINVAL;
        if (copy_from_user(&val, optval, sizeof(val)))
            return -EFAULT;

        po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
        return 0;
    }
    default:
        return -ENOPROTOOPT;
    }
}
static int packet_getsockopt(struct socket *sock, int level, int optname,
                 char __user *optval, int __user *optlen)
{
    int len;
    int val, lv = sizeof(val);
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    void *data = &val;
    union tpacket_stats_u st;
    struct tpacket_rollover_stats rstats;

    if (level != SOL_PACKET)
        return -ENOPROTOOPT;

    if (get_user(len, optlen))
        return -EFAULT;

    if (len < 0)
        return -EINVAL;

    switch (optname) {
    case PACKET_STATISTICS:
        spin_lock_bh(&sk->sk_receive_queue.lock);
        memcpy(&st, &po->stats, sizeof(st));
        memset(&po->stats, 0, sizeof(po->stats));
        spin_unlock_bh(&sk->sk_receive_queue.lock);

        if (po->tp_version == TPACKET_V3) {
            lv = sizeof(struct tpacket_stats_v3);
            st.stats3.tp_packets += st.stats3.tp_drops;
            data = &st.stats3;
        } else {
            lv = sizeof(struct tpacket_stats);
            st.stats1.tp_packets += st.stats1.tp_drops;
            data = &st.stats1;
        }

        break;
    case PACKET_AUXDATA:
        val = po->auxdata;
        break;
    case PACKET_ORIGDEV:
        val = po->origdev;
        break;
    case PACKET_VNET_HDR:
        val = po->has_vnet_hdr;
        break;
    case PACKET_VERSION:
        val = po->tp_version;
        break;
    case PACKET_HDRLEN:
        if (len > sizeof(int))
            len = sizeof(int);
        if (len < sizeof(int))
            return -EINVAL;
        if (copy_from_user(&val, optval, len))
            return -EFAULT;
        switch (val) {
        case TPACKET_V1:
            val = sizeof(struct tpacket_hdr);
            break;
        case TPACKET_V2:
            val = sizeof(struct tpacket2_hdr);
            break;
        case TPACKET_V3:
            val = sizeof(struct tpacket3_hdr);
            break;
        default:
            return -EINVAL;
        }
        break;
    case PACKET_RESERVE:
        val = po->tp_reserve;
        break;
    case PACKET_TIMESTAMP:
        val = po->tp_tstamp;
        break;
    case PACKET_FANOUT:
        val = (po->fanout ?
               ((u32)po->fanout->id |
            ((u32)po->fanout->type << 16) |
            ((u32)po->fanout->flags << 24)) :
               0);
        break;
    case PACKET_ROLLOVER_STATS:
        if (!po->rollover)
            return -EINVAL;
        rstats.tp_all = atomic_long_read(&po->rollover->num);
        rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
        rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
        data = &rstats;
        lv = sizeof(rstats);
        break;
    case PACKET_TX_HAS_OFF:
        val = po->tp_tx_has_off;
        break;
    case PACKET_QDISC_BYPASS:
        val = packet_use_direct_xmit(po);
        break;
    default:
        return -ENOPROTOOPT;
    }

    if (len > lv)
        len = lv;
    if (put_user(len, optlen))
        return -EFAULT;
    if (copy_to_user(optval, data, len))
        return -EFAULT;
    return 0;
}
#ifdef CONFIG_COMPAT
static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, unsigned int optlen)
{
    struct packet_sock *po = pkt_sk(sock->sk);

    if (level != SOL_PACKET)
        return -ENOPROTOOPT;

    if (optname == PACKET_FANOUT_DATA &&
        po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
        optval = (char __user *)get_compat_bpf_fprog(optval);
        if (!optval)
            return -EFAULT;
        optlen = sizeof(struct sock_fprog);
    }

    return packet_setsockopt(sock, level, optname, optval, optlen);
}
#endif
static int packet_notifier(struct notifier_block *this,
               unsigned long msg, void *ptr)
{
    struct sock *sk;
    struct net_device *dev = netdev_notifier_info_to_dev(ptr);
    struct net *net = dev_net(dev);

    rcu_read_lock();
    sk_for_each_rcu(sk, &net->packet.sklist) {
        struct packet_sock *po = pkt_sk(sk);

        switch (msg) {
        case NETDEV_UNREGISTER:
            if (po->mclist)
                packet_dev_mclist_delete(dev, &po->mclist);
            /* fallthrough */

        case NETDEV_DOWN:
            if (dev->ifindex == po->ifindex) {
                spin_lock(&po->bind_lock);
                if (po->running) {
                    __unregister_prot_hook(sk, false);
                    sk->sk_err = ENETDOWN;
                    if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
                }
                if (msg == NETDEV_UNREGISTER) {
                    packet_cached_dev_reset(po);
                    po->ifindex = -1;
                    if (po->prot_hook.dev)
                        dev_put(po->prot_hook.dev);
                    po->prot_hook.dev = NULL;
                }
                spin_unlock(&po->bind_lock);
            }
            break;
        case NETDEV_UP:
            if (dev->ifindex == po->ifindex) {
                spin_lock(&po->bind_lock);
                if (po->num)
                    register_prot_hook(sk);
                spin_unlock(&po->bind_lock);
            }
            break;
        }
    }
    rcu_read_unlock();
    return NOTIFY_DONE;
}
static int packet_ioctl(struct socket *sock, unsigned int cmd,
            unsigned long arg)
{
    struct sock *sk = sock->sk;

    switch (cmd) {
    case SIOCOUTQ:
    {
        int amount = sk_wmem_alloc_get(sk);

        return put_user(amount, (int __user *)arg);
    }
    case SIOCINQ:
    {
        struct sk_buff *skb;
        int amount = 0;

        spin_lock_bh(&sk->sk_receive_queue.lock);
        skb = skb_peek(&sk->sk_receive_queue);
        if (skb)
            amount = skb->len;
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        return put_user(amount, (int __user *)arg);
    }
    case SIOCGSTAMP:
        return sock_get_timestamp(sk, (struct timeval __user *)arg);
    case SIOCGSTAMPNS:
        return sock_get_timestampns(sk, (struct timespec __user *)arg);
    case SIOCGIFBRDADDR:
    case SIOCSIFBRDADDR:
    case SIOCGIFNETMASK:
    case SIOCSIFNETMASK:
    case SIOCGIFDSTADDR:
    case SIOCSIFDSTADDR:
        return inet_dgram_ops.ioctl(sock, cmd, arg);

    default:
        return -ENOIOCTLCMD;
    }
    return 0;
}
static unsigned int packet_poll(struct file *file, struct socket *sock,
                poll_table *wait)
{
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    unsigned int mask = datagram_poll(file, sock, wait);

    spin_lock_bh(&sk->sk_receive_queue.lock);
    if (po->rx_ring.pg_vec) {
        if (!packet_previous_rx_frame(po, &po->rx_ring,
            TP_STATUS_KERNEL))
            mask |= POLLIN | POLLRDNORM;
    }
    if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
        po->pressure = 0;
    spin_unlock_bh(&sk->sk_receive_queue.lock);
    spin_lock_bh(&sk->sk_write_queue.lock);
    if (po->tx_ring.pg_vec) {
        if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
            mask |= POLLOUT | POLLWRNORM;
    }
    spin_unlock_bh(&sk->sk_write_queue.lock);
    return mask;
}
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
    struct file *file = vma->vm_file;
    struct socket *sock = file->private_data;
    struct sock *sk = sock->sk;

    if (sk)
        atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
    struct file *file = vma->vm_file;
    struct socket *sock = file->private_data;
    struct sock *sk = sock->sk;

    if (sk)
        atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
    .open	=	packet_mm_open,
    .close	=	packet_mm_close,
};
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
            unsigned int len)
{
    int i;

    for (i = 0; i < len; i++) {
        if (likely(pg_vec[i].buffer)) {
            if (is_vmalloc_addr(pg_vec[i].buffer))
                vfree(pg_vec[i].buffer);
            else
                free_pages((unsigned long)pg_vec[i].buffer,
                       order);
            pg_vec[i].buffer = NULL;
        }
    }
    kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
    char *buffer;
    gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
              __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

    buffer = (char *) __get_free_pages(gfp_flags, order);
    if (buffer)
        return buffer;

    /* __get_free_pages failed, fall back to vmalloc */
    buffer = vzalloc((1 << order) * PAGE_SIZE);
    if (buffer)
        return buffer;

    /* vmalloc failed, lets dig into swap here */
    gfp_flags &= ~__GFP_NORETRY;
    buffer = (char *) __get_free_pages(gfp_flags, order);
    if (buffer)
        return buffer;

    /* complete and utter failure */
    return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
    unsigned int block_nr = req->tp_block_nr;
    struct pgv *pg_vec;
    int i;

    pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
    if (unlikely(!pg_vec))
        goto out;

    for (i = 0; i < block_nr; i++) {
        pg_vec[i].buffer = alloc_one_pg_vec_page(order);
        if (unlikely(!pg_vec[i].buffer))
            goto out_free_pgvec;
    }

out:
    return pg_vec;

out_free_pgvec:
    free_pg_vec(pg_vec, order, block_nr);
    pg_vec = NULL;
    goto out;
}
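
/* packet_set_ring() below validates and installs an rx/tx ring described by
 * struct tpacket_req.  Among other checks, the geometry must satisfy
 * tp_block_size % tp_frame_size == 0 and
 * tp_frame_nr == (tp_block_size / tp_frame_size) * tp_block_nr.
 * For example, tp_block_size = 4096, tp_frame_size = 2048 and
 * tp_block_nr = 4 require tp_frame_nr = 8.
 */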
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        int closing, int tx_ring)
{
    struct pgv *pg_vec = NULL;
    struct packet_sock *po = pkt_sk(sk);
    int was_running, order = 0;
    struct packet_ring_buffer *rb;
    struct sk_buff_head *rb_queue;
    __be16 num;
    int err = -EINVAL;
    /* Added to avoid minimal code churn */
    struct tpacket_req *req = &req_u->req;

    rb = tx_ring ? &po->tx_ring : &po->rx_ring;
    rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

    err = -EBUSY;
    if (!closing) {
        if (atomic_read(&po->mapped))
            goto out;
        if (packet_read_pending(rb))
            goto out;
    }

    if (req->tp_block_nr) {
        unsigned int min_frame_size;

        /* Sanity tests and some calculations */
        err = -EBUSY;
        if (unlikely(rb->pg_vec))
            goto out;

        switch (po->tp_version) {
        case TPACKET_V1:
            po->tp_hdrlen = TPACKET_HDRLEN;
            break;
        case TPACKET_V2:
            po->tp_hdrlen = TPACKET2_HDRLEN;
            break;
        case TPACKET_V3:
            po->tp_hdrlen = TPACKET3_HDRLEN;
            break;
        }

        err = -EINVAL;
        if (unlikely((int)req->tp_block_size <= 0))
            goto out;
        if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
            goto out;
        min_frame_size = po->tp_hdrlen + po->tp_reserve;
        if (po->tp_version >= TPACKET_V3 &&
            req->tp_block_size <
            BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
            goto out;
        if (unlikely(req->tp_frame_size < min_frame_size))
            goto out;
        if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
            goto out;

        rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
        if (unlikely(rb->frames_per_block == 0))
            goto out;
        if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
            goto out;
        if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                    req->tp_frame_nr))
            goto out;

        err = -ENOMEM;
        order = get_order(req->tp_block_size);
        pg_vec = alloc_pg_vec(req, order);
        if (unlikely(!pg_vec))
            goto out;
        switch (po->tp_version) {
        case TPACKET_V3:
            /* Block transmit is not supported yet */
            if (!tx_ring) {
                init_prb_bdqc(po, rb, pg_vec, req_u);
            } else {
                struct tpacket_req3 *req3 = &req_u->req3;

                if (req3->tp_retire_blk_tov ||
                    req3->tp_sizeof_priv ||
                    req3->tp_feature_req_word) {
                    err = -EINVAL;
                    goto out;
                }
            }
            break;
        default:
            break;
        }
    } else {
        err = -EINVAL;
        if (unlikely(req->tp_frame_nr))
            goto out;
    }

    /* Detach socket from network */
    spin_lock(&po->bind_lock);
    was_running = po->running;
    num = po->num;
    if (was_running) {
        po->num = 0;
        __unregister_prot_hook(sk, false);
    }
    spin_unlock(&po->bind_lock);

    synchronize_net();

    err = -EBUSY;
    mutex_lock(&po->pg_vec_lock);
    if (closing || atomic_read(&po->mapped) == 0) {
        err = 0;
        spin_lock_bh(&rb_queue->lock);
        swap(rb->pg_vec, pg_vec);
        rb->frame_max = (req->tp_frame_nr - 1);
        rb->head = 0;
        rb->frame_size = req->tp_frame_size;
        spin_unlock_bh(&rb_queue->lock);

        swap(rb->pg_vec_order, order);
        swap(rb->pg_vec_len, req->tp_block_nr);

        rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
        po->prot_hook.func = (po->rx_ring.pg_vec) ?
                        tpacket_rcv : packet_rcv;
        skb_queue_purge(rb_queue);
        if (atomic_read(&po->mapped))
            pr_err("packet_mmap: vma is busy: %d\n",
                   atomic_read(&po->mapped));
    }
    mutex_unlock(&po->pg_vec_lock);

    spin_lock(&po->bind_lock);
    if (was_running) {
        po->num = num;
        register_prot_hook(sk);
    }
    spin_unlock(&po->bind_lock);
    if (pg_vec && (po->tp_version > TPACKET_V2)) {
        /* Because we don't support block-based V3 on tx-ring */
        if (!tx_ring)
            prb_shutdown_retire_blk_timer(po, rb_queue);
    }

    if (pg_vec)
        free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
    return err;
}
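
/* packet_mmap() below maps the rx ring followed by the tx ring (whichever
 * exist) as a single contiguous area into the caller's address space; the
 * requested mapping size must match the combined ring size exactly.
 */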
static int packet_mmap(struct file *file, struct socket *sock,
        struct vm_area_struct *vma)
{
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    unsigned long size, expected_size;
    struct packet_ring_buffer *rb;
    unsigned long start;
    int err = -EINVAL;
    int i;

    if (vma->vm_pgoff)
        return -EINVAL;

    mutex_lock(&po->pg_vec_lock);

    expected_size = 0;
    for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
        if (rb->pg_vec) {
            expected_size += rb->pg_vec_len
                        * rb->pg_vec_pages
                        * PAGE_SIZE;
        }
    }

    if (expected_size == 0)
        goto out;

    size = vma->vm_end - vma->vm_start;
    if (size != expected_size)
        goto out;

    start = vma->vm_start;
    for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
        if (rb->pg_vec == NULL)
            continue;

        for (i = 0; i < rb->pg_vec_len; i++) {
            struct page *page;
            void *kaddr = rb->pg_vec[i].buffer;
            int pg_num;

            for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
                page = pgv_to_page(kaddr);
                err = vm_insert_page(vma, start, page);
                if (unlikely(err))
                    goto out;
                start += PAGE_SIZE;
                kaddr += PAGE_SIZE;
            }
        }
    }

    atomic_inc(&po->mapped);
    vma->vm_ops = &packet_mmap_ops;
    err = 0;

out:
    mutex_unlock(&po->pg_vec_lock);
    return err;
}
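
/* A minimal user-space sketch of the mmap'ed RX ring path implemented above
 * (illustrative only; error handling omitted, geometry chosen to satisfy the
 * packet_set_ring() checks):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 4,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 8,
 *	};
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int ver = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	poll() on fd, walk frames whose tp_status has TP_STATUS_USER set,
 *	then hand each frame back by writing TP_STATUS_KERNEL.
 */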
static const struct proto_ops packet_ops_spkt = {
    .family =	PF_PACKET,
    .owner =	THIS_MODULE,
    .release =	packet_release,
    .bind =		packet_bind_spkt,
    .connect =	sock_no_connect,
    .socketpair =	sock_no_socketpair,
    .accept =	sock_no_accept,
    .getname =	packet_getname_spkt,
    .poll =		datagram_poll,
    .ioctl =	packet_ioctl,
    .listen =	sock_no_listen,
    .shutdown =	sock_no_shutdown,
    .setsockopt =	sock_no_setsockopt,
    .getsockopt =	sock_no_getsockopt,
    .sendmsg =	packet_sendmsg_spkt,
    .recvmsg =	packet_recvmsg,
    .mmap =		sock_no_mmap,
    .sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
    .family =	PF_PACKET,
    .owner =	THIS_MODULE,
    .release =	packet_release,
    .bind =		packet_bind,
    .connect =	sock_no_connect,
    .socketpair =	sock_no_socketpair,
    .accept =	sock_no_accept,
    .getname =	packet_getname,
    .poll =		packet_poll,
    .ioctl =	packet_ioctl,
    .listen =	sock_no_listen,
    .shutdown =	sock_no_shutdown,
    .setsockopt =	packet_setsockopt,
    .getsockopt =	packet_getsockopt,
#ifdef CONFIG_COMPAT
    .compat_setsockopt = compat_packet_setsockopt,
#endif
    .sendmsg =	packet_sendmsg,
    .recvmsg =	packet_recvmsg,
    .mmap =		packet_mmap,
    .sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
    .family =	PF_PACKET,
    .create =	packet_create,
    .owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
    .notifier_call =	packet_notifier,
};
#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
{
    struct net *net = seq_file_net(seq);

    rcu_read_lock();
    return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
    struct net *net = seq_file_net(seq);
    return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
{
    rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
    if (v == SEQ_START_TOKEN)
        seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
    else {
        struct sock *s = sk_entry(v);
        const struct packet_sock *po = pkt_sk(s);

        seq_printf(seq,
               "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
               s,
               refcount_read(&s->sk_refcnt),
               s->sk_type,
               ntohs(po->num),
               po->ifindex,
               po->running,
               atomic_read(&s->sk_rmem_alloc),
               from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
               sock_i_ino(s));
    }

    return 0;
}

static const struct seq_operations packet_seq_ops = {
    .start	= packet_seq_start,
    .next	= packet_seq_next,
    .stop	= packet_seq_stop,
    .show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
    return seq_open_net(inode, file, &packet_seq_ops,
                sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
    .owner		= THIS_MODULE,
    .open		= packet_seq_open,
    .read		= seq_read,
    .llseek		= seq_lseek,
    .release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
    mutex_init(&net->packet.sklist_lock);
    INIT_HLIST_HEAD(&net->packet.sklist);

    if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
        return -ENOMEM;

    return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
    remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
    .init = packet_net_init,
    .exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
    unregister_netdevice_notifier(&packet_netdev_notifier);
    unregister_pernet_subsys(&packet_net_ops);
    sock_unregister(PF_PACKET);
    proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
    int rc = proto_register(&packet_proto, 0);

    if (rc != 0)
        goto out;

    sock_register(&packet_family_ops);
    register_pernet_subsys(&packet_net_ops);
    register_netdevice_notifier(&packet_netdev_notifier);
out:
    return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);