vti6: better validate user provided tunnel names
net/ipv6/ip6_output.c (LineageOS/android_kernel_motorola_exynos9610.git)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

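/*
 * Final transmit step: handle multicast loopback and scope checks, hand
 * the skb to a lightweight tunnel when one is attached to the dst, then
 * resolve the next-hop neighbour and queue the packet on the device.
 */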
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

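/*
 * Post-routing step: run the cgroup egress BPF program, re-route when an
 * XFRM policy matched after SNAT, and fragment when the packet exceeds
 * the path MTU or the dst requires fragmentation on all output.
 */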
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

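/*
 * dst->output entry point for IPv6: drop the packet if IPv6 is disabled
 * on the egress device, otherwise run it through the
 * NF_INET_POST_ROUTING hook into ip6_finish_output().
 */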
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

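/*
 * Deliver a Router Alert packet to every raw socket registered for the
 * given RA selector; returns 1 if at least one socket took the skb.
 */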
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

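/*
 * Decide what to do with a packet addressed to a proxied (NDP proxy)
 * destination: 1 means hand it to local input (unicast neighbour
 * discovery), -1 means discard (link-local destination), 0 means
 * forward it normally.
 */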
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}

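/*
 * MTU to enforce on the forwarding path: a locked route metric wins,
 * otherwise fall back to the egress device's IPv6 MTU.
 */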
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

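/*
 * Packet-too-big test for forwarding: honour the frag_max_size recorded
 * by conntrack defrag, the ignore_df escape hatch and GSO segmentation
 * before comparing the raw length against the MTU.
 */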
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
		return false;

	return true;
}

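/*
 * Core forwarding path: validate the packet (hop limit, XFRM policy,
 * NDP proxy, source address class), emit redirects where appropriate,
 * enforce the path MTU, then decrement hop_limit and pass the skb
 * through the NF_INET_FORWARD hook.
 */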
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

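/* Propagate per-packet metadata from the original skb to a fragment. */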
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

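/*
 * Fragment an IPv6 packet. The fast path reuses an existing frag_list
 * whose geometry already matches the MTU; otherwise the slow path
 * allocates and fills a fresh skb per fragment.
 */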
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

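/*
 * Validate a cached socket dst against the flow: release it and return
 * NULL when the cached route no longer matches the flow's addresses or
 * the requested interface.
 */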
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

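/*
 * Common tail of the dst lookup helpers: resolve a source address when
 * none was given, perform the routing lookup and, with optimistic DAD,
 * fall back to the default router while our own address is still
 * optimistic and the next hop is unresolved.
 */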
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

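/*
 * Initialise cork state for ip6_append_data(): duplicate the supplied
 * extension headers, pin the route and record the hop limit, traffic
 * class and the MTU to fragment against.
 */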
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

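/*
 * Append data to the socket's cork queue, growing the tail skb or
 * allocating new MTU-sized fragments; the queued skbs are later merged
 * and sent by __ip6_make_skb().
 */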
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * in the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

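/*
 * Collapse the cork queue into one skb with a frag_list, push the
 * pending extension headers and the IPv6 header, then release the cork.
 */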
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}