vti6: better validate user provided tunnel names
net/ipv6/ip6_output.c (LineageOS/android_kernel_motorola_exynos9610.git)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

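/*
 * Final transmit step: handle multicast loopback and scope checks, hand
 * the skb to a lightweight tunnel when one is attached to the dst, then
 * resolve the next-hop neighbour and queue the packet on the device.
 */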
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

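/*
 * Post-routing step: run the cgroup egress BPF program, re-route when an
 * XFRM policy matched after SNAT, and fragment when the packet exceeds
 * the path MTU or the dst requires fragmentation on all output.
 */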
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

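/*
 * dst->output entry point for IPv6: drop the packet if IPv6 is disabled
 * on the egress device, otherwise run it through the
 * NF_INET_POST_ROUTING hook into ip6_finish_output().
 */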
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

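/*
 * Deliver a Router Alert packet to every raw socket registered for the
 * given RA selector; returns 1 if at least one socket took the skb.
 */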
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

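/*
 * Decide what to do with a packet addressed to a proxied (NDP proxy)
 * destination: 1 means hand it to local input (unicast neighbour
 * discovery), -1 means discard (link-local destination), 0 means
 * forward it normally.
 */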
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}

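/*
 * MTU to enforce on the forwarding path: a locked route metric wins,
 * otherwise fall back to the egress device's IPv6 MTU.
 */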
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

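/*
 * Packet-too-big test for forwarding: honour the frag_max_size recorded
 * by conntrack defrag, the ignore_df escape hatch and GSO segmentation
 * before comparing the raw length against the MTU.
 */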
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
		return false;

	return true;
}

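/*
 * Core forwarding path: validate the packet (hop limit, XFRM policy,
 * NDP proxy, source address class), emit redirects where appropriate,
 * enforce the path MTU, then decrement hop_limit and pass the skb
 * through the NF_INET_FORWARD hook.
 */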
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

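/* Propagate per-packet metadata from the original skb to a fragment. */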
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

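/*
 * Fragment an IPv6 packet. The fast path reuses an existing frag_list
 * whose geometry already matches the MTU; otherwise the slow path
 * allocates and fills a fresh skb per fragment.
 */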
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

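/*
 * Validate a cached socket dst against the flow: release it and return
 * NULL when the cached route no longer matches the flow's addresses or
 * the requested interface.
 */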
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

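/*
 * Common tail of the dst lookup helpers: resolve a source address when
 * none was given, perform the routing lookup and, with optimistic DAD,
 * fall back to the default router while our own address is still
 * optimistic and the next hop is unresolved.
 */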
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

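/*
 * Initialise cork state for ip6_append_data(): duplicate the supplied
 * extension headers, pin the route and record the hop limit, traffic
 * class and the MTU to fragment against.
 */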
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

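/*
 * Append data to the socket's cork queue, growing the tail skb or
 * allocating new MTU-sized fragments; the queued skbs are later merged
 * and sent by __ip6_make_skb().
 */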
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * in the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

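/*
 * Collapse the cork queue into one skb with a frag_list, push the
 * pending extension headers and the IPv6 header, then release the cork.
 */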
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}