net: Embed hh_cache inside of struct neighbour.
net/ipv6/ip6_output.c
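The change this file version carries (per the title above) moved the
hardware header cache from a separately allocated object to a field
embedded directly in struct neighbour. A minimal sketch of the
difference, assuming the historical layouts (abridged, not copied from
the real headers):

	/* before: the cached link-layer header hung off a pointer,
	 * which could be NULL and had its own lifetime. */
	struct neighbour {
		/* ... */
		struct hh_cache *hh;
	};

	/* after: the cache is embedded, so it always exists; a non-zero
	 * hh.hh_len is what marks it valid. ip6_finish_output2() below
	 * relies on exactly this test. */
	struct neighbour {
		/* ... */
		struct hh_cache hh;
	};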
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

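	/* hh_cache is now embedded in struct neighbour (see the commit
	 * title at the top of this file), so instead of testing a hh
	 * pointer for NULL we test hh->hh_len: non-zero means a prebuilt
	 * link-layer header can be copied in front of the packet by
	 * neigh_hh_output(); otherwise fall back to neigh->output(). */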
	neigh = dst->neighbour;
	if (neigh) {
		struct hh_cache *hh = &neigh->hh;
		if (hh->hh_len)
			return neigh_hh_output(hh, skb);
		else
			return dst->neighbour->output(skb);
	}
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

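/* Fragment before POST_ROUTING delivery when the packet exceeds the
 * path MTU and is not GSO, or when the route requires fragmenting
 * every packet (dst_allfrag()). */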
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

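	/* The first word of the IPv6 header packs version (4 bits),
	 * traffic class (8 bits) and flow label (20 bits): 0x60000000
	 * puts 6 in the version nibble, tclass lands in bits 27-20, and
	 * fl6->flowlabel (already big-endian) supplies the label bits. */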
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It duplicates code, but we really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance-critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

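/* Deliver a packet carrying a Router Alert option to every matching
 * socket on ip6_ra_chain: each earlier match receives a clone, and the
 * original skb goes to the last matching socket. Returns 1 if at least
 * one socket consumed the packet. */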
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do not process RA packets; we push them to user level
	 *	as-is, with no warranty that the application will be able
	 *	to interpret them, because we cannot do anything clever
	 *	here.
	 *
	 *	We are not an end node, so if the packet contains AH/ESP
	 *	we cannot touch it. Defragmenting would also be a mistake:
	 *	RA packets must not be fragmented, because there is no
	 *	warranty that different fragments will follow the same
	 *	path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
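	/* From here on, mtu is the budget for the fragmentable part of
	 * each fragment: every fragment also carries the unfragmentable
	 * part (hlen) plus an 8 byte fragment header. For example, a
	 * 1500 byte link MTU with a plain 40 byte IPv6 header leaves
	 * 1500 - 40 - 8 = 1452 bytes of fragmentable data per fragment. */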
	mtu -= hlen + sizeof(struct frag_hdr);

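	/* Fast path: if the skb already carries a frag list whose
	 * geometry matches the fragment size (each piece fits within
	 * mtu, all but the last are multiples of 8, and enough headroom
	 * exists for the headers), the queued buffers are sent as
	 * fragments in place instead of being copied (the slow path
	 * further below). */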
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

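	/* Slow path: allocate a fresh skb per fragment, copy the
	 * unfragmentable header into it, then pull the next chunk of
	 * payload out of the original skb with skb_copy_bits(). */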
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

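/* Returns nonzero when the cached route cannot be validated against
 * the flow: the route is not a host route to fl_addr, and the socket's
 * cached address (if any) does not match fl_addr either. */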
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, and MSG_DONTROUTE --ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check that
	 *    the cached destination is still current. If it is a network
	 *    route, we can still check its validity using a saved pointer
	 *    to the last used address: daddr_cache. We do not want to
	 *    save the whole address now (because the main consumer of
	 *    this service is TCP, which does not have this problem), so
	 *    the last trick works only on connected sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @can_sleep: we are in a sleepable context
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @can_sleep: we are in a sleepable context
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
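		/* e.g. a 1500 byte MTU with a plain 40 byte IPv6 header
		 * gives (1500 - 40 - 8) & ~7 = 1448 bytes per fragment. */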
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

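	/* fragheaderlen is the per-fragment header overhead before the
	 * fragment header itself; maxfraglen is the largest skb length
	 * that keeps fragments 8-aligned once the 8 byte fragment header
	 * is added. E.g. mtu = 1500, fragheaderlen = 40:
	 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488. */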
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

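	/* Coalesce the whole write queue into a single skb: the first
	 * skb becomes the head, and each following skb is unlinked from
	 * the queue and chained onto the head's frag_list with its
	 * header bytes pulled off, so exactly one IPv6 header is
	 * emitted. */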
	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}