2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : airthmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/* Fill in the IPv6 payload length from skb->len and hand the packet to the
 * NF_INET_LOCAL_OUT netfilter hook.
 * NOTE(review): interior lines are elided in this excerpt (error path for
 * oversized payloads, hook okfn) — do not treat the visible statements as
 * the complete body. */
60 int __ip6_local_out(struct sk_buff *skb)
64 len = skb->len - sizeof(struct ipv6hdr);
65 if (len > IPV6_MAXPLEN)
67 ipv6_hdr(skb)->payload_len = htons(len);
69 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
/* Run the LOCAL_OUT hook and, on success, continue down dst_output().
 * Exported for other kernel modules (GPL-only). */
73 int ip6_local_out(struct sk_buff *skb)
77 err = __ip6_local_out(skb);
79 err = dst_output(skb);
83 EXPORT_SYMBOL_GPL(ip6_local_out);
/* Final transmit step: use the cached hardware header if present,
 * otherwise fall back to the neighbour output function; count a
 * no-route discard if neither is available. */
85 static int ip6_output_finish(struct sk_buff *skb)
87 struct dst_entry *dst = skb_dst(skb);
90 return neigh_hh_output(dst->hh, skb);
91 else if (dst->neighbour)
92 return dst->neighbour->output(skb);
94 IP6_INC_STATS_BH(dev_net(dst->dev),
95 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
101 /* dev_loopback_xmit for use with netfilter. */
/* Loop a multicast copy back to the local stack: rewind to the network
 * header, mark it PACKET_LOOPBACK, and skip checksum verification since
 * the data never left the host. */
102 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
104 skb_reset_mac_header(newskb);
105 __skb_pull(newskb, skb_network_offset(newskb));
106 newskb->pkt_type = PACKET_LOOPBACK;
107 newskb->ip_summed = CHECKSUM_UNNECESSARY;
108 WARN_ON(!skb_dst(newskb));
/* Second-stage output: handle multicast loopback/statistics, then pass the
 * packet through NF_INET_POST_ROUTING toward ip6_output_finish.
 * NOTE(review): several interior lines (hop_limit==0 discard path tail,
 * OUTMCAST byte count argument) are elided in this excerpt. */
115 static int ip6_output2(struct sk_buff *skb)
117 struct dst_entry *dst = skb_dst(skb);
118 struct net_device *dev = dst->dev;
120 skb->protocol = htons(ETH_P_IPV6);
123 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
124 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* Loop a copy back to local listeners when the socket wants mcast
 * loopback and either a multicast router socket is interested or a
 * local member of the group exists on this device. */
126 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
127 ((mroute6_socket(dev_net(dev)) &&
128 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
129 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
130 &ipv6_hdr(skb)->saddr))) {
131 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
133 /* Do not check for IFF_ALLMULTI; multicast routing
134 is not supported in any case.
137 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
139 ip6_dev_loopback_xmit);
/* hop_limit 0 on multicast means the packet must not hit the wire */
141 if (ipv6_hdr(skb)->hop_limit == 0) {
142 IP6_INC_STATS(dev_net(dev), idev,
143 IPSTATS_MIB_OUTDISCARDS);
149 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
153 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/* Effective MTU for this skb's route: when the owning socket probes PMTU
 * itself (IPV6_PMTUDISC_PROBE), use the raw device MTU instead of the
 * (possibly smaller) cached path MTU. */
157 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
162 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
/* dst->output entry point: drop if IPv6 is administratively disabled on the
 * egress device; fragment when the packet exceeds the path MTU (and is not
 * GSO) or the route demands fragmentation of everything; otherwise continue
 * in ip6_output2(). */
165 int ip6_output(struct sk_buff *skb)
167 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
168 if (unlikely(idev->cnf.disable_ipv6)) {
169 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
170 IPSTATS_MIB_OUTDISCARDS);
175 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
176 dst_allfrag(skb_dst(skb)))
177 return ip6_fragment(skb, ip6_output2);
179 return ip6_output2(skb);
183 * xmit an sk_buff (used by TCP)
/* Build the IPv6 header (plus any extension headers from @opt) on an
 * already-filled transport payload and send it via NF_INET_LOCAL_OUT.
 * On an over-MTU packet that may not be fragmented locally, a PKT_TOOBIG
 * ICMP is generated back to the sender.
 * NOTE(review): interior lines are elided (hdr assignment, mtu computation,
 * skb2 swap after headroom realloc) — the visible flow is not complete. */
186 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
187 struct ipv6_txoptions *opt, int ipfragok)
189 struct net *net = sock_net(sk);
190 struct ipv6_pinfo *np = inet6_sk(sk);
191 struct in6_addr *first_hop = &fl->fl6_dst;
192 struct dst_entry *dst = skb_dst(skb);
194 u8 proto = fl->proto;
195 int seg_len = skb->len;
201 unsigned int head_room;
203 /* First: exthdrs may take lots of space (~8K for now)
204 MAX_HEADER is not enough.
206 head_room = opt->opt_nflen + opt->opt_flen;
207 seg_len += head_room;
208 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* Reallocate headroom when the caller's skb cannot hold the headers */
210 if (skb_headroom(skb) < head_room) {
211 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
213 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
214 IPSTATS_MIB_OUTDISCARDS);
221 skb_set_owner_w(skb, sk);
/* Fragmentable opts go innermost; routing/hop-by-hop opts may rewrite
 * the first hop (e.g. routing header) via &first_hop. */
224 ipv6_push_frag_opts(skb, opt, &proto);
226 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
229 skb_push(skb, sizeof(struct ipv6hdr));
230 skb_reset_network_header(skb);
233 /* Allow local fragmentation. */
238 * Fill in the IPv6 header
242 hlimit = np->hop_limit;
245 hlimit = ip6_dst_hoplimit(dst);
/* version 6, traffic class, and flow label packed into the first word */
247 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
249 hdr->payload_len = htons(seg_len);
250 hdr->nexthdr = proto;
251 hdr->hop_limit = hlimit;
253 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
254 ipv6_addr_copy(&hdr->daddr, first_hop);
256 skb->priority = sk->sk_priority;
257 skb->mark = sk->sk_mark;
260 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
261 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
262 IPSTATS_MIB_OUT, skb->len);
263 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
/* Packet too big and local fragmentation not allowed: tell ourselves */
268 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
270 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
271 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
276 EXPORT_SYMBOL(ip6_xmit);
279 * To avoid extra problems ND packets are send through this
280 * routine. It's code duplication but I really want to avoid
281 * extra checks since ipv6_build_header is used by TCP (which
282 * is for us performance critical)
/* Build a minimal IPv6 header for neighbour-discovery packets:
 * fixed traffic class/flow label of zero and addresses supplied
 * directly by the caller. */
285 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
286 const struct in6_addr *saddr, const struct in6_addr *daddr,
289 struct ipv6_pinfo *np = inet6_sk(sk);
293 skb->protocol = htons(ETH_P_IPV6);
296 totlen = len + sizeof(struct ipv6hdr);
298 skb_reset_network_header(skb);
299 skb_put(skb, sizeof(struct ipv6hdr));
/* version 6, tclass 0, flow label 0 */
302 *(__be32*)hdr = htonl(0x60000000);
304 hdr->payload_len = htons(len);
305 hdr->nexthdr = proto;
306 hdr->hop_limit = np->hop_limit;
308 ipv6_addr_copy(&hdr->saddr, saddr);
309 ipv6_addr_copy(&hdr->daddr, daddr);
/* Deliver a Router-Alert packet to every raw socket registered on the
 * ip6_ra_chain with matching selector @sel (honouring SO_BINDTODEVICE).
 * Clones are handed to all but the last matching socket; the original skb
 * goes to the last one, consuming it.
 * NOTE(review): lines updating 'last' inside the loop are elided here. */
314 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
316 struct ip6_ra_chain *ra;
317 struct sock *last = NULL;
319 read_lock(&ip6_ra_lock);
320 for (ra = ip6_ra_chain; ra; ra = ra->next) {
321 struct sock *sk = ra->sk;
322 if (sk && ra->sel == sel &&
323 (!sk->sk_bound_dev_if ||
324 sk->sk_bound_dev_if == skb->dev->ifindex)) {
326 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
328 rawv6_rcv(last, skb2);
335 rawv6_rcv(last, skb);
336 read_unlock(&ip6_ra_lock);
339 read_unlock(&ip6_ra_lock);
/* Decide how to treat a packet destined to a proxied (pneigh) address:
 * ND messages are passed up to the local stack so proxy-ND can answer,
 * while traffic to link-local proxied addresses is rejected with a
 * link failure. Return value conventions (pass-up / forward / drop)
 * are elided from this excerpt. */
343 static int ip6_forward_proxy_check(struct sk_buff *skb)
345 struct ipv6hdr *hdr = ipv6_hdr(skb);
346 u8 nexthdr = hdr->nexthdr;
/* Skip any extension headers to find the real transport protocol */
349 if (ipv6_ext_hdr(nexthdr)) {
350 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
354 offset = sizeof(struct ipv6hdr);
356 if (nexthdr == IPPROTO_ICMPV6) {
357 struct icmp6hdr *icmp6;
/* Need at least the ICMPv6 type byte in the linear area */
359 if (!pskb_may_pull(skb, (skb_network_header(skb) +
360 offset + 1 - skb->data)))
363 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
365 switch (icmp6->icmp6_type) {
366 case NDISC_ROUTER_SOLICITATION:
367 case NDISC_ROUTER_ADVERTISEMENT:
368 case NDISC_NEIGHBOUR_SOLICITATION:
369 case NDISC_NEIGHBOUR_ADVERTISEMENT:
371 /* For reaction involving unicast neighbor discovery
372 * message destined to the proxied address, pass it to
382 * The proxying router can't forward traffic sent to a link-local
383 * address, so signal the sender and discard the packet. This
384 * behavior is clarified by the MIPv6 specification.
386 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
387 dst_link_failure(skb);
/* NF_INET_FORWARD okfn: hand the forwarded packet to the route's output. */
394 static inline int ip6_forward_finish(struct sk_buff *skb)
396 return dst_output(skb);
/* Forward a received IPv6 packet: policy checks, Router-Alert delivery,
 * hop-limit decrement, proxy-ND handling, redirect generation, MTU check,
 * then the NF_INET_FORWARD hook toward ip6_forward_finish.
 * NOTE(review): many interior lines (goto labels, skb->dev assignments,
 * kfree_skb calls, hop_limit decrement) are elided in this excerpt. */
399 int ip6_forward(struct sk_buff *skb)
401 struct dst_entry *dst = skb_dst(skb);
402 struct ipv6hdr *hdr = ipv6_hdr(skb);
403 struct inet6_skb_parm *opt = IP6CB(skb);
404 struct net *net = dev_net(dst->dev);
406 if (net->ipv6.devconf_all->forwarding == 0)
409 if (skb_warn_if_lro(skb))
412 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
413 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
417 skb_forward_csum(skb);
420 * We DO NOT make any processing on
421 * RA packets, pushing them to user level AS IS
422 * without ane WARRANTY that application will be able
423 * to interpret them. The reason is that we
424 * cannot make anything clever here.
426 * We are not end-node, so that if packet contains
427 * AH/ESP, we cannot make anything.
428 * Defragmentation also would be mistake, RA packets
429 * cannot be fragmented, because there is no warranty
430 * that different fragments will go along one path. --ANK
/* Router-Alert option present: deliver to registered RA listeners */
433 u8 *ptr = skb_network_header(skb) + opt->ra;
434 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
439 * check and decrement ttl
441 if (hdr->hop_limit <= 1) {
442 /* Force OUTPUT device used as source address */
444 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
445 IP6_INC_STATS_BH(net,
446 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
452 /* XXX: idev->cnf.proxy_ndp? */
453 if (net->ipv6.devconf_all->proxy_ndp &&
454 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
455 int proxied = ip6_forward_proxy_check(skb);
457 return ip6_input(skb);
458 else if (proxied < 0) {
459 IP6_INC_STATS(net, ip6_dst_idev(dst),
460 IPSTATS_MIB_INDISCARDS);
465 if (!xfrm6_route_forward(skb)) {
466 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
471 /* IPv6 specs say nothing about it, but it is clear that we cannot
472 send redirects to source routed frames.
473 We don't send redirects to frames decapsulated from IPsec.
475 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
476 !skb_sec_path(skb)) {
477 struct in6_addr *target = NULL;
479 struct neighbour *n = dst->neighbour;
482 * incoming and outgoing devices are the same
486 rt = (struct rt6_info *) dst;
487 if ((rt->rt6i_flags & RTF_GATEWAY))
488 target = (struct in6_addr*)&n->primary_key;
490 target = &hdr->daddr;
492 /* Limit redirects both by destination (here)
493 and by source (inside ndisc_send_redirect)
495 if (xrlim_allow(dst, 1*HZ))
496 ndisc_send_redirect(skb, n, target);
498 int addrtype = ipv6_addr_type(&hdr->saddr);
500 /* This check is security critical. */
501 if (addrtype == IPV6_ADDR_ANY ||
502 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
504 if (addrtype & IPV6_ADDR_LINKLOCAL) {
505 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
506 ICMPV6_NOT_NEIGHBOUR, 0);
/* Routers never fragment forwarded packets: signal PMTU instead */
511 if (skb->len > dst_mtu(dst)) {
512 /* Again, force OUTPUT device used as source address */
514 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst));
515 IP6_INC_STATS_BH(net,
516 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
517 IP6_INC_STATS_BH(net,
518 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
523 if (skb_cow(skb, dst->dev->hard_header_len)) {
524 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
530 /* Mangling hops number delayed to point after skb COW */
534 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
535 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
539 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/* Copy per-packet metadata (type, priority, protocol, dst reference, mark,
 * tc index, nf trace flag, secmark) from the original skb to a fragment. */
545 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
547 to->pkt_type = from->pkt_type;
548 to->priority = from->priority;
549 to->protocol = from->protocol;
551 skb_dst_set(to, dst_clone(skb_dst(from)));
553 to->mark = from->mark;
555 #ifdef CONFIG_NET_SCHED
556 to->tc_index = from->tc_index;
559 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
560 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
561 to->nf_trace = from->nf_trace;
563 skb_copy_secmark(to, from);
/* Walk the extension-header chain to find where the Fragment header must
 * be inserted; returns the offset of the unfragmentable part's end and
 * leaves *nexthdr pointing at the nexthdr byte to patch.
 * NOTE(review): the switch body and several case labels are elided here. */
566 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
568 u16 offset = sizeof(struct ipv6hdr);
569 struct ipv6_opt_hdr *exthdr =
570 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
571 unsigned int packet_len = skb->tail - skb->network_header;
573 *nexthdr = &ipv6_hdr(skb)->nexthdr;
575 while (offset + 1 <= packet_len) {
581 case NEXTHDR_ROUTING:
585 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* A Home Address option forces fragmentation after dstopts */
586 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
596 offset += ipv6_optlen(exthdr);
597 *nexthdr = &exthdr->nexthdr;
598 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/* Fragment an oversized locally-generated IPv6 packet and push each
 * fragment through @output. Two strategies:
 *   1. Fast path — if the skb already carries a well-formed frag_list,
 *      convert each list member into a fragment in place.
 *   2. Slow path — allocate fresh skbs and copy data block by block.
 * NOTE(review): a large number of interior lines (loop heads, error
 * labels, frag_off offset masking, per-fragment output calls) are elided
 * in this excerpt; the visible statements do not form the full body. */
605 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
607 struct sk_buff *frag;
608 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
609 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
610 struct ipv6hdr *tmp_hdr;
612 unsigned int mtu, hlen, left, len;
614 int ptr, offset = 0, err=0;
615 u8 *prevhdr, nexthdr = 0;
616 struct net *net = dev_net(skb_dst(skb)->dev);
618 hlen = ip6_find_1stfragopt(skb, &prevhdr);
621 mtu = ip6_skb_dst_mtu(skb);
623 /* We must not fragment if the socket is set to force MTU discovery
624 * or if the skb it not generated by a local socket. (This last
625 * check should be redundant, but it's free.)
627 if (!skb->local_df) {
628 skb->dev = skb_dst(skb)->dev;
629 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
631 IPSTATS_MIB_FRAGFAILS);
636 if (np && np->frag_size < mtu) {
/* Reserve room for the unfragmentable part plus the Fragment header */
640 mtu -= hlen + sizeof(struct frag_hdr);
642 if (skb_has_frags(skb)) {
643 int first_len = skb_pagelen(skb);
/* The fast path requires every piece to fit the MTU, all but the
 * last to be 8-byte aligned, and enough headroom on each. */
646 if (first_len - hlen > mtu ||
647 ((first_len - hlen) & 7) ||
651 skb_walk_frags(skb, frag) {
652 /* Correct geometry. */
653 if (frag->len > mtu ||
654 ((frag->len & 7) && frag->next) ||
655 skb_headroom(frag) < hlen)
658 /* Partially cloned skb? */
659 if (skb_shared(frag))
/* Transfer socket accounting from the list members to the head */
665 frag->destructor = sock_wfree;
666 truesizes += frag->truesize;
672 frag = skb_shinfo(skb)->frag_list;
673 skb_frag_list_init(skb);
676 *prevhdr = NEXTHDR_FRAGMENT;
677 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
679 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
680 IPSTATS_MIB_FRAGFAILS);
/* Open a gap for the Fragment header and restore the copied
 * unfragmentable headers in front of it */
684 __skb_pull(skb, hlen);
685 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
686 __skb_push(skb, hlen);
687 skb_reset_network_header(skb);
688 memcpy(skb_network_header(skb), tmp_hdr, hlen);
690 ipv6_select_ident(fh);
691 fh->nexthdr = nexthdr;
693 fh->frag_off = htons(IP6_MF);
694 frag_id = fh->identification;
696 first_len = skb_pagelen(skb);
697 skb->data_len = first_len - skb_headlen(skb);
698 skb->truesize -= truesizes;
699 skb->len = first_len;
700 ipv6_hdr(skb)->payload_len = htons(first_len -
701 sizeof(struct ipv6hdr));
703 dst_hold(&rt->u.dst);
706 /* Prepare header of the next frame,
707 * before previous one went down. */
709 frag->ip_summed = CHECKSUM_NONE;
710 skb_reset_transport_header(frag);
711 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
712 __skb_push(frag, hlen);
713 skb_reset_network_header(frag);
714 memcpy(skb_network_header(frag), tmp_hdr,
716 offset += skb->len - hlen - sizeof(struct frag_hdr);
717 fh->nexthdr = nexthdr;
719 fh->frag_off = htons(offset);
720 if (frag->next != NULL)
721 fh->frag_off |= htons(IP6_MF);
722 fh->identification = frag_id;
723 ipv6_hdr(frag)->payload_len =
725 sizeof(struct ipv6hdr));
726 ip6_copy_metadata(frag, skb);
731 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
732 IPSTATS_MIB_FRAGCREATES);
745 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
746 IPSTATS_MIB_FRAGOKS);
747 dst_release(&rt->u.dst);
757 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
758 IPSTATS_MIB_FRAGFAILS);
759 dst_release(&rt->u.dst);
/* ---- Slow path: copy-based fragmentation ---- */
764 left = skb->len - hlen; /* Space per frame */
765 ptr = hlen; /* Where to start from */
768 * Fragment the datagram.
771 *prevhdr = NEXTHDR_FRAGMENT;
774 * Keep copying data until we run out.
778 /* IF: it doesn't fit, use 'mtu' - the data space left */
781 /* IF: we are not sending upto and including the packet end
782 then align the next start on an eight byte boundary */
790 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
791 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
792 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
793 IPSTATS_MIB_FRAGFAILS);
799 * Set up data on packet
802 ip6_copy_metadata(frag, skb);
803 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
804 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
805 skb_reset_network_header(frag);
806 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
807 frag->transport_header = (frag->network_header + hlen +
808 sizeof(struct frag_hdr));
811 * Charge the memory for the fragment to any owner
815 skb_set_owner_w(frag, skb->sk);
818 * Copy the packet header into the new buffer.
820 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
823 * Build fragment header.
825 fh->nexthdr = nexthdr;
/* First fragment picks the identification; the rest reuse it */
828 ipv6_select_ident(fh);
829 frag_id = fh->identification;
831 fh->identification = frag_id;
834 * Copy a block of the IP datagram.
836 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
840 fh->frag_off = htons(offset);
842 fh->frag_off |= htons(IP6_MF);
843 ipv6_hdr(frag)->payload_len = htons(frag->len -
844 sizeof(struct ipv6hdr));
850 * Put this fragment into the sending queue.
856 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
857 IPSTATS_MIB_FRAGCREATES);
859 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
860 IPSTATS_MIB_FRAGOKS);
865 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
866 IPSTATS_MIB_FRAGFAILS);
/* Return nonzero when the cached route key no longer matches the flow
 * address: neither an exact /128 host-route match nor a match against the
 * socket's cached last-used address. */
871 static inline int ip6_rt_check(struct rt6key *rt_key,
872 struct in6_addr *fl_addr,
873 struct in6_addr *addr_cache)
875 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
876 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/* Validate a socket-cached dst against the current flow; release it and
 * return NULL when destination, (optionally) source subtree, or output
 * interface no longer match, so the caller performs a fresh lookup. */
879 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
880 struct dst_entry *dst,
883 struct ipv6_pinfo *np = inet6_sk(sk);
884 struct rt6_info *rt = (struct rt6_info *)dst;
889 /* Yes, checking route validity in not connected
890 * case is not very simple. Take into account,
891 * that we do not support routing by source, TOS,
892 * and MSG_DONTROUTE --ANK (980726)
894 * 1. ip6_rt_check(): If route was host route,
895 * check that cached destination is current.
896 * If it is network route, we still may
897 * check its validity using saved pointer
898 * to the last used address: daddr_cache.
899 * We do not want to save whole address now,
900 * (because main consumer of this service
901 * is tcp, which has not this problem),
902 * so that the last trick works only on connected
904 * 2. oif also should be the same.
906 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
907 #ifdef CONFIG_IPV6_SUBTREES
908 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
910 (fl->oif && fl->oif != dst->dev->ifindex)) {
/* Core of the route lookup: resolve the flow to a dst, select a source
 * address if the flow has none, and (with optimistic DAD) fall back to the
 * default router's dst when the next hop's neighbour entry is not yet valid
 * for an OPTIMISTIC source address.
 * NOTE(review): cleanup labels and ifp release are elided in this excerpt. */
919 static int ip6_dst_lookup_tail(struct sock *sk,
920 struct dst_entry **dst, struct flowi *fl)
923 struct net *net = sock_net(sk);
926 *dst = ip6_route_output(net, sk, fl);
928 if ((err = (*dst)->error))
929 goto out_err_release;
931 if (ipv6_addr_any(&fl->fl6_src)) {
932 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
934 sk ? inet6_sk(sk)->srcprefs : 0,
937 goto out_err_release;
940 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
942 * Here if the dst entry we've looked up
943 * has a neighbour entry that is in the INCOMPLETE
944 * state and the src address from the flow is
945 * marked as OPTIMISTIC, we release the found
946 * dst entry and replace it instead with the
947 * dst entry of the nexthop router
949 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
950 struct inet6_ifaddr *ifp;
954 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
957 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
963 * We need to get the dst entry for the
964 * default router instead
967 memcpy(&fl_gw, fl, sizeof(struct flowi));
968 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
969 *dst = ip6_route_output(net, sk, &fl_gw);
970 if ((err = (*dst)->error))
971 goto out_err_release;
979 if (err == -ENETUNREACH)
980 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
987 * ip6_dst_lookup - perform route lookup on flow
988 * @sk: socket which provides route info
989 * @dst: pointer to dst_entry * for result
990 * @fl: flow to lookup
992 * This function performs a route lookup on the given flow.
994 * It returns zero on success, or a standard errno code on error.
996 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
999 return ip6_dst_lookup_tail(sk, dst, fl);
1001 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1004 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1005 * @sk: socket which provides the dst cache and route info
1006 * @dst: pointer to dst_entry * for result
1007 * @fl: flow to lookup
1009 * This function performs a route lookup on the given flow with the
1010 * possibility of using the cached route in the socket if it is valid.
1011 * It will take the socket dst lock when operating on the dst cache.
1012 * As a result, this function can only be used in process context.
1014 * It returns zero on success, or a standard errno code on error.
1016 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
/* Try the socket's cached dst first; fall back to a full lookup */
1020 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1021 *dst = ip6_sk_dst_check(sk, *dst, fl);
1024 return ip6_dst_lookup_tail(sk, dst, fl);
1026 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/* UDP Fragmentation Offload path for ip6_append_data: build one large skb,
 * let getfrag fill page fragments, and mark it SKB_GSO_UDP with a gso_size
 * that keeps each wire fragment a multiple of 8 bytes so the device (or
 * software GSO) can split it later.
 * NOTE(review): error-return lines after sock_alloc_send_skb and the
 * function tail are elided in this excerpt. */
1028 static inline int ip6_ufo_append_data(struct sock *sk,
1029 int getfrag(void *from, char *to, int offset, int len,
1030 int odd, struct sk_buff *skb),
1031 void *from, int length, int hh_len, int fragheaderlen,
1032 int transhdrlen, int mtu,unsigned int flags)
1035 struct sk_buff *skb;
1038 /* There is support for UDP large send offload by network
1039 * device, so create one single skb packet containing complete
1042 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1043 skb = sock_alloc_send_skb(sk,
1044 hh_len + fragheaderlen + transhdrlen + 20,
1045 (flags & MSG_DONTWAIT), &err);
1049 /* reserve space for Hardware header */
1050 skb_reserve(skb, hh_len);
1052 /* create space for UDP/IP header */
1053 skb_put(skb,fragheaderlen + transhdrlen);
1055 /* initialize network header pointer */
1056 skb_reset_network_header(skb);
1058 /* initialize protocol header pointer */
1059 skb->transport_header = skb->network_header + fragheaderlen;
1061 skb->ip_summed = CHECKSUM_PARTIAL;
1063 sk->sk_sndmsg_off = 0;
1066 err = skb_append_datato_frags(sk,skb, getfrag, from,
1067 (length - transhdrlen));
1069 struct frag_hdr fhdr;
1071 /* Specify the length of each IPv6 datagram fragment.
1072 * It has to be a multiple of 8.
1074 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1075 sizeof(struct frag_hdr)) & ~7;
1076 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
/* Pick the fragment ID now so all GSO-produced fragments share it */
1077 ipv6_select_ident(&fhdr);
1078 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1079 __skb_queue_tail(&sk->sk_write_queue, skb);
1083 /* There is not enough support do UPD LSO,
1084 * so follow normal path
/* Duplicate an extension-header option block; length in 8-octet units is
 * (hdrlen + 1) per RFC 2460 option-header encoding. NULL-safe. */
1091 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1094 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Duplicate a routing header; same (hdrlen + 1) * 8 sizing rule. NULL-safe. */
1097 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1100 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Append user data to the socket's corked write queue, building MTU-sized
 * skbs that ip6_push_pending_frames() will later finalize. On the first
 * call for a cork cycle, option headers are duplicated into np->cork and
 * the route/flow/mtu are cached in inet->cork.
 * NOTE(review): many interior lines (goto error labels, loop advance of
 * offset/length, page-copy memcpy, several closing braces) are elided in
 * this excerpt — the visible statements are not the complete body. */
1103 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1104 int offset, int len, int odd, struct sk_buff *skb),
1105 void *from, int length, int transhdrlen,
1106 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1107 struct rt6_info *rt, unsigned int flags)
1109 struct inet_sock *inet = inet_sk(sk);
1110 struct ipv6_pinfo *np = inet6_sk(sk);
1111 struct sk_buff *skb;
1112 unsigned int maxfraglen, fragheaderlen;
1119 int csummode = CHECKSUM_NONE;
1121 if (flags&MSG_PROBE)
/* First append of a cork cycle: set up cork state */
1123 if (skb_queue_empty(&sk->sk_write_queue)) {
1128 if (WARN_ON(np->cork.opt))
1131 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1132 if (unlikely(np->cork.opt == NULL))
1135 np->cork.opt->tot_len = opt->tot_len;
1136 np->cork.opt->opt_flen = opt->opt_flen;
1137 np->cork.opt->opt_nflen = opt->opt_nflen;
1139 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1141 if (opt->dst0opt && !np->cork.opt->dst0opt)
1144 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1146 if (opt->dst1opt && !np->cork.opt->dst1opt)
1149 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1151 if (opt->hopopt && !np->cork.opt->hopopt)
1154 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1156 if (opt->srcrt && !np->cork.opt->srcrt)
1159 /* need source address above miyazawa*/
1161 dst_hold(&rt->u.dst);
1162 inet->cork.dst = &rt->u.dst;
1163 inet->cork.fl = *fl;
1164 np->cork.hop_limit = hlimit;
1165 np->cork.tclass = tclass;
1166 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1167 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1168 if (np->frag_size < mtu) {
1170 mtu = np->frag_size;
1172 inet->cork.fragsize = mtu;
1173 if (dst_allfrag(rt->u.dst.path))
1174 inet->cork.flags |= IPCORK_ALLFRAG;
1175 inet->cork.length = 0;
1176 sk->sk_sndmsg_page = NULL;
1177 sk->sk_sndmsg_off = 0;
1178 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1179 rt->rt6i_nfheader_len;
1180 length += exthdrlen;
1181 transhdrlen += exthdrlen;
/* Subsequent appends reuse the cached cork route and flow */
1183 rt = (struct rt6_info *)inet->cork.dst;
1184 fl = &inet->cork.fl;
1188 mtu = inet->cork.fragsize;
1191 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1193 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1194 (opt ? opt->opt_nflen : 0);
1195 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* Reject a total datagram larger than the IPv6 maximum payload */
1197 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1198 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1199 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1205 * Let's try using as much space as possible.
1206 * Use MTU if total length of the message fits into the MTU.
1207 * Otherwise, we need to reserve fragment header and
1208 * fragment alignment (= 8-15 octects, in total).
1210 * Note that we may need to "move" the data from the tail of
1211 * of the buffer to the new fragment when we split
1214 * FIXME: It may be fragmented into multiple chunks
1215 * at once if non-fragmentable extension headers
1220 inet->cork.length += length;
/* Large UDP with UFO-capable device: take the offload path */
1221 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1222 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1224 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1225 fragheaderlen, transhdrlen, mtu,
1232 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1235 while (length > 0) {
1236 /* Check if the remaining data fits into current packet. */
1237 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1239 copy = maxfraglen - skb->len;
1243 unsigned int datalen;
1244 unsigned int fraglen;
1245 unsigned int fraggap;
1246 unsigned int alloclen;
1247 struct sk_buff *skb_prev;
1251 /* There's no room in the current skb */
1253 fraggap = skb_prev->len - maxfraglen;
1258 * If remaining data exceeds the mtu,
1259 * we know we need more fragment(s).
1261 datalen = length + fraggap;
1262 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1263 datalen = maxfraglen - fragheaderlen;
1265 fraglen = datalen + fragheaderlen;
1266 if ((flags & MSG_MORE) &&
1267 !(rt->u.dst.dev->features&NETIF_F_SG))
1270 alloclen = datalen + fragheaderlen;
1273 * The last fragment gets additional space at tail.
1274 * Note: we overallocate on fragments with MSG_MODE
1275 * because we have no idea if we're the last one.
1277 if (datalen == length + fraggap)
1278 alloclen += rt->u.dst.trailer_len;
1281 * We just reserve space for fragment header.
1282 * Note: this may be overallocation if the message
1283 * (without MSG_MORE) fits into the MTU.
1285 alloclen += sizeof(struct frag_hdr);
1288 skb = sock_alloc_send_skb(sk,
1290 (flags & MSG_DONTWAIT), &err);
1293 if (atomic_read(&sk->sk_wmem_alloc) <=
1295 skb = sock_wmalloc(sk,
1296 alloclen + hh_len, 1,
1298 if (unlikely(skb == NULL))
1304 * Fill in the control structures
1306 skb->ip_summed = csummode;
1308 /* reserve for fragmentation */
1309 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1312 * Find where to start putting bytes
1314 data = skb_put(skb, fraglen);
1315 skb_set_network_header(skb, exthdrlen);
1316 data += fragheaderlen;
1317 skb->transport_header = (skb->network_header +
/* Move trailing, non-8-aligned data from the previous skb into
 * this one, adjusting the previous skb's checksum */
1320 skb->csum = skb_copy_and_csum_bits(
1321 skb_prev, maxfraglen,
1322 data + transhdrlen, fraggap, 0);
1323 skb_prev->csum = csum_sub(skb_prev->csum,
1326 pskb_trim_unique(skb_prev, maxfraglen);
1328 copy = datalen - transhdrlen - fraggap;
1333 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1340 length -= datalen - fraggap;
/* Only the first skb of the message can use HW checksum state */
1343 csummode = CHECKSUM_NONE;
1346 * Put the packet on the pending queue
1348 __skb_queue_tail(&sk->sk_write_queue, skb);
/* Non-SG device: copy into the linear area of the tail skb */
1355 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1359 if (getfrag(from, skb_put(skb, copy),
1360 offset, copy, off, skb) < 0) {
1361 __skb_trim(skb, off);
/* SG device: append into page fragments, reusing the socket's
 * current send page when it still has room */
1366 int i = skb_shinfo(skb)->nr_frags;
1367 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1368 struct page *page = sk->sk_sndmsg_page;
1369 int off = sk->sk_sndmsg_off;
1372 if (page && (left = PAGE_SIZE - off) > 0) {
1375 if (page != frag->page) {
1376 if (i == MAX_SKB_FRAGS) {
1381 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1382 frag = &skb_shinfo(skb)->frags[i];
1384 } else if(i < MAX_SKB_FRAGS) {
1385 if (copy > PAGE_SIZE)
1387 page = alloc_pages(sk->sk_allocation, 0);
1392 sk->sk_sndmsg_page = page;
1393 sk->sk_sndmsg_off = 0;
1395 skb_fill_page_desc(skb, i, page, 0, 0);
1396 frag = &skb_shinfo(skb)->frags[i];
1401 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1405 sk->sk_sndmsg_off += copy;
1408 skb->data_len += copy;
1409 skb->truesize += copy;
1410 atomic_add(copy, &sk->sk_wmem_alloc);
/* Error path: undo the optimistic cork.length increase */
1417 inet->cork.length -= length;
1418 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/* Free all cork state: duplicated option headers, the held route
 * reference, the ALLFRAG flag, and the cached flow. */
1422 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1425 kfree(np->cork.opt->dst0opt);
1426 kfree(np->cork.opt->dst1opt);
1427 kfree(np->cork.opt->hopopt);
1428 kfree(np->cork.opt->srcrt);
1429 kfree(np->cork.opt);
1430 np->cork.opt = NULL;
1433 if (inet->cork.dst) {
1434 dst_release(inet->cork.dst);
1435 inet->cork.dst = NULL;
1436 inet->cork.flags &= ~IPCORK_ALLFRAG;
1438 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/* Finalize the corked write queue: collapse queued skbs into one packet
 * via its frag_list, push extension headers and the IPv6 header, update
 * statistics, and transmit with ip6_local_out().
 * NOTE(review): some interior lines (payload_len assignment, error label
 * placement) are elided in this excerpt. */
1441 int ip6_push_pending_frames(struct sock *sk)
1443 struct sk_buff *skb, *tmp_skb;
1444 struct sk_buff **tail_skb;
1445 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1446 struct inet_sock *inet = inet_sk(sk);
1447 struct ipv6_pinfo *np = inet6_sk(sk);
1448 struct net *net = sock_net(sk);
1449 struct ipv6hdr *hdr;
1450 struct ipv6_txoptions *opt = np->cork.opt;
1451 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1452 struct flowi *fl = &inet->cork.fl;
1453 unsigned char proto = fl->proto;
1456 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1458 tail_skb = &(skb_shinfo(skb)->frag_list);
1460 /* move skb->data to ip header from ext header */
1461 if (skb->data < skb_network_header(skb))
1462 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining queued skbs onto the head's frag_list and fold
 * their lengths/truesize into the head */
1463 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1464 __skb_pull(tmp_skb, skb_network_header_len(skb));
1465 *tail_skb = tmp_skb;
1466 tail_skb = &(tmp_skb->next);
1467 skb->len += tmp_skb->len;
1468 skb->data_len += tmp_skb->len;
1469 skb->truesize += tmp_skb->truesize;
1470 tmp_skb->destructor = NULL;
1474 /* Allow local fragmentation. */
1475 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/* A routing header may rewrite the on-wire destination */
1478 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1479 __skb_pull(skb, skb_network_header_len(skb));
1480 if (opt && opt->opt_flen)
1481 ipv6_push_frag_opts(skb, opt, &proto);
1482 if (opt && opt->opt_nflen)
1483 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1485 skb_push(skb, sizeof(struct ipv6hdr));
1486 skb_reset_network_header(skb);
1487 hdr = ipv6_hdr(skb);
1489 *(__be32*)hdr = fl->fl6_flowlabel |
1490 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1492 hdr->hop_limit = np->cork.hop_limit;
1493 hdr->nexthdr = proto;
1494 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1495 ipv6_addr_copy(&hdr->daddr, final_dst);
1497 skb->priority = sk->sk_priority;
1498 skb->mark = sk->sk_mark;
1500 skb_dst_set(skb, dst_clone(&rt->u.dst));
1501 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1502 if (proto == IPPROTO_ICMPV6) {
1503 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1505 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1506 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1509 err = ip6_local_out(skb);
1512 err = net_xmit_errno(err);
1518 ip6_cork_release(inet, np);
1521 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/* Abort a cork cycle: drop every queued skb (counting discards) and
 * release all cork state. */
1525 void ip6_flush_pending_frames(struct sock *sk)
1527 struct sk_buff *skb;
1529 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1531 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1532 IPSTATS_MIB_OUTDISCARDS);
1536 ip6_cork_release(inet_sk(sk), inet6_sk(sk));