]> rtime.felk.cvut.cz Git - can-eth-gw-linux.git/blob - net/ipv6/ip6_output.c
Merge tag 'for-linus-20121212' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowe...
[can-eth-gw-linux.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 * Transmit a locally generated packet: run the LOCAL_OUT hook and,
 * unless netfilter consumed or dropped the skb, hand it to dst_output().
 */
int ip6_local_out(struct sk_buff *skb)
{
	int ret = __ip6_local_out(skb);

	/* A verdict of 1 from the hook means "continue transmission". */
	if (likely(ret == 1))
		ret = dst_output(skb);

	return ret;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
/*
 * Final transmit step: loop back multicast copies where required, then
 * hand the packet to the neighbour layer for link-layer resolution.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct rt6_info *rt;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and either a multicast-router
		 * socket may want it (and the packet was not already
		 * forwarded) or a local process joined the group on @dev.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0 means local scope only: the looped
			 * clone above suffices, drop the on-wire original.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rt = (struct rt6_info *) dst;
	neigh = rt->n;
	if (neigh)
		return dst_neigh_output(dst, neigh, skb);

	/* No neighbour entry cached on the route: nowhere to send this. */
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
136
137 static int ip6_finish_output(struct sk_buff *skb)
138 {
139         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
140             dst_allfrag(skb_dst(skb)))
141                 return ip6_fragment(skb, ip6_finish_output2);
142         else
143                 return ip6_finish_output2(skb);
144 }
145
146 int ip6_output(struct sk_buff *skb)
147 {
148         struct net_device *dev = skb_dst(skb)->dev;
149         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
150         if (unlikely(idev->cnf.disable_ipv6)) {
151                 IP6_INC_STATS(dev_net(dev), idev,
152                               IPSTATS_MIB_OUTDISCARDS);
153                 kfree_skb(skb);
154                 return 0;
155         }
156
157         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
158                             ip6_finish_output,
159                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
160 }
161
162 /*
163  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
164  */
165
/*
 * Send a fully built transport segment (TCP/SCTP/DCCP) as an IPv6
 * packet: push extension headers and the IPv6 header, then pass the
 * packet through NF_INET_LOCAL_OUT towards dst_output().  Returns the
 * hook verdict on success, -ENOBUFS on headroom allocation failure,
 * -EMSGSIZE when the packet exceeds the path MTU.  Consumes @skb on
 * every path.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* Re-charge the replacement skb to the socket. */
			skb_set_owner_w(skb, sk);
		}
		/* Non-fragmentable options may rewrite the first hop
		 * (e.g. when a routing header is present).
		 */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	/* Socket default of -1 means "use the route's hop limit". */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* First word: version 6, traffic class, flow label. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big for the path MTU and not allowed to fragment here:
	 * send PKT_TOOBIG to ourselves so the socket learns the PMTU.
	 */
	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
248
249 /*
250  *      To avoid extra problems ND packets are send through this
251  *      routine. It's code duplication but I really want to avoid
252  *      extra checks since ipv6_build_header is used by TCP (which
253  *      is for us performance critical)
254  */
255
/*
 * Build a bare IPv6 header for a neighbour-discovery packet.
 * @len is the payload length in bytes; the hop limit comes from the
 * socket.  Always returns 0.
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	/* Version 6, zero traffic class and flow label. */
	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}
281
/*
 * Deliver a router-alert packet to every raw socket registered for
 * @sel (the RA option value) that is unbound or bound to the receiving
 * device.  All matches but the last receive a clone; the last gets the
 * original skb.  Returns 1 if the packet was consumed, 0 otherwise.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* Earlier matches get a clone; a failed
				 * clone silently skips that socket.
				 */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
310
/*
 * Decide how a packet destined to an NDP-proxied address is handled.
 * Returns 1 to deliver locally (unicast neighbour-discovery messages),
 * -1 to drop (link-local destination; sender is notified via
 * dst_link_failure), 0 to forward normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
362
/* NF_INET_FORWARD okfn: hand the forwarded packet to the output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
367
/*
 * Forward a received packet towards its next hop.  Performs the checks
 * required of an IPv6 router (forwarding enabled, hop limit, xfrm
 * policy, source-address sanity, MTU) and emits ICMPv6 errors or
 * redirects where appropriate.  Returns the FORWARD hook verdict on
 * success, or a negative errno after dropping the packet.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged packets must not be forwarded. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Only forward packets link-layer addressed to this host. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* ptr[2..3] is the 16-bit router-alert option value. */
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have rerouted the packet. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Own the header before modifying it below. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
526
527 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
528 {
529         to->pkt_type = from->pkt_type;
530         to->priority = from->priority;
531         to->protocol = from->protocol;
532         skb_dst_drop(to);
533         skb_dst_set(to, dst_clone(skb_dst(from)));
534         to->dev = from->dev;
535         to->mark = from->mark;
536
537 #ifdef CONFIG_NET_SCHED
538         to->tc_index = from->tc_index;
539 #endif
540         nf_copy(to, from);
541 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
542     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
543         to->nf_trace = from->nf_trace;
544 #endif
545         skb_copy_secmark(to, from);
546 }
547
548 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
549 {
550         u16 offset = sizeof(struct ipv6hdr);
551         struct ipv6_opt_hdr *exthdr =
552                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
553         unsigned int packet_len = skb->tail - skb->network_header;
554         int found_rhdr = 0;
555         *nexthdr = &ipv6_hdr(skb)->nexthdr;
556
557         while (offset + 1 <= packet_len) {
558
559                 switch (**nexthdr) {
560
561                 case NEXTHDR_HOP:
562                         break;
563                 case NEXTHDR_ROUTING:
564                         found_rhdr = 1;
565                         break;
566                 case NEXTHDR_DEST:
567 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
568                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
569                                 break;
570 #endif
571                         if (found_rhdr)
572                                 return offset;
573                         break;
574                 default :
575                         return offset;
576                 }
577
578                 offset += ipv6_optlen(exthdr);
579                 *nexthdr = &exthdr->nexthdr;
580                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
581                                                  offset);
582         }
583
584         return offset;
585 }
586
/*
 * Choose the fragment identification for @fhdr.  Prefer the per-peer
 * counter from the inet_peer cache; otherwise fall back to a global
 * counter that skips the value 0.
 */
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;
		struct net *net;

		net = dev_net(rt->dst.dev);
		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			inet_putpeer(peer);
			return;
		}
	}
	/* Lock-free increment of the shared counter; 0 is reserved. */
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}
612
613 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
614 {
615         struct sk_buff *frag;
616         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
617         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
618         struct ipv6hdr *tmp_hdr;
619         struct frag_hdr *fh;
620         unsigned int mtu, hlen, left, len;
621         int hroom, troom;
622         __be32 frag_id = 0;
623         int ptr, offset = 0, err=0;
624         u8 *prevhdr, nexthdr = 0;
625         struct net *net = dev_net(skb_dst(skb)->dev);
626
627         hlen = ip6_find_1stfragopt(skb, &prevhdr);
628         nexthdr = *prevhdr;
629
630         mtu = ip6_skb_dst_mtu(skb);
631
632         /* We must not fragment if the socket is set to force MTU discovery
633          * or if the skb it not generated by a local socket.
634          */
635         if (unlikely(!skb->local_df && skb->len > mtu) ||
636                      (IP6CB(skb)->frag_max_size &&
637                       IP6CB(skb)->frag_max_size > mtu)) {
638                 if (skb->sk && dst_allfrag(skb_dst(skb)))
639                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
640
641                 skb->dev = skb_dst(skb)->dev;
642                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
643                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
644                               IPSTATS_MIB_FRAGFAILS);
645                 kfree_skb(skb);
646                 return -EMSGSIZE;
647         }
648
649         if (np && np->frag_size < mtu) {
650                 if (np->frag_size)
651                         mtu = np->frag_size;
652         }
653         mtu -= hlen + sizeof(struct frag_hdr);
654
655         if (skb_has_frag_list(skb)) {
656                 int first_len = skb_pagelen(skb);
657                 struct sk_buff *frag2;
658
659                 if (first_len - hlen > mtu ||
660                     ((first_len - hlen) & 7) ||
661                     skb_cloned(skb))
662                         goto slow_path;
663
664                 skb_walk_frags(skb, frag) {
665                         /* Correct geometry. */
666                         if (frag->len > mtu ||
667                             ((frag->len & 7) && frag->next) ||
668                             skb_headroom(frag) < hlen)
669                                 goto slow_path_clean;
670
671                         /* Partially cloned skb? */
672                         if (skb_shared(frag))
673                                 goto slow_path_clean;
674
675                         BUG_ON(frag->sk);
676                         if (skb->sk) {
677                                 frag->sk = skb->sk;
678                                 frag->destructor = sock_wfree;
679                         }
680                         skb->truesize -= frag->truesize;
681                 }
682
683                 err = 0;
684                 offset = 0;
685                 frag = skb_shinfo(skb)->frag_list;
686                 skb_frag_list_init(skb);
687                 /* BUILD HEADER */
688
689                 *prevhdr = NEXTHDR_FRAGMENT;
690                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691                 if (!tmp_hdr) {
692                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
693                                       IPSTATS_MIB_FRAGFAILS);
694                         return -ENOMEM;
695                 }
696
697                 __skb_pull(skb, hlen);
698                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
699                 __skb_push(skb, hlen);
700                 skb_reset_network_header(skb);
701                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
702
703                 ipv6_select_ident(fh, rt);
704                 fh->nexthdr = nexthdr;
705                 fh->reserved = 0;
706                 fh->frag_off = htons(IP6_MF);
707                 frag_id = fh->identification;
708
709                 first_len = skb_pagelen(skb);
710                 skb->data_len = first_len - skb_headlen(skb);
711                 skb->len = first_len;
712                 ipv6_hdr(skb)->payload_len = htons(first_len -
713                                                    sizeof(struct ipv6hdr));
714
715                 dst_hold(&rt->dst);
716
717                 for (;;) {
718                         /* Prepare header of the next frame,
719                          * before previous one went down. */
720                         if (frag) {
721                                 frag->ip_summed = CHECKSUM_NONE;
722                                 skb_reset_transport_header(frag);
723                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
724                                 __skb_push(frag, hlen);
725                                 skb_reset_network_header(frag);
726                                 memcpy(skb_network_header(frag), tmp_hdr,
727                                        hlen);
728                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
729                                 fh->nexthdr = nexthdr;
730                                 fh->reserved = 0;
731                                 fh->frag_off = htons(offset);
732                                 if (frag->next != NULL)
733                                         fh->frag_off |= htons(IP6_MF);
734                                 fh->identification = frag_id;
735                                 ipv6_hdr(frag)->payload_len =
736                                                 htons(frag->len -
737                                                       sizeof(struct ipv6hdr));
738                                 ip6_copy_metadata(frag, skb);
739                         }
740
741                         err = output(skb);
742                         if(!err)
743                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
744                                               IPSTATS_MIB_FRAGCREATES);
745
746                         if (err || !frag)
747                                 break;
748
749                         skb = frag;
750                         frag = skb->next;
751                         skb->next = NULL;
752                 }
753
754                 kfree(tmp_hdr);
755
756                 if (err == 0) {
757                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
758                                       IPSTATS_MIB_FRAGOKS);
759                         dst_release(&rt->dst);
760                         return 0;
761                 }
762
763                 while (frag) {
764                         skb = frag->next;
765                         kfree_skb(frag);
766                         frag = skb;
767                 }
768
769                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
770                               IPSTATS_MIB_FRAGFAILS);
771                 dst_release(&rt->dst);
772                 return err;
773
774 slow_path_clean:
775                 skb_walk_frags(skb, frag2) {
776                         if (frag2 == frag)
777                                 break;
778                         frag2->sk = NULL;
779                         frag2->destructor = NULL;
780                         skb->truesize += frag2->truesize;
781                 }
782         }
783
784 slow_path:
785         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
786             skb_checksum_help(skb))
787                 goto fail;
788
789         left = skb->len - hlen;         /* Space per frame */
790         ptr = hlen;                     /* Where to start from */
791
792         /*
793          *      Fragment the datagram.
794          */
795
796         *prevhdr = NEXTHDR_FRAGMENT;
797         hroom = LL_RESERVED_SPACE(rt->dst.dev);
798         troom = rt->dst.dev->needed_tailroom;
799
800         /*
801          *      Keep copying data until we run out.
802          */
803         while(left > 0) {
804                 len = left;
805                 /* IF: it doesn't fit, use 'mtu' - the data space left */
806                 if (len > mtu)
807                         len = mtu;
808                 /* IF: we are not sending up to and including the packet end
809                    then align the next start on an eight byte boundary */
810                 if (len < left) {
811                         len &= ~7;
812                 }
813                 /*
814                  *      Allocate buffer.
815                  */
816
817                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
818                                       hroom + troom, GFP_ATOMIC)) == NULL) {
819                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
820                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821                                       IPSTATS_MIB_FRAGFAILS);
822                         err = -ENOMEM;
823                         goto fail;
824                 }
825
826                 /*
827                  *      Set up data on packet
828                  */
829
830                 ip6_copy_metadata(frag, skb);
831                 skb_reserve(frag, hroom);
832                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
833                 skb_reset_network_header(frag);
834                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
835                 frag->transport_header = (frag->network_header + hlen +
836                                           sizeof(struct frag_hdr));
837
838                 /*
839                  *      Charge the memory for the fragment to any owner
840                  *      it might possess
841                  */
842                 if (skb->sk)
843                         skb_set_owner_w(frag, skb->sk);
844
845                 /*
846                  *      Copy the packet header into the new buffer.
847                  */
848                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
849
850                 /*
851                  *      Build fragment header.
852                  */
853                 fh->nexthdr = nexthdr;
854                 fh->reserved = 0;
855                 if (!frag_id) {
856                         ipv6_select_ident(fh, rt);
857                         frag_id = fh->identification;
858                 } else
859                         fh->identification = frag_id;
860
861                 /*
862                  *      Copy a block of the IP datagram.
863                  */
864                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
865                         BUG();
866                 left -= len;
867
868                 fh->frag_off = htons(offset);
869                 if (left > 0)
870                         fh->frag_off |= htons(IP6_MF);
871                 ipv6_hdr(frag)->payload_len = htons(frag->len -
872                                                     sizeof(struct ipv6hdr));
873
874                 ptr += len;
875                 offset += len;
876
877                 /*
878                  *      Put this fragment into the sending queue.
879                  */
880                 err = output(frag);
881                 if (err)
882                         goto fail;
883
884                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885                               IPSTATS_MIB_FRAGCREATES);
886         }
887         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888                       IPSTATS_MIB_FRAGOKS);
889         consume_skb(skb);
890         return err;
891
892 fail:
893         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894                       IPSTATS_MIB_FRAGFAILS);
895         kfree_skb(skb);
896         return err;
897 }
898
899 static inline int ip6_rt_check(const struct rt6key *rt_key,
900                                const struct in6_addr *fl_addr,
901                                const struct in6_addr *addr_cache)
902 {
903         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
904                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
905 }
906
/*
 * Validate a socket-cached dst entry against the flow @fl6.
 *
 * Returns @dst if it is still usable for this flow, or NULL after
 * releasing it when the destination, (optionally) the source, or the
 * outgoing interface no longer match.  A NULL @dst is passed through.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		/* Cached route is stale for this flow: drop our reference
		 * and force the caller to do a fresh lookup.
		 */
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
946
/*
 * Common tail of the dst-lookup helpers: resolve *dst for @fl6 if not
 * already supplied, pick a source address when the flow has none, and
 * (optionally) re-route via the default router when the next hop's
 * neighbour is unresolved and our source address is OPTIMISTIC (DAD).
 *
 * On success returns 0 with *dst holding a referenced entry; on failure
 * releases *dst, sets it to NULL, and returns a negative errno.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	/* Only perform the routing lookup when the caller did not
	 * already provide a (e.g. socket-cached) dst.
	 */
	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	/* No source address in the flow yet: derive one from the route. */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	n = rt->n;
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Re-look-up with an unspecified destination so the
			 * default route (gateway) is selected.
			 */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1019
1020 /**
1021  *      ip6_dst_lookup - perform route lookup on flow
1022  *      @sk: socket which provides route info
1023  *      @dst: pointer to dst_entry * for result
1024  *      @fl6: flow to lookup
1025  *
1026  *      This function performs a route lookup on the given flow.
1027  *
1028  *      It returns zero on success, or a standard errno code on error.
1029  */
1030 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1031 {
1032         *dst = NULL;
1033         return ip6_dst_lookup_tail(sk, dst, fl6);
1034 }
1035 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1036
1037 /**
1038  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1039  *      @sk: socket which provides route info
1040  *      @fl6: flow to lookup
1041  *      @final_dst: final destination address for ipsec lookup
1042  *      @can_sleep: we are in a sleepable context
1043  *
1044  *      This function performs a route lookup on the given flow.
1045  *
1046  *      It returns a valid dst pointer on success, or a pointer encoded
1047  *      error code.
1048  */
1049 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1050                                       const struct in6_addr *final_dst,
1051                                       bool can_sleep)
1052 {
1053         struct dst_entry *dst = NULL;
1054         int err;
1055
1056         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1057         if (err)
1058                 return ERR_PTR(err);
1059         if (final_dst)
1060                 fl6->daddr = *final_dst;
1061         if (can_sleep)
1062                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1063
1064         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1065 }
1066 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1067
1068 /**
1069  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1070  *      @sk: socket which provides the dst cache and route info
1071  *      @fl6: flow to lookup
1072  *      @final_dst: final destination address for ipsec lookup
1073  *      @can_sleep: we are in a sleepable context
1074  *
1075  *      This function performs a route lookup on the given flow with the
1076  *      possibility of using the cached route in the socket if it is valid.
1077  *      It will take the socket dst lock when operating on the dst cache.
1078  *      As a result, this function can only be used in process context.
1079  *
1080  *      It returns a valid dst pointer on success, or a pointer encoded
1081  *      error code.
1082  */
1083 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1084                                          const struct in6_addr *final_dst,
1085                                          bool can_sleep)
1086 {
1087         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1088         int err;
1089
1090         dst = ip6_sk_dst_check(sk, dst, fl6);
1091
1092         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1093         if (err)
1094                 return ERR_PTR(err);
1095         if (final_dst)
1096                 fl6->daddr = *final_dst;
1097         if (can_sleep)
1098                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1099
1100         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1101 }
1102 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1103
1104 static inline int ip6_ufo_append_data(struct sock *sk,
1105                         int getfrag(void *from, char *to, int offset, int len,
1106                         int odd, struct sk_buff *skb),
1107                         void *from, int length, int hh_len, int fragheaderlen,
1108                         int transhdrlen, int mtu,unsigned int flags,
1109                         struct rt6_info *rt)
1110
1111 {
1112         struct sk_buff *skb;
1113         int err;
1114
1115         /* There is support for UDP large send offload by network
1116          * device, so create one single skb packet containing complete
1117          * udp datagram
1118          */
1119         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1120                 skb = sock_alloc_send_skb(sk,
1121                         hh_len + fragheaderlen + transhdrlen + 20,
1122                         (flags & MSG_DONTWAIT), &err);
1123                 if (skb == NULL)
1124                         return err;
1125
1126                 /* reserve space for Hardware header */
1127                 skb_reserve(skb, hh_len);
1128
1129                 /* create space for UDP/IP header */
1130                 skb_put(skb,fragheaderlen + transhdrlen);
1131
1132                 /* initialize network header pointer */
1133                 skb_reset_network_header(skb);
1134
1135                 /* initialize protocol header pointer */
1136                 skb->transport_header = skb->network_header + fragheaderlen;
1137
1138                 skb->ip_summed = CHECKSUM_PARTIAL;
1139                 skb->csum = 0;
1140         }
1141
1142         err = skb_append_datato_frags(sk,skb, getfrag, from,
1143                                       (length - transhdrlen));
1144         if (!err) {
1145                 struct frag_hdr fhdr;
1146
1147                 /* Specify the length of each IPv6 datagram fragment.
1148                  * It has to be a multiple of 8.
1149                  */
1150                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1151                                              sizeof(struct frag_hdr)) & ~7;
1152                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1153                 ipv6_select_ident(&fhdr, rt);
1154                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1155                 __skb_queue_tail(&sk->sk_write_queue, skb);
1156
1157                 return 0;
1158         }
1159         /* There is not enough support do UPD LSO,
1160          * so follow normal path
1161          */
1162         kfree_skb(skb);
1163
1164         return err;
1165 }
1166
1167 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1168                                                gfp_t gfp)
1169 {
1170         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1171 }
1172
1173 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1174                                                 gfp_t gfp)
1175 {
1176         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178
/*
 * Recompute *mtu and *maxfraglen while appending data, unless the dst
 * is an XFRM tunnel (in which case both are left untouched).
 *
 * @skb == NULL means we are sizing the first fragment and must reserve
 * the dst's header_len; later fragments take the path MTU directly.
 *
 * NOTE(review): this helper does not consult np->pmtudisc or
 * np->frag_size the way the corking setup in ip6_append_data() does —
 * confirm that is intentional for the non-first-fragment case.
 */
static void ip6_append_data_mtu(int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = dst_mtu(rt->dst.path);
		}
		/* Fragment payloads must be multiples of 8 octets; account
		 * for the fragment extension header.
		 */
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
1201
/*
 * Append data to the pending (corked) output queue of @sk.
 *
 * On the first call for an empty write queue this sets up the cork
 * state (duplicated txoptions, held dst, hop limit, traffic class,
 * fragment size); subsequent calls reuse that state.  Data is packed
 * into the tail skb when it fits, otherwise new fragment-sized skbs
 * are allocated; UDP with a UFO-capable device takes the offload path.
 *
 * @getfrag copies user data (e.g. from an iovec) into the skb.
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): the -ENOBUFS returns in the txoptions-duplication
 * block below leave np->cork.opt partially populated; it is only
 * reclaimed later via ip6_cork_release() — verify callers always
 * reach that path on error.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* Deep-copy each extension header so the cork state
			 * outlives the caller's ipv6_txoptions.
			 */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Choose the MTU: device MTU when probing PMTU, otherwise
		 * the (path) dst MTU; XFRM tunnels use the tunnel dst.
		 */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		/* Queue already corked: reuse the saved cork state and
		 * ignore the caller-supplied rt/fl6/opt.
		 */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	/* Reject datagrams that would exceed the maximum IPv6 payload. */
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		/* IPV6_DONTFRAG: report the MTU to the app instead of
		 * fragmenting (UDP/RAW only).
		 */
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		/* UDP fragmentation offload path for capable devices. */
		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First fragment: may block per MSG_DONTWAIT. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the unaligned tail of the previous skb
				 * into this fragment, keeping checksums right.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append via the per-socket page frag,
			 * coalescing with the last frag when possible.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1549
1550 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1551 {
1552         if (np->cork.opt) {
1553                 kfree(np->cork.opt->dst0opt);
1554                 kfree(np->cork.opt->dst1opt);
1555                 kfree(np->cork.opt->hopopt);
1556                 kfree(np->cork.opt->srcrt);
1557                 kfree(np->cork.opt);
1558                 np->cork.opt = NULL;
1559         }
1560
1561         if (inet->cork.base.dst) {
1562                 dst_release(inet->cork.base.dst);
1563                 inet->cork.base.dst = NULL;
1564                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1565         }
1566         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1567 }
1568
/*
 *	Collapse every skb queued on sk->sk_write_queue into one packet
 *	(head skb plus frag_list tail), build the IPv6 header from the
 *	corked flow/options state, and hand the result to ip6_local_out().
 *	Returns 0 or a negative errno; the cork is released on all paths.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	/* Nothing queued: just release the cork and return 0. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head's frag_list,
	 * stripping their network headers and folding their sizes into
	 * the head.  Ownership moves to the head, so drop sk/destructor.
	 */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* Routing headers may rewrite the destination, so push the
	 * non-fragmentable options against a copy of fl6->daddr.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First 32 bits: version 6, corked traffic class, flow label. */
	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive values are congestion notifications; map them
		 * through net_xmit_errno() and only fail on a real error.
		 */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1652 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1653
1654 void ip6_flush_pending_frames(struct sock *sk)
1655 {
1656         struct sk_buff *skb;
1657
1658         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1659                 if (skb_dst(skb))
1660                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1661                                       IPSTATS_MIB_OUTDISCARDS);
1662                 kfree_skb(skb);
1663         }
1664
1665         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1666 }
1667 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);