/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

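/*
 * Pick the fragment identification for an outgoing packet.  A single
 * global counter is shared by all flows and serialized by a spinlock;
 * the value is stored into the fragment header in network byte order,
 * and identification 0 is never handed out.
 */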
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

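/*
 * Final output step: fill in the link-layer header.  Use the cached
 * hardware header if the destination cache has one; otherwise let the
 * neighbour code resolve the address and emit the frame.
 */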
static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

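/*
 * Second-stage output.  For multicast destinations with local group
 * members on the outgoing device, loop a clone of the packet back to
 * listeners through netfilter (unless the sender cleared
 * IPV6_MULTICAST_LOOP) before the original passes the POST_ROUTING
 * hook on its way to ip6_output_finish().
 */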
static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

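/*
 * Entry point from dst_output().  Packets larger than the path MTU, or
 * going via a route flagged "allfrag" (the peer reported a Packet Too
 * Big below the 1280-byte IPv6 minimum, so a fragment header is needed
 * on every packet), are fragmented; everything else goes straight out.
 */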
int ip6_output(struct sk_buff *skb)
{
        if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
int ip6_route_me_harder(struct sk_buff *skb)
{
        struct ipv6hdr *iph = skb->nh.ipv6h;
        struct dst_entry *dst;
        struct flowi fl = {
                .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
                .nl_u =
                { .ip6_u =
                  { .daddr = iph->daddr,
                    .saddr = iph->saddr, } },
                .proto = iph->nexthdr,
        };

        dst = ip6_route_output(skb->sk, &fl);

        if (dst->error) {
                IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
                return -EINVAL;
        }

        /* Drop old route. */
        dst_release(skb->dst);

        skb->dst = dst;
        return 0;
}
#endif

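/*
 * If a netfilter hook altered the packet (NFC_ALTERED), the cached
 * route may no longer match the new addresses, so redo the routing
 * decision before handing the packet to dst_output().
 */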
static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
        if (skb->nfcache & NFC_ALTERED) {
                if (ip6_route_me_harder(skb) != 0) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
        }
#endif /* CONFIG_NETFILTER */
        return dst_output(skb);
}

/*
 *      xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                               ip6_maybe_reroute);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

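/*
 * A rough sketch of how a caller drives ip6_xmit() (this is what
 * tcp_v6_xmit() in net/ipv6/tcp_ipv6.c does, simplified):
 *
 *      struct flowi fl = { .proto = IPPROTO_TCP, ... };
 *      struct dst_entry *dst;
 *      int err;
 *
 *      err = ip6_dst_lookup(sk, &dst, &fl);    // route + source select
 *      if (err)
 *              return err;
 *      skb->dst = dst_clone(dst);              // ip6_xmit() reads skb->dst
 *      return ip6_xmit(sk, skb, &fl, np->opt, 0);
 */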
/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It's code duplication but I really want to avoid
 *      extra checks since ipv6_build_header is used by TCP (which
 *      is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

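/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (sel).  Each listener but the
 * last gets its own clone; the original skb goes to the last one, so
 * a return value of 1 means the packet was consumed.
 */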
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We do no processing on RA packets, pushing them to
         *      user level AS IS without any warranty that the
         *      application will be able to interpret them. The
         *      reason is that we cannot make anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP we cannot do anything. Defragmentation would
         *      also be a mistake; RA packets cannot be fragmented,
         *      because there is no warranty that different fragments
         *      will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same:
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
                                                |IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Decrementing the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}


static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        to->security = from->security;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

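/*
 * Find the offset at which a fragment header should be inserted.
 * Roughly following RFC 2460, the unfragmentable part is the IPv6
 * header plus the extension headers that intermediate routers must
 * see (hop-by-hop, routing, and destination options preceding a
 * routing header).  On return, *nexthdr points at the "next header"
 * field that the caller rewrites to NEXTHDR_FRAGMENT.
 *
 * Example: for IPv6 + Hop-by-Hop (8 bytes) + TCP this returns
 * 40 + 8 = 48, with *nexthdr pointing into the hop-by-hop header.
 */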
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {
                switch (**nexthdr) {
                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

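/*
 * Fragment an outgoing packet and feed each fragment to output().
 * Fast path: if the packet already sits in a frag_list with correctly
 * sized, unshared chunks, just prepend headers to each chunk.  Slow
 * path: allocate a fresh skb per fragment and copy the data out.
 */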
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;

                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

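/*
 * Look up a route for a flow and, if the caller left the source
 * address unspecified, pick one with ipv6_get_saddr().  For sockets
 * with a cached route, the cache is revalidated first and dropped if
 * the destination or outgoing interface no longer matches.  On error
 * the reference is released and *dst is set back to NULL.
 */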
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the unconnected
                           case is not very simple. Take into account that
                           we do not support routing by source, TOS, and
                           MSG_DONTROUTE          --ANK (980726)

                           1. If the route was a host route, check that the
                              cached destination is current.
                              If it is a network route, we still may check
                              its validity using the saved pointer to the
                              last used address: daddr_cache.
                              We do not want to save the whole address now
                              (because the main consumer of this service is
                              TCP, which does not have this problem), so
                              this last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err) {
#if IP6_DEBUG >= 2
                        printk(KERN_DEBUG "ip6_dst_lookup: "
                               "no available source address\n");
#endif
                        goto out_err_release;
                }
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

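/*
 * ip6_append_data() queues data on sk->sk_write_queue for a "corked"
 * socket, growing or allocating skbs so that each one can later become
 * an on-wire fragment.  getfrag() is the caller's copy routine (for
 * example copying from an iovec and, in UDP's case, checksumming as it
 * copies).  Nothing is transmitted here: ip6_push_pending_frames()
 * builds the IPv6 header and sends the queue, and
 * ip6_flush_pending_frames() throws it away.
 */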
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
                    int offset, int len, int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above miyazawa*/
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
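        /*
         * A worked example (assuming no extension headers): with
         * mtu = 1500 and fragheaderlen = 40,
         *   maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488.
         * Each queued skb then carries at most 1488 bytes of header plus
         * payload; after the 8-byte fragment header is inserted at send
         * time the on-wire packet is 1496 bytes, and the 1448 bytes of
         * fragment data remain a multiple of eight.
         */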

        inet->cork.length += length;

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

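/*
 * Turn the queue built by ip6_append_data() into a single packet:
 * collapse the queued skbs into one skb with a frag_list, prepend the
 * extension headers and the IPv6 header, and hand the result to
 * NF_IP6_LOCAL_OUT / dst_output() (which fragments it if needed).
 * The usual datagram send path is, roughly:
 *
 *      err = ip6_append_data(sk, getfrag, msg, len, ...);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!corked)
 *              err = ip6_push_pending_frames(sk);
 *
 * (a sketch of the udpv6_sendmsg() pattern, simplified.)
 */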
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
                      dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}