2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
10 * Based on linux/net/ipv4/ip_output.c
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 * A.N.Kuznetsov : arithmetics in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
31 #include <linux/errno.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/socket.h>
35 #include <linux/net.h>
36 #include <linux/netdevice.h>
37 #include <linux/if_arp.h>
38 #include <linux/in6.h>
39 #include <linux/tcp.h>
40 #include <linux/route.h>
41 #include <linux/module.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
57 #include <net/checksum.h>
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
73 int __ip6_local_out(struct sk_buff *skb)
77 len = skb->len - sizeof(struct ipv6hdr);
78 if (len > IPV6_MAXPLEN)
80 ipv6_hdr(skb)->payload_len = htons(len);
82 return nf_hook(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
/*
 * ip6_local_out - send a locally generated IPv6 packet.
 *
 * Runs the LOCAL_OUT hook via __ip6_local_out(); a verdict of 1 means the
 * hook accepted the packet synchronously, so hand it to dst_output() here.
 * Any other value (queued/stolen/error) is returned unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
98 static int ip6_output_finish(struct sk_buff *skb)
100 struct dst_entry *dst = skb->dst;
103 return neigh_hh_output(dst->hh, skb);
104 else if (dst->neighbour)
105 return dst->neighbour->output(skb);
107 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
113 /* dev_loopback_xmit for use with netfilter. */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
116 skb_reset_mac_header(newskb);
117 __skb_pull(newskb, skb_network_offset(newskb));
118 newskb->pkt_type = PACKET_LOOPBACK;
119 newskb->ip_summed = CHECKSUM_UNNECESSARY;
120 BUG_TRAP(newskb->dst);
127 static int ip6_output2(struct sk_buff *skb)
129 struct dst_entry *dst = skb->dst;
130 struct net_device *dev = dst->dev;
132 skb->protocol = htons(ETH_P_IPV6);
135 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
139 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141 &ipv6_hdr(skb)->saddr)) {
142 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
144 /* Do not check for IFF_ALLMULTI; multicast routing
145 is not supported in any case.
148 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
150 ip6_dev_loopback_xmit);
152 if (ipv6_hdr(skb)->hop_limit == 0) {
153 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
159 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
162 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
165 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
167 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
169 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
170 skb->dst->dev->mtu : dst_mtu(skb->dst);
173 int ip6_output(struct sk_buff *skb)
175 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
176 dst_allfrag(skb->dst))
177 return ip6_fragment(skb, ip6_output2);
179 return ip6_output2(skb);
183 * xmit an sk_buff (used by TCP)
/*
 * ip6_xmit - push extension headers and the fixed IPv6 header onto @skb,
 * then hand it to the NF_IP6_LOCAL_OUT hook.  If the packet exceeds the
 * path MTU and neither @ipfragok nor GSO applies, ICMPV6_PKT_TOOBIG is
 * sent back to the local sender and the skb is dropped.
 * NOTE(review): several original lines are missing from this extract
 * (declarations, error paths, hlimit/tclass defaults, mtu computation);
 * the comments below describe only what is visible here.
 */
186 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
187 struct ipv6_txoptions *opt, int ipfragok)
189 struct ipv6_pinfo *np = inet6_sk(sk);
190 struct in6_addr *first_hop = &fl->fl6_dst;
191 struct dst_entry *dst = skb->dst;
193 u8 proto = fl->proto;
194 int seg_len = skb->len;
199 unsigned int head_room;
201 /* First: exthdrs may take lots of space (~8K for now)
202 MAX_HEADER is not enough.
/* Make sure there is room for ext headers + IPv6 header + link layer. */
204 head_room = opt->opt_nflen + opt->opt_flen;
205 seg_len += head_room;
206 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
208 if (skb_headroom(skb) < head_room) {
209 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211 IP6_INC_STATS(ip6_dst_idev(skb->dst),
212 IPSTATS_MIB_OUTDISCARDS);
/* Reallocated copy must be owned by the sending socket for accounting. */
219 skb_set_owner_w(skb, sk);
/* Fragmentable options go first (inner), non-fragmentable may rewrite
 * the first hop (e.g. routing header). */
222 ipv6_push_frag_opts(skb, opt, &proto);
224 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
227 skb_push(skb, sizeof(struct ipv6hdr));
228 skb_reset_network_header(skb);
232 * Fill in the IPv6 header
/* Hop limit: socket setting, then route metric, then device default. */
237 hlimit = np->hop_limit;
239 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
241 hlimit = ipv6_get_hoplimit(dst->dev);
/* Version (6), traffic class and flow label packed into the first word. */
249 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
251 hdr->payload_len = htons(seg_len);
252 hdr->nexthdr = proto;
253 hdr->hop_limit = hlimit;
255 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
256 ipv6_addr_copy(&hdr->daddr, first_hop);
258 skb->priority = sk->sk_priority;
261 if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
262 IP6_INC_STATS(ip6_dst_idev(skb->dst),
263 IPSTATS_MIB_OUTREQUESTS);
264 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
/* Too big and not allowed to fragment: tell the local sender. */
269 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
271 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
272 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
277 EXPORT_SYMBOL(ip6_xmit);
280 * To avoid extra problems ND packets are sent through this
281 * routine. It's code duplication but I really want to avoid
282 * extra checks since ipv6_build_header is used by TCP (which
283 * is for us performance critical)
/*
 * ip6_nd_hdr - build a minimal IPv6 header for neighbour discovery packets.
 * No extension headers, no flow label, hop limit taken from the socket.
 * NOTE(review): some original lines (declarations, headroom reservation,
 * return) are missing from this extract.
 */
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287 struct in6_addr *saddr, struct in6_addr *daddr,
290 struct ipv6_pinfo *np = inet6_sk(sk);
294 skb->protocol = htons(ETH_P_IPV6);
297 totlen = len + sizeof(struct ipv6hdr);
299 skb_reset_network_header(skb);
300 skb_put(skb, sizeof(struct ipv6hdr));
/* Version 6, zero traffic class, zero flow label. */
303 *(__be32*)hdr = htonl(0x60000000);
305 hdr->payload_len = htons(len);
306 hdr->nexthdr = proto;
307 hdr->hop_limit = np->hop_limit;
309 ipv6_addr_copy(&hdr->saddr, saddr);
310 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to every raw socket
 * registered for this RA value (@sel), honouring sk_bound_dev_if.
 * Each matching socket except the last gets a clone; the last one gets
 * the original skb.  Returns nonzero if the packet was consumed.
 * NOTE(review): lines delivering skb2/advancing 'last' and the returns
 * are missing from this extract.
 */
315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
317 struct ip6_ra_chain *ra;
318 struct sock *last = NULL;
320 read_lock(&ip6_ra_lock);
321 for (ra = ip6_ra_chain; ra; ra = ra->next) {
322 struct sock *sk = ra->sk;
323 if (sk && ra->sel == sel &&
324 (!sk->sk_bound_dev_if ||
325 sk->sk_bound_dev_if == skb->dev->ifindex)) {
327 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
329 rawv6_rcv(last, skb2);
/* Last matching socket consumes the original skb. */
336 rawv6_rcv(last, skb);
337 read_unlock(&ip6_ra_lock);
340 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - decide how to treat a packet addressed to a
 * proxied neighbour: ND messages are handed to the local stack, traffic
 * to link-local destinations is rejected, the rest may be forwarded.
 * NOTE(review): the return statements and part of the switch body are
 * missing from this extract; return-value semantics are inferred from the
 * caller (ip6_forward): >0 local input, 0 forward, <0 drop.
 */
344 static int ip6_forward_proxy_check(struct sk_buff *skb)
346 struct ipv6hdr *hdr = ipv6_hdr(skb);
347 u8 nexthdr = hdr->nexthdr;
/* Skip any extension headers to find the transport protocol. */
350 if (ipv6_ext_hdr(nexthdr)) {
351 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
355 offset = sizeof(struct ipv6hdr);
357 if (nexthdr == IPPROTO_ICMPV6) {
358 struct icmp6hdr *icmp6;
/* Need at least the ICMPv6 type byte in the linear area. */
360 if (!pskb_may_pull(skb, (skb_network_header(skb) +
361 offset + 1 - skb->data)))
364 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
366 switch (icmp6->icmp6_type) {
367 case NDISC_ROUTER_SOLICITATION:
368 case NDISC_ROUTER_ADVERTISEMENT:
369 case NDISC_NEIGHBOUR_SOLICITATION:
370 case NDISC_NEIGHBOUR_ADVERTISEMENT:
372 /* For reaction involving unicast neighbor discovery
373 * message destined to the proxied address, pass it to
383 * The proxying router can't forward traffic sent to a link-local
384 * address, so signal the sender and discard the packet. This
385 * behavior is clarified by the MIPv6 specification.
387 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
388 dst_link_failure(skb);
/* Continuation for the NF_IP6_FORWARD hook: transmit via the route. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
/*
 * ip6_forward - forward a received IPv6 packet towards its destination.
 * Performs policy checks, RA interception, hop-limit handling, proxy-ND
 * local delivery, redirect generation, MTU check and COW before passing
 * the packet through the NF_IP6_FORWARD hook to ip6_forward_finish().
 * NOTE(review): many original lines (gotos, kfree_skb calls, closing
 * braces, hop-limit decrement) are missing from this extract.
 */
400 int ip6_forward(struct sk_buff *skb)
402 struct dst_entry *dst = skb->dst;
403 struct ipv6hdr *hdr = ipv6_hdr(skb);
404 struct inet6_skb_parm *opt = IP6CB(skb);
406 if (ipv6_devconf.forwarding == 0)
409 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
410 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
414 skb_forward_csum(skb);
417 * We DO NOT make any processing on
418 * RA packets, pushing them to user level AS IS
419 * without any WARRANTY that application will be able
420 * to interpret them. The reason is that we
421 * cannot make anything clever here.
423 * We are not end-node, so that if packet contains
424 * AH/ESP, we cannot make anything.
425 * Defragmentation also would be mistake, RA packets
426 * cannot be fragmented, because there is no warranty
427 * that different fragments will go along one path. --ANK
430 u8 *ptr = skb_network_header(skb) + opt->ra;
431 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
436 * check and decrement ttl
438 if (hdr->hop_limit <= 1) {
439 /* Force OUTPUT device used as source address */
441 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
443 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
449 /* XXX: idev->cnf.proxy_ndp? */
450 if (ipv6_devconf.proxy_ndp &&
451 pneigh_lookup(&nd_tbl, &hdr->daddr, skb->dev, 0)) {
452 int proxied = ip6_forward_proxy_check(skb);
454 return ip6_input(skb);
455 else if (proxied < 0) {
456 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
461 if (!xfrm6_route_forward(skb)) {
462 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
467 /* IPv6 specs say nothing about it, but it is clear that we cannot
468 send redirects to source routed frames.
469 We don't send redirects to frames decapsulated from IPsec.
471 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
473 struct in6_addr *target = NULL;
475 struct neighbour *n = dst->neighbour;
478 * incoming and outgoing devices are the same
/* Redirect to the gateway when the route has one, otherwise to the
 * final destination itself. */
482 rt = (struct rt6_info *) dst;
483 if ((rt->rt6i_flags & RTF_GATEWAY))
484 target = (struct in6_addr*)&n->primary_key;
486 target = &hdr->daddr;
488 /* Limit redirects both by destination (here)
489 and by source (inside ndisc_send_redirect)
491 if (xrlim_allow(dst, 1*HZ))
492 ndisc_send_redirect(skb, n, target);
494 int addrtype = ipv6_addr_type(&hdr->saddr);
496 /* This check is security critical. */
497 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
499 if (addrtype & IPV6_ADDR_LINKLOCAL) {
500 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
501 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
/* Forwarded packets are never fragmented here: signal PMTU instead. */
506 if (skb->len > dst_mtu(dst)) {
507 /* Again, force OUTPUT device used as source address */
509 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
510 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
516 if (skb_cow(skb, dst->dev->hard_header_len)) {
517 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
523 /* Mangling hops number delayed to point after skb COW */
527 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
528 return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
531 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata from @from to @to when
 * producing fragments: packet type, priority, protocol, route, mark,
 * traffic-control index, netfilter trace flag and security mark.
 */
537 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
539 to->pkt_type = from->pkt_type;
540 to->priority = from->priority;
541 to->protocol = from->protocol;
/* Drop any route already attached to @to before taking a reference
 * to @from's route. */
542 dst_release(to->dst);
543 to->dst = dst_clone(from->dst);
545 to->mark = from->mark;
547 #ifdef CONFIG_NET_SCHED
548 to->tc_index = from->tc_index;
551 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
552 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
553 to->nf_trace = from->nf_trace;
555 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - locate the point where the fragment header must be
 * inserted: after any per-fragment extension headers (hop-by-hop, routing,
 * destination options before a routing header).  On return, *nexthdr points
 * at the nexthdr byte that should be overwritten with NEXTHDR_FRAGMENT.
 * Returns the offset (header length to copy into each fragment).
 * NOTE(review): the switch body and loop exit are partly missing from
 * this extract.
 */
558 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
560 u16 offset = sizeof(struct ipv6hdr);
561 struct ipv6_opt_hdr *exthdr =
562 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
563 unsigned int packet_len = skb->tail - skb->network_header;
565 *nexthdr = &ipv6_hdr(skb)->nexthdr;
567 while (offset + 1 <= packet_len) {
573 case NEXTHDR_ROUTING:
577 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* A Home Address option forces fragmentation after this header. */
578 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
588 offset += ipv6_optlen(exthdr);
589 *nexthdr = &exthdr->nexthdr;
590 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
596 EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
/*
 * ip6_fragment - split @skb into MTU-sized fragments and send each via
 * @output.  Two paths: a fast path that reuses an existing frag_list when
 * its geometry already matches, and a slow path that allocates fresh skbs
 * and copies data.  Per-fragment headers (up to the insertion point found
 * by ip6_find_1stfragopt) are replicated into every fragment.
 * NOTE(review): numerous original lines (declarations, gotos, loop
 * structure, kfree_skb/err handling) are missing from this extract.
 */
598 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
600 struct net_device *dev;
601 struct sk_buff *frag;
602 struct rt6_info *rt = (struct rt6_info*)skb->dst;
603 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
604 struct ipv6hdr *tmp_hdr;
606 unsigned int mtu, hlen, left, len;
608 int ptr, offset = 0, err=0;
609 u8 *prevhdr, nexthdr = 0;
612 hlen = ip6_find_1stfragopt(skb, &prevhdr);
615 mtu = ip6_skb_dst_mtu(skb);
617 /* We must not fragment if the socket is set to force MTU discovery
618 * or if the skb is not generated by a local socket. (This last
619 * check should be redundant, but it's free.)
621 if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) {
622 skb->dev = skb->dst->dev;
623 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
624 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
629 if (np && np->frag_size < mtu) {
/* Budget available for payload once unfragmentable headers and the
 * fragment header itself are accounted for. */
633 mtu -= hlen + sizeof(struct frag_hdr);
/* Fast path: the skb already carries a frag_list with usable geometry. */
635 if (skb_shinfo(skb)->frag_list) {
636 int first_len = skb_pagelen(skb);
638 if (first_len - hlen > mtu ||
639 ((first_len - hlen) & 7) ||
643 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
644 /* Correct geometry. */
645 if (frag->len > mtu ||
646 ((frag->len & 7) && frag->next) ||
647 skb_headroom(frag) < hlen)
650 /* Partially cloned skb? */
651 if (skb_shared(frag))
658 frag->destructor = sock_wfree;
659 skb->truesize -= frag->truesize;
665 frag = skb_shinfo(skb)->frag_list;
666 skb_shinfo(skb)->frag_list = NULL;
/* Insert the fragment header into the first fragment. */
669 *prevhdr = NEXTHDR_FRAGMENT;
670 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
672 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
676 __skb_pull(skb, hlen);
677 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
678 __skb_push(skb, hlen);
679 skb_reset_network_header(skb);
680 memcpy(skb_network_header(skb), tmp_hdr, hlen);
682 ipv6_select_ident(skb, fh);
683 fh->nexthdr = nexthdr;
685 fh->frag_off = htons(IP6_MF);
686 frag_id = fh->identification;
688 first_len = skb_pagelen(skb);
689 skb->data_len = first_len - skb_headlen(skb);
690 skb->len = first_len;
691 ipv6_hdr(skb)->payload_len = htons(first_len -
692 sizeof(struct ipv6hdr));
694 dst_hold(&rt->u.dst);
697 /* Prepare header of the next frame,
698 * before previous one went down. */
700 frag->ip_summed = CHECKSUM_NONE;
701 skb_reset_transport_header(frag);
702 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
703 __skb_push(frag, hlen);
704 skb_reset_network_header(frag);
705 memcpy(skb_network_header(frag), tmp_hdr,
707 offset += skb->len - hlen - sizeof(struct frag_hdr);
708 fh->nexthdr = nexthdr;
710 fh->frag_off = htons(offset);
711 if (frag->next != NULL)
712 fh->frag_off |= htons(IP6_MF);
713 fh->identification = frag_id;
714 ipv6_hdr(frag)->payload_len =
716 sizeof(struct ipv6hdr));
717 ip6_copy_metadata(frag, skb);
722 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
735 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
736 dst_release(&rt->u.dst);
746 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
747 dst_release(&rt->u.dst);
/* Slow path: allocate new skbs and copy data block by block. */
752 left = skb->len - hlen; /* Space per frame */
753 ptr = hlen; /* Where to start from */
756 * Fragment the datagram.
759 *prevhdr = NEXTHDR_FRAGMENT;
762 * Keep copying data until we run out.
766 /* IF: it doesn't fit, use 'mtu' - the data space left */
769 /* IF: we are not sending upto and including the packet end
770 then align the next start on an eight byte boundary */
778 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
779 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
780 IP6_INC_STATS(ip6_dst_idev(skb->dst),
781 IPSTATS_MIB_FRAGFAILS);
787 * Set up data on packet
790 ip6_copy_metadata(frag, skb);
791 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
792 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
793 skb_reset_network_header(frag);
794 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
795 frag->transport_header = (frag->network_header + hlen +
796 sizeof(struct frag_hdr));
799 * Charge the memory for the fragment to any owner
803 skb_set_owner_w(frag, skb->sk);
806 * Copy the packet header into the new buffer.
808 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
811 * Build fragment header.
813 fh->nexthdr = nexthdr;
/* Identification chosen once (first fragment) and reused afterwards. */
816 ipv6_select_ident(skb, fh);
817 frag_id = fh->identification;
819 fh->identification = frag_id;
822 * Copy a block of the IP datagram.
824 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
828 fh->frag_off = htons(offset);
830 fh->frag_off |= htons(IP6_MF);
831 ipv6_hdr(frag)->payload_len = htons(frag->len -
832 sizeof(struct ipv6hdr));
838 * Put this fragment into the sending queue.
844 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
846 IP6_INC_STATS(ip6_dst_idev(skb->dst),
847 IPSTATS_MIB_FRAGOKS);
852 IP6_INC_STATS(ip6_dst_idev(skb->dst),
853 IPSTATS_MIB_FRAGFAILS);
858 static inline int ip6_rt_check(struct rt6key *rt_key,
859 struct in6_addr *fl_addr,
860 struct in6_addr *addr_cache)
862 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
863 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * ip6_sk_dst_check - validate a dst cached on the socket against flow @fl.
 * Releases and NULLs the dst when the destination (and, with subtrees,
 * the source) no longer matches, or when the flow's oif differs from the
 * route's device.
 * NOTE(review): the NULL-dst early return and the release/return lines
 * are missing from this extract.
 */
866 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
867 struct dst_entry *dst,
870 struct ipv6_pinfo *np = inet6_sk(sk);
871 struct rt6_info *rt = (struct rt6_info *)dst;
876 /* Yes, checking route validity in not connected
877 * case is not very simple. Take into account,
878 * that we do not support routing by source, TOS,
879 * and MSG_DONTROUTE --ANK (980726)
881 * 1. ip6_rt_check(): If route was host route,
882 * check that cached destination is current.
883 * If it is network route, we still may
884 * check its validity using saved pointer
885 * to the last used address: daddr_cache.
886 * We do not want to save whole address now,
887 * (because main consumer of this service
888 * is tcp, which has not this problem),
889 * so that the last trick works only on connected
891 * 2. oif also should be the same.
893 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
894 #ifdef CONFIG_IPV6_SUBTREES
895 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
897 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - resolve a route for @fl into *dst, selecting a
 * source address when the flow has none.  With optimistic DAD, a dst whose
 * neighbour is not yet valid and whose source is an optimistic address is
 * replaced by the route to the default router.
 * NOTE(review): several lines (ifp release, redirect branch structure,
 * the success return, out_err_release label body) are missing from this
 * extract.
 */
906 static int ip6_dst_lookup_tail(struct sock *sk,
907 struct dst_entry **dst, struct flowi *fl)
912 *dst = ip6_route_output(sk, fl);
914 if ((err = (*dst)->error))
915 goto out_err_release;
917 if (ipv6_addr_any(&fl->fl6_src)) {
918 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
920 goto out_err_release;
923 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
925 * Here if the dst entry we've looked up
926 * has a neighbour entry that is in the INCOMPLETE
927 * state and the src address from the flow is
928 * marked as OPTIMISTIC, we release the found
929 * dst entry and replace it instead with the
930 * dst entry of the nexthop router
932 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
933 struct inet6_ifaddr *ifp;
937 ifp = ipv6_get_ifaddr(&fl->fl6_src, (*dst)->dev, 1);
939 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
945 * We need to get the dst entry for the
946 * default router instead
949 memcpy(&fl_gw, fl, sizeof(struct flowi));
950 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
951 *dst = ip6_route_output(sk, &fl_gw);
952 if ((err = (*dst)->error))
953 goto out_err_release;
961 if (err == -ENETUNREACH)
962 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
969 * ip6_dst_lookup - perform route lookup on flow
970 * @sk: socket which provides route info
971 * @dst: pointer to dst_entry * for result
972 * @fl: flow to lookup
974 * This function performs a route lookup on the given flow.
976 * It returns zero on success, or a standard errno code on error.
978 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
981 return ip6_dst_lookup_tail(sk, dst, fl);
983 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
986 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
987 * @sk: socket which provides the dst cache and route info
988 * @dst: pointer to dst_entry * for result
989 * @fl: flow to lookup
991 * This function performs a route lookup on the given flow with the
992 * possibility of using the cached route in the socket if it is valid.
993 * It will take the socket dst lock when operating on the dst cache.
994 * As a result, this function can only be used in process context.
996 * It returns zero on success, or a standard errno code on error.
998 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
/* Try the socket's cached dst first; ip6_sk_dst_check() drops it if it
 * no longer matches the flow, in which case the tail lookup recomputes. */
1002 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1003 *dst = ip6_sk_dst_check(sk, *dst, fl);
1006 return ip6_dst_lookup_tail(sk, dst, fl);
1008 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * ip6_ufo_append_data - UDP fragmentation offload path for ip6_append_data.
 * Builds (or extends) one large skb whose payload lives in page fragments;
 * gso_size tells the device how to slice it, and the fragment ID is chosen
 * here so all on-wire fragments share it.
 * NOTE(review): error-path lines and the function's returns are missing
 * from this extract.
 */
1010 static inline int ip6_ufo_append_data(struct sock *sk,
1011 int getfrag(void *from, char *to, int offset, int len,
1012 int odd, struct sk_buff *skb),
1013 void *from, int length, int hh_len, int fragheaderlen,
1014 int transhdrlen, int mtu,unsigned int flags)
1017 struct sk_buff *skb;
1020 /* There is support for UDP large send offload by network
1021 * device, so create one single skb packet containing complete
1024 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1025 skb = sock_alloc_send_skb(sk,
1026 hh_len + fragheaderlen + transhdrlen + 20,
1027 (flags & MSG_DONTWAIT), &err);
1031 /* reserve space for Hardware header */
1032 skb_reserve(skb, hh_len);
1034 /* create space for UDP/IP header */
1035 skb_put(skb,fragheaderlen + transhdrlen);
1037 /* initialize network header pointer */
1038 skb_reset_network_header(skb);
1040 /* initialize protocol header pointer */
1041 skb->transport_header = skb->network_header + fragheaderlen;
1043 skb->ip_summed = CHECKSUM_PARTIAL;
1045 sk->sk_sndmsg_off = 0;
/* Payload beyond the transport header goes into page frags. */
1048 err = skb_append_datato_frags(sk,skb, getfrag, from,
1049 (length - transhdrlen));
1051 struct frag_hdr fhdr;
1053 /* specify the length of each IP datagram fragment*/
1054 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1055 sizeof(struct frag_hdr);
1056 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1057 ipv6_select_ident(skb, &fhdr);
1058 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1059 __skb_queue_tail(&sk->sk_write_queue, skb);
1063 /* There is not enough support do UPD LSO,
1064 * so follow normal path
/*
 * ip6_append_data - append user data to the socket's pending (corked)
 * output queue, building MTU-sized skbs as needed.  First call on an empty
 * queue snapshots options/flow/route/MTU into the cork; subsequent calls
 * reuse them.  Data is pulled via @getfrag.  UFO-capable devices take the
 * ip6_ufo_append_data() shortcut for oversized UDP.
 * NOTE(review): this extract omits many original lines (declarations,
 * braces, error labels, checksum/fraggap details); comments below only
 * describe visible code.
 */
1071 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1072 int offset, int len, int odd, struct sk_buff *skb),
1073 void *from, int length, int transhdrlen,
1074 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1075 struct rt6_info *rt, unsigned int flags)
1077 struct inet_sock *inet = inet_sk(sk);
1078 struct ipv6_pinfo *np = inet6_sk(sk);
1079 struct sk_buff *skb;
1080 unsigned int maxfraglen, fragheaderlen;
1087 int csummode = CHECKSUM_NONE;
1089 if (flags&MSG_PROBE)
/* First append on an empty queue: set up the cork state. */
1091 if (skb_queue_empty(&sk->sk_write_queue)) {
1096 if (np->cork.opt == NULL) {
1097 np->cork.opt = kmalloc(opt->tot_len,
1099 if (unlikely(np->cork.opt == NULL))
1101 } else if (np->cork.opt->tot_len < opt->tot_len) {
1102 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1105 memcpy(np->cork.opt, opt, opt->tot_len);
1106 inet->cork.flags |= IPCORK_OPT;
1107 /* need source address above miyazawa*/
1109 dst_hold(&rt->u.dst);
1111 inet->cork.fl = *fl;
1112 np->cork.hop_limit = hlimit;
1113 np->cork.tclass = tclass;
1114 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1115 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1116 if (np->frag_size < mtu) {
1118 mtu = np->frag_size;
1120 inet->cork.fragsize = mtu;
1121 if (dst_allfrag(rt->u.dst.path))
1122 inet->cork.flags |= IPCORK_ALLFRAG;
1123 inet->cork.length = 0;
1124 sk->sk_sndmsg_page = NULL;
1125 sk->sk_sndmsg_off = 0;
1126 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1128 length += exthdrlen;
1129 transhdrlen += exthdrlen;
/* Queue not empty: reuse the corked flow/options/MTU. */
1132 fl = &inet->cork.fl;
1133 if (inet->cork.flags & IPCORK_OPT)
1137 mtu = inet->cork.fragsize;
1140 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1142 fragheaderlen = sizeof(struct ipv6hdr) + rt->nfheader_len +
1143 (opt ? opt->opt_nflen : 0);
1144 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* Enforce the IPv6 maximum payload limit for this datagram. */
1146 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1147 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1148 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1154 * Let's try using as much space as possible.
1155 * Use MTU if total length of the message fits into the MTU.
1156 * Otherwise, we need to reserve fragment header and
1157 * fragment alignment (= 8-15 octets, in total).
1159 * Note that we may need to "move" the data from the tail of
1160 * of the buffer to the new fragment when we split
1163 * FIXME: It may be fragmented into multiple chunks
1164 * at once if non-fragmentable extension headers
1169 inet->cork.length += length;
/* Oversized UDP on a UFO-capable device: offload fragmentation. */
1170 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1171 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1173 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1174 fragheaderlen, transhdrlen, mtu,
1181 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1184 while (length > 0) {
1185 /* Check if the remaining data fits into current packet. */
1186 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1188 copy = maxfraglen - skb->len;
1192 unsigned int datalen;
1193 unsigned int fraglen;
1194 unsigned int fraggap;
1195 unsigned int alloclen;
1196 struct sk_buff *skb_prev;
1200 /* There's no room in the current skb */
1202 fraggap = skb_prev->len - maxfraglen;
1207 * If remaining data exceeds the mtu,
1208 * we know we need more fragment(s).
1210 datalen = length + fraggap;
1211 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1212 datalen = maxfraglen - fragheaderlen;
1214 fraglen = datalen + fragheaderlen;
1215 if ((flags & MSG_MORE) &&
1216 !(rt->u.dst.dev->features&NETIF_F_SG))
1219 alloclen = datalen + fragheaderlen;
1222 * The last fragment gets additional space at tail.
1223 * Note: we overallocate on fragments with MSG_MORE
1224 * because we have no idea if we're the last one.
1226 if (datalen == length + fraggap)
1227 alloclen += rt->u.dst.trailer_len;
1230 * We just reserve space for fragment header.
1231 * Note: this may be overallocation if the message
1232 * (without MSG_MORE) fits into the MTU.
1234 alloclen += sizeof(struct frag_hdr);
1237 skb = sock_alloc_send_skb(sk,
1239 (flags & MSG_DONTWAIT), &err);
1242 if (atomic_read(&sk->sk_wmem_alloc) <=
1244 skb = sock_wmalloc(sk,
1245 alloclen + hh_len, 1,
1247 if (unlikely(skb == NULL))
1253 * Fill in the control structures
1255 skb->ip_summed = csummode;
1257 /* reserve for fragmentation */
1258 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1261 * Find where to start putting bytes
1263 data = skb_put(skb, fraglen);
1264 skb_set_network_header(skb, exthdrlen);
1265 data += fragheaderlen;
1266 skb->transport_header = (skb->network_header +
/* Move the tail of the previous skb (fraggap) into this one so the
 * previous fragment ends on an 8-byte boundary. */
1269 skb->csum = skb_copy_and_csum_bits(
1270 skb_prev, maxfraglen,
1271 data + transhdrlen, fraggap, 0);
1272 skb_prev->csum = csum_sub(skb_prev->csum,
1275 pskb_trim_unique(skb_prev, maxfraglen);
1277 copy = datalen - transhdrlen - fraggap;
1282 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1289 length -= datalen - fraggap;
/* Only the first skb can claim hardware checksum coverage. */
1292 csummode = CHECKSUM_NONE;
1295 * Put the packet on the pending queue
1297 __skb_queue_tail(&sk->sk_write_queue, skb);
/* Non-SG device: copy into the linear area of the current skb. */
1304 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1308 if (getfrag(from, skb_put(skb, copy),
1309 offset, copy, off, skb) < 0) {
1310 __skb_trim(skb, off);
/* SG device: append into page fragments, reusing the socket's
 * current sndmsg page when it still has room. */
1315 int i = skb_shinfo(skb)->nr_frags;
1316 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1317 struct page *page = sk->sk_sndmsg_page;
1318 int off = sk->sk_sndmsg_off;
1321 if (page && (left = PAGE_SIZE - off) > 0) {
1324 if (page != frag->page) {
1325 if (i == MAX_SKB_FRAGS) {
1330 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1331 frag = &skb_shinfo(skb)->frags[i];
1333 } else if(i < MAX_SKB_FRAGS) {
1334 if (copy > PAGE_SIZE)
1336 page = alloc_pages(sk->sk_allocation, 0);
1341 sk->sk_sndmsg_page = page;
1342 sk->sk_sndmsg_off = 0;
1344 skb_fill_page_desc(skb, i, page, 0, 0);
1345 frag = &skb_shinfo(skb)->frags[i];
1350 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1354 sk->sk_sndmsg_off += copy;
1357 skb->data_len += copy;
1358 skb->truesize += copy;
1359 atomic_add(copy, &sk->sk_wmem_alloc);
/* Error path: undo the length accounting done before the loop. */
1366 inet->cork.length -= length;
1367 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1371 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1373 inet->cork.flags &= ~IPCORK_OPT;
1374 kfree(np->cork.opt);
1375 np->cork.opt = NULL;
1377 dst_release(&np->cork.rt->u.dst);
1379 inet->cork.flags &= ~IPCORK_ALLFRAG;
1381 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - coalesce the socket's pending queue into one
 * skb (extra skbs become the frag_list), prepend extension headers and the
 * IPv6 header from the cork state, and send it via ip6_local_out().
 * NOTE(review): some original lines (error label, ICMP stats braces,
 * final return, cork release on error) are missing from this extract.
 */
1384 int ip6_push_pending_frames(struct sock *sk)
1386 struct sk_buff *skb, *tmp_skb;
1387 struct sk_buff **tail_skb;
1388 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1389 struct inet_sock *inet = inet_sk(sk);
1390 struct ipv6_pinfo *np = inet6_sk(sk);
1391 struct ipv6hdr *hdr;
1392 struct ipv6_txoptions *opt = np->cork.opt;
1393 struct rt6_info *rt = np->cork.rt;
1394 struct flowi *fl = &inet->cork.fl;
1395 unsigned char proto = fl->proto;
1398 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1400 tail_skb = &(skb_shinfo(skb)->frag_list);
1402 /* move skb->data to ip header from ext header */
1403 if (skb->data < skb_network_header(skb))
1404 __skb_pull(skb, skb_network_offset(skb));
/* Chain the remaining queued skbs onto the first one's frag_list,
 * transferring their byte counts and dropping their socket refs. */
1405 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1406 __skb_pull(tmp_skb, skb_network_header_len(skb));
1407 *tail_skb = tmp_skb;
1408 tail_skb = &(tmp_skb->next);
1409 skb->len += tmp_skb->len;
1410 skb->data_len += tmp_skb->len;
1411 skb->truesize += tmp_skb->truesize;
1412 __sock_put(tmp_skb->sk);
1413 tmp_skb->destructor = NULL;
1417 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1418 __skb_pull(skb, skb_network_header_len(skb));
1419 if (opt && opt->opt_flen)
1420 ipv6_push_frag_opts(skb, opt, &proto);
1421 if (opt && opt->opt_nflen)
1422 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1424 skb_push(skb, sizeof(struct ipv6hdr));
1425 skb_reset_network_header(skb);
1426 hdr = ipv6_hdr(skb);
/* Version 6, corked traffic class, flow label from the corked flow. */
1428 *(__be32*)hdr = fl->fl6_flowlabel |
1429 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1431 hdr->hop_limit = np->cork.hop_limit;
1432 hdr->nexthdr = proto;
1433 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1434 ipv6_addr_copy(&hdr->daddr, final_dst);
1436 skb->priority = sk->sk_priority;
1438 skb->dst = dst_clone(&rt->u.dst);
1439 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1440 if (proto == IPPROTO_ICMPV6) {
1441 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1443 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1444 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1447 err = ip6_local_out(skb);
/* Positive netfilter/xmit codes are not errors for the caller. */
1450 err = np->recverr ? net_xmit_errno(err) : 0;
1456 ip6_cork_release(inet, np);
1462 void ip6_flush_pending_frames(struct sock *sk)
1464 struct sk_buff *skb;
1466 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1468 IP6_INC_STATS(ip6_dst_idev(skb->dst),
1469 IPSTATS_MIB_OUTDISCARDS);
1473 ip6_cork_release(inet_sk(sk), inet6_sk(sk));