2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
35 #include <net/protocol.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
55 1. The most important issue is detecting local dead loops.
56 They would cause complete host lockup in transmit, which
57 would be "resolved" by stack overflow or, if queueing is enabled,
58 with infinite looping in net_bh.
60 We cannot track such dead loops during route installation,
61 it is infeasible task. The most general solutions would be
62 to keep skb->encapsulation counter (sort of local ttl),
63 and silently drop packet when it expires. It is the best
64 solution, but it supposes maintaining a new variable in ALL
65 skb, even if no tunneling is used.
67 Current solution: t->recursion lock breaks dead loops. It looks
68 like dev->tbusy flag, but I preferred new variable, because
69 the semantics is different. One day, when hard_start_xmit
70 will be multithreaded we will have to use skb->encapsulation.
74 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case,
76 if we copy it from packet being encapsulated to upper header.
77 It is very good solution, but it introduces two problems:
79 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80 do not work over tunnels.
81 - traceroute does not work. I planned to relay ICMP from tunnel,
82 so that this problem would be solved and traceroute output
83 would even more informative. This idea appeared to be wrong:
84 only Linux complies to rfc1812 now (yes, guys, Linux is the only
85 true router now :-)), all routers (at least, in neighbourhood of mine)
86 return only 8 bytes of payload. It is the end.
88 Hence, if we want that OSPF worked or traceroute said something reasonable,
89 we should search for another solution.
91 One of them is to parse packet trying to detect inner encapsulation
92 made by our node. It is difficult or even impossible, especially,
93 taking into account fragmentation. To be short, it is not a solution at all.
95 Current solution: The solution was UNEXPECTEDLY SIMPLE.
96 We force DF flag on tunnels with preconfigured hop limit,
97 that is ALL. :-) Well, it does not remove the problem completely,
98 but exponential growth of network traffic is changed to linear
99 (branches, that exceed pmtu are pruned) and tunnel mtu
100 quickly degrades to value <68, where looping stops.
101 Yes, it is not good if there exists a router in the loop,
102 which does not force DF, even when encapsulating packets have DF set.
103 But it is not our problem! Nobody could accuse us, we made
104 all that we could make. Even if it is your gated who injected
105 fatal route to network, even if it were you who configured
106 fatal static route: you are innocent. :-)
110 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111 practically identical code. It would be good to glue them
112 together, but it is not very evident, how to make them modular.
113 sit is integral part of IPv6, ipip and gre are naturally modular.
114 We could extract common parts (hash table, ioctl etc)
115 to a separate module (ip_tunnel.c).
/* Forward declarations for the per-device init/setup callbacks. */
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
123 /* Fallback tunnel: no source, no destination, no key, no options */
125 static int ipgre_fb_tunnel_init(struct net_device *dev);
/* Per-network-namespace id, used with net_generic() to find struct ipgre_net. */
127 static int ipgre_net_id;
/* NOTE(review): in context this pointer is presumably a member of
 * struct ipgre_net (its enclosing declaration is not visible in this
 * extraction) -- confirm against the full file. */
129 struct net_device *fb_tunnel_dev;
132 /* Tunnel hash table */
142 We require exact key match i.e. if a key is present in packet
143 it will match only tunnel with the same key; if it is not present,
144 it will match only keyless tunnel.
146 All keyless packets, if not matched configured keyless tunnels
147 will match fallback tunnel.
/* 4-bit hash: fold the low byte's two nibbles of the address/key. */
151 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
/* Four specificity classes (see aliases below), HASH_SIZE buckets each. */
153 static struct ip_tunnel *tunnels[4][HASH_SIZE];
/* Most specific first: remote+local, remote only, local only, wildcard. */
155 #define tunnels_r_l (tunnels[3])
156 #define tunnels_r (tunnels[2])
157 #define tunnels_l (tunnels[1])
158 #define tunnels_wc (tunnels[0])
/* Guards hash-chain mutation; receive/error paths take it for reading. */
160 static DEFINE_RWLOCK(ipgre_lock);
162 /* Given src, dst and key, find appropriate for input tunnel. */
/*
 * Walk the four hash classes from most specific (remote+local) to the
 * wildcard chain; a candidate matches only when its i_key equals the
 * packet key and its device is up.  Falls back to the per-netns gre0
 * device when nothing configured matches.
 * NOTE(review): this extraction is missing lines -- the braces and
 * returns do not balance; the dropped lines are presumably the
 * "return t;" statements inside each loop.
 */
164 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
165 __be32 remote, __be32 local, __be32 key)
167 unsigned h0 = HASH(remote);
168 unsigned h1 = HASH(key);
170 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* 1: exact remote + local match */
172 for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
173 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
174 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
/* 2: remote-only match */
178 for (t = tunnels_r[h0^h1]; t; t = t->next) {
179 if (remote == t->parms.iph.daddr) {
180 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
/* 3: local-only match (a multicast destination also counts as local) */
184 for (t = tunnels_l[h1]; t; t = t->next) {
185 if (local == t->parms.iph.saddr ||
186 (local == t->parms.iph.daddr &&
187 ipv4_is_multicast(local))) {
188 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
/* 4: wildcard chain, keyed by packet key only */
192 for (t = tunnels_wc[h1]; t; t = t->next) {
193 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
/* Nothing matched: hand the packet to the fallback device if it is up. */
197 if (ign->fb_tunnel_dev->flags&IFF_UP)
198 return netdev_priv(ign->fb_tunnel_dev);
/*
 * Select the hash bucket for a tunnel's parameters: the class is chosen
 * by whether a unicast remote and/or a local address is configured, the
 * bucket within the class by the key hash.
 * NOTE(review): lines are missing from this extraction -- the prio
 * computation for the local/wildcard cases is not visible.
 */
202 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
203 struct ip_tunnel_parm *parms)
205 __be32 remote = parms->iph.daddr;
206 __be32 local = parms->iph.saddr;
207 __be32 key = parms->i_key;
208 unsigned h = HASH(key);
/* A unicast remote address selects the more specific classes. */
213 if (remote && !ipv4_is_multicast(remote)) {
218 return &tunnels[prio][h];
/* Convenience wrapper: bucket for an existing tunnel's own parms. */
221 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
224 return __ipgre_bucket(ign, &t->parms);
/* Insert t at the head of its hash chain, under the write lock. */
227 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
229 struct ip_tunnel **tp = ipgre_bucket(ign, t);
232 write_lock_bh(&ipgre_lock);
234 write_unlock_bh(&ipgre_lock);
/* Find t on its hash chain and unlink it, under the write lock. */
237 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
239 struct ip_tunnel **tp;
241 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
243 write_lock_bh(&ipgre_lock);
245 write_unlock_bh(&ipgre_lock);
/*
 * Look up a tunnel by exact saddr/daddr/key; when not found and
 * create != 0, allocate a new netdev (named from parms->name, or via
 * the "gre%d" template), register it and link it into the hash table.
 * NOTE(review): lines are missing from this extraction (returns,
 * error-unwind labels and braces are not all visible).
 */
251 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
252 struct ip_tunnel_parm *parms, int create)
254 __be32 remote = parms->iph.daddr;
255 __be32 local = parms->iph.saddr;
256 __be32 key = parms->i_key;
257 struct ip_tunnel *t, **tp, *nt;
258 struct net_device *dev;
260 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* An existing tunnel with the same addresses and key wins. */
262 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
263 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
264 if (key == t->parms.i_key)
272 strlcpy(name, parms->name, IFNAMSIZ);
/* No name supplied: let the core pick greN through the %d template. */
274 sprintf(name, "gre%%d");
276 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
280 if (strchr(name, '%')) {
281 if (dev_alloc_name(dev, name) < 0)
285 dev->init = ipgre_tunnel_init;
286 nt = netdev_priv(dev);
289 if (register_netdevice(dev) < 0)
293 ipgre_tunnel_link(ign, nt);
/* netdev uninit hook: remove the tunnel from its namespace's hash table. */
301 static void ipgre_tunnel_uninit(struct net_device *dev)
303 struct net *net = dev_net(dev);
304 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
306 ipgre_tunnel_unlink(ign, netdev_priv(dev));
/*
 * ICMP error handler for GRE: an ICMP error arrived quoting one of our
 * encapsulated packets.  Two implementations are selected by the
 * I_WISH_WORLD_WERE_PERFECT ifdef: the realistic one (8-byte payload
 * quotes only; just update the tunnel's err_count/err_time) and the
 * idealistic one (reconstruct the inner packet and relay the error).
 * NOTE(review): lines are missing from this extraction throughout
 * (flag reads, gotos, braces); treat control flow as indicative only.
 */
311 static void ipgre_err(struct sk_buff *skb, u32 info)
313 #ifndef I_WISH_WORLD_WERE_PERFECT
315 /* It is not :-( All the routers (except for Linux) return only
316 8 bytes of packet payload. It means, that precise relaying of
317 ICMP in the real Internet is absolutely infeasible.
319 Moreover, Cisco "wise men" put GRE key to the third word
320 in GRE header. It makes impossible maintaining even soft state for keyed
321 GRE tunnels with enabled checksum. Tell them "thank you".
323 Well, I wonder, rfc1812 was written by Cisco employee,
324 what the hell these idiots break standards established
/* iph/p point into the quoted (our outer) header inside the ICMP payload. */
328 struct iphdr *iph = (struct iphdr*)skb->data;
329 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
330 int grehlen = (iph->ihl<<2) + 4;
331 const int type = icmp_hdr(skb)->type;
332 const int code = icmp_hdr(skb)->code;
/* Any optional GRE field extends the quoted header we need to see. */
337 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
338 if (flags&(GRE_VERSION|GRE_ROUTING))
347 /* If only 8 bytes returned, keyed message will be dropped here */
348 if (skb_headlen(skb) < grehlen)
353 case ICMP_PARAMETERPROB:
356 case ICMP_DEST_UNREACH:
359 case ICMP_PORT_UNREACH:
360 /* Impossible event. */
362 case ICMP_FRAG_NEEDED:
363 /* Soft state for pmtu is maintained by IP core. */
366 /* All others are translated to HOST_UNREACH.
367 rfc2003 contains "deep thoughts" about NET_UNREACH,
368 I believe they are just ether pollution. --ANK
373 case ICMP_TIME_EXCEEDED:
374 if (code != ICMP_EXC_TTL)
/* Look up the tunnel by the quoted outer header (note the swapped
 * daddr/saddr: the quote is our transmitted packet). */
379 read_lock(&ipgre_lock);
380 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
382 *(((__be32*)p) + (grehlen>>2) - 1) : 0);
383 if (t == NULL || t->parms.iph.daddr == 0 ||
384 ipv4_is_multicast(t->parms.iph.daddr))
387 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
/* Rate-limit: only record a new error after IPTUNNEL_ERR_TIMEO. */
390 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
394 t->err_time = jiffies;
396 read_unlock(&ipgre_lock);
/* ---- I_WISH_WORLD_WERE_PERFECT variant below: full relaying ---- */
399 struct iphdr *iph = (struct iphdr*)dp;
401 __be16 *p = (__be16*)(dp+(iph->ihl<<2));
402 const int type = icmp_hdr(skb)->type;
403 const int code = icmp_hdr(skb)->code;
409 int grehlen = (iph->ihl<<2) + 4;
410 struct sk_buff *skb2;
/* Only IPv4-in-GRE errors can be relayed this way. */
414 if (p[1] != htons(ETH_P_IP))
418 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
419 if (flags&(GRE_VERSION|GRE_ROUTING))
428 if (len < grehlen + sizeof(struct iphdr))
/* eiph: the encapsulated (inner) IP header inside the quote. */
430 eiph = (struct iphdr*)(dp + grehlen);
435 case ICMP_PARAMETERPROB:
436 n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
437 if (n < (iph->ihl<<2))
440 /* So... This guy found something strange INSIDE encapsulated
441 packet. Well, he is fool, but what can we do ?
443 rel_type = ICMP_PARAMETERPROB;
445 rel_info = htonl(n << 24);
448 case ICMP_DEST_UNREACH:
451 case ICMP_PORT_UNREACH:
452 /* Impossible event. */
454 case ICMP_FRAG_NEEDED:
455 /* And it is the only really necessary thing :-) */
456 n = ntohs(icmp_hdr(skb)->un.frag.mtu);
460 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
461 if (n > ntohs(eiph->tot_len))
466 /* All others are translated to HOST_UNREACH.
467 rfc2003 contains "deep thoughts" about NET_UNREACH,
468 I believe, it is just ether pollution. --ANK
470 rel_type = ICMP_DEST_UNREACH;
471 rel_code = ICMP_HOST_UNREACH;
475 case ICMP_TIME_EXCEEDED:
476 if (code != ICMP_EXC_TTL)
481 /* Prepare fake skb to feed it to icmp_send */
482 skb2 = skb_clone(skb, GFP_ATOMIC);
485 dst_release(skb2->dst);
/* Rewind skb2 so its data starts at the inner IP header. */
487 skb_pull(skb2, skb->data - (u8*)eiph);
488 skb_reset_network_header(skb2);
490 /* Try to guess incoming interface */
491 memset(&fl, 0, sizeof(fl));
492 fl.fl4_dst = eiph->saddr;
493 fl.fl4_tos = RT_TOS(eiph->tos);
494 fl.proto = IPPROTO_GRE;
495 if (ip_route_output_key(&init_net, &rt, &fl)) {
499 skb2->dev = rt->u.dst.dev;
501 /* route "incoming" packet */
502 if (rt->rt_flags&RTCF_LOCAL) {
505 fl.fl4_dst = eiph->daddr;
506 fl.fl4_src = eiph->saddr;
507 fl.fl4_tos = eiph->tos;
508 if (ip_route_output_key(&init_net, &rt, &fl) ||
509 rt->u.dst.dev->type != ARPHRD_IPGRE) {
516 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
517 skb2->dst->dev->type != ARPHRD_IPGRE) {
523 /* change mtu on this route */
524 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
525 if (n > dst_mtu(skb2->dst)) {
529 skb2->dst->ops->update_pmtu(skb2->dst, n);
530 } else if (type == ICMP_TIME_EXCEEDED) {
531 struct ip_tunnel *t = netdev_priv(skb2->dev);
532 if (t->parms.iph.ttl) {
533 rel_type = ICMP_DEST_UNREACH;
534 rel_code = ICMP_HOST_UNREACH;
/* Finally relay the (possibly translated) error to the inner sender. */
538 icmp_send(skb2, rel_type, rel_code, rel_info);
/*
 * Propagate an outer-header CE (congestion experienced) mark onto the
 * decapsulated inner packet, for both IPv4 and IPv6 payloads.
 */
543 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
545 if (INET_ECN_is_ce(iph->tos)) {
546 if (skb->protocol == htons(ETH_P_IP)) {
547 IP_ECN_set_ce(ip_hdr(skb));
548 } else if (skb->protocol == htons(ETH_P_IPV6)) {
549 IP6_ECN_set_ce(ipv6_hdr(skb));
/*
 * Combine the configured outer tos with the inner packet's DS field to
 * produce the outer header's ECN bits (via INET_ECN_encapsulate).
 * NOTE(review): the return-type line of this function is not visible
 * in this extraction.
 */
555 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
558 if (skb->protocol == htons(ETH_P_IP))
559 inner = old_iph->tos;
560 else if (skb->protocol == htons(ETH_P_IPV6))
561 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
562 return INET_ECN_encapsulate(tos, inner);
/*
 * Receive handler for IPPROTO_GRE: validate GRE flags, verify the
 * checksum/key/sequence number where configured, look up the owning
 * tunnel and re-inject the inner packet; unmatched packets are answered
 * with ICMP port-unreachable.
 * NOTE(review): lines are missing from this extraction (labels, returns
 * and braces do not balance as shown).
 */
565 static int ipgre_rcv(struct sk_buff *skb)
573 struct ip_tunnel *tunnel;
/* Need at least the base GRE header plus room for optional fields. */
576 if (!pskb_may_pull(skb, 16))
583 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
584 /* - Version must be 0.
585 - We do not support routing headers.
587 if (flags&(GRE_VERSION|GRE_ROUTING))
590 if (flags&GRE_CSUM) {
591 switch (skb->ip_summed) {
592 case CHECKSUM_COMPLETE:
593 csum = csum_fold(skb->csum);
/* No usable hardware checksum: compute it in software now. */
599 csum = __skb_checksum_complete(skb);
600 skb->ip_summed = CHECKSUM_COMPLETE;
/* Optional key and sequence number follow the base header. */
605 key = *(__be32*)(h + offset);
609 seqno = ntohl(*(__be32*)(h + offset));
614 read_lock(&ipgre_lock);
615 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
616 iph->saddr, iph->daddr, key)) != NULL) {
/* The GRE header's third 16-bit word carries the inner protocol. */
619 skb->protocol = *(__be16*)(h + 2);
620 /* WCCP version 1 and 2 protocol decoding.
621 * - Change protocol to IP
622 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
625 skb->protocol == htons(ETH_P_WCCP)) {
626 skb->protocol = htons(ETH_P_IP);
627 if ((*(h + offset) & 0xF0) != 0x40)
/* Strip the outer headers and reset skb metadata for the inner packet. */
631 skb->mac_header = skb->network_header;
632 __pskb_pull(skb, offset);
633 skb_reset_network_header(skb);
634 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
635 skb->pkt_type = PACKET_HOST;
636 #ifdef CONFIG_NET_IPGRE_BROADCAST
637 if (ipv4_is_multicast(iph->daddr)) {
638 /* Looped back packet, drop it! */
639 if (skb->rtable->fl.iif == 0)
641 tunnel->stat.multicast++;
642 skb->pkt_type = PACKET_BROADCAST;
/* Checksum mismatch, or checksum present when the tunnel forbids it. */
646 if (((flags&GRE_CSUM) && csum) ||
647 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
648 tunnel->stat.rx_crc_errors++;
649 tunnel->stat.rx_errors++;
/* Sequence enforcement: drop packets that arrive out of order. */
652 if (tunnel->parms.i_flags&GRE_SEQ) {
653 if (!(flags&GRE_SEQ) ||
654 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
655 tunnel->stat.rx_fifo_errors++;
656 tunnel->stat.rx_errors++;
659 tunnel->i_seqno = seqno + 1;
/* Account the packet to the tunnel device and hand it back up. */
661 tunnel->stat.rx_packets++;
662 tunnel->stat.rx_bytes += skb->len;
663 skb->dev = tunnel->dev;
664 dst_release(skb->dst);
667 ipgre_ecn_decapsulate(iph, skb);
669 read_unlock(&ipgre_lock);
/* No tunnel matched: report the GRE "port" as unreachable. */
672 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
675 read_unlock(&ipgre_lock);
/*
 * hard_start_xmit: encapsulate skb in an IPv4+GRE header and transmit
 * it along the route to the tunnel endpoint.  tunnel->recursion breaks
 * local dead loops (see the design notes at the top of this file).
 * NOTE(review): lines are missing from this extraction (labels, braces
 * and some statements are not visible); control flow is indicative only.
 */
681 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
683 struct ip_tunnel *tunnel = netdev_priv(dev);
684 struct net_device_stats *stats = &tunnel->stat;
685 struct iphdr *old_iph = ip_hdr(skb);
689 struct rtable *rt; /* Route to the other host */
690 struct net_device *tdev; /* Device to other host */
691 struct iphdr *iph; /* Our new IP header */
692 unsigned int max_headroom; /* The extra header space needed */
/* Re-entered from our own output path: drop to break the dead loop. */
697 if (tunnel->recursion++) {
698 tunnel->stat.collisions++;
702 if (dev->header_ops) {
/* Broadcast mode: the outer header was already built by ipgre_header(). */
704 tiph = (struct iphdr*)skb->data;
706 gre_hlen = tunnel->hlen;
707 tiph = &tunnel->parms.iph;
/* NBMA tunnel (no fixed daddr): derive the destination from the route. */
710 if ((dst = tiph->daddr) == 0) {
713 if (skb->dst == NULL) {
714 tunnel->stat.tx_fifo_errors++;
718 if (skb->protocol == htons(ETH_P_IP)) {
720 if ((dst = rt->rt_gateway) == 0)
724 else if (skb->protocol == htons(ETH_P_IPV6)) {
725 struct in6_addr *addr6;
727 struct neighbour *neigh = skb->dst->neighbour;
732 addr6 = (struct in6_addr*)&neigh->primary_key;
733 addr_type = ipv6_addr_type(addr6);
735 if (addr_type == IPV6_ADDR_ANY) {
736 addr6 = &ipv6_hdr(skb)->daddr;
737 addr_type = ipv6_addr_type(addr6);
/* Only v4-compatible IPv6 addresses embed a usable IPv4 target. */
740 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
743 dst = addr6->s6_addr32[3];
752 if (skb->protocol == htons(ETH_P_IP))
/* Route the encapsulated packet towards the tunnel endpoint. */
758 struct flowi fl = { .oif = tunnel->parms.link,
761 .saddr = tiph->saddr,
762 .tos = RT_TOS(tos) } },
763 .proto = IPPROTO_GRE };
764 if (ip_route_output_key(&init_net, &rt, &fl)) {
765 tunnel->stat.tx_carrier_errors++;
769 tdev = rt->u.dst.dev;
/* Routing back through this very device would loop. */
773 tunnel->stat.collisions++;
/* Path MTU accounting: outer path MTU minus our header overhead. */
779 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
781 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
784 skb->dst->ops->update_pmtu(skb->dst, mtu);
/* IPv4 payload: honour DF and report FRAG_NEEDED when oversized. */
786 if (skb->protocol == htons(ETH_P_IP)) {
787 df |= (old_iph->frag_off&htons(IP_DF));
789 if ((old_iph->frag_off&htons(IP_DF)) &&
790 mtu < ntohs(old_iph->tot_len)) {
791 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
797 else if (skb->protocol == htons(ETH_P_IPV6)) {
798 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
800 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
801 if ((tunnel->parms.iph.daddr &&
802 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
803 rt6->rt6i_dst.plen == 128) {
804 rt6->rt6i_flags |= RTF_MODIFIED;
805 skb->dst->metrics[RTAX_MTU-1] = mtu;
809 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
810 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
/* Pending error state recorded by ipgre_err(): rate-limited reporting. */
817 if (tunnel->err_count > 0) {
818 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
821 dst_link_failure(skb);
823 tunnel->err_count = 0;
/* Ensure headroom for the outer headers; reallocate when needed. */
826 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
828 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
829 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
830 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
839 skb_set_owner_w(new_skb, skb->sk);
842 old_iph = ip_hdr(skb);
/* Push the outer header space and reset skb bookkeeping. */
845 skb->transport_header = skb->network_header;
846 skb_push(skb, gre_hlen);
847 skb_reset_network_header(skb);
848 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
849 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
851 dst_release(skb->dst);
852 skb->dst = &rt->u.dst;
855 * Push down and install the IPIP header.
/* Build the outer IPv4 header from the route and tunnel config. */
860 iph->ihl = sizeof(struct iphdr) >> 2;
862 iph->protocol = IPPROTO_GRE;
863 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
864 iph->daddr = rt->rt_dst;
865 iph->saddr = rt->rt_src;
/* Configured TTL of 0 means inherit from the inner packet (or route). */
867 if ((iph->ttl = tiph->ttl) == 0) {
868 if (skb->protocol == htons(ETH_P_IP))
869 iph->ttl = old_iph->ttl;
871 else if (skb->protocol == htons(ETH_P_IPV6))
872 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
875 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
/* GRE base header right behind the IP header: flags, inner protocol. */
878 ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
879 ((__be16*)(iph+1))[1] = skb->protocol;
/* Optional fields are filled backwards from the end of the GRE header. */
881 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
882 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
884 if (tunnel->parms.o_flags&GRE_SEQ) {
886 *ptr = htonl(tunnel->o_seqno);
889 if (tunnel->parms.o_flags&GRE_KEY) {
890 *ptr = tunnel->parms.o_key;
893 if (tunnel->parms.o_flags&GRE_CSUM) {
895 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
906 dst_link_failure(skb);
/*
 * Derive hard_header_len, mtu and the precalculated GRE header length
 * (tunnel->hlen) from the underlying device this tunnel routes through.
 */
915 static void ipgre_tunnel_bind_dev(struct net_device *dev)
917 struct net_device *tdev = NULL;
918 struct ip_tunnel *tunnel;
/* Conservative defaults if no underlying device can be determined. */
920 int hlen = LL_MAX_HEADER;
921 int mtu = ETH_DATA_LEN;
922 int addend = sizeof(struct iphdr) + 4;
924 tunnel = netdev_priv(dev);
925 iph = &tunnel->parms.iph;
927 /* Guess output device to choose reasonable mtu and hard_header_len */
930 struct flowi fl = { .oif = tunnel->parms.link,
932 { .daddr = iph->daddr,
934 .tos = RT_TOS(iph->tos) } },
935 .proto = IPPROTO_GRE };
937 if (!ip_route_output_key(&init_net, &rt, &fl)) {
938 tdev = rt->u.dst.dev;
941 dev->flags |= IFF_POINTOPOINT;
/* No route: fall back to the explicitly bound link, if any. */
944 if (!tdev && tunnel->parms.link)
945 tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
948 hlen = tdev->hard_header_len;
951 dev->iflink = tunnel->parms.link;
953 /* Precalculate GRE options length */
/* Each optional field (csum/key/seq) adds 4 bytes to the GRE header. */
954 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
955 if (tunnel->parms.o_flags&GRE_CSUM)
957 if (tunnel->parms.o_flags&GRE_KEY)
959 if (tunnel->parms.o_flags&GRE_SEQ)
962 dev->hard_header_len = hlen + addend;
963 dev->mtu = mtu - addend;
964 tunnel->hlen = addend;
/*
 * ioctl handler for SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL.  NOTE(review): the return-type line and the switch
 * statement framing are not visible in this extraction; lines are
 * missing throughout.
 */
969 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
972 struct ip_tunnel_parm p;
974 struct net *net = dev_net(dev);
975 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* GET: on the fallback device, look the tunnel up by user parms. */
980 if (dev == ign->fb_tunnel_dev) {
981 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
985 t = ipgre_tunnel_locate(net, &p, 0);
988 t = netdev_priv(dev);
989 memcpy(&p, &t->parms, sizeof(p));
990 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
/* ADD/CHG require CAP_NET_ADMIN. */
997 if (!capable(CAP_NET_ADMIN))
1001 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
/* Sanity-check the user-supplied outer-header template and GRE flags. */
1005 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1006 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1007 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1010 p.iph.frag_off |= htons(IP_DF);
/* Keys are only meaningful when the corresponding GRE_KEY flag is set. */
1012 if (!(p.i_flags&GRE_KEY))
1014 if (!(p.o_flags&GRE_KEY))
1017 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1019 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1021 if (t->dev != dev) {
1028 t = netdev_priv(dev);
1030 if (ipv4_is_multicast(p.iph.daddr))
1031 nflags = IFF_BROADCAST;
1032 else if (p.iph.daddr)
1033 nflags = IFF_POINTOPOINT;
/* Cannot switch a live device between p2p and broadcast semantics. */
1035 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
/* Re-hash the tunnel under its new addresses and keys. */
1039 ipgre_tunnel_unlink(ign, t);
1040 t->parms.iph.saddr = p.iph.saddr;
1041 t->parms.iph.daddr = p.iph.daddr;
1042 t->parms.i_key = p.i_key;
1043 t->parms.o_key = p.o_key;
1044 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1045 memcpy(dev->broadcast, &p.iph.daddr, 4);
1046 ipgre_tunnel_link(ign, t);
1047 netdev_state_change(dev);
/* CHG: update the mutable parameters; rebind if the link changed. */
1053 if (cmd == SIOCCHGTUNNEL) {
1054 t->parms.iph.ttl = p.iph.ttl;
1055 t->parms.iph.tos = p.iph.tos;
1056 t->parms.iph.frag_off = p.iph.frag_off;
1057 if (t->parms.link != p.link) {
1058 t->parms.link = p.link;
1059 ipgre_tunnel_bind_dev(dev);
1060 netdev_state_change(dev);
1063 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1066 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
/* DEL requires CAP_NET_ADMIN; the fallback device cannot be deleted. */
1071 if (!capable(CAP_NET_ADMIN))
1074 if (dev == ign->fb_tunnel_dev) {
1076 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1079 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1082 if (t == netdev_priv(ign->fb_tunnel_dev))
1086 unregister_netdevice(dev);
/* Return the tunnel's private stats block for the netdev core. */
1098 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1100 return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
/* Validate a new MTU: at least 68, leaving room for our header overhead. */
1103 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1105 struct ip_tunnel *tunnel = netdev_priv(dev);
1106 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
1112 /* Nice toy. Unfortunately, useless in real life :-)
1113 It allows to construct virtual multiprotocol broadcast "LAN"
1114 over the Internet, provided multicast routing is tuned.
1117 I have no idea whether this bicycle was invented before me,
1118 so that I had to set ARPHRD_IPGRE to a random value.
1119 I have an impression, that Cisco could make something similar,
1120 but this feature is apparently missing in IOS<=11.2(8).
1122 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1123 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1125 ping -t 255 224.66.66.66
1127 If nobody answers, mbone does not work.
1129 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1130 ip addr add 10.66.66.<somewhat>/24 dev Universe
1131 ifconfig Universe up
1132 ifconfig Universe add fe80::<Your_real_addr>/10
1133 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1136 ftp fec0:6666:6666::193.233.7.65
/*
 * header_ops.create: pre-build the outer IP+GRE header in front of the
 * payload, copying the tunnel's header template and filling in the
 * requested source/destination IPv4 addresses.
 */
1141 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1142 unsigned short type,
1143 const void *daddr, const void *saddr, unsigned len)
1145 struct ip_tunnel *t = netdev_priv(dev);
1146 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1147 __be16 *p = (__be16*)(iph+1);
1149 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1150 p[0] = t->parms.o_flags;
1154 * Set the source hardware address.
1158 memcpy(&iph->saddr, saddr, 4);
1161 memcpy(&iph->daddr, daddr, 4);
1164 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
/* header_ops.parse: a GRE device's "hardware address" is the outer
 * IPv4 source address (4 bytes). */
1170 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1172 struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1173 memcpy(haddr, &iph->saddr, 4);
/* Link-layer header operations for broadcast-mode GRE devices. */
1177 static const struct header_ops ipgre_header_ops = {
1178 .create = ipgre_header,
1179 .parse = ipgre_header_parse,
1182 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * Open hook for multicast ("broadcast LAN") tunnels: resolve the route
 * to the multicast destination and join the group on that interface,
 * remembering it in t->mlink for ipgre_close().
 */
1183 static int ipgre_open(struct net_device *dev)
1185 struct ip_tunnel *t = netdev_priv(dev);
1187 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1188 struct flowi fl = { .oif = t->parms.link,
1190 { .daddr = t->parms.iph.daddr,
1191 .saddr = t->parms.iph.saddr,
1192 .tos = RT_TOS(t->parms.iph.tos) } },
1193 .proto = IPPROTO_GRE };
1195 if (ip_route_output_key(&init_net, &rt, &fl))
1196 return -EADDRNOTAVAIL;
/* From here on, dev is the underlying output device, not the tunnel. */
1197 dev = rt->u.dst.dev;
1199 if (__in_dev_get_rtnl(dev) == NULL)
1200 return -EADDRNOTAVAIL;
1201 t->mlink = dev->ifindex;
1202 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/* Close hook: leave the multicast group joined in ipgre_open(). */
1207 static int ipgre_close(struct net_device *dev)
1209 struct ip_tunnel *t = netdev_priv(dev);
1210 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1211 struct in_device *in_dev;
1212 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1214 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/* Install netdev callbacks and link-level defaults for a GRE device. */
1223 static void ipgre_tunnel_setup(struct net_device *dev)
1225 dev->uninit = ipgre_tunnel_uninit;
1226 dev->destructor = free_netdev;
1227 dev->hard_start_xmit = ipgre_tunnel_xmit;
1228 dev->get_stats = ipgre_tunnel_get_stats;
1229 dev->do_ioctl = ipgre_tunnel_ioctl;
1230 dev->change_mtu = ipgre_tunnel_change_mtu;
1232 dev->type = ARPHRD_IPGRE;
/* Defaults assume a plain 4-byte GRE header over an IPv4 outer header;
 * ipgre_tunnel_bind_dev() refines these once the tunnel is configured. */
1233 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1234 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1235 dev->flags = IFF_NOARP;
/*
 * Per-device init: copy configuration into the device state, bind to
 * the underlying device, and select header_ops as appropriate.
 */
1240 static int ipgre_tunnel_init(struct net_device *dev)
1242 struct ip_tunnel *tunnel;
1245 tunnel = netdev_priv(dev);
1246 iph = &tunnel->parms.iph;
1249 strcpy(tunnel->parms.name, dev->name);
/* Expose the outer addresses as the device's dev_addr/broadcast. */
1251 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1252 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1254 ipgre_tunnel_bind_dev(dev);
1257 #ifdef CONFIG_NET_IPGRE_BROADCAST
1258 if (ipv4_is_multicast(iph->daddr)) {
/* Multicast destination: behave like a broadcast LAN segment. */
1261 dev->flags = IFF_BROADCAST;
1262 dev->header_ops = &ipgre_header_ops;
1263 dev->open = ipgre_open;
1264 dev->stop = ipgre_close;
1268 dev->header_ops = &ipgre_header_ops;
/*
 * Init for the fallback "gre0" device: a keyless, wildcard tunnel that
 * catches otherwise-unmatched GRE packets (see ipgre_tunnel_lookup).
 */
1273 static int ipgre_fb_tunnel_init(struct net_device *dev)
1275 struct ip_tunnel *tunnel = netdev_priv(dev);
1276 struct iphdr *iph = &tunnel->parms.iph;
1279 strcpy(tunnel->parms.name, dev->name);
1282 iph->protocol = IPPROTO_GRE;
1284 tunnel->hlen = sizeof(struct iphdr) + 4;
/* Linked directly at the head of the wildcard chain. */
1287 tunnels_wc[0] = tunnel;
/* Protocol hooks registered with the IPv4 stack for IPPROTO_GRE. */
1292 static struct net_protocol ipgre_protocol = {
1293 .handler = ipgre_rcv,
1294 .err_handler = ipgre_err,
/*
 * Per-namespace setup: allocate struct ipgre_net, attach it via
 * net_assign_generic(), then create and register the fallback device.
 * NOTE(review): error-unwind labels are missing from this extraction.
 */
1297 static int ipgre_init_net(struct net *net)
1300 struct ipgre_net *ign;
1303 ign = kmalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1307 err = net_assign_generic(net, ipgre_net_id, ign);
1311 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1312 ipgre_tunnel_setup);
1313 if (!ign->fb_tunnel_dev) {
1318 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1319 dev_net_set(ign->fb_tunnel_dev, net);
1321 if ((err = register_netdev(ign->fb_tunnel_dev)))
/* Error unwinding: free the device allocated above. */
1327 free_netdev(ign->fb_tunnel_dev);
/* Per-namespace teardown: unregister the fallback device.  init_net is
 * skipped here; its devices are torn down on module exit instead. */
1336 static void ipgre_exit_net(struct net *net)
1338 struct ipgre_net *ign;
1340 ign = net_generic(net, ipgre_net_id);
1342 if (net != &init_net)
1343 unregister_netdevice(ign->fb_tunnel_dev);
1348 static struct pernet_operations ipgre_net_ops = {
1349 .init = ipgre_init_net,
1350 .exit = ipgre_exit_net,
1354 * And now the modules code and kernel interface.
/* Module init: register the GRE protocol handler, then the pernet ops. */
1357 static int __init ipgre_init(void)
1361 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1363 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1364 printk(KERN_INFO "ipgre init: can't add protocol\n");
/* Roll back the protocol registration if pernet registration fails. */
1368 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1370 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
/* Unregister every tunnel device across all four hash classes. */
1375 static void __exit ipgre_destroy_tunnels(void)
1379 for (prio = 0; prio < 4; prio++) {
1381 for (h = 0; h < HASH_SIZE; h++) {
1382 struct ip_tunnel *t;
/* Unregistering unlinks t from the chain via the uninit hook, so
 * re-reading the chain head walks the whole list. */
1383 while ((t = tunnels[prio][h]) != NULL)
1384 unregister_netdevice(t->dev);
/* Module exit: remove protocol hook, destroy tunnels, drop pernet ops. */
1389 static void __exit ipgre_fini(void)
1391 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1392 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1395 ipgre_destroy_tunnels();
1398 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1401 module_init(ipgre_init);
1402 module_exit(ipgre_fini);
1403 MODULE_LICENSE("GPL");