/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
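
/*
 * Editor's illustration (not part of the original file; kept under
 * "#if 0" so the file still builds — compile the body standalone to
 * run it): how RT_FL_TOS() folds the routing-relevant TOS bits and the
 * RTO_ONLINK flag into one lookup key.  The numeric values below are
 * local stand-ins for the header constants of this era and are
 * assumptions of the sketch.
 */
#if 0
#include <stdio.h>

#define SK_IPTOS_RT_MASK 0x1C	/* stand-in for IPTOS_RT_MASK */
#define SK_RTO_ONLINK	 0x01	/* stand-in for RTO_ONLINK    */

int main(void)
{
	unsigned tos = 0xB8;	/* an arbitrary raw TOS byte */

	/* Same masking as RT_FL_TOS(): TOS bits 2..4 plus the on-link bit. */
	printf("key = 0x%02x\n", tos & (SK_IPTOS_RT_MASK | SK_RTO_ONLINK));
	return 0;
}
#endif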
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};
#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
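
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0" so it is never built): ip_tos2prio[] is indexed by the four
 * TOS bits shifted down by one; the rt_tos2priority() helper in
 * <net/route.h> performs this lookup.  The 0x1E mask below mirrors
 * IPTOS_TOS_MASK.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned char tos = 0x10;		/* "minimize delay"   */
	unsigned idx = (tos & 0x1E) >> 1;	/* IPTOS_TOS(tos) >> 1 */

	/* idx is 8 here, i.e. the TC_PRIO_INTERACTIVE slot above. */
	printf("ip_tos2prio index for tos 0x%02x is %u\n", tos, idx);
	return 0;
}
#endif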
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ	4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ	2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ	1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ	512
#else
#define RT_HASH_LOCK_SZ	256
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
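
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): many hash buckets share one spinlock.  The bucket number is
 * folded into the lock table with a mask, exactly as in
 * rt_hash_lock_addr() above, so buckets i and i + RT_HASH_LOCK_SZ map
 * to the same lock.
 */
#if 0
#include <stdio.h>

#define SK_RT_HASH_LOCK_SZ 256	/* the smallest table size above */

static unsigned lock_slot(unsigned bucket)
{
	return bucket & (SK_RT_HASH_LOCK_SZ - 1);
}

int main(void)
{
	/* Both print 5: the two buckets contend on one spinlock. */
	printf("%u %u\n", lock_slot(5), lock_slot(5 + SK_RT_HASH_LOCK_SZ));
	return 0;
}
#endif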
static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) (__get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
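
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): a userspace stand-in for rt_hash_code().  The real code
 * uses jhash_3words(); the toy mixer below is an assumption of the
 * sketch, kept only to show the (hash & rt_hash_mask) bucket selection
 * and the per-boot rt_hash_rnd salt that rt_run_flush() rerolls.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint32_t toy_mix(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	/* NOT jhash_3words(); just enough mixing for a demo. */
	return (a * 2654435761u) ^ (b * 40503u) ^ (c + seed);
}

int main(void)
{
	uint32_t rnd  = 0xdeadbeef;	/* stands in for rt_hash_rnd  */
	uint32_t mask = (1u << 10) - 1;	/* stands in for rt_hash_mask */
	uint32_t daddr = 0x0a000001, saddr = 0x0a000002, tos = 0x10;

	printf("bucket = %u\n",
	       (unsigned)(toy_mix(daddr, saddr, tos, rnd) & mask));
	return 0;
}
#endif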
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc)
		goto out_kfree;
	seq          = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */
static __inline__ void rt_free(struct rtable *rt)
{
	multipath_remove(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	multipath_remove(rt);
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
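
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): why "~age" works as a score.  Older entries have a larger
 * (jiffies - lastuse), so after negation they compare lower; bits 31
 * and 30 then lift valuable and output/unicast routes above everything
 * else.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t young = ~(uint32_t)10   & ~(3u << 30);	/* used 10 ticks ago   */
	uint32_t old   = ~(uint32_t)1000 & ~(3u << 30);	/* used 1000 ticks ago */

	printf("young=%u old=%u young_wins=%d\n",
	       (unsigned)young, (unsigned)old, young > old);
	return 0;
}
#endif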
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
	       fl1->oif == fl2->oif &&
	       fl1->iif == fl2->iif;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	int passedexpired = 0;
	struct rtable **nextstep = NULL;
	struct rtable **rthp = chain_head;
	struct rtable *rth;

	if (removed_count)
		*removed_count = 0;

	while ((rth = *rthp) != NULL) {
		if (rth == expentry)
			passedexpired = 1;

		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
			if (*rthp == expentry) {
				*rthp = rth->u.rt_next;
				continue;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
				if (removed_count)
					++(*removed_count);
			}
		} else {
			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
			    passedexpired && !nextstep)
				nextstep = &rth->u.rt_next;

			rthp = &rth->u.rt_next;
		}
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		spin_lock(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
			/* remove all related balanced entries if necessary */
			if (rth->u.dst.flags & DST_BALANCED) {
				rthp = rt_remove_balanced_route(
						&rt_hash_table[i].chain,
						rth, NULL);
				if (!rthp)
					break;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
			}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			*rthp = rth->u.rt_next;
			rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
		}
		spin_unlock(rt_hash_lock_addr(i));

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
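
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): the per-tick scan quota computed at the top of
 * rt_check_expire().  With the defaults above (gc_interval 60*HZ,
 * gc_timeout 300*HZ) every bucket is visited once per
 * gc_timeout/gc_interval = 5 timer ticks.  The HZ and rt_hash_log
 * values below are assumptions of the sketch.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned hz = 100;		/* assumed HZ              */
	unsigned rt_hash_log = 17;	/* assumed: 131072 buckets */
	uint64_t mult = ((uint64_t)(60 * hz)) << rt_hash_log;
	unsigned goal = (unsigned)(mult / (300 * hz));	/* the do_div() above */

	printf("scan %u of %u buckets per tick\n", goal, 1u << rt_hash_log);
	return 0;
}
#endif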
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}
static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	/* flush existing multipath state */
	multipath_flush();

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolong timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}
static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}
/*
   Short description of GC goals.

   We want an algorithm which will keep the routing cache at some
   equilibrium point, where the number of aged-off entries is kept
   approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle,
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
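
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): the "goal" computed at the top of rt_garbage_collect().
 * Normally the cache is trimmed back towards gc_elasticity entries per
 * bucket; only when that limit is not yet exceeded does the code fall
 * back to the gc_thresh-based equilibrium.  The entry counts below are
 * assumptions of the sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int entries = 300000;		/* assumed cache size     */
	int rt_hash_log = 15;		/* assumed: 32768 buckets */
	int ip_rt_gc_elasticity = 8;

	int goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	printf("entries above the elastic limit: %d\n", goal);	/* 37856 */
	return 0;
}
#endif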
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
				/* remove all related balanced entries
				 * if necessary
				 */
				if (rth->u.dst.flags & DST_BALANCED) {
					int r;

					rthp = rt_remove_balanced_route(
						&rt_hash_table[k].chain,
						rth,
						&r);
					goal -= r;
					if (!rthp)
						break;
				} else {
					*rthp = rth->u.rt_next;
					rt_free(rth);
					goal--;
				}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
				*rthp = rth->u.rt_next;
				rt_free(rth);
				goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
				if (goal <= 0)
					break;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
		if (!(rth->u.dst.flags & DST_BALANCED) &&
		    compare_keys(&rth->fl, &rt->fl)) {
#else
		if (compare_keys(&rth->fl, &rt->fl)) {
#endif
			/* Put it first */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
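
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): the publish order rt_intern_hash() relies on.  A moved
 * entry must have its next pointer written *before* the bucket head is
 * redirected to it, because lockless readers may walk the chain at any
 * moment; rcu_assign_pointer() adds the write barrier that the plain
 * stores in this sketch deliberately lack.
 */
#if 0
struct node { struct node *next; int key; };

static void publish(struct node **head, struct node *n)
{
	n->next = *head;  /* step 1: finish the node itself            */
	/* a real lockless list needs a write barrier here             */
	*head = n;        /* step 2: only then make the node reachable */
}

int main(void)
{
	static struct node a = { .next = 0, .key = 1 };
	static struct node b = { .next = 0, .key = 2 };
	struct node *head = &a;

	publish(&head, &b);
	return head->key == 2 ? 0 : 1;
}
#endif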
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
						     rt->fl.fl4_src ^
							(rt->fl.oif << 5),
						     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
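
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): the backoff schedule implied by the tunables above.
 * Redirect number k is allowed only ip_rt_redirect_load << k jiffies
 * after the previous one, and after ip_rt_redirect_number redirects we
 * stay silent for ip_rt_redirect_silence jiffies.  HZ=100 is an
 * assumption of the sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned hz = 100, load = hz / 50;	/* ip_rt_redirect_load   */
	unsigned k;

	for (k = 0; k < 9; k++)			/* ip_rt_redirect_number */
		printf("redirect %u: wait %u jiffies\n", k, load << k);
	printf("then silent for %u jiffies\n", (hz / 50) << (9 + 1));
	return 0;
}
#endif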
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
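
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): the token bucket used by ip_error().  Tokens accrue one per
 * jiffy up to ip_rt_error_burst, and each ICMP error costs
 * ip_rt_error_cost, i.e. with the defaults a sustained rate of one
 * error per second with bursts of up to five.  HZ and the probe pattern
 * below are assumptions of the sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned hz = 100, burst = 5 * hz, cost = hz;
	unsigned tokens = 0, last = 0, sent = 0, now;

	for (now = 0; now < 10 * hz; now += hz / 4) {	/* 4 attempts/second */
		tokens += now - last;
		if (tokens > burst)
			tokens = burst;
		last = now;
		if (tokens >= cost) {
			tokens -= cost;
			sent++;
		}
	}
	printf("sent %u ICMP errors in 10 seconds\n", sent);
	return 0;
}
#endif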
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
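
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): guess_mtu() walks the RFC 1191 plateau table and returns
 * the first plateau strictly below the old MTU, so a 1500-byte path
 * first collapses to 1492, then to 576, and so on, down to the 68-byte
 * floor used above.
 */
#if 0
#include <stdio.h>

static const unsigned short plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128};

static unsigned short guess(unsigned short old_mtu)
{
	unsigned i;

	for (i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
		if (old_mtu > plateau[i])
			return plateau[i];
	return 68;	/* minimal IPv4 MTU, as in guess_mtu() above */
}

int main(void)
{
	printf("%u %u\n", guess(1500), guess(1492));	/* 1492 576 */
	return 0;
}
#endif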
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {
					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != &loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned.
 */
void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
						       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
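
/*
 * Editor's illustration (not part of the original file; disabled with
 * "#if 0"): the ADVMSS defaulting done in rt_set_nexthop().  When a
 * route carries no explicit metric, the advertised MSS is derived from
 * the device MTU minus 40 bytes of IPv4+TCP headers and clamped to
 * [ip_rt_min_advmss, 65535 - 40].  The MTU below is an assumption of
 * the sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned mtu = 1500, ip_rt_min_advmss = 256;
	unsigned advmss = mtu - 40;		/* 1460 */

	if (advmss < ip_rt_min_advmss)
		advmss = ip_rt_min_advmss;
	if (advmss > 65535 - 40)
		advmss = 65535 - 40;
	printf("default advmss for mtu %u: %u\n", mtu, advmss);
	return 0;
}
#endif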
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     u32 daddr,
				     u32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb->mac.raw) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  u32 daddr, u32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable* rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 spec_dst, itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
static inline int ip_mkroute_input_def(struct sk_buff *skb,
				       struct fib_result* res,
				       const struct flowi *fl,
				       struct in_device *in_dev,
				       u32 daddr, u32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL, *rtres;
	unsigned char hop, hopcount;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* put reference to previous result */
		if (hop)
			ip_rt_put(rtres);

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
		err = rt_intern_hash(hash, rth, &rtres);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));
	}
	skb->dst = &rtres->u.dst;
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped-back packet
 *	must have a correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network may acquire a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided a software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == 0xFFFFFFFF)
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi) {
		rth->rt_multipath_alg = res->fi->fib_mp_alg;
		if (res->fi->fib_nhs > 1)
			rth->u.dst.flags |= DST_BALANCED;
	}
#endif
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
static inline int ip_mkroute_output_def(struct rtable **rp,
					struct fib_result* res,
					const struct flowi *fl,
					const struct flowi *oldflp,
					struct net_device *dev_out,
					unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		u32 tos = RT_FL_TOS(oldflp);

		hash = rt_hash_code(oldflp->fl4_dst,
				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
		err = rt_intern_hash(hash, rth, rp);
	}
	return err;
}
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = oldflp->fl4_fwmark
#endif
				      } },
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
		      assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {

			/* check for multipath routes and choose one if
			 * necessary
			 */
			if (multipath_select_route(flp, rth, rp)) {
				dst_hold(&(*rp)->u.dst);
				RT_CACHE_STAT_INC(out_hit);
				rcu_read_unlock_bh();
				return 0;
			}

			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);
2607 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2611 if ((err = __ip_route_output_key(rp, flp)) != 0)
2616 flp->fl4_src = (*rp)->rt_src;
2618 flp->fl4_dst = (*rp)->rt_dst;
2619 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2625 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2627 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2628 {
2629 return ip_route_output_flow(rp, flp, NULL, 0);
2630 }
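/*
 * rt_fill_info() below renders one route as an RTM_NEWROUTE netlink
 * message: an rtmsg header followed by RTA_DST, RTA_SRC, RTA_OIF,
 * RTA_GATEWAY, RTA_CACHEINFO and friends.  It serves both single
 * replies (inet_rtm_getroute) and full dumps (ip_rt_dump).
 */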
2632 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2633 int nowait, unsigned int flags)
2634 {
2635 struct rtable *rt = (struct rtable*)skb->dst;
2636 struct rtmsg *r;
2637 struct nlmsghdr *nlh;
2638 unsigned char *b = skb->tail;
2639 struct rta_cacheinfo ci;
2640 #ifdef CONFIG_IP_MROUTE
2641 struct rtattr *eptr;
2642 #endif
2643 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2644 r = NLMSG_DATA(nlh);
2645 r->rtm_family = AF_INET;
2646 r->rtm_dst_len = 32;
2647 r->rtm_src_len = 0;
2648 r->rtm_tos = rt->fl.fl4_tos;
2649 r->rtm_table = RT_TABLE_MAIN;
2650 r->rtm_type = rt->rt_type;
2651 r->rtm_scope = RT_SCOPE_UNIVERSE;
2652 r->rtm_protocol = RTPROT_UNSPEC;
2653 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2654 if (rt->rt_flags & RTCF_NOTIFY)
2655 r->rtm_flags |= RTM_F_NOTIFY;
2656 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2657 if (rt->fl.fl4_src) {
2658 r->rtm_src_len = 32;
2659 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2660 }
2661 if (rt->u.dst.dev)
2662 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2663 #ifdef CONFIG_NET_CLS_ROUTE
2664 if (rt->u.dst.tclassid)
2665 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2666 #endif
2667 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2668 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2669 __u32 alg = rt->rt_multipath_alg;
2671 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2672 }
2673 #endif
2674 if (rt->fl.iif)
2675 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2676 else if (rt->rt_src != rt->fl.fl4_src)
2677 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2678 if (rt->rt_dst != rt->rt_gateway)
2679 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2680 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2681 goto rtattr_failure;
2682 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2683 ci.rta_used = rt->u.dst.__use;
2684 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2685 if (rt->u.dst.expires)
2686 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2687 else
2688 ci.rta_expires = 0;
2689 ci.rta_error = rt->u.dst.error;
2690 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2691 if (rt->peer) {
2692 ci.rta_id = rt->peer->ip_id_count;
2693 if (rt->peer->tcp_ts_stamp) {
2694 ci.rta_ts = rt->peer->tcp_ts;
2695 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2696 }
2697 }
2698 #ifdef CONFIG_IP_MROUTE
2699 eptr = (struct rtattr*)skb->tail;
2700 #endif
2701 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2702 if (rt->fl.iif) {
2703 #ifdef CONFIG_IP_MROUTE
2704 u32 dst = rt->rt_dst;
2706 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2707 ipv4_devconf.mc_forwarding) {
2708 int err = ipmr_get_route(skb, r, nowait);
2715 if (err == -EMSGSIZE)
2716 goto nlmsg_failure;
2717 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2718 }
2719 }
2720 } else
2721 #endif
2722 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2723 }
2725 nlh->nlmsg_len = skb->tail - b;
2726 return skb->len;
2728 nlmsg_failure:
2729 rtattr_failure:
2730 skb_trim(skb, b - skb->data);
2731 return -1;
2732 }
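/*
 * inet_rtm_getroute() services RTM_GETROUTE requests such as those
 * issued by "ip route get": it fakes up an skb, resolves it through
 * ip_route_input() (when RTA_IIF is present) or ip_route_output_key()
 * otherwise, and unicasts the rt_fill_info() rendering back to the
 * requester.
 */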
2734 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2735 {
2736 struct rtattr **rta = arg;
2737 struct rtmsg *rtm = NLMSG_DATA(nlh);
2738 struct rtable *rt = NULL;
2739 u32 dst = 0;
2740 u32 src = 0;
2741 int iif;
2742 int err;
2743 struct sk_buff *skb;
2745 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2746 if (!skb)
2747 return -ENOBUFS;
2749 /* Reserve room for dummy headers; this skb can pass
2750 through a good chunk of the routing engine.
2751 */
2752 skb->mac.raw = skb->data;
2753 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
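/* Note (added): the headroom reserved above lets code deeper in the
 * routing engine push or inspect link and IP headers without having to
 * reallocate this synthetic skb. */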
2755 if (rta[RTA_SRC - 1])
2756 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757 if (rta[RTA_DST - 1])
2758 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759 if (rta[RTA_IIF - 1])
2760 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2762 if (iif) {
2763 struct net_device *dev = __dev_get_by_index(iif);
2764 err = -ENODEV;
2765 if (!dev)
2766 goto out_free;
2767 skb->protocol = htons(ETH_P_IP);
2768 skb->dev = dev;
2769 local_bh_disable();
2770 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2771 local_bh_enable();
2772 rt = (struct rtable*)skb->dst;
2773 if (!err && rt->u.dst.error)
2774 err = -rt->u.dst.error;
2775 } else {
2776 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2777 .saddr = src,
2778 .tos = rtm->rtm_tos } } };
2779 int oif = 0;
2780 if (rta[RTA_OIF - 1])
2781 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2782 fl.oif = oif;
2783 err = ip_route_output_key(&rt, &fl);
2784 }
2786 if (err)
2787 goto out_free;
2788 skb->dst = &rt->u.dst;
2789 if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 rt->rt_flags |= RTCF_NOTIFY;
2792 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2794 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2795 RTM_NEWROUTE, 0, 0);
2803 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2804 if (err > 0)
2805 err = 0;
2806 out:
2807 return err;
2809 out_free:
2810 kfree_skb(skb);
2811 goto out;
2812 }
2813 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2814 {
2815 struct rtable *rt;
2816 int h, s_h;
2817 int idx, s_idx;
2819 s_h = cb->args[0];
2820 s_idx = idx = cb->args[1];
2821 for (h = 0; h <= rt_hash_mask; h++) {
2822 if (h < s_h) continue;
2823 if (h > s_h)
2824 s_idx = 0;
2825 rcu_read_lock_bh();
2826 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 rt = rcu_dereference(rt->u.rt_next), idx++) {
2828 if (idx < s_idx)
2829 continue;
2830 skb->dst = dst_clone(&rt->u.dst);
2831 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 1, NLM_F_MULTI) <= 0) {
2834 dst_release(xchg(&skb->dst, NULL));
2835 rcu_read_unlock_bh();
2836 goto done;
2837 }
2838 dst_release(xchg(&skb->dst, NULL));
2840 rcu_read_unlock_bh();
2841 }
2843 done:
2844 cb->args[0] = h;
2845 cb->args[1] = idx;
2846 return skb->len;
2847 }
2849 void ip_rt_multicast_event(struct in_device *in_dev)
2850 {
2851 rt_cache_flush(0);
2852 }
2854 #ifdef CONFIG_SYSCTL
2855 static int flush_delay;
2857 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 struct file *filp, void __user *buffer,
2859 size_t *lenp, loff_t *ppos)
2860 {
2861 if (write) {
2862 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 rt_cache_flush(flush_delay);
2864 return 0;
2865 }
2867 return -EINVAL;
2868 }
2870 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871 int __user *name,
2872 int nlen,
2873 void __user *oldval,
2874 size_t __user *oldlenp,
2875 void __user *newval,
2876 size_t newlen,
2877 void **context)
2878 {
2879 int delay;
2880 if (newlen != sizeof(int))
2881 return -EINVAL;
2882 if (get_user(delay, (int __user *)newval))
2883 return -EFAULT;
2884 rt_cache_flush(delay);
2885 return 0;
2886 }
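/*
 * The table below appears under /proc/sys/net/ipv4/route/.  Usage
 * illustration (shell, not part of this file):
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * stores 0 in flush_delay and triggers rt_cache_flush() through the
 * handler above, flushing the routing cache immediately.
 */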
2888 ctl_table ipv4_route_table[] = {
2889 {
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
2894 .mode = 0200,
2895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2897 },
2898 {
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
2906 },
2907 {
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2915 },
2916 {
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = &proc_dointvec,
2923 },
2924 {
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
2930 .proc_handler = &proc_dointvec,
2931 },
2932 {
2933 /* Deprecated. Use gc_min_interval_ms */
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
2942 },
2943 {
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2948 .mode = 0644,
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
2951 },
2952 {
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2957 .mode = 0644,
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
2969 },
2970 {
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2975 .mode = 0644,
2976 .proc_handler = &proc_dointvec,
2977 },
2978 {
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3001 },
3002 {
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec,
3009 },
3010 {
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3015 .mode = 0644,
3016 .proc_handler = &proc_dointvec,
3017 },
3018 {
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3023 .mode = 0644,
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
3026 },
3027 {
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3032 .mode = 0644,
3033 .proc_handler = &proc_dointvec,
3034 },
3035 {
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3040 .mode = 0644,
3041 .proc_handler = &proc_dointvec,
3042 },
3043 {
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3048 .mode = 0644,
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3051 },
3052 { .ctl_name = 0 }
3053 };
3056 #ifdef CONFIG_NET_CLS_ROUTE
3057 struct ip_rt_acct *ip_rt_acct;
3059 /* This code sucks. But you should have seen it before! --RR */
3061 /* IP route accounting ptr for this logical cpu number. */
3062 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
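/*
 * Layout note (added for clarity): ip_rt_acct holds NR_CPUS consecutive
 * blocks of 256 struct ip_rt_acct slots, one slot per realm/tclassid,
 * so cpu i's counters for realm r live at IP_RT_ACCT_CPU(i)[r], i.e.
 * ip_rt_acct[(i) * 256 + r].
 */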
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
3067 {
3068 unsigned int i;
3070 if ((offset & 3) || (length & 3))
3071 return -EIO;
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 *eof = 1;
3075 return 0;
3076 }
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 *eof = 1;
3081 }
3083 offset /= sizeof(u32);
3085 if (length > 0) {
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3089 /* Copy first cpu. */
3090 *start = buffer;
3091 memcpy(dst, src, length);
3093 /* Add the other cpus in, one int at a time */
3094 for_each_cpu(i) {
3095 unsigned int j;
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3099 for (j = 0; j < length/4; j++)
3100 dst[j] += src[j];
3101 }
3102 }
3103 return length;
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3110 {
3111 if (!str)
3112 return 0;
3113 rhash_entries = simple_strtoul(str, &str, 0);
3114 return 1;
3115 }
3116 __setup("rhash_entries=", set_rhash_entries);
3118 int __init ip_rt_init(void)
3119 {
3120 int rc = 0;
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3125 #ifdef CONFIG_NET_CLS_ROUTE
3126 {
3127 int order;
3128 for (order = 0;
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 /* NOTHING */;
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3135 }
3136 #endif
3138 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139 sizeof(struct rtable),
3140 0, SLAB_HWCACHE_ALIGN,
3141 NULL, NULL);
3143 if (!ipv4_dst_ops.kmem_cachep)
3144 panic("IP: failed to allocate ip_dst_cache\n");
3146 rt_hash_table = (struct rt_hash_bucket *)
3147 alloc_large_system_hash("IP route cache",
3148 sizeof(struct rt_hash_bucket),
3149 rhash_entries,
3150 (num_physpages >= 128 * 1024) ?
3151 15 : 17,
3152 HASH_HIGHMEM,
3153 &rt_hash_log,
3154 &rt_hash_mask,
3155 0);
3156 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3157 rt_hash_lock_init();
3159 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3160 ip_rt_max_size = (rt_hash_mask + 1) * 16;
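/* Sizing note (added): garbage collection is considered once the cache
 * exceeds roughly one entry per hash bucket, and the hard cap allows an
 * average chain length of 16. */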
3165 init_timer(&rt_flush_timer);
3166 rt_flush_timer.function = rt_run_flush;
3167 init_timer(&rt_periodic_timer);
3168 rt_periodic_timer.function = rt_check_expire;
3169 init_timer(&rt_secret_timer);
3170 rt_secret_timer.function = rt_secret_rebuild;
3172 /* All the timers started at system startup tend
3173 to synchronize. Perturb them a bit.
3174 */
3175 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3176 ip_rt_gc_interval;
3177 add_timer(&rt_periodic_timer);
3179 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3180 ip_rt_secret_interval;
3181 add_timer(&rt_secret_timer);
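/*
 * Arithmetic note (added): an expiry of jiffies + net_random() % T + T
 * lands uniformly in [T, 2T), so each timer's first firing is spread
 * out instead of coinciding with everything else started at boot.
 */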
3183 #ifdef CONFIG_PROC_FS
3184 {
3185 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3186 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3187 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3188 proc_net_stat))) {
3189 return -ENOMEM;
3190 }
3191 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3192 }
3193 #ifdef CONFIG_NET_CLS_ROUTE
3194 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3195 #endif
3196 #endif
3202 return rc;
3203 }
3204 EXPORT_SYMBOL(__ip_select_ident);
3205 EXPORT_SYMBOL(ip_route_input);
3206 EXPORT_SYMBOL(ip_route_output_key);