2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #include <net/netevent.h>
109 #include <linux/sysctl.h>
/*
 * RT_FL_TOS - routing-relevant TOS bits of a flow key.
 * Masks oldflp->fl4_tos with (IPTOS_RT_MASK | RTO_ONLINK) and casts the
 * result to u32.
 * NOTE(review): the macro argument is expanded unparenthesized
 * (oldflp->fl4_tos), so callers should pass a simple pointer expression.
 */
112 #define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 #define IP_MAX_MTU 0xFFF0
117 #define RT_GC_TIMEOUT (300*HZ)
/*
 * Route cache tunables and their built-in defaults.  Values written in
 * terms of HZ are added to or compared against jiffies elsewhere in this
 * file.  With the defaults below, ip_rt_redirect_silence equals
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1).
 * ip_rt_max_size and rt_deadline carry no initializer here, so they start
 * out as zero.
 */
119 static int ip_rt_min_delay = 2 * HZ;
120 static int ip_rt_max_delay = 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval = 60 * HZ;
124 static int ip_rt_gc_min_interval = HZ / 2;
125 static int ip_rt_redirect_number = 9;
126 static int ip_rt_redirect_load = HZ / 50;
127 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost = HZ;
129 static int ip_rt_error_burst = 5 * HZ;
130 static int ip_rt_gc_elasticity = 8;
131 static int ip_rt_mtu_expires = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu = 512 + 20 + 20;
133 static int ip_rt_min_advmss = 256;
134 static int ip_rt_secret_interval = 10 * 60 * HZ;
135 static unsigned long rt_deadline;
137 #define RTprint(a...) printk(KERN_DEBUG a)
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
144 * Interface to generic destination cache.
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149 static void ipv4_dst_ifdown(struct dst_entry *dst,
150 struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void ipv4_link_failure(struct sk_buff *skb);
153 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
/*
 * dst_ops instance for IPv4 routes.  It wires in this file's garbage
 * collector, validity check, destroy, ifdown, negative-advice, link-failure
 * and PMTU-update handlers, tags the protocol as ETH_P_IP and sizes each
 * cache entry as a struct rtable.
 */
157 static struct dst_ops ipv4_dst_ops = {
159 .protocol = __constant_htons(ETH_P_IP),
160 .gc = rt_garbage_collect,
161 .check = ipv4_dst_check,
162 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
167 .entry_size = sizeof(struct rtable),
170 #define ECN_OR_COST(class) TC_PRIO_##class
172 __u8 ip_tos2prio[16] = {
176 ECN_OR_COST(BESTEFFORT),
182 ECN_OR_COST(INTERACTIVE),
184 ECN_OR_COST(INTERACTIVE),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK),
187 TC_PRIO_INTERACTIVE_BULK,
188 ECN_OR_COST(INTERACTIVE_BULK)
196 /* The locking scheme is rather straightforward:
198 * 1) Read-Copy Update protects the buckets of the central route hash.
199 * 2) Only writers remove entries, and they hold the lock
200 * as they look at rtable reference counts.
201 * 3) Only readers acquire references to rtable entries,
202 * they do so with atomic increments and with the
206 struct rt_hash_bucket {
207 struct rtable *chain;
/*
 * Hashed bucket locks.
 *
 * When CONFIG_SMP, CONFIG_DEBUG_SPINLOCK or CONFIG_PROVE_LOCKING is set:
 *   rt_hash_lock_addr(slot) - address of the spinlock covering @slot, chosen
 *                             by masking slot with (RT_HASH_LOCK_SZ - 1).
 *   rt_hash_lock_init()     - kmalloc()s RT_HASH_LOCK_SZ spinlocks, panics if
 *                             the allocation fails, and initializes each lock.
 * The second pair of definitions makes rt_hash_lock_addr() evaluate to NULL
 * and rt_hash_lock_init() expand to nothing (presumably the #else branch;
 * the directive itself is not visible in this listing).
 *
 * NOTE(review): rt_hash_lock_init() expands to a bare { } block rather than
 * do { } while (0), and rt_hash_lock_addr() expands to an unparenthesized
 * '&' expression.
 */
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 defined(CONFIG_PROVE_LOCKING)
212 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
213 * The size of this table is a power of two and depends on the number of CPUS.
214 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ 256
220 # define RT_HASH_LOCK_SZ 4096
222 # define RT_HASH_LOCK_SZ 2048
224 # define RT_HASH_LOCK_SZ 1024
226 # define RT_HASH_LOCK_SZ 512
228 # define RT_HASH_LOCK_SZ 256
232 static spinlock_t *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234 # define rt_hash_lock_init() { \
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
237 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
238 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
239 spin_lock_init(&rt_hash_locks[i]); \
242 # define rt_hash_lock_addr(slot) NULL
243 # define rt_hash_lock_init()
246 static struct rt_hash_bucket *rt_hash_table;
247 static unsigned rt_hash_mask;
248 static int rt_hash_log;
249 static unsigned int rt_hash_rnd;
/*
 * Per-CPU route cache statistics.  RT_CACHE_STAT_INC(field) post-increments
 * the named field in the current CPU's copy, accessed through
 * __raw_get_cpu_var().
 */
251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
252 #define RT_CACHE_STAT_INC(field) \
253 (__raw_get_cpu_var(rt_cache_stat).field++)
255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 struct rtable **res);
/*
 * rt_hash_code - hash a destination/source address pair.
 * Feeds daddr and saddr to jhash_2words() with rt_hash_rnd as the seed.
 * The tail of the return expression is not visible in this listing.
 */
258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
260 return (jhash_2words(daddr, saddr, rt_hash_rnd)
/*
 * rt_hash - wrapper around rt_hash_code() for __be32 addresses.
 * Both addresses are force-cast to u32; (idx << 5) is XORed into the source
 * word before hashing, so the interface index perturbs the hash.
 */
264 #define rt_hash(daddr, saddr, idx) \
265 rt_hash_code((__force u32)(__be32)(daddr),\
266 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
268 #ifdef CONFIG_PROC_FS
269 struct rt_cache_iter_state {
/*
 * rt_cache_get_first - find the first entry for the /proc iterator.
 * Scans buckets from index rt_hash_mask downwards, recording the position in
 * the iterator state (seq->private) and loading each bucket's chain head.
 * An rcu_read_unlock_bh() is visible inside the loop; presumably it pairs
 * with a lock taken on a line not shown and is dropped again for empty
 * buckets - confirm against the full source.
 */
273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
275 struct rtable *r = NULL;
276 struct rt_cache_iter_state *st = seq->private;
278 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280 r = rt_hash_table[st->bucket].chain;
283 rcu_read_unlock_bh();
/*
 * rt_cache_get_next - advance the /proc iterator past @r.
 * Follows r's rt_next link; the visible code also drops the RCU-bh read
 * lock, decrements st->bucket and reloads the chain head of the new bucket,
 * i.e. it moves on to lower-numbered buckets.  The loop/branch structure
 * around these statements is not visible in this listing.
 */
288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
290 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
292 r = r->u.dst.rt_next;
294 rcu_read_unlock_bh();
295 if (--st->bucket < 0)
298 r = rt_hash_table[st->bucket].chain;
/*
 * rt_cache_get_idx - return the entry at position @pos of the iteration.
 * Starts at rt_cache_get_first() and steps with rt_cache_get_next() while
 * pos is non-zero (the statement that counts pos down is not visible here).
 * Returns NULL if the cache ran out before pos reached zero.
 */
303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
305 struct rtable *r = rt_cache_get_first(seq);
308 while (pos && (r = rt_cache_get_next(seq, r)))
310 return pos ? NULL : r;
/*
 * seq_file ->start: position 0 yields SEQ_START_TOKEN (the header row);
 * any other position maps to cache entry number *pos - 1.
 */
313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
315 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file ->next: after the start token fetch the first cache entry,
 * otherwise fetch the successor of @v.  The update of *pos and the return
 * statement are not visible in this listing.
 */
318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
320 struct rtable *r = NULL;
322 if (v == SEQ_START_TOKEN)
323 r = rt_cache_get_first(seq);
325 r = rt_cache_get_next(seq, v);
/*
 * seq_file ->stop: drop the RCU-bh read lock, but only when iteration
 * stopped on a real entry (v is neither NULL nor the start token).
 */
330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
332 if (v && v != SEQ_START_TOKEN)
333 rcu_read_unlock_bh();
/*
 * seq_file ->show: emit one row of the route cache dump.
 * For the start token a tab-separated column header is printed; for a real
 * entry the fields are formatted into 'temp' and printed.  Both kinds of
 * row are left-justified and padded to 127 characters.
 * The MTU column is the ADVMSS metric plus 40 (or 0 if the metric is unset);
 * the IRTT column is (RTT >> 3) + RTTVAR.
 * NOTE(review): 'temp' is filled with an unbounded sprintf(); its
 * declaration is not visible here - confirm the buffer is large enough.
 */
336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
338 if (v == SEQ_START_TOKEN)
339 seq_printf(seq, "%-127s\n",
340 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
344 struct rtable *r = v;
347 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349 r->u.dst.dev ? r->u.dst.dev->name : "*",
350 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352 r->u.dst.__use, 0, (unsigned long)r->rt_src,
353 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355 dst_metric(&r->u.dst, RTAX_WINDOW),
356 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357 dst_metric(&r->u.dst, RTAX_RTTVAR)),
359 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
363 seq_printf(seq, "%-127s\n", temp);
368 static struct seq_operations rt_cache_seq_ops = {
369 .start = rt_cache_seq_start,
370 .next = rt_cache_seq_next,
371 .stop = rt_cache_seq_stop,
372 .show = rt_cache_seq_show,
/*
 * ->open for the route cache /proc file.
 * Allocates the iterator state with kmalloc(GFP_KERNEL), opens the seq_file
 * with rt_cache_seq_ops and zeroes the state.  The allocation-failure and
 * seq_open() error paths, and the line that attaches the state to the
 * seq_file, are not visible in this listing.
 */
375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
377 struct seq_file *seq;
379 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
383 rc = seq_open(file, &rt_cache_seq_ops);
386 seq = file->private_data;
388 memset(s, 0, sizeof(*s));
396 static struct file_operations rt_cache_seq_fops = {
397 .owner = THIS_MODULE,
398 .open = rt_cache_seq_open,
401 .release = seq_release_private,
/*
 * seq_file ->start for the per-CPU statistics file.
 * Returns SEQ_START_TOKEN (under a condition not visible here, presumably
 * *pos == 0); otherwise returns the rt_cache_stat of the first possible CPU
 * with index >= *pos - 1, skipping CPUs for which cpu_possible() is false.
 */
405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
410 return SEQ_START_TOKEN;
412 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413 if (!cpu_possible(cpu))
416 return &per_cpu(rt_cache_stat, cpu);
/*
 * seq_file ->next for the per-CPU statistics file.
 * Returns the rt_cache_stat of the first possible CPU with index >= *pos,
 * skipping CPUs for which cpu_possible() is false.  The update of *pos and
 * the end-of-list return are not visible in this listing.
 */
421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
425 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426 if (!cpu_possible(cpu))
429 return &per_cpu(rt_cache_stat, cpu);
435 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
/*
 * seq_file ->show for the per-CPU statistics file.
 * The start token prints a header naming 17 columns; a real record prints
 * 17 eight-digit hex values, the first being the global entry count read
 * from ipv4_dst_ops.entries.  The remaining arguments are not visible in
 * this listing.
 */
440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
442 struct rt_cache_stat *st = v;
444 if (v == SEQ_START_TOKEN) {
445 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
449 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
450 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451 atomic_read(&ipv4_dst_ops.entries),
474 static struct seq_operations rt_cpu_seq_ops = {
475 .start = rt_cpu_seq_start,
476 .next = rt_cpu_seq_next,
477 .stop = rt_cpu_seq_stop,
478 .show = rt_cpu_seq_show,
482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
484 return seq_open(file, &rt_cpu_seq_ops);
487 static struct file_operations rt_cpu_seq_fops = {
488 .owner = THIS_MODULE,
489 .open = rt_cpu_seq_open,
492 .release = seq_release,
495 #endif /* CONFIG_PROC_FS */
/*
 * rt_free - release a cache entry that has been unlinked from its chain.
 * Removes any multipath state for @rt, then defers the actual free to
 * dst_rcu_free() via call_rcu_bh().
 */
497 static __inline__ void rt_free(struct rtable *rt)
499 multipath_remove(rt);
500 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
/*
 * rt_drop - like rt_free() in its visible statements: remove multipath
 * state, then defer the free to dst_rcu_free() via call_rcu_bh().  One
 * statement between the two calls is not visible in this listing.
 */
503 static __inline__ void rt_drop(struct rtable *rt)
505 multipath_remove(rt);
507 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
/*
 * rt_fast_clean - non-zero when @rth is a broadcast or multicast entry with
 * a non-zero input interface (fl.iif) that still has a successor in its
 * hash chain.
 */
510 static __inline__ int rt_fast_clean(struct rtable *rth)
512 /* Kill broadcast/multicast entries very aggressively, if they
513 collide in hash table with more useful entries */
514 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515 rth->fl.iif && rth->u.dst.rt_next;
/*
 * rt_valuable - non-zero for entries worth keeping longer.
 * Entries flagged RTCF_REDIRECTED or RTCF_NOTIFY qualify; the second half
 * of the '||' expression is not visible in this listing.
 */
518 static __inline__ int rt_valuable(struct rtable *rth)
520 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
/*
 * rt_may_expire - decide whether @rth can be evicted.
 * Tests, in order:
 *   - whether the entry is still referenced (__refcnt non-zero);
 *   - whether it has an explicit expiry that has been reached;
 *   - its age (jiffies - lastuse) against @tmo1 for entries that are not
 *     rt_fast_clean(), and against @tmo2 for rt_valuable() entries.
 * The value returned for each test is on lines not visible in this listing.
 */
524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
529 if (atomic_read(&rth->u.dst.__refcnt))
533 if (rth->u.dst.expires &&
534 time_after_eq(jiffies, rth->u.dst.expires))
537 age = jiffies - rth->u.dst.lastuse;
539 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540 (age <= tmo2 && rt_valuable(rth)))
/*
 * rt_score - compute an eviction score for @rt.
 * The base score is the bitwise inverse of the entry's age
 * (jiffies - lastuse) with the top two bits cleared.  A later test singles
 * out entries that are none of broadcast, multicast or local; the bits set
 * by that and any other test are not visible in this listing.
 */
546 /* Bits of score are:
548 * 30: not quite useless
549 * 29..0: usage counter
551 static inline u32 rt_score(struct rtable *rt)
553 u32 score = jiffies - rt->u.dst.lastuse;
555 score = ~score & ~(3<<30);
561 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
/*
 * compare_keys - return 1 when two flow keys are equal, 0 otherwise.
 * Each pair of fields (daddr, saddr, mark, the 16 bits starting at tos,
 * oif, iif) is XORed and the differences are ORed together, so a single
 * comparison against zero decides the result without branching per field.
 * NOTE(review): the u16 load through &...tos reads tos together with the
 * byte that follows it (presumably the scope field - confirm against the
 * struct flowi layout); this relies on the two fields being adjacent.
 */
567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
569 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
571 (fl1->mark ^ fl2->mark) |
572 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
573 *(u16 *)&fl2->nl_u.ip4_u.tos) |
574 (fl1->oif ^ fl2->oif) |
575 (fl1->iif ^ fl2->iif)) == 0;
578 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/*
 * rt_remove_balanced_route - unlink a multipath (DST_BALANCED) entry and
 * its siblings from a hash chain.
 * Walks the chain starting at @chain_head.  Every DST_BALANCED entry whose
 * flow key matches @expentry's is unlinked, whether it is @expentry itself
 * or a sibling.  'nextstep' records the link following the first
 * non-balanced entry seen once 'passedexpired' is set.  The third
 * parameter, the freeing of unlinked entries and the return statement are
 * not visible in this listing.
 */
579 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
580 struct rtable *expentry,
583 int passedexpired = 0;
584 struct rtable **nextstep = NULL;
585 struct rtable **rthp = chain_head;
591 while ((rth = *rthp) != NULL) {
595 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
596 compare_keys(&(*rthp)->fl, &expentry->fl)) {
597 if (*rthp == expentry) {
598 *rthp = rth->u.dst.rt_next;
601 *rthp = rth->u.dst.rt_next;
607 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
608 passedexpired && !nextstep)
609 nextstep = &rth->u.dst.rt_next;
611 rthp = &rth->u.dst.rt_next;
621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
/*
 * rt_check_expire - periodic timer handler that ages out cache entries.
 *
 * The number of buckets scanned per run is
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout (the division is
 * skipped when the timeout is <= 1), capped at the full table.  Scanning
 * resumes from the static 'rover' index and wraps with rt_hash_mask.
 *
 * Under the bucket lock each chain is walked:
 *   - entries with an explicit expiry are stepped over while
 *     now <= expires;
 *   - entries without one are stepped over when rt_may_expire() says no;
 *   - everything else is unlinked (balanced multipath entries via
 *     rt_remove_balanced_route() when that option is configured).
 * The scan has a fallback exit once jiffies has moved past 'now'.  The
 * periodic timer is re-armed ip_rt_gc_interval jiffies ahead.
 */
624 /* This runs via a timer and thus is always in BH context. */
625 static void rt_check_expire(unsigned long dummy)
627 static unsigned int rover;
628 unsigned int i = rover, goal;
629 struct rtable *rth, **rthp;
630 unsigned long now = jiffies;
633 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634 if (ip_rt_gc_timeout > 1)
635 do_div(mult, ip_rt_gc_timeout);
636 goal = (unsigned int)mult;
637 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638 for (; goal > 0; goal--) {
639 unsigned long tmo = ip_rt_gc_timeout;
641 i = (i + 1) & rt_hash_mask;
642 rthp = &rt_hash_table[i].chain;
646 spin_lock(rt_hash_lock_addr(i));
647 while ((rth = *rthp) != NULL) {
648 if (rth->u.dst.expires) {
649 /* Entry is expired even if it is in use */
650 if (time_before_eq(now, rth->u.dst.expires)) {
652 rthp = &rth->u.dst.rt_next;
655 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
657 rthp = &rth->u.dst.rt_next;
661 /* Cleanup aged off entries. */
662 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 /* remove all related balanced entries if necessary */
664 if (rth->u.dst.flags & DST_BALANCED) {
665 rthp = rt_remove_balanced_route(
666 &rt_hash_table[i].chain,
671 *rthp = rth->u.dst.rt_next;
674 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675 *rthp = rth->u.dst.rt_next;
677 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
679 spin_unlock(rt_hash_lock_addr(i));
681 /* Fallback loop breaker. */
682 if (time_after(jiffies, now))
686 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
/*
 * rt_run_flush - empty the whole route cache.
 * Re-seeds rt_hash_rnd with four random bytes, then visits every bucket
 * from the highest index down to 0.  With the bucket lock held (BH
 * disabled) the whole chain is detached by setting the head to NULL; the
 * detached list is then walked after the lock has been released.  What is
 * done with each detached entry is on a line not visible in this listing.
 */
689 /* This can run from both BH and non-BH contexts, the latter
690 * in the case of a forced flush event.
692 static void rt_run_flush(unsigned long dummy)
695 struct rtable *rth, *next;
699 get_random_bytes(&rt_hash_rnd, 4);
701 for (i = rt_hash_mask; i >= 0; i--) {
702 spin_lock_bh(rt_hash_lock_addr(i));
703 rth = rt_hash_table[i].chain;
705 rt_hash_table[i].chain = NULL;
706 spin_unlock_bh(rt_hash_lock_addr(i));
708 for (; rth; rth = next) {
709 next = rth->u.dst.rt_next;
715 static DEFINE_SPINLOCK(rt_flush_lock);
/*
 * rt_cache_flush - schedule (or perform) a flush of the route cache.
 * @delay: requested delay in jiffies; the visible code substitutes
 *         ip_rt_min_delay under a condition not shown (presumably a
 *         negative delay - confirm).
 *
 * Under rt_flush_lock: if a flush timer was pending and this request is not
 * immediate (delay > 0) while a deadline is set, the remaining time to
 * rt_deadline is computed and used to adjust the delay; callers outside
 * softirq context get special treatment when little of the
 * ip_rt_max_delay - ip_rt_min_delay window remains.  An early unlock path is
 * visible at the first spin_unlock_bh(); what it does before returning is
 * not shown.  Otherwise a deadline of now + ip_rt_max_delay is established
 * if none exists and rt_flush_timer is (re)armed for now + delay.
 */
717 void rt_cache_flush(int delay)
719 unsigned long now = jiffies;
720 int user_mode = !in_softirq();
723 delay = ip_rt_min_delay;
725 /* flush existing multipath state*/
728 spin_lock_bh(&rt_flush_lock);
730 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731 long tmo = (long)(rt_deadline - now);
733 /* If flush timer is already running
734 and flush request is not immediate (delay > 0):
736 if deadline is not achieved, prolongate timer to "delay",
737 otherwise fire it at deadline time.
740 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
748 spin_unlock_bh(&rt_flush_lock);
753 if (rt_deadline == 0)
754 rt_deadline = now + ip_rt_max_delay;
756 mod_timer(&rt_flush_timer, now+delay);
757 spin_unlock_bh(&rt_flush_lock);
/*
 * rt_secret_rebuild - timer handler for rt_secret_timer.
 * Re-arms the timer ip_rt_secret_interval jiffies after the time sampled on
 * entry.  The work done between sampling 'now' and re-arming is not visible
 * in this listing.
 */
760 static void rt_secret_rebuild(unsigned long dummy)
762 unsigned long now = jiffies;
765 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
769 Short description of GC goals.
771 We want to build an algorithm which keeps the routing cache
772 at some equilibrium point, where the number of aged-off entries
773 is kept approximately equal to the number of newly generated ones.
775 The current expiration strength is the variable "expire".
776 We try to adjust it dynamically, so that if networking
777 is idle "expire" is large enough to keep enough warm entries,
778 and when load increases it is reduced to limit the cache size.
/*
 * rt_garbage_collect - dst_ops ->gc handler; shrink the route cache.
 *
 * Visible behaviour:
 *   - Always counts gc_total.  A run is counted as gc_ignored when the
 *     previous one was less than ip_rt_gc_min_interval ago and the cache
 *     holds fewer than ip_rt_max_size entries.
 *   - 'goal' (entries to expire) starts as
 *     entries - (ip_rt_gc_elasticity << rt_hash_log) and is then recomputed
 *     relative to the persistent 'equilibrium' value, which is itself
 *     nudged by goal / 2 bounded by the table size.  The conditions
 *     selecting between these adjustments are not visible here.
 *   - Every bucket is scanned once per pass, starting after 'rover', under
 *     the bucket lock with BH disabled; entries for which
 *     rt_may_expire(rth, tmo, expire) is true are unlinked.
 *   - If the goal is missed, gc_goal_miss is counted and passes repeat only
 *     while not in softirq context and jiffies has not advanced.
 *   - If the cache is still not below ip_rt_max_size, "dst cache overflow"
 *     is logged and gc_dst_overflow counted.
 *   - On the way out 'expire' grows by ip_rt_gc_min_interval and is reset to
 *     ip_rt_gc_timeout if it exceeds that or the cache is below gc_thresh.
 *
 * NOTE(review): the RT_CACHE_DEBUG >= 2 printk()s print 'expire', an
 * unsigned long, with "%u".
 */
781 static int rt_garbage_collect(void)
783 static unsigned long expire = RT_GC_TIMEOUT;
784 static unsigned long last_gc;
786 static int equilibrium;
787 struct rtable *rth, **rthp;
788 unsigned long now = jiffies;
792 * Garbage collection is pretty expensive,
793 * do not make it too frequently.
796 RT_CACHE_STAT_INC(gc_total);
798 if (now - last_gc < ip_rt_gc_min_interval &&
799 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800 RT_CACHE_STAT_INC(gc_ignored);
804 /* Calculate number of entries, which we want to expire now. */
805 goal = atomic_read(&ipv4_dst_ops.entries) -
806 (ip_rt_gc_elasticity << rt_hash_log);
808 if (equilibrium < ipv4_dst_ops.gc_thresh)
809 equilibrium = ipv4_dst_ops.gc_thresh;
810 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
812 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
816 /* We are in dangerous area. Try to reduce cache really
819 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
823 if (now - last_gc >= ip_rt_gc_min_interval)
834 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835 unsigned long tmo = expire;
837 k = (k + 1) & rt_hash_mask;
838 rthp = &rt_hash_table[k].chain;
839 spin_lock_bh(rt_hash_lock_addr(k));
840 while ((rth = *rthp) != NULL) {
841 if (!rt_may_expire(rth, tmo, expire)) {
843 rthp = &rth->u.dst.rt_next;
846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847 /* remove all related balanced entries
850 if (rth->u.dst.flags & DST_BALANCED) {
853 rthp = rt_remove_balanced_route(
854 &rt_hash_table[k].chain,
861 *rthp = rth->u.dst.rt_next;
865 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866 *rthp = rth->u.dst.rt_next;
869 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
871 spin_unlock_bh(rt_hash_lock_addr(k));
880 /* Goal is not achieved. We stop process if:
882 - if expire reduced to zero. Otherwise, expire is halfed.
883 - if table is not full.
884 - if we are called from interrupt.
885 - jiffies check is just fallback/debug loop breaker.
886 We will not spin here for long time in any case.
889 RT_CACHE_STAT_INC(gc_goal_miss);
895 #if RT_CACHE_DEBUG >= 2
896 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897 atomic_read(&ipv4_dst_ops.entries), goal, i);
900 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
902 } while (!in_softirq() && time_before_eq(jiffies, now));
904 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
907 printk(KERN_WARNING "dst cache overflow\n");
908 RT_CACHE_STAT_INC(gc_dst_overflow);
912 expire += ip_rt_gc_min_interval;
913 if (expire > ip_rt_gc_timeout ||
914 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915 expire = ip_rt_gc_timeout;
916 #if RT_CACHE_DEBUG >= 2
917 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918 atomic_read(&ipv4_dst_ops.entries), goal, rover);
/*
 * rt_intern_hash - insert @rt into bucket @hash of the route cache.
 *
 * Visible behaviour, with the bucket lock held (BH disabled):
 *   - If an entry with the same flow key already exists (non-balanced only
 *     when multipath caching is configured), it is moved to the front of
 *     the chain using rcu_assign_pointer(), gains a reference, has lastuse
 *     refreshed, and the lock is dropped.  How @rt and @rp are handled on
 *     this path is not visible here.
 *   - While walking, unreferenced entries are scored with rt_score() to
 *     pick an eviction candidate; if the chain is longer than
 *     ip_rt_gc_elasticity the candidate is unlinked.
 *   - For unicast routes and output routes (fl.iif == 0) the neighbour is
 *     bound with arp_bind_neighbour().  On failure the lock is released;
 *     for -ENOBUFS, callers outside softirq get one retry after a forced
 *     rt_garbage_collect() run with elasticity 1 and min interval 0 (both
 *     restored afterwards); otherwise a rate-limited "Neighbour table
 *     overflow." warning is printed.
 *   - Finally @rt is linked at the head of the chain and the lock dropped.
 *
 * NOTE(review): the head insertion uses plain stores to rt->u.dst.rt_next
 * and the chain head, whereas the move-to-front path uses
 * rcu_assign_pointer(); confirm lock-free readers cannot observe a
 * half-initialized entry.
 * NOTE(review): the retry path temporarily overwrites the global tunables
 * ip_rt_gc_elasticity / ip_rt_gc_min_interval with no synchronization
 * visible here.
 */
923 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
925 struct rtable *rth, **rthp;
927 struct rtable *cand, **candp;
930 int attempts = !in_softirq();
939 rthp = &rt_hash_table[hash].chain;
941 spin_lock_bh(rt_hash_lock_addr(hash));
942 while ((rth = *rthp) != NULL) {
943 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944 if (!(rth->u.dst.flags & DST_BALANCED) &&
945 compare_keys(&rth->fl, &rt->fl)) {
947 if (compare_keys(&rth->fl, &rt->fl)) {
950 *rthp = rth->u.dst.rt_next;
952 * Since lookup is lockfree, the deletion
953 * must be visible to another weakly ordered CPU before
954 * the insertion at the start of the hash chain.
956 rcu_assign_pointer(rth->u.dst.rt_next,
957 rt_hash_table[hash].chain);
959 * Since lookup is lockfree, the update writes
960 * must be ordered for consistency on SMP.
962 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
965 dst_hold(&rth->u.dst);
966 rth->u.dst.lastuse = now;
967 spin_unlock_bh(rt_hash_lock_addr(hash));
974 if (!atomic_read(&rth->u.dst.__refcnt)) {
975 u32 score = rt_score(rth);
977 if (score <= min_score) {
986 rthp = &rth->u.dst.rt_next;
990 /* ip_rt_gc_elasticity used to be average length of chain
991 * length, when exceeded gc becomes really aggressive.
993 * The second limit is less certain. At the moment it allows
994 * only 2 entries per bucket. We will see.
996 if (chain_length > ip_rt_gc_elasticity) {
997 *candp = cand->u.dst.rt_next;
1002 /* Try to bind route to arp only if it is output
1003 route or unicast forwarding path.
1005 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006 int err = arp_bind_neighbour(&rt->u.dst);
1008 spin_unlock_bh(rt_hash_lock_addr(hash));
1010 if (err != -ENOBUFS) {
1015 /* Neighbour tables are full and nothing
1016 can be released. Try to shrink route cache,
1017 it is most likely it holds some neighbour records.
1019 if (attempts-- > 0) {
1020 int saved_elasticity = ip_rt_gc_elasticity;
1021 int saved_int = ip_rt_gc_min_interval;
1022 ip_rt_gc_elasticity = 1;
1023 ip_rt_gc_min_interval = 0;
1024 rt_garbage_collect();
1025 ip_rt_gc_min_interval = saved_int;
1026 ip_rt_gc_elasticity = saved_elasticity;
1030 if (net_ratelimit())
1031 printk(KERN_WARNING "Neighbour table overflow.\n");
1037 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1038 #if RT_CACHE_DEBUG >= 2
1039 if (rt->u.dst.rt_next) {
1041 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042 NIPQUAD(rt->rt_dst));
1043 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1044 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1048 rt_hash_table[hash].chain = rt;
1049 spin_unlock_bh(rt_hash_lock_addr(hash));
/*
 * rt_bind_peer - attach an inet_peer for rt->rt_dst to @rt.
 * @create is passed through to inet_getpeer().  The peer is looked up
 * before the lock is taken; rt->peer is then re-checked for NULL under the
 * function-local rt_peer_lock.  The assignment itself and the handling of
 * a peer that lost the race are not visible in this listing.
 */
1054 void rt_bind_peer(struct rtable *rt, int create)
1056 static DEFINE_SPINLOCK(rt_peer_lock);
1057 struct inet_peer *peer;
1059 peer = inet_getpeer(rt->rt_dst, create);
1061 spin_lock_bh(&rt_peer_lock);
1062 if (rt->peer == NULL) {
1066 spin_unlock_bh(&rt_peer_lock);
1072 * Peer allocation may fail only in serious out-of-memory conditions. However
1073 * we still can generate some output.
1074 * Random ID selection looks a bit dangerous because we have no chances to
1075 * select ID being unique in a reasonable period of time.
1076 * But broken packet identifier may be better than no packet at all.
/*
 * ip_select_fb_ident - fallback IP ID selection when no peer is available.
 * Under ip_fb_id_lock: derives a salt from secure_ip_id() applied to the
 * previous fallback value XORed with the destination address, stores the
 * low 16 bits (network byte order) in iph->id and remembers the salt for
 * the next call.
 */
1078 static void ip_select_fb_ident(struct iphdr *iph)
1080 static DEFINE_SPINLOCK(ip_fb_id_lock);
1081 static u32 ip_fallback_id;
1084 spin_lock_bh(&ip_fb_id_lock);
1085 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1086 iph->id = htons(salt & 0xFFFF);
1087 ip_fallback_id = salt;
1088 spin_unlock_bh(&ip_fb_id_lock);
/*
 * __ip_select_ident - choose the IP ID for an outgoing header.
 * Treats @dst as a struct rtable.  If the route has no peer yet, one is
 * bound with rt_bind_peer(rt, 1); the ID is then taken from
 * inet_getid(rt->peer, more).  A debug message naming the caller's return
 * address and a fall back to ip_select_fb_ident() are also visible; the
 * conditions selecting between these paths are not shown in this listing.
 */
1091 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1093 struct rtable *rt = (struct rtable *) dst;
1096 if (rt->peer == NULL)
1097 rt_bind_peer(rt, 1);
1099 /* If peer is attached to destination, it is never detached,
1100 so that we need not to grab a lock to dereference it.
1103 iph->id = htons(inet_getid(rt->peer, more));
1107 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108 __builtin_return_address(0));
1110 ip_select_fb_ident(iph);
/*
 * rt_del - unlink @rt from bucket @hash.
 * Walks the chain under the bucket lock (BH disabled) and splices @rt out
 * by pointing the preceding link at rt's successor.  The comparison that
 * finds @rt and whatever is done with it afterwards are not visible in
 * this listing.
 */
1113 static void rt_del(unsigned hash, struct rtable *rt)
1115 struct rtable **rthp;
1117 spin_lock_bh(rt_hash_lock_addr(hash));
1119 for (rthp = &rt_hash_table[hash].chain; *rthp;
1120 rthp = &(*rthp)->u.dst.rt_next)
1122 *rthp = rt->u.dst.rt_next;
1126 spin_unlock_bh(rt_hash_lock_addr(hash));
/*
 * ip_rt_redirect - process an ICMP redirect for @daddr.
 * @old_gw: gateway that sent the redirect
 * @daddr:  destination the redirect is about
 * @new_gw: advised gateway
 * @saddr:  our source address for the affected traffic
 * @dev:    device the redirect arrived on
 *
 * Visible behaviour:
 *   - Rejected when new_gw equals old_gw, the device does not accept
 *     redirects, or new_gw is multicast, bad-class or zeronet.
 *   - On non-shared media new_gw must be on-link relative to old_gw, and
 *     with secure redirects ip_fib_check_default() must not object; on the
 *     other branch new_gw must be of type RTN_UNICAST.
 *   - All four combinations of source key {saddr, 0} and interface key
 *     {dev->ifindex, 0} are hashed and their chains searched.  Entries
 *     whose flow key differs are stepped over; a candidate must also match
 *     rt_dst, rt_src, old_gw and dev.
 *   - For a match, a new rtable is allocated with dst_alloc(), initialized
 *     as a copy, flagged RTCF_REDIRECTED and pointed at new_gw; the old
 *     entry is confirmed, a neighbour is bound, NETEVENT_REDIRECT is
 *     raised, and the new entry is inserted with rt_intern_hash().
 *   - With CONFIG_IP_ROUTE_VERBOSE, a rejected redirect is logged
 *     (rate-limited) if the device logs martians.
 * Cleanup and error paths are largely not visible in this listing.
 */
1129 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130 __be32 saddr, struct net_device *dev)
1133 struct in_device *in_dev = in_dev_get(dev);
1134 struct rtable *rth, **rthp;
1135 __be32 skeys[2] = { saddr, 0 };
1136 int ikeys[2] = { dev->ifindex, 0 };
1137 struct netevent_redirect netevent;
1142 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144 goto reject_redirect;
1146 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148 goto reject_redirect;
1149 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150 goto reject_redirect;
1152 if (inet_addr_type(new_gw) != RTN_UNICAST)
1153 goto reject_redirect;
1156 for (i = 0; i < 2; i++) {
1157 for (k = 0; k < 2; k++) {
1158 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1160 rthp=&rt_hash_table[hash].chain;
1163 while ((rth = rcu_dereference(*rthp)) != NULL) {
1166 if (rth->fl.fl4_dst != daddr ||
1167 rth->fl.fl4_src != skeys[i] ||
1168 rth->fl.oif != ikeys[k] ||
1170 rthp = &rth->u.dst.rt_next;
1174 if (rth->rt_dst != daddr ||
1175 rth->rt_src != saddr ||
1177 rth->rt_gateway != old_gw ||
1178 rth->u.dst.dev != dev)
1181 dst_hold(&rth->u.dst);
1184 rt = dst_alloc(&ipv4_dst_ops);
1191 /* Copy all the information. */
1193 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194 rt->u.dst.__use = 1;
1195 atomic_set(&rt->u.dst.__refcnt, 1);
1196 rt->u.dst.child = NULL;
1198 dev_hold(rt->u.dst.dev);
1200 in_dev_hold(rt->idev);
1201 rt->u.dst.obsolete = 0;
1202 rt->u.dst.lastuse = jiffies;
1203 rt->u.dst.path = &rt->u.dst;
1204 rt->u.dst.neighbour = NULL;
1205 rt->u.dst.hh = NULL;
1206 rt->u.dst.xfrm = NULL;
1208 rt->rt_flags |= RTCF_REDIRECTED;
1210 /* Gateway is different ... */
1211 rt->rt_gateway = new_gw;
1213 /* Redirect received -> path was valid */
1214 dst_confirm(&rth->u.dst);
1217 atomic_inc(&rt->peer->refcnt);
1219 if (arp_bind_neighbour(&rt->u.dst) ||
1220 !(rt->u.dst.neighbour->nud_state &
1222 if (rt->u.dst.neighbour)
1223 neigh_event_send(rt->u.dst.neighbour, NULL);
1229 netevent.old = &rth->u.dst;
1230 netevent.new = &rt->u.dst;
1231 call_netevent_notifiers(NETEVENT_REDIRECT,
1235 if (!rt_intern_hash(hash, rt, &rt))
1248 #ifdef CONFIG_IP_ROUTE_VERBOSE
1249 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251 "%u.%u.%u.%u ignored.\n"
1252 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1253 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1254 NIPQUAD(saddr), NIPQUAD(daddr));
/*
 * ipv4_negative_advice - dst_ops ->negative_advice handler.
 * Starts from returning @dst unchanged.  Obsolete entries take one branch;
 * entries that were created by a redirect (RTCF_REDIRECTED) or carry an
 * expiry take another, in which the entry's hash is computed from its flow
 * key and, at RT_CACHE_DEBUG >= 1, a "redirect ... dropped" message is
 * logged.  The statements that actually drop the entry and set the return
 * value are not visible in this listing.
 */
1259 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1261 struct rtable *rt = (struct rtable*)dst;
1262 struct dst_entry *ret = dst;
1265 if (dst->obsolete) {
1268 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269 rt->u.dst.expires) {
1270 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1272 #if RT_CACHE_DEBUG >= 1
1273 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274 "%u.%u.%u.%u/%02x dropped\n",
1275 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1286 * 1. The first ip_rt_redirect_number redirects are sent
1287 * with exponential backoff, then we stop sending them at all,
1288 * assuming that the host ignores our redirects.
1289 * 2. If we did not see packets requiring redirects
1290 * during ip_rt_redirect_silence, we assume that the host
1291 * forgot redirected route and start to send redirects again.
1293 * This algorithm is much cheaper and more intelligent than dumb load limiting
1296 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297 * and "frag. need" (breaks PMTU discovery) in icmp.c.
/*
 * ip_rt_send_redirect - send a rate-limited ICMP host redirect for @skb.
 *
 * Uses the route's rate_tokens / rate_last fields as back-off state:
 *   - nothing is sent if the device has redirects disabled (the exit path
 *     itself is not visible here);
 *   - rate_tokens is reset to 0 once more than ip_rt_redirect_silence
 *     jiffies have passed since rate_last;
 *   - once rate_tokens reaches ip_rt_redirect_number only rate_last is
 *     refreshed;
 *   - otherwise a redirect is sent when rate_tokens is 0 or a deadline of
 *     rate_last + (ip_rt_redirect_load << rate_tokens) has been met (the
 *     comparison is on a line not shown), after which rate_last is updated
 *     and rate_tokens incremented.
 * With CONFIG_IP_ROUTE_VERBOSE a warning is logged when the token count
 * reaches the limit on a device that logs martians.
 */
1300 void ip_rt_send_redirect(struct sk_buff *skb)
1302 struct rtable *rt = (struct rtable*)skb->dst;
1303 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1308 if (!IN_DEV_TX_REDIRECTS(in_dev))
1311 /* No redirected packets during ip_rt_redirect_silence;
1312 * reset the algorithm.
1314 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315 rt->u.dst.rate_tokens = 0;
1317 /* Too many ignored redirects; do not send anything
1318 * set u.dst.rate_last to the last seen redirected packet.
1320 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321 rt->u.dst.rate_last = jiffies;
1325 /* Check for load limit; set rate_last to the latest sent
1328 if (rt->u.dst.rate_tokens == 0 ||
1330 (rt->u.dst.rate_last +
1331 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1332 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1333 rt->u.dst.rate_last = jiffies;
1334 ++rt->u.dst.rate_tokens;
1335 #ifdef CONFIG_IP_ROUTE_VERBOSE
1336 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1337 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1339 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1340 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1341 NIPQUAD(rt->rt_src), rt->rt_iif,
1342 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
/*
 * ip_error - handle a packet that was routed to an error destination.
 * Maps rt->u.dst.error to an ICMP destination-unreachable code
 * (HOST_UNREACH, NET_UNREACH or PKT_FILTERED; the case labels are not
 * visible here), then applies a token bucket kept in the route: tokens
 * accrue with elapsed jiffies up to ip_rt_error_burst, and an ICMP error is
 * sent only when at least ip_rt_error_cost tokens are available, which are
 * then spent.  The skb is freed at the 'out' label.
 */
1349 static int ip_error(struct sk_buff *skb)
1351 struct rtable *rt = (struct rtable*)skb->dst;
1355 switch (rt->u.dst.error) {
1360 code = ICMP_HOST_UNREACH;
1363 code = ICMP_NET_UNREACH;
1366 code = ICMP_PKT_FILTERED;
1371 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1372 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1373 rt->u.dst.rate_tokens = ip_rt_error_burst;
1374 rt->u.dst.rate_last = now;
1375 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1376 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1377 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1380 out: kfree_skb(skb);
1385 * The last two values are not from the RFC but
1386 * are needed for AMPRnet AX.25 paths.
/*
 * MTU plateau values in strictly descending order; guess_mtu() scans this
 * table from the front.
 */
1389 static const unsigned short mtu_plateau[] =
1390 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
/*
 * guess_mtu - pick the next lower plateau for @old_mtu.
 * Returns the first (i.e. largest) mtu_plateau[] entry that is strictly
 * smaller than @old_mtu.  The value returned when no entry qualifies is on
 * a line not visible in this listing.
 */
1392 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1396 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1397 if (old_mtu > mtu_plateau[i])
1398 return mtu_plateau[i];
/*
 * ip_rt_frag_needed - apply an ICMP "fragmentation needed" report.
 * @iph:     IP header of the packet that triggered the report
 * @new_mtu: next-hop MTU advertised in the report
 *
 * Returns the estimated MTU if one was set, otherwise @new_mtu.
 *
 * Visible behaviour: a check of ipv4_config.no_pmtu_disc comes first (its
 * consequence is not visible here).  Cache chains are searched for
 * source keys {iph->saddr, 0} with interface key 0, using RCU
 * dereferences.  For each entry matching the flow and route addresses
 * whose MTU metric is not locked:
 *   - if @new_mtu is below 68 or not smaller than the packet's total
 *     length, an MTU is guessed from the plateau table (after optionally
 *     subtracting the header length, a BSD 4.2 compatibility hack);
 *   - if the resulting MTU is below the current metric, the route is
 *     confirmed, the MTU is clamped to ip_rt_min_pmtu (locking the metric
 *     when clamped), stored, and an expiry is set.
 */
1402 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1405 unsigned short old_mtu = ntohs(iph->tot_len);
1407 __be32 skeys[2] = { iph->saddr, 0, };
1408 __be32 daddr = iph->daddr;
1409 unsigned short est_mtu = 0;
1411 if (ipv4_config.no_pmtu_disc)
1414 for (i = 0; i < 2; i++) {
1415 unsigned hash = rt_hash(daddr, skeys[i], 0);
1418 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419 rth = rcu_dereference(rth->u.dst.rt_next)) {
1420 if (rth->fl.fl4_dst == daddr &&
1421 rth->fl.fl4_src == skeys[i] &&
1422 rth->rt_dst == daddr &&
1423 rth->rt_src == iph->saddr &&
1425 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1426 unsigned short mtu = new_mtu;
1428 if (new_mtu < 68 || new_mtu >= old_mtu) {
1430 /* BSD 4.2 compatibility hack :-( */
1432 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1433 old_mtu >= 68 + (iph->ihl << 2))
1434 old_mtu -= iph->ihl << 2;
1436 mtu = guess_mtu(old_mtu);
1438 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1439 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1440 dst_confirm(&rth->u.dst);
1441 if (mtu < ip_rt_min_pmtu) {
1442 mtu = ip_rt_min_pmtu;
1443 rth->u.dst.metrics[RTAX_LOCK-1] |=
1446 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1447 dst_set_expires(&rth->u.dst,
1456 return est_mtu ? : new_mtu;
/*
 * ip_rt_update_pmtu - lower the MTU metric of @dst to @mtu.
 * @dst: destination entry to update
 * @mtu: new MTU value
 *
 * The update happens only when @mtu is smaller than the current MTU
 * metric, is at least 68 and the MTU metric is not locked.  A value
 * below ip_rt_min_pmtu is raised to ip_rt_min_pmtu and the MTU metric is
 * then marked locked.  After storing the value an expiry of
 * ip_rt_mtu_expires is set and NETEVENT_PMTU_UPDATE is raised.
 */
1459 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1461 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1462 !(dst_metric_locked(dst, RTAX_MTU))) {
/* Clamp to the configured floor and lock the metric. */
1463 if (mtu < ip_rt_min_pmtu) {
1464 mtu = ip_rt_min_pmtu;
1465 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1467 dst->metrics[RTAX_MTU-1] = mtu;
1468 dst_set_expires(dst, ip_rt_mtu_expires);
1469 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
/*
 * ipv4_dst_check - validity check for an IPv4 dst entry.
 * @dst:    entry to check
 * @cookie: caller-supplied cookie
 *
 * NOTE(review): the body is not present in this listing, so the return
 * value cannot be documented from here.
 */
1473 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
/*
 * ipv4_dst_destroy - tear down the IPv4-specific part of a dst entry.
 * @dst: entry being destroyed; it is treated as a struct rtable
 *
 * The visible lines only pick up the route's inet_peer and in_device
 * pointers.  NOTE(review): the statements that release them are missing
 * from this listing.
 */
1478 static void ipv4_dst_destroy(struct dst_entry *dst)
1480 struct rtable *rt = (struct rtable *) dst;
1481 struct inet_peer *peer = rt->peer;
1482 struct in_device *idev = rt->idev;
/*
 * ipv4_dst_ifdown - detach a route from a device that is going away.
 * @dst: route entry (a struct rtable)
 * @dev: device being taken down
 *
 * When @dev is not the loopback device and the route's in_device
 * belongs to @dev, rt->idev is re-pointed at the loopback device's
 * in_device (obtained with in_dev_get()).
 *
 * NOTE(review): the rest of the parameter list and the release of the
 * previous idev are not visible in this listing.
 */
1495 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1498 struct rtable *rt = (struct rtable *) dst;
1499 struct in_device *idev = rt->idev;
1500 if (dev != &loopback_dev && idev && idev->dev == dev) {
1501 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
/* in_dev_get() may return NULL; rt->idev is only replaced on success. */
1502 if (loopback_idev) {
1503 rt->idev = loopback_idev;
/*
 * ipv4_link_failure - react to a link-level delivery failure for @skb.
 *
 * Sends an ICMP destination-unreachable / host-unreachable message for
 * @skb, then sets the expiry of the route attached to the skb to 0.
 *
 * NOTE(review): the declaration of rt and any NULL test on it are not
 * visible in this listing.
 */
1509 static void ipv4_link_failure(struct sk_buff *skb)
1513 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1515 rt = (struct rtable *) skb->dst;
1517 dst_set_expires(&rt->u.dst, 0);
/*
 * ip_rt_bug - output hook installed on routes that must never transmit.
 *
 * Logs the packet's source and destination addresses and the name of
 * skb->dev ("?" when skb->dev is NULL) at KERN_DEBUG level.  The input
 * route constructors in this file set it as the dst output function.
 *
 * NOTE(review): the remainder of the body (skb disposal, return value)
 * is missing from this listing.
 */
1520 static int ip_rt_bug(struct sk_buff *skb)
1522 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1523 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1524 skb->dev ? skb->dev->name : "?");
1530 We do not cache the source address of the outgoing interface,
1531 because it is used only by the IP RR, TS and SRR options,
1532 so it is off the fast path.
1534 BTW remember: "addr" is allowed to be unaligned
/*
 * ip_rt_get_source - store the source address to use for @rt in @addr.
 * @addr: destination buffer; 4 bytes are written
 * @rt:   route the address is chosen for
 *
 * Selection order as far as visible here: a route with fl.iif == 0 takes
 * one branch (its assignment is missing from this listing); otherwise a
 * successful fib_lookup() yields FIB_RES_PREFSRC(res); otherwise
 * inet_select_addr() is asked for an address on the route's device.
 */
1538 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1541 struct fib_result res;
1543 if (rt->fl.iif == 0)
1545 else if (fib_lookup(&rt->fl, &res) == 0) {
1546 src = FIB_RES_PREFSRC(res);
1549 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
/* Byte copy rather than a store through a cast pointer. */
1551 memcpy(addr, &src, 4);
/*
 * set_class_tag - fill in the unset halves of a route's tclassid.
 * @rt:  route to update
 * @tag: value supplying the missing halves
 *
 * tclassid is handled as two 16-bit halves.  A half of rt->u.dst.tclassid
 * that is still zero is taken from the same half of @tag; a half that is
 * already non-zero is left untouched.  Only built when
 * CONFIG_NET_CLS_ROUTE is defined.
 */
1554 #ifdef CONFIG_NET_CLS_ROUTE
1555 static void set_class_tag(struct rtable *rt, u32 tag)
/* Low 16 bits. */
1557 if (!(rt->u.dst.tclassid & 0xFFFF))
1558 rt->u.dst.tclassid |= tag & 0xFFFF;
/* High 16 bits. */
1559 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1560 rt->u.dst.tclassid |= tag & 0xFFFF0000;
/*
 * rt_set_nexthop - fill gateway, metrics and type of a new cache entry.
 * @rt:   route cache entry being built
 * @res:  FIB lookup result the entry is derived from
 * @itag: class tag passed to set_class_tag() in one configuration
 *
 * Copies the FIB entry's metrics into the route, then supplies defaults
 * and upper bounds for the MTU, hop limit and advertised MSS metrics,
 * and finally copies res->type into rt->rt_type.
 *
 * NOTE(review): the if/else structure around res->fi and several
 * preprocessor lines (#else/#endif) are missing from this listing.
 */
1564 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1566 struct fib_info *fi = res->fi;
/* Use the FIB gateway only when the next hop has link scope. */
1569 if (FIB_RES_GW(*res) &&
1570 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1571 rt->rt_gateway = FIB_RES_GW(*res);
1572 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1573 sizeof(rt->u.dst.metrics));
/* No MTU configured on the FIB entry: start from the device MTU. */
1574 if (fi->fib_mtu == 0) {
1575 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
/* Locked MTU metric on a gatewayed route: cap the MTU at 576. */
1576 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1577 rt->rt_gateway != rt->rt_dst &&
1578 rt->u.dst.dev->mtu > 576)
1579 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1585 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
/* Defaults and clamps for metrics that are unset or too large. */
1587 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1588 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1589 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1590 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1591 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1592 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1594 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1595 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1597 #ifdef CONFIG_NET_CLS_ROUTE
1598 #ifdef CONFIG_IP_MULTIPLE_TABLES
1599 set_class_tag(rt, fib_rules_tclass(res));
1601 set_class_tag(rt, itag);
1603 rt->rt_type = res->type;
/*
 * ip_route_input_mc - build an input route for a multicast packet.
 * @skb:   received packet; the new route is attached to skb->dst
 * @daddr: destination (multicast) address
 * @saddr: source address
 * @tos:   type of service
 * @dev:   receiving device
 * @our:   not referenced in the visible lines -- presumably selects local
 *         delivery; TODO confirm against the full source
 *
 * Validates the source address, allocates a route cache entry bound to
 * the loopback device and inserts it with rt_intern_hash(), whose result
 * is returned.
 *
 * NOTE(review): error paths, local declarations and some conditionals
 * are missing from this listing.
 */
1606 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1607 u8 tos, struct net_device *dev, int our)
1612 struct in_device *in_dev = in_dev_get(dev);
1615 /* Primary sanity checks. */
1620 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1621 skb->protocol != htons(ETH_P_IP))
/*
 * A zero-net source skips fib_validate_source() and takes a link-scope
 * address of the receiving device as spec_dst.
 */
1624 if (ZERONET(saddr)) {
1625 if (!LOCAL_MCAST(daddr))
1627 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1628 } else if (fib_validate_source(saddr, 0, tos, 0,
1629 dev, &spec_dst, &itag) < 0)
1632 rth = dst_alloc(&ipv4_dst_ops);
/* ip_rt_bug() is installed as the output hook of this entry. */
1636 rth->u.dst.output= ip_rt_bug;
1638 atomic_set(&rth->u.dst.__refcnt, 1);
1639 rth->u.dst.flags= DST_HOST;
1640 if (in_dev->cnf.no_policy)
1641 rth->u.dst.flags |= DST_NOPOLICY;
/* Lookup key (fl.*) and resolved addresses (rt_*) are the same here. */
1642 rth->fl.fl4_dst = daddr;
1643 rth->rt_dst = daddr;
1644 rth->fl.fl4_tos = tos;
1645 rth->fl.mark = skb->mark;
1646 rth->fl.fl4_src = saddr;
1647 rth->rt_src = saddr;
1648 #ifdef CONFIG_NET_CLS_ROUTE
1649 rth->u.dst.tclassid = itag;
1652 rth->fl.iif = dev->ifindex;
/* The entry holds references on the loopback device and its in_device. */
1653 rth->u.dst.dev = &loopback_dev;
1654 dev_hold(rth->u.dst.dev);
1655 rth->idev = in_dev_get(rth->u.dst.dev);
1657 rth->rt_gateway = daddr;
1658 rth->rt_spec_dst= spec_dst;
1659 rth->rt_type = RTN_MULTICAST;
1660 rth->rt_flags = RTCF_MULTICAST;
1662 rth->u.dst.input= ip_local_deliver;
1663 rth->rt_flags |= RTCF_LOCAL;
1666 #ifdef CONFIG_IP_MROUTE
/* Non-link-local groups on a forwarding interface go to ip_mr_input(). */
1667 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1668 rth->u.dst.input = ip_mr_input;
1670 RT_CACHE_STAT_INC(in_slow_mc);
1673 hash = rt_hash(daddr, saddr, dev->ifindex);
1674 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
/*
 * ip_handle_martian_source - account for and optionally log a packet
 * with a martian source address.
 * @dev:    receiving device
 * @in_dev: its in_device (consulted for the log-martians setting)
 * @skb:    the offending packet
 *
 * Always bumps the in_martian_src statistic.  With
 * CONFIG_IP_ROUTE_VERBOSE, and when logging is enabled and not
 * rate-limited, prints the addresses and, if the device has a link-layer
 * header and skb->mac.raw is set, walks hard_header_len bytes of it.
 *
 * NOTE(review): the remaining parameters (daddr/saddr are used below)
 * and the per-byte printk are missing from this listing.
 */
1686 static void ip_handle_martian_source(struct net_device *dev,
1687 struct in_device *in_dev,
1688 struct sk_buff *skb,
1692 RT_CACHE_STAT_INC(in_martian_src);
1693 #ifdef CONFIG_IP_ROUTE_VERBOSE
1694 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1696 * RFC1812 recommendation, if source is martian,
1697 * the only hint is MAC header.
/* Note the argument order: daddr is printed first, then saddr. */
1699 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1700 "%u.%u.%u.%u, on dev %s\n",
1701 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1702 if (dev->hard_header_len && skb->mac.raw) {
1704 unsigned char *p = skb->mac.raw;
1705 printk(KERN_WARNING "ll header: ");
1706 for (i = 0; i < dev->hard_header_len; i++, p++) {
1708 if (i < (dev->hard_header_len - 1))
/*
 * __mkroute_input - build (but do not cache) a forwarding route entry.
 * @skb:    packet being routed
 * @res:    FIB lookup result
 * @in_dev: in_device of the receiving interface
 * @daddr:  destination address
 * @saddr:  source address
 * @tos:    type of service
 * @result: output parameter for the new entry (the store itself is not
 *          visible in this listing)
 *
 * Takes a temporary reference on the output device's in_device,
 * validates the source address, decides whether a redirect should be
 * flagged, allocates the entry and fills it for forwarding (input hook
 * ip_forward, output hook ip_output), then drops the temporary
 * reference.
 *
 * NOTE(review): error handling and several conditionals are missing
 * from this listing.
 */
1717 static inline int __mkroute_input(struct sk_buff *skb,
1718 struct fib_result* res,
1719 struct in_device *in_dev,
1720 __be32 daddr, __be32 saddr, u32 tos,
1721 struct rtable **result)
1726 struct in_device *out_dev;
1731 /* get a working reference to the output device */
1732 out_dev = in_dev_get(FIB_RES_DEV(*res));
1733 if (out_dev == NULL) {
1734 if (net_ratelimit())
1735 printk(KERN_CRIT "Bug in ip_route_input" \
1736 "_slow(). Please, report\n");
1741 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1742 in_dev->dev, &spec_dst, &itag);
1744 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1752 flags |= RTCF_DIRECTSRC;
/*
 * Packet would leave through the interface it arrived on: flag the
 * route so that a redirect can be generated.
 */
1754 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1755 (IN_DEV_SHARED_MEDIA(out_dev) ||
1756 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1757 flags |= RTCF_DOREDIRECT;
1759 if (skb->protocol != htons(ETH_P_IP)) {
1760 /* Not IP (i.e. ARP). Do not create route, if it is
1761 * invalid for proxy arp. DNAT routes are always valid.
1763 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1770 rth = dst_alloc(&ipv4_dst_ops);
1776 atomic_set(&rth->u.dst.__refcnt, 1);
1777 rth->u.dst.flags= DST_HOST;
1778 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1779 if (res->fi->fib_nhs > 1)
1780 rth->u.dst.flags |= DST_BALANCED;
1782 if (in_dev->cnf.no_policy)
1783 rth->u.dst.flags |= DST_NOPOLICY;
1784 if (out_dev->cnf.no_xfrm)
1785 rth->u.dst.flags |= DST_NOXFRM;
1786 rth->fl.fl4_dst = daddr;
1787 rth->rt_dst = daddr;
1788 rth->fl.fl4_tos = tos;
1789 rth->fl.mark = skb->mark;
1790 rth->fl.fl4_src = saddr;
1791 rth->rt_src = saddr;
1792 rth->rt_gateway = daddr;
1794 rth->fl.iif = in_dev->dev->ifindex;
1795 rth->u.dst.dev = (out_dev)->dev;
1796 dev_hold(rth->u.dst.dev);
1797 rth->idev = in_dev_get(rth->u.dst.dev);
1799 rth->rt_spec_dst= spec_dst;
1801 rth->u.dst.input = ip_forward;
1802 rth->u.dst.output = ip_output;
1804 rt_set_nexthop(rth, res, itag);
1806 rth->rt_flags = flags;
1811 /* release the working reference to the output device */
1812 in_dev_put(out_dev);
/*
 * ip_mkroute_input_def - create one input route and insert it in the
 * cache.
 * @skb:    packet being routed; skb->dst receives the cached entry
 * @res:    FIB lookup result
 * @fl:     flow key used for the lookup (fl->oif and fl->iif are read)
 * @in_dev: in_device of the receiving interface
 * @daddr, @saddr, @tos: addresses and type of service of the packet
 *
 * With CONFIG_IP_ROUTE_MULTIPATH a next hop is selected first when the
 * FIB entry has more than one and no output interface was requested.
 * Returns the result of rt_intern_hash().
 *
 * NOTE(review): the error check after __mkroute_input() is missing from
 * this listing.
 */
1816 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1817 struct fib_result* res,
1818 const struct flowi *fl,
1819 struct in_device *in_dev,
1820 __be32 daddr, __be32 saddr, u32 tos)
1822 struct rtable* rth = NULL;
1826 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1827 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1828 fib_select_multipath(fl, res);
1831 /* create a routing cache entry */
1832 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1836 /* put it into the cache */
1837 hash = rt_hash(daddr, saddr, fl->iif);
1838 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
/*
 * ip_mkroute_input - create input route(s) for a forwarded packet.
 * @skb, @res, @fl, @in_dev, @daddr, @saddr, @tos: as for
 *        ip_mkroute_input_def()
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this is a plain call to
 * ip_mkroute_input_def().  With it, one cache entry is created per next
 * hop of the FIB entry (res->fi->fib_nhs of them), each is inserted
 * with rt_intern_hash() and reported to multipath_set_nhinfo(), and
 * skb->dst is pointed at an inserted entry.
 *
 * NOTE(review): the single-path test, error handling and reference
 * drops inside the loop are missing from this listing.
 */
1841 static inline int ip_mkroute_input(struct sk_buff *skb,
1842 struct fib_result* res,
1843 const struct flowi *fl,
1844 struct in_device *in_dev,
1845 __be32 daddr, __be32 saddr, u32 tos)
1847 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1848 struct rtable* rth = NULL, *rtres;
1849 unsigned char hop, hopcount;
1854 hopcount = res->fi->fib_nhs;
1858 /* distinguish between multipath and singlepath */
1860 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1863 /* add all alternatives to the routing cache */
1864 for (hop = 0; hop < hopcount; hop++) {
1867 /* put reference to previous result */
1871 /* create a routing cache entry */
1872 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1877 /* put it into the cache */
1878 hash = rt_hash(daddr, saddr, fl->iif);
1879 err = rt_intern_hash(hash, rth, &rtres);
1883 /* forward hop information to multipath impl. */
1884 multipath_set_nhinfo(rth,
1885 FIB_RES_NETWORK(*res),
1886 FIB_RES_NETMASK(*res),
1890 skb->dst = &rtres->u.dst;
1892 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1893 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1894 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1899 * NOTE. We drop all packets that have local source
1900 * addresses, because every properly looped-back packet
1901 * must already have the correct destination attached by the output routine.
1903 * This approach solves two big problems:
1904 * 1. Non-simplex devices are handled properly.
1905 * 2. IP spoofing attempts are filtered with a 100% guarantee.
/*
 * ip_route_input_slow - resolve an input route when the cache misses.
 * @skb:   received packet; on success the new entry ends up in skb->dst
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type of service
 * @dev:   receiving device
 *
 * Called by ip_route_input() after an unsuccessful cache lookup.  The
 * visible flow is: reject martian source/destination addresses, run
 * fib_lookup(), then depending on res.type either hand over to
 * ip_mkroute_input() (forwarding) or build a local/broadcast entry bound
 * to the loopback device and insert it with rt_intern_hash().  A failed
 * lookup produces an RTN_UNREACHABLE entry whose input hook is
 * ip_error().  Martian destinations are counted, optionally logged and
 * yield -EHOSTUNREACH; martian sources go through
 * ip_handle_martian_source().
 *
 * NOTE(review): labels, gotos, local declarations and parts of the flowi
 * initializer are missing from this listing, so exact control flow
 * cannot be confirmed from here.
 */
1908 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1909 u8 tos, struct net_device *dev)
1911 struct fib_result res;
1912 struct in_device *in_dev = in_dev_get(dev);
/* Flow key for the FIB lookup: universe scope, input interface = dev. */
1913 struct flowi fl = { .nl_u = { .ip4_u =
1917 .scope = RT_SCOPE_UNIVERSE,
1920 .iif = dev->ifindex };
1923 struct rtable * rth;
1929 /* IP on this device is disabled. */
1934 /* Check for the most weird martians, which can be not detected
1938 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939 goto martian_source;
1941 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1944 /* Accept zero addresses only to limited broadcast;
1945 * I even do not know to fix it or not. Waiting for complains :-)
1948 goto martian_source;
1950 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951 goto martian_destination;
1954 * Now we are ready to route packet.
1956 if ((err = fib_lookup(&fl, &res)) != 0) {
1957 if (!IN_DEV_FORWARD(in_dev))
1963 RT_CACHE_STAT_INC(in_slow_tot);
1965 if (res.type == RTN_BROADCAST)
1968 if (res.type == RTN_LOCAL) {
1970 result = fib_validate_source(saddr, daddr, tos,
1971 loopback_dev.ifindex,
1972 dev, &spec_dst, &itag);
1974 goto martian_source;
1976 flags |= RTCF_DIRECTSRC;
1981 if (!IN_DEV_FORWARD(in_dev))
1983 if (res.type != RTN_UNICAST)
1984 goto martian_destination;
1986 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987 if (err == -ENOBUFS)
1999 if (skb->protocol != htons(ETH_P_IP))
2003 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2005 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2008 goto martian_source;
2010 flags |= RTCF_DIRECTSRC;
2012 flags |= RTCF_BROADCAST;
2013 res.type = RTN_BROADCAST;
2014 RT_CACHE_STAT_INC(in_brd);
2017 rth = dst_alloc(&ipv4_dst_ops);
2021 rth->u.dst.output= ip_rt_bug;
2023 atomic_set(&rth->u.dst.__refcnt, 1);
2024 rth->u.dst.flags= DST_HOST;
2025 if (in_dev->cnf.no_policy)
2026 rth->u.dst.flags |= DST_NOPOLICY;
2027 rth->fl.fl4_dst = daddr;
2028 rth->rt_dst = daddr;
2029 rth->fl.fl4_tos = tos;
2030 rth->fl.mark = skb->mark;
2031 rth->fl.fl4_src = saddr;
2032 rth->rt_src = saddr;
2033 #ifdef CONFIG_NET_CLS_ROUTE
2034 rth->u.dst.tclassid = itag;
2037 rth->fl.iif = dev->ifindex;
2038 rth->u.dst.dev = &loopback_dev;
2039 dev_hold(rth->u.dst.dev);
2040 rth->idev = in_dev_get(rth->u.dst.dev);
2041 rth->rt_gateway = daddr;
2042 rth->rt_spec_dst= spec_dst;
2043 rth->u.dst.input= ip_local_deliver;
2044 rth->rt_flags = flags|RTCF_LOCAL;
2045 if (res.type == RTN_UNREACHABLE) {
2046 rth->u.dst.input= ip_error;
2047 rth->u.dst.error= -err;
2048 rth->rt_flags &= ~RTCF_LOCAL;
2050 rth->rt_type = res.type;
2051 hash = rt_hash(daddr, saddr, fl.iif);
2052 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2056 RT_CACHE_STAT_INC(in_no_route);
2057 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058 res.type = RTN_UNREACHABLE;
2062 * Do not cache martian addresses: they should be logged (RFC1812)
2064 martian_destination:
2065 RT_CACHE_STAT_INC(in_martian_dst);
2066 #ifdef CONFIG_IP_ROUTE_VERBOSE
2067 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069 "%u.%u.%u.%u, dev %s\n",
2070 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2074 err = -EHOSTUNREACH;
2086 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
/*
 * ip_route_input - find or create the input route for a received packet.
 * @skb:   received packet; on a cache hit skb->dst is set to the entry
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type of service (masked with IPTOS_RT_MASK before use)
 * @dev:   receiving device
 *
 * Looks up the route cache chain selected by rt_hash(daddr, saddr, iif).
 * An entry matches when destination, source, input interface, mark and
 * tos all agree (one further condition is missing from this listing); a
 * hit takes a reference and updates lastuse.  On a miss, multicast
 * destinations are checked with ip_check_mc() and may be passed to
 * ip_route_input_mc(); everything else goes to ip_route_input_slow().
 *
 * NOTE(review): RCU lock/unlock calls and the return statements of the
 * hit and multicast-reject paths are missing from this listing.
 */
2090 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 u8 tos, struct net_device *dev)
2093 struct rtable * rth;
2095 int iif = dev->ifindex;
2097 tos &= IPTOS_RT_MASK;
2098 hash = rt_hash(daddr, saddr, iif);
/* Fast path: walk the hash chain looking for an exact key match. */
2101 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102 rth = rcu_dereference(rth->u.dst.rt_next)) {
2103 if (rth->fl.fl4_dst == daddr &&
2104 rth->fl.fl4_src == saddr &&
2105 rth->fl.iif == iif &&
2107 rth->fl.mark == skb->mark &&
2108 rth->fl.fl4_tos == tos) {
2109 rth->u.dst.lastuse = jiffies;
2110 dst_hold(&rth->u.dst);
2112 RT_CACHE_STAT_INC(in_hit);
2114 skb->dst = (struct dst_entry*)rth;
/* Counted once for every chain entry that did not match. */
2117 RT_CACHE_STAT_INC(in_hlist_search);
2121 /* Multicast recognition logic is moved from route cache to here.
2122 The problem was that too many Ethernet cards have broken/missing
2123 hardware multicast filters :-( As result the host on multicasting
2124 network acquires a lot of useless route cache entries, sort of
2125 SDR messages from all the world. Now we try to get rid of them.
2126 Really, provided software IP multicast filter is organized
2127 reasonably (at least, hashed), it does not result in a slowdown
2128 comparing with route cache reject entries.
2129 Note, that multicast routers are not affected, because
2130 route cache entry is created eventually.
2132 if (MULTICAST(daddr)) {
2133 struct in_device *in_dev;
2136 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2137 int our = ip_check_mc(in_dev, daddr, saddr,
2138 skb->nh.iph->protocol);
2140 #ifdef CONFIG_IP_MROUTE
2141 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2145 return ip_route_input_mc(skb, daddr, saddr,
2152 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
/*
 * __mkroute_output - build (but do not cache) an output route entry.
 * @result:  output parameter for the new entry (the store itself is not
 *           visible in this listing)
 * @res:     FIB lookup result; res->type may be overridden here
 * @fl:      resolved flow (addresses after source/destination selection)
 * @oldflp:  flow key as requested by the caller; it becomes the cache key
 * @dev_out: output device
 *
 * Classifies the destination as broadcast or multicast where
 * applicable, allocates the entry, stores @oldflp's fields as the lookup
 * key and @fl's addresses as the resolved route, selects the input and
 * output hooks according to the flags, and finishes with
 * rt_set_nexthop().
 *
 * NOTE(review): the last parameter (flags is used below), error returns
 * and several conditionals are missing from this listing.
 */
2155 static inline int __mkroute_output(struct rtable **result,
2156 struct fib_result* res,
2157 const struct flowi *fl,
2158 const struct flowi *oldflp,
2159 struct net_device *dev_out,
2163 struct in_device *in_dev;
2164 u32 tos = RT_FL_TOS(oldflp);
/* A loopback source address is tested against non-loopback devices. */
2167 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
/* Destination address class overrides the FIB result type. */
2170 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2171 res->type = RTN_BROADCAST;
2172 else if (MULTICAST(fl->fl4_dst))
2173 res->type = RTN_MULTICAST;
2174 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2177 if (dev_out->flags & IFF_LOOPBACK)
2178 flags |= RTCF_LOCAL;
2180 /* get work reference to inet device */
2181 in_dev = in_dev_get(dev_out);
2185 if (res->type == RTN_BROADCAST) {
2186 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2188 fib_info_put(res->fi);
2191 } else if (res->type == RTN_MULTICAST) {
2192 flags |= RTCF_MULTICAST|RTCF_LOCAL;
/* Not a member of the group on this device: not a local route. */
2193 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2195 flags &= ~RTCF_LOCAL;
2196 /* If multicast route do not exist use
2197 default one, but do not gateway in this case.
2200 if (res->fi && res->prefixlen < 4) {
2201 fib_info_put(res->fi);
2207 rth = dst_alloc(&ipv4_dst_ops);
2213 atomic_set(&rth->u.dst.__refcnt, 1);
2214 rth->u.dst.flags= DST_HOST;
2215 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2217 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2218 if (res->fi->fib_nhs > 1)
2219 rth->u.dst.flags |= DST_BALANCED;
2222 if (in_dev->cnf.no_xfrm)
2223 rth->u.dst.flags |= DST_NOXFRM;
2224 if (in_dev->cnf.no_policy)
2225 rth->u.dst.flags |= DST_NOPOLICY;
2227 rth->fl.fl4_dst = oldflp->fl4_dst;
2228 rth->fl.fl4_tos = tos;
2229 rth->fl.fl4_src = oldflp->fl4_src;
2230 rth->fl.oif = oldflp->oif;
2231 rth->fl.mark = oldflp->mark;
2232 rth->rt_dst = fl->fl4_dst;
2233 rth->rt_src = fl->fl4_src;
2234 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2235 /* get references to the devices that are to be hold by the routing
2237 rth->u.dst.dev = dev_out;
2239 rth->idev = in_dev_get(dev_out);
2240 rth->rt_gateway = fl->fl4_dst;
2241 rth->rt_spec_dst= fl->fl4_src;
2243 rth->u.dst.output=ip_output;
2245 RT_CACHE_STAT_INC(out_slow_tot);
2247 if (flags & RTCF_LOCAL) {
2248 rth->u.dst.input = ip_local_deliver;
2249 rth->rt_spec_dst = fl->fl4_dst;
2251 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252 rth->rt_spec_dst = fl->fl4_src;
2253 if (flags & RTCF_LOCAL &&
2254 !(dev_out->flags & IFF_LOOPBACK)) {
2255 rth->u.dst.output = ip_mc_output;
2256 RT_CACHE_STAT_INC(out_slow_mc);
2258 #ifdef CONFIG_IP_MROUTE
2259 if (res->type == RTN_MULTICAST) {
2260 if (IN_DEV_MFORWARD(in_dev) &&
2261 !LOCAL_MCAST(oldflp->fl4_dst)) {
2262 rth->u.dst.input = ip_mr_input;
2263 rth->u.dst.output = ip_mc_output;
2269 rt_set_nexthop(rth, res, 0);
2271 rth->rt_flags = flags;
2275 /* release work reference to inet device */
/*
 * ip_mkroute_output_def - create one output route and insert it in the
 * cache.
 * @rp:      receives the cached entry from rt_intern_hash()
 * @res:     FIB lookup result
 * @fl:      resolved flow
 * @oldflp:  flow key as requested by the caller; it also provides the
 *           hash key (fl4_dst, fl4_src, oif)
 * @dev_out: output device
 *
 * NOTE(review): the last parameter (flags is passed on below), the test
 * of err between the two calls and the return statement are missing
 * from this listing.
 */
2281 static inline int ip_mkroute_output_def(struct rtable **rp,
2282 struct fib_result* res,
2283 const struct flowi *fl,
2284 const struct flowi *oldflp,
2285 struct net_device *dev_out,
2288 struct rtable *rth = NULL;
2289 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
/* The cache is keyed on the caller's original flow, not the resolved one. */
2292 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2293 err = rt_intern_hash(hash, rth, rp);
/*
 * ip_mkroute_output - create output route(s) for a resolved flow.
 * @rp, @res, @fl, @oldflp, @dev_out: as for ip_mkroute_output_def()
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this is a plain call to
 * ip_mkroute_output_def().  With it, a FIB entry that has more than one
 * next hop gets one cache entry per hop: for each hop the hop's device
 * is held while the entry is built with __mkroute_output(), inserted
 * with rt_intern_hash() and reported to multipath_set_nhinfo().  Other
 * results use ip_mkroute_output_def().
 *
 * NOTE(review): the last parameter (flags is used below), per-hop
 * selection, error handling and reference drops are missing from this
 * listing.
 */
2299 static inline int ip_mkroute_output(struct rtable** rp,
2300 struct fib_result* res,
2301 const struct flowi *fl,
2302 const struct flowi *oldflp,
2303 struct net_device *dev_out,
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2310 struct rtable *rth = NULL;
2312 if (res->fi && res->fi->fib_nhs > 1) {
2313 unsigned char hopcount = res->fi->fib_nhs;
2315 for (hop = 0; hop < hopcount; hop++) {
2316 struct net_device *dev2nexthop;
2320 /* hold a work reference to the output device */
2321 dev2nexthop = FIB_RES_DEV(*res);
2322 dev_hold(dev2nexthop);
2324 /* put reference to previous result */
2328 err = __mkroute_output(&rth, res, fl, oldflp,
2329 dev2nexthop, flags);
2334 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2336 err = rt_intern_hash(hash, rth, rp);
2338 /* forward hop information to multipath impl. */
2339 multipath_set_nhinfo(rth,
2340 FIB_RES_NETWORK(*res),
2341 FIB_RES_NETMASK(*res),
2345 /* release work reference to output device */
2346 dev_put(dev2nexthop);
2353 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2356 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2357 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2362 * Major route resolver routine.
/*
 * ip_route_output_slow - resolve an output route when the cache misses.
 * @rp:     receives the resulting route
 * @oldflp: flow key requested by the caller (left unmodified; a local
 *          copy "fl" is refined during resolution)
 *
 * Called by __ip_route_output_key() after an unsuccessful cache lookup.
 * The visible steps are: validate a caller-supplied source address and
 * find its device with ip_dev_find(); honour a caller-supplied output
 * interface via dev_get_by_index(); fall back to the loopback device
 * with an RTN_LOCAL result in one branch; otherwise run fib_lookup(),
 * treat RTN_LOCAL results as loopback, optionally select a multipath
 * next hop or default route, and finally build and cache the entry with
 * ip_mkroute_output().
 *
 * NOTE(review): labels, gotos, error codes and many conditionals are
 * missing from this listing, so the exact control flow cannot be
 * confirmed from here.
 */
2365 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2367 u32 tos = RT_FL_TOS(oldflp);
/* Working copy of the flow; the loopback ifindex is used as the iif. */
2368 struct flowi fl = { .nl_u = { .ip4_u =
2369 { .daddr = oldflp->fl4_dst,
2370 .saddr = oldflp->fl4_src,
2371 .tos = tos & IPTOS_RT_MASK,
2372 .scope = ((tos & RTO_ONLINK) ?
2376 .mark = oldflp->mark,
2377 .iif = loopback_dev.ifindex,
2378 .oif = oldflp->oif };
2379 struct fib_result res;
2381 struct net_device *dev_out = NULL;
2387 #ifdef CONFIG_IP_MULTIPLE_TABLES
/* Caller specified a source address: it is checked before use. */
2391 if (oldflp->fl4_src) {
2393 if (MULTICAST(oldflp->fl4_src) ||
2394 BADCLASS(oldflp->fl4_src) ||
2395 ZERONET(oldflp->fl4_src))
2398 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 dev_out = ip_dev_find(oldflp->fl4_src);
2400 if (dev_out == NULL)
2403 /* I removed check for oif == dev_out->oif here.
2404 It was wrong for two reasons:
2405 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2406 assigned to multiple interfaces.
2407 2. Moreover, we are allowed to send packets with saddr
2408 of another iface. --ANK
2411 if (oldflp->oif == 0
2412 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2413 /* Special hack: user can direct multicasts
2414 and limited broadcast via necessary interface
2415 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 This hack is not just for fun, it allows
2417 vic,vat and friends to work.
2418 They bind socket to loopback, set ttl to zero
2419 and expect that it will work.
2420 From the viewpoint of routing cache they are broken,
2421 because we are not allowed to build multicast path
2422 with loopback source addr (look, routing cache
2423 cannot know, that ttl is zero, so that packet
2424 will not leave this host and route is valid).
2425 Luckily, this hack is good workaround.
2428 fl.oif = dev_out->ifindex;
2438 dev_out = dev_get_by_index(oldflp->oif);
2440 if (dev_out == NULL)
2443 /* RACE: Check return value of inet_select_addr instead. */
2444 if (__in_dev_get_rtnl(dev_out) == NULL) {
2446 goto out; /* Wrong error code */
2449 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2456 if (MULTICAST(oldflp->fl4_dst))
2457 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 else if (!oldflp->fl4_dst)
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2466 fl.fl4_dst = fl.fl4_src;
2468 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2471 dev_out = &loopback_dev;
2473 fl.oif = loopback_dev.ifindex;
2474 res.type = RTN_LOCAL;
2475 flags |= RTCF_LOCAL;
2479 if (fib_lookup(&fl, &res)) {
2482 /* Apparently, routing tables are wrong. Assume,
2483 that the destination is on link.
2486 Because we are allowed to send to iface
2487 even if it has NO routes and NO assigned
2488 addresses. When oif is specified, routing
2489 tables are looked up with only one purpose:
2490 to catch if destination is gatewayed, rather than
2491 direct. Moreover, if MSG_DONTROUTE is set,
2492 we send packet, ignoring both routing tables
2493 and ifaddr state. --ANK
2496 We could make it even if oif is unknown,
2497 likely IPv6, but we do not.
2500 if (fl.fl4_src == 0)
2501 fl.fl4_src = inet_select_addr(dev_out, 0,
2503 res.type = RTN_UNICAST;
2513 if (res.type == RTN_LOCAL) {
2515 fl.fl4_src = fl.fl4_dst;
2518 dev_out = &loopback_dev;
2520 fl.oif = dev_out->ifindex;
2522 fib_info_put(res.fi);
2524 flags |= RTCF_LOCAL;
2528 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 fib_select_multipath(&fl, &res);
2533 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 fib_select_default(&fl, &res);
2537 fl.fl4_src = FIB_RES_PREFSRC(res);
2541 dev_out = FIB_RES_DEV(res);
2543 fl.oif = dev_out->ifindex;
2547 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
/*
 * __ip_route_output_key - find or create the output route for a flow.
 * @rp:  receives the route
 * @flp: flow key (destination, source, oif, mark, tos)
 *
 * Looks up the cache chain selected by rt_hash(fl4_dst, fl4_src, oif).
 * An entry matches when destination, source, oif and mark agree and the
 * tos values do not differ in the IPTOS_RT_MASK | RTO_ONLINK bits (one
 * further condition is missing from this listing).  A hit takes a
 * reference with dst_hold() and counts out_hit; a miss falls through to
 * ip_route_output_slow().  Exported with EXPORT_SYMBOL_GPL.
 *
 * NOTE(review): the lock acquisition, the stores to *rp and the return
 * statements of the hit paths are missing from this listing.
 */
2557 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2562 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
/* Fast path: walk the hash chain looking for a matching key. */
2565 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 rth = rcu_dereference(rth->u.dst.rt_next)) {
2567 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 rth->fl.fl4_src == flp->fl4_src &&
2570 rth->fl.oif == flp->oif &&
2571 rth->fl.mark == flp->mark &&
2572 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2573 (IPTOS_RT_MASK | RTO_ONLINK))) {
2575 /* check for multipath routes and choose one if
2578 if (multipath_select_route(flp, rth, rp)) {
2579 dst_hold(&(*rp)->u.dst);
2580 RT_CACHE_STAT_INC(out_hit);
2581 rcu_read_unlock_bh();
2585 rth->u.dst.lastuse = jiffies;
2586 dst_hold(&rth->u.dst);
2588 RT_CACHE_STAT_INC(out_hit);
2589 rcu_read_unlock_bh();
2593 RT_CACHE_STAT_INC(out_hlist_search);
2595 rcu_read_unlock_bh();
2597 return ip_route_output_slow(rp, flp);
2600 EXPORT_SYMBOL_GPL(__ip_route_output_key);
/*
 * ip_route_output_flow - resolve an output route and apply xfrm policy.
 * @rp:    receives the route
 * @flp:   flow key; its source and destination may be overwritten with
 *         the addresses of the route that was found
 * @sk:    socket passed through to xfrm_lookup()
 * @flags: flags passed through to xfrm_lookup()
 *
 * Resolves the route with __ip_route_output_key(); a non-zero result is
 * an error.  In the visible path the flow's addresses are filled in from
 * the route and the result of xfrm_lookup() is returned.  Exported with
 * EXPORT_SYMBOL_GPL.
 *
 * NOTE(review): the conditions guarding the two assignments and the
 * xfrm_lookup() call, and the other return statements, are missing from
 * this listing.
 */
2602 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2606 if ((err = __ip_route_output_key(rp, flp)) != 0)
2611 flp->fl4_src = (*rp)->rt_src;
2613 flp->fl4_dst = (*rp)->rt_dst;
2614 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2620 EXPORT_SYMBOL_GPL(ip_route_output_flow);
/*
 * ip_route_output_key - resolve an output route for a flow.
 * @rp:  receives the route
 * @flp: flow key
 *
 * Convenience wrapper: calls ip_route_output_flow() with a NULL socket
 * and flags of 0 and returns its result.
 */
2622 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2624 return ip_route_output_flow(rp, flp, NULL, 0);
/*
 * rt_fill_info - encode the route attached to @skb as a netlink message.
 * @skb:    message buffer; the route described is the one in skb->dst
 * @pid:    netlink pid put into the message header
 * @seq:    netlink sequence number
 * @event:  message type
 * @nowait: passed on to ipmr_get_route()
 * @flags:  netlink message flags
 *
 * Writes a struct rtmsg header followed by attributes for table,
 * destination, source (when the flow has one), output interface,
 * preferred source, gateway (when it differs from the destination),
 * metrics, input interface and cache info.  Returns the result of
 * nlmsg_end() on success; the failure path cancels the partially built
 * message with nlmsg_cancel().
 *
 * NOTE(review): several declarations, conditions (e.g. the rt->peer and
 * input-interface tests), labels and return statements are missing from
 * this listing.
 */
2627 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2628 int nowait, unsigned int flags)
2630 struct rtable *rt = (struct rtable*)skb->dst;
2632 struct nlmsghdr *nlh;
2634 u32 id = 0, ts = 0, tsage = 0, error;
2636 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
/* Fixed header: cache entries are reported as /32 routes in the main table. */
2640 r = nlmsg_data(nlh);
2641 r->rtm_family = AF_INET;
2642 r->rtm_dst_len = 32;
2644 r->rtm_tos = rt->fl.fl4_tos;
2645 r->rtm_table = RT_TABLE_MAIN;
2646 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2647 r->rtm_type = rt->rt_type;
2648 r->rtm_scope = RT_SCOPE_UNIVERSE;
2649 r->rtm_protocol = RTPROT_UNSPEC;
/* Upper 16 bits of rt_flags are exported; RTM_F_CLONED is always set. */
2650 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 if (rt->rt_flags & RTCF_NOTIFY)
2652 r->rtm_flags |= RTM_F_NOTIFY;
2654 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2656 if (rt->fl.fl4_src) {
2657 r->rtm_src_len = 32;
2658 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2661 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2662 #ifdef CONFIG_NET_CLS_ROUTE
2663 if (rt->u.dst.tclassid)
2664 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2666 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2667 if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2668 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2671 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2672 else if (rt->rt_src != rt->fl.fl4_src)
2673 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2675 if (rt->rt_dst != rt->rt_gateway)
2676 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2678 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2679 goto nla_put_failure;
2681 error = rt->u.dst.error;
/* Remaining lifetime relative to jiffies, or 0 when no expiry is set. */
2682 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2684 id = rt->peer->ip_id_count;
2685 if (rt->peer->tcp_ts_stamp) {
2686 ts = rt->peer->tcp_ts;
2687 tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2692 #ifdef CONFIG_IP_MROUTE
2693 __be32 dst = rt->rt_dst;
2695 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2696 ipv4_devconf.mc_forwarding) {
2697 int err = ipmr_get_route(skb, r, nowait);
2702 goto nla_put_failure;
2704 if (err == -EMSGSIZE)
2705 goto nla_put_failure;
2711 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2714 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2715 expires, error) < 0)
2716 goto nla_put_failure;
2718 return nlmsg_end(skb, nlh);
2721 nlmsg_cancel(skb, nlh);
/*
 * inet_rtm_getroute - netlink handler answering a route query.
 * @in_skb: request message buffer (its netlink pid addresses the reply)
 * @nlh:    request header
 * @arg:    not referenced in the visible lines
 *
 * Parses the request attributes, allocates a reply skb, and resolves
 * the route either with ip_route_input() (a path that looks up a device
 * from the RTA_IIF value) or with ip_route_output_key().  The resulting
 * route is attached to the reply skb, encoded with rt_fill_info() as an
 * RTM_NEWROUTE message and sent back with rtnl_unicast().
 *
 * NOTE(review): the branch structure, error handling and cleanup are
 * missing from this listing.
 */
2725 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2728 struct nlattr *tb[RTA_MAX+1];
2729 struct rtable *rt = NULL;
2734 struct sk_buff *skb;
2736 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2740 rtm = nlmsg_data(nlh);
2742 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2748 /* Reserve room for dummy headers, this skb can pass
2749 through good chunk of routing engine.
2751 skb->mac.raw = skb->nh.raw = skb->data;
2753 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2754 skb->nh.iph->protocol = IPPROTO_ICMP;
2755 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
/* Absent attributes default to 0. */
2757 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2758 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2759 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2762 struct net_device *dev;
2764 dev = __dev_get_by_index(iif);
2770 skb->protocol = htons(ETH_P_IP);
2773 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2776 rt = (struct rtable*) skb->dst;
/* A route that carries an error is reported as a failure of the query. */
2777 if (err == 0 && rt->u.dst.error)
2778 err = -rt->u.dst.error;
2785 .tos = rtm->rtm_tos,
2788 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2790 err = ip_route_output_key(&rt, &fl);
2796 skb->dst = &rt->u.dst;
2797 if (rtm->rtm_flags & RTM_F_NOTIFY)
2798 rt->rt_flags |= RTCF_NOTIFY;
2800 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2801 RTM_NEWROUTE, 0, 0);
2805 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
/*
 * ip_rt_dump - netlink dump callback listing the route cache.
 * @skb: message buffer being filled
 * @cb:  dump state; cb->args[1] holds the chain index to resume from
 *
 * Walks the hash buckets 0..rt_hash_mask, skipping those below the
 * resume bucket s_h.  For each chain entry visited, a reference to the
 * route is temporarily attached to skb->dst, the entry is encoded with
 * rt_fill_info() as an NLM_F_MULTI RTM_NEWROUTE message, and the
 * reference is released again.  A result <= 0 from rt_fill_info() ends
 * the walk.
 *
 * NOTE(review): declarations, the per-entry skip test, the lock
 * acquisition, the saving of resume state and the return value are
 * missing from this listing.
 */
2814 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2821 s_idx = idx = cb->args[1];
2822 for (h = 0; h <= rt_hash_mask; h++) {
2823 if (h < s_h) continue;
2827 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2828 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
/* rt_fill_info() reads the route from skb->dst, so attach it for the call. */
2831 skb->dst = dst_clone(&rt->u.dst);
2832 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2833 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2834 1, NLM_F_MULTI) <= 0) {
2835 dst_release(xchg(&skb->dst, NULL));
2836 rcu_read_unlock_bh();
2839 dst_release(xchg(&skb->dst, NULL));
2841 rcu_read_unlock_bh();
/*
 * ip_rt_multicast_event - notification hook for a multicast change on
 * @in_dev.
 *
 * NOTE(review): the body is not present in this listing, so its effect
 * cannot be documented from here.
 */
2850 void ip_rt_multicast_event(struct in_device *in_dev)
2855 #ifdef CONFIG_SYSCTL
/* Backing store for the "flush" sysctl; holds the delay last written. */
2856 static int flush_delay;
/*
 * ipv4_sysctl_rtcache_flush - proc handler for the "flush" sysctl.
 *
 * Parses the written integer into flush_delay with proc_dointvec() and
 * then flushes the route cache with that delay.
 *
 * NOTE(review): the return value of proc_dointvec() is not checked on
 * the visible line, and the write test and return statements are
 * missing from this listing.
 */
2858 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2859 struct file *filp, void __user *buffer,
2860 size_t *lenp, loff_t *ppos)
2863 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2864 rt_cache_flush(flush_delay);
/*
 * ipv4_sysctl_rtcache_flush_strategy - sysctl(2) strategy routine for
 * the "flush" entry.
 *
 * Accepts a new value only when its length is exactly sizeof(int),
 * reads it from user space with get_user() and flushes the route cache
 * with that delay.
 *
 * NOTE(review): part of the parameter list (newlen is used below), the
 * declaration of delay and the return statements are missing from this
 * listing.
 */
2871 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2874 void __user *oldval,
2875 size_t __user *oldlenp,
2876 void __user *newval,
2880 if (newlen != sizeof(int))
2882 if (get_user(delay, (int __user *)newval))
2884 rt_cache_flush(delay);
/*
 * sysctl table for the IPv4 routing tunables.
 *
 * Each entry pairs a NET_IPV4_ROUTE_* id and a procname with the int
 * variable it exposes (.data / .maxlen), a proc_handler and, for some
 * entries, a strategy routine.
 *
 * NOTE(review): the handlers with "jiffies" in their name presumably
 * convert between a user-visible time unit and jiffies - confirm against
 * the sysctl core.
 * NOTE(review): the braces around each entry, any .mode fields and the
 * table terminator are not present in this listing.
 */
2888 ctl_table ipv4_route_table[] = {
/* "flush" uses the dedicated handlers from this file, not the generic int ones. */
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
2895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
/* gc_thresh lives inside ipv4_dst_ops rather than in a standalone variable. */
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2922 .proc_handler = &proc_dointvec,
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2930 .proc_handler = &proc_dointvec,
2933 /* Deprecated. Use gc_min_interval_ms */
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
/* Same variable as gc_min_interval, exposed through the ms_jiffies handlers. */
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
/* The redirect_* and error_* entries below are plain ints handled by proc_dointvec. */
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2976 .proc_handler = &proc_dointvec,
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2984 .proc_handler = &proc_dointvec,
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2992 .proc_handler = &proc_dointvec,
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
3000 .proc_handler = &proc_dointvec,
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3008 .proc_handler = &proc_dointvec,
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3016 .proc_handler = &proc_dointvec,
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3033 .proc_handler = &proc_dointvec,
/* Note the procname spelling "min_adv_mss" versus the variable ip_rt_min_advmss. */
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3041 .proc_handler = &proc_dointvec,
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3056 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting table; allocated and zeroed in ip_rt_init(). */
3057 struct ip_rt_acct *ip_rt_acct;
3059 /* This code sucks. But you should have seen it before! --RR */
3061 /* IP route accounting ptr for this logical cpu number. */
/*
 * Each cpu owns a run of 256 struct ip_rt_acct slots starting at
 * ip_rt_acct + cpu * 256.  The argument is parenthesized so that an
 * expression argument such as IP_RT_ACCT_CPU(a + b) is scaled as a
 * whole instead of only its last operand.
 */
3062 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3064 #ifdef CONFIG_PROC_FS
/*
 * proc read handler for the route accounting table; ip_rt_init() registers
 * it under the name "rt_acct" via create_proc_read_entry().
 *
 * Visible logic: offset and length are checked for 4-byte alignment, the
 * request is clamped to the size of one cpu's table
 * (256 * sizeof(struct ip_rt_acct)), the first cpu's slice is copied into
 * buffer, and a loop over all possible cpus then walks the same slice of
 * each cpu one 32-bit word at a time.
 *
 * NOTE(review): the local declarations, the return paths and the body of
 * the inner loop are not present in this listing; what is returned for a
 * misaligned or out-of-range request cannot be confirmed from here.
 */
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
/* Both offset and length must be multiples of 4 (the counters are read as u32). */
3070 if ((offset & 3) || (length & 3))
/* Request starts at or beyond the end of one cpu's table. */
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
/* Request reaches the end of the table: trim length to what is left. */
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
/* From here on offset counts u32 words, not bytes. */
3083 offset /= sizeof(u32);
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3089 /* Copy first cpu. */
3091 memcpy(dst, src, length);
3093 /* Add the other cpus in, one int at a time */
3094 for_each_possible_cpu(i) {
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
/* length is in bytes, so length/4 is the number of u32 words to visit. */
3099 for (j = 0; j < length/4; j++)
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
/* Value of the "rhash_entries=" boot parameter; init-only data. */
3108 static __initdata unsigned long rhash_entries;
/*
 * Handler for the "rhash_entries=" boot parameter: parses the argument
 * with simple_strtoul() (base argument 0) and stores it in rhash_entries.
 * NOTE(review): the argument check and the return statements of this
 * function are not present in this listing.
 */
3109 static int __init set_rhash_entries(char *str)
3113 rhash_entries = simple_strtoul(str, &str, 0);
3116 __setup("rhash_entries=", set_rhash_entries);
/*
 * Boot-time initialisation of the IPv4 routing cache.
 *
 * Visible steps, in order: seed rt_hash_rnd, allocate and zero the
 * ip_rt_acct table (CONFIG_NET_CLS_ROUTE only), create the "ip_dst_cache"
 * slab cache, allocate and zero the route hash table, derive gc_thresh and
 * ip_rt_max_size from the hash size, set up the flush/periodic/secret
 * timers, and register the proc entries (CONFIG_PROC_FS only).
 *
 * NOTE(review): many lines of this function (local declarations, several
 * call arguments, error paths, braces, the return statement and the
 * terminator of the "Perturb it a bit" comment) are not present in this
 * listing.
 */
3118 int __init ip_rt_init(void)
/* Mix the physical page count with the current jiffies value. */
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3125 #ifdef CONFIG_NET_CLS_ROUTE
/* Raise order until the allocation covers 256 accounting slots for each of NR_CPUS. */
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
/* Slab cache for struct rtable objects; created with SLAB_PANIC. */
3138 ipv4_dst_ops.kmem_cachep =
3139 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3140 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
/* Route cache hash table: allocated here, then zeroed and its locks initialised. */
3142 rt_hash_table = (struct rt_hash_bucket *)
3143 alloc_large_system_hash("IP route cache",
3144 sizeof(struct rt_hash_bucket),
3146 (num_physpages >= 128 * 1024) ?
3152 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3153 rt_hash_lock_init();
/* rt_hash_mask + 1 is the bucket count (see the memset above); max_size is 16 times that. */
3155 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3156 ip_rt_max_size = (rt_hash_mask + 1) * 16;
/* Attach the handler of each of the three route cache timers. */
3161 init_timer(&rt_flush_timer);
3162 rt_flush_timer.function = rt_run_flush;
3163 init_timer(&rt_periodic_timer);
3164 rt_periodic_timer.function = rt_check_expire;
3165 init_timer(&rt_secret_timer);
3166 rt_secret_timer.function = rt_secret_rebuild;
3168 /* All the timers, started at system startup tend
3169 to synchronize. Perturb it a bit.
3171 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3173 add_timer(&rt_periodic_timer);
3175 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3176 ip_rt_secret_interval;
3177 add_timer(&rt_secret_timer);
3179 #ifdef CONFIG_PROC_FS
3181 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3182 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3183 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3187 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3189 #ifdef CONFIG_NET_CLS_ROUTE
/* Expose the accounting counters through ip_rt_acct_read(). */
3190 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3200 EXPORT_SYMBOL(__ip_select_ident);
3201 EXPORT_SYMBOL(ip_route_input);
3202 EXPORT_SYMBOL(ip_route_output_key);