2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
113 #define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116 #define IP_MAX_MTU 0xFFF0
118 #define RT_GC_TIMEOUT (300*HZ)
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly = 9;
125 static int ip_rt_redirect_load __read_mostly = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly = HZ;
128 static int ip_rt_error_burst __read_mostly = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly = 8;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
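/*
 * Note (illustrative, not in the original source): most of the tunables
 * above are normally exposed through sysctl under
 * /proc/sys/net/ipv4/route/ (gc_timeout, gc_interval, min_pmtu,
 * secret_interval, ...), via a ctl_table registered later in this file.
 */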
135 static void rt_worker_func(struct work_struct *work);
136 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
137 static struct timer_list rt_secret_timer;
140 * Interface to generic destination cache.
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
145 static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void ipv4_link_failure(struct sk_buff *skb);
149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
153 static struct dst_ops ipv4_dst_ops = {
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
163 .local_out = ip_local_out,
164 .entry_size = sizeof(struct rtable),
165 .entries = ATOMIC_INIT(0),
168 #define ECN_OR_COST(class) TC_PRIO_##class
170 const __u8 ip_tos2prio[16] = {
174 ECN_OR_COST(BESTEFFORT),
180 ECN_OR_COST(INTERACTIVE),
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
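/*
 * Illustrative note (not in the original source): this table is indexed by
 * the four TOS bits shifted right by one, as done by rt_tos2priority() in
 * <net/route.h>:
 *
 *	prio = rt_tos2priority(iph->tos);
 *
 * e.g. IPTOS_LOWDELAY (0x10) selects slot 8, which is TC_PRIO_INTERACTIVE
 * in the full table.
 */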
194 /* The locking scheme is rather straightforward:
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
204 struct rt_hash_bucket {
205 struct rtable *chain;
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 defined(CONFIG_PROVE_LOCKING)
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211 * The size of this table is a power of two and depends on the number of CPUs.
212 * (with lockdep, spinlock_t is quite big, so keep the size down there)
214 #ifdef CONFIG_LOCKDEP
215 # define RT_HASH_LOCK_SZ 256
218 # define RT_HASH_LOCK_SZ 4096
220 # define RT_HASH_LOCK_SZ 2048
222 # define RT_HASH_LOCK_SZ 1024
224 # define RT_HASH_LOCK_SZ 512
226 # define RT_HASH_LOCK_SZ 256
230 static spinlock_t *rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
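/*
 * Illustrative sketch (not part of the original code) of how the locking
 * scheme above is used: readers walk a chain under rcu_read_lock_bh() with
 * rcu_dereference(), while writers take the hashed bucket lock before
 * unlinking an entry and defer the actual free to an RCU grace period:
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;		unlink under the bucket lock
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *	rt_free(rth);				freed once readers are done
 */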
233 static __init void rt_hash_lock_init(void)
237 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240 panic("IP: failed to allocate rt_hash_locks\n");
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 spin_lock_init(&rt_hash_locks[i]);
246 # define rt_hash_lock_addr(slot) NULL
248 static inline void rt_hash_lock_init(void)
253 static struct rt_hash_bucket *rt_hash_table __read_mostly;
254 static unsigned rt_hash_mask __read_mostly;
255 static unsigned int rt_hash_log __read_mostly;
256 static atomic_t rt_genid __read_mostly;
258 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
259 #define RT_CACHE_STAT_INC(field) \
260 (__raw_get_cpu_var(rt_cache_stat).field++)
262 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
264 return jhash_3words((__force u32)(__be32)(daddr),
265 (__force u32)(__be32)(saddr),
266 idx, atomic_read(&rt_genid))
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272 struct seq_net_private p;
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
279 struct rt_cache_iter_state *st = seq->private;
280 struct rtable *r = NULL;
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284 r = rcu_dereference(rt_hash_table[st->bucket].chain);
286 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
287 r->rt_genid == st->genid)
289 r = rcu_dereference(r->u.dst.rt_next);
291 rcu_read_unlock_bh();
296 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 struct rt_cache_iter_state *st = seq->private;
300 r = r->u.dst.rt_next;
302 rcu_read_unlock_bh();
303 if (--st->bucket < 0)
306 r = rt_hash_table[st->bucket].chain;
308 return rcu_dereference(r);
311 static struct rtable *rt_cache_get_next(struct seq_file *seq,
314 struct rt_cache_iter_state *st = seq->private;
315 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
316 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
318 if (r->rt_genid == st->genid)
324 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
326 struct rtable *r = rt_cache_get_first(seq);
329 while (pos && (r = rt_cache_get_next(seq, r)))
331 return pos ? NULL : r;
334 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
336 struct rt_cache_iter_state *st = seq->private;
338 return rt_cache_get_idx(seq, *pos - 1);
339 st->genid = atomic_read(&rt_genid);
340 return SEQ_START_TOKEN;
343 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
347 if (v == SEQ_START_TOKEN)
348 r = rt_cache_get_first(seq);
350 r = rt_cache_get_next(seq, v);
355 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
357 if (v && v != SEQ_START_TOKEN)
358 rcu_read_unlock_bh();
361 static int rt_cache_seq_show(struct seq_file *seq, void *v)
363 if (v == SEQ_START_TOKEN)
364 seq_printf(seq, "%-127s\n",
365 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
366 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
369 struct rtable *r = v;
372 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
373 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
374 r->u.dst.dev ? r->u.dst.dev->name : "*",
375 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
376 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
377 r->u.dst.__use, 0, (unsigned long)r->rt_src,
378 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
379 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
380 dst_metric(&r->u.dst, RTAX_WINDOW),
381 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
382 dst_metric(&r->u.dst, RTAX_RTTVAR)),
384 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
385 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
388 seq_printf(seq, "%-127s\n", temp);
393 static const struct seq_operations rt_cache_seq_ops = {
394 .start = rt_cache_seq_start,
395 .next = rt_cache_seq_next,
396 .stop = rt_cache_seq_stop,
397 .show = rt_cache_seq_show,
400 static int rt_cache_seq_open(struct inode *inode, struct file *file)
402 return seq_open_net(inode, file, &rt_cache_seq_ops,
403 sizeof(struct rt_cache_iter_state));
406 static const struct file_operations rt_cache_seq_fops = {
407 .owner = THIS_MODULE,
408 .open = rt_cache_seq_open,
411 .release = seq_release_net,
415 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
420 return SEQ_START_TOKEN;
422 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
423 if (!cpu_possible(cpu))
426 return &per_cpu(rt_cache_stat, cpu);
431 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
436 if (!cpu_possible(cpu))
439 return &per_cpu(rt_cache_stat, cpu);
445 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
450 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
452 struct rt_cache_stat *st = v;
454 if (v == SEQ_START_TOKEN) {
455 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
459 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
460 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
461 atomic_read(&ipv4_dst_ops.entries),
484 static const struct seq_operations rt_cpu_seq_ops = {
485 .start = rt_cpu_seq_start,
486 .next = rt_cpu_seq_next,
487 .stop = rt_cpu_seq_stop,
488 .show = rt_cpu_seq_show,
492 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
494 return seq_open(file, &rt_cpu_seq_ops);
497 static const struct file_operations rt_cpu_seq_fops = {
498 .owner = THIS_MODULE,
499 .open = rt_cpu_seq_open,
502 .release = seq_release,
505 #ifdef CONFIG_NET_CLS_ROUTE
506 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
507 int length, int *eof, void *data)
511 if ((offset & 3) || (length & 3))
514 if (offset >= sizeof(struct ip_rt_acct) * 256) {
519 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
520 length = sizeof(struct ip_rt_acct) * 256 - offset;
524 offset /= sizeof(u32);
527 u32 *dst = (u32 *) buffer;
530 memset(dst, 0, length);
532 for_each_possible_cpu(i) {
536 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
537 for (j = 0; j < length/4; j++)
545 static int __net_init ip_rt_do_proc_init(struct net *net)
547 struct proc_dir_entry *pde;
549 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
554 pde = proc_create("rt_cache", S_IRUGO,
555 net->proc_net_stat, &rt_cpu_seq_fops);
559 #ifdef CONFIG_NET_CLS_ROUTE
560 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
561 ip_rt_acct_read, NULL);
567 #ifdef CONFIG_NET_CLS_ROUTE
569 remove_proc_entry("rt_cache", net->proc_net_stat);
572 remove_proc_entry("rt_cache", net->proc_net);
577 static void __net_exit ip_rt_do_proc_exit(struct net *net)
579 remove_proc_entry("rt_cache", net->proc_net_stat);
580 remove_proc_entry("rt_cache", net->proc_net);
581 remove_proc_entry("rt_acct", net->proc_net);
584 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
585 .init = ip_rt_do_proc_init,
586 .exit = ip_rt_do_proc_exit,
589 static int __init ip_rt_proc_init(void)
591 return register_pernet_subsys(&ip_rt_proc_ops);
595 static inline int ip_rt_proc_init(void)
599 #endif /* CONFIG_PROC_FS */
601 static inline void rt_free(struct rtable *rt)
603 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
606 static inline void rt_drop(struct rtable *rt)
609 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612 static inline int rt_fast_clean(struct rtable *rth)
614 /* Kill broadcast/multicast entries very aggressively, if they
615 collide in the hash table with more useful entries */
616 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
617 rth->fl.iif && rth->u.dst.rt_next;
620 static inline int rt_valuable(struct rtable *rth)
622 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
626 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
631 if (atomic_read(&rth->u.dst.__refcnt))
635 if (rth->u.dst.expires &&
636 time_after_eq(jiffies, rth->u.dst.expires))
639 age = jiffies - rth->u.dst.lastuse;
641 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
642 (age <= tmo2 && rt_valuable(rth)))
648 /* Bits of score are:
650 * 30: not quite useless
651 * 29..0: usage counter
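 *
 * Illustrative example (not in the original comment): in rt_intern_hash()
 * the unreferenced entry with the lowest score in a chain becomes the
 * eviction candidate "cand", so the least recently used, least valuable
 * entry is dropped first when the chain grows too long.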
653 static inline u32 rt_score(struct rtable *rt)
655 u32 score = jiffies - rt->u.dst.lastuse;
657 score = ~score & ~(3<<30);
663 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
669 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
671 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
672 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
673 (fl1->mark ^ fl2->mark) |
674 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
675 *(u16 *)&fl2->nl_u.ip4_u.tos) |
676 (fl1->oif ^ fl2->oif) |
677 (fl1->iif ^ fl2->iif)) == 0;
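/*
 * Worked note (illustrative): OR-ing the XORs of all key fields and then
 * comparing with zero is equivalent to checking each field for equality,
 * but compiles to straight-line code with a single branch:
 *
 *	((a ^ b) | (c ^ d)) == 0   <=>   (a == b) && (c == d)
 */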
680 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
682 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
686 * Perform a full scan of the hash table and free all entries.
687 * Can be called by a softirq or a process.
688 * In the latter case, we want to be rescheduled if necessary
690 static void rt_do_flush(int process_context)
693 struct rtable *rth, *next;
695 for (i = 0; i <= rt_hash_mask; i++) {
696 if (process_context && need_resched())
698 rth = rt_hash_table[i].chain;
702 spin_lock_bh(rt_hash_lock_addr(i));
703 rth = rt_hash_table[i].chain;
704 rt_hash_table[i].chain = NULL;
705 spin_unlock_bh(rt_hash_lock_addr(i));
707 for (; rth; rth = next) {
708 next = rth->u.dst.rt_next;
714 static void rt_check_expire(void)
716 static unsigned int rover;
717 unsigned int i = rover, goal;
718 struct rtable *rth, **rthp;
721 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
722 if (ip_rt_gc_timeout > 1)
723 do_div(mult, ip_rt_gc_timeout);
724 goal = (unsigned int)mult;
725 if (goal > rt_hash_mask)
726 goal = rt_hash_mask + 1;
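/*
 * Worked example (illustrative): with the default tunables above,
 * goal = (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout
 *      = (60*HZ * hash_size) / (300*HZ) = hash_size / 5,
 * i.e. each run scans about a fifth of the table, so the whole table is
 * covered roughly once per ip_rt_gc_timeout.
 */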
727 for (; goal > 0; goal--) {
728 unsigned long tmo = ip_rt_gc_timeout;
730 i = (i + 1) & rt_hash_mask;
731 rthp = &rt_hash_table[i].chain;
738 spin_lock_bh(rt_hash_lock_addr(i));
739 while ((rth = *rthp) != NULL) {
740 if (rth->rt_genid != atomic_read(&rt_genid)) {
741 *rthp = rth->u.dst.rt_next;
745 if (rth->u.dst.expires) {
746 /* Entry is expired even if it is in use */
747 if (time_before_eq(jiffies, rth->u.dst.expires)) {
749 rthp = &rth->u.dst.rt_next;
752 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
754 rthp = &rth->u.dst.rt_next;
758 /* Cleanup aged off entries. */
759 *rthp = rth->u.dst.rt_next;
762 spin_unlock_bh(rt_hash_lock_addr(i));
768 * rt_worker_func() is run in process context.
769 * We call rt_check_expire() to scan part of the hash table
771 static void rt_worker_func(struct work_struct *work)
774 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
778 * Perturbation of rt_genid by a small quantity [1..256].
779 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
780 * many times (2^24) without repeating a recent rt_genid.
781 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
783 static void rt_cache_invalidate(void)
785 unsigned char shuffle;
787 get_random_bytes(&shuffle, sizeof(shuffle));
788 atomic_add(shuffle + 1U, &rt_genid);
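/*
 * Illustrative arithmetic: each invalidation advances rt_genid by 1..256,
 * so a 32-bit genid can absorb roughly 2^32 / 2^8 = 2^24 invalidations
 * before it wraps back near a recently used value.
 */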
792 * delay < 0 : invalidate cache (fast : entries will be deleted later)
793 * delay >= 0 : invalidate & flush cache (can be long)
795 void rt_cache_flush(int delay)
797 rt_cache_invalidate();
799 rt_do_flush(!in_softirq());
803 * We change rt_genid and let gc do the cleanup
805 static void rt_secret_rebuild(unsigned long dummy)
807 rt_cache_invalidate();
808 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
812 Short description of GC goals.
814 We want to build an algorithm which keeps the routing cache
815 at some equilibrium point, where the number of aged-off entries
816 is kept approximately equal to the number of newly generated ones.
818 The current expiration strength is the variable "expire".
819 We try to adjust it dynamically, so that when the network
820 is idle, expire is large enough to keep enough warm entries,
821 and when load increases it shrinks to limit the cache size.
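
 Worked example (illustrative, with the default tunables): in
 rt_garbage_collect() below, goal = entries - (ip_rt_gc_elasticity <<
 rt_hash_log); with ip_rt_gc_elasticity = 8 and a 65536-bucket table the
 cache has to exceed 8 * 65536 = 524288 entries before the normal goal
 becomes positive and expiration starts working in earnest.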
824 static int rt_garbage_collect(struct dst_ops *ops)
826 static unsigned long expire = RT_GC_TIMEOUT;
827 static unsigned long last_gc;
829 static int equilibrium;
830 struct rtable *rth, **rthp;
831 unsigned long now = jiffies;
835 * Garbage collection is pretty expensive,
836 * so do not run it too frequently.
839 RT_CACHE_STAT_INC(gc_total);
841 if (now - last_gc < ip_rt_gc_min_interval &&
842 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
843 RT_CACHE_STAT_INC(gc_ignored);
847 /* Calculate the number of entries we want to expire now. */
848 goal = atomic_read(&ipv4_dst_ops.entries) -
849 (ip_rt_gc_elasticity << rt_hash_log);
851 if (equilibrium < ipv4_dst_ops.gc_thresh)
852 equilibrium = ipv4_dst_ops.gc_thresh;
853 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
855 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
856 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
859 /* We are in a dangerous area. Try to reduce the cache really
862 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
863 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
866 if (now - last_gc >= ip_rt_gc_min_interval)
877 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
878 unsigned long tmo = expire;
880 k = (k + 1) & rt_hash_mask;
881 rthp = &rt_hash_table[k].chain;
882 spin_lock_bh(rt_hash_lock_addr(k));
883 while ((rth = *rthp) != NULL) {
884 if (rth->rt_genid == atomic_read(&rt_genid) &&
885 !rt_may_expire(rth, tmo, expire)) {
887 rthp = &rth->u.dst.rt_next;
890 *rthp = rth->u.dst.rt_next;
894 spin_unlock_bh(rt_hash_lock_addr(k));
903 /* The goal is not achieved. We stop the process if:
905 - expire has been reduced to zero (otherwise, expire is halved),
906 - the table is not full,
907 - we are called from interrupt context,
908 - the jiffies check is just a fallback/debug loop breaker.
909 We will not spin here for a long time in any case.
912 RT_CACHE_STAT_INC(gc_goal_miss);
918 #if RT_CACHE_DEBUG >= 2
919 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
920 atomic_read(&ipv4_dst_ops.entries), goal, i);
923 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
925 } while (!in_softirq() && time_before_eq(jiffies, now));
927 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
930 printk(KERN_WARNING "dst cache overflow\n");
931 RT_CACHE_STAT_INC(gc_dst_overflow);
935 expire += ip_rt_gc_min_interval;
936 if (expire > ip_rt_gc_timeout ||
937 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
938 expire = ip_rt_gc_timeout;
939 #if RT_CACHE_DEBUG >= 2
940 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
941 atomic_read(&ipv4_dst_ops.entries), goal, rover);
946 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
948 struct rtable *rth, **rthp;
950 struct rtable *cand, **candp;
953 int attempts = !in_softirq();
962 rthp = &rt_hash_table[hash].chain;
964 spin_lock_bh(rt_hash_lock_addr(hash));
965 while ((rth = *rthp) != NULL) {
966 if (rth->rt_genid != atomic_read(&rt_genid)) {
967 *rthp = rth->u.dst.rt_next;
971 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
973 *rthp = rth->u.dst.rt_next;
975 * Since lookup is lock-free, the deletion
976 * must be visible to another weakly ordered CPU before
977 * the insertion at the start of the hash chain.
979 rcu_assign_pointer(rth->u.dst.rt_next,
980 rt_hash_table[hash].chain);
982 * Since lookup is lock-free, the update writes
983 * must be ordered for consistency on SMP.
985 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
987 dst_use(&rth->u.dst, now);
988 spin_unlock_bh(rt_hash_lock_addr(hash));
995 if (!atomic_read(&rth->u.dst.__refcnt)) {
996 u32 score = rt_score(rth);
998 if (score <= min_score) {
1007 rthp = &rth->u.dst.rt_next;
1011 /* ip_rt_gc_elasticity used to be the average chain
1012 * length; when it is exceeded, gc becomes really aggressive.
1014 * The second limit is less certain. At the moment it allows
1015 * only 2 entries per bucket. We will see.
1017 if (chain_length > ip_rt_gc_elasticity) {
1018 *candp = cand->u.dst.rt_next;
1023 /* Try to bind the route to an ARP neighbour only if it is an output
1024 route or on the unicast forwarding path.
1026 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1027 int err = arp_bind_neighbour(&rt->u.dst);
1029 spin_unlock_bh(rt_hash_lock_addr(hash));
1031 if (err != -ENOBUFS) {
1036 /* Neighbour tables are full and nothing
1037 can be released. Try to shrink the route cache;
1038 it most likely holds some neighbour records.
1040 if (attempts-- > 0) {
1041 int saved_elasticity = ip_rt_gc_elasticity;
1042 int saved_int = ip_rt_gc_min_interval;
1043 ip_rt_gc_elasticity = 1;
1044 ip_rt_gc_min_interval = 0;
1045 rt_garbage_collect(&ipv4_dst_ops);
1046 ip_rt_gc_min_interval = saved_int;
1047 ip_rt_gc_elasticity = saved_elasticity;
1051 if (net_ratelimit())
1052 printk(KERN_WARNING "Neighbour table overflow.\n");
1058 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1059 #if RT_CACHE_DEBUG >= 2
1060 if (rt->u.dst.rt_next) {
1062 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1063 NIPQUAD(rt->rt_dst));
1064 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1065 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1069 rt_hash_table[hash].chain = rt;
1070 spin_unlock_bh(rt_hash_lock_addr(hash));
1075 void rt_bind_peer(struct rtable *rt, int create)
1077 static DEFINE_SPINLOCK(rt_peer_lock);
1078 struct inet_peer *peer;
1080 peer = inet_getpeer(rt->rt_dst, create);
1082 spin_lock_bh(&rt_peer_lock);
1083 if (rt->peer == NULL) {
1087 spin_unlock_bh(&rt_peer_lock);
1093 * Peer allocation may fail only in serious out-of-memory conditions. However,
1094 * we can still generate some output.
1095 * Random ID selection looks a bit dangerous because we have no chance of
1096 * selecting an ID that is unique within a reasonable period of time.
1097 * But a broken packet identifier may be better than no packet at all.
1099 static void ip_select_fb_ident(struct iphdr *iph)
1101 static DEFINE_SPINLOCK(ip_fb_id_lock);
1102 static u32 ip_fallback_id;
1105 spin_lock_bh(&ip_fb_id_lock);
1106 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1107 iph->id = htons(salt & 0xFFFF);
1108 ip_fallback_id = salt;
1109 spin_unlock_bh(&ip_fb_id_lock);
1112 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 struct rtable *rt = (struct rtable *) dst;
1117 if (rt->peer == NULL)
1118 rt_bind_peer(rt, 1);
1120 /* If a peer is attached to the destination, it is never detached,
1121 so we do not need to grab a lock to dereference it.
1124 iph->id = htons(inet_getid(rt->peer, more));
1128 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1129 __builtin_return_address(0));
1131 ip_select_fb_ident(iph);
1134 static void rt_del(unsigned hash, struct rtable *rt)
1136 struct rtable **rthp, *aux;
1138 rthp = &rt_hash_table[hash].chain;
1139 spin_lock_bh(rt_hash_lock_addr(hash));
1141 while ((aux = *rthp) != NULL) {
1142 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1143 *rthp = aux->u.dst.rt_next;
1147 rthp = &aux->u.dst.rt_next;
1149 spin_unlock_bh(rt_hash_lock_addr(hash));
1152 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1153 __be32 saddr, struct net_device *dev)
1156 struct in_device *in_dev = in_dev_get(dev);
1157 struct rtable *rth, **rthp;
1158 __be32 skeys[2] = { saddr, 0 };
1159 int ikeys[2] = { dev->ifindex, 0 };
1160 struct netevent_redirect netevent;
1167 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1168 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1169 || ipv4_is_zeronet(new_gw))
1170 goto reject_redirect;
1172 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1173 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1174 goto reject_redirect;
1175 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1176 goto reject_redirect;
1178 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1179 goto reject_redirect;
1182 for (i = 0; i < 2; i++) {
1183 for (k = 0; k < 2; k++) {
1184 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186 rthp=&rt_hash_table[hash].chain;
1189 while ((rth = rcu_dereference(*rthp)) != NULL) {
1192 if (rth->fl.fl4_dst != daddr ||
1193 rth->fl.fl4_src != skeys[i] ||
1194 rth->fl.oif != ikeys[k] ||
1196 rth->rt_genid != atomic_read(&rt_genid) ||
1197 !net_eq(dev_net(rth->u.dst.dev), net)) {
1198 rthp = &rth->u.dst.rt_next;
1202 if (rth->rt_dst != daddr ||
1203 rth->rt_src != saddr ||
1205 rth->rt_gateway != old_gw ||
1206 rth->u.dst.dev != dev)
1209 dst_hold(&rth->u.dst);
1212 rt = dst_alloc(&ipv4_dst_ops);
1219 /* Copy all the information. */
1221 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1222 rt->u.dst.__use = 1;
1223 atomic_set(&rt->u.dst.__refcnt, 1);
1224 rt->u.dst.child = NULL;
1226 dev_hold(rt->u.dst.dev);
1228 in_dev_hold(rt->idev);
1229 rt->u.dst.obsolete = 0;
1230 rt->u.dst.lastuse = jiffies;
1231 rt->u.dst.path = &rt->u.dst;
1232 rt->u.dst.neighbour = NULL;
1233 rt->u.dst.hh = NULL;
1234 rt->u.dst.xfrm = NULL;
1235 rt->rt_genid = atomic_read(&rt_genid);
1236 rt->rt_flags |= RTCF_REDIRECTED;
1238 /* Gateway is different ... */
1239 rt->rt_gateway = new_gw;
1241 /* Redirect received -> path was valid */
1242 dst_confirm(&rth->u.dst);
1245 atomic_inc(&rt->peer->refcnt);
1247 if (arp_bind_neighbour(&rt->u.dst) ||
1248 !(rt->u.dst.neighbour->nud_state &
1250 if (rt->u.dst.neighbour)
1251 neigh_event_send(rt->u.dst.neighbour, NULL);
1257 netevent.old = &rth->u.dst;
1258 netevent.new = &rt->u.dst;
1259 call_netevent_notifiers(NETEVENT_REDIRECT,
1263 if (!rt_intern_hash(hash, rt, &rt))
1276 #ifdef CONFIG_IP_ROUTE_VERBOSE
1277 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1278 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1279 "%u.%u.%u.%u ignored.\n"
1280 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1281 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1282 NIPQUAD(saddr), NIPQUAD(daddr));
1287 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 struct rtable *rt = (struct rtable *)dst;
1290 struct dst_entry *ret = dst;
1293 if (dst->obsolete) {
1296 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1297 rt->u.dst.expires) {
1298 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300 #if RT_CACHE_DEBUG >= 1
1301 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1302 "%u.%u.%u.%u/%02x dropped\n",
1303 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1314 * 1. The first ip_rt_redirect_number redirects are sent
1315 * with exponential backoff, then we stop sending them at all,
1316 * assuming that the host ignores our redirects.
1317 * 2. If we did not see any packets requiring redirects
1318 * during ip_rt_redirect_silence, we assume that the host
1319 * forgot the redirected route and start sending redirects again.
1321 * This algorithm is much cheaper and more intelligent than dumb load limiting
1324 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1325 * and "frag. need" (breaks PMTU discovery) in icmp.c.
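 *
 * Worked example (illustrative, assuming HZ=1000): ip_rt_redirect_load is
 * HZ/50 = 20ms, so after the first redirect the k-th one is delayed by
 * 20ms << k (40ms, 80ms, ..., 5.12s). Once ip_rt_redirect_number (9)
 * redirects have been sent we go quiet, and only start again after
 * ip_rt_redirect_silence = (HZ/50) << 10, i.e. about 20.5 seconds without
 * seeing packets that would need a redirect.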
1328 void ip_rt_send_redirect(struct sk_buff *skb)
1330 struct rtable *rt = skb->rtable;
1331 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1336 if (!IN_DEV_TX_REDIRECTS(in_dev))
1339 /* No redirected packets during ip_rt_redirect_silence;
1340 * reset the algorithm.
1342 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1343 rt->u.dst.rate_tokens = 0;
1345 /* Too many ignored redirects; do not send anything, just
1346 * set u.dst.rate_last to the last seen redirected packet.
1348 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1349 rt->u.dst.rate_last = jiffies;
1353 /* Check for load limit; set rate_last to the latest sent
1356 if (rt->u.dst.rate_tokens == 0 ||
1358 (rt->u.dst.rate_last +
1359 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1360 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1361 rt->u.dst.rate_last = jiffies;
1362 ++rt->u.dst.rate_tokens;
1363 #ifdef CONFIG_IP_ROUTE_VERBOSE
1364 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1365 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1368 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1369 NIPQUAD(rt->rt_src), rt->rt_iif,
1370 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1377 static int ip_error(struct sk_buff *skb)
1379 struct rtable *rt = skb->rtable;
1383 switch (rt->u.dst.error) {
1388 code = ICMP_HOST_UNREACH;
1391 code = ICMP_NET_UNREACH;
1392 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1395 code = ICMP_PKT_FILTERED;
1400 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1401 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1402 rt->u.dst.rate_tokens = ip_rt_error_burst;
1403 rt->u.dst.rate_last = now;
1404 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1405 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1406 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1409 out: kfree_skb(skb);
1414 * The last two values are not from the RFC but
1415 * are needed for AMPRnet AX.25 paths.
1418 static const unsigned short mtu_plateau[] =
1419 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1421 static inline unsigned short guess_mtu(unsigned short old_mtu)
1425 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1426 if (old_mtu > mtu_plateau[i])
1427 return mtu_plateau[i];
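/*
 * Worked example (illustrative): if a router quotes old_mtu = 1500 without
 * a usable next-hop MTU, guess_mtu() walks the plateau table and returns
 * 1492, the first plateau strictly below 1500; for old_mtu = 576 it would
 * return 296.
 */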
1431 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1432 unsigned short new_mtu)
1435 unsigned short old_mtu = ntohs(iph->tot_len);
1437 __be32 skeys[2] = { iph->saddr, 0, };
1438 __be32 daddr = iph->daddr;
1439 unsigned short est_mtu = 0;
1441 if (ipv4_config.no_pmtu_disc)
1444 for (i = 0; i < 2; i++) {
1445 unsigned hash = rt_hash(daddr, skeys[i], 0);
1448 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1449 rth = rcu_dereference(rth->u.dst.rt_next)) {
1450 if (rth->fl.fl4_dst == daddr &&
1451 rth->fl.fl4_src == skeys[i] &&
1452 rth->rt_dst == daddr &&
1453 rth->rt_src == iph->saddr &&
1455 !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1456 net_eq(dev_net(rth->u.dst.dev), net) &&
1457 rth->rt_genid == atomic_read(&rt_genid)) {
1458 unsigned short mtu = new_mtu;
1460 if (new_mtu < 68 || new_mtu >= old_mtu) {
1462 /* BSD 4.2 compatibility hack :-( */
1464 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1465 old_mtu >= 68 + (iph->ihl << 2))
1466 old_mtu -= iph->ihl << 2;
1468 mtu = guess_mtu(old_mtu);
1470 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1471 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1472 dst_confirm(&rth->u.dst);
1473 if (mtu < ip_rt_min_pmtu) {
1474 mtu = ip_rt_min_pmtu;
1475 rth->u.dst.metrics[RTAX_LOCK-1] |=
1478 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1479 dst_set_expires(&rth->u.dst,
1488 return est_mtu ? : new_mtu;
1491 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1493 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1494 !(dst_metric_locked(dst, RTAX_MTU))) {
1495 if (mtu < ip_rt_min_pmtu) {
1496 mtu = ip_rt_min_pmtu;
1497 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1499 dst->metrics[RTAX_MTU-1] = mtu;
1500 dst_set_expires(dst, ip_rt_mtu_expires);
1501 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1505 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1510 static void ipv4_dst_destroy(struct dst_entry *dst)
1512 struct rtable *rt = (struct rtable *) dst;
1513 struct inet_peer *peer = rt->peer;
1514 struct in_device *idev = rt->idev;
1527 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1530 struct rtable *rt = (struct rtable *) dst;
1531 struct in_device *idev = rt->idev;
1532 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1533 struct in_device *loopback_idev =
1534 in_dev_get(dev_net(dev)->loopback_dev);
1535 if (loopback_idev) {
1536 rt->idev = loopback_idev;
1542 static void ipv4_link_failure(struct sk_buff *skb)
1546 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1550 dst_set_expires(&rt->u.dst, 0);
1553 static int ip_rt_bug(struct sk_buff *skb)
1555 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1556 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1557 skb->dev ? skb->dev->name : "?");
1563 We do not cache the source address of the outgoing interface,
1564 because it is used only by the IP RR, TS and SRR options,
1565 so it is out of the fast path.
1567 BTW remember: "addr" is allowed to be unaligned
1571 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1574 struct fib_result res;
1576 if (rt->fl.iif == 0)
1578 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1579 src = FIB_RES_PREFSRC(res);
1582 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1584 memcpy(addr, &src, 4);
1587 #ifdef CONFIG_NET_CLS_ROUTE
1588 static void set_class_tag(struct rtable *rt, u32 tag)
1590 if (!(rt->u.dst.tclassid & 0xFFFF))
1591 rt->u.dst.tclassid |= tag & 0xFFFF;
1592 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1593 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1597 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1599 struct fib_info *fi = res->fi;
1602 if (FIB_RES_GW(*res) &&
1603 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1604 rt->rt_gateway = FIB_RES_GW(*res);
1605 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1606 sizeof(rt->u.dst.metrics));
1607 if (fi->fib_mtu == 0) {
1608 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1609 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1610 rt->rt_gateway != rt->rt_dst &&
1611 rt->u.dst.dev->mtu > 576)
1612 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1614 #ifdef CONFIG_NET_CLS_ROUTE
1615 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1618 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1620 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1621 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1622 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1623 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1624 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1625 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1627 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1628 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
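/*
 * Worked example (illustrative): for a standard 1500-byte MTU Ethernet
 * device with no advmss metric configured, the advertised MSS above
 * defaults to 1500 - 40 = 1460 bytes.
 */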
1630 #ifdef CONFIG_NET_CLS_ROUTE
1631 #ifdef CONFIG_IP_MULTIPLE_TABLES
1632 set_class_tag(rt, fib_rules_tclass(res));
1634 set_class_tag(rt, itag);
1636 rt->rt_type = res->type;
1639 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640 u8 tos, struct net_device *dev, int our)
1645 struct in_device *in_dev = in_dev_get(dev);
1648 /* Primary sanity checks. */
1653 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1654 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1657 if (ipv4_is_zeronet(saddr)) {
1658 if (!ipv4_is_local_multicast(daddr))
1660 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1661 } else if (fib_validate_source(saddr, 0, tos, 0,
1662 dev, &spec_dst, &itag) < 0)
1665 rth = dst_alloc(&ipv4_dst_ops);
1669 rth->u.dst.output= ip_rt_bug;
1671 atomic_set(&rth->u.dst.__refcnt, 1);
1672 rth->u.dst.flags= DST_HOST;
1673 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1674 rth->u.dst.flags |= DST_NOPOLICY;
1675 rth->fl.fl4_dst = daddr;
1676 rth->rt_dst = daddr;
1677 rth->fl.fl4_tos = tos;
1678 rth->fl.mark = skb->mark;
1679 rth->fl.fl4_src = saddr;
1680 rth->rt_src = saddr;
1681 #ifdef CONFIG_NET_CLS_ROUTE
1682 rth->u.dst.tclassid = itag;
1685 rth->fl.iif = dev->ifindex;
1686 rth->u.dst.dev = init_net.loopback_dev;
1687 dev_hold(rth->u.dst.dev);
1688 rth->idev = in_dev_get(rth->u.dst.dev);
1690 rth->rt_gateway = daddr;
1691 rth->rt_spec_dst= spec_dst;
1692 rth->rt_genid = atomic_read(&rt_genid);
1693 rth->rt_flags = RTCF_MULTICAST;
1694 rth->rt_type = RTN_MULTICAST;
1696 rth->u.dst.input= ip_local_deliver;
1697 rth->rt_flags |= RTCF_LOCAL;
1700 #ifdef CONFIG_IP_MROUTE
1701 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1702 rth->u.dst.input = ip_mr_input;
1704 RT_CACHE_STAT_INC(in_slow_mc);
1707 hash = rt_hash(daddr, saddr, dev->ifindex);
1708 return rt_intern_hash(hash, rth, &skb->rtable);
1720 static void ip_handle_martian_source(struct net_device *dev,
1721 struct in_device *in_dev,
1722 struct sk_buff *skb,
1726 RT_CACHE_STAT_INC(in_martian_src);
1727 #ifdef CONFIG_IP_ROUTE_VERBOSE
1728 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1730 * RFC1812 recommendation: if the source is martian,
1731 * the only hint is the MAC header.
1733 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1734 "%u.%u.%u.%u, on dev %s\n",
1735 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1736 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1738 const unsigned char *p = skb_mac_header(skb);
1739 printk(KERN_WARNING "ll header: ");
1740 for (i = 0; i < dev->hard_header_len; i++, p++) {
1742 if (i < (dev->hard_header_len - 1))
1751 static int __mkroute_input(struct sk_buff *skb,
1752 struct fib_result *res,
1753 struct in_device *in_dev,
1754 __be32 daddr, __be32 saddr, u32 tos,
1755 struct rtable **result)
1760 struct in_device *out_dev;
1765 /* get a working reference to the output device */
1766 out_dev = in_dev_get(FIB_RES_DEV(*res));
1767 if (out_dev == NULL) {
1768 if (net_ratelimit())
1769 printk(KERN_CRIT "Bug in ip_route_input" \
1770 "_slow(). Please, report\n");
1775 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1776 in_dev->dev, &spec_dst, &itag);
1778 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1786 flags |= RTCF_DIRECTSRC;
1788 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1789 (IN_DEV_SHARED_MEDIA(out_dev) ||
1790 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1791 flags |= RTCF_DOREDIRECT;
1793 if (skb->protocol != htons(ETH_P_IP)) {
1794 /* Not IP (i.e. ARP). Do not create a route if it is
1795 * invalid for proxy ARP. DNAT routes are always valid.
1797 if (out_dev == in_dev) {
1804 rth = dst_alloc(&ipv4_dst_ops);
1810 atomic_set(&rth->u.dst.__refcnt, 1);
1811 rth->u.dst.flags= DST_HOST;
1812 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1813 rth->u.dst.flags |= DST_NOPOLICY;
1814 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1815 rth->u.dst.flags |= DST_NOXFRM;
1816 rth->fl.fl4_dst = daddr;
1817 rth->rt_dst = daddr;
1818 rth->fl.fl4_tos = tos;
1819 rth->fl.mark = skb->mark;
1820 rth->fl.fl4_src = saddr;
1821 rth->rt_src = saddr;
1822 rth->rt_gateway = daddr;
1824 rth->fl.iif = in_dev->dev->ifindex;
1825 rth->u.dst.dev = (out_dev)->dev;
1826 dev_hold(rth->u.dst.dev);
1827 rth->idev = in_dev_get(rth->u.dst.dev);
1829 rth->rt_spec_dst= spec_dst;
1831 rth->u.dst.input = ip_forward;
1832 rth->u.dst.output = ip_output;
1833 rth->rt_genid = atomic_read(&rt_genid);
1835 rt_set_nexthop(rth, res, itag);
1837 rth->rt_flags = flags;
1842 /* release the working reference to the output device */
1843 in_dev_put(out_dev);
1847 static int ip_mkroute_input(struct sk_buff *skb,
1848 struct fib_result *res,
1849 const struct flowi *fl,
1850 struct in_device *in_dev,
1851 __be32 daddr, __be32 saddr, u32 tos)
1853 struct rtable* rth = NULL;
1857 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1858 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1859 fib_select_multipath(fl, res);
1862 /* create a routing cache entry */
1863 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1867 /* put it into the cache */
1868 hash = rt_hash(daddr, saddr, fl->iif);
1869 return rt_intern_hash(hash, rth, &skb->rtable);
1873 * NOTE. We drop all packets that have local source
1874 * addresses, because every properly looped-back packet
1875 * must already have the correct destination attached by the output routine.
1877 * This approach solves two big problems:
1878 * 1. Non-simplex devices are handled properly.
1879 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1882 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1883 u8 tos, struct net_device *dev)
1885 struct fib_result res;
1886 struct in_device *in_dev = in_dev_get(dev);
1887 struct flowi fl = { .nl_u = { .ip4_u =
1891 .scope = RT_SCOPE_UNIVERSE,
1894 .iif = dev->ifindex };
1897 struct rtable * rth;
1902 struct net * net = dev_net(dev);
1904 /* IP on this device is disabled. */
1909 /* Check for the weirdest martians, which cannot be detected
1913 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1914 ipv4_is_loopback(saddr))
1915 goto martian_source;
1917 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1920 /* Accept zero addresses only to limited broadcast;
1921 * I do not even know whether to fix this or not. Waiting for complaints :-)
1923 if (ipv4_is_zeronet(saddr))
1924 goto martian_source;
1926 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1927 ipv4_is_loopback(daddr))
1928 goto martian_destination;
1931 * Now we are ready to route packet.
1933 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1934 if (!IN_DEV_FORWARD(in_dev))
1940 RT_CACHE_STAT_INC(in_slow_tot);
1942 if (res.type == RTN_BROADCAST)
1945 if (res.type == RTN_LOCAL) {
1947 result = fib_validate_source(saddr, daddr, tos,
1948 net->loopback_dev->ifindex,
1949 dev, &spec_dst, &itag);
1951 goto martian_source;
1953 flags |= RTCF_DIRECTSRC;
1958 if (!IN_DEV_FORWARD(in_dev))
1960 if (res.type != RTN_UNICAST)
1961 goto martian_destination;
1963 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971 if (skb->protocol != htons(ETH_P_IP))
1974 if (ipv4_is_zeronet(saddr))
1975 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1977 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1980 goto martian_source;
1982 flags |= RTCF_DIRECTSRC;
1984 flags |= RTCF_BROADCAST;
1985 res.type = RTN_BROADCAST;
1986 RT_CACHE_STAT_INC(in_brd);
1989 rth = dst_alloc(&ipv4_dst_ops);
1993 rth->u.dst.output= ip_rt_bug;
1994 rth->rt_genid = atomic_read(&rt_genid);
1996 atomic_set(&rth->u.dst.__refcnt, 1);
1997 rth->u.dst.flags= DST_HOST;
1998 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1999 rth->u.dst.flags |= DST_NOPOLICY;
2000 rth->fl.fl4_dst = daddr;
2001 rth->rt_dst = daddr;
2002 rth->fl.fl4_tos = tos;
2003 rth->fl.mark = skb->mark;
2004 rth->fl.fl4_src = saddr;
2005 rth->rt_src = saddr;
2006 #ifdef CONFIG_NET_CLS_ROUTE
2007 rth->u.dst.tclassid = itag;
2010 rth->fl.iif = dev->ifindex;
2011 rth->u.dst.dev = net->loopback_dev;
2012 dev_hold(rth->u.dst.dev);
2013 rth->idev = in_dev_get(rth->u.dst.dev);
2014 rth->rt_gateway = daddr;
2015 rth->rt_spec_dst= spec_dst;
2016 rth->u.dst.input= ip_local_deliver;
2017 rth->rt_flags = flags|RTCF_LOCAL;
2018 if (res.type == RTN_UNREACHABLE) {
2019 rth->u.dst.input= ip_error;
2020 rth->u.dst.error= -err;
2021 rth->rt_flags &= ~RTCF_LOCAL;
2023 rth->rt_type = res.type;
2024 hash = rt_hash(daddr, saddr, fl.iif);
2025 err = rt_intern_hash(hash, rth, &skb->rtable);
2029 RT_CACHE_STAT_INC(in_no_route);
2030 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2031 res.type = RTN_UNREACHABLE;
2037 * Do not cache martian addresses: they should be logged (RFC1812)
2039 martian_destination:
2040 RT_CACHE_STAT_INC(in_martian_dst);
2041 #ifdef CONFIG_IP_ROUTE_VERBOSE
2042 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2043 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2044 "%u.%u.%u.%u, dev %s\n",
2045 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2049 err = -EHOSTUNREACH;
2061 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2065 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2066 u8 tos, struct net_device *dev)
2068 struct rtable * rth;
2070 int iif = dev->ifindex;
2074 tos &= IPTOS_RT_MASK;
2075 hash = rt_hash(daddr, saddr, iif);
2078 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2079 rth = rcu_dereference(rth->u.dst.rt_next)) {
2080 if (((rth->fl.fl4_dst ^ daddr) |
2081 (rth->fl.fl4_src ^ saddr) |
2082 (rth->fl.iif ^ iif) |
2084 (rth->fl.fl4_tos ^ tos)) == 0 &&
2085 rth->fl.mark == skb->mark &&
2086 net_eq(dev_net(rth->u.dst.dev), net) &&
2087 rth->rt_genid == atomic_read(&rt_genid)) {
2088 dst_use(&rth->u.dst, jiffies);
2089 RT_CACHE_STAT_INC(in_hit);
2094 RT_CACHE_STAT_INC(in_hlist_search);
2098 /* Multicast recognition logic is moved from the route cache to here.
2099 The problem was that too many Ethernet cards have broken/missing
2100 hardware multicast filters :-( As a result, a host on a multicast
2101 network acquires a lot of useless route cache entries, e.g. for
2102 SDR messages from all over the world. Now we try to get rid of them.
2103 Really, provided the software IP multicast filter is organized
2104 reasonably (at least, hashed), it does not result in a slowdown
2105 compared with route cache reject entries.
2106 Note that multicast routers are not affected, because
2107 a route cache entry is created eventually.
2109 if (ipv4_is_multicast(daddr)) {
2110 struct in_device *in_dev;
2113 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2114 int our = ip_check_mc(in_dev, daddr, saddr,
2115 ip_hdr(skb)->protocol);
2117 #ifdef CONFIG_IP_MROUTE
2118 || (!ipv4_is_local_multicast(daddr) &&
2119 IN_DEV_MFORWARD(in_dev))
2123 return ip_route_input_mc(skb, daddr, saddr,
2130 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2133 static int __mkroute_output(struct rtable **result,
2134 struct fib_result *res,
2135 const struct flowi *fl,
2136 const struct flowi *oldflp,
2137 struct net_device *dev_out,
2141 struct in_device *in_dev;
2142 u32 tos = RT_FL_TOS(oldflp);
2145 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2148 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2149 res->type = RTN_BROADCAST;
2150 else if (ipv4_is_multicast(fl->fl4_dst))
2151 res->type = RTN_MULTICAST;
2152 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2155 if (dev_out->flags & IFF_LOOPBACK)
2156 flags |= RTCF_LOCAL;
2158 /* get a working reference to the inet device */
2159 in_dev = in_dev_get(dev_out);
2163 if (res->type == RTN_BROADCAST) {
2164 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2166 fib_info_put(res->fi);
2169 } else if (res->type == RTN_MULTICAST) {
2170 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2171 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2173 flags &= ~RTCF_LOCAL;
2174 /* If a multicast route does not exist, use the
2175 default one, but do not gateway in this case.
2178 if (res->fi && res->prefixlen < 4) {
2179 fib_info_put(res->fi);
2185 rth = dst_alloc(&ipv4_dst_ops);
2191 atomic_set(&rth->u.dst.__refcnt, 1);
2192 rth->u.dst.flags= DST_HOST;
2193 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2194 rth->u.dst.flags |= DST_NOXFRM;
2195 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2196 rth->u.dst.flags |= DST_NOPOLICY;
2198 rth->fl.fl4_dst = oldflp->fl4_dst;
2199 rth->fl.fl4_tos = tos;
2200 rth->fl.fl4_src = oldflp->fl4_src;
2201 rth->fl.oif = oldflp->oif;
2202 rth->fl.mark = oldflp->mark;
2203 rth->rt_dst = fl->fl4_dst;
2204 rth->rt_src = fl->fl4_src;
2205 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2206 /* get references to the devices that are to be held by the routing
2208 rth->u.dst.dev = dev_out;
2210 rth->idev = in_dev_get(dev_out);
2211 rth->rt_gateway = fl->fl4_dst;
2212 rth->rt_spec_dst= fl->fl4_src;
2214 rth->u.dst.output=ip_output;
2215 rth->rt_genid = atomic_read(&rt_genid);
2217 RT_CACHE_STAT_INC(out_slow_tot);
2219 if (flags & RTCF_LOCAL) {
2220 rth->u.dst.input = ip_local_deliver;
2221 rth->rt_spec_dst = fl->fl4_dst;
2223 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2224 rth->rt_spec_dst = fl->fl4_src;
2225 if (flags & RTCF_LOCAL &&
2226 !(dev_out->flags & IFF_LOOPBACK)) {
2227 rth->u.dst.output = ip_mc_output;
2228 RT_CACHE_STAT_INC(out_slow_mc);
2230 #ifdef CONFIG_IP_MROUTE
2231 if (res->type == RTN_MULTICAST) {
2232 if (IN_DEV_MFORWARD(in_dev) &&
2233 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2234 rth->u.dst.input = ip_mr_input;
2235 rth->u.dst.output = ip_mc_output;
2241 rt_set_nexthop(rth, res, 0);
2243 rth->rt_flags = flags;
2247 /* release the working reference to the inet device */
2253 static int ip_mkroute_output(struct rtable **rp,
2254 struct fib_result *res,
2255 const struct flowi *fl,
2256 const struct flowi *oldflp,
2257 struct net_device *dev_out,
2260 struct rtable *rth = NULL;
2261 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2264 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2265 err = rt_intern_hash(hash, rth, rp);
2272 * Major route resolver routine.
2275 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2276 const struct flowi *oldflp)
2278 u32 tos = RT_FL_TOS(oldflp);
2279 struct flowi fl = { .nl_u = { .ip4_u =
2280 { .daddr = oldflp->fl4_dst,
2281 .saddr = oldflp->fl4_src,
2282 .tos = tos & IPTOS_RT_MASK,
2283 .scope = ((tos & RTO_ONLINK) ?
2287 .mark = oldflp->mark,
2288 .iif = net->loopback_dev->ifindex,
2289 .oif = oldflp->oif };
2290 struct fib_result res;
2292 struct net_device *dev_out = NULL;
2298 #ifdef CONFIG_IP_MULTIPLE_TABLES
2302 if (oldflp->fl4_src) {
2304 if (ipv4_is_multicast(oldflp->fl4_src) ||
2305 ipv4_is_lbcast(oldflp->fl4_src) ||
2306 ipv4_is_zeronet(oldflp->fl4_src))
2309 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2310 dev_out = ip_dev_find(net, oldflp->fl4_src);
2311 if (dev_out == NULL)
2314 /* I removed the check for oif == dev_out->oif here.
2315 It was wrong for two reasons:
2316 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2317 is assigned to multiple interfaces.
2318 2. Moreover, we are allowed to send packets with saddr
2319 of another iface. --ANK
2322 if (oldflp->oif == 0
2323 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2324 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2325 /* Special hack: the user can direct multicasts
2326 and limited broadcast via the necessary interface
2327 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2328 This hack is not just for fun, it allows
2329 vic, vat and friends to work.
2330 They bind a socket to loopback, set the ttl to zero
2331 and expect that it will work.
2332 From the viewpoint of the routing cache they are broken,
2333 because we are not allowed to build a multicast path
2334 with a loopback source addr (look, the routing cache
2335 cannot know that the ttl is zero, so the packet
2336 will not leave this host and the route is valid).
2337 Luckily, this hack is a good workaround.
2340 fl.oif = dev_out->ifindex;
2350 dev_out = dev_get_by_index(net, oldflp->oif);
2352 if (dev_out == NULL)
2355 /* RACE: Check return value of inet_select_addr instead. */
2356 if (__in_dev_get_rtnl(dev_out) == NULL) {
2358 goto out; /* Wrong error code */
2361 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2362 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2364 fl.fl4_src = inet_select_addr(dev_out, 0,
2369 if (ipv4_is_multicast(oldflp->fl4_dst))
2370 fl.fl4_src = inet_select_addr(dev_out, 0,
2372 else if (!oldflp->fl4_dst)
2373 fl.fl4_src = inet_select_addr(dev_out, 0,
2379 fl.fl4_dst = fl.fl4_src;
2381 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2384 dev_out = net->loopback_dev;
2386 fl.oif = net->loopback_dev->ifindex;
2387 res.type = RTN_LOCAL;
2388 flags |= RTCF_LOCAL;
2392 if (fib_lookup(net, &fl, &res)) {
2395 /* Apparently, the routing tables are wrong. Assume
2396 that the destination is on link.
2399 Because we are allowed to send to an iface
2400 even if it has NO routes and NO assigned
2401 addresses. When oif is specified, the routing
2402 tables are looked up with only one purpose:
2403 to catch whether the destination is gatewayed, rather than
2404 direct. Moreover, if MSG_DONTROUTE is set,
2405 we send the packet, ignoring both the routing tables
2406 and the ifaddr state. --ANK
2409 We could do this even if oif is unknown
2410 (as IPv6 likely does), but we do not.
2413 if (fl.fl4_src == 0)
2414 fl.fl4_src = inet_select_addr(dev_out, 0,
2416 res.type = RTN_UNICAST;
2426 if (res.type == RTN_LOCAL) {
2428 fl.fl4_src = fl.fl4_dst;
2431 dev_out = net->loopback_dev;
2433 fl.oif = dev_out->ifindex;
2435 fib_info_put(res.fi);
2437 flags |= RTCF_LOCAL;
2441 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2442 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2443 fib_select_multipath(&fl, &res);
2446 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2447 fib_select_default(net, &fl, &res);
2450 fl.fl4_src = FIB_RES_PREFSRC(res);
2454 dev_out = FIB_RES_DEV(res);
2456 fl.oif = dev_out->ifindex;
2460 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2470 int __ip_route_output_key(struct net *net, struct rtable **rp,
2471 const struct flowi *flp)
2476 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2479 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2480 rth = rcu_dereference(rth->u.dst.rt_next)) {
2481 if (rth->fl.fl4_dst == flp->fl4_dst &&
2482 rth->fl.fl4_src == flp->fl4_src &&
2484 rth->fl.oif == flp->oif &&
2485 rth->fl.mark == flp->mark &&
2486 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2487 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2488 net_eq(dev_net(rth->u.dst.dev), net) &&
2489 rth->rt_genid == atomic_read(&rt_genid)) {
2490 dst_use(&rth->u.dst, jiffies);
2491 RT_CACHE_STAT_INC(out_hit);
2492 rcu_read_unlock_bh();
2496 RT_CACHE_STAT_INC(out_hlist_search);
2498 rcu_read_unlock_bh();
2500 return ip_route_output_slow(net, rp, flp);
2503 EXPORT_SYMBOL_GPL(__ip_route_output_key);
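/*
 * Illustrative usage sketch (not part of the original code), showing how a
 * caller typically resolves an output route through this cache; "dst_ip"
 * and "net" are placeholder names for the example:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl))
 *		return -EHOSTUNREACH;
 *	... use rt->rt_gateway, rt->u.dst.dev, ...
 *	ip_rt_put(rt);
 */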
2505 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2509 static struct dst_ops ipv4_dst_blackhole_ops = {
2511 .protocol = __constant_htons(ETH_P_IP),
2512 .destroy = ipv4_dst_destroy,
2513 .check = ipv4_dst_check,
2514 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2515 .entry_size = sizeof(struct rtable),
2516 .entries = ATOMIC_INIT(0),
static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = atomic_read(&rt_genid);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
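
/*
 * ip_route_output_flow() is the output-routing entry point that is
 * aware of IPsec: resolve the route via __ip_route_output_key(), then,
 * if a protocol is set in the flow, fill in any missing saddr/daddr
 * from the result and run it through __xfrm_lookup().  A -EREMOTE
 * answer is converted into a blackhole route by ipv4_dst_blackhole()
 * above.  ip_route_output_key() is the plain wrapper without a socket.
 */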
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
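
/*
 * rt_fill_info() dumps one cached route (skb->rtable) as an
 * RTM_NEWROUTE netlink message: an rtmsg header plus RTA_DST, RTA_SRC,
 * RTA_OIF, RTA_GATEWAY and RTA_PREFSRC attributes, route metrics and
 * cache info (IP id, TCP timestamps, expiry, error).  Multicast input
 * routes may be resolved further through ipmr_get_route() when
 * multicast forwarding is enabled.
 */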
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb->rtable;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
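
/*
 * RTM_GETROUTE handler: parse the request attributes, build a reply skb
 * with dummy MAC/IP headers, then resolve the route either through
 * ip_route_input() (when RTA_IIF is given) or ip_route_output_key(),
 * and answer the requester with rt_fill_info() + rtnl_unicast().
 */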
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb->rtable;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->rtable = rt;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
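
/*
 * ip_rt_dump() walks every rt_hash_table chain under rcu_read_lock_bh()
 * and emits one NLM_F_MULTI RTM_NEWROUTE message per cache entry that
 * belongs to the requesting namespace and the current rt_genid.  The
 * current bucket and index are kept in cb->args[] so a partial dump
 * can resume where it left off.
 */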
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++) {
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt->rt_genid != atomic_read(&rt_genid))
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
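
/*
 * Device multicast state changed (e.g. a group was joined or left):
 * cached routes may no longer be valid, so flush the route cache.
 */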
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
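
/*
 * sysctl interface (net.ipv4.route.*): writing a delay to the "flush"
 * entry triggers rt_cache_flush(); the remaining entries tune garbage
 * collection, redirect and PMTU behaviour of the route cache.
 */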
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen)
{
	int delay;

	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);

	return 0;
}
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
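
/*
 * Boot-time setup: ip_rt_acct is the per-CPU CONFIG_NET_CLS_ROUTE
 * accounting table, and "rhash_entries=N" on the kernel command line
 * overrides the automatic sizing of the route cache hash table
 * allocated in ip_rt_init() below.
 */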
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
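
/*
 * ip_rt_init(): seed rt_genid, allocate the dst slab cache and the
 * route cache hash table, derive gc_thresh/max_size from the table
 * size, start the periodic expiry work and the secret-rebuild timer
 * (both perturbed by net_random() so they do not fire in lockstep),
 * create the /proc files and register the RTM_GETROUTE handler.
 */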
int __init ip_rt_init(void)
{
	int rc = 0;

	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7))));

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	rt_secret_timer.function = rt_secret_rebuild;
	rt_secret_timer.data = 0;
	init_timer_deferrable(&rt_secret_timer);

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	schedule_delayed_work(&expires_work,
			net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);