/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <net/netevent.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
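
/* Illustrative note (editorial, not from the original source): this
 * table is indexed by the four TOS bits shifted right by one, as done
 * by the rt_tos2priority() helper in <net/route.h>.  So a TOS of
 * IPTOS_LOWDELAY (0x10) selects index 8, TC_PRIO_INTERACTIVE, while a
 * TOS of zero selects TC_PRIO_BESTEFFORT.
 */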


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
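
/* A lockless reader following this scheme looks roughly like the
 * sketch below (illustrative only; the real lookups live in
 * ip_route_input() and __ip_route_output_key(), and "key" stands in
 * for a caller-supplied struct flowi):
 *
 *      rcu_read_lock();
 *      for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *           rth = rcu_dereference(rth->u.dst.rt_next))
 *              if (compare_keys(&rth->fl, &key)) {
 *                      dst_hold(&rth->u.dst);  (atomic refcount grab)
 *                      break;
 *              }
 *      rcu_read_unlock();
 */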

struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                }
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
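
/* Usage sketch (illustrative): writers serialize per bucket with
 *
 *      spin_lock_bh(rt_hash_lock_addr(hash));
 *      ... unlink or insert on rt_hash_table[hash].chain ...
 *      spin_unlock_bh(rt_hash_lock_addr(hash));
 *
 * With RT_HASH_LOCK_SZ striped locks, two buckets contend only when
 * they map to the same lock slot; in the non-SMP case above,
 * rt_hash_lock_addr() is NULL and the lock collapses to plain BH
 * exclusion.
 */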

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
        return (jhash_2words(daddr, saddr, rt_hash_rnd)
                & rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
        rt_hash_code((__force u32)(__be32)(daddr),\
                     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
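
/* Example (illustrative): callers mix an interface index into the
 * hash when it matters, e.g. rt_hash(daddr, skeys[i], ikeys[k]) in
 * ip_rt_redirect() below, or pass 0 when no interface disambiguation
 * is needed, as in ip_rt_frag_needed().
 */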

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = rcu_dereference(seq->private);

        r = r->u.dst.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        multipath_remove(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        multipath_remove(rt);
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}
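
/* In other words (an illustrative summary): an entry with a held
 * reference never expires here; a hard-expired entry always does;
 * otherwise an idle entry survives while its age is within tmo1
 * (tmo2 if it is "valuable"), except that colliding broadcast and
 * multicast entries (rt_fast_clean()) get no tmo1 grace.  Callers
 * such as rt_check_expire() halve tmo1 for every entry they keep,
 * so entries deep in a crowded bucket are reclaimed more aggressively.
 */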

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
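
/* Example (illustrative): the low 30 bits hold the bitwise-inverted
 * age, so of two otherwise equal idle entries the more recently used
 * one scores higher; a "valuable" entry (bit 31) or an output/unicast
 * entry (bit 30) outranks any age difference.  rt_intern_hash() below
 * evicts the lowest-scoring unreferenced entry when a chain grows past
 * ip_rt_gc_elasticity.
 */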

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
                (fl1->mark ^ fl2->mark) |
                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
                (fl1->oif ^ fl2->oif) |
                (fl1->iif ^ fl2->iif)) == 0;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
                                                struct rtable *expentry,
                                                int *removed_count)
{
        int passedexpired = 0;
        struct rtable **nextstep = NULL;
        struct rtable **rthp = chain_head;
        struct rtable *rth;

        if (removed_count)
                *removed_count = 0;

        while ((rth = *rthp) != NULL) {
                if (rth == expentry)
                        passedexpired = 1;

                if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
                    compare_keys(&(*rthp)->fl, &expentry->fl)) {
                        if (*rthp == expentry) {
                                *rthp = rth->u.dst.rt_next;
                                continue;
                        } else {
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                if (removed_count)
                                        ++(*removed_count);
                        }
                } else {
                        if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
                            passedexpired && !nextstep)
                                nextstep = &rth->u.dst.rt_next;

                        rthp = &rth->u.dst.rt_next;
                }
        }

        rt_free(expentry);
        if (removed_count)
                ++(*removed_count);

        return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */


/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        u64 mult;

        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (*rthp == NULL)
                        continue;
                spin_lock(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.dst.rt_next;
                                continue;
                        }

                        /* Clean up aged-off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                        /* remove all related balanced entries if necessary */
                        if (rth->u.dst.flags & DST_BALANCED) {
                                rthp = rt_remove_balanced_route(
                                        &rt_hash_table[i].chain,
                                        rth, NULL);
                                if (!rthp)
                                        break;
                        } else {
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                        }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                }
                spin_unlock(rt_hash_lock_addr(i));

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(rt_hash_lock_addr(i));
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; rth; rth = next) {
                        next = rth->u.dst.rt_next;
                        rt_free(rth);
                }
        }
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        /* flush existing multipath state */
        multipath_flush();

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the
                   timer to "delay"; otherwise fire it at the deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay - ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now + delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if the network is idle,
   expire is large enough to keep enough warm entries, and when load
   increases it is reduced to limit the cache size.
 */
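
/* Concretely (an editorial note): with the defaults above, GC has work
 * to do only once the cache holds more than
 * ip_rt_gc_elasticity << rt_hash_log entries, i.e. more than
 * ip_rt_gc_elasticity (8) entries per hash bucket on average; the
 * "goal" computed below is the excess over that level.
 */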

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache
                 * really aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                                /* remove all related balanced entries
                                 * if necessary
                                 */
                                if (rth->u.dst.flags & DST_BALANCED) {
                                        int r;

                                        rthp = rt_remove_balanced_route(
                                                &rt_hash_table[k].chain,
                                                rth,
                                                &r);
                                        goal -= r;
                                        if (!rthp)
                                                break;
                                } else {
                                        *rthp = rth->u.dst.rt_next;
                                        rt_free(rth);
                                        goal--;
                                }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal is not achieved. We stop the process if:

                   - expire has been reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop breaker;
                     we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %lu %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %lu %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                if (!(rth->u.dst.flags & DST_BALANCED) &&
                    compare_keys(&rth->fl, &rt->fl)) {
#else
                if (compare_keys(&rth->fl, &rt->fl)) {
#endif
                        /* Put it first */
                        *rthp = rth->u.dst.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.dst.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.dst.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to arp only if it is an output
           route or a unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it most likely holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
                       __builtin_return_address(0));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.dst.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.dst.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        __be32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
                                                rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
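
/* Worked example with the defaults above (editorial): the first
 * redirect (rate_tokens == 0) is sent immediately; the k-th goes out
 * only once jiffies > rate_last + (ip_rt_redirect_load << k), i.e.
 * HZ/50, HZ/25, 2*HZ/25, ... after the previous one - exponential
 * backoff.  After ip_rt_redirect_number (9) unheeded redirects we go
 * silent, resuming only after ip_rt_redirect_silence = (HZ/50) << 10
 * jiffies without a packet that would have triggered a redirect.
 */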

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (rt->u.dst.rate_tokens == 0 ||
            time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}
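
/* The rate limiting above is a classic token bucket (an editorial
 * note): tokens accrue at one per elapsed jiffy, the bucket is capped
 * at ip_rt_error_burst (5*HZ), and each ICMP error costs
 * ip_rt_error_cost (HZ) tokens - so at most a burst of five errors,
 * and one per second in the steady state.
 */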

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
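
/* Example (illustrative): a reported old_mtu of 1500 yields the next
 * lower plateau, 1492, and 576 yields 296; for old_mtu values of 128
 * or less there is no smaller plateau, so the IPv4 minimum of 68 is
 * returned.
 */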
1401
1402 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1403 {
1404         int i;
1405         unsigned short old_mtu = ntohs(iph->tot_len);
1406         struct rtable *rth;
1407         __be32  skeys[2] = { iph->saddr, 0, };
1408         __be32  daddr = iph->daddr;
1409         unsigned short est_mtu = 0;
1410
1411         if (ipv4_config.no_pmtu_disc)
1412                 return 0;
1413
1414         for (i = 0; i < 2; i++) {
1415                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1416
1417                 rcu_read_lock();
1418                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1420                         if (rth->fl.fl4_dst == daddr &&
1421                             rth->fl.fl4_src == skeys[i] &&
1422                             rth->rt_dst  == daddr &&
1423                             rth->rt_src  == iph->saddr &&
1424                             rth->fl.iif == 0 &&
1425                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1426                                 unsigned short mtu = new_mtu;
1427
1428                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1429
1430                                         /* BSD 4.2 compatibility hack :-( */
1431                                         if (mtu == 0 &&
1432                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1433                                             old_mtu >= 68 + (iph->ihl << 2))
1434                                                 old_mtu -= iph->ihl << 2;
1435
1436                                         mtu = guess_mtu(old_mtu);
1437                                 }
1438                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1439                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1440                                                 dst_confirm(&rth->u.dst);
1441                                                 if (mtu < ip_rt_min_pmtu) {
1442                                                         mtu = ip_rt_min_pmtu;
1443                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1444                                                                 (1 << RTAX_MTU);
1445                                                 }
1446                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1447                                                 dst_set_expires(&rth->u.dst,
1448                                                         ip_rt_mtu_expires);
1449                                         }
1450                                         est_mtu = mtu;
1451                                 }
1452                         }
1453                 }
1454                 rcu_read_unlock();
1455         }
1456         return est_mtu ? : new_mtu;
1457 }
1458
1459 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1460 {
1461         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1462             !(dst_metric_locked(dst, RTAX_MTU))) {
1463                 if (mtu < ip_rt_min_pmtu) {
1464                         mtu = ip_rt_min_pmtu;
1465                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1466                 }
1467                 dst->metrics[RTAX_MTU-1] = mtu;
1468                 dst_set_expires(dst, ip_rt_mtu_expires);
1469                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1470         }
1471 }
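/*
 *	Both PMTU learners above clamp a too-small estimate to
 *	ip_rt_min_pmtu and then set bit RTAX_MTU in the RTAX_LOCK metric,
 *	which is exactly what dst_metric_locked() tests, so the clamped
 *	value is not shrunk again later.  A standalone mock-up of that
 *	convention (userspace C; the MOCK_* constants are placeholders,
 *	not the real RTAX_* values):
 */
#if 0	/* illustrative sketch only, not built */
#include <stdbool.h>

enum { MOCK_RTAX_LOCK = 1, MOCK_RTAX_MTU = 2, MOCK_RTAX_MAX = 15 };

struct mock_dst { unsigned int metrics[MOCK_RTAX_MAX]; };

static bool mock_mtu_locked(const struct mock_dst *d)
{
	return d->metrics[MOCK_RTAX_LOCK - 1] & (1 << MOCK_RTAX_MTU);
}

static void mock_update_pmtu(struct mock_dst *d, unsigned int mtu,
			     unsigned int min_pmtu)
{
	if (d->metrics[MOCK_RTAX_MTU - 1] <= mtu || mtu < 68 ||
	    mock_mtu_locked(d))
		return;				/* nothing to shrink */
	if (mtu < min_pmtu) {
		mtu = min_pmtu;			/* clamp the estimate ...   */
		d->metrics[MOCK_RTAX_LOCK - 1] |= 1 << MOCK_RTAX_MTU;
		/* ... and lock RTAX_MTU so the clamp is not undone */
	}
	d->metrics[MOCK_RTAX_MTU - 1] = mtu;
}
#endif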
1472
1473 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1474 {
1475         return NULL;
1476 }
1477
1478 static void ipv4_dst_destroy(struct dst_entry *dst)
1479 {
1480         struct rtable *rt = (struct rtable *) dst;
1481         struct inet_peer *peer = rt->peer;
1482         struct in_device *idev = rt->idev;
1483
1484         if (peer) {
1485                 rt->peer = NULL;
1486                 inet_putpeer(peer);
1487         }
1488
1489         if (idev) {
1490                 rt->idev = NULL;
1491                 in_dev_put(idev);
1492         }
1493 }
1494
1495 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1496                             int how)
1497 {
1498         struct rtable *rt = (struct rtable *) dst;
1499         struct in_device *idev = rt->idev;
1500         if (dev != &loopback_dev && idev && idev->dev == dev) {
1501                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1502                 if (loopback_idev) {
1503                         rt->idev = loopback_idev;
1504                         in_dev_put(idev);
1505                 }
1506         }
1507 }
1508
1509 static void ipv4_link_failure(struct sk_buff *skb)
1510 {
1511         struct rtable *rt;
1512
1513         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1514
1515         rt = (struct rtable *) skb->dst;
1516         if (rt)
1517                 dst_set_expires(&rt->u.dst, 0);
1518 }
1519
1520 static int ip_rt_bug(struct sk_buff *skb)
1521 {
1522         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1523                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1524                 skb->dev ? skb->dev->name : "?");
1525         kfree_skb(skb);
1526         return 0;
1527 }
1528
1529 /*
1530    We do not cache the source address of the outgoing interface,
1531    because it is used only by the IP RR, TS and SRR options,
1532    so it is out of the fast path.
1533
1534    BTW remember: "addr" is allowed to be unaligned
1535    in IP options!
1536  */
1537
1538 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1539 {
1540         __be32 src;
1541         struct fib_result res;
1542
1543         if (rt->fl.iif == 0)
1544                 src = rt->rt_src;
1545         else if (fib_lookup(&rt->fl, &res) == 0) {
1546                 src = FIB_RES_PREFSRC(res);
1547                 fib_res_put(&res);
1548         } else
1549                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1550                                         RT_SCOPE_UNIVERSE);
1551         memcpy(addr, &src, 4);
1552 }
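/*
 *	The comment above is why the function ends with memcpy(): inside IP
 *	options "addr" may sit at any byte offset, and a direct 32-bit
 *	store through an unaligned pointer faults on strict-alignment
 *	CPUs.  A minimal sketch of the safe pattern:
 */
#if 0	/* illustrative sketch only, not built */
#include <stdint.h>
#include <string.h>

static void put_addr_unaligned(uint8_t *opt, uint32_t src_be)
{
	/* WRONG on strict-alignment CPUs: *(uint32_t *)opt = src_be; */
	memcpy(opt, &src_be, 4);	/* a byte copy is always safe */
}
#endif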
1553
1554 #ifdef CONFIG_NET_CLS_ROUTE
1555 static void set_class_tag(struct rtable *rt, u32 tag)
1556 {
1557         if (!(rt->u.dst.tclassid & 0xFFFF))
1558                 rt->u.dst.tclassid |= tag & 0xFFFF;
1559         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1560                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1561 }
1562 #endif
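/*
 *	tclassid packs two 16-bit routing realms; set_class_tag() only
 *	fills a half that is still zero, so a more specific tag applied
 *	first wins.  Worked example (standalone sketch; that the low half
 *	is the destination realm and the high half the source realm
 *	follows the usual realms convention and is an assumption here):
 */
#if 0	/* illustrative sketch only, not built */
#include <assert.h>
#include <stdint.h>

static void mock_tag(uint32_t *tclassid, uint32_t t)
{
	if (!(*tclassid & 0xFFFF))
		*tclassid |= t & 0xFFFF;
	if (!(*tclassid & 0xFFFF0000))
		*tclassid |= t & 0xFFFF0000;
}

static void mock_tag_demo(void)
{
	uint32_t id = 0;

	mock_tag(&id, 0x00050003);	/* first tag fills both halves */
	mock_tag(&id, 0x00090009);	/* later tag changes nothing   */
	assert(id == 0x00050003);
}
#endif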
1563
1564 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1565 {
1566         struct fib_info *fi = res->fi;
1567
1568         if (fi) {
1569                 if (FIB_RES_GW(*res) &&
1570                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1571                         rt->rt_gateway = FIB_RES_GW(*res);
1572                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1573                        sizeof(rt->u.dst.metrics));
1574                 if (fi->fib_mtu == 0) {
1575                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1576                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1577                             rt->rt_gateway != rt->rt_dst &&
1578                             rt->u.dst.dev->mtu > 576)
1579                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1580                 }
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1583 #endif
1584         } else
1585                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1586
1587         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1588                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1589         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1590                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1591         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1592                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1593                                        ip_rt_min_advmss);
1594         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1595                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1596
1597 #ifdef CONFIG_NET_CLS_ROUTE
1598 #ifdef CONFIG_IP_MULTIPLE_TABLES
1599         set_class_tag(rt, fib_rules_tclass(res));
1600 #endif
1601         set_class_tag(rt, itag);
1602 #endif
1603         rt->rt_type = res->type;
1604 }
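/*
 *	The metric defaulting above works out, for a plain 1500-byte
 *	Ethernet MTU, to an advertised MSS of 1500 - 40 = 1460 (40 =
 *	20-byte IP header + 20-byte TCP header), floored at
 *	ip_rt_min_advmss and capped at 65535 - 40.  The same arithmetic
 *	as a standalone helper:
 */
#if 0	/* illustrative sketch only, not built */
static unsigned int default_advmss(unsigned int dev_mtu,
				   unsigned int min_advmss)
{
	unsigned int advmss = dev_mtu - 40;	/* e.g. 1500 -> 1460 */

	if (advmss < min_advmss)
		advmss = min_advmss;
	if (advmss > 65535 - 40)
		advmss = 65535 - 40;
	return advmss;
}
#endif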
1605
1606 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1607                                 u8 tos, struct net_device *dev, int our)
1608 {
1609         unsigned hash;
1610         struct rtable *rth;
1611         __be32 spec_dst;
1612         struct in_device *in_dev = in_dev_get(dev);
1613         u32 itag = 0;
1614
1615         /* Primary sanity checks. */
1616
1617         if (in_dev == NULL)
1618                 return -EINVAL;
1619
1620         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1621             skb->protocol != htons(ETH_P_IP))
1622                 goto e_inval;
1623
1624         if (ZERONET(saddr)) {
1625                 if (!LOCAL_MCAST(daddr))
1626                         goto e_inval;
1627                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1628         } else if (fib_validate_source(saddr, 0, tos, 0,
1629                                         dev, &spec_dst, &itag) < 0)
1630                 goto e_inval;
1631
1632         rth = dst_alloc(&ipv4_dst_ops);
1633         if (!rth)
1634                 goto e_nobufs;
1635
1636         rth->u.dst.output= ip_rt_bug;
1637
1638         atomic_set(&rth->u.dst.__refcnt, 1);
1639         rth->u.dst.flags= DST_HOST;
1640         if (in_dev->cnf.no_policy)
1641                 rth->u.dst.flags |= DST_NOPOLICY;
1642         rth->fl.fl4_dst = daddr;
1643         rth->rt_dst     = daddr;
1644         rth->fl.fl4_tos = tos;
1645         rth->fl.mark    = skb->mark;
1646         rth->fl.fl4_src = saddr;
1647         rth->rt_src     = saddr;
1648 #ifdef CONFIG_NET_CLS_ROUTE
1649         rth->u.dst.tclassid = itag;
1650 #endif
1651         rth->rt_iif     =
1652         rth->fl.iif     = dev->ifindex;
1653         rth->u.dst.dev  = &loopback_dev;
1654         dev_hold(rth->u.dst.dev);
1655         rth->idev       = in_dev_get(rth->u.dst.dev);
1656         rth->fl.oif     = 0;
1657         rth->rt_gateway = daddr;
1658         rth->rt_spec_dst= spec_dst;
1659         rth->rt_type    = RTN_MULTICAST;
1660         rth->rt_flags   = RTCF_MULTICAST;
1661         if (our) {
1662                 rth->u.dst.input= ip_local_deliver;
1663                 rth->rt_flags |= RTCF_LOCAL;
1664         }
1665
1666 #ifdef CONFIG_IP_MROUTE
1667         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1668                 rth->u.dst.input = ip_mr_input;
1669 #endif
1670         RT_CACHE_STAT_INC(in_slow_mc);
1671
1672         in_dev_put(in_dev);
1673         hash = rt_hash(daddr, saddr, dev->ifindex);
1674         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1675
1676 e_nobufs:
1677         in_dev_put(in_dev);
1678         return -ENOBUFS;
1679
1680 e_inval:
1681         in_dev_put(in_dev);
1682         return -EINVAL;
1683 }
1684
1685
1686 static void ip_handle_martian_source(struct net_device *dev,
1687                                      struct in_device *in_dev,
1688                                      struct sk_buff *skb,
1689                                      __be32 daddr,
1690                                      __be32 saddr)
1691 {
1692         RT_CACHE_STAT_INC(in_martian_src);
1693 #ifdef CONFIG_IP_ROUTE_VERBOSE
1694         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1695                 /*
1696                  *      RFC1812 recommendation, if source is martian,
1697                  *      the only hint is MAC header.
1698                  */
1699                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1700                         "%u.%u.%u.%u, on dev %s\n",
1701                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1702                 if (dev->hard_header_len && skb->mac.raw) {
1703                         int i;
1704                         unsigned char *p = skb->mac.raw;
1705                         printk(KERN_WARNING "ll header: ");
1706                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1707                                 printk("%02x", *p);
1708                                 if (i < (dev->hard_header_len - 1))
1709                                         printk(":");
1710                         }
1711                         printk("\n");
1712                 }
1713         }
1714 #endif
1715 }
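/*
 *	Per the RFC 1812 note above, the link-layer header is the only
 *	hint for a spoofed source, so it is dumped byte by byte, colon
 *	separated.  The same dump as a standalone routine (userspace
 *	sketch):
 */
#if 0	/* illustrative sketch only, not built */
#include <stdio.h>

static void dump_ll_header(const unsigned char *p, int len)
{
	int i;

	for (i = 0; i < len; i++)
		printf("%02x%s", p[i], i < len - 1 ? ":" : "\n");
}
#endif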
1716
1717 static inline int __mkroute_input(struct sk_buff *skb,
1718                                   struct fib_result* res,
1719                                   struct in_device *in_dev,
1720                                   __be32 daddr, __be32 saddr, u32 tos,
1721                                   struct rtable **result)
1722 {
1723
1724         struct rtable *rth;
1725         int err;
1726         struct in_device *out_dev;
1727         unsigned flags = 0;
1728         __be32 spec_dst;
1729         u32 itag;
1730
1731         /* get a working reference to the output device */
1732         out_dev = in_dev_get(FIB_RES_DEV(*res));
1733         if (out_dev == NULL) {
1734                 if (net_ratelimit())
1735                         printk(KERN_CRIT "Bug in ip_route_input" \
1736                                "_slow(). Please report.\n");
1737                 return -EINVAL;
1738         }
1739
1740
1741         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1742                                   in_dev->dev, &spec_dst, &itag);
1743         if (err < 0) {
1744                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1745                                          saddr);
1746
1747                 err = -EINVAL;
1748                 goto cleanup;
1749         }
1750
1751         if (err)
1752                 flags |= RTCF_DIRECTSRC;
1753
1754         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1755             (IN_DEV_SHARED_MEDIA(out_dev) ||
1756              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1757                 flags |= RTCF_DOREDIRECT;
1758
1759         if (skb->protocol != htons(ETH_P_IP)) {
1760                 /* Not IP (i.e. ARP). Do not create a route if it is
1761                  * invalid for proxy arp. DNAT routes are always valid.
1762                  */
1763                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1764                         err = -EINVAL;
1765                         goto cleanup;
1766                 }
1767         }
1768
1769
1770         rth = dst_alloc(&ipv4_dst_ops);
1771         if (!rth) {
1772                 err = -ENOBUFS;
1773                 goto cleanup;
1774         }
1775
1776         atomic_set(&rth->u.dst.__refcnt, 1);
1777         rth->u.dst.flags= DST_HOST;
1778 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1779         if (res->fi->fib_nhs > 1)
1780                 rth->u.dst.flags |= DST_BALANCED;
1781 #endif
1782         if (in_dev->cnf.no_policy)
1783                 rth->u.dst.flags |= DST_NOPOLICY;
1784         if (out_dev->cnf.no_xfrm)
1785                 rth->u.dst.flags |= DST_NOXFRM;
1786         rth->fl.fl4_dst = daddr;
1787         rth->rt_dst     = daddr;
1788         rth->fl.fl4_tos = tos;
1789         rth->fl.mark    = skb->mark;
1790         rth->fl.fl4_src = saddr;
1791         rth->rt_src     = saddr;
1792         rth->rt_gateway = daddr;
1793         rth->rt_iif     =
1794                 rth->fl.iif     = in_dev->dev->ifindex;
1795         rth->u.dst.dev  = (out_dev)->dev;
1796         dev_hold(rth->u.dst.dev);
1797         rth->idev       = in_dev_get(rth->u.dst.dev);
1798         rth->fl.oif     = 0;
1799         rth->rt_spec_dst= spec_dst;
1800
1801         rth->u.dst.input = ip_forward;
1802         rth->u.dst.output = ip_output;
1803
1804         rt_set_nexthop(rth, res, itag);
1805
1806         rth->rt_flags = flags;
1807
1808         *result = rth;
1809         err = 0;
1810  cleanup:
1811         /* release the working reference to the output device */
1812         in_dev_put(out_dev);
1813         return err;
1814 }
1815
1816 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1817                                        struct fib_result* res,
1818                                        const struct flowi *fl,
1819                                        struct in_device *in_dev,
1820                                        __be32 daddr, __be32 saddr, u32 tos)
1821 {
1822         struct rtable* rth = NULL;
1823         int err;
1824         unsigned hash;
1825
1826 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1827         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1828                 fib_select_multipath(fl, res);
1829 #endif
1830
1831         /* create a routing cache entry */
1832         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1833         if (err)
1834                 return err;
1835
1836         /* put it into the cache */
1837         hash = rt_hash(daddr, saddr, fl->iif);
1838         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1839 }
1840
1841 static inline int ip_mkroute_input(struct sk_buff *skb,
1842                                    struct fib_result* res,
1843                                    const struct flowi *fl,
1844                                    struct in_device *in_dev,
1845                                    __be32 daddr, __be32 saddr, u32 tos)
1846 {
1847 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1848         struct rtable* rth = NULL, *rtres;
1849         unsigned char hop, hopcount;
1850         int err = -EINVAL;
1851         unsigned int hash;
1852
1853         if (res->fi)
1854                 hopcount = res->fi->fib_nhs;
1855         else
1856                 hopcount = 1;
1857
1858         /* distinguish between multipath and singlepath */
1859         if (hopcount < 2)
1860                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1861                                             saddr, tos);
1862
1863         /* add all alternatives to the routing cache */
1864         for (hop = 0; hop < hopcount; hop++) {
1865                 res->nh_sel = hop;
1866
1867                 /* put reference to previous result */
1868                 if (hop)
1869                         ip_rt_put(rtres);
1870
1871                 /* create a routing cache entry */
1872                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1873                                       &rth);
1874                 if (err)
1875                         return err;
1876
1877                 /* put it into the cache */
1878                 hash = rt_hash(daddr, saddr, fl->iif);
1879                 err = rt_intern_hash(hash, rth, &rtres);
1880                 if (err)
1881                         return err;
1882
1883                 /* forward hop information to multipath impl. */
1884                 multipath_set_nhinfo(rth,
1885                                      FIB_RES_NETWORK(*res),
1886                                      FIB_RES_NETMASK(*res),
1887                                      res->prefixlen,
1888                                      &FIB_RES_NH(*res));
1889         }
1890         skb->dst = &rtres->u.dst;
1891         return err;
1892 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1893         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1894 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1895 }
1896
1897
1898 /*
1899  *      NOTE. We drop all packets that have local source
1900  *      addresses, because every properly looped-back packet
1901  *      must already have the correct destination attached by the output routine.
1902  *
1903  *      This approach solves two big problems:
1904  *      1. Non-simplex devices are handled properly.
1905  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1906  */
1907
1908 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1909                                u8 tos, struct net_device *dev)
1910 {
1911         struct fib_result res;
1912         struct in_device *in_dev = in_dev_get(dev);
1913         struct flowi fl = { .nl_u = { .ip4_u =
1914                                       { .daddr = daddr,
1915                                         .saddr = saddr,
1916                                         .tos = tos,
1917                                         .scope = RT_SCOPE_UNIVERSE,
1918                                       } },
1919                             .mark = skb->mark,
1920                             .iif = dev->ifindex };
1921         unsigned        flags = 0;
1922         u32             itag = 0;
1923         struct rtable * rth;
1924         unsigned        hash;
1925         __be32          spec_dst;
1926         int             err = -EINVAL;
1927         int             free_res = 0;
1928
1929         /* IP on this device is disabled. */
1930
1931         if (!in_dev)
1932                 goto out;
1933
1934         /* Check for the weirdest martians, which cannot be detected
1935            by fib_lookup.
1936          */
1937
1938         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939                 goto martian_source;
1940
1941         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1942                 goto brd_input;
1943
1944         /* Accept zero addresses only for limited broadcast;
1945          * I do not even know whether to fix this or not. Waiting for complaints :-)
1946          */
1947         if (ZERONET(saddr))
1948                 goto martian_source;
1949
1950         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951                 goto martian_destination;
1952
1953         /*
1954          *      Now we are ready to route packet.
1955          */
1956         if ((err = fib_lookup(&fl, &res)) != 0) {
1957                 if (!IN_DEV_FORWARD(in_dev))
1958                         goto e_hostunreach;
1959                 goto no_route;
1960         }
1961         free_res = 1;
1962
1963         RT_CACHE_STAT_INC(in_slow_tot);
1964
1965         if (res.type == RTN_BROADCAST)
1966                 goto brd_input;
1967
1968         if (res.type == RTN_LOCAL) {
1969                 int result;
1970                 result = fib_validate_source(saddr, daddr, tos,
1971                                              loopback_dev.ifindex,
1972                                              dev, &spec_dst, &itag);
1973                 if (result < 0)
1974                         goto martian_source;
1975                 if (result)
1976                         flags |= RTCF_DIRECTSRC;
1977                 spec_dst = daddr;
1978                 goto local_input;
1979         }
1980
1981         if (!IN_DEV_FORWARD(in_dev))
1982                 goto e_hostunreach;
1983         if (res.type != RTN_UNICAST)
1984                 goto martian_destination;
1985
1986         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987         if (err == -ENOBUFS)
1988                 goto e_nobufs;
1989         if (err == -EINVAL)
1990                 goto e_inval;
1991
1992 done:
1993         in_dev_put(in_dev);
1994         if (free_res)
1995                 fib_res_put(&res);
1996 out:    return err;
1997
1998 brd_input:
1999         if (skb->protocol != htons(ETH_P_IP))
2000                 goto e_inval;
2001
2002         if (ZERONET(saddr))
2003                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004         else {
2005                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2006                                           &itag);
2007                 if (err < 0)
2008                         goto martian_source;
2009                 if (err)
2010                         flags |= RTCF_DIRECTSRC;
2011         }
2012         flags |= RTCF_BROADCAST;
2013         res.type = RTN_BROADCAST;
2014         RT_CACHE_STAT_INC(in_brd);
2015
2016 local_input:
2017         rth = dst_alloc(&ipv4_dst_ops);
2018         if (!rth)
2019                 goto e_nobufs;
2020
2021         rth->u.dst.output= ip_rt_bug;
2022
2023         atomic_set(&rth->u.dst.__refcnt, 1);
2024         rth->u.dst.flags= DST_HOST;
2025         if (in_dev->cnf.no_policy)
2026                 rth->u.dst.flags |= DST_NOPOLICY;
2027         rth->fl.fl4_dst = daddr;
2028         rth->rt_dst     = daddr;
2029         rth->fl.fl4_tos = tos;
2030         rth->fl.mark    = skb->mark;
2031         rth->fl.fl4_src = saddr;
2032         rth->rt_src     = saddr;
2033 #ifdef CONFIG_NET_CLS_ROUTE
2034         rth->u.dst.tclassid = itag;
2035 #endif
2036         rth->rt_iif     =
2037         rth->fl.iif     = dev->ifindex;
2038         rth->u.dst.dev  = &loopback_dev;
2039         dev_hold(rth->u.dst.dev);
2040         rth->idev       = in_dev_get(rth->u.dst.dev);
2041         rth->rt_gateway = daddr;
2042         rth->rt_spec_dst= spec_dst;
2043         rth->u.dst.input= ip_local_deliver;
2044         rth->rt_flags   = flags|RTCF_LOCAL;
2045         if (res.type == RTN_UNREACHABLE) {
2046                 rth->u.dst.input= ip_error;
2047                 rth->u.dst.error= -err;
2048                 rth->rt_flags   &= ~RTCF_LOCAL;
2049         }
2050         rth->rt_type    = res.type;
2051         hash = rt_hash(daddr, saddr, fl.iif);
2052         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053         goto done;
2054
2055 no_route:
2056         RT_CACHE_STAT_INC(in_no_route);
2057         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058         res.type = RTN_UNREACHABLE;
2059         goto local_input;
2060
2061         /*
2062          *      Do not cache martian addresses: they should be logged (RFC1812)
2063          */
2064 martian_destination:
2065         RT_CACHE_STAT_INC(in_martian_dst);
2066 #ifdef CONFIG_IP_ROUTE_VERBOSE
2067         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069                         "%u.%u.%u.%u, dev %s\n",
2070                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071 #endif
2072
2073 e_hostunreach:
2074         err = -EHOSTUNREACH;
2075         goto done;
2076
2077 e_inval:
2078         err = -EINVAL;
2079         goto done;
2080
2081 e_nobufs:
2082         err = -ENOBUFS;
2083         goto done;
2084
2085 martian_source:
2086         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087         goto e_inval;
2088 }
2089
2090 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091                    u8 tos, struct net_device *dev)
2092 {
2093         struct rtable * rth;
2094         unsigned        hash;
2095         int iif = dev->ifindex;
2096
2097         tos &= IPTOS_RT_MASK;
2098         hash = rt_hash(daddr, saddr, iif);
2099
2100         rcu_read_lock();
2101         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102              rth = rcu_dereference(rth->u.dst.rt_next)) {
2103                 if (rth->fl.fl4_dst == daddr &&
2104                     rth->fl.fl4_src == saddr &&
2105                     rth->fl.iif == iif &&
2106                     rth->fl.oif == 0 &&
2107                     rth->fl.mark == skb->mark &&
2108                     rth->fl.fl4_tos == tos) {
2109                         rth->u.dst.lastuse = jiffies;
2110                         dst_hold(&rth->u.dst);
2111                         rth->u.dst.__use++;
2112                         RT_CACHE_STAT_INC(in_hit);
2113                         rcu_read_unlock();
2114                         skb->dst = (struct dst_entry*)rth;
2115                         return 0;
2116                 }
2117                 RT_CACHE_STAT_INC(in_hlist_search);
2118         }
2119         rcu_read_unlock();
2120
2121         /* Multicast recognition logic is moved from the route cache to here.
2122            The problem was that too many Ethernet cards have broken/missing
2123            hardware multicast filters :-( As a result, a host on a multicast
2124            network acquires a lot of useless route cache entries, e.g. for
2125            SDR messages from all over the world. Now we try to get rid of them.
2126            Really, provided the software IP multicast filter is organized
2127            reasonably (at least, hashed), it does not result in a slowdown
2128            compared with route cache reject entries.
2129            Note that multicast routers are not affected, because a
2130            route cache entry is eventually created.
2131          */
2132         if (MULTICAST(daddr)) {
2133                 struct in_device *in_dev;
2134
2135                 rcu_read_lock();
2136                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2137                         int our = ip_check_mc(in_dev, daddr, saddr,
2138                                 skb->nh.iph->protocol);
2139                         if (our
2140 #ifdef CONFIG_IP_MROUTE
2141                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2142 #endif
2143                             ) {
2144                                 rcu_read_unlock();
2145                                 return ip_route_input_mc(skb, daddr, saddr,
2146                                                          tos, dev, our);
2147                         }
2148                 }
2149                 rcu_read_unlock();
2150                 return -EINVAL;
2151         }
2152         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2153 }
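/*
 *	The cache probe above matches the full flow key (daddr, saddr,
 *	iif, oif == 0, mark, tos); only a miss falls through to the
 *	multicast classification or the slow path.  MULTICAST() is the
 *	class-D test, shown here as a standalone helper (addresses in
 *	network byte order; userspace sketch):
 */
#if 0	/* illustrative sketch only, not built */
#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

static bool is_multicast_be(uint32_t addr_be)
{
	/* 224.0.0.0/4: the top nibble of the first octet is 1110 */
	return (addr_be & htonl(0xf0000000)) == htonl(0xe0000000);
}
#endif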
2154
2155 static inline int __mkroute_output(struct rtable **result,
2156                                    struct fib_result* res,
2157                                    const struct flowi *fl,
2158                                    const struct flowi *oldflp,
2159                                    struct net_device *dev_out,
2160                                    unsigned flags)
2161 {
2162         struct rtable *rth;
2163         struct in_device *in_dev;
2164         u32 tos = RT_FL_TOS(oldflp);
2165         int err = 0;
2166
2167         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2168                 return -EINVAL;
2169
2170         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2171                 res->type = RTN_BROADCAST;
2172         else if (MULTICAST(fl->fl4_dst))
2173                 res->type = RTN_MULTICAST;
2174         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2175                 return -EINVAL;
2176
2177         if (dev_out->flags & IFF_LOOPBACK)
2178                 flags |= RTCF_LOCAL;
2179
2180         /* get work reference to inet device */
2181         in_dev = in_dev_get(dev_out);
2182         if (!in_dev)
2183                 return -EINVAL;
2184
2185         if (res->type == RTN_BROADCAST) {
2186                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2187                 if (res->fi) {
2188                         fib_info_put(res->fi);
2189                         res->fi = NULL;
2190                 }
2191         } else if (res->type == RTN_MULTICAST) {
2192                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2193                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2194                                  oldflp->proto))
2195                         flags &= ~RTCF_LOCAL;
2196                 /* If a multicast route does not exist, use the
2197                    default one, but do not gateway in this case.
2198                    Yes, it is a hack.
2199                  */
2200                 if (res->fi && res->prefixlen < 4) {
2201                         fib_info_put(res->fi);
2202                         res->fi = NULL;
2203                 }
2204         }
2205
2206
2207         rth = dst_alloc(&ipv4_dst_ops);
2208         if (!rth) {
2209                 err = -ENOBUFS;
2210                 goto cleanup;
2211         }
2212
2213         atomic_set(&rth->u.dst.__refcnt, 1);
2214         rth->u.dst.flags= DST_HOST;
2215 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2216         if (res->fi) {
2217                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2218                 if (res->fi->fib_nhs > 1)
2219                         rth->u.dst.flags |= DST_BALANCED;
2220         }
2221 #endif
2222         if (in_dev->cnf.no_xfrm)
2223                 rth->u.dst.flags |= DST_NOXFRM;
2224         if (in_dev->cnf.no_policy)
2225                 rth->u.dst.flags |= DST_NOPOLICY;
2226
2227         rth->fl.fl4_dst = oldflp->fl4_dst;
2228         rth->fl.fl4_tos = tos;
2229         rth->fl.fl4_src = oldflp->fl4_src;
2230         rth->fl.oif     = oldflp->oif;
2231         rth->fl.mark    = oldflp->mark;
2232         rth->rt_dst     = fl->fl4_dst;
2233         rth->rt_src     = fl->fl4_src;
2234         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2235         /* get references to the devices that are to be held by the routing
2236            cache entry */
2237         rth->u.dst.dev  = dev_out;
2238         dev_hold(dev_out);
2239         rth->idev       = in_dev_get(dev_out);
2240         rth->rt_gateway = fl->fl4_dst;
2241         rth->rt_spec_dst= fl->fl4_src;
2242
2243         rth->u.dst.output=ip_output;
2244
2245         RT_CACHE_STAT_INC(out_slow_tot);
2246
2247         if (flags & RTCF_LOCAL) {
2248                 rth->u.dst.input = ip_local_deliver;
2249                 rth->rt_spec_dst = fl->fl4_dst;
2250         }
2251         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252                 rth->rt_spec_dst = fl->fl4_src;
2253                 if (flags & RTCF_LOCAL &&
2254                     !(dev_out->flags & IFF_LOOPBACK)) {
2255                         rth->u.dst.output = ip_mc_output;
2256                         RT_CACHE_STAT_INC(out_slow_mc);
2257                 }
2258 #ifdef CONFIG_IP_MROUTE
2259                 if (res->type == RTN_MULTICAST) {
2260                         if (IN_DEV_MFORWARD(in_dev) &&
2261                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2262                                 rth->u.dst.input = ip_mr_input;
2263                                 rth->u.dst.output = ip_mc_output;
2264                         }
2265                 }
2266 #endif
2267         }
2268
2269         rt_set_nexthop(rth, res, 0);
2270
2271         rth->rt_flags = flags;
2272
2273         *result = rth;
2274  cleanup:
2275         /* release work reference to inet device */
2276         in_dev_put(in_dev);
2277
2278         return err;
2279 }
2280
2281 static inline int ip_mkroute_output_def(struct rtable **rp,
2282                                         struct fib_result* res,
2283                                         const struct flowi *fl,
2284                                         const struct flowi *oldflp,
2285                                         struct net_device *dev_out,
2286                                         unsigned flags)
2287 {
2288         struct rtable *rth = NULL;
2289         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2290         unsigned hash;
2291         if (err == 0) {
2292                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2293                 err = rt_intern_hash(hash, rth, rp);
2294         }
2295
2296         return err;
2297 }
2298
2299 static inline int ip_mkroute_output(struct rtable** rp,
2300                                     struct fib_result* res,
2301                                     const struct flowi *fl,
2302                                     const struct flowi *oldflp,
2303                                     struct net_device *dev_out,
2304                                     unsigned flags)
2305 {
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2307         unsigned char hop;
2308         unsigned hash;
2309         int err = -EINVAL;
2310         struct rtable *rth = NULL;
2311
2312         if (res->fi && res->fi->fib_nhs > 1) {
2313                 unsigned char hopcount = res->fi->fib_nhs;
2314
2315                 for (hop = 0; hop < hopcount; hop++) {
2316                         struct net_device *dev2nexthop;
2317
2318                         res->nh_sel = hop;
2319
2320                         /* hold a work reference to the output device */
2321                         dev2nexthop = FIB_RES_DEV(*res);
2322                         dev_hold(dev2nexthop);
2323
2324                         /* put reference to previous result */
2325                         if (hop)
2326                                 ip_rt_put(*rp);
2327
2328                         err = __mkroute_output(&rth, res, fl, oldflp,
2329                                                dev2nexthop, flags);
2330
2331                         if (err != 0)
2332                                 goto cleanup;
2333
2334                         hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2335                                         oldflp->oif);
2336                         err = rt_intern_hash(hash, rth, rp);
2337
2338                         /* forward hop information to multipath impl. */
2339                         multipath_set_nhinfo(rth,
2340                                              FIB_RES_NETWORK(*res),
2341                                              FIB_RES_NETMASK(*res),
2342                                              res->prefixlen,
2343                                              &FIB_RES_NH(*res));
2344                 cleanup:
2345                         /* release work reference to output device */
2346                         dev_put(dev2nexthop);
2347
2348                         if (err != 0)
2349                                 return err;
2350                 }
2351                 return err;
2352         } else {
2353                 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2354                                              flags);
2355         }
2356 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2357         return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2358 #endif
2359 }
2360
2361 /*
2362  * Major route resolver routine.
2363  */
2364
2365 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2366 {
2367         u32 tos = RT_FL_TOS(oldflp);
2368         struct flowi fl = { .nl_u = { .ip4_u =
2369                                       { .daddr = oldflp->fl4_dst,
2370                                         .saddr = oldflp->fl4_src,
2371                                         .tos = tos & IPTOS_RT_MASK,
2372                                         .scope = ((tos & RTO_ONLINK) ?
2373                                                   RT_SCOPE_LINK :
2374                                                   RT_SCOPE_UNIVERSE),
2375                                       } },
2376                             .mark = oldflp->mark,
2377                             .iif = loopback_dev.ifindex,
2378                             .oif = oldflp->oif };
2379         struct fib_result res;
2380         unsigned flags = 0;
2381         struct net_device *dev_out = NULL;
2382         int free_res = 0;
2383         int err;
2384
2385
2386         res.fi          = NULL;
2387 #ifdef CONFIG_IP_MULTIPLE_TABLES
2388         res.r           = NULL;
2389 #endif
2390
2391         if (oldflp->fl4_src) {
2392                 err = -EINVAL;
2393                 if (MULTICAST(oldflp->fl4_src) ||
2394                     BADCLASS(oldflp->fl4_src) ||
2395                     ZERONET(oldflp->fl4_src))
2396                         goto out;
2397
2398                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399                 dev_out = ip_dev_find(oldflp->fl4_src);
2400                 if (dev_out == NULL)
2401                         goto out;
2402
2403                 /* I removed the check for oif == dev_out->oif here.
2404                    It was wrong for two reasons:
2405                    1. ip_dev_find(saddr) can return the wrong iface if saddr
2406                       is assigned to multiple interfaces.
2407                    2. Moreover, we are allowed to send packets with the
2408                       saddr of another iface. --ANK
2409                  */
2410
2411                 if (oldflp->oif == 0
2412                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2413                         /* Special hack: the user can direct multicasts
2414                            and limited broadcasts via the necessary interface
2415                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416                            This hack is not just for fun; it allows
2417                            vic, vat and friends to work.
2418                            They bind a socket to loopback, set the ttl to zero
2419                            and expect that it will work.
2420                            From the viewpoint of the routing cache they are broken,
2421                            because we are not allowed to build a multicast path
2422                            with a loopback source addr (look, the routing cache
2423                            cannot know that the ttl is zero, so the packet
2424                            will not leave this host and the route is valid).
2425                            Luckily, this hack is a good workaround.
2426                          */
2427
2428                         fl.oif = dev_out->ifindex;
2429                         goto make_route;
2430                 }
2431                 if (dev_out)
2432                         dev_put(dev_out);
2433                 dev_out = NULL;
2434         }
2435
2436
2437         if (oldflp->oif) {
2438                 dev_out = dev_get_by_index(oldflp->oif);
2439                 err = -ENODEV;
2440                 if (dev_out == NULL)
2441                         goto out;
2442
2443                 /* RACE: Check return value of inet_select_addr instead. */
2444                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2445                         dev_put(dev_out);
2446                         goto out;       /* Wrong error code */
2447                 }
2448
2449                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2450                         if (!fl.fl4_src)
2451                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2452                                                               RT_SCOPE_LINK);
2453                         goto make_route;
2454                 }
2455                 if (!fl.fl4_src) {
2456                         if (MULTICAST(oldflp->fl4_dst))
2457                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2458                                                               fl.fl4_scope);
2459                         else if (!oldflp->fl4_dst)
2460                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2461                                                               RT_SCOPE_HOST);
2462                 }
2463         }
2464
2465         if (!fl.fl4_dst) {
2466                 fl.fl4_dst = fl.fl4_src;
2467                 if (!fl.fl4_dst)
2468                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469                 if (dev_out)
2470                         dev_put(dev_out);
2471                 dev_out = &loopback_dev;
2472                 dev_hold(dev_out);
2473                 fl.oif = loopback_dev.ifindex;
2474                 res.type = RTN_LOCAL;
2475                 flags |= RTCF_LOCAL;
2476                 goto make_route;
2477         }
2478
2479         if (fib_lookup(&fl, &res)) {
2480                 res.fi = NULL;
2481                 if (oldflp->oif) {
2482                         /* Apparently, the routing tables are wrong. Assume
2483                            that the destination is on-link.
2484
2485                            WHY? DW.
2486                            Because we are allowed to send to an iface
2487                            even if it has NO routes and NO assigned
2488                            addresses. When oif is specified, the routing
2489                            tables are looked up with only one purpose:
2490                            to catch whether the destination is gatewayed
2491                            rather than direct. Moreover, if MSG_DONTROUTE is
2492                            set, we send the packet, ignoring both routing
2493                            tables and ifaddr state. --ANK
2494
2495
2496                            We could do this even if oif is unknown
2497                            (likely the IPv6 case), but we do not.
2498                          */
2499
2500                         if (fl.fl4_src == 0)
2501                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2502                                                               RT_SCOPE_LINK);
2503                         res.type = RTN_UNICAST;
2504                         goto make_route;
2505                 }
2506                 if (dev_out)
2507                         dev_put(dev_out);
2508                 err = -ENETUNREACH;
2509                 goto out;
2510         }
2511         free_res = 1;
2512
2513         if (res.type == RTN_LOCAL) {
2514                 if (!fl.fl4_src)
2515                         fl.fl4_src = fl.fl4_dst;
2516                 if (dev_out)
2517                         dev_put(dev_out);
2518                 dev_out = &loopback_dev;
2519                 dev_hold(dev_out);
2520                 fl.oif = dev_out->ifindex;
2521                 if (res.fi)
2522                         fib_info_put(res.fi);
2523                 res.fi = NULL;
2524                 flags |= RTCF_LOCAL;
2525                 goto make_route;
2526         }
2527
2528 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2529         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530                 fib_select_multipath(&fl, &res);
2531         else
2532 #endif
2533         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534                 fib_select_default(&fl, &res);
2535
2536         if (!fl.fl4_src)
2537                 fl.fl4_src = FIB_RES_PREFSRC(res);
2538
2539         if (dev_out)
2540                 dev_put(dev_out);
2541         dev_out = FIB_RES_DEV(res);
2542         dev_hold(dev_out);
2543         fl.oif = dev_out->ifindex;
2544
2545
2546 make_route:
2547         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2548
2549
2550         if (free_res)
2551                 fib_res_put(&res);
2552         if (dev_out)
2553                 dev_put(dev_out);
2554 out:    return err;
2555 }
2556
2557 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2558 {
2559         unsigned hash;
2560         struct rtable *rth;
2561
2562         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2563
2564         rcu_read_lock_bh();
2565         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2567                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568                     rth->fl.fl4_src == flp->fl4_src &&
2569                     rth->fl.iif == 0 &&
2570                     rth->fl.oif == flp->oif &&
2571                     rth->fl.mark == flp->mark &&
2572                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2573                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2574
2575                         /* check for multipath routes and choose one if
2576                          * necessary
2577                          */
2578                         if (multipath_select_route(flp, rth, rp)) {
2579                                 dst_hold(&(*rp)->u.dst);
2580                                 RT_CACHE_STAT_INC(out_hit);
2581                                 rcu_read_unlock_bh();
2582                                 return 0;
2583                         }
2584
2585                         rth->u.dst.lastuse = jiffies;
2586                         dst_hold(&rth->u.dst);
2587                         rth->u.dst.__use++;
2588                         RT_CACHE_STAT_INC(out_hit);
2589                         rcu_read_unlock_bh();
2590                         *rp = rth;
2591                         return 0;
2592                 }
2593                 RT_CACHE_STAT_INC(out_hlist_search);
2594         }
2595         rcu_read_unlock_bh();
2596
2597         return ip_route_output_slow(rp, flp);
2598 }
2599
2600 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2601
2602 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2603 {
2604         int err;
2605
2606         if ((err = __ip_route_output_key(rp, flp)) != 0)
2607                 return err;
2608
2609         if (flp->proto) {
2610                 if (!flp->fl4_src)
2611                         flp->fl4_src = (*rp)->rt_src;
2612                 if (!flp->fl4_dst)
2613                         flp->fl4_dst = (*rp)->rt_dst;
2614                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2615         }
2616
2617         return 0;
2618 }
2619
2620 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2621
2622 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2623 {
2624         return ip_route_output_flow(rp, flp, NULL, 0);
2625 }
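/*
 *	Typical in-kernel usage of the three output-route entry points
 *	above: fill a struct flowi with the flow key and receive a held
 *	rtable on success.  A hedged sketch modelled on contemporary
 *	callers in this tree (error handling trimmed; not a complete
 *	unit):
 */
#if 0	/* illustrative sketch only, not built */
static int example_route_to(__be32 daddr, __be32 saddr, int oif)
{
	struct rtable *rt;
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr } },
			    .oif = oif };
	int err = ip_route_output_key(&rt, &fl);

	if (err)
		return err;		/* e.g. -ENETUNREACH */
	/* ... transmit via rt->u.dst ... */
	ip_rt_put(rt);			/* drop the reference we were given */
	return 0;
}
#endif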
2626
2627 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2628                         int nowait, unsigned int flags)
2629 {
2630         struct rtable *rt = (struct rtable*)skb->dst;
2631         struct rtmsg *r;
2632         struct nlmsghdr *nlh;
2633         long expires;
2634         u32 id = 0, ts = 0, tsage = 0, error;
2635
2636         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2637         if (nlh == NULL)
2638                 return -EMSGSIZE;
2639
2640         r = nlmsg_data(nlh);
2641         r->rtm_family    = AF_INET;
2642         r->rtm_dst_len  = 32;
2643         r->rtm_src_len  = 0;
2644         r->rtm_tos      = rt->fl.fl4_tos;
2645         r->rtm_table    = RT_TABLE_MAIN;
2646         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2647         r->rtm_type     = rt->rt_type;
2648         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2649         r->rtm_protocol = RTPROT_UNSPEC;
2650         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651         if (rt->rt_flags & RTCF_NOTIFY)
2652                 r->rtm_flags |= RTM_F_NOTIFY;
2653
2654         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2655
2656         if (rt->fl.fl4_src) {
2657                 r->rtm_src_len = 32;
2658                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2659         }
2660         if (rt->u.dst.dev)
2661                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2662 #ifdef CONFIG_NET_CLS_ROUTE
2663         if (rt->u.dst.tclassid)
2664                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2665 #endif
2666 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2667         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2668                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2669 #endif
2670         if (rt->fl.iif)
2671                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2672         else if (rt->rt_src != rt->fl.fl4_src)
2673                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2674
2675         if (rt->rt_dst != rt->rt_gateway)
2676                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2677
2678         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2679                 goto nla_put_failure;
2680
2681         error = rt->u.dst.error;
2682         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2683         if (rt->peer) {
2684                 id = rt->peer->ip_id_count;
2685                 if (rt->peer->tcp_ts_stamp) {
2686                         ts = rt->peer->tcp_ts;
2687                         tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2688                 }
2689         }
2690
2691         if (rt->fl.iif) {
2692 #ifdef CONFIG_IP_MROUTE
2693                 __be32 dst = rt->rt_dst;
2694
2695                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2696                     ipv4_devconf.mc_forwarding) {
2697                         int err = ipmr_get_route(skb, r, nowait);
2698                         if (err <= 0) {
2699                                 if (!nowait) {
2700                                         if (err == 0)
2701                                                 return 0;
2702                                         goto nla_put_failure;
2703                                 } else {
2704                                         if (err == -EMSGSIZE)
2705                                                 goto nla_put_failure;
2706                                         error = err;
2707                                 }
2708                         }
2709                 } else
2710 #endif
2711                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2712         }
2713
2714         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2715                                expires, error) < 0)
2716                 goto nla_put_failure;
2717
2718         return nlmsg_end(skb, nlh);
2719
2720 nla_put_failure:
2721         nlmsg_cancel(skb, nlh);
2722         return -EMSGSIZE;
2723 }
2724
2725 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2726 {
2727         struct rtmsg *rtm;
2728         struct nlattr *tb[RTA_MAX+1];
2729         struct rtable *rt = NULL;
2730         __be32 dst = 0;
2731         __be32 src = 0;
2732         u32 iif;
2733         int err;
2734         struct sk_buff *skb;
2735
2736         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2737         if (err < 0)
2738                 goto errout;
2739
2740         rtm = nlmsg_data(nlh);
2741
2742         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2743         if (skb == NULL) {
2744                 err = -ENOBUFS;
2745                 goto errout;
2746         }
2747
2748         /* Reserve room for dummy headers; this skb can pass
2749            through a good chunk of the routing engine.
2750          */
2751         skb->mac.raw = skb->nh.raw = skb->data;
2752
2753         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2754         skb->nh.iph->protocol = IPPROTO_ICMP;
2755         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2756
2757         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2758         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2759         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2760
2761         if (iif) {
2762                 struct net_device *dev;
2763
2764                 dev = __dev_get_by_index(iif);
2765                 if (dev == NULL) {
2766                         err = -ENODEV;
2767                         goto errout_free;
2768                 }
2769
2770                 skb->protocol   = htons(ETH_P_IP);
2771                 skb->dev        = dev;
2772                 local_bh_disable();
2773                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2774                 local_bh_enable();
2775
2776                 rt = (struct rtable*) skb->dst;
2777                 if (err == 0 && rt->u.dst.error)
2778                         err = -rt->u.dst.error;
2779         } else {
2780                 struct flowi fl = {
2781                         .nl_u = {
2782                                 .ip4_u = {
2783                                         .daddr = dst,
2784                                         .saddr = src,
2785                                         .tos = rtm->rtm_tos,
2786                                 },
2787                         },
2788                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2789                 };
2790                 err = ip_route_output_key(&rt, &fl);
2791         }
2792
2793         if (err)
2794                 goto errout_free;
2795
2796         skb->dst = &rt->u.dst;
2797         if (rtm->rtm_flags & RTM_F_NOTIFY)
2798                 rt->rt_flags |= RTCF_NOTIFY;
2799
2800         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2801                                 RTM_NEWROUTE, 0, 0);
2802         if (err <= 0)
2803                 goto errout_free;
2804
2805         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2806 errout:
2807         return err;
2808
2809 errout_free:
2810         kfree_skb(skb);
2811         goto errout;
2812 }
2813
2814 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2815 {
2816         struct rtable *rt;
2817         int h, s_h;
2818         int idx, s_idx;
2819
2820         s_h = cb->args[0];
2821         s_idx = idx = cb->args[1];
2822         for (h = 0; h <= rt_hash_mask; h++) {
2823                 if (h < s_h) continue;
2824                 if (h > s_h)
2825                         s_idx = 0;
2826                 rcu_read_lock_bh();
2827                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2828                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2829                         if (idx < s_idx)
2830                                 continue;
2831                         skb->dst = dst_clone(&rt->u.dst);
2832                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2833                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2834                                          1, NLM_F_MULTI) <= 0) {
2835                                 dst_release(xchg(&skb->dst, NULL));
2836                                 rcu_read_unlock_bh();
2837                                 goto done;
2838                         }
2839                         dst_release(xchg(&skb->dst, NULL));
2840                 }
2841                 rcu_read_unlock_bh();
2842         }
2843
2844 done:
2845         cb->args[0] = h;
2846         cb->args[1] = idx;
2847         return skb->len;
2848 }
2849
2850 void ip_rt_multicast_event(struct in_device *in_dev)
2851 {
2852         rt_cache_flush(0);
2853 }
2854
2855 #ifdef CONFIG_SYSCTL
2856 static int flush_delay;
2857
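/*
 * Handler for /proc/sys/net/ipv4/route/flush: any value written is
 * stored in flush_delay and handed to rt_cache_flush().  The file is
 * write-only (mode 0200); reads fail with -EINVAL.
 */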
2858 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2859                                         struct file *filp, void __user *buffer,
2860                                         size_t *lenp, loff_t *ppos)
2861 {
2862         if (write) {
2863                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2864                 rt_cache_flush(flush_delay);
2865                 return 0;
2866         }
2867
2868         return -EINVAL;
2869 }
2870
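/*
 * Binary sysctl(2) counterpart of the proc handler above: the delay is
 * read directly from userspace and passed to rt_cache_flush().
 */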
2871 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2872                                                 int __user *name,
2873                                                 int nlen,
2874                                                 void __user *oldval,
2875                                                 size_t __user *oldlenp,
2876                                                 void __user *newval,
2877                                                 size_t newlen)
2878 {
2879         int delay;
2880         if (newlen != sizeof(int))
2881                 return -EINVAL;
2882         if (get_user(delay, (int __user *)newval))
2883                 return -EFAULT;
2884         rt_cache_flush(delay);
2885         return 0;
2886 }
2887
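/* Route cache tunables exported under /proc/sys/net/ipv4/route/,
 * e.g. "echo 0 > /proc/sys/net/ipv4/route/flush" requests an
 * immediate cache flush.
 */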
2888 ctl_table ipv4_route_table[] = {
2889         {
2890                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2891                 .procname       = "flush",
2892                 .data           = &flush_delay,
2893                 .maxlen         = sizeof(int),
2894                 .mode           = 0200,
2895                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2896                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2897         },
2898         {
2899                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2900                 .procname       = "min_delay",
2901                 .data           = &ip_rt_min_delay,
2902                 .maxlen         = sizeof(int),
2903                 .mode           = 0644,
2904                 .proc_handler   = &proc_dointvec_jiffies,
2905                 .strategy       = &sysctl_jiffies,
2906         },
2907         {
2908                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2909                 .procname       = "max_delay",
2910                 .data           = &ip_rt_max_delay,
2911                 .maxlen         = sizeof(int),
2912                 .mode           = 0644,
2913                 .proc_handler   = &proc_dointvec_jiffies,
2914                 .strategy       = &sysctl_jiffies,
2915         },
2916         {
2917                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2918                 .procname       = "gc_thresh",
2919                 .data           = &ipv4_dst_ops.gc_thresh,
2920                 .maxlen         = sizeof(int),
2921                 .mode           = 0644,
2922                 .proc_handler   = &proc_dointvec,
2923         },
2924         {
2925                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2926                 .procname       = "max_size",
2927                 .data           = &ip_rt_max_size,
2928                 .maxlen         = sizeof(int),
2929                 .mode           = 0644,
2930                 .proc_handler   = &proc_dointvec,
2931         },
2932         {
2933                 /* Deprecated: use gc_min_interval_ms */
2935                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936                 .procname       = "gc_min_interval",
2937                 .data           = &ip_rt_gc_min_interval,
2938                 .maxlen         = sizeof(int),
2939                 .mode           = 0644,
2940                 .proc_handler   = &proc_dointvec_jiffies,
2941                 .strategy       = &sysctl_jiffies,
2942         },
2943         {
2944                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945                 .procname       = "gc_min_interval_ms",
2946                 .data           = &ip_rt_gc_min_interval,
2947                 .maxlen         = sizeof(int),
2948                 .mode           = 0644,
2949                 .proc_handler   = &proc_dointvec_ms_jiffies,
2950                 .strategy       = &sysctl_ms_jiffies,
2951         },
2952         {
2953                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2954                 .procname       = "gc_timeout",
2955                 .data           = &ip_rt_gc_timeout,
2956                 .maxlen         = sizeof(int),
2957                 .mode           = 0644,
2958                 .proc_handler   = &proc_dointvec_jiffies,
2959                 .strategy       = &sysctl_jiffies,
2960         },
2961         {
2962                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2963                 .procname       = "gc_interval",
2964                 .data           = &ip_rt_gc_interval,
2965                 .maxlen         = sizeof(int),
2966                 .mode           = 0644,
2967                 .proc_handler   = &proc_dointvec_jiffies,
2968                 .strategy       = &sysctl_jiffies,
2969         },
2970         {
2971                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972                 .procname       = "redirect_load",
2973                 .data           = &ip_rt_redirect_load,
2974                 .maxlen         = sizeof(int),
2975                 .mode           = 0644,
2976                 .proc_handler   = &proc_dointvec,
2977         },
2978         {
2979                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980                 .procname       = "redirect_number",
2981                 .data           = &ip_rt_redirect_number,
2982                 .maxlen         = sizeof(int),
2983                 .mode           = 0644,
2984                 .proc_handler   = &proc_dointvec,
2985         },
2986         {
2987                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988                 .procname       = "redirect_silence",
2989                 .data           = &ip_rt_redirect_silence,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = &proc_dointvec,
2993         },
2994         {
2995                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2996                 .procname       = "error_cost",
2997                 .data           = &ip_rt_error_cost,
2998                 .maxlen         = sizeof(int),
2999                 .mode           = 0644,
3000                 .proc_handler   = &proc_dointvec,
3001         },
3002         {
3003                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3004                 .procname       = "error_burst",
3005                 .data           = &ip_rt_error_burst,
3006                 .maxlen         = sizeof(int),
3007                 .mode           = 0644,
3008                 .proc_handler   = &proc_dointvec,
3009         },
3010         {
3011                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3012                 .procname       = "gc_elasticity",
3013                 .data           = &ip_rt_gc_elasticity,
3014                 .maxlen         = sizeof(int),
3015                 .mode           = 0644,
3016                 .proc_handler   = &proc_dointvec,
3017         },
3018         {
3019                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3020                 .procname       = "mtu_expires",
3021                 .data           = &ip_rt_mtu_expires,
3022                 .maxlen         = sizeof(int),
3023                 .mode           = 0644,
3024                 .proc_handler   = &proc_dointvec_jiffies,
3025                 .strategy       = &sysctl_jiffies,
3026         },
3027         {
3028                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3029                 .procname       = "min_pmtu",
3030                 .data           = &ip_rt_min_pmtu,
3031                 .maxlen         = sizeof(int),
3032                 .mode           = 0644,
3033                 .proc_handler   = &proc_dointvec,
3034         },
3035         {
3036                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3037                 .procname       = "min_adv_mss",
3038                 .data           = &ip_rt_min_advmss,
3039                 .maxlen         = sizeof(int),
3040                 .mode           = 0644,
3041                 .proc_handler   = &proc_dointvec,
3042         },
3043         {
3044                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045                 .procname       = "secret_interval",
3046                 .data           = &ip_rt_secret_interval,
3047                 .maxlen         = sizeof(int),
3048                 .mode           = 0644,
3049                 .proc_handler   = &proc_dointvec_jiffies,
3050                 .strategy       = &sysctl_jiffies,
3051         },
3052         { .ctl_name = 0 }
3053 };
3054 #endif
3055
3056 #ifdef CONFIG_NET_CLS_ROUTE
3057 struct ip_rt_acct *ip_rt_acct;
3058
3059 /* This code sucks.  But you should have seen it before! --RR */
3060
3061 /* IP route accounting ptr for this logical cpu number. */
3062 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3063
3064 #ifdef CONFIG_PROC_FS
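/* Read handler for /proc/net/rt_acct: returns the 256-entry ip_rt_acct
 * table summed over all possible cpus.  Offset and length must be
 * word-aligned.
 */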
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066                            int length, int *eof, void *data)
3067 {
3068         unsigned int i;
3069
3070         if ((offset & 3) || (length & 3))
3071                 return -EIO;
3072
3073         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074                 *eof = 1;
3075                 return 0;
3076         }
3077
3078         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3080                 *eof = 1;
3081         }
3082
3083         offset /= sizeof(u32);
3084
3085         if (length > 0) {
3086                 u32 *src;
3087                 u32 *dst = (u32 *) buffer;
3088
3089                 /* Zero first; the loop below includes cpu 0. */
3090                 *start = buffer;
3091                 memset(dst, 0, length);
3092
3093                 /* Add each possible cpu in, one int at a time */
3094                 for_each_possible_cpu(i) {
3095                         unsigned int j;
3096
3097                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3098
3099                         for (j = 0; j < length/4; j++)
3100                                 dst[j] += src[j];
3101                 }
3102         }
3103         return length;
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3107
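/* "rhash_entries=N" on the kernel command line overrides the default
 * sizing of the route cache hash table done in ip_rt_init() below.
 */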
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3110 {
3111         if (!str)
3112                 return 0;
3113         rhash_entries = simple_strtoul(str, &str, 0);
3114         return 1;
3115 }
3116 __setup("rhash_entries=", set_rhash_entries);
3117
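/*
 * Set up the route cache: allocate the dst slab and hash table, derive
 * gc_thresh and max_size from the table size, initialize the flush,
 * expiry and secret-rebuild timers, and register the proc entries.
 */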
3118 int __init ip_rt_init(void)
3119 {
3120         int rc = 0;
3121
3122         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123                              (jiffies ^ (jiffies >> 7)));
3124
3125 #ifdef CONFIG_NET_CLS_ROUTE
3126         {
3127         int order;
3128         for (order = 0;
3129              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130                 /* NOTHING */;
3131         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132         if (!ip_rt_acct)
3133                 panic("IP: failed to allocate ip_rt_acct\n");
3134         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3135         }
3136 #endif
3137
3138         ipv4_dst_ops.kmem_cachep =
3139                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3140                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3141
3142         rt_hash_table = (struct rt_hash_bucket *)
3143                 alloc_large_system_hash("IP route cache",
3144                                         sizeof(struct rt_hash_bucket),
3145                                         rhash_entries,
3146                                         (num_physpages >= 128 * 1024) ?
3147                                         15 : 17,
3148                                         0,
3149                                         &rt_hash_log,
3150                                         &rt_hash_mask,
3151                                         0);
3152         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3153         rt_hash_lock_init();
3154
3155         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3156         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3157
3158         devinet_init();
3159         ip_fib_init();
3160
3161         init_timer(&rt_flush_timer);
3162         rt_flush_timer.function = rt_run_flush;
3163         init_timer(&rt_periodic_timer);
3164         rt_periodic_timer.function = rt_check_expire;
3165         init_timer(&rt_secret_timer);
3166         rt_secret_timer.function = rt_secret_rebuild;
3167
3168         /* All the timers started at system startup tend
3169            to synchronize. Perturb them a bit.
3170          */
3171         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3172                                         ip_rt_gc_interval;
3173         add_timer(&rt_periodic_timer);
3174
3175         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3176                 ip_rt_secret_interval;
3177         add_timer(&rt_secret_timer);
3178
3179 #ifdef CONFIG_PROC_FS
3180         {
3181         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3182         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3183             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3184                                              proc_net_stat))) {
3185                 return -ENOMEM;
3186         }
3187         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3188         }
3189 #ifdef CONFIG_NET_CLS_ROUTE
3190         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3191 #endif
3192 #endif
3193 #ifdef CONFIG_XFRM
3194         xfrm_init();
3195         xfrm4_init();
3196 #endif
3197         return rc;
3198 }
3199
3200 EXPORT_SYMBOL(__ip_select_ident);
3201 EXPORT_SYMBOL(ip_route_input);
3202 EXPORT_SYMBOL(ip_route_output_key);