err.no Git - linux-2.6/blob - net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/mm.h>
  74 #include <linux/bootmem.h>
  75 #include <linux/string.h>
  76 #include <linux/socket.h>
  77 #include <linux/sockios.h>
  78 #include <linux/errno.h>
  79 #include <linux/in.h>
  80 #include <linux/inet.h>
  81 #include <linux/netdevice.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/init.h>
  84 #include <linux/workqueue.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/inetdevice.h>
  87 #include <linux/igmp.h>
  88 #include <linux/pkt_sched.h>
  89 #include <linux/mroute.h>
  90 #include <linux/netfilter_ipv4.h>
  91 #include <linux/random.h>
  92 #include <linux/jhash.h>
  93 #include <linux/rcupdate.h>
  94 #include <linux/times.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
/* NOTE(review): presumably the largest MTU accepted for an IPv4 route — confirm. */
#define IP_MAX_MTU      0xFFF0

/* Default garbage-collection timeout for cached routes: 300 * HZ jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Routing cache tunables.  Time values are expressed in jiffies (multiples
 * or fractions of HZ).
 */
/* Flush delay substituted by rt_cache_flush() when called with delay < 0. */
static int ip_rt_min_delay              = 2 * HZ;
/* Added to "now" by rt_cache_flush() to form the flush deadline. */
static int ip_rt_max_delay              = 10 * HZ;
/* Entry-count limit that rt_garbage_collect() compares the cache size to. */
static int ip_rt_max_size;
/* Base expiry timeout used by rt_check_expire() and rt_garbage_collect(). */
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
/* Period with which rt_check_expire() re-schedules itself. */
static int ip_rt_gc_interval            = 60 * HZ;
/* Minimum spacing between two full rt_garbage_collect() runs. */
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
/* Shifted left by rt_hash_log to form the GC "dangerous area" threshold. */
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
/* Period with which rt_secret_rebuild() re-arms rt_secret_timer. */
static int ip_rt_secret_interval        = 10 * 60 * HZ;
/*
 * Deadline (in jiffies) of a pending delayed flush; 0 when none is pending.
 * Set by rt_cache_flush(), cleared by rt_run_flush().
 */
static unsigned long rt_deadline;
 136
/* Debug print helper: printk() at KERN_DEBUG level. */
#define RTprint(a...)   printk(KERN_DEBUG a)

/* Timer armed by rt_cache_flush() to perform a delayed cache flush. */
static struct timer_list rt_flush_timer;
static void rt_check_expire(struct work_struct *work);
/* Delayed work item running rt_check_expire(), which re-schedules itself. */
static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
/* Timer re-armed by rt_secret_rebuild() every ip_rt_secret_interval. */
static struct timer_list rt_secret_timer;
 143
 144 /*
 145  *      Interface to generic destination cache.
 146  */
 147
/* Forward declarations of the callbacks wired into ipv4_dst_ops below. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


/*
 * IPv4 operations table for the generic destination cache.  Each entry is
 * sized as a struct rtable, and the table's "entries" counter is what the
 * garbage collector in this file reads to learn the current cache size.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};
 170
/* Expands to the TC_PRIO_<class> constant for the given class name. */
#define ECN_OR_COST(class)      TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.
 * NOTE(review): presumably indexed by the 4-bit IP TOS value to obtain a
 * packet scheduling priority — confirm against the users of this table.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
 191
 192
 193 /*
 194  * Route cache.
 195  */
 196
 197 /* The locking scheme is rather straight forward:
 198  *
 199  * 1) Read-Copy Update protects the buckets of the central route hash.
 200  * 2) Only writers remove entries, and they hold the lock
 201  *    as they look at rtable reference counts.
 202  * 3) Only readers acquire references to rtable entries,
 203  *    they do so with atomic increments and with the
 204  *    lock held.
 205  */
 206
/* One slot of the route cache hash table. */
struct rt_hash_bucket {
        struct rtable   *chain;         /* head of the list linked via u.dst.rt_next */
};
 210 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 211         defined(CONFIG_PROVE_LOCKING)
 212 /*
 213  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 214  * The size of this table is a power of two and depends on the number of CPUS.
 215  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 216  */
 217 #ifdef CONFIG_LOCKDEP
 218 # define RT_HASH_LOCK_SZ        256
 219 #else
 220 # if NR_CPUS >= 32
 221 #  define RT_HASH_LOCK_SZ       4096
 222 # elif NR_CPUS >= 16
 223 #  define RT_HASH_LOCK_SZ       2048
 224 # elif NR_CPUS >= 8
 225 #  define RT_HASH_LOCK_SZ       1024
 226 # elif NR_CPUS >= 4
 227 #  define RT_HASH_LOCK_SZ       512
 228 # else
 229 #  define RT_HASH_LOCK_SZ       256
 230 # endif
 231 #endif
 232
 233 static spinlock_t       *rt_hash_locks;
 234 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 235 # define rt_hash_lock_init()    { \
 236                 int i; \
 237                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 238                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 239                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 240                         spin_lock_init(&rt_hash_locks[i]); \
 241                 }
 242 #else
 243 # define rt_hash_lock_addr(slot) NULL
 244 # define rt_hash_lock_init()
 245 #endif
 246
/* The route cache hash table and its geometry. */
static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;   /* ANDed with hash values to pick a bucket */
static int                      rt_hash_log;    /* used as a shift count when scaling by table size */
static unsigned int             rt_hash_rnd;    /* random hash seed, refreshed by rt_run_flush() */

/* Per-CPU route cache statistics. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment @field in the current CPU's rt_cache_stat. */
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);
 258
 259 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 260 {
 261         return (jhash_2words(daddr, saddr, rt_hash_rnd)
 262                 & rt_hash_mask);
 263 }
 264
/*
 * Bucket index for a (daddr, saddr, idx) key.  The addresses are
 * force-converted from __be32 to u32, and idx, shifted left by 5, is XORed
 * into the source address before hashing with rt_hash_code().
 */
#define rt_hash(daddr, saddr, idx) \
        rt_hash_code((__force u32)(__be32)(daddr),\
                     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 268
 269 #ifdef CONFIG_PROC_FS
/* Iterator state for the route cache seq_file, stored in seq->private. */
struct rt_cache_iter_state {
        int bucket;     /* bucket being walked; counts down from rt_hash_mask */
};
 273
 274 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 275 {
 276         struct rtable *r = NULL;
 277         struct rt_cache_iter_state *st = seq->private;
 278
 279         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 280                 rcu_read_lock_bh();
 281                 r = rt_hash_table[st->bucket].chain;
 282                 if (r)
 283                         break;
 284                 rcu_read_unlock_bh();
 285         }
 286         return r;
 287 }
 288
 289 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 290 {
 291         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 292
 293         r = r->u.dst.rt_next;
 294         while (!r) {
 295                 rcu_read_unlock_bh();
 296                 if (--st->bucket < 0)
 297                         break;
 298                 rcu_read_lock_bh();
 299                 r = rt_hash_table[st->bucket].chain;
 300         }
 301         return r;
 302 }
 303
 304 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 305 {
 306         struct rtable *r = rt_cache_get_first(seq);
 307
 308         if (r)
 309                 while (pos && (r = rt_cache_get_next(seq, r)))
 310                         --pos;
 311         return pos ? NULL : r;
 312 }
 313
 314 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 315 {
 316         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 317 }
 318
 319 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 320 {
 321         struct rtable *r = NULL;
 322
 323         if (v == SEQ_START_TOKEN)
 324                 r = rt_cache_get_first(seq);
 325         else
 326                 r = rt_cache_get_next(seq, v);
 327         ++*pos;
 328         return r;
 329 }
 330
 331 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 332 {
 333         if (v && v != SEQ_START_TOKEN)
 334                 rcu_read_unlock_bh();
 335 }
 336
 337 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 338 {
 339         if (v == SEQ_START_TOKEN)
 340                 seq_printf(seq, "%-127s\n",
 341                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 342                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 343                            "HHUptod\tSpecDst");
 344         else {
 345                 struct rtable *r = v;
 346                 char temp[256];
 347
 348                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 349                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 350                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 351                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 352                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 353                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 354                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 355                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 356                         dst_metric(&r->u.dst, RTAX_WINDOW),
 357                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 358                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 359                         r->fl.fl4_tos,
 360                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 361                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 362                                        dev_queue_xmit) : 0,
 363                         r->rt_spec_dst);
 364                 seq_printf(seq, "%-127s\n", temp);
 365         }
 366         return 0;
 367 }
 368
/* seq_file iterator callbacks for the route cache listing. */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
 375
 376 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 377 {
 378         struct seq_file *seq;
 379         int rc = -ENOMEM;
 380         struct rt_cache_iter_state *s;
 381
 382         s = kzalloc(sizeof(*s), GFP_KERNEL);
 383         if (!s)
 384                 goto out;
 385         rc = seq_open(file, &rt_cache_seq_ops);
 386         if (rc)
 387                 goto out_kfree;
 388         seq          = file->private_data;
 389         seq->private = s;
 390 out:
 391         return rc;
 392 out_kfree:
 393         kfree(s);
 394         goto out;
 395 }
 396
/*
 * File operations for the route cache listing.
 * NOTE(review): seq_release_private presumably also frees the iterator
 * state that rt_cache_seq_open() stored in seq->private — confirm.
 */
static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
 404
 405
 406 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 407 {
 408         int cpu;
 409
 410         if (*pos == 0)
 411                 return SEQ_START_TOKEN;
 412
 413         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 414                 if (!cpu_possible(cpu))
 415                         continue;
 416                 *pos = cpu+1;
 417                 return &per_cpu(rt_cache_stat, cpu);
 418         }
 419         return NULL;
 420 }
 421
 422 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 423 {
 424         int cpu;
 425
 426         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 427                 if (!cpu_possible(cpu))
 428                         continue;
 429                 *pos = cpu+1;
 430                 return &per_cpu(rt_cache_stat, cpu);
 431         }
 432         return NULL;
 433
 434 }
 435
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
 440
 441 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 442 {
 443         struct rt_cache_stat *st = v;
 444
 445         if (v == SEQ_START_TOKEN) {
 446                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 447                 return 0;
 448         }
 449
 450         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 451                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 452                    atomic_read(&ipv4_dst_ops.entries),
 453                    st->in_hit,
 454                    st->in_slow_tot,
 455                    st->in_slow_mc,
 456                    st->in_no_route,
 457                    st->in_brd,
 458                    st->in_martian_dst,
 459                    st->in_martian_src,
 460
 461                    st->out_hit,
 462                    st->out_slow_tot,
 463                    st->out_slow_mc,
 464
 465                    st->gc_total,
 466                    st->gc_ignored,
 467                    st->gc_goal_miss,
 468                    st->gc_dst_overflow,
 469                    st->in_hlist_search,
 470                    st->out_hlist_search
 471                 );
 472         return 0;
 473 }
 474
/* seq_file iterator callbacks for the per-CPU cache statistics. */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
 481
 482
/*
 * Open handler for the per-CPU statistics file; no private iterator state
 * is needed, so this only opens the seq_file.  Returns seq_open()'s result.
 */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}
 487
/* File operations for the per-CPU cache statistics file. */
static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 495
 496 #endif /* CONFIG_PROC_FS */
 497
/*
 * Queue @rt for freeing: dst_rcu_free is invoked through call_rcu_bh(),
 * i.e. only after a grace period, so lockless readers that may still be
 * traversing the entry are not disturbed.
 */
static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
 502
 503 static __inline__ void rt_drop(struct rtable *rt)
 504 {
 505         ip_rt_put(rt);
 506         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 507 }
 508
 509 static __inline__ int rt_fast_clean(struct rtable *rth)
 510 {
 511         /* Kill broadcast/multicast entries very aggresively, if they
 512            collide in hash table with more useful entries */
 513         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 514                 rth->fl.iif && rth->u.dst.rt_next;
 515 }
 516
 517 static __inline__ int rt_valuable(struct rtable *rth)
 518 {
 519         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 520                 rth->u.dst.expires;
 521 }
 522
 523 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 524 {
 525         unsigned long age;
 526         int ret = 0;
 527
 528         if (atomic_read(&rth->u.dst.__refcnt))
 529                 goto out;
 530
 531         ret = 1;
 532         if (rth->u.dst.expires &&
 533             time_after_eq(jiffies, rth->u.dst.expires))
 534                 goto out;
 535
 536         age = jiffies - rth->u.dst.lastuse;
 537         ret = 0;
 538         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 539             (age <= tmo2 && rt_valuable(rth)))
 540                 goto out;
 541         ret = 1;
 542 out:    return ret;
 543 }
 544
 545 /* Bits of score are:
 546  * 31: very valuable
 547  * 30: not quite useless
 548  * 29..0: usage counter
 549  */
 550 static inline u32 rt_score(struct rtable *rt)
 551 {
 552         u32 score = jiffies - rt->u.dst.lastuse;
 553
 554         score = ~score & ~(3<<30);
 555
 556         if (rt_valuable(rt))
 557                 score |= (1<<31);
 558
 559         if (!rt->fl.iif ||
 560             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 561                 score |= (1<<30);
 562
 563         return score;
 564 }
 565
 566 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 567 {
 568         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 569                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 570                 (fl1->mark ^ fl2->mark) |
 571                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 572                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 573                 (fl1->oif ^ fl2->oif) |
 574                 (fl1->iif ^ fl2->iif)) == 0;
 575 }
 576
 577 static void rt_check_expire(struct work_struct *work)
 578 {
 579         static unsigned int rover;
 580         unsigned int i = rover, goal;
 581         struct rtable *rth, **rthp;
 582         u64 mult;
 583
 584         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 585         if (ip_rt_gc_timeout > 1)
 586                 do_div(mult, ip_rt_gc_timeout);
 587         goal = (unsigned int)mult;
 588         if (goal > rt_hash_mask)
 589                 goal = rt_hash_mask + 1;
 590         for (; goal > 0; goal--) {
 591                 unsigned long tmo = ip_rt_gc_timeout;
 592
 593                 i = (i + 1) & rt_hash_mask;
 594                 rthp = &rt_hash_table[i].chain;
 595
 596                 if (*rthp == 0)
 597                         continue;
 598                 spin_lock_bh(rt_hash_lock_addr(i));
 599                 while ((rth = *rthp) != NULL) {
 600                         if (rth->u.dst.expires) {
 601                                 /* Entry is expired even if it is in use */
 602                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 603                                         tmo >>= 1;
 604                                         rthp = &rth->u.dst.rt_next;
 605                                         continue;
 606                                 }
 607                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 608                                 tmo >>= 1;
 609                                 rthp = &rth->u.dst.rt_next;
 610                                 continue;
 611                         }
 612
 613                         /* Cleanup aged off entries. */
 614                         *rthp = rth->u.dst.rt_next;
 615                         rt_free(rth);
 616                 }
 617                 spin_unlock_bh(rt_hash_lock_addr(i));
 618         }
 619         rover = i;
 620         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 621 }
 622
 623 /* This can run from both BH and non-BH contexts, the latter
 624  * in the case of a forced flush event.
 625  */
 626 static void rt_run_flush(unsigned long dummy)
 627 {
 628         int i;
 629         struct rtable *rth, *next;
 630
 631         rt_deadline = 0;
 632
 633         get_random_bytes(&rt_hash_rnd, 4);
 634
 635         for (i = rt_hash_mask; i >= 0; i--) {
 636                 spin_lock_bh(rt_hash_lock_addr(i));
 637                 rth = rt_hash_table[i].chain;
 638                 if (rth)
 639                         rt_hash_table[i].chain = NULL;
 640                 spin_unlock_bh(rt_hash_lock_addr(i));
 641
 642                 for (; rth; rth = next) {
 643                         next = rth->u.dst.rt_next;
 644                         rt_free(rth);
 645                 }
 646         }
 647 }
 648
 649 static DEFINE_SPINLOCK(rt_flush_lock);
 650
 651 void rt_cache_flush(int delay)
 652 {
 653         unsigned long now = jiffies;
 654         int user_mode = !in_softirq();
 655
 656         if (delay < 0)
 657                 delay = ip_rt_min_delay;
 658
 659         spin_lock_bh(&rt_flush_lock);
 660
 661         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 662                 long tmo = (long)(rt_deadline - now);
 663
 664                 /* If flush timer is already running
 665                    and flush request is not immediate (delay > 0):
 666
 667                    if deadline is not achieved, prolongate timer to "delay",
 668                    otherwise fire it at deadline time.
 669                  */
 670
 671                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 672                         tmo = 0;
 673
 674                 if (delay > tmo)
 675                         delay = tmo;
 676         }
 677
 678         if (delay <= 0) {
 679                 spin_unlock_bh(&rt_flush_lock);
 680                 rt_run_flush(0);
 681                 return;
 682         }
 683
 684         if (rt_deadline == 0)
 685                 rt_deadline = now + ip_rt_max_delay;
 686
 687         mod_timer(&rt_flush_timer, now+delay);
 688         spin_unlock_bh(&rt_flush_lock);
 689 }
 690
 691 static void rt_secret_rebuild(unsigned long dummy)
 692 {
 693         unsigned long now = jiffies;
 694
 695         rt_cache_flush(0);
 696         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 697 }
 698
 699 /*
 700    Short description of GC goals.
 701
 702    We want to build algorithm, which will keep routing cache
 703    at some equilibrium point, when number of aged off entries
 704    is kept approximately equal to newly generated ones.
 705
 706    Current expiration strength is variable "expire".
 707    We try to adjust it dynamically, so that if networking
 708    is idle expires is large enough to keep enough of warm entries,
 709    and when load increases it reduces to limit cache size.
 710  */
 711
 712 static int rt_garbage_collect(void)
 713 {
 714         static unsigned long expire = RT_GC_TIMEOUT;
 715         static unsigned long last_gc;
 716         static int rover;
 717         static int equilibrium;
 718         struct rtable *rth, **rthp;
 719         unsigned long now = jiffies;
 720         int goal;
 721
 722         /*
 723          * Garbage collection is pretty expensive,
 724          * do not make it too frequently.
 725          */
 726
 727         RT_CACHE_STAT_INC(gc_total);
 728
 729         if (now - last_gc < ip_rt_gc_min_interval &&
 730             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 731                 RT_CACHE_STAT_INC(gc_ignored);
 732                 goto out;
 733         }
 734
 735         /* Calculate number of entries, which we want to expire now. */
 736         goal = atomic_read(&ipv4_dst_ops.entries) -
 737                 (ip_rt_gc_elasticity << rt_hash_log);
 738         if (goal <= 0) {
 739                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 740                         equilibrium = ipv4_dst_ops.gc_thresh;
 741                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 742                 if (goal > 0) {
 743                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 744                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 745                 }
 746         } else {
 747                 /* We are in dangerous area. Try to reduce cache really
 748                  * aggressively.
 749                  */
 750                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 751                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 752         }
 753
 754         if (now - last_gc >= ip_rt_gc_min_interval)
 755                 last_gc = now;
 756
 757         if (goal <= 0) {
 758                 equilibrium += goal;
 759                 goto work_done;
 760         }
 761
 762         do {
 763                 int i, k;
 764
 765                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 766                         unsigned long tmo = expire;
 767
 768                         k = (k + 1) & rt_hash_mask;
 769                         rthp = &rt_hash_table[k].chain;
 770                         spin_lock_bh(rt_hash_lock_addr(k));
 771                         while ((rth = *rthp) != NULL) {
 772                                 if (!rt_may_expire(rth, tmo, expire)) {
 773                                         tmo >>= 1;
 774                                         rthp = &rth->u.dst.rt_next;
 775                                         continue;
 776                                 }
 777                                 *rthp = rth->u.dst.rt_next;
 778                                 rt_free(rth);
 779                                 goal--;
 780                         }
 781                         spin_unlock_bh(rt_hash_lock_addr(k));
 782                         if (goal <= 0)
 783                                 break;
 784                 }
 785                 rover = k;
 786
 787                 if (goal <= 0)
 788                         goto work_done;
 789
 790                 /* Goal is not achieved. We stop process if:
 791
 792                    - if expire reduced to zero. Otherwise, expire is halfed.
 793                    - if table is not full.
 794                    - if we are called from interrupt.
 795                    - jiffies check is just fallback/debug loop breaker.
 796                      We will not spin here for long time in any case.
 797                  */
 798
 799                 RT_CACHE_STAT_INC(gc_goal_miss);
 800
 801                 if (expire == 0)
 802                         break;
 803
 804                 expire >>= 1;
 805 #if RT_CACHE_DEBUG >= 2
 806                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 807                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 808 #endif
 809
 810                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 811                         goto out;
 812         } while (!in_softirq() && time_before_eq(jiffies, now));
 813
 814         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 815                 goto out;
 816         if (net_ratelimit())
 817                 printk(KERN_WARNING "dst cache overflow\n");
 818         RT_CACHE_STAT_INC(gc_dst_overflow);
 819         return 1;
 820
 821 work_done:
 822         expire += ip_rt_gc_min_interval;
 823         if (expire > ip_rt_gc_timeout ||
 824             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 825                 expire = ip_rt_gc_timeout;
 826 #if RT_CACHE_DEBUG >= 2
 827         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 828                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 829 #endif
 830 out:    return 0;
 831 }
 832
 833 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 834 {
 835         struct rtable   *rth, **rthp;
 836         unsigned long   now;
 837         struct rtable *cand, **candp;
 838         u32             min_score;
 839         int             chain_length;
 840         int attempts = !in_softirq();
 841
 842 restart:
 843         chain_length = 0;
 844         min_score = ~(u32)0;
 845         cand = NULL;
 846         candp = NULL;
 847         now = jiffies;
 848
 849         rthp = &rt_hash_table[hash].chain;
 850
 851         spin_lock_bh(rt_hash_lock_addr(hash));
 852         while ((rth = *rthp) != NULL) {
 853                 if (compare_keys(&rth->fl, &rt->fl)) {
 854                         /* Put it first */
 855                         *rthp = rth->u.dst.rt_next;
 856                         /*
 857                          * Since lookup is lockfree, the deletion
 858                          * must be visible to another weakly ordered CPU before
 859                          * the insertion at the start of the hash chain.
 860                          */
 861                         rcu_assign_pointer(rth->u.dst.rt_next,
 862                                            rt_hash_table[hash].chain);
 863                         /*
 864                          * Since lookup is lockfree, the update writes
 865                          * must be ordered for consistency on SMP.
 866                          */
 867                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 868
 869                         rth->u.dst.__use++;
 870                         dst_hold(&rth->u.dst);
 871                         rth->u.dst.lastuse = now;
 872                         spin_unlock_bh(rt_hash_lock_addr(hash));
 873
 874                         rt_drop(rt);
 875                         *rp = rth;
 876                         return 0;
 877                 }
 878
 879                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 880                         u32 score = rt_score(rth);
 881
 882                         if (score <= min_score) {
 883                                 cand = rth;
 884                                 candp = rthp;
 885                                 min_score = score;
 886                         }
 887                 }
 888
 889                 chain_length++;
 890
 891                 rthp = &rth->u.dst.rt_next;
 892         }
 893
 894         if (cand) {
 895                 /* ip_rt_gc_elasticity used to be average length of chain
 896                  * length, when exceeded gc becomes really aggressive.
 897                  *
 898                  * The second limit is less certain. At the moment it allows
 899                  * only 2 entries per bucket. We will see.
 900                  */
 901                 if (chain_length > ip_rt_gc_elasticity) {
 902                         *candp = cand->u.dst.rt_next;
 903                         rt_free(cand);
 904                 }
 905         }
 906
 907         /* Try to bind route to arp only if it is output
 908            route or unicast forwarding path.
 909          */
 910         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 911                 int err = arp_bind_neighbour(&rt->u.dst);
 912                 if (err) {
 913                         spin_unlock_bh(rt_hash_lock_addr(hash));
 914
 915                         if (err != -ENOBUFS) {
 916                                 rt_drop(rt);
 917                                 return err;
 918                         }
 919
 920                         /* Neighbour tables are full and nothing
 921                            can be released. Try to shrink route cache,
 922                            it is most likely it holds some neighbour records.
 923                          */
 924                         if (attempts-- > 0) {
 925                                 int saved_elasticity = ip_rt_gc_elasticity;
 926                                 int saved_int = ip_rt_gc_min_interval;
 927                                 ip_rt_gc_elasticity     = 1;
 928                                 ip_rt_gc_min_interval   = 0;
 929                                 rt_garbage_collect();
 930                                 ip_rt_gc_min_interval   = saved_int;
 931                                 ip_rt_gc_elasticity     = saved_elasticity;
 932                                 goto restart;
 933                         }
 934
 935                         if (net_ratelimit())
 936                                 printk(KERN_WARNING "Neighbour table overflow.\n");
 937                         rt_drop(rt);
 938                         return -ENOBUFS;
 939                 }
 940         }
 941
 942         rt->u.dst.rt_next = rt_hash_table[hash].chain;
 943 #if RT_CACHE_DEBUG >= 2
 944         if (rt->u.dst.rt_next) {
 945                 struct rtable *trt;
 946                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
 947                        NIPQUAD(rt->rt_dst));
 948                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
 949                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 950                 printk("\n");
 951         }
 952 #endif
 953         rt_hash_table[hash].chain = rt;
 954         spin_unlock_bh(rt_hash_lock_addr(hash));
 955         *rp = rt;
 956         return 0;
 957 }
 958
 959 void rt_bind_peer(struct rtable *rt, int create)
 960 {
 961         static DEFINE_SPINLOCK(rt_peer_lock);
 962         struct inet_peer *peer;
 963
 964         peer = inet_getpeer(rt->rt_dst, create);
 965
 966         spin_lock_bh(&rt_peer_lock);
 967         if (rt->peer == NULL) {
 968                 rt->peer = peer;
 969                 peer = NULL;
 970         }
 971         spin_unlock_bh(&rt_peer_lock);
 972         if (peer)
 973                 inet_putpeer(peer);
 974 }
 975
 976 /*
 977  * Peer allocation may fail only in serious out-of-memory conditions.  However
 978  * we still can generate some output.
 979  * Random ID selection looks a bit dangerous because we have no chances to
 980  * select ID being unique in a reasonable period of time.
 981  * But broken packet identifier may be better than no packet at all.
 982  */
 983 static void ip_select_fb_ident(struct iphdr *iph)
 984 {
 985         static DEFINE_SPINLOCK(ip_fb_id_lock);
 986         static u32 ip_fallback_id;
 987         u32 salt;
 988
 989         spin_lock_bh(&ip_fb_id_lock);
 990         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 991         iph->id = htons(salt & 0xFFFF);
 992         ip_fallback_id = salt;
 993         spin_unlock_bh(&ip_fb_id_lock);
 994 }
 995
 996 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 997 {
 998         struct rtable *rt = (struct rtable *) dst;
 999
1000         if (rt) {
1001                 if (rt->peer == NULL)
1002                         rt_bind_peer(rt, 1);
1003
1004                 /* If peer is attached to destination, it is never detached,
1005                    so that we need not to grab a lock to dereference it.
1006                  */
1007                 if (rt->peer) {
1008                         iph->id = htons(inet_getid(rt->peer, more));
1009                         return;
1010                 }
1011         } else
1012                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1013                        __builtin_return_address(0));
1014
1015         ip_select_fb_ident(iph);
1016 }
1017
1018 static void rt_del(unsigned hash, struct rtable *rt)
1019 {
1020         struct rtable **rthp;
1021
1022         spin_lock_bh(rt_hash_lock_addr(hash));
1023         ip_rt_put(rt);
1024         for (rthp = &rt_hash_table[hash].chain; *rthp;
1025              rthp = &(*rthp)->u.dst.rt_next)
1026                 if (*rthp == rt) {
1027                         *rthp = rt->u.dst.rt_next;
1028                         rt_free(rt);
1029                         break;
1030                 }
1031         spin_unlock_bh(rt_hash_lock_addr(hash));
1032 }
1033
1034 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1035                     __be32 saddr, struct net_device *dev)
1036 {
1037         int i, k;
1038         struct in_device *in_dev = in_dev_get(dev);
1039         struct rtable *rth, **rthp;
1040         __be32  skeys[2] = { saddr, 0 };
1041         int  ikeys[2] = { dev->ifindex, 0 };
1042         struct netevent_redirect netevent;
1043
1044         if (!in_dev)
1045                 return;
1046
1047         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1048             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1049                 goto reject_redirect;
1050
1051         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1052                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1053                         goto reject_redirect;
1054                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1055                         goto reject_redirect;
1056         } else {
1057                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1058                         goto reject_redirect;
1059         }
1060
1061         for (i = 0; i < 2; i++) {
1062                 for (k = 0; k < 2; k++) {
1063                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1064
1065                         rthp=&rt_hash_table[hash].chain;
1066
1067                         rcu_read_lock();
1068                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1069                                 struct rtable *rt;
1070
1071                                 if (rth->fl.fl4_dst != daddr ||
1072                                     rth->fl.fl4_src != skeys[i] ||
1073                                     rth->fl.oif != ikeys[k] ||
1074                                     rth->fl.iif != 0) {
1075                                         rthp = &rth->u.dst.rt_next;
1076                                         continue;
1077                                 }
1078
1079                                 if (rth->rt_dst != daddr ||
1080                                     rth->rt_src != saddr ||
1081                                     rth->u.dst.error ||
1082                                     rth->rt_gateway != old_gw ||
1083                                     rth->u.dst.dev != dev)
1084                                         break;
1085
1086                                 dst_hold(&rth->u.dst);
1087                                 rcu_read_unlock();
1088
1089                                 rt = dst_alloc(&ipv4_dst_ops);
1090                                 if (rt == NULL) {
1091                                         ip_rt_put(rth);
1092                                         in_dev_put(in_dev);
1093                                         return;
1094                                 }
1095
1096                                 /* Copy all the information. */
1097                                 *rt = *rth;
1098                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1099                                 rt->u.dst.__use         = 1;
1100                                 atomic_set(&rt->u.dst.__refcnt, 1);
1101                                 rt->u.dst.child         = NULL;
1102                                 if (rt->u.dst.dev)
1103                                         dev_hold(rt->u.dst.dev);
1104                                 if (rt->idev)
1105                                         in_dev_hold(rt->idev);
1106                                 rt->u.dst.obsolete      = 0;
1107                                 rt->u.dst.lastuse       = jiffies;
1108                                 rt->u.dst.path          = &rt->u.dst;
1109                                 rt->u.dst.neighbour     = NULL;
1110                                 rt->u.dst.hh            = NULL;
1111                                 rt->u.dst.xfrm          = NULL;
1112
1113                                 rt->rt_flags            |= RTCF_REDIRECTED;
1114
1115                                 /* Gateway is different ... */
1116                                 rt->rt_gateway          = new_gw;
1117
1118                                 /* Redirect received -> path was valid */
1119                                 dst_confirm(&rth->u.dst);
1120
1121                                 if (rt->peer)
1122                                         atomic_inc(&rt->peer->refcnt);
1123
1124                                 if (arp_bind_neighbour(&rt->u.dst) ||
1125                                     !(rt->u.dst.neighbour->nud_state &
1126                                             NUD_VALID)) {
1127                                         if (rt->u.dst.neighbour)
1128                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1129                                         ip_rt_put(rth);
1130                                         rt_drop(rt);
1131                                         goto do_next;
1132                                 }
1133
1134                                 netevent.old = &rth->u.dst;
1135                                 netevent.new = &rt->u.dst;
1136                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1137                                                         &netevent);
1138
1139                                 rt_del(hash, rth);
1140                                 if (!rt_intern_hash(hash, rt, &rt))
1141                                         ip_rt_put(rt);
1142                                 goto do_next;
1143                         }
1144                         rcu_read_unlock();
1145                 do_next:
1146                         ;
1147                 }
1148         }
1149         in_dev_put(in_dev);
1150         return;
1151
1152 reject_redirect:
1153 #ifdef CONFIG_IP_ROUTE_VERBOSE
1154         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1155                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1156                         "%u.%u.%u.%u ignored.\n"
1157                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1158                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1159                        NIPQUAD(saddr), NIPQUAD(daddr));
1160 #endif
1161         in_dev_put(in_dev);
1162 }
1163
1164 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1165 {
1166         struct rtable *rt = (struct rtable*)dst;
1167         struct dst_entry *ret = dst;
1168
1169         if (rt) {
1170                 if (dst->obsolete) {
1171                         ip_rt_put(rt);
1172                         ret = NULL;
1173                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1174                            rt->u.dst.expires) {
1175                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1176                                                 rt->fl.oif);
1177 #if RT_CACHE_DEBUG >= 1
1178                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1179                                           "%u.%u.%u.%u/%02x dropped\n",
1180                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1181 #endif
1182                         rt_del(hash, rt);
1183                         ret = NULL;
1184                 }
1185         }
1186         return ret;
1187 }
1188
1189 /*
1190  * Algorithm:
1191  *      1. The first ip_rt_redirect_number redirects are sent
1192  *         with exponential backoff, then we stop sending them at all,
1193  *         assuming that the host ignores our redirects.
1194  *      2. If we did not see packets requiring redirects
1195  *         during ip_rt_redirect_silence, we assume that the host
1196  *         forgot redirected route and start to send redirects again.
1197  *
1198  * This algorithm is much cheaper and more intelligent than dumb load limiting
1199  * in icmp.c.
1200  *
1201  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1202  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1203  */
1204
1205 void ip_rt_send_redirect(struct sk_buff *skb)
1206 {
1207         struct rtable *rt = (struct rtable*)skb->dst;
1208         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1209
1210         if (!in_dev)
1211                 return;
1212
1213         if (!IN_DEV_TX_REDIRECTS(in_dev))
1214                 goto out;
1215
1216         /* No redirected packets during ip_rt_redirect_silence;
1217          * reset the algorithm.
1218          */
1219         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1220                 rt->u.dst.rate_tokens = 0;
1221
1222         /* Too many ignored redirects; do not send anything
1223          * set u.dst.rate_last to the last seen redirected packet.
1224          */
1225         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1226                 rt->u.dst.rate_last = jiffies;
1227                 goto out;
1228         }
1229
1230         /* Check for load limit; set rate_last to the latest sent
1231          * redirect.
1232          */
1233         if (rt->u.dst.rate_tokens == 0 ||
1234             time_after(jiffies,
1235                        (rt->u.dst.rate_last +
1236                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1237                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1238                 rt->u.dst.rate_last = jiffies;
1239                 ++rt->u.dst.rate_tokens;
1240 #ifdef CONFIG_IP_ROUTE_VERBOSE
1241                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1242                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1243                     net_ratelimit())
1244                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1245                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1246                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1247                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1248 #endif
1249         }
1250 out:
1251         in_dev_put(in_dev);
1252 }
1253
1254 static int ip_error(struct sk_buff *skb)
1255 {
1256         struct rtable *rt = (struct rtable*)skb->dst;
1257         unsigned long now;
1258         int code;
1259
1260         switch (rt->u.dst.error) {
1261                 case EINVAL:
1262                 default:
1263                         goto out;
1264                 case EHOSTUNREACH:
1265                         code = ICMP_HOST_UNREACH;
1266                         break;
1267                 case ENETUNREACH:
1268                         code = ICMP_NET_UNREACH;
1269                         break;
1270                 case EACCES:
1271                         code = ICMP_PKT_FILTERED;
1272                         break;
1273         }
1274
1275         now = jiffies;
1276         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1277         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1278                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1279         rt->u.dst.rate_last = now;
1280         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1281                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1282                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1283         }
1284
1285 out:    kfree_skb(skb);
1286         return 0;
1287 }
1288
/*
 *	MTU plateau table, in descending order.  The final two entries do
 *	not come from the RFC; AMPRnet AX.25 paths need them.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau that is strictly smaller than @old_mtu,
 * or 68 when @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end =
		mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	for (; step != end; step++) {
		if (*step < old_mtu)
			return *step;
	}
	return 68;
}
1306
1307 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1308 {
1309         int i;
1310         unsigned short old_mtu = ntohs(iph->tot_len);
1311         struct rtable *rth;
1312         __be32  skeys[2] = { iph->saddr, 0, };
1313         __be32  daddr = iph->daddr;
1314         unsigned short est_mtu = 0;
1315
1316         if (ipv4_config.no_pmtu_disc)
1317                 return 0;
1318
1319         for (i = 0; i < 2; i++) {
1320                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1321
1322                 rcu_read_lock();
1323                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1324                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1325                         if (rth->fl.fl4_dst == daddr &&
1326                             rth->fl.fl4_src == skeys[i] &&
1327                             rth->rt_dst  == daddr &&
1328                             rth->rt_src  == iph->saddr &&
1329                             rth->fl.iif == 0 &&
1330                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1331                                 unsigned short mtu = new_mtu;
1332
1333                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1334
1335                                         /* BSD 4.2 compatibility hack :-( */
1336                                         if (mtu == 0 &&
1337                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1338                                             old_mtu >= 68 + (iph->ihl << 2))
1339                                                 old_mtu -= iph->ihl << 2;
1340
1341                                         mtu = guess_mtu(old_mtu);
1342                                 }
1343                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1344                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1345                                                 dst_confirm(&rth->u.dst);
1346                                                 if (mtu < ip_rt_min_pmtu) {
1347                                                         mtu = ip_rt_min_pmtu;
1348                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1349                                                                 (1 << RTAX_MTU);
1350                                                 }
1351                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1352                                                 dst_set_expires(&rth->u.dst,
1353                                                         ip_rt_mtu_expires);
1354                                         }
1355                                         est_mtu = mtu;
1356                                 }
1357                         }
1358                 }
1359                 rcu_read_unlock();
1360         }
1361         return est_mtu ? : new_mtu;
1362 }
1363
1364 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1365 {
1366         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1367             !(dst_metric_locked(dst, RTAX_MTU))) {
1368                 if (mtu < ip_rt_min_pmtu) {
1369                         mtu = ip_rt_min_pmtu;
1370                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1371                 }
1372                 dst->metrics[RTAX_MTU-1] = mtu;
1373                 dst_set_expires(dst, ip_rt_mtu_expires);
1374                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1375         }
1376 }
1377
1378 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1379 {
1380         return NULL;
1381 }
1382
1383 static void ipv4_dst_destroy(struct dst_entry *dst)
1384 {
1385         struct rtable *rt = (struct rtable *) dst;
1386         struct inet_peer *peer = rt->peer;
1387         struct in_device *idev = rt->idev;
1388
1389         if (peer) {
1390                 rt->peer = NULL;
1391                 inet_putpeer(peer);
1392         }
1393
1394         if (idev) {
1395                 rt->idev = NULL;
1396                 in_dev_put(idev);
1397         }
1398 }
1399
1400 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1401                             int how)
1402 {
1403         struct rtable *rt = (struct rtable *) dst;
1404         struct in_device *idev = rt->idev;
1405         if (dev != loopback_dev && idev && idev->dev == dev) {
1406                 struct in_device *loopback_idev = in_dev_get(loopback_dev);
1407                 if (loopback_idev) {
1408                         rt->idev = loopback_idev;
1409                         in_dev_put(idev);
1410                 }
1411         }
1412 }
1413
1414 static void ipv4_link_failure(struct sk_buff *skb)
1415 {
1416         struct rtable *rt;
1417
1418         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1419
1420         rt = (struct rtable *) skb->dst;
1421         if (rt)
1422                 dst_set_expires(&rt->u.dst, 0);
1423 }
1424
1425 static int ip_rt_bug(struct sk_buff *skb)
1426 {
1427         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1428                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1429                 skb->dev ? skb->dev->name : "?");
1430         kfree_skb(skb);
1431         return 0;
1432 }
1433
1434 /*
1435    We do not cache source address of outgoing interface,
1436    because it is used only by IP RR, TS and SRR options,
1437    so that it out of fast path.
1438
1439    BTW remember: "addr" is allowed to be not aligned
1440    in IP options!
1441  */
1442
1443 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1444 {
1445         __be32 src;
1446         struct fib_result res;
1447
1448         if (rt->fl.iif == 0)
1449                 src = rt->rt_src;
1450         else if (fib_lookup(&rt->fl, &res) == 0) {
1451                 src = FIB_RES_PREFSRC(res);
1452                 fib_res_put(&res);
1453         } else
1454                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1455                                         RT_SCOPE_UNIVERSE);
1456         memcpy(addr, &src, 4);
1457 }
1458
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a half
 * is taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half  = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;
	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1468
1469 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1470 {
1471         struct fib_info *fi = res->fi;
1472
1473         if (fi) {
1474                 if (FIB_RES_GW(*res) &&
1475                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1476                         rt->rt_gateway = FIB_RES_GW(*res);
1477                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1478                        sizeof(rt->u.dst.metrics));
1479                 if (fi->fib_mtu == 0) {
1480                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1481                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1482                             rt->rt_gateway != rt->rt_dst &&
1483                             rt->u.dst.dev->mtu > 576)
1484                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1485                 }
1486 #ifdef CONFIG_NET_CLS_ROUTE
1487                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1488 #endif
1489         } else
1490                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1491
1492         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1493                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1494         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1495                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1496         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1497                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1498                                        ip_rt_min_advmss);
1499         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1500                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1501
1502 #ifdef CONFIG_NET_CLS_ROUTE
1503 #ifdef CONFIG_IP_MULTIPLE_TABLES
1504         set_class_tag(rt, fib_rules_tclass(res));
1505 #endif
1506         set_class_tag(rt, itag);
1507 #endif
1508         rt->rt_type = res->type;
1509 }
1510
1511 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1512                                 u8 tos, struct net_device *dev, int our)
1513 {
1514         unsigned hash;
1515         struct rtable *rth;
1516         __be32 spec_dst;
1517         struct in_device *in_dev = in_dev_get(dev);
1518         u32 itag = 0;
1519
1520         /* Primary sanity checks. */
1521
1522         if (in_dev == NULL)
1523                 return -EINVAL;
1524
1525         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1526             skb->protocol != htons(ETH_P_IP))
1527                 goto e_inval;
1528
1529         if (ZERONET(saddr)) {
1530                 if (!LOCAL_MCAST(daddr))
1531                         goto e_inval;
1532                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1533         } else if (fib_validate_source(saddr, 0, tos, 0,
1534                                         dev, &spec_dst, &itag) < 0)
1535                 goto e_inval;
1536
1537         rth = dst_alloc(&ipv4_dst_ops);
1538         if (!rth)
1539                 goto e_nobufs;
1540
1541         rth->u.dst.output= ip_rt_bug;
1542
1543         atomic_set(&rth->u.dst.__refcnt, 1);
1544         rth->u.dst.flags= DST_HOST;
1545         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1546                 rth->u.dst.flags |= DST_NOPOLICY;
1547         rth->fl.fl4_dst = daddr;
1548         rth->rt_dst     = daddr;
1549         rth->fl.fl4_tos = tos;
1550         rth->fl.mark    = skb->mark;
1551         rth->fl.fl4_src = saddr;
1552         rth->rt_src     = saddr;
1553 #ifdef CONFIG_NET_CLS_ROUTE
1554         rth->u.dst.tclassid = itag;
1555 #endif
1556         rth->rt_iif     =
1557         rth->fl.iif     = dev->ifindex;
1558         rth->u.dst.dev  = loopback_dev;
1559         dev_hold(rth->u.dst.dev);
1560         rth->idev       = in_dev_get(rth->u.dst.dev);
1561         rth->fl.oif     = 0;
1562         rth->rt_gateway = daddr;
1563         rth->rt_spec_dst= spec_dst;
1564         rth->rt_type    = RTN_MULTICAST;
1565         rth->rt_flags   = RTCF_MULTICAST;
1566         if (our) {
1567                 rth->u.dst.input= ip_local_deliver;
1568                 rth->rt_flags |= RTCF_LOCAL;
1569         }
1570
1571 #ifdef CONFIG_IP_MROUTE
1572         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1573                 rth->u.dst.input = ip_mr_input;
1574 #endif
1575         RT_CACHE_STAT_INC(in_slow_mc);
1576
1577         in_dev_put(in_dev);
1578         hash = rt_hash(daddr, saddr, dev->ifindex);
1579         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1580
1581 e_nobufs:
1582         in_dev_put(in_dev);
1583         return -ENOBUFS;
1584
1585 e_inval:
1586         in_dev_put(in_dev);
1587         return -EINVAL;
1588 }
1589
1590
1591 static void ip_handle_martian_source(struct net_device *dev,
1592                                      struct in_device *in_dev,
1593                                      struct sk_buff *skb,
1594                                      __be32 daddr,
1595                                      __be32 saddr)
1596 {
1597         RT_CACHE_STAT_INC(in_martian_src);
1598 #ifdef CONFIG_IP_ROUTE_VERBOSE
1599         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1600                 /*
1601                  *      RFC1812 recommendation, if source is martian,
1602                  *      the only hint is MAC header.
1603                  */
1604                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1605                         "%u.%u.%u.%u, on dev %s\n",
1606                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1607                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1608                         int i;
1609                         const unsigned char *p = skb_mac_header(skb);
1610                         printk(KERN_WARNING "ll header: ");
1611                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1612                                 printk("%02x", *p);
1613                                 if (i < (dev->hard_header_len - 1))
1614                                         printk(":");
1615                         }
1616                         printk("\n");
1617                 }
1618         }
1619 #endif
1620 }
1621
1622 static inline int __mkroute_input(struct sk_buff *skb,
1623                                   struct fib_result* res,
1624                                   struct in_device *in_dev,
1625                                   __be32 daddr, __be32 saddr, u32 tos,
1626                                   struct rtable **result)
1627 {
1628
1629         struct rtable *rth;
1630         int err;
1631         struct in_device *out_dev;
1632         unsigned flags = 0;
1633         __be32 spec_dst;
1634         u32 itag;
1635
1636         /* get a working reference to the output device */
1637         out_dev = in_dev_get(FIB_RES_DEV(*res));
1638         if (out_dev == NULL) {
1639                 if (net_ratelimit())
1640                         printk(KERN_CRIT "Bug in ip_route_input" \
1641                                "_slow(). Please, report\n");
1642                 return -EINVAL;
1643         }
1644
1645
1646         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1647                                   in_dev->dev, &spec_dst, &itag);
1648         if (err < 0) {
1649                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1650                                          saddr);
1651
1652                 err = -EINVAL;
1653                 goto cleanup;
1654         }
1655
1656         if (err)
1657                 flags |= RTCF_DIRECTSRC;
1658
1659         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1660             (IN_DEV_SHARED_MEDIA(out_dev) ||
1661              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1662                 flags |= RTCF_DOREDIRECT;
1663
1664         if (skb->protocol != htons(ETH_P_IP)) {
1665                 /* Not IP (i.e. ARP). Do not create route, if it is
1666                  * invalid for proxy arp. DNAT routes are always valid.
1667                  */
1668                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1669                         err = -EINVAL;
1670                         goto cleanup;
1671                 }
1672         }
1673
1674
1675         rth = dst_alloc(&ipv4_dst_ops);
1676         if (!rth) {
1677                 err = -ENOBUFS;
1678                 goto cleanup;
1679         }
1680
1681         atomic_set(&rth->u.dst.__refcnt, 1);
1682         rth->u.dst.flags= DST_HOST;
1683         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1684                 rth->u.dst.flags |= DST_NOPOLICY;
1685         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1686                 rth->u.dst.flags |= DST_NOXFRM;
1687         rth->fl.fl4_dst = daddr;
1688         rth->rt_dst     = daddr;
1689         rth->fl.fl4_tos = tos;
1690         rth->fl.mark    = skb->mark;
1691         rth->fl.fl4_src = saddr;
1692         rth->rt_src     = saddr;
1693         rth->rt_gateway = daddr;
1694         rth->rt_iif     =
1695                 rth->fl.iif     = in_dev->dev->ifindex;
1696         rth->u.dst.dev  = (out_dev)->dev;
1697         dev_hold(rth->u.dst.dev);
1698         rth->idev       = in_dev_get(rth->u.dst.dev);
1699         rth->fl.oif     = 0;
1700         rth->rt_spec_dst= spec_dst;
1701
1702         rth->u.dst.input = ip_forward;
1703         rth->u.dst.output = ip_output;
1704
1705         rt_set_nexthop(rth, res, itag);
1706
1707         rth->rt_flags = flags;
1708
1709         *result = rth;
1710         err = 0;
1711  cleanup:
1712         /* release the working reference to the output device */
1713         in_dev_put(out_dev);
1714         return err;
1715 }
1716
1717 static inline int ip_mkroute_input(struct sk_buff *skb,
1718                                    struct fib_result* res,
1719                                    const struct flowi *fl,
1720                                    struct in_device *in_dev,
1721                                    __be32 daddr, __be32 saddr, u32 tos)
1722 {
1723         struct rtable* rth = NULL;
1724         int err;
1725         unsigned hash;
1726
1727 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1728         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1729                 fib_select_multipath(fl, res);
1730 #endif
1731
1732         /* create a routing cache entry */
1733         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1734         if (err)
1735                 return err;
1736
1737         /* put it into the cache */
1738         hash = rt_hash(daddr, saddr, fl->iif);
1739         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1740 }
1741
1742 /*
1743  *      NOTE. We drop all the packets that has local source
1744  *      addresses, because every properly looped back packet
1745  *      must have correct destination already attached by output routine.
1746  *
1747  *      Such approach solves two big problems:
1748  *      1. Not simplex devices are handled properly.
1749  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1750  */
1751
1752 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1753                                u8 tos, struct net_device *dev)
1754 {
1755         struct fib_result res;
1756         struct in_device *in_dev = in_dev_get(dev);
1757         struct flowi fl = { .nl_u = { .ip4_u =
1758                                       { .daddr = daddr,
1759                                         .saddr = saddr,
1760                                         .tos = tos,
1761                                         .scope = RT_SCOPE_UNIVERSE,
1762                                       } },
1763                             .mark = skb->mark,
1764                             .iif = dev->ifindex };
1765         unsigned        flags = 0;
1766         u32             itag = 0;
1767         struct rtable * rth;
1768         unsigned        hash;
1769         __be32          spec_dst;
1770         int             err = -EINVAL;
1771         int             free_res = 0;
1772
1773         /* IP on this device is disabled. */
1774
1775         if (!in_dev)
1776                 goto out;
1777
1778         /* Check for the most weird martians, which can be not detected
1779            by fib_lookup.
1780          */
1781
1782         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1783                 goto martian_source;
1784
1785         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1786                 goto brd_input;
1787
1788         /* Accept zero addresses only to limited broadcast;
1789          * I even do not know to fix it or not. Waiting for complains :-)
1790          */
1791         if (ZERONET(saddr))
1792                 goto martian_source;
1793
1794         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1795                 goto martian_destination;
1796
1797         /*
1798          *      Now we are ready to route packet.
1799          */
1800         if ((err = fib_lookup(&fl, &res)) != 0) {
1801                 if (!IN_DEV_FORWARD(in_dev))
1802                         goto e_hostunreach;
1803                 goto no_route;
1804         }
1805         free_res = 1;
1806
1807         RT_CACHE_STAT_INC(in_slow_tot);
1808
1809         if (res.type == RTN_BROADCAST)
1810                 goto brd_input;
1811
1812         if (res.type == RTN_LOCAL) {
1813                 int result;
1814                 result = fib_validate_source(saddr, daddr, tos,
1815                                              loopback_dev->ifindex,
1816                                              dev, &spec_dst, &itag);
1817                 if (result < 0)
1818                         goto martian_source;
1819                 if (result)
1820                         flags |= RTCF_DIRECTSRC;
1821                 spec_dst = daddr;
1822                 goto local_input;
1823         }
1824
1825         if (!IN_DEV_FORWARD(in_dev))
1826                 goto e_hostunreach;
1827         if (res.type != RTN_UNICAST)
1828                 goto martian_destination;
1829
1830         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1831         if (err == -ENOBUFS)
1832                 goto e_nobufs;
1833         if (err == -EINVAL)
1834                 goto e_inval;
1835
1836 done:
1837         in_dev_put(in_dev);
1838         if (free_res)
1839                 fib_res_put(&res);
1840 out:    return err;
1841
1842 brd_input:
1843         if (skb->protocol != htons(ETH_P_IP))
1844                 goto e_inval;
1845
1846         if (ZERONET(saddr))
1847                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1848         else {
1849                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1850                                           &itag);
1851                 if (err < 0)
1852                         goto martian_source;
1853                 if (err)
1854                         flags |= RTCF_DIRECTSRC;
1855         }
1856         flags |= RTCF_BROADCAST;
1857         res.type = RTN_BROADCAST;
1858         RT_CACHE_STAT_INC(in_brd);
1859
1860 local_input:
1861         rth = dst_alloc(&ipv4_dst_ops);
1862         if (!rth)
1863                 goto e_nobufs;
1864
1865         rth->u.dst.output= ip_rt_bug;
1866
1867         atomic_set(&rth->u.dst.__refcnt, 1);
1868         rth->u.dst.flags= DST_HOST;
1869         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1870                 rth->u.dst.flags |= DST_NOPOLICY;
1871         rth->fl.fl4_dst = daddr;
1872         rth->rt_dst     = daddr;
1873         rth->fl.fl4_tos = tos;
1874         rth->fl.mark    = skb->mark;
1875         rth->fl.fl4_src = saddr;
1876         rth->rt_src     = saddr;
1877 #ifdef CONFIG_NET_CLS_ROUTE
1878         rth->u.dst.tclassid = itag;
1879 #endif
1880         rth->rt_iif     =
1881         rth->fl.iif     = dev->ifindex;
1882         rth->u.dst.dev  = loopback_dev;
1883         dev_hold(rth->u.dst.dev);
1884         rth->idev       = in_dev_get(rth->u.dst.dev);
1885         rth->rt_gateway = daddr;
1886         rth->rt_spec_dst= spec_dst;
1887         rth->u.dst.input= ip_local_deliver;
1888         rth->rt_flags   = flags|RTCF_LOCAL;
1889         if (res.type == RTN_UNREACHABLE) {
1890                 rth->u.dst.input= ip_error;
1891                 rth->u.dst.error= -err;
1892                 rth->rt_flags   &= ~RTCF_LOCAL;
1893         }
1894         rth->rt_type    = res.type;
1895         hash = rt_hash(daddr, saddr, fl.iif);
1896         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1897         goto done;
1898
1899 no_route:
1900         RT_CACHE_STAT_INC(in_no_route);
1901         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1902         res.type = RTN_UNREACHABLE;
1903         goto local_input;
1904
1905         /*
1906          *      Do not cache martian addresses: they should be logged (RFC1812)
1907          */
1908 martian_destination:
1909         RT_CACHE_STAT_INC(in_martian_dst);
1910 #ifdef CONFIG_IP_ROUTE_VERBOSE
1911         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1912                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1913                         "%u.%u.%u.%u, dev %s\n",
1914                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1915 #endif
1916
1917 e_hostunreach:
1918         err = -EHOSTUNREACH;
1919         goto done;
1920
1921 e_inval:
1922         err = -EINVAL;
1923         goto done;
1924
1925 e_nobufs:
1926         err = -ENOBUFS;
1927         goto done;
1928
1929 martian_source:
1930         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1931         goto e_inval;
1932 }
1933
1934 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1935                    u8 tos, struct net_device *dev)
1936 {
1937         struct rtable * rth;
1938         unsigned        hash;
1939         int iif = dev->ifindex;
1940
1941         tos &= IPTOS_RT_MASK;
1942         hash = rt_hash(daddr, saddr, iif);
1943
1944         rcu_read_lock();
1945         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1946              rth = rcu_dereference(rth->u.dst.rt_next)) {
1947                 if (rth->fl.fl4_dst == daddr &&
1948                     rth->fl.fl4_src == saddr &&
1949                     rth->fl.iif == iif &&
1950                     rth->fl.oif == 0 &&
1951                     rth->fl.mark == skb->mark &&
1952                     rth->fl.fl4_tos == tos) {
1953                         rth->u.dst.lastuse = jiffies;
1954                         dst_hold(&rth->u.dst);
1955                         rth->u.dst.__use++;
1956                         RT_CACHE_STAT_INC(in_hit);
1957                         rcu_read_unlock();
1958                         skb->dst = (struct dst_entry*)rth;
1959                         return 0;
1960                 }
1961                 RT_CACHE_STAT_INC(in_hlist_search);
1962         }
1963         rcu_read_unlock();
1964
1965         /* Multicast recognition logic is moved from route cache to here.
1966            The problem was that too many Ethernet cards have broken/missing
1967            hardware multicast filters :-( As result the host on multicasting
1968            network acquires a lot of useless route cache entries, sort of
1969            SDR messages from all the world. Now we try to get rid of them.
1970            Really, provided software IP multicast filter is organized
1971            reasonably (at least, hashed), it does not result in a slowdown
1972            comparing with route cache reject entries.
1973            Note, that multicast routers are not affected, because
1974            route cache entry is created eventually.
1975          */
1976         if (MULTICAST(daddr)) {
1977                 struct in_device *in_dev;
1978
1979                 rcu_read_lock();
1980                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1981                         int our = ip_check_mc(in_dev, daddr, saddr,
1982                                 ip_hdr(skb)->protocol);
1983                         if (our
1984 #ifdef CONFIG_IP_MROUTE
1985                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1986 #endif
1987                             ) {
1988                                 rcu_read_unlock();
1989                                 return ip_route_input_mc(skb, daddr, saddr,
1990                                                          tos, dev, our);
1991                         }
1992                 }
1993                 rcu_read_unlock();
1994                 return -EINVAL;
1995         }
1996         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1997 }
1998
1999 static inline int __mkroute_output(struct rtable **result,
2000                                    struct fib_result* res,
2001                                    const struct flowi *fl,
2002                                    const struct flowi *oldflp,
2003                                    struct net_device *dev_out,
2004                                    unsigned flags)
2005 {
2006         struct rtable *rth;
2007         struct in_device *in_dev;
2008         u32 tos = RT_FL_TOS(oldflp);
2009         int err = 0;
2010
2011         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2012                 return -EINVAL;
2013
2014         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2015                 res->type = RTN_BROADCAST;
2016         else if (MULTICAST(fl->fl4_dst))
2017                 res->type = RTN_MULTICAST;
2018         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2019                 return -EINVAL;
2020
2021         if (dev_out->flags & IFF_LOOPBACK)
2022                 flags |= RTCF_LOCAL;
2023
2024         /* get work reference to inet device */
2025         in_dev = in_dev_get(dev_out);
2026         if (!in_dev)
2027                 return -EINVAL;
2028
2029         if (res->type == RTN_BROADCAST) {
2030                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2031                 if (res->fi) {
2032                         fib_info_put(res->fi);
2033                         res->fi = NULL;
2034                 }
2035         } else if (res->type == RTN_MULTICAST) {
2036                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2037                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2038                                  oldflp->proto))
2039                         flags &= ~RTCF_LOCAL;
2040                 /* If multicast route do not exist use
2041                    default one, but do not gateway in this case.
2042                    Yes, it is hack.
2043                  */
2044                 if (res->fi && res->prefixlen < 4) {
2045                         fib_info_put(res->fi);
2046                         res->fi = NULL;
2047                 }
2048         }
2049
2050
2051         rth = dst_alloc(&ipv4_dst_ops);
2052         if (!rth) {
2053                 err = -ENOBUFS;
2054                 goto cleanup;
2055         }
2056
2057         atomic_set(&rth->u.dst.__refcnt, 1);
2058         rth->u.dst.flags= DST_HOST;
2059         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2060                 rth->u.dst.flags |= DST_NOXFRM;
2061         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2062                 rth->u.dst.flags |= DST_NOPOLICY;
2063
2064         rth->fl.fl4_dst = oldflp->fl4_dst;
2065         rth->fl.fl4_tos = tos;
2066         rth->fl.fl4_src = oldflp->fl4_src;
2067         rth->fl.oif     = oldflp->oif;
2068         rth->fl.mark    = oldflp->mark;
2069         rth->rt_dst     = fl->fl4_dst;
2070         rth->rt_src     = fl->fl4_src;
2071         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2072         /* get references to the devices that are to be hold by the routing
2073            cache entry */
2074         rth->u.dst.dev  = dev_out;
2075         dev_hold(dev_out);
2076         rth->idev       = in_dev_get(dev_out);
2077         rth->rt_gateway = fl->fl4_dst;
2078         rth->rt_spec_dst= fl->fl4_src;
2079
2080         rth->u.dst.output=ip_output;
2081
2082         RT_CACHE_STAT_INC(out_slow_tot);
2083
2084         if (flags & RTCF_LOCAL) {
2085                 rth->u.dst.input = ip_local_deliver;
2086                 rth->rt_spec_dst = fl->fl4_dst;
2087         }
2088         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2089                 rth->rt_spec_dst = fl->fl4_src;
2090                 if (flags & RTCF_LOCAL &&
2091                     !(dev_out->flags & IFF_LOOPBACK)) {
2092                         rth->u.dst.output = ip_mc_output;
2093                         RT_CACHE_STAT_INC(out_slow_mc);
2094                 }
2095 #ifdef CONFIG_IP_MROUTE
2096                 if (res->type == RTN_MULTICAST) {
2097                         if (IN_DEV_MFORWARD(in_dev) &&
2098                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2099                                 rth->u.dst.input = ip_mr_input;
2100                                 rth->u.dst.output = ip_mc_output;
2101                         }
2102                 }
2103 #endif
2104         }
2105
2106         rt_set_nexthop(rth, res, 0);
2107
2108         rth->rt_flags = flags;
2109
2110         *result = rth;
2111  cleanup:
2112         /* release work reference to inet device */
2113         in_dev_put(in_dev);
2114
2115         return err;
2116 }
2117
2118 static inline int ip_mkroute_output(struct rtable **rp,
2119                                     struct fib_result* res,
2120                                     const struct flowi *fl,
2121                                     const struct flowi *oldflp,
2122                                     struct net_device *dev_out,
2123                                     unsigned flags)
2124 {
2125         struct rtable *rth = NULL;
2126         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2127         unsigned hash;
2128         if (err == 0) {
2129                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2130                 err = rt_intern_hash(hash, rth, rp);
2131         }
2132
2133         return err;
2134 }
2135
2136 /*
2137  * Major route resolver routine.
2138  */
2139
2140 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2141 {
2142         u32 tos = RT_FL_TOS(oldflp);
2143         struct flowi fl = { .nl_u = { .ip4_u =
2144                                       { .daddr = oldflp->fl4_dst,
2145                                         .saddr = oldflp->fl4_src,
2146                                         .tos = tos & IPTOS_RT_MASK,
2147                                         .scope = ((tos & RTO_ONLINK) ?
2148                                                   RT_SCOPE_LINK :
2149                                                   RT_SCOPE_UNIVERSE),
2150                                       } },
2151                             .mark = oldflp->mark,
2152                             .iif = loopback_dev->ifindex,
2153                             .oif = oldflp->oif };
2154         struct fib_result res;
2155         unsigned flags = 0;
2156         struct net_device *dev_out = NULL;
2157         int free_res = 0;
2158         int err;
2159
2160
2161         res.fi          = NULL;
2162 #ifdef CONFIG_IP_MULTIPLE_TABLES
2163         res.r           = NULL;
2164 #endif
2165
2166         if (oldflp->fl4_src) {
2167                 err = -EINVAL;
2168                 if (MULTICAST(oldflp->fl4_src) ||
2169                     BADCLASS(oldflp->fl4_src) ||
2170                     ZERONET(oldflp->fl4_src))
2171                         goto out;
2172
2173                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2174                 dev_out = ip_dev_find(oldflp->fl4_src);
2175                 if (dev_out == NULL)
2176                         goto out;
2177
2178                 /* I removed check for oif == dev_out->oif here.
2179                    It was wrong for two reasons:
2180                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2181                       assigned to multiple interfaces.
2182                    2. Moreover, we are allowed to send packets with saddr
2183                       of another iface. --ANK
2184                  */
2185
2186                 if (oldflp->oif == 0
2187                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2188                         /* Special hack: user can direct multicasts
2189                            and limited broadcast via necessary interface
2190                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2191                            This hack is not just for fun, it allows
2192                            vic,vat and friends to work.
2193                            They bind socket to loopback, set ttl to zero
2194                            and expect that it will work.
2195                            From the viewpoint of routing cache they are broken,
2196                            because we are not allowed to build multicast path
2197                            with loopback source addr (look, routing cache
2198                            cannot know, that ttl is zero, so that packet
2199                            will not leave this host and route is valid).
2200                            Luckily, this hack is good workaround.
2201                          */
2202
2203                         fl.oif = dev_out->ifindex;
2204                         goto make_route;
2205                 }
2206                 if (dev_out)
2207                         dev_put(dev_out);
2208                 dev_out = NULL;
2209         }
2210
2211
2212         if (oldflp->oif) {
2213                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2214                 err = -ENODEV;
2215                 if (dev_out == NULL)
2216                         goto out;
2217
2218                 /* RACE: Check return value of inet_select_addr instead. */
2219                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2220                         dev_put(dev_out);
2221                         goto out;       /* Wrong error code */
2222                 }
2223
2224                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2225                         if (!fl.fl4_src)
2226                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2227                                                               RT_SCOPE_LINK);
2228                         goto make_route;
2229                 }
2230                 if (!fl.fl4_src) {
2231                         if (MULTICAST(oldflp->fl4_dst))
2232                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2233                                                               fl.fl4_scope);
2234                         else if (!oldflp->fl4_dst)
2235                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2236                                                               RT_SCOPE_HOST);
2237                 }
2238         }
2239
2240         if (!fl.fl4_dst) {
2241                 fl.fl4_dst = fl.fl4_src;
2242                 if (!fl.fl4_dst)
2243                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2244                 if (dev_out)
2245                         dev_put(dev_out);
2246                 dev_out = loopback_dev;
2247                 dev_hold(dev_out);
2248                 fl.oif = loopback_dev->ifindex;
2249                 res.type = RTN_LOCAL;
2250                 flags |= RTCF_LOCAL;
2251                 goto make_route;
2252         }
2253
2254         if (fib_lookup(&fl, &res)) {
2255                 res.fi = NULL;
2256                 if (oldflp->oif) {
2257                         /* Apparently, routing tables are wrong. Assume,
2258                            that the destination is on link.
2259
2260                            WHY? DW.
2261                            Because we are allowed to send to iface
2262                            even if it has NO routes and NO assigned
2263                            addresses. When oif is specified, routing
2264                            tables are looked up with only one purpose:
2265                            to catch if destination is gatewayed, rather than
2266                            direct. Moreover, if MSG_DONTROUTE is set,
2267                            we send packet, ignoring both routing tables
2268                            and ifaddr state. --ANK
2269
2270
2271                            We could make it even if oif is unknown,
2272                            likely IPv6, but we do not.
2273                          */
2274
2275                         if (fl.fl4_src == 0)
2276                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2277                                                               RT_SCOPE_LINK);
2278                         res.type = RTN_UNICAST;
2279                         goto make_route;
2280                 }
2281                 if (dev_out)
2282                         dev_put(dev_out);
2283                 err = -ENETUNREACH;
2284                 goto out;
2285         }
2286         free_res = 1;
2287
2288         if (res.type == RTN_LOCAL) {
2289                 if (!fl.fl4_src)
2290                         fl.fl4_src = fl.fl4_dst;
2291                 if (dev_out)
2292                         dev_put(dev_out);
2293                 dev_out = loopback_dev;
2294                 dev_hold(dev_out);
2295                 fl.oif = dev_out->ifindex;
2296                 if (res.fi)
2297                         fib_info_put(res.fi);
2298                 res.fi = NULL;
2299                 flags |= RTCF_LOCAL;
2300                 goto make_route;
2301         }
2302
2303 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2304         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2305                 fib_select_multipath(&fl, &res);
2306         else
2307 #endif
2308         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2309                 fib_select_default(&fl, &res);
2310
2311         if (!fl.fl4_src)
2312                 fl.fl4_src = FIB_RES_PREFSRC(res);
2313
2314         if (dev_out)
2315                 dev_put(dev_out);
2316         dev_out = FIB_RES_DEV(res);
2317         dev_hold(dev_out);
2318         fl.oif = dev_out->ifindex;
2319
2320
2321 make_route:
2322         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2323
2324
2325         if (free_res)
2326                 fib_res_put(&res);
2327         if (dev_out)
2328                 dev_put(dev_out);
2329 out:    return err;
2330 }
2331
2332 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2333 {
2334         unsigned hash;
2335         struct rtable *rth;
2336
2337         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2338
2339         rcu_read_lock_bh();
2340         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2341                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2342                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2343                     rth->fl.fl4_src == flp->fl4_src &&
2344                     rth->fl.iif == 0 &&
2345                     rth->fl.oif == flp->oif &&
2346                     rth->fl.mark == flp->mark &&
2347                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2348                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2349                         rth->u.dst.lastuse = jiffies;
2350                         dst_hold(&rth->u.dst);
2351                         rth->u.dst.__use++;
2352                         RT_CACHE_STAT_INC(out_hit);
2353                         rcu_read_unlock_bh();
2354                         *rp = rth;
2355                         return 0;
2356                 }
2357                 RT_CACHE_STAT_INC(out_hlist_search);
2358         }
2359         rcu_read_unlock_bh();
2360
2361         return ip_route_output_slow(rp, flp);
2362 }
2363
2364 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2365
2366 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2367 {
2368 }
2369
2370 static struct dst_ops ipv4_dst_blackhole_ops = {
2371         .family                 =       AF_INET,
2372         .protocol               =       __constant_htons(ETH_P_IP),
2373         .destroy                =       ipv4_dst_destroy,
2374         .check                  =       ipv4_dst_check,
2375         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2376         .entry_size             =       sizeof(struct rtable),
2377 };
2378
2379
/*
 *	Input/output handler installed on blackhole routes: frees the
 *	packet and reports success.
 */
static int ipv4_blackhole_output(struct sk_buff *skb)
{
	/* the packet is silently discarded */
	kfree_skb(skb);

	return 0;
}
2385
2386 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2387 {
2388         struct rtable *ort = *rp;
2389         struct rtable *rt = (struct rtable *)
2390                 dst_alloc(&ipv4_dst_blackhole_ops);
2391
2392         if (rt) {
2393                 struct dst_entry *new = &rt->u.dst;
2394
2395                 atomic_set(&new->__refcnt, 1);
2396                 new->__use = 1;
2397                 new->input = ipv4_blackhole_output;
2398                 new->output = ipv4_blackhole_output;
2399                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2400
2401                 new->dev = ort->u.dst.dev;
2402                 if (new->dev)
2403                         dev_hold(new->dev);
2404
2405                 rt->fl = ort->fl;
2406
2407                 rt->idev = ort->idev;
2408                 if (rt->idev)
2409                         in_dev_hold(rt->idev);
2410                 rt->rt_flags = ort->rt_flags;
2411                 rt->rt_type = ort->rt_type;
2412                 rt->rt_dst = ort->rt_dst;
2413                 rt->rt_src = ort->rt_src;
2414                 rt->rt_iif = ort->rt_iif;
2415                 rt->rt_gateway = ort->rt_gateway;
2416                 rt->rt_spec_dst = ort->rt_spec_dst;
2417                 rt->peer = ort->peer;
2418                 if (rt->peer)
2419                         atomic_inc(&rt->peer->refcnt);
2420
2421                 dst_free(new);
2422         }
2423
2424         dst_release(&(*rp)->u.dst);
2425         *rp = rt;
2426         return (rt ? 0 : -ENOMEM);
2427 }
2428
2429 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2430 {
2431         int err;
2432
2433         if ((err = __ip_route_output_key(rp, flp)) != 0)
2434                 return err;
2435
2436         if (flp->proto) {
2437                 if (!flp->fl4_src)
2438                         flp->fl4_src = (*rp)->rt_src;
2439                 if (!flp->fl4_dst)
2440                         flp->fl4_dst = (*rp)->rt_dst;
2441                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2442                 if (err == -EREMOTE)
2443                         err = ipv4_dst_blackhole(rp, flp, sk);
2444
2445                 return err;
2446         }
2447
2448         return 0;
2449 }
2450
2451 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2452
2453 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2454 {
2455         return ip_route_output_flow(rp, flp, NULL, 0);
2456 }
2457
2458 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2459                         int nowait, unsigned int flags)
2460 {
2461         struct rtable *rt = (struct rtable*)skb->dst;
2462         struct rtmsg *r;
2463         struct nlmsghdr *nlh;
2464         long expires;
2465         u32 id = 0, ts = 0, tsage = 0, error;
2466
2467         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2468         if (nlh == NULL)
2469                 return -EMSGSIZE;
2470
2471         r = nlmsg_data(nlh);
2472         r->rtm_family    = AF_INET;
2473         r->rtm_dst_len  = 32;
2474         r->rtm_src_len  = 0;
2475         r->rtm_tos      = rt->fl.fl4_tos;
2476         r->rtm_table    = RT_TABLE_MAIN;
2477         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2478         r->rtm_type     = rt->rt_type;
2479         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2480         r->rtm_protocol = RTPROT_UNSPEC;
2481         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2482         if (rt->rt_flags & RTCF_NOTIFY)
2483                 r->rtm_flags |= RTM_F_NOTIFY;
2484
2485         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2486
2487         if (rt->fl.fl4_src) {
2488                 r->rtm_src_len = 32;
2489                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2490         }
2491         if (rt->u.dst.dev)
2492                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2493 #ifdef CONFIG_NET_CLS_ROUTE
2494         if (rt->u.dst.tclassid)
2495                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2496 #endif
2497         if (rt->fl.iif)
2498                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2499         else if (rt->rt_src != rt->fl.fl4_src)
2500                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2501
2502         if (rt->rt_dst != rt->rt_gateway)
2503                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2504
2505         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2506                 goto nla_put_failure;
2507
2508         error = rt->u.dst.error;
2509         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2510         if (rt->peer) {
2511                 id = rt->peer->ip_id_count;
2512                 if (rt->peer->tcp_ts_stamp) {
2513                         ts = rt->peer->tcp_ts;
2514                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2515                 }
2516         }
2517
2518         if (rt->fl.iif) {
2519 #ifdef CONFIG_IP_MROUTE
2520                 __be32 dst = rt->rt_dst;
2521
2522                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2523                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2524                         int err = ipmr_get_route(skb, r, nowait);
2525                         if (err <= 0) {
2526                                 if (!nowait) {
2527                                         if (err == 0)
2528                                                 return 0;
2529                                         goto nla_put_failure;
2530                                 } else {
2531                                         if (err == -EMSGSIZE)
2532                                                 goto nla_put_failure;
2533                                         error = err;
2534                                 }
2535                         }
2536                 } else
2537 #endif
2538                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2539         }
2540
2541         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2542                                expires, error) < 0)
2543                 goto nla_put_failure;
2544
2545         return nlmsg_end(skb, nlh);
2546
2547 nla_put_failure:
2548         nlmsg_cancel(skb, nlh);
2549         return -EMSGSIZE;
2550 }
2551
2552 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2553 {
2554         struct rtmsg *rtm;
2555         struct nlattr *tb[RTA_MAX+1];
2556         struct rtable *rt = NULL;
2557         __be32 dst = 0;
2558         __be32 src = 0;
2559         u32 iif;
2560         int err;
2561         struct sk_buff *skb;
2562
2563         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2564         if (err < 0)
2565                 goto errout;
2566
2567         rtm = nlmsg_data(nlh);
2568
2569         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2570         if (skb == NULL) {
2571                 err = -ENOBUFS;
2572                 goto errout;
2573         }
2574
2575         /* Reserve room for dummy headers, this skb can pass
2576            through good chunk of routing engine.
2577          */
2578         skb_reset_mac_header(skb);
2579         skb_reset_network_header(skb);
2580
2581         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2582         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2583         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2584
2585         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2586         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2587         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2588
2589         if (iif) {
2590                 struct net_device *dev;
2591
2592                 dev = __dev_get_by_index(&init_net, iif);
2593                 if (dev == NULL) {
2594                         err = -ENODEV;
2595                         goto errout_free;
2596                 }
2597
2598                 skb->protocol   = htons(ETH_P_IP);
2599                 skb->dev        = dev;
2600                 local_bh_disable();
2601                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2602                 local_bh_enable();
2603
2604                 rt = (struct rtable*) skb->dst;
2605                 if (err == 0 && rt->u.dst.error)
2606                         err = -rt->u.dst.error;
2607         } else {
2608                 struct flowi fl = {
2609                         .nl_u = {
2610                                 .ip4_u = {
2611                                         .daddr = dst,
2612                                         .saddr = src,
2613                                         .tos = rtm->rtm_tos,
2614                                 },
2615                         },
2616                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2617                 };
2618                 err = ip_route_output_key(&rt, &fl);
2619         }
2620
2621         if (err)
2622                 goto errout_free;
2623
2624         skb->dst = &rt->u.dst;
2625         if (rtm->rtm_flags & RTM_F_NOTIFY)
2626                 rt->rt_flags |= RTCF_NOTIFY;
2627
2628         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2629                                 RTM_NEWROUTE, 0, 0);
2630         if (err <= 0)
2631                 goto errout_free;
2632
2633         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2634 errout:
2635         return err;
2636
2637 errout_free:
2638         kfree_skb(skb);
2639         goto errout;
2640 }
2641
2642 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2643 {
2644         struct rtable *rt;
2645         int h, s_h;
2646         int idx, s_idx;
2647
2648         s_h = cb->args[0];
2649         s_idx = idx = cb->args[1];
2650         for (h = 0; h <= rt_hash_mask; h++) {
2651                 if (h < s_h) continue;
2652                 if (h > s_h)
2653                         s_idx = 0;
2654                 rcu_read_lock_bh();
2655                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2656                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2657                         if (idx < s_idx)
2658                                 continue;
2659                         skb->dst = dst_clone(&rt->u.dst);
2660                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2661                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2662                                          1, NLM_F_MULTI) <= 0) {
2663                                 dst_release(xchg(&skb->dst, NULL));
2664                                 rcu_read_unlock_bh();
2665                                 goto done;
2666                         }
2667                         dst_release(xchg(&skb->dst, NULL));
2668                 }
2669                 rcu_read_unlock_bh();
2670         }
2671
2672 done:
2673         cb->args[0] = h;
2674         cb->args[1] = idx;
2675         return skb->len;
2676 }
2677
/*
 * Multicast event hook: flushes the routing cache with a delay
 * argument of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2682
2683 #ifdef CONFIG_SYSCTL
2684 static int flush_delay;
2685
2686 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2687                                         struct file *filp, void __user *buffer,
2688                                         size_t *lenp, loff_t *ppos)
2689 {
2690         if (write) {
2691                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2692                 rt_cache_flush(flush_delay);
2693                 return 0;
2694         }
2695
2696         return -EINVAL;
2697 }
2698
2699 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2700                                                 int __user *name,
2701                                                 int nlen,
2702                                                 void __user *oldval,
2703                                                 size_t __user *oldlenp,
2704                                                 void __user *newval,
2705                                                 size_t newlen)
2706 {
2707         int delay;
2708         if (newlen != sizeof(int))
2709                 return -EINVAL;
2710         if (get_user(delay, (int __user *)newval))
2711                 return -EFAULT;
2712         rt_cache_flush(delay);
2713         return 0;
2714 }
2715
2716 ctl_table ipv4_route_table[] = {
2717         {
2718                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2719                 .procname       = "flush",
2720                 .data           = &flush_delay,
2721                 .maxlen         = sizeof(int),
2722                 .mode           = 0200,
2723                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2724                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2725         },
2726         {
2727                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2728                 .procname       = "min_delay",
2729                 .data           = &ip_rt_min_delay,
2730                 .maxlen         = sizeof(int),
2731                 .mode           = 0644,
2732                 .proc_handler   = &proc_dointvec_jiffies,
2733                 .strategy       = &sysctl_jiffies,
2734         },
2735         {
2736                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2737                 .procname       = "max_delay",
2738                 .data           = &ip_rt_max_delay,
2739                 .maxlen         = sizeof(int),
2740                 .mode           = 0644,
2741                 .proc_handler   = &proc_dointvec_jiffies,
2742                 .strategy       = &sysctl_jiffies,
2743         },
2744         {
2745                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2746                 .procname       = "gc_thresh",
2747                 .data           = &ipv4_dst_ops.gc_thresh,
2748                 .maxlen         = sizeof(int),
2749                 .mode           = 0644,
2750                 .proc_handler   = &proc_dointvec,
2751         },
2752         {
2753                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2754                 .procname       = "max_size",
2755                 .data           = &ip_rt_max_size,
2756                 .maxlen         = sizeof(int),
2757                 .mode           = 0644,
2758                 .proc_handler   = &proc_dointvec,
2759         },
2760         {
2761                 /*  Deprecated. Use gc_min_interval_ms */
2762
2763                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2764                 .procname       = "gc_min_interval",
2765                 .data           = &ip_rt_gc_min_interval,
2766                 .maxlen         = sizeof(int),
2767                 .mode           = 0644,
2768                 .proc_handler   = &proc_dointvec_jiffies,
2769                 .strategy       = &sysctl_jiffies,
2770         },
2771         {
2772                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2773                 .procname       = "gc_min_interval_ms",
2774                 .data           = &ip_rt_gc_min_interval,
2775                 .maxlen         = sizeof(int),
2776                 .mode           = 0644,
2777                 .proc_handler   = &proc_dointvec_ms_jiffies,
2778                 .strategy       = &sysctl_ms_jiffies,
2779         },
2780         {
2781                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2782                 .procname       = "gc_timeout",
2783                 .data           = &ip_rt_gc_timeout,
2784                 .maxlen         = sizeof(int),
2785                 .mode           = 0644,
2786                 .proc_handler   = &proc_dointvec_jiffies,
2787                 .strategy       = &sysctl_jiffies,
2788         },
2789         {
2790                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2791                 .procname       = "gc_interval",
2792                 .data           = &ip_rt_gc_interval,
2793                 .maxlen         = sizeof(int),
2794                 .mode           = 0644,
2795                 .proc_handler   = &proc_dointvec_jiffies,
2796                 .strategy       = &sysctl_jiffies,
2797         },
2798         {
2799                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2800                 .procname       = "redirect_load",
2801                 .data           = &ip_rt_redirect_load,
2802                 .maxlen         = sizeof(int),
2803                 .mode           = 0644,
2804                 .proc_handler   = &proc_dointvec,
2805         },
2806         {
2807                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2808                 .procname       = "redirect_number",
2809                 .data           = &ip_rt_redirect_number,
2810                 .maxlen         = sizeof(int),
2811                 .mode           = 0644,
2812                 .proc_handler   = &proc_dointvec,
2813         },
2814         {
2815                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2816                 .procname       = "redirect_silence",
2817                 .data           = &ip_rt_redirect_silence,
2818                 .maxlen         = sizeof(int),
2819                 .mode           = 0644,
2820                 .proc_handler   = &proc_dointvec,
2821         },
2822         {
2823                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2824                 .procname       = "error_cost",
2825                 .data           = &ip_rt_error_cost,
2826                 .maxlen         = sizeof(int),
2827                 .mode           = 0644,
2828                 .proc_handler   = &proc_dointvec,
2829         },
2830         {
2831                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2832                 .procname       = "error_burst",
2833                 .data           = &ip_rt_error_burst,
2834                 .maxlen         = sizeof(int),
2835                 .mode           = 0644,
2836                 .proc_handler   = &proc_dointvec,
2837         },
2838         {
2839                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2840                 .procname       = "gc_elasticity",
2841                 .data           = &ip_rt_gc_elasticity,
2842                 .maxlen         = sizeof(int),
2843                 .mode           = 0644,
2844                 .proc_handler   = &proc_dointvec,
2845         },
2846         {
2847                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2848                 .procname       = "mtu_expires",
2849                 .data           = &ip_rt_mtu_expires,
2850                 .maxlen         = sizeof(int),
2851                 .mode           = 0644,
2852                 .proc_handler   = &proc_dointvec_jiffies,
2853                 .strategy       = &sysctl_jiffies,
2854         },
2855         {
2856                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2857                 .procname       = "min_pmtu",
2858                 .data           = &ip_rt_min_pmtu,
2859                 .maxlen         = sizeof(int),
2860                 .mode           = 0644,
2861                 .proc_handler   = &proc_dointvec,
2862         },
2863         {
2864                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2865                 .procname       = "min_adv_mss",
2866                 .data           = &ip_rt_min_advmss,
2867                 .maxlen         = sizeof(int),
2868                 .mode           = 0644,
2869                 .proc_handler   = &proc_dointvec,
2870         },
2871         {
2872                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2873                 .procname       = "secret_interval",
2874                 .data           = &ip_rt_secret_interval,
2875                 .maxlen         = sizeof(int),
2876                 .mode           = 0644,
2877                 .proc_handler   = &proc_dointvec_jiffies,
2878                 .strategy       = &sysctl_jiffies,
2879         },
2880         { .ctl_name = 0 }
2881 };
2882 #endif
2883
2884 #ifdef CONFIG_NET_CLS_ROUTE
/* Route accounting counters: one block of 256 entries per cpu. */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 * The argument is parenthesised so that expressions such as
 * IP_RT_ACCT_CPU(a + b) select the intended per-cpu block.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2891
2892 #ifdef CONFIG_PROC_FS
2893 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2894                            int length, int *eof, void *data)
2895 {
2896         unsigned int i;
2897
2898         if ((offset & 3) || (length & 3))
2899                 return -EIO;
2900
2901         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2902                 *eof = 1;
2903                 return 0;
2904         }
2905
2906         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2907                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2908                 *eof = 1;
2909         }
2910
2911         offset /= sizeof(u32);
2912
2913         if (length > 0) {
2914                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2915                 u32 *dst = (u32 *) buffer;
2916
2917                 /* Copy first cpu. */
2918                 *start = buffer;
2919                 memcpy(dst, src, length);
2920
2921                 /* Add the other cpus in, one int at a time */
2922                 for_each_possible_cpu(i) {
2923                         unsigned int j;
2924
2925                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2926
2927                         for (j = 0; j < length/4; j++)
2928                                 dst[j] += src[j];
2929                 }
2930         }
2931         return length;
2932 }
2933 #endif /* CONFIG_PROC_FS */
2934 #endif /* CONFIG_NET_CLS_ROUTE */
2935
2936 static __initdata unsigned long rhash_entries;
2937 static int __init set_rhash_entries(char *str)
2938 {
2939         if (!str)
2940                 return 0;
2941         rhash_entries = simple_strtoul(str, &str, 0);
2942         return 1;
2943 }
2944 __setup("rhash_entries=", set_rhash_entries);
2945
2946 int __init ip_rt_init(void)
2947 {
2948         int rc = 0;
2949
2950         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2951                              (jiffies ^ (jiffies >> 7)));
2952
2953 #ifdef CONFIG_NET_CLS_ROUTE
2954         {
2955         int order;
2956         for (order = 0;
2957              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2958                 /* NOTHING */;
2959         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2960         if (!ip_rt_acct)
2961                 panic("IP: failed to allocate ip_rt_acct\n");
2962         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2963         }
2964 #endif
2965
2966         ipv4_dst_ops.kmem_cachep =
2967                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2968                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2969
2970         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2971
2972         rt_hash_table = (struct rt_hash_bucket *)
2973                 alloc_large_system_hash("IP route cache",
2974                                         sizeof(struct rt_hash_bucket),
2975                                         rhash_entries,
2976                                         (num_physpages >= 128 * 1024) ?
2977                                         15 : 17,
2978                                         0,
2979                                         &rt_hash_log,
2980                                         &rt_hash_mask,
2981                                         0);
2982         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2983         rt_hash_lock_init();
2984
2985         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2986         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2987
2988         devinet_init();
2989         ip_fib_init();
2990
2991         init_timer(&rt_flush_timer);
2992         rt_flush_timer.function = rt_run_flush;
2993         init_timer(&rt_secret_timer);
2994         rt_secret_timer.function = rt_secret_rebuild;
2995
2996         /* All the timers, started at system startup tend
2997            to synchronize. Perturb it a bit.
2998          */
2999         schedule_delayed_work(&expires_work,
3000                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3001
3002         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3003                 ip_rt_secret_interval;
3004         add_timer(&rt_secret_timer);
3005
3006 #ifdef CONFIG_PROC_FS
3007         {
3008         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3009         if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3010             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3011                                              init_net.proc_net_stat))) {
3012                 return -ENOMEM;
3013         }
3014         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3015         }
3016 #ifdef CONFIG_NET_CLS_ROUTE
3017         create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3018 #endif
3019 #endif
3020 #ifdef CONFIG_XFRM
3021         xfrm_init();
3022         xfrm4_init();
3023 #endif
3024         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3025
3026         return rc;
3027 }
3028
3029 EXPORT_SYMBOL(__ip_select_ident);
3030 EXPORT_SYMBOL(ip_route_input);
3031 EXPORT_SYMBOL(ip_route_output_key);