2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
113 #define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116 #define IP_MAX_MTU 0xFFF0
118 #define RT_GC_TIMEOUT (300*HZ)
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly = 9;
125 static int ip_rt_redirect_load __read_mostly = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly = HZ;
128 static int ip_rt_error_burst __read_mostly = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly = 8;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
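/*
 * Note (illustrative, not in the original source): most of the tunables
 * above are normally exposed through sysctl under
 * /proc/sys/net/ipv4/route/ (gc_timeout, gc_interval, min_pmtu,
 * secret_interval, ...), via a ctl_table registered later in this file.
 */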
135 static void rt_worker_func(struct work_struct *work);
136 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
137 static struct timer_list rt_secret_timer;
140 * Interface to generic destination cache.
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
145 static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void ipv4_link_failure(struct sk_buff *skb);
149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
153 static struct dst_ops ipv4_dst_ops = {
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
163 .local_out = ip_local_out,
164 .entry_size = sizeof(struct rtable),
165 .entries = ATOMIC_INIT(0),
168 #define ECN_OR_COST(class) TC_PRIO_##class
170 const __u8 ip_tos2prio[16] = {
174 ECN_OR_COST(BESTEFFORT),
180 ECN_OR_COST(INTERACTIVE),
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
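/*
 * Illustrative note (not in the original source): this table is indexed by
 * the four TOS bits shifted right by one, as done by rt_tos2priority() in
 * <net/route.h>:
 *
 *	prio = rt_tos2priority(iph->tos);
 *
 * e.g. IPTOS_LOWDELAY (0x10) selects slot 8, which is TC_PRIO_INTERACTIVE
 * in the full table.
 */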
194 /* The locking scheme is rather straightforward:
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
204 struct rt_hash_bucket {
205 struct rtable *chain;
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 defined(CONFIG_PROVE_LOCKING)
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211 * The size of this table is a power of two and depends on the number of CPUs.
212 * (with lockdep, spinlock_t is quite big, so keep the size down there)
214 #ifdef CONFIG_LOCKDEP
215 # define RT_HASH_LOCK_SZ 256
218 # define RT_HASH_LOCK_SZ 4096
220 # define RT_HASH_LOCK_SZ 2048
222 # define RT_HASH_LOCK_SZ 1024
224 # define RT_HASH_LOCK_SZ 512
226 # define RT_HASH_LOCK_SZ 256
230 static spinlock_t *rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
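/*
 * Illustrative sketch (not part of the original code) of how the locking
 * scheme above is used: readers walk a chain under rcu_read_lock_bh() with
 * rcu_dereference(), while writers take the hashed bucket lock before
 * unlinking an entry and defer the actual free to an RCU grace period:
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;		unlink under the bucket lock
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *	rt_free(rth);				freed once readers are done
 */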
233 static __init void rt_hash_lock_init(void)
237 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240 panic("IP: failed to allocate rt_hash_locks\n");
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 spin_lock_init(&rt_hash_locks[i]);
246 # define rt_hash_lock_addr(slot) NULL
248 static inline void rt_hash_lock_init(void)
253 static struct rt_hash_bucket *rt_hash_table __read_mostly;
254 static unsigned rt_hash_mask __read_mostly;
255 static unsigned int rt_hash_log __read_mostly;
256 static atomic_t rt_genid __read_mostly;
258 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
259 #define RT_CACHE_STAT_INC(field) \
260 (__raw_get_cpu_var(rt_cache_stat).field++)
262 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
264 return jhash_3words((__force u32)(__be32)(daddr),
265 (__force u32)(__be32)(saddr),
266 idx, atomic_read(&rt_genid))
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272 struct seq_net_private p;
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
279 struct rt_cache_iter_state *st = seq->private;
280 struct rtable *r = NULL;
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284 r = rcu_dereference(rt_hash_table[st->bucket].chain);
286 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
287 r->rt_genid == st->genid)
289 r = rcu_dereference(r->u.dst.rt_next);
291 rcu_read_unlock_bh();
296 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 struct rt_cache_iter_state *st = seq->private;
300 r = r->u.dst.rt_next;
302 rcu_read_unlock_bh();
303 if (--st->bucket < 0)
306 r = rt_hash_table[st->bucket].chain;
308 return rcu_dereference(r);
311 static struct rtable *rt_cache_get_next(struct seq_file *seq,
314 struct rt_cache_iter_state *st = seq->private;
315 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
316 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
318 if (r->rt_genid == st->genid)
324 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
326 struct rtable *r = rt_cache_get_first(seq);
329 while (pos && (r = rt_cache_get_next(seq, r)))
331 return pos ? NULL : r;
334 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
336 struct rt_cache_iter_state *st = seq->private;
338 return rt_cache_get_idx(seq, *pos - 1);
339 st->genid = atomic_read(&rt_genid);
340 return SEQ_START_TOKEN;
343 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
347 if (v == SEQ_START_TOKEN)
348 r = rt_cache_get_first(seq);
350 r = rt_cache_get_next(seq, v);
355 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
357 if (v && v != SEQ_START_TOKEN)
358 rcu_read_unlock_bh();
361 static int rt_cache_seq_show(struct seq_file *seq, void *v)
363 if (v == SEQ_START_TOKEN)
364 seq_printf(seq, "%-127s\n",
365 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
366 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
369 struct rtable *r = v;
372 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
373 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
374 r->u.dst.dev ? r->u.dst.dev->name : "*",
375 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
376 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
377 r->u.dst.__use, 0, (unsigned long)r->rt_src,
378 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
379 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
380 dst_metric(&r->u.dst, RTAX_WINDOW),
381 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
382 dst_metric(&r->u.dst, RTAX_RTTVAR)),
384 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
385 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
388 seq_printf(seq, "%-127s\n", temp);
393 static const struct seq_operations rt_cache_seq_ops = {
394 .start = rt_cache_seq_start,
395 .next = rt_cache_seq_next,
396 .stop = rt_cache_seq_stop,
397 .show = rt_cache_seq_show,
400 static int rt_cache_seq_open(struct inode *inode, struct file *file)
402 return seq_open_net(inode, file, &rt_cache_seq_ops,
403 sizeof(struct rt_cache_iter_state));
406 static const struct file_operations rt_cache_seq_fops = {
407 .owner = THIS_MODULE,
408 .open = rt_cache_seq_open,
411 .release = seq_release_net,
415 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
420 return SEQ_START_TOKEN;
422 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
423 if (!cpu_possible(cpu))
426 return &per_cpu(rt_cache_stat, cpu);
431 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
436 if (!cpu_possible(cpu))
439 return &per_cpu(rt_cache_stat, cpu);
445 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
450 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
452 struct rt_cache_stat *st = v;
454 if (v == SEQ_START_TOKEN) {
455 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
459 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
460 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
461 atomic_read(&ipv4_dst_ops.entries),
484 static const struct seq_operations rt_cpu_seq_ops = {
485 .start = rt_cpu_seq_start,
486 .next = rt_cpu_seq_next,
487 .stop = rt_cpu_seq_stop,
488 .show = rt_cpu_seq_show,
492 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
494 return seq_open(file, &rt_cpu_seq_ops);
497 static const struct file_operations rt_cpu_seq_fops = {
498 .owner = THIS_MODULE,
499 .open = rt_cpu_seq_open,
502 .release = seq_release,
505 #ifdef CONFIG_NET_CLS_ROUTE
506 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
507 int length, int *eof, void *data)
511 if ((offset & 3) || (length & 3))
514 if (offset >= sizeof(struct ip_rt_acct) * 256) {
519 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
520 length = sizeof(struct ip_rt_acct) * 256 - offset;
524 offset /= sizeof(u32);
527 u32 *dst = (u32 *) buffer;
530 memset(dst, 0, length);
532 for_each_possible_cpu(i) {
536 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
537 for (j = 0; j < length/4; j++)
545 static int __net_init ip_rt_do_proc_init(struct net *net)
547 struct proc_dir_entry *pde;
549 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
554 pde = proc_create("rt_cache", S_IRUGO,
555 net->proc_net_stat, &rt_cpu_seq_fops);
559 #ifdef CONFIG_NET_CLS_ROUTE
560 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
561 ip_rt_acct_read, NULL);
567 #ifdef CONFIG_NET_CLS_ROUTE
569 remove_proc_entry("rt_cache", net->proc_net_stat);
572 remove_proc_entry("rt_cache", net->proc_net);
577 static void __net_exit ip_rt_do_proc_exit(struct net *net)
579 remove_proc_entry("rt_cache", net->proc_net_stat);
580 remove_proc_entry("rt_cache", net->proc_net);
581 remove_proc_entry("rt_acct", net->proc_net);
584 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
585 .init = ip_rt_do_proc_init,
586 .exit = ip_rt_do_proc_exit,
589 static int __init ip_rt_proc_init(void)
591 return register_pernet_subsys(&ip_rt_proc_ops);
595 static inline int ip_rt_proc_init(void)
599 #endif /* CONFIG_PROC_FS */
601 static inline void rt_free(struct rtable *rt)
603 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
606 static inline void rt_drop(struct rtable *rt)
609 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612 static inline int rt_fast_clean(struct rtable *rth)
614 /* Kill broadcast/multicast entries very aggressively, if they
615 collide in the hash table with more useful entries */
616 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
617 rth->fl.iif && rth->u.dst.rt_next;
620 static inline int rt_valuable(struct rtable *rth)
622 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
626 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
631 if (atomic_read(&rth->u.dst.__refcnt))
635 if (rth->u.dst.expires &&
636 time_after_eq(jiffies, rth->u.dst.expires))
639 age = jiffies - rth->u.dst.lastuse;
641 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
642 (age <= tmo2 && rt_valuable(rth)))
648 /* Bits of score are:
650 * 30: not quite useless
651 * 29..0: usage counter
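 *
 * Illustrative example (not in the original comment): in rt_intern_hash()
 * the unreferenced entry with the lowest score in a chain becomes the
 * eviction candidate "cand", so the least recently used, least valuable
 * entry is dropped first when the chain grows too long.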
653 static inline u32 rt_score(struct rtable *rt)
655 u32 score = jiffies - rt->u.dst.lastuse;
657 score = ~score & ~(3<<30);
663 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
669 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
671 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
672 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
673 (fl1->mark ^ fl2->mark) |
674 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
675 *(u16 *)&fl2->nl_u.ip4_u.tos) |
676 (fl1->oif ^ fl2->oif) |
677 (fl1->iif ^ fl2->iif)) == 0;
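/*
 * Worked note (illustrative): OR-ing the XORs of all key fields and then
 * comparing with zero is equivalent to checking each field for equality,
 * but compiles to straight-line code with a single branch:
 *
 *	((a ^ b) | (c ^ d)) == 0   <=>   (a == b) && (c == d)
 */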
680 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
682 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
686 * Perform a full scan of the hash table and free all entries.
687 * Can be called by a softirq or a process.
688 * In the latter case, we want to be rescheduled if necessary
690 static void rt_do_flush(int process_context)
693 struct rtable *rth, *next;
695 for (i = 0; i <= rt_hash_mask; i++) {
696 if (process_context && need_resched())
698 rth = rt_hash_table[i].chain;
702 spin_lock_bh(rt_hash_lock_addr(i));
703 rth = rt_hash_table[i].chain;
704 rt_hash_table[i].chain = NULL;
705 spin_unlock_bh(rt_hash_lock_addr(i));
707 for (; rth; rth = next) {
708 next = rth->u.dst.rt_next;
714 static void rt_check_expire(void)
716 static unsigned int rover;
717 unsigned int i = rover, goal;
718 struct rtable *rth, **rthp;
721 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
722 if (ip_rt_gc_timeout > 1)
723 do_div(mult, ip_rt_gc_timeout);
724 goal = (unsigned int)mult;
725 if (goal > rt_hash_mask)
726 goal = rt_hash_mask + 1;
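/*
 * Worked example (illustrative): with the default tunables above,
 * goal = (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout
 *      = (60*HZ * hash_size) / (300*HZ) = hash_size / 5,
 * i.e. each run scans about a fifth of the table, so the whole table is
 * covered roughly once per ip_rt_gc_timeout.
 */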
727 for (; goal > 0; goal--) {
728 unsigned long tmo = ip_rt_gc_timeout;
730 i = (i + 1) & rt_hash_mask;
731 rthp = &rt_hash_table[i].chain;
738 spin_lock_bh(rt_hash_lock_addr(i));
739 while ((rth = *rthp) != NULL) {
740 if (rth->rt_genid != atomic_read(&rt_genid)) {
741 *rthp = rth->u.dst.rt_next;
745 if (rth->u.dst.expires) {
746 /* Entry is expired even if it is in use */
747 if (time_before_eq(jiffies, rth->u.dst.expires)) {
749 rthp = &rth->u.dst.rt_next;
752 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
754 rthp = &rth->u.dst.rt_next;
758 /* Cleanup aged off entries. */
759 *rthp = rth->u.dst.rt_next;
762 spin_unlock_bh(rt_hash_lock_addr(i));
768 * rt_worker_func() is run in process context.
769 * We call rt_check_expire() to scan part of the hash table
771 static void rt_worker_func(struct work_struct *work)
774 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
778 * Perturbation of rt_genid by a small quantity [1..256].
779 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
780 * many times (2^24) without repeating a recent rt_genid.
781 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
783 static void rt_cache_invalidate(void)
785 unsigned char shuffle;
787 get_random_bytes(&shuffle, sizeof(shuffle));
788 atomic_add(shuffle + 1U, &rt_genid);
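/*
 * Illustrative arithmetic: each invalidation advances rt_genid by 1..256,
 * so a 32-bit genid can absorb roughly 2^32 / 2^8 = 2^24 invalidations
 * before it wraps back near a recently used value.
 */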
792 * delay < 0 : invalidate cache (fast : entries will be deleted later)
793 * delay >= 0 : invalidate & flush cache (can be long)
795 void rt_cache_flush(int delay)
797 rt_cache_invalidate();
799 rt_do_flush(!in_softirq());
803 * We change rt_genid and let gc do the cleanup
805 static void rt_secret_rebuild(unsigned long dummy)
807 rt_cache_invalidate();
808 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
812 Short description of GC goals.
814 We want to build an algorithm which keeps the routing cache
815 at some equilibrium point, where the number of aged-off entries
816 is kept approximately equal to the number of newly generated ones.
818 The current expiration strength is the variable "expire".
819 We try to adjust it dynamically, so that when the network
820 is idle, expire is large enough to keep enough warm entries,
821 and when load increases it shrinks to limit the cache size.
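
 Worked example (illustrative, with the default tunables): in
 rt_garbage_collect() below, goal = entries - (ip_rt_gc_elasticity <<
 rt_hash_log); with ip_rt_gc_elasticity = 8 and a 65536-bucket table the
 cache has to exceed 8 * 65536 = 524288 entries before the normal goal
 becomes positive and expiration starts working in earnest.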
824 static int rt_garbage_collect(struct dst_ops *ops)
826 static unsigned long expire = RT_GC_TIMEOUT;
827 static unsigned long last_gc;
829 static int equilibrium;
830 struct rtable *rth, **rthp;
831 unsigned long now = jiffies;
835 * Garbage collection is pretty expensive,
836 * so do not run it too frequently.
839 RT_CACHE_STAT_INC(gc_total);
841 if (now - last_gc < ip_rt_gc_min_interval &&
842 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
843 RT_CACHE_STAT_INC(gc_ignored);
847 /* Calculate the number of entries we want to expire now. */
848 goal = atomic_read(&ipv4_dst_ops.entries) -
849 (ip_rt_gc_elasticity << rt_hash_log);
851 if (equilibrium < ipv4_dst_ops.gc_thresh)
852 equilibrium = ipv4_dst_ops.gc_thresh;
853 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
855 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
856 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
859 /* We are in a dangerous area. Try to reduce the cache really
862 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
863 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
866 if (now - last_gc >= ip_rt_gc_min_interval)
877 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
878 unsigned long tmo = expire;
880 k = (k + 1) & rt_hash_mask;
881 rthp = &rt_hash_table[k].chain;
882 spin_lock_bh(rt_hash_lock_addr(k));
883 while ((rth = *rthp) != NULL) {
884 if (rth->rt_genid == atomic_read(&rt_genid) &&
885 !rt_may_expire(rth, tmo, expire)) {
887 rthp = &rth->u.dst.rt_next;
890 *rthp = rth->u.dst.rt_next;
894 spin_unlock_bh(rt_hash_lock_addr(k));
903 /* The goal is not achieved. We stop the process if:
905 - expire has been reduced to zero (otherwise, expire is halved),
906 - the table is not full,
907 - we are called from interrupt context,
908 - the jiffies check is just a fallback/debug loop breaker.
909 We will not spin here for a long time in any case.
912 RT_CACHE_STAT_INC(gc_goal_miss);
918 #if RT_CACHE_DEBUG >= 2
919 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
920 atomic_read(&ipv4_dst_ops.entries), goal, i);
923 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
925 } while (!in_softirq() && time_before_eq(jiffies, now));
927 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
930 printk(KERN_WARNING "dst cache overflow\n");
931 RT_CACHE_STAT_INC(gc_dst_overflow);
935 expire += ip_rt_gc_min_interval;
936 if (expire > ip_rt_gc_timeout ||
937 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
938 expire = ip_rt_gc_timeout;
939 #if RT_CACHE_DEBUG >= 2
940 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
941 atomic_read(&ipv4_dst_ops.entries), goal, rover);
946 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
948 struct rtable *rth, **rthp;
950 struct rtable *cand, **candp;
953 int attempts = !in_softirq();
962 rthp = &rt_hash_table[hash].chain;
964 spin_lock_bh(rt_hash_lock_addr(hash));
965 while ((rth = *rthp) != NULL) {
966 if (rth->rt_genid != atomic_read(&rt_genid)) {
967 *rthp = rth->u.dst.rt_next;
971 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
973 *rthp = rth->u.dst.rt_next;
975 * Since lookup is lock-free, the deletion
976 * must be visible to another weakly ordered CPU before
977 * the insertion at the start of the hash chain.
979 rcu_assign_pointer(rth->u.dst.rt_next,
980 rt_hash_table[hash].chain);
982 * Since lookup is lock-free, the update writes
983 * must be ordered for consistency on SMP.
985 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
987 dst_use(&rth->u.dst, now);
988 spin_unlock_bh(rt_hash_lock_addr(hash));
995 if (!atomic_read(&rth->u.dst.__refcnt)) {
996 u32 score = rt_score(rth);
998 if (score <= min_score) {
1007 rthp = &rth->u.dst.rt_next;
1011 /* ip_rt_gc_elasticity used to be the average chain
1012 * length; when it is exceeded, gc becomes really aggressive.
1014 * The second limit is less certain. At the moment it allows
1015 * only 2 entries per bucket. We will see.
1017 if (chain_length > ip_rt_gc_elasticity) {
1018 *candp = cand->u.dst.rt_next;
1023 /* Try to bind the route to an ARP neighbour only if it is an output
1024 route or on the unicast forwarding path.
1026 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1027 int err = arp_bind_neighbour(&rt->u.dst);
1029 spin_unlock_bh(rt_hash_lock_addr(hash));
1031 if (err != -ENOBUFS) {
1036 /* Neighbour tables are full and nothing
1037 can be released. Try to shrink the route cache;
1038 it most likely holds some neighbour records.
1040 if (attempts-- > 0) {
1041 int saved_elasticity = ip_rt_gc_elasticity;
1042 int saved_int = ip_rt_gc_min_interval;
1043 ip_rt_gc_elasticity = 1;
1044 ip_rt_gc_min_interval = 0;
1045 rt_garbage_collect(&ipv4_dst_ops);
1046 ip_rt_gc_min_interval = saved_int;
1047 ip_rt_gc_elasticity = saved_elasticity;
1051 if (net_ratelimit())
1052 printk(KERN_WARNING "Neighbour table overflow.\n");
1058 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1059 #if RT_CACHE_DEBUG >= 2
1060 if (rt->u.dst.rt_next) {
1062 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1063 NIPQUAD(rt->rt_dst));
1064 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1065 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1069 rt_hash_table[hash].chain = rt;
1070 spin_unlock_bh(rt_hash_lock_addr(hash));
1075 void rt_bind_peer(struct rtable *rt, int create)
1077 static DEFINE_SPINLOCK(rt_peer_lock);
1078 struct inet_peer *peer;
1080 peer = inet_getpeer(rt->rt_dst, create);
1082 spin_lock_bh(&rt_peer_lock);
1083 if (rt->peer == NULL) {
1087 spin_unlock_bh(&rt_peer_lock);
1093 * Peer allocation may fail only in serious out-of-memory conditions. However,
1094 * we can still generate some output.
1095 * Random ID selection looks a bit dangerous because we have no chance of
1096 * selecting an ID that is unique within a reasonable period of time.
1097 * But a broken packet identifier may be better than no packet at all.
1099 static void ip_select_fb_ident(struct iphdr *iph)
1101 static DEFINE_SPINLOCK(ip_fb_id_lock);
1102 static u32 ip_fallback_id;
1105 spin_lock_bh(&ip_fb_id_lock);
1106 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1107 iph->id = htons(salt & 0xFFFF);
1108 ip_fallback_id = salt;
1109 spin_unlock_bh(&ip_fb_id_lock);
1112 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 struct rtable *rt = (struct rtable *) dst;
1117 if (rt->peer == NULL)
1118 rt_bind_peer(rt, 1);
1120 /* If a peer is attached to the destination, it is never detached,
1121 so we do not need to grab a lock to dereference it.
1124 iph->id = htons(inet_getid(rt->peer, more));
1128 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1129 __builtin_return_address(0));
1131 ip_select_fb_ident(iph);
1134 static void rt_del(unsigned hash, struct rtable *rt)
1136 struct rtable **rthp, *aux;
1138 rthp = &rt_hash_table[hash].chain;
1139 spin_lock_bh(rt_hash_lock_addr(hash));
1141 while ((aux = *rthp) != NULL) {
1142 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1143 *rthp = aux->u.dst.rt_next;
1147 rthp = &aux->u.dst.rt_next;
1149 spin_unlock_bh(rt_hash_lock_addr(hash));
1152 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1153 __be32 saddr, struct net_device *dev)
1156 struct in_device *in_dev = in_dev_get(dev);
1157 struct rtable *rth, **rthp;
1158 __be32 skeys[2] = { saddr, 0 };
1159 int ikeys[2] = { dev->ifindex, 0 };
1160 struct netevent_redirect netevent;
1167 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1168 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1169 || ipv4_is_zeronet(new_gw))
1170 goto reject_redirect;
1172 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1173 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1174 goto reject_redirect;
1175 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1176 goto reject_redirect;
1178 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1179 goto reject_redirect;
1182 for (i = 0; i < 2; i++) {
1183 for (k = 0; k < 2; k++) {
1184 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186 rthp=&rt_hash_table[hash].chain;
1189 while ((rth = rcu_dereference(*rthp)) != NULL) {
1192 if (rth->fl.fl4_dst != daddr ||
1193 rth->fl.fl4_src != skeys[i] ||
1194 rth->fl.oif != ikeys[k] ||
1196 rth->rt_genid != atomic_read(&rt_genid) ||
1197 !net_eq(dev_net(rth->u.dst.dev), net)) {
1198 rthp = &rth->u.dst.rt_next;
1202 if (rth->rt_dst != daddr ||
1203 rth->rt_src != saddr ||
1205 rth->rt_gateway != old_gw ||
1206 rth->u.dst.dev != dev)
1209 dst_hold(&rth->u.dst);
1212 rt = dst_alloc(&ipv4_dst_ops);
1219 /* Copy all the information. */
1221 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1222 rt->u.dst.__use = 1;
1223 atomic_set(&rt->u.dst.__refcnt, 1);
1224 rt->u.dst.child = NULL;
1226 dev_hold(rt->u.dst.dev);
1228 in_dev_hold(rt->idev);
1229 rt->u.dst.obsolete = 0;
1230 rt->u.dst.lastuse = jiffies;
1231 rt->u.dst.path = &rt->u.dst;
1232 rt->u.dst.neighbour = NULL;
1233 rt->u.dst.hh = NULL;
1234 rt->u.dst.xfrm = NULL;
1235 rt->rt_genid = atomic_read(&rt_genid);
1236 rt->rt_flags |= RTCF_REDIRECTED;
1238 /* Gateway is different ... */
1239 rt->rt_gateway = new_gw;
1241 /* Redirect received -> path was valid */
1242 dst_confirm(&rth->u.dst);
1245 atomic_inc(&rt->peer->refcnt);
1247 if (arp_bind_neighbour(&rt->u.dst) ||
1248 !(rt->u.dst.neighbour->nud_state &
1250 if (rt->u.dst.neighbour)
1251 neigh_event_send(rt->u.dst.neighbour, NULL);
1257 netevent.old = &rth->u.dst;
1258 netevent.new = &rt->u.dst;
1259 call_netevent_notifiers(NETEVENT_REDIRECT,
1263 if (!rt_intern_hash(hash, rt, &rt))
1276 #ifdef CONFIG_IP_ROUTE_VERBOSE
1277 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1278 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1279 "%u.%u.%u.%u ignored.\n"
1280 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1281 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1282 NIPQUAD(saddr), NIPQUAD(daddr));
1287 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 struct rtable *rt = (struct rtable *)dst;
1290 struct dst_entry *ret = dst;
1293 if (dst->obsolete) {
1296 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1297 rt->u.dst.expires) {
1298 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300 #if RT_CACHE_DEBUG >= 1
1301 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1302 "%u.%u.%u.%u/%02x dropped\n",
1303 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1314 * 1. The first ip_rt_redirect_number redirects are sent
1315 * with exponential backoff, then we stop sending them at all,
1316 * assuming that the host ignores our redirects.
1317 * 2. If we did not see any packets requiring redirects
1318 * during ip_rt_redirect_silence, we assume that the host
1319 * forgot the redirected route and start sending redirects again.
1321 * This algorithm is much cheaper and more intelligent than dumb load limiting
1324 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1325 * and "frag. need" (breaks PMTU discovery) in icmp.c.
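 *
 * Worked example (illustrative, assuming HZ=1000): ip_rt_redirect_load is
 * HZ/50 = 20ms, so after the first redirect the k-th one is delayed by
 * 20ms << k (40ms, 80ms, ..., 5.12s). Once ip_rt_redirect_number (9)
 * redirects have been sent we go quiet, and only start again after
 * ip_rt_redirect_silence = (HZ/50) << 10, i.e. about 20.5 seconds without
 * seeing packets that would need a redirect.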
1328 void ip_rt_send_redirect(struct sk_buff *skb)
1330 struct rtable *rt = skb->rtable;
1331 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1336 if (!IN_DEV_TX_REDIRECTS(in_dev))
1339 /* No redirected packets during ip_rt_redirect_silence;
1340 * reset the algorithm.
1342 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1343 rt->u.dst.rate_tokens = 0;
1345 /* Too many ignored redirects; do not send anything, just
1346 * set u.dst.rate_last to the last seen redirected packet.
1348 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1349 rt->u.dst.rate_last = jiffies;
1353 /* Check for load limit; set rate_last to the latest sent
1356 if (rt->u.dst.rate_tokens == 0 ||
1358 (rt->u.dst.rate_last +
1359 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1360 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1361 rt->u.dst.rate_last = jiffies;
1362 ++rt->u.dst.rate_tokens;
1363 #ifdef CONFIG_IP_ROUTE_VERBOSE
1364 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1365 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1368 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1369 NIPQUAD(rt->rt_src), rt->rt_iif,
1370 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1377 static int ip_error(struct sk_buff *skb)
1379 struct rtable *rt = skb->rtable;
1383 switch (rt->u.dst.error) {
1388 code = ICMP_HOST_UNREACH;
1391 code = ICMP_NET_UNREACH;
1392 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1395 code = ICMP_PKT_FILTERED;
1400 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1401 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1402 rt->u.dst.rate_tokens = ip_rt_error_burst;
1403 rt->u.dst.rate_last = now;
1404 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1405 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1406 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1409 out: kfree_skb(skb);
1414 * The last two values are not from the RFC but
1415 * are needed for AMPRnet AX.25 paths.
1418 static const unsigned short mtu_plateau[] =
1419 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1421 static inline unsigned short guess_mtu(unsigned short old_mtu)
1425 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1426 if (old_mtu > mtu_plateau[i])
1427 return mtu_plateau[i];
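/*
 * Worked example (illustrative): if a router quotes old_mtu = 1500 without
 * a usable next-hop MTU, guess_mtu() walks the plateau table and returns
 * 1492, the first plateau strictly below 1500; for old_mtu = 576 it would
 * return 296.
 */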
1431 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1432 unsigned short new_mtu)
1435 unsigned short old_mtu = ntohs(iph->tot_len);
1437 __be32 skeys[2] = { iph->saddr, 0, };
1438 __be32 daddr = iph->daddr;
1439 unsigned short est_mtu = 0;
1441 if (ipv4_config.no_pmtu_disc)
1444 for (i = 0; i < 2; i++) {
1445 unsigned hash = rt_hash(daddr, skeys[i], 0);
1448 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1449 rth = rcu_dereference(rth->u.dst.rt_next)) {
1450 if (rth->fl.fl4_dst == daddr &&
1451 rth->fl.fl4_src == skeys[i] &&
1452 rth->rt_dst == daddr &&
1453 rth->rt_src == iph->saddr &&
1455 !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1456 net_eq(dev_net(rth->u.dst.dev), net) &&
1457 rth->rt_genid == atomic_read(&rt_genid)) {
1458 unsigned short mtu = new_mtu;
1460 if (new_mtu < 68 || new_mtu >= old_mtu) {
1462 /* BSD 4.2 compatibility hack :-( */
1464 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1465 old_mtu >= 68 + (iph->ihl << 2))
1466 old_mtu -= iph->ihl << 2;
1468 mtu = guess_mtu(old_mtu);
1470 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1471 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1472 dst_confirm(&rth->u.dst);
1473 if (mtu < ip_rt_min_pmtu) {
1474 mtu = ip_rt_min_pmtu;
1475 rth->u.dst.metrics[RTAX_LOCK-1] |=
1478 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1479 dst_set_expires(&rth->u.dst,
1488 return est_mtu ? : new_mtu;
1491 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1493 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1494 !(dst_metric_locked(dst, RTAX_MTU))) {
1495 if (mtu < ip_rt_min_pmtu) {
1496 mtu = ip_rt_min_pmtu;
1497 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1499 dst->metrics[RTAX_MTU-1] = mtu;
1500 dst_set_expires(dst, ip_rt_mtu_expires);
1501 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1505 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1510 static void ipv4_dst_destroy(struct dst_entry *dst)
1512 struct rtable *rt = (struct rtable *) dst;
1513 struct inet_peer *peer = rt->peer;
1514 struct in_device *idev = rt->idev;
1527 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1530 struct rtable *rt = (struct rtable *) dst;
1531 struct in_device *idev = rt->idev;
1532 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1533 struct in_device *loopback_idev =
1534 in_dev_get(dev_net(dev)->loopback_dev);
1535 if (loopback_idev) {
1536 rt->idev = loopback_idev;
1542 static void ipv4_link_failure(struct sk_buff *skb)
1546 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1550 dst_set_expires(&rt->u.dst, 0);
1553 static int ip_rt_bug(struct sk_buff *skb)
1555 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1556 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1557 skb->dev ? skb->dev->name : "?");
1563 We do not cache the source address of the outgoing interface,
1564 because it is used only by the IP RR, TS and SRR options,
1565 so it is out of the fast path.
1567 BTW remember: "addr" is allowed to be unaligned
1571 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1574 struct fib_result res;
1576 if (rt->fl.iif == 0)
1578 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1579 src = FIB_RES_PREFSRC(res);
1582 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1584 memcpy(addr, &src, 4);
1587 #ifdef CONFIG_NET_CLS_ROUTE
1588 static void set_class_tag(struct rtable *rt, u32 tag)
1590 if (!(rt->u.dst.tclassid & 0xFFFF))
1591 rt->u.dst.tclassid |= tag & 0xFFFF;
1592 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1593 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1597 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1599 struct fib_info *fi = res->fi;
1602 if (FIB_RES_GW(*res) &&
1603 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1604 rt->rt_gateway = FIB_RES_GW(*res);
1605 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1606 sizeof(rt->u.dst.metrics));
1607 if (fi->fib_mtu == 0) {
1608 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1609 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1610 rt->rt_gateway != rt->rt_dst &&
1611 rt->u.dst.dev->mtu > 576)
1612 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1614 #ifdef CONFIG_NET_CLS_ROUTE
1615 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1618 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1620 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1621 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1622 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1623 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1624 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1625 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1627 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1628 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
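/*
 * Worked example (illustrative): for a standard 1500-byte MTU Ethernet
 * device with no advmss metric configured, the advertised MSS above
 * defaults to 1500 - 40 = 1460 bytes.
 */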
1630 #ifdef CONFIG_NET_CLS_ROUTE
1631 #ifdef CONFIG_IP_MULTIPLE_TABLES
1632 set_class_tag(rt, fib_rules_tclass(res));
1634 set_class_tag(rt, itag);
1636 rt->rt_type = res->type;
1639 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640 u8 tos, struct net_device *dev, int our)
1645 struct in_device *in_dev = in_dev_get(dev);
1648 /* Primary sanity checks. */
1653 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1654 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1657 if (ipv4_is_zeronet(saddr)) {
1658 if (!ipv4_is_local_multicast(daddr))
1660 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1661 } else if (fib_validate_source(saddr, 0, tos, 0,
1662 dev, &spec_dst, &itag) < 0)
1665 rth = dst_alloc(&ipv4_dst_ops);
1669 rth->u.dst.output= ip_rt_bug;
1671 atomic_set(&rth->u.dst.__refcnt, 1);
1672 rth->u.dst.flags= DST_HOST;
1673 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1674 rth->u.dst.flags |= DST_NOPOLICY;
1675 rth->fl.fl4_dst = daddr;
1676 rth->rt_dst = daddr;
1677 rth->fl.fl4_tos = tos;
1678 rth->fl.mark = skb->mark;
1679 rth->fl.fl4_src = saddr;
1680 rth->rt_src = saddr;
1681 #ifdef CONFIG_NET_CLS_ROUTE
1682 rth->u.dst.tclassid = itag;
1685 rth->fl.iif = dev->ifindex;
1686 rth->u.dst.dev = init_net.loopback_dev;
1687 dev_hold(rth->u.dst.dev);
1688 rth->idev = in_dev_get(rth->u.dst.dev);
1690 rth->rt_gateway = daddr;
1691 rth->rt_spec_dst= spec_dst;
1692 rth->rt_genid = atomic_read(&rt_genid);
1693 rth->rt_flags = RTCF_MULTICAST;
1694 rth->rt_type = RTN_MULTICAST;
1696 rth->u.dst.input= ip_local_deliver;
1697 rth->rt_flags |= RTCF_LOCAL;
1700 #ifdef CONFIG_IP_MROUTE
1701 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1702 rth->u.dst.input = ip_mr_input;
1704 RT_CACHE_STAT_INC(in_slow_mc);
1707 hash = rt_hash(daddr, saddr, dev->ifindex);
1708 return rt_intern_hash(hash, rth, &skb->rtable);
1720 static void ip_handle_martian_source(struct net_device *dev,
1721 struct in_device *in_dev,
1722 struct sk_buff *skb,
1726 RT_CACHE_STAT_INC(in_martian_src);
1727 #ifdef CONFIG_IP_ROUTE_VERBOSE
1728 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1730 * RFC1812 recommendation: if the source is martian,
1731 * the only hint is the MAC header.
1733 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1734 "%u.%u.%u.%u, on dev %s\n",
1735 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1736 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1738 const unsigned char *p = skb_mac_header(skb);
1739 printk(KERN_WARNING "ll header: ");
1740 for (i = 0; i < dev->hard_header_len; i++, p++) {
1742 if (i < (dev->hard_header_len - 1))
1751 static int __mkroute_input(struct sk_buff *skb,
1752 struct fib_result *res,
1753 struct in_device *in_dev,
1754 __be32 daddr, __be32 saddr, u32 tos,
1755 struct rtable **result)
1760 struct in_device *out_dev;
1765 /* get a working reference to the output device */
1766 out_dev = in_dev_get(FIB_RES_DEV(*res));
1767 if (out_dev == NULL) {
1768 if (net_ratelimit())
1769 printk(KERN_CRIT "Bug in ip_route_input" \
1770 "_slow(). Please, report\n");
1775 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1776 in_dev->dev, &spec_dst, &itag);
1778 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1786 flags |= RTCF_DIRECTSRC;
1788 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1789 (IN_DEV_SHARED_MEDIA(out_dev) ||
1790 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1791 flags |= RTCF_DOREDIRECT;
1793 if (skb->protocol != htons(ETH_P_IP)) {
1794 /* Not IP (i.e. ARP). Do not create a route if it is
1795 * invalid for proxy ARP. DNAT routes are always valid.
1797 if (out_dev == in_dev) {
1804 rth = dst_alloc(&ipv4_dst_ops);
1810 atomic_set(&rth->u.dst.__refcnt, 1);
1811 rth->u.dst.flags= DST_HOST;
1812 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1813 rth->u.dst.flags |= DST_NOPOLICY;
1814 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1815 rth->u.dst.flags |= DST_NOXFRM;
1816 rth->fl.fl4_dst = daddr;
1817 rth->rt_dst = daddr;
1818 rth->fl.fl4_tos = tos;
1819 rth->fl.mark = skb->mark;
1820 rth->fl.fl4_src = saddr;
1821 rth->rt_src = saddr;
1822 rth->rt_gateway = daddr;
1824 rth->fl.iif = in_dev->dev->ifindex;
1825 rth->u.dst.dev = (out_dev)->dev;
1826 dev_hold(rth->u.dst.dev);
1827 rth->idev = in_dev_get(rth->u.dst.dev);
1829 rth->rt_spec_dst= spec_dst;
1831 rth->u.dst.input = ip_forward;
1832 rth->u.dst.output = ip_output;
1833 rth->rt_genid = atomic_read(&rt_genid);
1835 rt_set_nexthop(rth, res, itag);
1837 rth->rt_flags = flags;
1842 /* release the working reference to the output device */
1843 in_dev_put(out_dev);
1847 static int ip_mkroute_input(struct sk_buff *skb,
1848 struct fib_result *res,
1849 const struct flowi *fl,
1850 struct in_device *in_dev,
1851 __be32 daddr, __be32 saddr, u32 tos)
1853 struct rtable* rth = NULL;
1857 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1858 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1859 fib_select_multipath(fl, res);
1862 /* create a routing cache entry */
1863 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1867 /* put it into the cache */
1868 hash = rt_hash(daddr, saddr, fl->iif);
1869 return rt_intern_hash(hash, rth, &skb->rtable);
1873 * NOTE. We drop all packets that have local source
1874 * addresses, because every properly looped-back packet
1875 * must already have the correct destination attached by the output routine.
1877 * This approach solves two big problems:
1878 * 1. Non-simplex devices are handled properly.
1879 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1882 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1883 u8 tos, struct net_device *dev)
1885 struct fib_result res;
1886 struct in_device *in_dev = in_dev_get(dev);
1887 struct flowi fl = { .nl_u = { .ip4_u =
1891 .scope = RT_SCOPE_UNIVERSE,
1894 .iif = dev->ifindex };
1897 struct rtable * rth;
1902 struct net * net = dev_net(dev);
1904 /* IP on this device is disabled. */
1909 /* Check for the weirdest martians, which cannot be detected
1913 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1914 ipv4_is_loopback(saddr))
1915 goto martian_source;
1917 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1920 /* Accept zero addresses only to limited broadcast;
1921 * I do not even know whether to fix this or not. Waiting for complaints :-)
1923 if (ipv4_is_zeronet(saddr))
1924 goto martian_source;
1926 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1927 ipv4_is_loopback(daddr))
1928 goto martian_destination;
1931 * Now we are ready to route packet.
1933 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1934 if (!IN_DEV_FORWARD(in_dev))
1940 RT_CACHE_STAT_INC(in_slow_tot);
1942 if (res.type == RTN_BROADCAST)
1945 if (res.type == RTN_LOCAL) {
1947 result = fib_validate_source(saddr, daddr, tos,
1948 net->loopback_dev->ifindex,
1949 dev, &spec_dst, &itag);
1951 goto martian_source;
1953 flags |= RTCF_DIRECTSRC;
1958 if (!IN_DEV_FORWARD(in_dev))
1960 if (res.type != RTN_UNICAST)
1961 goto martian_destination;
1963 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971 if (skb->protocol != htons(ETH_P_IP))
1974 if (ipv4_is_zeronet(saddr))
1975 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1977 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1980 goto martian_source;
1982 flags |= RTCF_DIRECTSRC;
1984 flags |= RTCF_BROADCAST;
1985 res.type = RTN_BROADCAST;
1986 RT_CACHE_STAT_INC(in_brd);
1989 rth = dst_alloc(&ipv4_dst_ops);
1993 rth->u.dst.output= ip_rt_bug;
1994 rth->rt_genid = atomic_read(&rt_genid);
1996 atomic_set(&rth->u.dst.__refcnt, 1);
1997 rth->u.dst.flags= DST_HOST;
1998 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1999 rth->u.dst.flags |= DST_NOPOLICY;
2000 rth->fl.fl4_dst = daddr;
2001 rth->rt_dst = daddr;
2002 rth->fl.fl4_tos = tos;
2003 rth->fl.mark = skb->mark;
2004 rth->fl.fl4_src = saddr;
2005 rth->rt_src = saddr;
2006 #ifdef CONFIG_NET_CLS_ROUTE
2007 rth->u.dst.tclassid = itag;
2010 rth->fl.iif = dev->ifindex;
2011 rth->u.dst.dev = net->loopback_dev;
2012 dev_hold(rth->u.dst.dev);
2013 rth->idev = in_dev_get(rth->u.dst.dev);
2014 rth->rt_gateway = daddr;
2015 rth->rt_spec_dst= spec_dst;
2016 rth->u.dst.input= ip_local_deliver;
2017 rth->rt_flags = flags|RTCF_LOCAL;
2018 if (res.type == RTN_UNREACHABLE) {
2019 rth->u.dst.input= ip_error;
2020 rth->u.dst.error= -err;
2021 rth->rt_flags &= ~RTCF_LOCAL;
2023 rth->rt_type = res.type;
2024 hash = rt_hash(daddr, saddr, fl.iif);
2025 err = rt_intern_hash(hash, rth, &skb->rtable);
2029 RT_CACHE_STAT_INC(in_no_route);
2030 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2031 res.type = RTN_UNREACHABLE;
2037 * Do not cache martian addresses: they should be logged (RFC1812)
2039 martian_destination:
2040 RT_CACHE_STAT_INC(in_martian_dst);
2041 #ifdef CONFIG_IP_ROUTE_VERBOSE
2042 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2043 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2044 "%u.%u.%u.%u, dev %s\n",
2045 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2049 err = -EHOSTUNREACH;
2061 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2065 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2066 u8 tos, struct net_device *dev)
2068 struct rtable * rth;
2070 int iif = dev->ifindex;
2074 tos &= IPTOS_RT_MASK;
2075 hash = rt_hash(daddr, saddr, iif);
2078 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2079 rth = rcu_dereference(rth->u.dst.rt_next)) {
2080 if (((rth->fl.fl4_dst ^ daddr) |
2081 (rth->fl.fl4_src ^ saddr) |
2082 (rth->fl.iif ^ iif) |
2084 (rth->fl.fl4_tos ^ tos)) == 0 &&
2085 rth->fl.mark == skb->mark &&
2086 net_eq(dev_net(rth->u.dst.dev), net) &&
2087 rth->rt_genid == atomic_read(&rt_genid)) {
2088 dst_use(&rth->u.dst, jiffies);
2089 RT_CACHE_STAT_INC(in_hit);
2094 RT_CACHE_STAT_INC(in_hlist_search);
2098 /* Multicast recognition logic is moved from the route cache to here.
2099 The problem was that too many Ethernet cards have broken/missing
2100 hardware multicast filters :-( As a result, a host on a multicast
2101 network acquires a lot of useless route cache entries, e.g. for
2102 SDR messages from all over the world. Now we try to get rid of them.
2103 Really, provided the software IP multicast filter is organized
2104 reasonably (at least, hashed), it does not result in a slowdown
2105 compared with route cache reject entries.
2106 Note that multicast routers are not affected, because
2107 a route cache entry is created eventually.
2109 if (ipv4_is_multicast(daddr)) {
2110 struct in_device *in_dev;
2113 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2114 int our = ip_check_mc(in_dev, daddr, saddr,
2115 ip_hdr(skb)->protocol);
2117 #ifdef CONFIG_IP_MROUTE
2118 || (!ipv4_is_local_multicast(daddr) &&
2119 IN_DEV_MFORWARD(in_dev))
2123 return ip_route_input_mc(skb, daddr, saddr,
2130 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2133 static int __mkroute_output(struct rtable **result,
2134 struct fib_result *res,
2135 const struct flowi *fl,
2136 const struct flowi *oldflp,
2137 struct net_device *dev_out,
2141 struct in_device *in_dev;
2142 u32 tos = RT_FL_TOS(oldflp);
2145 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2148 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2149 res->type = RTN_BROADCAST;
2150 else if (ipv4_is_multicast(fl->fl4_dst))
2151 res->type = RTN_MULTICAST;
2152 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2155 if (dev_out->flags & IFF_LOOPBACK)
2156 flags |= RTCF_LOCAL;
2158 /* get a working reference to the inet device */
2159 in_dev = in_dev_get(dev_out);
2163 if (res->type == RTN_BROADCAST) {
2164 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2166 fib_info_put(res->fi);
2169 } else if (res->type == RTN_MULTICAST) {
2170 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2171 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2173 flags &= ~RTCF_LOCAL;
2174 /* If a multicast route does not exist, use the
2175 default one, but do not gateway in this case.
2178 if (res->fi && res->prefixlen < 4) {
2179 fib_info_put(res->fi);
2185 rth = dst_alloc(&ipv4_dst_ops);
2191 atomic_set(&rth->u.dst.__refcnt, 1);
2192 rth->u.dst.flags= DST_HOST;
2193 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2194 rth->u.dst.flags |= DST_NOXFRM;
2195 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2196 rth->u.dst.flags |= DST_NOPOLICY;
2198 rth->fl.fl4_dst = oldflp->fl4_dst;
2199 rth->fl.fl4_tos = tos;
2200 rth->fl.fl4_src = oldflp->fl4_src;
2201 rth->fl.oif = oldflp->oif;
2202 rth->fl.mark = oldflp->mark;
2203 rth->rt_dst = fl->fl4_dst;
2204 rth->rt_src = fl->fl4_src;
2205 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2206 /* get references to the devices that are to be held by the routing
2208 rth->u.dst.dev = dev_out;
2210 rth->idev = in_dev_get(dev_out);
2211 rth->rt_gateway = fl->fl4_dst;
2212 rth->rt_spec_dst= fl->fl4_src;
2214 rth->u.dst.output=ip_output;
2215 rth->rt_genid = atomic_read(&rt_genid);
2217 RT_CACHE_STAT_INC(out_slow_tot);
2219 if (flags & RTCF_LOCAL) {
2220 rth->u.dst.input = ip_local_deliver;
2221 rth->rt_spec_dst = fl->fl4_dst;
2223 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2224 rth->rt_spec_dst = fl->fl4_src;
2225 if (flags & RTCF_LOCAL &&
2226 !(dev_out->flags & IFF_LOOPBACK)) {
2227 rth->u.dst.output = ip_mc_output;
2228 RT_CACHE_STAT_INC(out_slow_mc);
2230 #ifdef CONFIG_IP_MROUTE
2231 if (res->type == RTN_MULTICAST) {
2232 if (IN_DEV_MFORWARD(in_dev) &&
2233 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2234 rth->u.dst.input = ip_mr_input;
2235 rth->u.dst.output = ip_mc_output;
2241 rt_set_nexthop(rth, res, 0);
2243 rth->rt_flags = flags;
2247 /* release the working reference to the inet device */
2253 static int ip_mkroute_output(struct rtable **rp,
2254 struct fib_result *res,
2255 const struct flowi *fl,
2256 const struct flowi *oldflp,
2257 struct net_device *dev_out,
2260 struct rtable *rth = NULL;
2261 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2264 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2265 err = rt_intern_hash(hash, rth, rp);
2272 * Major route resolver routine.
2275 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2276 const struct flowi *oldflp)
2278 u32 tos = RT_FL_TOS(oldflp);
2279 struct flowi fl = { .nl_u = { .ip4_u =
2280 { .daddr = oldflp->fl4_dst,
2281 .saddr = oldflp->fl4_src,
2282 .tos = tos & IPTOS_RT_MASK,
2283 .scope = ((tos & RTO_ONLINK) ?
2287 .mark = oldflp->mark,
2288 .iif = net->loopback_dev->ifindex,
2289 .oif = oldflp->oif };
2290 struct fib_result res;
2292 struct net_device *dev_out = NULL;
2298 #ifdef CONFIG_IP_MULTIPLE_TABLES
2302 if (oldflp->fl4_src) {
2304 if (ipv4_is_multicast(oldflp->fl4_src) ||
2305 ipv4_is_lbcast(oldflp->fl4_src) ||
2306 ipv4_is_zeronet(oldflp->fl4_src))
2309 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2310 dev_out = ip_dev_find(net, oldflp->fl4_src);
2311 if (dev_out == NULL)
2314 /* I removed the check for oif == dev_out->oif here.
2315 It was wrong for two reasons:
2316 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2317 is assigned to multiple interfaces.
2318 2. Moreover, we are allowed to send packets with saddr
2319 of another iface. --ANK
2322 if (oldflp->oif == 0
2323 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2324 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2325 /* Special hack: the user can direct multicasts
2326 and limited broadcast via the necessary interface
2327 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2328 This hack is not just for fun, it allows
2329 vic, vat and friends to work.
2330 They bind a socket to loopback, set the ttl to zero
2331 and expect that it will work.
2332 From the viewpoint of the routing cache they are broken,
2333 because we are not allowed to build a multicast path
2334 with a loopback source addr (look, the routing cache
2335 cannot know that the ttl is zero, so the packet
2336 will not leave this host and the route is valid).
2337 Luckily, this hack is a good workaround.
2340 fl.oif = dev_out->ifindex;
2350 dev_out = dev_get_by_index(net, oldflp->oif);
2352 if (dev_out == NULL)
2355 /* RACE: Check return value of inet_select_addr instead. */
2356 if (__in_dev_get_rtnl(dev_out) == NULL) {
2358 goto out; /* Wrong error code */
2361 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2362 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2364 fl.fl4_src = inet_select_addr(dev_out, 0,
2369 if (ipv4_is_multicast(oldflp->fl4_dst))
2370 fl.fl4_src = inet_select_addr(dev_out, 0,
2372 else if (!oldflp->fl4_dst)
2373 fl.fl4_src = inet_select_addr(dev_out, 0,
2379 fl.fl4_dst = fl.fl4_src;
2381 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2384 dev_out = net->loopback_dev;
2386 fl.oif = net->loopback_dev->ifindex;
2387 res.type = RTN_LOCAL;
2388 flags |= RTCF_LOCAL;
2392 if (fib_lookup(net, &fl, &res)) {
2395 /* Apparently, the routing tables are wrong. Assume
2396 that the destination is on link.
2399 Because we are allowed to send to an iface
2400 even if it has NO routes and NO assigned
2401 addresses. When oif is specified, the routing
2402 tables are looked up with only one purpose:
2403 to catch whether the destination is gatewayed, rather than
2404 direct. Moreover, if MSG_DONTROUTE is set,
2405 we send the packet, ignoring both the routing tables
2406 and the ifaddr state. --ANK
2409 We could do this even if oif is unknown
2410 (as IPv6 likely does), but we do not.
2413 if (fl.fl4_src == 0)
2414 fl.fl4_src = inet_select_addr(dev_out, 0,
2416 res.type = RTN_UNICAST;
2426 if (res.type == RTN_LOCAL) {
2428 fl.fl4_src = fl.fl4_dst;
2431 dev_out = net->loopback_dev;
2433 fl.oif = dev_out->ifindex;
2435 fib_info_put(res.fi);
2437 flags |= RTCF_LOCAL;
2441 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2442 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2443 fib_select_multipath(&fl, &res);
2446 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2447 fib_select_default(net, &fl, &res);
2450 fl.fl4_src = FIB_RES_PREFSRC(res);
2454 dev_out = FIB_RES_DEV(res);
2456 fl.oif = dev_out->ifindex;
2460 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2470 int __ip_route_output_key(struct net *net, struct rtable **rp,
2471 const struct flowi *flp)
2476 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2479 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2480 rth = rcu_dereference(rth->u.dst.rt_next)) {
2481 if (rth->fl.fl4_dst == flp->fl4_dst &&
2482 rth->fl.fl4_src == flp->fl4_src &&
2484 rth->fl.oif == flp->oif &&
2485 rth->fl.mark == flp->mark &&
2486 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2487 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2488 net_eq(dev_net(rth->u.dst.dev), net) &&
2489 rth->rt_genid == atomic_read(&rt_genid)) {
2490 dst_use(&rth->u.dst, jiffies);
2491 RT_CACHE_STAT_INC(out_hit);
2492 rcu_read_unlock_bh();
2496 RT_CACHE_STAT_INC(out_hlist_search);
2498 rcu_read_unlock_bh();
2500 return ip_route_output_slow(net, rp, flp);
2503 EXPORT_SYMBOL_GPL(__ip_route_output_key);
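/*
 * Illustrative usage sketch (not part of the original code), showing how a
 * caller typically resolves an output route through this cache; "dst_ip"
 * and "net" are placeholder names for the example:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl))
 *		return -EHOSTUNREACH;
 *	... use rt->rt_gateway, rt->u.dst.dev, ...
 *	ip_rt_put(rt);
 */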
2505 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2509 static struct dst_ops ipv4_dst_blackhole_ops = {
2511 .protocol = __constant_htons(ETH_P_IP),
2512 .destroy = ipv4_dst_destroy,
2513 .check = ipv4_dst_check,
2514 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2515 .entry_size = sizeof(struct rtable),
2516 .entries = ATOMIC_INIT(0),
static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = atomic_read(&rt_genid);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
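
/*
 * ip_route_output_flow() is the output-routing entry point that is
 * aware of IPsec: resolve the route via __ip_route_output_key(), then,
 * if a protocol is set in the flow, fill in any missing saddr/daddr
 * from the result and run it through __xfrm_lookup().  A -EREMOTE
 * answer is converted into a blackhole route by ipv4_dst_blackhole()
 * above.  ip_route_output_key() is the plain wrapper without a socket.
 */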
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
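
/*
 * rt_fill_info() dumps one cached route (skb->rtable) as an
 * RTM_NEWROUTE netlink message: an rtmsg header plus RTA_DST, RTA_SRC,
 * RTA_OIF, RTA_GATEWAY and RTA_PREFSRC attributes, route metrics and
 * cache info (IP id, TCP timestamps, expiry, error).  Multicast input
 * routes may be resolved further through ipmr_get_route() when
 * multicast forwarding is enabled.
 */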
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb->rtable;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
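
/*
 * RTM_GETROUTE handler: parse the request attributes, build a reply skb
 * with dummy MAC/IP headers, then resolve the route either through
 * ip_route_input() (when RTA_IIF is given) or ip_route_output_key(),
 * and answer the requester with rt_fill_info() + rtnl_unicast().
 */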
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb->rtable;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->rtable = rt;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
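
/*
 * ip_rt_dump() walks every rt_hash_table chain under rcu_read_lock_bh()
 * and emits one NLM_F_MULTI RTM_NEWROUTE message per cache entry that
 * belongs to the requesting namespace and the current rt_genid.  The
 * current bucket and index are kept in cb->args[] so a partial dump
 * can resume where it left off.
 */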
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++) {
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt->rt_genid != atomic_read(&rt_genid))
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
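
/*
 * Device multicast state changed (e.g. a group was joined or left):
 * cached routes may no longer be valid, so flush the route cache.
 */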
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
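
/*
 * sysctl interface (net.ipv4.route.*): writing a delay to the "flush"
 * entry triggers rt_cache_flush(); the remaining entries tune garbage
 * collection, redirect and PMTU behaviour of the route cache.
 */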
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen)
{
	int delay;

	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);

	return 0;
}
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
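
/*
 * Boot-time setup: ip_rt_acct is the per-CPU CONFIG_NET_CLS_ROUTE
 * accounting table, and "rhash_entries=N" on the kernel command line
 * overrides the automatic sizing of the route cache hash table
 * allocated in ip_rt_init() below.
 */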
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
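
/*
 * ip_rt_init(): seed rt_genid, allocate the dst slab cache and the
 * route cache hash table, derive gc_thresh/max_size from the table
 * size, start the periodic expiry work and the secret-rebuild timer
 * (both perturbed by net_random() so they do not fire in lockstep),
 * create the /proc files and register the RTM_GETROUTE handler.
 */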
int __init ip_rt_init(void)
{
	int rc = 0;

	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7))));

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	rt_secret_timer.function = rt_secret_rebuild;
	rt_secret_timer.data = 0;
	init_timer_deferrable(&rt_secret_timer);

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	schedule_delayed_work(&expires_work,
			net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);