2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
46 #define VERSION "0.325"
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
85 static DEFINE_RWLOCK(fib_lock);
87 typedef unsigned int t_key;
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_PARENT(node) \
93 ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(node, ptr) \
95 ((node)->parent = (((unsigned long)(ptr)) | \
96 ((node)->parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(node, type) \
98 ((node)->parent = (type))
99 #define NODE_TYPE(node) \
100 ((node)->parent & NODE_TYPE_MASK)
102 #define IS_TNODE(n) (!(n->parent & T_LEAF))
103 #define IS_LEAF(n) (n->parent & T_LEAF)
107 unsigned long parent;
112 unsigned long parent;
113 struct hlist_head list;
117 struct hlist_node hlist;
119 struct list_head falh;
124 unsigned long parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
144 unsigned int totdepth;
145 unsigned int maxdepth;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
158 unsigned int revision;
161 static int trie_debug = 0;
163 #define DBG(x...) do { if (trie_debug) printk(x); } while (0)
165 static int tnode_full(struct tnode *tn, struct node *n);
166 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
167 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
168 static int tnode_child_length(struct tnode *tn);
169 static struct node *resize(struct trie *t, struct tnode *tn);
170 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
171 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
172 static void tnode_free(struct tnode *tn);
173 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
174 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
175 extern int fib_detect_death(struct fib_info *fi, int order,
176 struct fib_info **last_resort, int *last_idx, int *dflt);
178 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
179 struct nlmsghdr *n, struct netlink_skb_parms *req);
181 static kmem_cache_t *fn_alias_kmem;
182 static struct trie *trie_local = NULL, *trie_main = NULL;
184 static inline struct node *tnode_get_child(struct tnode *tn, int i)
186 BUG_ON(i >= 1 << tn->bits);
191 static inline int tnode_child_length(struct tnode *tn)
193 return 1 << tn->bits;
196 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
198 if (offset < KEYLENGTH)
199 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
204 static inline int tkey_equals(t_key a, t_key b)
209 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
211 if (bits == 0 || offset >= KEYLENGTH)
213 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
214 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
217 static inline int tkey_mismatch(t_key a, int offset, t_key b)
224 while ((diff << i) >> (KEYLENGTH-1) == 0)
229 /* Candidate for fib_semantics */
231 static void fn_free_alias(struct fib_alias *fa)
233 fib_release_info(fa->fa_info);
234 kmem_cache_free(fn_alias_kmem, fa);
238 To understand this stuff, an understanding of keys and all their bits is
239 necessary. Every node in the trie has a key associated with it, but not
240 all of the bits in that key are significant.
242 Consider a node 'n' and its parent 'tp'.
244 If n is a leaf, every bit in its key is significant. Its presence is
245 * necessitated by path compression, since during a tree traversal (when
246 searching for a leaf - unless we are doing an insertion) we will completely
247 ignore all skipped bits we encounter. Thus we need to verify, at the end of
248 a potentially successful search, that we have indeed been walking the
251 Note that we can never "miss" the correct key in the tree if present by
252 following the wrong path. Path compression ensures that segments of the key
253 that are the same for all keys with a given prefix are skipped, but the
254 skipped part *is* identical for each node in the subtrie below the skipped
255 bit! trie_insert() in this implementation takes care of that - note the
256 call to tkey_sub_equals() in trie_insert().
258 if n is an internal node - a 'tnode' here, the various parts of its key
259 have many different meanings.
262 _________________________________________________________________
263 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
264 -----------------------------------------------------------------
265 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
267 _________________________________________________________________
268 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
269 -----------------------------------------------------------------
270 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
277 First, let's just ignore the bits that come before the parent tp, that is
278 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
279 not use them for anything.
281 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
282 index into the parent's child array. That is, they will be used to find
283 'n' among tp's children.
285 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
288 All the bits we have seen so far are significant to the node n. The rest
289 of the bits are really not needed or indeed known in n->key.
291 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
292 n's child array, and will of course be different for each child.
295 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
300 static void check_tnode(struct tnode *tn)
302 if (tn && tn->pos+tn->bits > 32) {
303 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
307 static int halve_threshold = 25;
308 static int inflate_threshold = 50;
310 static struct leaf *leaf_new(void)
312 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
314 NODE_INIT_PARENT(l, T_LEAF);
315 INIT_HLIST_HEAD(&l->list);
320 static struct leaf_info *leaf_info_new(int plen)
322 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
328 INIT_LIST_HEAD(&li->falh);
/* Free a leaf allocated by leaf_new(). */
static inline void free_leaf(struct leaf *l)
{
	kfree(l);
}
/* Free a leaf_info allocated by leaf_info_new(). */
static inline void free_leaf_info(struct leaf_info *li)
{
	kfree(li);
}
343 static struct tnode *tnode_alloc(unsigned int size)
345 if (size <= PAGE_SIZE) {
346 return kmalloc(size, GFP_KERNEL);
348 return (struct tnode *)
349 __get_free_pages(GFP_KERNEL, get_order(size));
353 static void __tnode_free(struct tnode *tn)
355 unsigned int size = sizeof(struct tnode) +
356 (1 << tn->bits) * sizeof(struct node *);
358 if (size <= PAGE_SIZE)
361 free_pages((unsigned long)tn, get_order(size));
364 static struct tnode* tnode_new(t_key key, int pos, int bits)
366 int nchildren = 1<<bits;
367 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
368 struct tnode *tn = tnode_alloc(sz);
372 NODE_INIT_PARENT(tn, T_TNODE);
376 tn->full_children = 0;
377 tn->empty_children = 1<<bits;
380 DBG("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
381 (unsigned int) (sizeof(struct node) * 1<<bits));
/* Free a trie node of either flavor: leaves go through free_leaf(),
 * internal nodes through __tnode_free() (which picks kfree vs. pages). */
static void tnode_free(struct tnode *tn)
{
	BUG_ON(!tn);

	if (IS_LEAF(tn))
		free_leaf((struct leaf *)tn);
	else
		__tnode_free(tn);
}
399 * Check whether a tnode 'n' is "full", i.e. it is an internal node
400 * and no bits are skipped. See discussion in dyntree paper p. 6
403 static inline int tnode_full(struct tnode *tn, struct node *n)
405 if (n == NULL || IS_LEAF(n))
408 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
/* Store child @n at slot @i of @tn; -1 asks the reorg helper to compute
 * the old child's fullness itself. */
static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
417 * Add a child at position i overwriting the old value.
418 * Update the value of full_children and empty_children.
421 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
426 if (i >= 1<<tn->bits) {
427 printk("bits=%d, i=%d\n", tn->bits, i);
430 write_lock_bh(&fib_lock);
433 /* update emptyChildren */
434 if (n == NULL && chi != NULL)
435 tn->empty_children++;
436 else if (n != NULL && chi == NULL)
437 tn->empty_children--;
439 /* update fullChildren */
441 wasfull = tnode_full(tn, chi);
443 isfull = tnode_full(tn, n);
444 if (wasfull && !isfull)
446 else if (!wasfull && isfull)
450 NODE_SET_PARENT(n, tn);
453 write_unlock_bh(&fib_lock);
456 static struct node *resize(struct trie *t, struct tnode *tn)
464 DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
465 tn, inflate_threshold, halve_threshold);
468 if (tn->empty_children == tnode_child_length(tn)) {
473 if (tn->empty_children == tnode_child_length(tn) - 1)
474 for (i = 0; i < tnode_child_length(tn); i++) {
477 write_lock_bh(&fib_lock);
480 write_unlock_bh(&fib_lock);
484 /* compress one level */
485 NODE_INIT_PARENT(n, NODE_TYPE(n));
487 write_unlock_bh(&fib_lock);
492 * Double as long as the resulting node has a number of
493 * nonempty nodes that are above the threshold.
497 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
498 * the Helsinki University of Technology and Matti Tikkanen of Nokia
499 * Telecommunications, page 6:
500 * "A node is doubled if the ratio of non-empty children to all
501 * children in the *doubled* node is at least 'high'."
503 * 'high' in this instance is the variable 'inflate_threshold'. It
504 * is expressed as a percentage, so we multiply it with
505 * tnode_child_length() and instead of multiplying by 2 (since the
506 * child array will be doubled by inflate()) and multiplying
507 * the left-hand side by 100 (to handle the percentage thing) we
508 * multiply the left-hand side by 50.
510 * The left-hand side may look a bit weird: tnode_child_length(tn)
511 * - tn->empty_children is of course the number of non-null children
512 * in the current node. tn->full_children is the number of "full"
513 * children, that is non-null tnodes with a skip value of 0.
514 * All of those will be doubled in the resulting inflated tnode, so
515 * we just count them one extra time here.
517 * A clearer way to write this would be:
519 * to_be_doubled = tn->full_children;
520 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
523 * new_child_length = tnode_child_length(tn) * 2;
525 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * if (new_fill_factor >= inflate_threshold)
529 * ...and so on, tho it would mess up the while () loop.
532 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
536 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
537 * inflate_threshold * new_child_length
539 * expand not_to_be_doubled and to_be_doubled, and shorten:
540 * 100 * (tnode_child_length(tn) - tn->empty_children +
541 * tn->full_children) >= inflate_threshold * new_child_length
543 * expand new_child_length:
544 * 100 * (tnode_child_length(tn) - tn->empty_children +
545 * tn->full_children) >=
546 * inflate_threshold * tnode_child_length(tn) * 2
549 * 50 * (tn->full_children + tnode_child_length(tn) -
550 * tn->empty_children) >= inflate_threshold *
551 * tnode_child_length(tn)
558 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) {
562 tn = inflate(t, tn, &err);
565 #ifdef CONFIG_IP_FIB_TRIE_STATS
566 t->stats.resize_node_skipped++;
575 * Halve as long as the number of empty children in this
576 * node is above threshold.
580 while (tn->bits > 1 &&
581 100 * (tnode_child_length(tn) - tn->empty_children) <
582 halve_threshold * tnode_child_length(tn)) {
584 tn = halve(t, tn, &err);
587 #ifdef CONFIG_IP_FIB_TRIE_STATS
588 t->stats.resize_node_skipped++;
595 /* Only one child remains */
597 if (tn->empty_children == tnode_child_length(tn) - 1)
598 for (i = 0; i < tnode_child_length(tn); i++) {
601 write_lock_bh(&fib_lock);
605 write_unlock_bh(&fib_lock);
609 /* compress one level */
611 NODE_INIT_PARENT(n, NODE_TYPE(n));
613 write_unlock_bh(&fib_lock);
618 return (struct node *) tn;
621 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
624 struct tnode *oldtnode = tn;
625 int olen = tnode_child_length(tn);
630 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
638 * Preallocate and store tnodes before the actual work so we
639 * don't get into an inconsistent state if memory allocation
640 * fails. In case of failure we return the oldnode and inflate
641 * of tnode is ignored.
644 for (i = 0; i < olen; i++) {
645 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
649 inode->pos == oldtnode->pos + oldtnode->bits &&
651 struct tnode *left, *right;
652 t_key m = TKEY_GET_MASK(inode->pos, 1);
654 left = tnode_new(inode->key&(~m), inode->pos + 1,
662 right = tnode_new(inode->key|m, inode->pos + 1,
670 put_child(t, tn, 2*i, (struct node *) left);
671 put_child(t, tn, 2*i+1, (struct node *) right);
676 int size = tnode_child_length(tn);
679 for (j = 0; j < size; j++)
681 tnode_free((struct tnode *)tn->child[j]);
689 for (i = 0; i < olen; i++) {
690 struct node *node = tnode_get_child(oldtnode, i);
691 struct tnode *left, *right;
698 /* A leaf or an internal node with skipped bits */
700 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
701 tn->pos + tn->bits - 1) {
702 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
704 put_child(t, tn, 2*i, node);
706 put_child(t, tn, 2*i+1, node);
710 /* An internal node with two children */
711 inode = (struct tnode *) node;
713 if (inode->bits == 1) {
714 put_child(t, tn, 2*i, inode->child[0]);
715 put_child(t, tn, 2*i+1, inode->child[1]);
721 /* An internal node with more than two children */
723 /* We will replace this node 'inode' with two new
724 * ones, 'left' and 'right', each with half of the
725 * original children. The two new nodes will have
726 * a position one bit further down the key and this
727 * means that the "significant" part of their keys
728 * (see the discussion near the top of this file)
729 * will differ by one bit, which will be "0" in
730 * left's key and "1" in right's key. Since we are
731 * moving the key position by one step, the bit that
732 * we are moving away from - the bit at position
733 * (inode->pos) - is the one that will differ between
734 * left and right. So... we synthesize that bit in the
736 * The mask 'm' below will be a single "one" bit at
737 * the position (inode->pos)
740 /* Use the old key, but set the new significant
744 left = (struct tnode *) tnode_get_child(tn, 2*i);
745 put_child(t, tn, 2*i, NULL);
749 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
750 put_child(t, tn, 2*i+1, NULL);
754 size = tnode_child_length(left);
755 for (j = 0; j < size; j++) {
756 put_child(t, left, j, inode->child[j]);
757 put_child(t, right, j, inode->child[j + size]);
759 put_child(t, tn, 2*i, resize(t, left));
760 put_child(t, tn, 2*i+1, resize(t, right));
764 tnode_free(oldtnode);
768 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
770 struct tnode *oldtnode = tn;
771 struct node *left, *right;
773 int olen = tnode_child_length(tn);
777 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
785 * Preallocate and store tnodes before the actual work so we
786 * don't get into an inconsistent state if memory allocation
787 * fails. In case of failure we return the oldnode and halve
788 * of tnode is ignored.
791 for (i = 0; i < olen; i += 2) {
792 left = tnode_get_child(oldtnode, i);
793 right = tnode_get_child(oldtnode, i+1);
795 /* Two nonempty children */
797 struct tnode *newBinNode =
798 tnode_new(left->key, tn->pos + tn->bits, 1);
804 put_child(t, tn, i/2, (struct node *)newBinNode);
809 int size = tnode_child_length(tn);
812 for (j = 0; j < size; j++)
814 tnode_free((struct tnode *)tn->child[j]);
822 for (i = 0; i < olen; i += 2) {
823 struct tnode *newBinNode;
825 left = tnode_get_child(oldtnode, i);
826 right = tnode_get_child(oldtnode, i+1);
828 /* At least one of the children is empty */
830 if (right == NULL) /* Both are empty */
832 put_child(t, tn, i/2, right);
837 put_child(t, tn, i/2, left);
841 /* Two nonempty children */
842 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
843 put_child(t, tn, i/2, NULL);
847 put_child(t, newBinNode, 0, left);
848 put_child(t, newBinNode, 1, right);
849 put_child(t, tn, i/2, resize(t, newBinNode));
851 tnode_free(oldtnode);
855 static void trie_init(struct trie *t)
863 #ifdef CONFIG_IP_FIB_TRIE_STATS
864 memset(&t->stats, 0, sizeof(struct trie_use_stats));
868 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
870 struct hlist_node *node;
871 struct leaf_info *li;
873 hlist_for_each_entry(li, node, head, hlist)
874 if (li->plen == plen)
880 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
882 struct leaf_info *li = find_leaf_info(&l->list, plen);
890 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
892 struct leaf_info *li = NULL, *last = NULL;
893 struct hlist_node *node;
895 write_lock_bh(&fib_lock);
897 if (hlist_empty(head)) {
898 hlist_add_head(&new->hlist, head);
900 hlist_for_each_entry(li, node, head, hlist) {
901 if (new->plen > li->plen)
907 hlist_add_after(&last->hlist, &new->hlist);
909 hlist_add_before(&new->hlist, &li->hlist);
911 write_unlock_bh(&fib_lock);
915 fib_find_node(struct trie *t, u32 key)
924 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
925 tn = (struct tnode *) n;
929 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
930 pos = tn->pos + tn->bits;
931 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
935 /* Case we have found a leaf. Compare prefixes */
937 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
938 return (struct leaf *)n;
943 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
948 struct tnode *tp = NULL;
955 while (tn != NULL && NODE_PARENT(tn) != NULL) {
957 printk("Rebalance tn=%p \n", tn);
959 printk("tn->parent=%p \n", NODE_PARENT(tn));
961 printk("Rebalance tp=%p \n", tp);
963 printk("tp->parent=%p \n", NODE_PARENT(tp));
966 BUG_ON(i > 12); /* Why is this a bug? -ojn */
969 tp = NODE_PARENT(tn);
970 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
971 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
972 tn = (struct tnode *) resize (t, (struct tnode *)tn);
973 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
975 if (!NODE_PARENT(tn))
978 tn = NODE_PARENT(tn);
980 /* Handle last (top) tnode */
982 tn = (struct tnode*) resize(t, (struct tnode *)tn);
984 return (struct node*) tn;
987 static struct list_head *
988 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
991 struct tnode *tp = NULL, *tn = NULL;
995 struct list_head *fa_head = NULL;
996 struct leaf_info *li;
1002 /* If we point to NULL, stop. Either the tree is empty and we should
1003 * just put a new leaf in if, or we have reached an empty child slot,
1004 * and we should just put our new leaf in that.
1005 * If we point to a T_TNODE, check if it matches our key. Note that
1006 * a T_TNODE might be skipping any number of bits - its 'pos' need
1007 * not be the parent's 'pos'+'bits'!
1009 * If it does match the current key, get pos/bits from it, extract
1010 * the index from our key, push the T_TNODE and walk the tree.
1012 * If it doesn't, we have to replace it with a new T_TNODE.
1014 * If we point to a T_LEAF, it might or might not have the same key
1015 * as we do. If it does, just change the value, update the T_LEAF's
1016 * value, and return it.
1017 * If it doesn't, we need to replace it with a T_TNODE.
1020 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1021 tn = (struct tnode *) n;
1025 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1027 pos = tn->pos + tn->bits;
1028 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1030 if (n && NODE_PARENT(n) != tn) {
1031 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1039 * n ----> NULL, LEAF or TNODE
1041 * tp is n's (parent) ----> NULL or TNODE
1044 BUG_ON(tp && IS_LEAF(tp));
1046 /* Case 1: n is a leaf. Compare prefixes */
1048 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1049 struct leaf *l = (struct leaf *) n;
1051 li = leaf_info_new(plen);
1058 fa_head = &li->falh;
1059 insert_leaf_info(&l->list, li);
1071 li = leaf_info_new(plen);
1074 tnode_free((struct tnode *) l);
1079 fa_head = &li->falh;
1080 insert_leaf_info(&l->list, li);
1082 if (t->trie && n == NULL) {
1083 /* Case 2: n is NULL, and will just insert a new leaf */
1085 NODE_SET_PARENT(l, tp);
1089 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1090 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1092 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1094 * Add a new tnode here
1095 * first tnode need some special handling
1099 pos = tp->pos+tp->bits;
1104 newpos = tkey_mismatch(key, pos, n->key);
1105 tn = tnode_new(n->key, newpos, 1);
1108 tn = tnode_new(key, newpos, 1); /* First tnode */
1113 tnode_free((struct tnode *) l);
1118 NODE_SET_PARENT(tn, tp);
1120 missbit = tkey_extract_bits(key, newpos, 1);
1121 put_child(t, tn, missbit, (struct node *)l);
1122 put_child(t, tn, 1-missbit, n);
1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1126 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1128 t->trie = (struct node*) tn; /* First tnode */
1133 if (tp && tp->pos + tp->bits > 32)
1134 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1135 tp, tp->pos, tp->bits, key, plen);
1137 /* Rebalance the trie */
1138 t->trie = trie_rebalance(t, tp);
1146 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1147 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1149 struct trie *t = (struct trie *) tb->tb_data;
1150 struct fib_alias *fa, *new_fa;
1151 struct list_head *fa_head = NULL;
1152 struct fib_info *fi;
1153 int plen = r->rtm_dst_len;
1154 int type = r->rtm_type;
1155 u8 tos = r->rtm_tos;
1165 memcpy(&key, rta->rta_dst, 4);
1169 DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1171 mask = ntohl(inet_make_mask(plen));
1178 fi = fib_create_info(r, rta, nlhdr, &err);
1183 l = fib_find_node(t, key);
1187 fa_head = get_fa_head(l, plen);
1188 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1191 /* Now fa, if non-NULL, points to the first fib alias
1192 * with the same keys [prefix,tos,priority], if such key already
1193 * exists or to the node before which we will insert new one.
1195 * If fa is NULL, we will need to allocate a new one and
1196 * insert to the head of f.
1198 * If f is NULL, no fib node matched the destination key
1199 * and we need to allocate a new one of those as well.
1202 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1203 struct fib_alias *fa_orig;
1206 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1209 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1210 struct fib_info *fi_drop;
1213 write_lock_bh(&fib_lock);
1215 fi_drop = fa->fa_info;
1218 fa->fa_scope = r->rtm_scope;
1219 state = fa->fa_state;
1220 fa->fa_state &= ~FA_S_ACCESSED;
1222 write_unlock_bh(&fib_lock);
1224 fib_release_info(fi_drop);
1225 if (state & FA_S_ACCESSED)
1230 /* Error if we find a perfect match which
1231 * uses the same scope, type, and nexthop
1235 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1236 if (fa->fa_tos != tos)
1238 if (fa->fa_info->fib_priority != fi->fib_priority)
1240 if (fa->fa_type == type &&
1241 fa->fa_scope == r->rtm_scope &&
1242 fa->fa_info == fi) {
1246 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1250 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1254 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1258 new_fa->fa_info = fi;
1259 new_fa->fa_tos = tos;
1260 new_fa->fa_type = type;
1261 new_fa->fa_scope = r->rtm_scope;
1262 new_fa->fa_state = 0;
1264 * Insert new entry to the list.
1268 fa_head = fib_insert_node(t, &err, key, plen);
1271 goto out_free_new_fa;
1274 write_lock_bh(&fib_lock);
1276 list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
1278 write_unlock_bh(&fib_lock);
1281 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1286 kmem_cache_free(fn_alias_kmem, new_fa);
1288 fib_release_info(fi);
1293 static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1294 struct fib_result *res)
1298 struct leaf_info *li;
1299 struct hlist_head *hhead = &l->list;
1300 struct hlist_node *node;
1302 hlist_for_each_entry(li, node, hhead, hlist) {
1304 mask = ntohl(inet_make_mask(i));
1305 if (l->key != (key & mask))
1308 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1310 #ifdef CONFIG_IP_FIB_TRIE_STATS
1311 t->stats.semantic_match_passed++;
1315 #ifdef CONFIG_IP_FIB_TRIE_STATS
1316 t->stats.semantic_match_miss++;
1323 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1325 struct trie *t = (struct trie *) tb->tb_data;
1330 t_key key = ntohl(flp->fl4_dst);
1333 int current_prefix_length = KEYLENGTH;
1335 t_key node_prefix, key_prefix, pref_mismatch;
1340 read_lock(&fib_lock);
1345 #ifdef CONFIG_IP_FIB_TRIE_STATS
1351 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1355 pn = (struct tnode *) n;
1363 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1365 n = tnode_get_child(pn, cindex);
1368 #ifdef CONFIG_IP_FIB_TRIE_STATS
1369 t->stats.null_node_hit++;
1375 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1383 cn = (struct tnode *)n;
1386 * It's a tnode, and we can do some extra checks here if we
1387 * like, to avoid descending into a dead-end branch.
1388 * This tnode is in the parent's child array at index
1389 * key[p_pos..p_pos+p_bits] but potentially with some bits
1390 * chopped off, so in reality the index may be just a
1391 * subprefix, padded with zero at the end.
1392 * We can also take a look at any skipped bits in this
1393 * tnode - everything up to p_pos is supposed to be ok,
1394 * and the non-chopped bits of the index (se previous
1395 * paragraph) are also guaranteed ok, but the rest is
1396 * considered unknown.
1398 * The skipped bits are key[pos+bits..cn->pos].
1401 /* If current_prefix_length < pos+bits, we are already doing
1402 * actual prefix matching, which means everything from
1403 * pos+(bits-chopped_off) onward must be zero along some
1404 * branch of this subtree - otherwise there is *no* valid
1405 * prefix present. Here we can only check the skipped
1406 * bits. Remember, since we have already indexed into the
1407 * parent's child array, we know that the bits we chopped of
1411 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1413 if (current_prefix_length < pos+bits) {
1414 if (tkey_extract_bits(cn->key, current_prefix_length,
1415 cn->pos - current_prefix_length) != 0 ||
1421 * If chopped_off=0, the index is fully validated and we
1422 * only need to look at the skipped bits for this, the new,
1423 * tnode. What we actually want to do is to find out if
1424 * these skipped bits match our key perfectly, or if we will
1425 * have to count on finding a matching prefix further down,
1426 * because if we do, we would like to have some way of
1427 * verifying the existence of such a prefix at this point.
1430 /* The only thing we can do at this point is to verify that
1431 * any such matching prefix can indeed be a prefix to our
1432 * key, and if the bits in the node we are inspecting that
1433 * do not match our key are not ZERO, this cannot be true.
1434 * Thus, find out where there is a mismatch (before cn->pos)
1435 * and verify that all the mismatching bits are zero in the
1439 /* Note: We aren't very concerned about the piece of the key
1440 * that precede pn->pos+pn->bits, since these have already been
1441 * checked. The bits after cn->pos aren't checked since these are
1442 * by definition "unknown" at this point. Thus, what we want to
1443 * see is if we are about to enter the "prefix matching" state,
1444 * and in that case verify that the skipped bits that will prevail
1445 * throughout this subtree are zero, as they have to be if we are
1446 * to find a matching prefix.
1449 node_prefix = MASK_PFX(cn->key, cn->pos);
1450 key_prefix = MASK_PFX(key, cn->pos);
1451 pref_mismatch = key_prefix^node_prefix;
1454 /* In short: If skipped bits in this node do not match the search
1455 * key, enter the "prefix matching" state.directly.
1457 if (pref_mismatch) {
1458 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1460 pref_mismatch = pref_mismatch <<1;
1462 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1464 if (key_prefix != 0)
1467 if (current_prefix_length >= cn->pos)
1468 current_prefix_length = mp;
1471 pn = (struct tnode *)n; /* Descend */
1478 /* As zero don't change the child key (cindex) */
1479 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1482 /* Decrease current_... with bits chopped off */
1483 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1484 current_prefix_length = pn->pos + pn->bits - chopped_off;
1487 * Either we do the actual chop off according or if we have
1488 * chopped off all bits in this tnode walk up to our parent.
1491 if (chopped_off <= pn->bits) {
1492 cindex &= ~(1 << (chopped_off-1));
1494 if (NODE_PARENT(pn) == NULL)
1497 /* Get Child's index */
1498 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1499 pn = NODE_PARENT(pn);
1502 #ifdef CONFIG_IP_FIB_TRIE_STATS
1503 t->stats.backtrack++;
1511 read_unlock(&fib_lock);
/* trie_leaf_remove() - find the leaf whose key equals @key, unlink it from
 * its parent tnode and rebalance the trie from that parent upwards.
 * NOTE(review): this excerpt is non-contiguous; lines are missing between
 * the ones shown (e.g. the declarations of l/cindex and the return paths).
 */
1515 static int trie_leaf_remove(struct trie *t, t_key key)
1518 struct tnode *tp = NULL;
1519 struct node *n = t->trie;
1522 DBG("entering trie_leaf_remove(%p)\n", n);
1524 /* Note that in the case of skipped bits, those bits are *not* checked!
1525 * When we finish this, we will have NULL or a T_LEAF, and the
1526 * T_LEAF may or may not match our key.
/* Walk down the trie, choosing at each tnode the child indexed by the
 * key bits in [tn->pos, tn->pos + tn->bits). */
1529 while (n != NULL && IS_TNODE(n)) {
1530 struct tnode *tn = (struct tnode *) n;
1532 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
/* Consistency check: each child must point back to its parent tnode. */
1534 if (n && NODE_PARENT(n) != tn) {
1535 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1539 l = (struct leaf *) n;
/* Skipped bits were not verified during descent, so compare the full key. */
1541 if (!n || !tkey_equals(l->key, key))
1546 * Remove the leaf and rebalance the tree
1552 tp = NODE_PARENT(n);
1553 tnode_free((struct tnode *) n);
/* Clear the parent's child slot for this key and rebalance upward from tp. */
1556 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1557 put_child(t, (struct tnode *)tp, cindex, NULL);
1558 t->trie = trie_rebalance(t, tp);
/* fn_trie_delete() - fib_table->tb_delete hook: remove one route
 * (key/plen/tos plus optional type/scope/protocol/nexthop filters from @r)
 * from the trie-backed table.  NOTE(review): excerpt is non-contiguous;
 * declarations of key/mask/l and several control-flow lines are missing.
 */
1566 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1567 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1569 struct trie *t = (struct trie *) tb->tb_data;
1571 int plen = r->rtm_dst_len;
1572 u8 tos = r->rtm_tos;
1573 struct fib_alias *fa, *fa_to_delete;
1574 struct list_head *fa_head;
1577 struct leaf_info *li;
/* Destination is a 4-byte IPv4 address taken from the rtnetlink attrs. */
1585 memcpy(&key, rta->rta_dst, 4);
1588 mask = ntohl(inet_make_mask(plen));
1594 l = fib_find_node(t, key);
1599 fa_head = get_fa_head(l, plen);
1600 fa = fib_find_alias(fa_head, tos, 0);
1605 DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
/* Scan the aliases starting at the found one; fib_find_alias() returned
 * the first entry with matching tos, so iterate from its predecessor. */
1607 fa_to_delete = NULL;
1608 fa_head = fa->fa_list.prev;
1609 list_for_each_entry(fa, fa_head, fa_list) {
1610 struct fib_info *fi = fa->fa_info;
1612 if (fa->fa_tos != tos)
/* Optional filters: zero/RT_SCOPE_NOWHERE in the request acts as a
 * wildcard for type/scope/protocol; nexthops must match too. */
1615 if ((!r->rtm_type ||
1616 fa->fa_type == r->rtm_type) &&
1617 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1618 fa->fa_scope == r->rtm_scope) &&
1619 (!r->rtm_protocol ||
1620 fi->fib_protocol == r->rtm_protocol) &&
1621 fib_nh_match(r, nlhdr, rta, fi) == 0) {
/* Notify netlink listeners of the deletion before tearing down state. */
1631 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1633 l = fib_find_node(t, key);
1634 li = find_leaf_info(&l->list, plen);
/* List surgery under the FIB writer lock. */
1636 write_lock_bh(&fib_lock);
1638 list_del(&fa->fa_list);
1640 if (list_empty(fa_head)) {
1641 hlist_del(&li->hlist);
1644 write_unlock_bh(&fib_lock);
/* If the leaf carries no more prefixes, remove it from the trie. */
1649 if (hlist_empty(&l->list))
1650 trie_leaf_remove(t, key);
/* Flush the route cache if this alias had ever been used for lookup. */
1652 if (fa->fa_state & FA_S_ACCESSED)
/* trie_flush_list() - drop every alias on @head whose fib_info is marked
 * RTNH_F_DEAD; presumably returns the number of entries removed via a
 * counter declared on a missing line — TODO confirm against full source.
 */
1659 static int trie_flush_list(struct trie *t, struct list_head *head)
1661 struct fib_alias *fa, *fa_node;
/* _safe variant: entries are deleted while iterating. */
1664 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1665 struct fib_info *fi = fa->fa_info;
1667 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1668 write_lock_bh(&fib_lock);
1669 list_del(&fa->fa_list);
1670 write_unlock_bh(&fib_lock);
/* trie_flush_leaf() - flush every per-prefix alias list hanging off leaf @l,
 * removing leaf_info entries that become empty.  Accumulates the count of
 * flushed aliases in `found` (declaration on a missing line).
 */
1679 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1682 struct hlist_head *lih = &l->list;
1683 struct hlist_node *node, *tmp;
1684 struct leaf_info *li = NULL;
/* _safe variant: leaf_info nodes may be unlinked during the walk. */
1686 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1687 found += trie_flush_list(t, &li->falh);
1689 if (list_empty(&li->falh)) {
1690 write_lock_bh(&fib_lock);
1691 hlist_del(&li->hlist);
1692 write_unlock_bh(&fib_lock);
/* nextleaf() - in-order trie iterator: return the leaf following @thisleaf,
 * or the first leaf when @thisleaf is NULL, or NULL when the walk is done.
 */
1700 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1702 struct node *c = (struct node *) thisleaf;
1707 if (t->trie == NULL)
1710 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1711 return (struct leaf *) t->trie;
1713 p = (struct tnode*) t->trie; /* Start */
1715 p = (struct tnode *) NODE_PARENT(c);
1720 /* Find the next child of the parent */
/* Resume one slot past the child we came from. */
1722 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1726 last = 1 << p->bits;
1727 for (idx = pos; idx < last ; idx++) {
1731 /* Descend if tnode */
1732 while (IS_TNODE(p->child[idx])) {
1733 p = (struct tnode*) p->child[idx];
1736 /* Rightmost non-NULL branch */
1737 if (p && IS_TNODE(p))
/* NOTE(review): the bound check `idx < (1 << p->bits)` should come
 * BEFORE `p->child[idx] == NULL` — as written, child[idx] is read
 * first and may index one slot past the child array. */
1738 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1740 /* Done with this tnode? */
1741 if (idx >= (1 << p->bits) || p->child[idx] == NULL)
1744 return (struct leaf*) p->child[idx];
1747 /* No more children go up one step */
1748 c = (struct node *) p;
1749 p = (struct tnode *) NODE_PARENT(p);
1751 return NULL; /* Ready. Root of trie */
/* fn_trie_flush() - fib_table->tb_flush hook: walk every leaf, flush dead
 * aliases, and remove leaves that become empty.  `ll` trails `l` by one
 * iteration so the leaf being removed is never the iterator's current node.
 */
1754 static int fn_trie_flush(struct fib_table *tb)
1756 struct trie *t = (struct trie *) tb->tb_data;
1757 struct leaf *ll = NULL, *l = NULL;
1762 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1763 found += trie_flush_leaf(t, l);
/* Remove the previously visited leaf if flushing emptied it. */
1765 if (ll && hlist_empty(&ll->list))
1766 trie_leaf_remove(t, ll->key);
/* Handle the last leaf after the loop terminates. */
1770 if (ll && hlist_empty(&ll->list))
1771 trie_leaf_remove(t, ll->key);
1773 DBG("trie_flush found=%d\n", found);
/* Index of the last default route chosen; -1 means "none cached".
 * NOTE(review): plain static int shared across calls — presumably protected
 * by fib_lock, confirm against the full source. */
1777 static int trie_last_dflt = -1;
/* fn_trie_select_default() - fib_table->tb_select_default hook: pick a
 * usable default (0/0) route, preferring the cached trie_last_dflt and
 * falling back to the first alive nexthop (fib_detect_death).
 */
1780 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1782 struct trie *t = (struct trie *) tb->tb_data;
1783 int order, last_idx;
1784 struct fib_info *fi = NULL;
1785 struct fib_info *last_resort;
1786 struct fib_alias *fa = NULL;
1787 struct list_head *fa_head;
1794 read_lock(&fib_lock);
/* Key 0 with prefix length 0 is the default-route leaf. */
1796 l = fib_find_node(t, 0);
1800 fa_head = get_fa_head(l, 0);
1804 if (list_empty(fa_head))
1807 list_for_each_entry(fa, fa_head, fa_list) {
1808 struct fib_info *next_fi = fa->fa_info;
/* Only unicast aliases in the requested scope are candidates. */
1810 if (fa->fa_scope != res->scope ||
1811 fa->fa_type != RTN_UNICAST)
1814 if (next_fi->fib_priority > res->fi->fib_priority)
/* A default route must have a gateway reachable on-link. */
1816 if (!next_fi->fib_nh[0].nh_gw ||
1817 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1819 fa->fa_state |= FA_S_ACCESSED;
1822 if (next_fi != res->fi)
1824 } else if (!fib_detect_death(fi, order, &last_resort,
1825 &last_idx, &trie_last_dflt)) {
1827 fib_info_put(res->fi);
1829 atomic_inc(&fi->fib_clntref);
1830 trie_last_dflt = order;
/* No candidate at all: forget the cached default. */
1836 if (order <= 0 || fi == NULL) {
1837 trie_last_dflt = -1;
1841 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1843 fib_info_put(res->fi);
1845 atomic_inc(&fi->fib_clntref);
1846 trie_last_dflt = order;
/* Everything looked dead: fall back to the recorded last resort. */
1849 if (last_idx >= 0) {
1851 fib_info_put(res->fi);
1852 res->fi = last_resort;
1854 atomic_inc(&last_resort->fib_clntref);
1856 trie_last_dflt = last_idx;
1858 read_unlock(&fib_lock);
/* fn_trie_dump_fa() - emit one rtnetlink dump record per alias on @fah for
 * prefix @key/@plen into @skb; used by fn_trie_dump_plen().
 */
1861 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1862 struct sk_buff *skb, struct netlink_callback *cb)
1865 struct fib_alias *fa;
/* Keys are kept host-order in the trie; netlink wants network order. */
1867 u32 xkey = htonl(key);
1872 list_for_each_entry(fa, fah, fa_list) {
/* NOTE(review): fa->fa_info is dereferenced here BEFORE the
 * fa_info == NULL check below — the two checks are in the wrong
 * order; a NULL fa_info would oops on this line. */
1877 if (fa->fa_info->fib_nh == NULL) {
1878 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1882 if (fa->fa_info == NULL) {
1883 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1888 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1897 fa->fa_info, 0) < 0) {
/* fn_trie_dump_plen() - dump all routes of one prefix length @plen,
 * iterating every leaf and resuming from cb->args on partial dumps.
 */
1907 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1908 struct netlink_callback *cb)
1911 struct list_head *fa_head;
1912 struct leaf *l = NULL;
1916 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
/* New leaf: reset the deeper continuation state (args[3..]). */
1920 memset(&cb->args[3], 0,
1921 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1923 fa_head = get_fa_head(l, plen);
1928 if (list_empty(fa_head))
1931 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
/* fn_trie_dump() - fib_table->tb_dump hook: dump the whole table, longest
 * prefixes first (32-m), under the FIB reader lock.
 */
1940 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1943 struct trie *t = (struct trie *) tb->tb_data;
1947 read_lock(&fib_lock);
1948 for (m = 0; m <= 32; m++) {
/* New prefix length: reset the leaf-level continuation state. */
1952 memset(&cb->args[2], 0,
1953 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1955 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
/* Early-exit path (partial dump) also releases the lock. */
1960 read_unlock(&fib_lock);
1964 read_unlock(&fib_lock);
1968 /* Fix more generic FIB names for init later */
/* fib_hash_init() - despite the name (kept for link-compatibility with
 * fib_hash.c), allocates and initializes a trie-backed fib_table.
 * Non-multiple-tables builds get the __init variant. */
1970 #ifdef CONFIG_IP_MULTIPLE_TABLES
1971 struct fib_table * fib_hash_init(int id)
1973 struct fib_table * __init fib_hash_init(int id)
1976 struct fib_table *tb;
/* Lazily create the shared slab cache for fib_alias objects. */
1979 if (fn_alias_kmem == NULL)
1980 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1981 sizeof(struct fib_alias),
1982 0, SLAB_HWCACHE_ALIGN,
/* The struct trie lives in the flexible tb_data area of fib_table. */
1985 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1991 tb->tb_lookup = fn_trie_lookup;
1992 tb->tb_insert = fn_trie_insert;
1993 tb->tb_delete = fn_trie_delete;
1994 tb->tb_flush = fn_trie_flush;
1995 tb->tb_select_default = fn_trie_select_default;
1996 tb->tb_dump = fn_trie_dump;
1997 memset(tb->tb_data, 0, sizeof(struct trie));
1999 t = (struct trie *) tb->tb_data;
/* Remember the local/main tries for the /proc dump code below. */
2003 if (id == RT_TABLE_LOCAL)
2005 else if (id == RT_TABLE_MAIN)
2008 if (id == RT_TABLE_LOCAL)
2009 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2014 /* Trie dump functions */
/* putspace_seq() - emit @n spaces to @seq (loop body on a missing line). */
2016 static void putspace_seq(struct seq_file *seq, int n)
2019 seq_printf(seq, " ");
/* printbin_seq() - print @v in binary, most-significant of the low @bits
 * first; presumably `bits` is decremented by a loop on a missing line —
 * TODO confirm against the full source. */
2022 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2025 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
/* printnode_seq() - pretty-print one trie node (leaf or internal tnode) at
 * @indent; @cindex/@bits describe its slot in the parent, @pend the number
 * of key bits already consumed on the path down.
 */
2028 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2029 int pend, int cindex, int bits)
2031 putspace_seq(seq, indent);
2033 seq_printf(seq, "|");
2035 seq_printf(seq, "+");
/* Non-root: show the child slot both in decimal and binary. */
2037 seq_printf(seq, "%d/", cindex);
2038 printbin_seq(seq, cindex, bits);
2039 seq_printf(seq, ": ");
2041 seq_printf(seq, "<root>: ");
2042 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
/* Leaf: dump its key dotted-quad plus every alias per prefix length. */
2045 struct leaf *l = (struct leaf *)n;
2046 struct fib_alias *fa;
2049 seq_printf(seq, "key=%d.%d.%d.%d\n",
2050 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2052 for (i = 32; i >= 0; i--)
2053 if (find_leaf_info(&l->list, i)) {
2054 struct list_head *fa_head = get_fa_head(l, i);
2059 if (list_empty(fa_head))
2062 putspace_seq(seq, indent+2);
2063 seq_printf(seq, "{/%d...dumping}\n", i);
2065 list_for_each_entry(fa, fa_head, fa_list) {
2066 putspace_seq(seq, indent+2);
2067 if (fa->fa_info == NULL) {
2068 seq_printf(seq, "Error fa_info=NULL\n");
2071 if (fa->fa_info->fib_nh == NULL) {
2072 seq_printf(seq, "Error _fib_nh=NULL\n");
2076 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
/* Internal tnode: print its masked key prefix and geometry. */
2083 struct tnode *tn = (struct tnode *)n;
2084 int plen = ((struct tnode *)n)->pos;
2085 t_key prf = MASK_PFX(n->key, plen);
2087 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2088 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2090 putspace_seq(seq, indent); seq_printf(seq, "| ");
2091 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
2092 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2093 seq_printf(seq, "}\n");
2094 putspace_seq(seq, indent); seq_printf(seq, "| ");
2095 seq_printf(seq, "{pos=%d", tn->pos);
/* skip = bits jumped over relative to what the parent already matched. */
2096 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2097 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2098 putspace_seq(seq, indent); seq_printf(seq, "| ");
2099 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
/* trie_dump_seq() - depth-first dump of the whole trie @t to @seq, using an
 * explicit parent-pointer walk (no recursion).  Runs under the FIB reader
 * lock for the full traversal.
 */
2103 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2105 struct node *n = t->trie;
2112 read_lock(&fib_lock);
2114 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2117 seq_printf(seq, "------ trie is empty\n");
2119 read_unlock(&fib_lock);
/* Print the root first (cindex 0, no parent bits). */
2123 printnode_seq(seq, indent, n, pend, cindex, 0);
2126 read_unlock(&fib_lock);
2130 tn = (struct tnode *)n;
2131 pend = tn->pos+tn->bits;
2132 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
/* Iterate all child slots of the current tnode. */
2136 while (tn && cindex < (1 << tn->bits)) {
2137 if (tn->child[cindex]) {
2140 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2141 if (IS_LEAF(tn->child[cindex])) {
2145 * New tnode. Descend one level
2149 tn = (struct tnode *)tn->child[cindex];
2150 pend = tn->pos + tn->bits;
2151 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2159 * Test if we are done
/* Slots exhausted: climb toward the root, restoring cindex/pend. */
2162 while (cindex >= (1 << tn->bits)) {
2164 * Move upwards and test for root
2165 * pop off all traversed nodes
2168 if (NODE_PARENT(tn) == NULL) {
2173 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2175 tn = NODE_PARENT(tn);
2176 pend = tn->pos + tn->bits;
2182 read_unlock(&fib_lock);
/* trie_stat_new() - allocate a zeroed trie_stat; returns NULL on allocation
 * failure (check presumably on a missing line — TODO confirm).
 */
2185 static struct trie_stat *trie_stat_new(void)
2187 struct trie_stat *s;
2190 s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2198 s->nullpointers = 0;
/* One histogram bucket per possible tnode size (bits). */
2200 for (i = 0; i < MAX_CHILDS; i++)
2201 s->nodesizes[i] = 0;
/* trie_collect_stats() - walk the whole trie and fill a trie_stat with
 * leaf/tnode counts, depth statistics and node-size histogram.  Same
 * explicit parent-pointer traversal as trie_dump_seq(), under fib_lock.
 */
2206 static struct trie_stat *trie_collect_stats(struct trie *t)
2208 struct node *n = t->trie;
2209 struct trie_stat *s = trie_stat_new();
2219 read_lock(&fib_lock);
2222 struct tnode *tn = (struct tnode *)n;
2223 pend = tn->pos+tn->bits;
/* Count the root tnode in the size histogram. */
2224 s->nodesizes[tn->bits]++;
2227 while (tn && cindex < (1 << tn->bits)) {
2228 if (tn->child[cindex]) {
2231 if (IS_LEAF(tn->child[cindex])) {
/* Track maximum and total depth over all leaves. */
2235 if (depth > s->maxdepth)
2236 s->maxdepth = depth;
2237 s->totdepth += depth;
2241 * New tnode. Descend one level
2245 s->nodesizes[tn->bits]++;
2248 n = tn->child[cindex];
2249 tn = (struct tnode *)n;
2250 pend = tn->pos+tn->bits;
2260 * Test if we are done
/* Slots exhausted: climb toward the root, restoring cindex/pend. */
2263 while (cindex >= (1 << tn->bits)) {
2265 * Move upwards and test for root
2266 * pop off all traversed nodes
2269 if (NODE_PARENT(tn) == NULL) {
2275 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2276 tn = NODE_PARENT(tn);
2278 n = (struct node *)tn;
2279 pend = tn->pos+tn->bits;
2285 read_unlock(&fib_lock);
2289 #ifdef CONFIG_PROC_FS
/* seq_file iterator stubs for /proc/net/fib_triestat: a single-record
 * "file", so get_first/get_next presumably just return NULL after the
 * START token (bodies on missing lines — TODO confirm). */
2291 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2296 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2301 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2303 if (!ip_fib_main_table)
2307 return fib_triestat_get_next(seq);
/* First call (pos == 0) yields the START token so show() prints headers. */
2309 return SEQ_START_TOKEN;
2312 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2315 if (v == SEQ_START_TOKEN)
2316 return fib_triestat_get_first(seq);
2318 return fib_triestat_get_next(seq);
2321 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2327 * This outputs /proc/net/fib_triestats
2329 * It always works in backward compatibility mode.
2330 * The format of the file is not supposed to be changed.
/* collect_and_show() - gather trie statistics via trie_collect_stats() and
 * render them (depths, node counts, memory usage, and — when enabled —
 * the per-trie lookup counters) into @seq.
 */
2333 static void collect_and_show(struct trie *t, struct seq_file *seq)
2335 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2336 int i, max, pointers;
2337 struct trie_stat *stat;
2340 stat = trie_collect_stats(t);
2343 seq_printf(seq, "trie=%p\n", t);
/* Average depth scaled by 100 so two decimals survive integer division. */
2347 avdepth = stat->totdepth*100 / stat->leaves;
2350 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
2351 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2353 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2354 bytes += sizeof(struct leaf) * stat->leaves;
2355 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2356 bytes += sizeof(struct tnode) * stat->tnodes;
/* Trim trailing empty histogram buckets before printing. */
2360 while (max >= 0 && stat->nodesizes[max] == 0)
2364 for (i = 1; i <= max; i++)
2365 if (stat->nodesizes[i] != 0) {
2366 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2367 pointers += (1<<i) * stat->nodesizes[i];
2369 seq_printf(seq, "\n");
2370 seq_printf(seq, "Pointers: %d\n", pointers);
2371 bytes += sizeof(struct node *) * pointers;
2372 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2373 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2378 #ifdef CONFIG_IP_FIB_TRIE_STATS
2379 seq_printf(seq, "Counters:\n---------\n");
2380 seq_printf(seq,"gets = %d\n", t->stats.gets);
2381 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2382 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2383 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2384 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2385 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
/* Counters are reset after every read of the proc file. */
2387 memset(&(t->stats), 0, sizeof(t->stats));
2389 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/* fib_triestat_seq_show() - seq_file show() for /proc/net/fib_triestat:
 * on the START token, print sizing info and the stats of both tries.
 */
2392 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2396 if (v == SEQ_START_TOKEN) {
2397 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2398 sizeof(struct leaf), sizeof(struct tnode));
2400 collect_and_show(trie_local, seq);
2403 collect_and_show(trie_main, seq);
/* Non-token path: emit a fixed-format placeholder line (legacy format). */
2405 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2407 seq_printf(seq, "%-127s\n", bf);
/* seq_file operations wiring for /proc/net/fib_triestat. */
2412 static struct seq_operations fib_triestat_seq_ops = {
2413 .start = fib_triestat_seq_start,
2414 .next = fib_triestat_seq_next,
2415 .stop = fib_triestat_seq_stop,
2416 .show = fib_triestat_seq_show,
/* fib_triestat_seq_open() - open() hook: attach fib_triestat_seq_ops via
 * seq_open(); error handling is on missing lines.
 */
2419 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2421 struct seq_file *seq;
2424 rc = seq_open(file, &fib_triestat_seq_ops);
2428 seq = file->private_data;
/* file_operations for /proc/net/fib_triestat. */
2435 static struct file_operations fib_triestat_seq_fops = {
2436 .owner = THIS_MODULE,
2437 .open = fib_triestat_seq_open,
2439 .llseek = seq_lseek,
2440 .release = seq_release_private,
/* Register/remove the /proc/net/fib_triestat entry. */
2443 int __init fib_stat_proc_init(void)
2445 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2450 void __init fib_stat_proc_exit(void)
2452 proc_net_remove("fib_triestat");
/* seq_file iterator stubs for /proc/net/fib_trie — same single-record
 * pattern as the fib_triestat iterators above (bodies on missing lines). */
2455 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2460 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2465 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2467 if (!ip_fib_main_table)
2471 return fib_trie_get_next(seq);
/* First call yields the START token so show() dumps both tries. */
2473 return SEQ_START_TOKEN;
2476 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2479 if (v == SEQ_START_TOKEN)
2480 return fib_trie_get_first(seq);
2482 return fib_trie_get_next(seq);
2486 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2491 * This outputs /proc/net/fib_trie.
2493 * It always works in backward compatibility mode.
2494 * The format of the file is not supposed to be changed.
/* fib_trie_seq_show() - seq_file show() for /proc/net/fib_trie: on the
 * START token, dump the full structure of both tries via trie_dump_seq().
 */
2497 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2501 if (v == SEQ_START_TOKEN) {
2503 trie_dump_seq(seq, trie_local);
2506 trie_dump_seq(seq, trie_main);
/* Non-token path: fixed-format placeholder line (legacy format). */
2508 snprintf(bf, sizeof(bf),
2509 "*\t%08X\t%08X", 200, 400);
2510 seq_printf(seq, "%-127s\n", bf);
/* seq_file operations wiring for /proc/net/fib_trie. */
2516 static struct seq_operations fib_trie_seq_ops = {
2517 .start = fib_trie_seq_start,
2518 .next = fib_trie_seq_next,
2519 .stop = fib_trie_seq_stop,
2520 .show = fib_trie_seq_show,
/* fib_trie_seq_open() - open() hook: attach fib_trie_seq_ops via
 * seq_open(); error handling is on missing lines.
 */
2523 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2525 struct seq_file *seq;
2528 rc = seq_open(file, &fib_trie_seq_ops);
2532 seq = file->private_data;
/* file_operations for /proc/net/fib_trie. */
2539 static struct file_operations fib_trie_seq_fops = {
2540 .owner = THIS_MODULE,
2541 .open = fib_trie_seq_open,
2543 .llseek = seq_lseek,
2544 .release= seq_release_private,
/* Register/remove the /proc/net/fib_trie entry. */
2547 int __init fib_proc_init(void)
2549 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2554 void __init fib_proc_exit(void)
2556 proc_net_remove("fib_trie");
2559 #endif /* CONFIG_PROC_FS */