net/ipv4/netfilter/ip_conntrack_core.c
[NETFILTER]: connection tracking event notifiers
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43    registrations, conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.2"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep;
74 static kmem_cache_t *ip_conntrack_expect_cachep;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
81 struct notifier_block *ip_conntrack_chain;
82 struct notifier_block *ip_conntrack_expect_chain;
83
84 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
85
86 static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache)
87 {
88         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
89                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
90                                     ecache->ct);
91         ecache->events = 0;
92 }
93
94 void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
95 {
96         __deliver_cached_events(ecache);
97 }
98
99 /* Deliver all cached events for a particular conntrack. This is called
100  * by code prior to async packet handling or freeing the skb */
101 void 
102 ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
103 {
104         struct ip_conntrack_ecache *ecache = 
105                                         &__get_cpu_var(ip_conntrack_ecache);
106
107         if (!ct)
108                 return;
109
110         if (ecache->ct == ct) {
111                 DEBUGP("ecache: delivering event for %p\n", ct);
112                 __deliver_cached_events(ecache);
113         } else {
114                 if (net_ratelimit())
115                         printk(KERN_WARNING "ecache: want to deliver for %p, "
116                                 "but cache has %p\n", ct, ecache->ct);
117         }
118
119         /* signal that events have already been delivered */
120         ecache->ct = NULL;
121 }
122
123 /* Deliver previously cached events if the current conntrack differs from the cached one */
124 void ip_conntrack_event_cache_init(const struct sk_buff *skb)
125 {
126         struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct;
127         struct ip_conntrack_ecache *ecache = 
128                                         &__get_cpu_var(ip_conntrack_ecache);
129
130         /* take care of delivering potentially old events */
131         if (ecache->ct != ct) {
132                 enum ip_conntrack_info ctinfo;
133                 /* we have to check, since at startup the cache is NULL */
134                 if (likely(ecache->ct)) {
135                         DEBUGP("ecache: entered for different conntrack: "
136                                "ecache->ct=%p, skb->nfct=%p. delivering "
137                                "events\n", ecache->ct, ct);
138                         __deliver_cached_events(ecache);
139                         ip_conntrack_put(ecache->ct);
140                 } else {
141                         DEBUGP("ecache: entered for conntrack %p, "
142                                 "cache was clean before\n", ct);
143                 }
144
145                 /* initialize for this conntrack/packet */
146                 ecache->ct = ip_conntrack_get(skb, &ctinfo);
147                 /* ecache->events cleared by __deliver_cached_events() */
148         } else {
149                 DEBUGP("ecache: re-entered for conntrack %p.\n", ct);
150         }
151 }
152
153 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
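/*
 * Illustrative sketch (not part of this file): a consumer of conntrack events
 * would hook the ip_conntrack_chain declared above.  The chain is run with the
 * cached event mask as the 'event' argument and the struct ip_conntrack
 * pointer as the data argument (see __deliver_cached_events()).  The callback
 * and notifier_block names below are hypothetical, and this assumes the IPCT_*
 * event values are bitmask flags as used by the event cache.
 *
 *	static int my_conntrack_event(struct notifier_block *this,
 *				      unsigned long events, void *ptr)
 *	{
 *		struct ip_conntrack *ct = ptr;
 *
 *		if (events & IPCT_DESTROY)
 *			printk(KERN_DEBUG "conntrack %p destroyed\n", ct);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_conntrack_nb = {
 *		.notifier_call = my_conntrack_event,
 *	};
 *
 *	notifier_chain_register(&ip_conntrack_chain, &my_conntrack_nb);
 */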
154
155 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
156
157 void 
158 ip_conntrack_put(struct ip_conntrack *ct)
159 {
160         IP_NF_ASSERT(ct);
161         nf_conntrack_put(&ct->ct_general);
162 }
163
164 static int ip_conntrack_hash_rnd_initted;
165 static unsigned int ip_conntrack_hash_rnd;
166
167 static u_int32_t
168 hash_conntrack(const struct ip_conntrack_tuple *tuple)
169 {
170 #if 0
171         dump_tuple(tuple);
172 #endif
173         return (jhash_3words(tuple->src.ip,
174                              (tuple->dst.ip ^ tuple->dst.protonum),
175                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
176                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
177 }
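/*
 * Note on hash_conntrack(): it folds the source address, the destination
 * address xor'd with the protocol number, and both ports (packed into one
 * 32-bit word) through jhash_3words() and reduces the result modulo the
 * table size.  The original and reply tuples of a connection therefore
 * generally land in different buckets, which is why __ip_conntrack_confirm()
 * below computes both 'hash' and 'repl_hash'.
 */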
178
179 int
180 ip_ct_get_tuple(const struct iphdr *iph,
181                 const struct sk_buff *skb,
182                 unsigned int dataoff,
183                 struct ip_conntrack_tuple *tuple,
184                 const struct ip_conntrack_protocol *protocol)
185 {
186         /* Should never happen */
187         if (iph->frag_off & htons(IP_OFFSET)) {
188                 printk("ip_conntrack_core: Frag of proto %u.\n",
189                        iph->protocol);
190                 return 0;
191         }
192
193         tuple->src.ip = iph->saddr;
194         tuple->dst.ip = iph->daddr;
195         tuple->dst.protonum = iph->protocol;
196         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
197
198         return protocol->pkt_to_tuple(skb, dataoff, tuple);
199 }
200
201 int
202 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
203                    const struct ip_conntrack_tuple *orig,
204                    const struct ip_conntrack_protocol *protocol)
205 {
206         inverse->src.ip = orig->dst.ip;
207         inverse->dst.ip = orig->src.ip;
208         inverse->dst.protonum = orig->dst.protonum;
209         inverse->dst.dir = !orig->dst.dir;
210
211         return protocol->invert_tuple(inverse, orig);
212 }
213
214
215 /* ip_conntrack_expect helper functions */
216 static void unlink_expect(struct ip_conntrack_expect *exp)
217 {
218         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
219         IP_NF_ASSERT(!timer_pending(&exp->timeout));
220         list_del(&exp->list);
221         CONNTRACK_STAT_INC(expect_delete);
222         exp->master->expecting--;
223 }
224
225 static void expectation_timed_out(unsigned long ul_expect)
226 {
227         struct ip_conntrack_expect *exp = (void *)ul_expect;
228
229         write_lock_bh(&ip_conntrack_lock);
230         unlink_expect(exp);
231         write_unlock_bh(&ip_conntrack_lock);
232         ip_conntrack_expect_put(exp);
233 }
234
235 /* If an expectation for this connection is found, it is deleted from the
236  * global list and returned. */
237 static struct ip_conntrack_expect *
238 find_expectation(const struct ip_conntrack_tuple *tuple)
239 {
240         struct ip_conntrack_expect *i;
241
242         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
243                 /* If the master is not in the hash table yet (ie. packet hasn't left
244                    this machine yet), how can the other end know about the expectation?
245                    Hence these are not the droids you are looking for (if
246                    master ct never got confirmed, we'd hold a reference to it
247                    and weird things would happen to future packets). */
248                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
249                     && is_confirmed(i->master)
250                     && del_timer(&i->timeout)) {
251                         unlink_expect(i);
252                         return i;
253                 }
254         }
255         return NULL;
256 }
257
258 /* delete all expectations for this conntrack */
259 static void remove_expectations(struct ip_conntrack *ct)
260 {
261         struct ip_conntrack_expect *i, *tmp;
262
263         /* Optimization: most connections never expect any others. */
264         if (ct->expecting == 0)
265                 return;
266
267         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
268                 if (i->master == ct && del_timer(&i->timeout)) {
269                         unlink_expect(i);
270                         ip_conntrack_expect_put(i);
271                 }
272         }
273 }
274
275 static void
276 clean_from_lists(struct ip_conntrack *ct)
277 {
278         unsigned int ho, hr;
279         
280         DEBUGP("clean_from_lists(%p)\n", ct);
281         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
282
283         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
284         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
285         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
286         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
287
288         /* Destroy all pending expectations */
289         remove_expectations(ct);
290 }
291
292 static void
293 destroy_conntrack(struct nf_conntrack *nfct)
294 {
295         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
296         struct ip_conntrack_protocol *proto;
297
298         DEBUGP("destroy_conntrack(%p)\n", ct);
299         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
300         IP_NF_ASSERT(!timer_pending(&ct->timeout));
301
302         set_bit(IPS_DYING_BIT, &ct->status);
303
304         /* To make sure we don't get any weird locking issues here:
305          * destroy_conntrack() MUST NOT be called with a write lock
306          * to ip_conntrack_lock!!! -HW */
307         proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
308         if (proto && proto->destroy)
309                 proto->destroy(ct);
310
311         if (ip_conntrack_destroyed)
312                 ip_conntrack_destroyed(ct);
313
314         write_lock_bh(&ip_conntrack_lock);
315         /* Expectations will have been removed in clean_from_lists,
316          * except TFTP can create an expectation on the first packet,
317          * before connection is in the list, so we need to clean here,
318          * too. */
319         remove_expectations(ct);
320
321         /* We overload first tuple to link into unconfirmed list. */
322         if (!is_confirmed(ct)) {
323                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
324                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
325         }
326
327         CONNTRACK_STAT_INC(delete);
328         write_unlock_bh(&ip_conntrack_lock);
329
330         if (ct->master)
331                 ip_conntrack_put(ct->master);
332
333         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
334         kmem_cache_free(ip_conntrack_cachep, ct);
335         atomic_dec(&ip_conntrack_count);
336 }
337
338 static void death_by_timeout(unsigned long ul_conntrack)
339 {
340         struct ip_conntrack *ct = (void *)ul_conntrack;
341
342         ip_conntrack_event(IPCT_DESTROY, ct);
343         write_lock_bh(&ip_conntrack_lock);
344         /* Inside lock so preempt is disabled on module removal path.
345          * Otherwise we can get spurious warnings. */
346         CONNTRACK_STAT_INC(delete_list);
347         clean_from_lists(ct);
348         write_unlock_bh(&ip_conntrack_lock);
349         ip_conntrack_put(ct);
350 }
351
352 static inline int
353 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
354                     const struct ip_conntrack_tuple *tuple,
355                     const struct ip_conntrack *ignored_conntrack)
356 {
357         ASSERT_READ_LOCK(&ip_conntrack_lock);
358         return tuplehash_to_ctrack(i) != ignored_conntrack
359                 && ip_ct_tuple_equal(tuple, &i->tuple);
360 }
361
362 static struct ip_conntrack_tuple_hash *
363 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
364                     const struct ip_conntrack *ignored_conntrack)
365 {
366         struct ip_conntrack_tuple_hash *h;
367         unsigned int hash = hash_conntrack(tuple);
368
369         ASSERT_READ_LOCK(&ip_conntrack_lock);
370         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
371                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
372                         CONNTRACK_STAT_INC(found);
373                         return h;
374                 }
375                 CONNTRACK_STAT_INC(searched);
376         }
377
378         return NULL;
379 }
380
381 /* Find a connection corresponding to a tuple. */
382 struct ip_conntrack_tuple_hash *
383 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
384                       const struct ip_conntrack *ignored_conntrack)
385 {
386         struct ip_conntrack_tuple_hash *h;
387
388         read_lock_bh(&ip_conntrack_lock);
389         h = __ip_conntrack_find(tuple, ignored_conntrack);
390         if (h)
391                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
392         read_unlock_bh(&ip_conntrack_lock);
393
394         return h;
395 }
396
397 /* Confirm a connection given skb; places it in hash table */
398 int
399 __ip_conntrack_confirm(struct sk_buff **pskb)
400 {
401         unsigned int hash, repl_hash;
402         struct ip_conntrack *ct;
403         enum ip_conntrack_info ctinfo;
404
405         ct = ip_conntrack_get(*pskb, &ctinfo);
406
407         /* ipt_REJECT uses ip_conntrack_attach to attach related
408            ICMP/TCP RST packets in other direction.  Actual packet
409            which created connection will be IP_CT_NEW or for an
410            expected connection, IP_CT_RELATED. */
411         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
412                 return NF_ACCEPT;
413
414         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
415         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
416
417         /* We're not in hash table, and we refuse to set up related
418            connections for unconfirmed conns.  But packet copies and
419            REJECT will give spurious warnings here. */
420         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
421
422         /* No external references means no one else could have
423            confirmed us. */
424         IP_NF_ASSERT(!is_confirmed(ct));
425         DEBUGP("Confirming conntrack %p\n", ct);
426
427         write_lock_bh(&ip_conntrack_lock);
428
429         /* See if there's one in the list already, including reverse:
430            NAT could have grabbed it without realizing, since we're
431            not in the hash.  If there is, we lost the race. */
432         if (!LIST_FIND(&ip_conntrack_hash[hash],
433                        conntrack_tuple_cmp,
434                        struct ip_conntrack_tuple_hash *,
435                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
436             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
437                           conntrack_tuple_cmp,
438                           struct ip_conntrack_tuple_hash *,
439                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
440                 /* Remove from unconfirmed list */
441                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
442
443                 list_prepend(&ip_conntrack_hash[hash],
444                              &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
445                 list_prepend(&ip_conntrack_hash[repl_hash],
446                              &ct->tuplehash[IP_CT_DIR_REPLY]);
447                 /* Timer relative to confirmation time, not original
448                    setting time, otherwise we'd get timer wrap in
449                    weird delay cases. */
450                 ct->timeout.expires += jiffies;
451                 add_timer(&ct->timeout);
452                 atomic_inc(&ct->ct_general.use);
453                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
454                 CONNTRACK_STAT_INC(insert);
455                 write_unlock_bh(&ip_conntrack_lock);
456                 if (ct->helper)
457                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
458 #ifdef CONFIG_IP_NF_NAT_NEEDED
459                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
460                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
461                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
462 #endif
463                 ip_conntrack_event_cache(master_ct(ct) ?
464                                          IPCT_RELATED : IPCT_NEW, *pskb);
465
466                 return NF_ACCEPT;
467         }
468
469         CONNTRACK_STAT_INC(insert_failed);
470         write_unlock_bh(&ip_conntrack_lock);
471
472         return NF_DROP;
473 }
474
475 /* Returns true if a connection corresponds to the tuple (required
476    for NAT). */
477 int
478 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
479                          const struct ip_conntrack *ignored_conntrack)
480 {
481         struct ip_conntrack_tuple_hash *h;
482
483         read_lock_bh(&ip_conntrack_lock);
484         h = __ip_conntrack_find(tuple, ignored_conntrack);
485         read_unlock_bh(&ip_conntrack_lock);
486
487         return h != NULL;
488 }
489
490 /* There's a small race here where we may free a just-assured
491    connection.  Too bad: we're in trouble anyway. */
492 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
493 {
494         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
495 }
496
497 static int early_drop(struct list_head *chain)
498 {
499         /* Traverse backwards: gives us oldest, which is roughly LRU */
500         struct ip_conntrack_tuple_hash *h;
501         struct ip_conntrack *ct = NULL;
502         int dropped = 0;
503
504         read_lock_bh(&ip_conntrack_lock);
505         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
506         if (h) {
507                 ct = tuplehash_to_ctrack(h);
508                 atomic_inc(&ct->ct_general.use);
509         }
510         read_unlock_bh(&ip_conntrack_lock);
511
512         if (!ct)
513                 return dropped;
514
515         if (del_timer(&ct->timeout)) {
516                 death_by_timeout((unsigned long)ct);
517                 dropped = 1;
518                 CONNTRACK_STAT_INC(early_drop);
519         }
520         ip_conntrack_put(ct);
521         return dropped;
522 }
523
524 static inline int helper_cmp(const struct ip_conntrack_helper *i,
525                              const struct ip_conntrack_tuple *rtuple)
526 {
527         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
528 }
529
530 static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
531 {
532         return LIST_FIND(&helpers, helper_cmp,
533                          struct ip_conntrack_helper *,
534                          tuple);
535 }
536
537 /* Allocate a new conntrack: we return -ENOMEM if classification
538    failed due to stress.  Otherwise it really is unclassifiable. */
539 static struct ip_conntrack_tuple_hash *
540 init_conntrack(const struct ip_conntrack_tuple *tuple,
541                struct ip_conntrack_protocol *protocol,
542                struct sk_buff *skb)
543 {
544         struct ip_conntrack *conntrack;
545         struct ip_conntrack_tuple repl_tuple;
546         size_t hash;
547         struct ip_conntrack_expect *exp;
548
549         if (!ip_conntrack_hash_rnd_initted) {
550                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
551                 ip_conntrack_hash_rnd_initted = 1;
552         }
553
554         hash = hash_conntrack(tuple);
555
556         if (ip_conntrack_max
557             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
558                 /* Try dropping from this hash chain. */
559                 if (!early_drop(&ip_conntrack_hash[hash])) {
560                         if (net_ratelimit())
561                                 printk(KERN_WARNING
562                                        "ip_conntrack: table full, dropping"
563                                        " packet.\n");
564                         return ERR_PTR(-ENOMEM);
565                 }
566         }
567
568         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
569                 DEBUGP("Can't invert tuple.\n");
570                 return NULL;
571         }
572
573         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
574         if (!conntrack) {
575                 DEBUGP("Can't allocate conntrack.\n");
576                 return ERR_PTR(-ENOMEM);
577         }
578
579         memset(conntrack, 0, sizeof(*conntrack));
580         atomic_set(&conntrack->ct_general.use, 1);
581         conntrack->ct_general.destroy = destroy_conntrack;
582         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
583         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
584         if (!protocol->new(conntrack, skb)) {
585                 kmem_cache_free(ip_conntrack_cachep, conntrack);
586                 return NULL;
587         }
588         /* Don't set timer yet: wait for confirmation */
589         init_timer(&conntrack->timeout);
590         conntrack->timeout.data = (unsigned long)conntrack;
591         conntrack->timeout.function = death_by_timeout;
592
593         write_lock_bh(&ip_conntrack_lock);
594         exp = find_expectation(tuple);
595
596         if (exp) {
597                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
598                         conntrack, exp);
599                 /* Welcome, Mr. Bond.  We've been expecting you... */
600                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
601                 conntrack->master = exp->master;
602 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
603                 conntrack->mark = exp->master->mark;
604 #endif
605 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
606     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
607                 /* this is ugly, but there is no other place to put it */
608                 conntrack->nat.masq_index = exp->master->nat.masq_index;
609 #endif
610                 nf_conntrack_get(&conntrack->master->ct_general);
611                 CONNTRACK_STAT_INC(expect_new);
612         } else {
613                 conntrack->helper = ip_ct_find_helper(&repl_tuple);
614
615                 CONNTRACK_STAT_INC(new);
616         }
617
618         /* Overload tuple linked list to put us in unconfirmed list. */
619         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
620
621         atomic_inc(&ip_conntrack_count);
622         write_unlock_bh(&ip_conntrack_lock);
623
624         if (exp) {
625                 if (exp->expectfn)
626                         exp->expectfn(conntrack, exp);
627                 ip_conntrack_expect_put(exp);
628         }
629
630         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
631 }
632
633 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
634 static inline struct ip_conntrack *
635 resolve_normal_ct(struct sk_buff *skb,
636                   struct ip_conntrack_protocol *proto,
637                   int *set_reply,
638                   unsigned int hooknum,
639                   enum ip_conntrack_info *ctinfo)
640 {
641         struct ip_conntrack_tuple tuple;
642         struct ip_conntrack_tuple_hash *h;
643         struct ip_conntrack *ct;
644
645         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
646
647         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
648                                 &tuple, proto))
649                 return NULL;
650
651         /* look for tuple match */
652         h = ip_conntrack_find_get(&tuple, NULL);
653         if (!h) {
654                 h = init_conntrack(&tuple, proto, skb);
655                 if (!h)
656                         return NULL;
657                 if (IS_ERR(h))
658                         return (void *)h;
659         }
660         ct = tuplehash_to_ctrack(h);
661
662         /* It exists; we have (non-exclusive) reference. */
663         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
664                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
665                 /* Please set reply bit if this packet OK */
666                 *set_reply = 1;
667         } else {
668                 /* Once we've had two way comms, always ESTABLISHED. */
669                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
670                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
671                                ct);
672                         *ctinfo = IP_CT_ESTABLISHED;
673                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
674                         DEBUGP("ip_conntrack_in: related packet for %p\n",
675                                ct);
676                         *ctinfo = IP_CT_RELATED;
677                 } else {
678                         DEBUGP("ip_conntrack_in: new packet for %p\n",
679                                ct);
680                         *ctinfo = IP_CT_NEW;
681                 }
682                 *set_reply = 0;
683         }
684         skb->nfct = &ct->ct_general;
685         skb->nfctinfo = *ctinfo;
686         return ct;
687 }
688
689 /* Netfilter hook itself. */
690 unsigned int ip_conntrack_in(unsigned int hooknum,
691                              struct sk_buff **pskb,
692                              const struct net_device *in,
693                              const struct net_device *out,
694                              int (*okfn)(struct sk_buff *))
695 {
696         struct ip_conntrack *ct;
697         enum ip_conntrack_info ctinfo;
698         struct ip_conntrack_protocol *proto;
699         int set_reply = 0;
700         int ret;
701
702         /* Previously seen (loopback or untracked)?  Ignore. */
703         if ((*pskb)->nfct) {
704                 CONNTRACK_STAT_INC(ignore);
705                 return NF_ACCEPT;
706         }
707
708         /* Should never happen */
709         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
710                 if (net_ratelimit()) {
711                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
712                                (*pskb)->nh.iph->protocol, hooknum);
713                 }
714                 return NF_DROP;
715         }
716
717 /* Doesn't cover locally-generated broadcast, so not worth it. */
718 #if 0
719         /* Ignore broadcast: no `connection'. */
720         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
721                 printk("Broadcast packet!\n");
722                 return NF_ACCEPT;
723         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
724                    == htonl(0x000000FF)) {
725                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
726                        NIPQUAD((*pskb)->nh.iph->saddr),
727                        NIPQUAD((*pskb)->nh.iph->daddr),
728                        (*pskb)->sk, (*pskb)->pkt_type);
729         }
730 #endif
731
732         proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
733
734         /* It may be a special packet: error, unclean...  The
735          * inverse of the return code tells the netfilter
736          * core what to do with the packet. */
737         if (proto->error != NULL 
738             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
739                 CONNTRACK_STAT_INC(error);
740                 CONNTRACK_STAT_INC(invalid);
741                 return -ret;
742         }
743
744         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
745                 /* Not valid part of a connection */
746                 CONNTRACK_STAT_INC(invalid);
747                 return NF_ACCEPT;
748         }
749
750         if (IS_ERR(ct)) {
751                 /* Too stressed to deal. */
752                 CONNTRACK_STAT_INC(drop);
753                 return NF_DROP;
754         }
755
756         IP_NF_ASSERT((*pskb)->nfct);
757
758         ip_conntrack_event_cache_init(*pskb);
759
760         ret = proto->packet(ct, *pskb, ctinfo);
761         if (ret < 0) {
762                 /* Invalid: inverse of the return code tells
763                  * the netfilter core what to do*/
764                 nf_conntrack_put((*pskb)->nfct);
765                 (*pskb)->nfct = NULL;
766                 CONNTRACK_STAT_INC(invalid);
767                 return -ret;
768         }
769
770         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
771                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
772
773         return ret;
774 }
775
776 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
777                    const struct ip_conntrack_tuple *orig)
778 {
779         return ip_ct_invert_tuple(inverse, orig, 
780                                   ip_ct_find_proto(orig->dst.protonum));
781 }
782
783 /* Would two expected things clash? */
784 static inline int expect_clash(const struct ip_conntrack_expect *a,
785                                const struct ip_conntrack_expect *b)
786 {
787         /* Part covered by intersection of masks must be unequal,
788            otherwise they clash */
789         struct ip_conntrack_tuple intersect_mask
790                 = { { a->mask.src.ip & b->mask.src.ip,
791                       { a->mask.src.u.all & b->mask.src.u.all } },
792                     { a->mask.dst.ip & b->mask.dst.ip,
793                       { a->mask.dst.u.all & b->mask.dst.u.all },
794                       a->mask.dst.protonum & b->mask.dst.protonum } };
795
796         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
797 }
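/*
 * Illustrative example: if expectation A's mask covers dst.ip and dst.port
 * while expectation B's mask covers only dst.ip, the intersection of the two
 * masks is dst.ip alone, so the expectations clash whenever they name the
 * same destination address, regardless of the other tuple fields.
 */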
798
799 static inline int expect_matches(const struct ip_conntrack_expect *a,
800                                  const struct ip_conntrack_expect *b)
801 {
802         return a->master == b->master
803                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
804                 && ip_ct_tuple_equal(&a->mask, &b->mask);
805 }
806
807 /* Generally a bad idea to call this: could have matched already. */
808 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
809 {
810         struct ip_conntrack_expect *i;
811
812         write_lock_bh(&ip_conntrack_lock);
813         /* choose the oldest expectation to evict */
814         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
815                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
816                         unlink_expect(i);
817                         write_unlock_bh(&ip_conntrack_lock);
818                         ip_conntrack_expect_put(i);
819                         return;
820                 }
821         }
822         write_unlock_bh(&ip_conntrack_lock);
823 }
824
825 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
826 {
827         struct ip_conntrack_expect *new;
828
829         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
830         if (!new) {
831                 DEBUGP("expect_related: OOM allocating expect\n");
832                 return NULL;
833         }
834         new->master = me;
835         atomic_inc(&new->master->ct_general.use);
836         atomic_set(&new->use, 1);
837         return new;
838 }
839
840 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
841 {
842         if (atomic_dec_and_test(&exp->use)) {
843                 ip_conntrack_put(exp->master);
844                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
845         }
846 }
847
848 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
849 {
850         atomic_inc(&exp->use);
851         exp->master->expecting++;
852         list_add(&exp->list, &ip_conntrack_expect_list);
853
854         init_timer(&exp->timeout);
855         exp->timeout.data = (unsigned long)exp;
856         exp->timeout.function = expectation_timed_out;
857         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
858         add_timer(&exp->timeout);
859
860         CONNTRACK_STAT_INC(expect_create);
861 }
862
863 /* Race with expectations being used means we could have none to find; OK. */
864 static void evict_oldest_expect(struct ip_conntrack *master)
865 {
866         struct ip_conntrack_expect *i;
867
868         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
869                 if (i->master == master) {
870                         if (del_timer(&i->timeout)) {
871                                 unlink_expect(i);
872                                 ip_conntrack_expect_put(i);
873                         }
874                         break;
875                 }
876         }
877 }
878
879 static inline int refresh_timer(struct ip_conntrack_expect *i)
880 {
881         if (!del_timer(&i->timeout))
882                 return 0;
883
884         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
885         add_timer(&i->timeout);
886         return 1;
887 }
888
889 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
890 {
891         struct ip_conntrack_expect *i;
892         int ret;
893
894         DEBUGP("ip_conntrack_expect_related %p\n", expect);
895         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
896         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
897
898         write_lock_bh(&ip_conntrack_lock);
899         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
900                 if (expect_matches(i, expect)) {
901                         /* Refresh timer: if it's dying, ignore.. */
902                         if (refresh_timer(i)) {
903                                 ret = 0;
904                                 goto out;
905                         }
906                 } else if (expect_clash(i, expect)) {
907                         ret = -EBUSY;
908                         goto out;
909                 }
910         }
911
912         /* Will be over limit? */
913         if (expect->master->helper->max_expected && 
914             expect->master->expecting >= expect->master->helper->max_expected)
915                 evict_oldest_expect(expect->master);
916
917         ip_conntrack_expect_insert(expect);
918         ip_conntrack_expect_event(IPEXP_NEW, expect);
919         ret = 0;
920 out:
921         write_unlock_bh(&ip_conntrack_lock);
922         return ret;
923 }
924
925 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
926    implicitly racy: see __ip_conntrack_confirm */
927 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
928                               const struct ip_conntrack_tuple *newreply)
929 {
930         write_lock_bh(&ip_conntrack_lock);
931         /* Should be unconfirmed, so not in hash table yet */
932         IP_NF_ASSERT(!is_confirmed(conntrack));
933
934         DEBUGP("Altering reply tuple of %p to ", conntrack);
935         DUMP_TUPLE(newreply);
936
937         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
938         if (!conntrack->master && conntrack->expecting == 0)
939                 conntrack->helper = ip_ct_find_helper(newreply);
940         write_unlock_bh(&ip_conntrack_lock);
941 }
942
943 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
944 {
945         BUG_ON(me->timeout == 0);
946         write_lock_bh(&ip_conntrack_lock);
947         list_prepend(&helpers, me);
948         write_unlock_bh(&ip_conntrack_lock);
949
950         return 0;
951 }
952
953 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
954                          const struct ip_conntrack_helper *me)
955 {
956         if (tuplehash_to_ctrack(i)->helper == me) {
957                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
958                 tuplehash_to_ctrack(i)->helper = NULL;
959         }
960         return 0;
961 }
962
963 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
964 {
965         unsigned int i;
966         struct ip_conntrack_expect *exp, *tmp;
967
968         /* Need write lock here, to delete helper. */
969         write_lock_bh(&ip_conntrack_lock);
970         LIST_DELETE(&helpers, me);
971
972         /* Get rid of expectations */
973         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
974                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
975                         unlink_expect(exp);
976                         ip_conntrack_expect_put(exp);
977                 }
978         }
979         /* Get rid of expecteds, set helpers to NULL. */
980         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
981         for (i = 0; i < ip_conntrack_htable_size; i++)
982                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
983                             struct ip_conntrack_tuple_hash *, me);
984         write_unlock_bh(&ip_conntrack_lock);
985
986         /* Someone could still be looking at the helper in a bh. */
987         synchronize_net();
988 }
989
990 static inline void ct_add_counters(struct ip_conntrack *ct,
991                                    enum ip_conntrack_info ctinfo,
992                                    const struct sk_buff *skb)
993 {
994 #ifdef CONFIG_IP_NF_CT_ACCT
995         if (skb) {
996                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
997                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
998                                         ntohs(skb->nh.iph->tot_len);
999         }
1000 #endif
1001 }
1002
1003 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1004 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1005                         enum ip_conntrack_info ctinfo,
1006                         const struct sk_buff *skb,
1007                         unsigned long extra_jiffies)
1008 {
1009         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1010
1011         /* If not in hash table, timer will not be active yet */
1012         if (!is_confirmed(ct)) {
1013                 ct->timeout.expires = extra_jiffies;
1014                 ct_add_counters(ct, ctinfo, skb);
1015         } else {
1016                 write_lock_bh(&ip_conntrack_lock);
1017                 /* Need del_timer for race avoidance (may already be dying). */
1018                 if (del_timer(&ct->timeout)) {
1019                         ct->timeout.expires = jiffies + extra_jiffies;
1020                         add_timer(&ct->timeout);
1021                         ip_conntrack_event_cache(IPCT_REFRESH, skb);
1022                 }
1023                 ct_add_counters(ct, ctinfo, skb);
1024                 write_unlock_bh(&ip_conntrack_lock);
1025         }
1026 }
1027
1028 /* Returns new sk_buff, or NULL */
1029 struct sk_buff *
1030 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1031 {
1032         skb_orphan(skb);
1033
1034         local_bh_disable(); 
1035         skb = ip_defrag(skb, user);
1036         local_bh_enable();
1037
1038         if (skb)
1039                 ip_send_check(skb->nh.iph);
1040         return skb;
1041 }
1042
1043 /* Used by ipt_REJECT. */
1044 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1045 {
1046         struct ip_conntrack *ct;
1047         enum ip_conntrack_info ctinfo;
1048
1049         /* This ICMP is in reverse direction to the packet which caused it */
1050         ct = ip_conntrack_get(skb, &ctinfo);
1051         
1052         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1053                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1054         else
1055                 ctinfo = IP_CT_RELATED;
1056
1057         /* Attach to new skbuff, and increment count */
1058         nskb->nfct = &ct->ct_general;
1059         nskb->nfctinfo = ctinfo;
1060         nf_conntrack_get(nskb->nfct);
1061 }
1062
1063 static inline int
1064 do_iter(const struct ip_conntrack_tuple_hash *i,
1065         int (*iter)(struct ip_conntrack *i, void *data),
1066         void *data)
1067 {
1068         return iter(tuplehash_to_ctrack(i), data);
1069 }
1070
1071 /* Bring out ya dead! */
1072 static struct ip_conntrack_tuple_hash *
1073 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1074                 void *data, unsigned int *bucket)
1075 {
1076         struct ip_conntrack_tuple_hash *h = NULL;
1077
1078         write_lock_bh(&ip_conntrack_lock);
1079         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1080                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1081                                 struct ip_conntrack_tuple_hash *, iter, data);
1082                 if (h)
1083                         break;
1084         }
1085         if (!h)
1086                 h = LIST_FIND_W(&unconfirmed, do_iter,
1087                                 struct ip_conntrack_tuple_hash *, iter, data);
1088         if (h)
1089                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1090         write_unlock_bh(&ip_conntrack_lock);
1091
1092         return h;
1093 }
1094
1095 void
1096 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1097 {
1098         struct ip_conntrack_tuple_hash *h;
1099         unsigned int bucket = 0;
1100
1101         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1102                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1103                 /* Time to push up daisies... */
1104                 if (del_timer(&ct->timeout))
1105                         death_by_timeout((unsigned long)ct);
1106                 /* ... else the timer will get him soon. */
1107
1108                 ip_conntrack_put(ct);
1109         }
1110
1111 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1112         {
1113                 /* we need to deliver all cached events in order to drop
1114                  * the reference counts */
1115                 int cpu;
1116                 for_each_cpu(cpu) {
1117                         struct ip_conntrack_ecache *ecache = 
1118                                         &per_cpu(ip_conntrack_ecache, cpu);
1119                         if (ecache->ct) {
1120                                 __ip_ct_deliver_cached_events(ecache);
1121                                 ip_conntrack_put(ecache->ct);
1122                                 ecache->ct = NULL;
1123                         }
1124                 }
1125         }
1126 #endif
1127 }
1128
1129 /* Fast function for those who don't want to parse /proc (and I don't
1130    blame them). */
1131 /* Reversing the socket's dst/src point of view gives us the reply
1132    mapping. */
1133 static int
1134 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1135 {
1136         struct inet_sock *inet = inet_sk(sk);
1137         struct ip_conntrack_tuple_hash *h;
1138         struct ip_conntrack_tuple tuple;
1139         
1140         IP_CT_TUPLE_U_BLANK(&tuple);
1141         tuple.src.ip = inet->rcv_saddr;
1142         tuple.src.u.tcp.port = inet->sport;
1143         tuple.dst.ip = inet->daddr;
1144         tuple.dst.u.tcp.port = inet->dport;
1145         tuple.dst.protonum = IPPROTO_TCP;
1146
1147         /* We only do TCP at the moment: is there a better way? */
1148         if (strcmp(sk->sk_prot->name, "TCP")) {
1149                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1150                 return -ENOPROTOOPT;
1151         }
1152
1153         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1154                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1155                        *len, sizeof(struct sockaddr_in));
1156                 return -EINVAL;
1157         }
1158
1159         h = ip_conntrack_find_get(&tuple, NULL);
1160         if (h) {
1161                 struct sockaddr_in sin;
1162                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1163
1164                 sin.sin_family = AF_INET;
1165                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1166                         .tuple.dst.u.tcp.port;
1167                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1168                         .tuple.dst.ip;
1169
1170                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1171                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1172                 ip_conntrack_put(ct);
1173                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1174                         return -EFAULT;
1175                 else
1176                         return 0;
1177         }
1178         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1179                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1180                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1181         return -ENOENT;
1182 }
1183
1184 static struct nf_sockopt_ops so_getorigdst = {
1185         .pf             = PF_INET,
1186         .get_optmin     = SO_ORIGINAL_DST,
1187         .get_optmax     = SO_ORIGINAL_DST+1,
1188         .get            = &getorigdst,
1189 };
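/*
 * Illustrative userspace usage (not part of this file): with the sockopt
 * registered above, a transparent proxy that accepted a redirected TCP
 * connection can recover the original destination like this (usual socket
 * headers and error handling omitted):
 *
 *	#include <linux/netfilter_ipv4.h>	// for SO_ORIGINAL_DST
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		printf("original dst %s:%u\n",
 *		       inet_ntoa(dst.sin_addr), ntohs(dst.sin_port));
 */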
1190
1191 static int kill_all(struct ip_conntrack *i, void *data)
1192 {
1193         return 1;
1194 }
1195
1196 static void free_conntrack_hash(void)
1197 {
1198         if (ip_conntrack_vmalloc)
1199                 vfree(ip_conntrack_hash);
1200         else
1201                 free_pages((unsigned long)ip_conntrack_hash, 
1202                            get_order(sizeof(struct list_head)
1203                                      * ip_conntrack_htable_size));
1204 }
1205
1206 /* Mishearing the voices in his head, our hero wonders how he's
1207    supposed to kill the mall. */
1208 void ip_conntrack_cleanup(void)
1209 {
1210         ip_ct_attach = NULL;
1211         /* This makes sure all current packets have passed through
1212            netfilter framework.  Roll on, two-stage module
1213            delete... */
1214         synchronize_net();
1215  
1216  i_see_dead_people:
1217         ip_ct_iterate_cleanup(kill_all, NULL);
1218         if (atomic_read(&ip_conntrack_count) != 0) {
1219                 schedule();
1220                 goto i_see_dead_people;
1221         }
1222         /* wait until all references to ip_conntrack_untracked are dropped */
1223         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1224                 schedule();
1225
1226         kmem_cache_destroy(ip_conntrack_cachep);
1227         kmem_cache_destroy(ip_conntrack_expect_cachep);
1228         free_conntrack_hash();
1229         nf_unregister_sockopt(&so_getorigdst);
1230 }
1231
1232 static int hashsize;
1233 module_param(hashsize, int, 0400);
1234
1235 int __init ip_conntrack_init(void)
1236 {
1237         unsigned int i;
1238         int ret;
1239
1240         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1241          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1242         if (hashsize) {
1243                 ip_conntrack_htable_size = hashsize;
1244         } else {
1245                 ip_conntrack_htable_size
1246                         = (((num_physpages << PAGE_SHIFT) / 16384)
1247                            / sizeof(struct list_head));
1248                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1249                         ip_conntrack_htable_size = 8192;
1250                 if (ip_conntrack_htable_size < 16)
1251                         ip_conntrack_htable_size = 16;
1252         }
1253         ip_conntrack_max = 8 * ip_conntrack_htable_size;
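        /* Worked example of the sizing above: on a 32MB i386 box,
         * (32 << 20) / 16384 = 2048 bytes of list_heads; with an 8-byte
         * struct list_head that is 256 buckets, and ip_conntrack_max
         * becomes 8 * 256 = 2048 tracked connections. */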
1254
1255         printk("ip_conntrack version %s (%u buckets, %d max)"
1256                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1257                ip_conntrack_htable_size, ip_conntrack_max,
1258                sizeof(struct ip_conntrack));
1259
1260         ret = nf_register_sockopt(&so_getorigdst);
1261         if (ret != 0) {
1262                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1263                 return ret;
1264         }
1265
1266         /* AK: the hash table is twice as big as needed because it
1267            uses list_head.  It would be much nicer for the caches to use a
1268            single-pointer list head here. */
1269         ip_conntrack_vmalloc = 0; 
1270         ip_conntrack_hash 
1271                 =(void*)__get_free_pages(GFP_KERNEL, 
1272                                          get_order(sizeof(struct list_head)
1273                                                    *ip_conntrack_htable_size));
1274         if (!ip_conntrack_hash) { 
1275                 ip_conntrack_vmalloc = 1;
1276                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1277                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1278                                             * ip_conntrack_htable_size);
1279         }
1280         if (!ip_conntrack_hash) {
1281                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1282                 goto err_unreg_sockopt;
1283         }
1284
1285         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1286                                                 sizeof(struct ip_conntrack), 0,
1287                                                 0, NULL, NULL);
1288         if (!ip_conntrack_cachep) {
1289                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1290                 goto err_free_hash;
1291         }
1292
1293         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1294                                         sizeof(struct ip_conntrack_expect),
1295                                         0, 0, NULL, NULL);
1296         if (!ip_conntrack_expect_cachep) {
1297                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1298                 goto err_free_conntrack_slab;
1299         }
1300
1301         /* Don't NEED lock here, but good form anyway. */
1302         write_lock_bh(&ip_conntrack_lock);
1303         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1304                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1305         /* Sew in builtin protocols. */
1306         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1307         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1308         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1309         write_unlock_bh(&ip_conntrack_lock);
1310
1311         for (i = 0; i < ip_conntrack_htable_size; i++)
1312                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1313
1314         /* For use by ipt_REJECT */
1315         ip_ct_attach = ip_conntrack_attach;
1316
1317         /* Set up fake conntrack:
1318             - to never be deleted, not in any hashes */
1319         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1320         /*  - and make it look like a confirmed connection */
1321         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1322
1323         return ret;
1324
1325 err_free_conntrack_slab:
1326         kmem_cache_destroy(ip_conntrack_cachep);
1327 err_free_hash:
1328         free_conntrack_hash();
1329 err_unreg_sockopt:
1330         nf_unregister_sockopt(&so_getorigdst);
1331
1332         return -ENOMEM;
1333 }