1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expect
43    registrations, and conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep;
74 static kmem_cache_t *ip_conntrack_expect_cachep;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89  * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96                                     ecache->ct);
97         ecache->events = 0;
98         ip_conntrack_put(ecache->ct);
99         ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103  * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache;
107         
108         local_bh_disable();
109         ecache = &__get_cpu_var(ip_conntrack_ecache);
110         if (ecache->ct == ct)
111                 __ip_ct_deliver_cached_events(ecache);
112         local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117         struct ip_conntrack_ecache *ecache;
118
119         /* take care of delivering potentially old events */
120         ecache = &__get_cpu_var(ip_conntrack_ecache);
121         BUG_ON(ecache->ct == ct);
122         if (ecache->ct)
123                 __ip_ct_deliver_cached_events(ecache);
124         /* initialize for this conntrack/packet */
125         ecache->ct = ct;
126         nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called
130  * while packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133         struct ip_conntrack_ecache *ecache;
134         int cpu;
135
136         for_each_cpu(cpu) {
137                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138                 if (ecache->ct)
139                         ip_conntrack_put(ecache->ct);
140         }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
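/* Illustrative sketch (not part of this file) of how the event cache above is
 * driven: a caller in the packet path records an event for the conntrack
 * attached to the skb, and the per-CPU cache is delivered either when a
 * different conntrack shows up on that CPU or when the caller flushes it
 * explicitly:
 *
 *	ip_conntrack_event_cache(IPCT_STATUS, *pskb);
 *	...
 *	ip_ct_deliver_cached_events(ct);
 */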
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
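/* Hash a tuple into the conntrack table: jhash_3words() mixes the source
 * address, the destination address xor'ed with the protocol number, and the
 * two layer-4 port/id words, keyed with a boot-time random value; the result
 * is reduced modulo the table size. */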
151 static u_int32_t
152 hash_conntrack(const struct ip_conntrack_tuple *tuple)
153 {
154 #if 0
155         dump_tuple(tuple);
156 #endif
157         return (jhash_3words(tuple->src.ip,
158                              (tuple->dst.ip ^ tuple->dst.protonum),
159                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
160                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161 }
162
163 int
164 ip_ct_get_tuple(const struct iphdr *iph,
165                 const struct sk_buff *skb,
166                 unsigned int dataoff,
167                 struct ip_conntrack_tuple *tuple,
168                 const struct ip_conntrack_protocol *protocol)
169 {
170         /* Should never happen. */
171         if (iph->frag_off & htons(IP_OFFSET)) {
172                 printk("ip_conntrack_core: Frag of proto %u.\n",
173                        iph->protocol);
174                 return 0;
175         }
176
177         tuple->src.ip = iph->saddr;
178         tuple->dst.ip = iph->daddr;
179         tuple->dst.protonum = iph->protocol;
180         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182         return protocol->pkt_to_tuple(skb, dataoff, tuple);
183 }
184
185 int
186 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187                    const struct ip_conntrack_tuple *orig,
188                    const struct ip_conntrack_protocol *protocol)
189 {
190         inverse->src.ip = orig->dst.ip;
191         inverse->dst.ip = orig->src.ip;
192         inverse->dst.protonum = orig->dst.protonum;
193         inverse->dst.dir = !orig->dst.dir;
194
195         return protocol->invert_tuple(inverse, orig);
196 }
197
198
199 /* ip_conntrack_expect helper functions */
200 static void unlink_expect(struct ip_conntrack_expect *exp)
201 {
202         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203         IP_NF_ASSERT(!timer_pending(&exp->timeout));
204         list_del(&exp->list);
205         CONNTRACK_STAT_INC(expect_delete);
206         exp->master->expecting--;
207         ip_conntrack_expect_put(exp);
208 }
209
210 void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
211 {
212         unlink_expect(exp);
213         ip_conntrack_expect_put(exp);
214 }
215
216 static void expectation_timed_out(unsigned long ul_expect)
217 {
218         struct ip_conntrack_expect *exp = (void *)ul_expect;
219
220         write_lock_bh(&ip_conntrack_lock);
221         unlink_expect(exp);
222         write_unlock_bh(&ip_conntrack_lock);
223         ip_conntrack_expect_put(exp);
224 }
225
226 struct ip_conntrack_expect *
227 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
228 {
229         struct ip_conntrack_expect *i;
230         
231         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
232                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
233                         atomic_inc(&i->use);
234                         return i;
235                 }
236         }
237         return NULL;
238 }
239
240 /* Just find an expectation corresponding to a tuple. */
241 struct ip_conntrack_expect *
242 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
243 {
244         struct ip_conntrack_expect *i;
245         
246         read_lock_bh(&ip_conntrack_lock);
247         i = __ip_conntrack_expect_find(tuple);
248         read_unlock_bh(&ip_conntrack_lock);
249
250         return i;
251 }
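/* Note: on success the lookups above return the expectation with its use
 * count already incremented; callers drop that reference again with
 * ip_conntrack_expect_put() when they are done with it. */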
252
253 /* If an expectation for this connection is found, it is deleted from the
254  * global list and then returned. */
255 static struct ip_conntrack_expect *
256 find_expectation(const struct ip_conntrack_tuple *tuple)
257 {
258         struct ip_conntrack_expect *i;
259
260         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
261                 /* If master is not in hash table yet (ie. packet hasn't left
262                    this machine yet), how can other end know about expected?
263                    Hence these are not the droids you are looking for (if
264                    master ct never got confirmed, we'd hold a reference to it
265                    and weird things would happen to future packets). */
266                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
267                     && is_confirmed(i->master)
268                     && del_timer(&i->timeout)) {
269                         unlink_expect(i);
270                         return i;
271                 }
272         }
273         return NULL;
274 }
275
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack *ct)
278 {
279         struct ip_conntrack_expect *i, *tmp;
280
281         /* Optimization: most connections never expect any others. */
282         if (ct->expecting == 0)
283                 return;
284
285         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
286                 if (i->master == ct && del_timer(&i->timeout)) {
287                         unlink_expect(i);
288                         ip_conntrack_expect_put(i);
289                 }
290         }
291 }
292
293 static void
294 clean_from_lists(struct ip_conntrack *ct)
295 {
296         unsigned int ho, hr;
297         
298         DEBUGP("clean_from_lists(%p)\n", ct);
299         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
300
301         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
302         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
303         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
304         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
305
306         /* Destroy all pending expectations */
307         ip_ct_remove_expectations(ct);
308 }
309
310 static void
311 destroy_conntrack(struct nf_conntrack *nfct)
312 {
313         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
314         struct ip_conntrack_protocol *proto;
315
316         DEBUGP("destroy_conntrack(%p)\n", ct);
317         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
318         IP_NF_ASSERT(!timer_pending(&ct->timeout));
319
320         ip_conntrack_event(IPCT_DESTROY, ct);
321         set_bit(IPS_DYING_BIT, &ct->status);
322
323         /* To make sure we don't get any weird locking issues here:
324          * destroy_conntrack() MUST NOT be called with a write lock
325          * to ip_conntrack_lock!!! -HW */
326         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
327         if (proto && proto->destroy)
328                 proto->destroy(ct);
329
330         if (ip_conntrack_destroyed)
331                 ip_conntrack_destroyed(ct);
332
333         write_lock_bh(&ip_conntrack_lock);
334         /* Expectations will have been removed in clean_from_lists,
335          * except that TFTP can create an expectation on the first packet,
336          * before the connection is in the list, so we need to clean here,
337          * too. */
338         ip_ct_remove_expectations(ct);
339
340         /* We overload first tuple to link into unconfirmed list. */
341         if (!is_confirmed(ct)) {
342                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
344         }
345
346         CONNTRACK_STAT_INC(delete);
347         write_unlock_bh(&ip_conntrack_lock);
348
349         if (ct->master)
350                 ip_conntrack_put(ct->master);
351
352         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353         ip_conntrack_free(ct);
354 }
355
356 static void death_by_timeout(unsigned long ul_conntrack)
357 {
358         struct ip_conntrack *ct = (void *)ul_conntrack;
359
360         write_lock_bh(&ip_conntrack_lock);
361         /* Inside lock so preempt is disabled on module removal path.
362          * Otherwise we can get spurious warnings. */
363         CONNTRACK_STAT_INC(delete_list);
364         clean_from_lists(ct);
365         write_unlock_bh(&ip_conntrack_lock);
366         ip_conntrack_put(ct);
367 }
368
369 static inline int
370 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
371                     const struct ip_conntrack_tuple *tuple,
372                     const struct ip_conntrack *ignored_conntrack)
373 {
374         ASSERT_READ_LOCK(&ip_conntrack_lock);
375         return tuplehash_to_ctrack(i) != ignored_conntrack
376                 && ip_ct_tuple_equal(tuple, &i->tuple);
377 }
378
379 struct ip_conntrack_tuple_hash *
380 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
381                     const struct ip_conntrack *ignored_conntrack)
382 {
383         struct ip_conntrack_tuple_hash *h;
384         unsigned int hash = hash_conntrack(tuple);
385
386         ASSERT_READ_LOCK(&ip_conntrack_lock);
387         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
388                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
389                         CONNTRACK_STAT_INC(found);
390                         return h;
391                 }
392                 CONNTRACK_STAT_INC(searched);
393         }
394
395         return NULL;
396 }
397
398 /* Find a connection corresponding to a tuple. */
399 struct ip_conntrack_tuple_hash *
400 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
401                       const struct ip_conntrack *ignored_conntrack)
402 {
403         struct ip_conntrack_tuple_hash *h;
404
405         read_lock_bh(&ip_conntrack_lock);
406         h = __ip_conntrack_find(tuple, ignored_conntrack);
407         if (h)
408                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
409         read_unlock_bh(&ip_conntrack_lock);
410
411         return h;
412 }
413
414 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
415                                         unsigned int hash,
416                                         unsigned int repl_hash) 
417 {
418         ct->id = ++ip_conntrack_next_id;
419         list_prepend(&ip_conntrack_hash[hash],
420                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
421         list_prepend(&ip_conntrack_hash[repl_hash],
422                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
423 }
424
425 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
426 {
427         unsigned int hash, repl_hash;
428
429         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
430         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
431
432         write_lock_bh(&ip_conntrack_lock);
433         __ip_conntrack_hash_insert(ct, hash, repl_hash);
434         write_unlock_bh(&ip_conntrack_lock);
435 }
436
437 /* Confirm a connection given skb; places it in hash table */
438 int
439 __ip_conntrack_confirm(struct sk_buff **pskb)
440 {
441         unsigned int hash, repl_hash;
442         struct ip_conntrack *ct;
443         enum ip_conntrack_info ctinfo;
444
445         ct = ip_conntrack_get(*pskb, &ctinfo);
446
447         /* ipt_REJECT uses ip_conntrack_attach to attach related
448            ICMP/TCP RST packets in other direction.  Actual packet
449            which created connection will be IP_CT_NEW or for an
450            expected connection, IP_CT_RELATED. */
451         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
452                 return NF_ACCEPT;
453
454         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
455         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
456
457         /* We're not in hash table, and we refuse to set up related
458            connections for unconfirmed conns.  But packet copies and
459            REJECT will give spurious warnings here. */
460         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
461
462         /* No external references means no one else could have
463            confirmed us. */
464         IP_NF_ASSERT(!is_confirmed(ct));
465         DEBUGP("Confirming conntrack %p\n", ct);
466
467         write_lock_bh(&ip_conntrack_lock);
468
469         /* See if there's one in the list already, including reverse:
470            NAT could have grabbed it without realizing, since we're
471            not in the hash.  If there is, we lost the race. */
472         if (!LIST_FIND(&ip_conntrack_hash[hash],
473                        conntrack_tuple_cmp,
474                        struct ip_conntrack_tuple_hash *,
475                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
476             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
477                           conntrack_tuple_cmp,
478                           struct ip_conntrack_tuple_hash *,
479                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
480                 /* Remove from unconfirmed list */
481                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
482
483                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
484                 /* Timer relative to confirmation time, not original
485                    setting time, otherwise we'd get timer wrap in
486                    weird delay cases. */
487                 ct->timeout.expires += jiffies;
488                 add_timer(&ct->timeout);
489                 atomic_inc(&ct->ct_general.use);
490                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
491                 CONNTRACK_STAT_INC(insert);
492                 write_unlock_bh(&ip_conntrack_lock);
493                 if (ct->helper)
494                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
495 #ifdef CONFIG_IP_NF_NAT_NEEDED
496                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
497                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
498                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
499 #endif
500                 ip_conntrack_event_cache(master_ct(ct) ?
501                                          IPCT_RELATED : IPCT_NEW, *pskb);
502
503                 return NF_ACCEPT;
504         }
505
506         CONNTRACK_STAT_INC(insert_failed);
507         write_unlock_bh(&ip_conntrack_lock);
508
509         return NF_DROP;
510 }
511
512 /* Returns true if a connection corresponds to the tuple (required
513    for NAT). */
514 int
515 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
516                          const struct ip_conntrack *ignored_conntrack)
517 {
518         struct ip_conntrack_tuple_hash *h;
519
520         read_lock_bh(&ip_conntrack_lock);
521         h = __ip_conntrack_find(tuple, ignored_conntrack);
522         read_unlock_bh(&ip_conntrack_lock);
523
524         return h != NULL;
525 }
526
527 /* There's a small race here where we may free a just-assured
528    connection.  Too bad: we're in trouble anyway. */
529 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
530 {
531         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
532 }
533
534 static int early_drop(struct list_head *chain)
535 {
536         /* Traverse backwards: gives us oldest, which is roughly LRU */
537         struct ip_conntrack_tuple_hash *h;
538         struct ip_conntrack *ct = NULL;
539         int dropped = 0;
540
541         read_lock_bh(&ip_conntrack_lock);
542         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
543         if (h) {
544                 ct = tuplehash_to_ctrack(h);
545                 atomic_inc(&ct->ct_general.use);
546         }
547         read_unlock_bh(&ip_conntrack_lock);
548
549         if (!ct)
550                 return dropped;
551
552         if (del_timer(&ct->timeout)) {
553                 death_by_timeout((unsigned long)ct);
554                 dropped = 1;
555                 CONNTRACK_STAT_INC(early_drop);
556         }
557         ip_conntrack_put(ct);
558         return dropped;
559 }
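/* Note: early_drop() is only invoked from ip_conntrack_alloc() below, once
 * the table has hit ip_conntrack_max; it tries to evict the oldest entry in
 * the same hash chain that has not yet been assured. */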
560
561 static inline int helper_cmp(const struct ip_conntrack_helper *i,
562                              const struct ip_conntrack_tuple *rtuple)
563 {
564         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
565 }
566
567 static struct ip_conntrack_helper *
568 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
569 {
570         return LIST_FIND(&helpers, helper_cmp,
571                          struct ip_conntrack_helper *,
572                          tuple);
573 }
574
575 struct ip_conntrack_helper *
576 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
577 {
578         struct ip_conntrack_helper *helper;
579
580         /* need ip_conntrack_lock to assure that helper exists until
581          * try_module_get() is called */
582         read_lock_bh(&ip_conntrack_lock);
583
584         helper = __ip_conntrack_helper_find(tuple);
585         if (helper) {
586                 /* need to increase module usage count to assure helper will
587                  * not go away while the caller is e.g. busy putting a
588                  * conntrack in the hash that uses the helper */
589                 if (!try_module_get(helper->me))
590                         helper = NULL;
591         }
592
593         read_unlock_bh(&ip_conntrack_lock);
594
595         return helper;
596 }
597
598 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
599 {
600         module_put(helper->me);
601 }
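/* Illustrative sketch (not part of this file; variable names made up for the
 * example): callers pair the two functions above so the helper module cannot
 * be unloaded while it is in use:
 *
 *	helper = ip_conntrack_helper_find_get(&reply_tuple);
 *	if (helper) {
 *		... use helper ...
 *		ip_conntrack_helper_put(helper);
 *	}
 */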
602
603 struct ip_conntrack_protocol *
604 __ip_conntrack_proto_find(u_int8_t protocol)
605 {
606         return ip_ct_protos[protocol];
607 }
608
609 /* this is guaranteed to always return a valid protocol helper, since
610  * it falls back to generic_protocol */
611 struct ip_conntrack_protocol *
612 ip_conntrack_proto_find_get(u_int8_t protocol)
613 {
614         struct ip_conntrack_protocol *p;
615
616         preempt_disable();
617         p = __ip_conntrack_proto_find(protocol);
618         if (p) {
619                 if (!try_module_get(p->me))
620                         p = &ip_conntrack_generic_protocol;
621         }
622         preempt_enable();
623         
624         return p;
625 }
626
627 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
628 {
629         module_put(p->me);
630 }
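/* Illustrative sketch (not part of this file): the same get/put discipline
 * applies to protocol trackers; per the comment above, _find_get() never
 * returns NULL because it falls back to ip_conntrack_generic_protocol:
 *
 *	proto = ip_conntrack_proto_find_get(iph->protocol);
 *	... use proto ...
 *	ip_conntrack_proto_put(proto);
 */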
631
632 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
633                                         struct ip_conntrack_tuple *repl)
634 {
635         struct ip_conntrack *conntrack;
636
637         if (!ip_conntrack_hash_rnd_initted) {
638                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
639                 ip_conntrack_hash_rnd_initted = 1;
640         }
641
642         if (ip_conntrack_max
643             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
644                 unsigned int hash = hash_conntrack(orig);
645                 /* Try dropping from this hash chain. */
646                 if (!early_drop(&ip_conntrack_hash[hash])) {
647                         if (net_ratelimit())
648                                 printk(KERN_WARNING
649                                        "ip_conntrack: table full, dropping"
650                                        " packet.\n");
651                         return ERR_PTR(-ENOMEM);
652                 }
653         }
654
655         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
656         if (!conntrack) {
657                 DEBUGP("Can't allocate conntrack.\n");
658                 return NULL;
659         }
660
661         memset(conntrack, 0, sizeof(*conntrack));
662         atomic_set(&conntrack->ct_general.use, 1);
663         conntrack->ct_general.destroy = destroy_conntrack;
664         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
665         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
666         /* Don't set timer yet: wait for confirmation */
667         init_timer(&conntrack->timeout);
668         conntrack->timeout.data = (unsigned long)conntrack;
669         conntrack->timeout.function = death_by_timeout;
670
671         atomic_inc(&ip_conntrack_count);
672
673         return conntrack;
674 }
675
676 void
677 ip_conntrack_free(struct ip_conntrack *conntrack)
678 {
679         atomic_dec(&ip_conntrack_count);
680         kmem_cache_free(ip_conntrack_cachep, conntrack);
681 }
682
683 /* Allocate a new conntrack: we return -ENOMEM if classification
684  * failed due to stress.   Otherwise it really is unclassifiable */
685 static struct ip_conntrack_tuple_hash *
686 init_conntrack(struct ip_conntrack_tuple *tuple,
687                struct ip_conntrack_protocol *protocol,
688                struct sk_buff *skb)
689 {
690         struct ip_conntrack *conntrack;
691         struct ip_conntrack_tuple repl_tuple;
692         struct ip_conntrack_expect *exp;
693
694         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
695                 DEBUGP("Can't invert tuple.\n");
696                 return NULL;
697         }
698
699         if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple)))
700                 return NULL;
701
702         if (!protocol->new(conntrack, skb)) {
703                 ip_conntrack_free(conntrack);
704                 return NULL;
705         }
706
707         write_lock_bh(&ip_conntrack_lock);
708         exp = find_expectation(tuple);
709
710         if (exp) {
711                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
712                         conntrack, exp);
713                 /* Welcome, Mr. Bond.  We've been expecting you... */
714                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
715                 conntrack->master = exp->master;
716 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
717                 conntrack->mark = exp->master->mark;
718 #endif
719 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
720     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
721                 /* this is ugly, but there is no other place to put it */
722                 conntrack->nat.masq_index = exp->master->nat.masq_index;
723 #endif
724                 nf_conntrack_get(&conntrack->master->ct_general);
725                 CONNTRACK_STAT_INC(expect_new);
726         } else {
727                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
728
729                 CONNTRACK_STAT_INC(new);
730         }
731
732         /* Overload tuple linked list to put us in unconfirmed list. */
733         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
734
735         write_unlock_bh(&ip_conntrack_lock);
736
737         if (exp) {
738                 if (exp->expectfn)
739                         exp->expectfn(conntrack, exp);
740                 ip_conntrack_expect_put(exp);
741         }
742
743         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
744 }
745
746 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
747 static inline struct ip_conntrack *
748 resolve_normal_ct(struct sk_buff *skb,
749                   struct ip_conntrack_protocol *proto,
750                   int *set_reply,
751                   unsigned int hooknum,
752                   enum ip_conntrack_info *ctinfo)
753 {
754         struct ip_conntrack_tuple tuple;
755         struct ip_conntrack_tuple_hash *h;
756         struct ip_conntrack *ct;
757
758         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
759
760         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
761                              &tuple, proto))
762                 return NULL;
763
764         /* look for tuple match */
765         h = ip_conntrack_find_get(&tuple, NULL);
766         if (!h) {
767                 h = init_conntrack(&tuple, proto, skb);
768                 if (!h)
769                         return NULL;
770                 if (IS_ERR(h))
771                         return (void *)h;
772         }
773         ct = tuplehash_to_ctrack(h);
774
775         /* It exists; we have (non-exclusive) reference. */
776         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
777                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
778                 /* Please set reply bit if this packet is OK */
779                 *set_reply = 1;
780         } else {
781                 /* Once we've had two-way comms, always ESTABLISHED. */
782                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
783                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
784                                ct);
785                         *ctinfo = IP_CT_ESTABLISHED;
786                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
787                         DEBUGP("ip_conntrack_in: related packet for %p\n",
788                                ct);
789                         *ctinfo = IP_CT_RELATED;
790                 } else {
791                         DEBUGP("ip_conntrack_in: new packet for %p\n",
792                                ct);
793                         *ctinfo = IP_CT_NEW;
794                 }
795                 *set_reply = 0;
796         }
797         skb->nfct = &ct->ct_general;
798         skb->nfctinfo = *ctinfo;
799         return ct;
800 }
801
802 /* Netfilter hook itself. */
803 unsigned int ip_conntrack_in(unsigned int hooknum,
804                              struct sk_buff **pskb,
805                              const struct net_device *in,
806                              const struct net_device *out,
807                              int (*okfn)(struct sk_buff *))
808 {
809         struct ip_conntrack *ct;
810         enum ip_conntrack_info ctinfo;
811         struct ip_conntrack_protocol *proto;
812         int set_reply = 0;
813         int ret;
814
815         /* Previously seen (loopback or untracked)?  Ignore. */
816         if ((*pskb)->nfct) {
817                 CONNTRACK_STAT_INC(ignore);
818                 return NF_ACCEPT;
819         }
820
821         /* Should never happen. */
822         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
823                 if (net_ratelimit()) {
824                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
825                                (*pskb)->nh.iph->protocol, hooknum);
826                 }
827                 return NF_DROP;
828         }
829
830 /* Doesn't cover locally-generated broadcast, so not worth it. */
831 #if 0
832         /* Ignore broadcast: no `connection'. */
833         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
834                 printk("Broadcast packet!\n");
835                 return NF_ACCEPT;
836         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
837                    == htonl(0x000000FF)) {
838                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
839                        NIPQUAD((*pskb)->nh.iph->saddr),
840                        NIPQUAD((*pskb)->nh.iph->daddr),
841                        (*pskb)->sk, (*pskb)->pkt_type);
842         }
843 #endif
844
845         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
846
847         /* It may be a special packet, error, unclean...
848          * the inverse of the return code tells the netfilter
849          * core what to do with the packet. */
850         if (proto->error != NULL 
851             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
852                 CONNTRACK_STAT_INC(error);
853                 CONNTRACK_STAT_INC(invalid);
854                 return -ret;
855         }
856
857         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
858                 /* Not valid part of a connection */
859                 CONNTRACK_STAT_INC(invalid);
860                 return NF_ACCEPT;
861         }
862
863         if (IS_ERR(ct)) {
864                 /* Too stressed to deal. */
865                 CONNTRACK_STAT_INC(drop);
866                 return NF_DROP;
867         }
868
869         IP_NF_ASSERT((*pskb)->nfct);
870
871         ret = proto->packet(ct, *pskb, ctinfo);
872         if (ret < 0) {
873                 /* Invalid: inverse of the return code tells
874                  * the netfilter core what to do */
875                 nf_conntrack_put((*pskb)->nfct);
876                 (*pskb)->nfct = NULL;
877                 CONNTRACK_STAT_INC(invalid);
878                 return -ret;
879         }
880
881         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
882                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
883
884         return ret;
885 }
886
887 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
888                    const struct ip_conntrack_tuple *orig)
889 {
890         return ip_ct_invert_tuple(inverse, orig, 
891                                   __ip_conntrack_proto_find(orig->dst.protonum));
892 }
893
894 /* Would two expected things clash? */
895 static inline int expect_clash(const struct ip_conntrack_expect *a,
896                                const struct ip_conntrack_expect *b)
897 {
898         /* Part covered by intersection of masks must be unequal,
899            otherwise they clash */
900         struct ip_conntrack_tuple intersect_mask
901                 = { { a->mask.src.ip & b->mask.src.ip,
902                       { a->mask.src.u.all & b->mask.src.u.all } },
903                     { a->mask.dst.ip & b->mask.dst.ip,
904                       { a->mask.dst.u.all & b->mask.dst.u.all },
905                       a->mask.dst.protonum & b->mask.dst.protonum } };
906
907         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
908 }
909
910 static inline int expect_matches(const struct ip_conntrack_expect *a,
911                                  const struct ip_conntrack_expect *b)
912 {
913         return a->master == b->master
914                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
915                 && ip_ct_tuple_equal(&a->mask, &b->mask);
916 }
917
918 /* Generally a bad idea to call this: could have matched already. */
919 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
920 {
921         struct ip_conntrack_expect *i;
922
923         write_lock_bh(&ip_conntrack_lock);
924         /* choose the oldest matching expectation to evict */
925         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
926                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
927                         unlink_expect(i);
928                         write_unlock_bh(&ip_conntrack_lock);
929                         ip_conntrack_expect_put(i);
930                         return;
931                 }
932         }
933         write_unlock_bh(&ip_conntrack_lock);
934 }
935
936 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
937 {
938         struct ip_conntrack_expect *new;
939
940         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
941         if (!new) {
942                 DEBUGP("expect_related: OOM allocating expect\n");
943                 return NULL;
944         }
945         new->master = me;
946         atomic_inc(&new->master->ct_general.use);
947         atomic_set(&new->use, 1);
948         return new;
949 }
950
951 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
952 {
953         if (atomic_dec_and_test(&exp->use)) {
954                 ip_conntrack_put(exp->master);
955                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
956         }
957 }
958
959 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
960 {
961         atomic_inc(&exp->use);
962         exp->master->expecting++;
963         list_add(&exp->list, &ip_conntrack_expect_list);
964
965         init_timer(&exp->timeout);
966         exp->timeout.data = (unsigned long)exp;
967         exp->timeout.function = expectation_timed_out;
968         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
969         add_timer(&exp->timeout);
970
971         exp->id = ++ip_conntrack_expect_next_id;
972         atomic_inc(&exp->use);
973         CONNTRACK_STAT_INC(expect_create);
974 }
975
976 /* Race with expectations being used means we could have none to find; OK. */
977 static void evict_oldest_expect(struct ip_conntrack *master)
978 {
979         struct ip_conntrack_expect *i;
980
981         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
982                 if (i->master == master) {
983                         if (del_timer(&i->timeout)) {
984                                 unlink_expect(i);
985                                 ip_conntrack_expect_put(i);
986                         }
987                         break;
988                 }
989         }
990 }
991
992 static inline int refresh_timer(struct ip_conntrack_expect *i)
993 {
994         if (!del_timer(&i->timeout))
995                 return 0;
996
997         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
998         add_timer(&i->timeout);
999         return 1;
1000 }
1001
1002 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1003 {
1004         struct ip_conntrack_expect *i;
1005         int ret;
1006
1007         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1008         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1009         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1010
1011         write_lock_bh(&ip_conntrack_lock);
1012         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1013                 if (expect_matches(i, expect)) {
1014                         /* Refresh timer: if it's dying, ignore.. */
1015                         if (refresh_timer(i)) {
1016                                 ret = 0;
1017                                 goto out;
1018                         }
1019                 } else if (expect_clash(i, expect)) {
1020                         ret = -EBUSY;
1021                         goto out;
1022                 }
1023         }
1024
1025         /* Will be over limit? */
1026         if (expect->master->helper->max_expected && 
1027             expect->master->expecting >= expect->master->helper->max_expected)
1028                 evict_oldest_expect(expect->master);
1029
1030         ip_conntrack_expect_insert(expect);
1031         ip_conntrack_expect_event(IPEXP_NEW, expect);
1032         ret = 0;
1033 out:
1034         write_unlock_bh(&ip_conntrack_lock);
1035         return ret;
1036 }
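/* Illustrative sketch (not part of this file), modelled on how conntrack
 * helpers typically use the expectation API; the tuple/mask values and the
 * "ct"/"ret" variables are placeholders for the example:
 *
 *	exp = ip_conntrack_expect_alloc(ct);
 *	if (exp == NULL)
 *		return NF_DROP;
 *	exp->tuple = ...;	set to the expected related connection
 *	exp->mask = ...;	which fields of it must match exactly
 *	exp->expectfn = NULL;
 *	if (ip_conntrack_expect_related(exp) != 0)
 *		ret = NF_DROP;
 *	ip_conntrack_expect_put(exp);
 */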
1037
1038 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1039    implicitly racy: see __ip_conntrack_confirm */
1040 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1041                               const struct ip_conntrack_tuple *newreply)
1042 {
1043         write_lock_bh(&ip_conntrack_lock);
1044         /* Should be unconfirmed, so not in hash table yet */
1045         IP_NF_ASSERT(!is_confirmed(conntrack));
1046
1047         DEBUGP("Altering reply tuple of %p to ", conntrack);
1048         DUMP_TUPLE(newreply);
1049
1050         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1051         if (!conntrack->master && conntrack->expecting == 0)
1052                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1053         write_unlock_bh(&ip_conntrack_lock);
1054 }
1055
1056 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1057 {
1058         BUG_ON(me->timeout == 0);
1059         write_lock_bh(&ip_conntrack_lock);
1060         list_prepend(&helpers, me);
1061         write_unlock_bh(&ip_conntrack_lock);
1062
1063         return 0;
1064 }
1065
1066 struct ip_conntrack_helper *
1067 __ip_conntrack_helper_find_byname(const char *name)
1068 {
1069         struct ip_conntrack_helper *h;
1070
1071         list_for_each_entry(h, &helpers, list) {
1072                 if (!strcmp(h->name, name))
1073                         return h;
1074         }
1075
1076         return NULL;
1077 }
1078
1079 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1080                          const struct ip_conntrack_helper *me)
1081 {
1082         if (tuplehash_to_ctrack(i)->helper == me) {
1083                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1084                 tuplehash_to_ctrack(i)->helper = NULL;
1085         }
1086         return 0;
1087 }
1088
1089 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1090 {
1091         unsigned int i;
1092         struct ip_conntrack_expect *exp, *tmp;
1093
1094         /* Need write lock here, to delete helper. */
1095         write_lock_bh(&ip_conntrack_lock);
1096         LIST_DELETE(&helpers, me);
1097
1098         /* Get rid of expectations */
1099         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1100                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1101                         unlink_expect(exp);
1102                         ip_conntrack_expect_put(exp);
1103                 }
1104         }
1105         /* Set the helper of any remaining conntracks that used it to NULL. */
1106         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1107         for (i = 0; i < ip_conntrack_htable_size; i++)
1108                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1109                             struct ip_conntrack_tuple_hash *, me);
1110         write_unlock_bh(&ip_conntrack_lock);
1111
1112         /* Someone could be still looking at the helper in a bh. */
1113         synchronize_net();
1114 }
1115
1116 static inline void ct_add_counters(struct ip_conntrack *ct,
1117                                    enum ip_conntrack_info ctinfo,
1118                                    const struct sk_buff *skb)
1119 {
1120 #ifdef CONFIG_IP_NF_CT_ACCT
1121         if (skb) {
1122                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1123                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1124                                         ntohs(skb->nh.iph->tot_len);
1125         }
1126 #endif
1127 }
1128
1129 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1130 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1131                         enum ip_conntrack_info ctinfo,
1132                         const struct sk_buff *skb,
1133                         unsigned long extra_jiffies)
1134 {
1135         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1136
1137         /* If not in hash table, timer will not be active yet */
1138         if (!is_confirmed(ct)) {
1139                 ct->timeout.expires = extra_jiffies;
1140                 ct_add_counters(ct, ctinfo, skb);
1141         } else {
1142                 write_lock_bh(&ip_conntrack_lock);
1143                 /* Need del_timer for race avoidance (may already be dying). */
1144                 if (del_timer(&ct->timeout)) {
1145                         ct->timeout.expires = jiffies + extra_jiffies;
1146                         add_timer(&ct->timeout);
1147                         ip_conntrack_event_cache(IPCT_REFRESH, skb);
1148                 }
1149                 ct_add_counters(ct, ctinfo, skb);
1150                 write_unlock_bh(&ip_conntrack_lock);
1151         }
1152 }
1153
1154 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1155     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1156 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1157  * in ip_conntrack_core, since we don't want the protocols to autoload
1158  * or depend on ctnetlink. */
1159 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1160                                const struct ip_conntrack_tuple *tuple)
1161 {
1162         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1163                 &tuple->src.u.tcp.port);
1164         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1165                 &tuple->dst.u.tcp.port);
1166         return 0;
1167
1168 nfattr_failure:
1169         return -1;
1170 }
1171
1172 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1173                                struct ip_conntrack_tuple *t)
1174 {
1175         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1176                 return -EINVAL;
1177
1178         t->src.u.tcp.port =
1179                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1180         t->dst.u.tcp.port =
1181                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1182
1183         return 0;
1184 }
1185 #endif
1186
1187 /* Returns new sk_buff, or NULL */
1188 struct sk_buff *
1189 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1190 {
1191         skb_orphan(skb);
1192
1193         local_bh_disable(); 
1194         skb = ip_defrag(skb, user);
1195         local_bh_enable();
1196
1197         if (skb)
1198                 ip_send_check(skb->nh.iph);
1199         return skb;
1200 }
1201
1202 /* Used by ipt_REJECT. */
1203 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1204 {
1205         struct ip_conntrack *ct;
1206         enum ip_conntrack_info ctinfo;
1207
1208         /* This ICMP is in reverse direction to the packet which caused it */
1209         ct = ip_conntrack_get(skb, &ctinfo);
1210         
1211         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1212                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1213         else
1214                 ctinfo = IP_CT_RELATED;
1215
1216         /* Attach to new skbuff, and increment count */
1217         nskb->nfct = &ct->ct_general;
1218         nskb->nfctinfo = ctinfo;
1219         nf_conntrack_get(nskb->nfct);
1220 }
1221
1222 static inline int
1223 do_iter(const struct ip_conntrack_tuple_hash *i,
1224         int (*iter)(struct ip_conntrack *i, void *data),
1225         void *data)
1226 {
1227         return iter(tuplehash_to_ctrack(i), data);
1228 }
1229
1230 /* Bring out ya dead! */
1231 static struct ip_conntrack_tuple_hash *
1232 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1233                 void *data, unsigned int *bucket)
1234 {
1235         struct ip_conntrack_tuple_hash *h = NULL;
1236
1237         write_lock_bh(&ip_conntrack_lock);
1238         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1239                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1240                                 struct ip_conntrack_tuple_hash *, iter, data);
1241                 if (h)
1242                         break;
1243         }
1244         if (!h)
1245                 h = LIST_FIND_W(&unconfirmed, do_iter,
1246                                 struct ip_conntrack_tuple_hash *, iter, data);
1247         if (h)
1248                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1249         write_unlock_bh(&ip_conntrack_lock);
1250
1251         return h;
1252 }
1253
1254 void
1255 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1256 {
1257         struct ip_conntrack_tuple_hash *h;
1258         unsigned int bucket = 0;
1259
1260         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1261                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1262                 /* Time to push up daisies... */
1263                 if (del_timer(&ct->timeout))
1264                         death_by_timeout((unsigned long)ct);
1265                 /* ... else the timer will get him soon. */
1266
1267                 ip_conntrack_put(ct);
1268         }
1269 }
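/* Note: callers pass a predicate that returns non-zero for every conntrack
 * they want killed; kill_all() further down returns 1 unconditionally, which
 * is how ip_conntrack_flush() empties the whole table. */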
1270
1271 /* Fast function for those who don't want to parse /proc (and I don't
1272    blame them). */
1273 /* Reversing the socket's dst/src point of view gives us the reply
1274    mapping. */
1275 static int
1276 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1277 {
1278         struct inet_sock *inet = inet_sk(sk);
1279         struct ip_conntrack_tuple_hash *h;
1280         struct ip_conntrack_tuple tuple;
1281         
1282         IP_CT_TUPLE_U_BLANK(&tuple);
1283         tuple.src.ip = inet->rcv_saddr;
1284         tuple.src.u.tcp.port = inet->sport;
1285         tuple.dst.ip = inet->daddr;
1286         tuple.dst.u.tcp.port = inet->dport;
1287         tuple.dst.protonum = IPPROTO_TCP;
1288
1289         /* We only do TCP at the moment: is there a better way? */
1290         if (strcmp(sk->sk_prot->name, "TCP")) {
1291                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1292                 return -ENOPROTOOPT;
1293         }
1294
1295         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1296                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1297                        *len, sizeof(struct sockaddr_in));
1298                 return -EINVAL;
1299         }
1300
1301         h = ip_conntrack_find_get(&tuple, NULL);
1302         if (h) {
1303                 struct sockaddr_in sin;
1304                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1305
1306                 sin.sin_family = AF_INET;
1307                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1308                         .tuple.dst.u.tcp.port;
1309                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1310                         .tuple.dst.ip;
1311
1312                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1313                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1314                 ip_conntrack_put(ct);
1315                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1316                         return -EFAULT;
1317                 else
1318                         return 0;
1319         }
1320         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1321                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1322                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1323         return -ENOENT;
1324 }
1325
1326 static struct nf_sockopt_ops so_getorigdst = {
1327         .pf             = PF_INET,
1328         .get_optmin     = SO_ORIGINAL_DST,
1329         .get_optmax     = SO_ORIGINAL_DST+1,
1330         .get            = &getorigdst,
1331 };
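/* Illustrative userspace sketch (not part of this file): a transparent proxy
 * that accepted a redirected TCP connection can recover the original
 * destination through the socket option registered above (error handling
 * omitted):
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *	getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len);
 */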
1332
1333 static int kill_all(struct ip_conntrack *i, void *data)
1334 {
1335         return 1;
1336 }
1337
1338 static void free_conntrack_hash(void)
1339 {
1340         if (ip_conntrack_vmalloc)
1341                 vfree(ip_conntrack_hash);
1342         else
1343                 free_pages((unsigned long)ip_conntrack_hash, 
1344                            get_order(sizeof(struct list_head)
1345                                      * ip_conntrack_htable_size));
1346 }
1347
1348 void ip_conntrack_flush(void)
1349 {
1350         /* This makes sure all current packets have passed through
1351            netfilter framework.  Roll on, two-stage module
1352            delete... */
1353         synchronize_net();
1354
1355         ip_ct_event_cache_flush();
1356  i_see_dead_people:
1357         ip_ct_iterate_cleanup(kill_all, NULL);
1358         if (atomic_read(&ip_conntrack_count) != 0) {
1359                 schedule();
1360                 goto i_see_dead_people;
1361         }
1362         /* wait until all references to ip_conntrack_untracked are dropped */
1363         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1364                 schedule();
1365 }
1366
1367 /* Mishearing the voices in his head, our hero wonders how he's
1368    supposed to kill the mall. */
1369 void ip_conntrack_cleanup(void)
1370 {
1371         ip_ct_attach = NULL;
1372         ip_conntrack_flush();
1373         kmem_cache_destroy(ip_conntrack_cachep);
1374         kmem_cache_destroy(ip_conntrack_expect_cachep);
1375         free_conntrack_hash();
1376         nf_unregister_sockopt(&so_getorigdst);
1377 }
1378
1379 static int hashsize;
1380 module_param(hashsize, int, 0400);
1381
1382 int __init ip_conntrack_init(void)
1383 {
1384         unsigned int i;
1385         int ret;
1386
1387         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1388          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
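        /* Worked example (illustrative): with 32MB of memory,
         * (32MB / 16384) / sizeof(struct list_head) = 2048 / 8 = 256 buckets
         * on a 32-bit machine, matching the numbers quoted above. */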
1389         if (hashsize) {
1390                 ip_conntrack_htable_size = hashsize;
1391         } else {
1392                 ip_conntrack_htable_size
1393                         = (((num_physpages << PAGE_SHIFT) / 16384)
1394                            / sizeof(struct list_head));
1395                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1396                         ip_conntrack_htable_size = 8192;
1397                 if (ip_conntrack_htable_size < 16)
1398                         ip_conntrack_htable_size = 16;
1399         }
1400         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1401
1402         printk("ip_conntrack version %s (%u buckets, %d max)"
1403                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1404                ip_conntrack_htable_size, ip_conntrack_max,
1405                sizeof(struct ip_conntrack));
1406
1407         ret = nf_register_sockopt(&so_getorigdst);
1408         if (ret != 0) {
1409                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1410                 return ret;
1411         }
1412
1413         /* AK: the hash table is twice as big as needed because it
1414            uses list_head.  It would be much nicer if the caches used a
1415            single-pointer list head here. */
1416         ip_conntrack_vmalloc = 0; 
1417         ip_conntrack_hash 
1418                 =(void*)__get_free_pages(GFP_KERNEL, 
1419                                          get_order(sizeof(struct list_head)
1420                                                    *ip_conntrack_htable_size));
1421         if (!ip_conntrack_hash) { 
1422                 ip_conntrack_vmalloc = 1;
1423                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1424                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1425                                             * ip_conntrack_htable_size);
1426         }
1427         if (!ip_conntrack_hash) {
1428                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1429                 goto err_unreg_sockopt;
1430         }
1431
1432         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1433                                                 sizeof(struct ip_conntrack), 0,
1434                                                 0, NULL, NULL);
1435         if (!ip_conntrack_cachep) {
1436                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1437                 goto err_free_hash;
1438         }
1439
1440         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1441                                         sizeof(struct ip_conntrack_expect),
1442                                         0, 0, NULL, NULL);
1443         if (!ip_conntrack_expect_cachep) {
1444                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1445                 goto err_free_conntrack_slab;
1446         }
1447
1448         /* Don't NEED lock here, but good form anyway. */
1449         write_lock_bh(&ip_conntrack_lock);
1450         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1451                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1452         /* Sew in builtin protocols. */
1453         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1454         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1455         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1456         write_unlock_bh(&ip_conntrack_lock);
1457
1458         for (i = 0; i < ip_conntrack_htable_size; i++)
1459                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1460
1461         /* For use by ipt_REJECT */
1462         ip_ct_attach = ip_conntrack_attach;
1463
1464         /* Set up fake conntrack:
1465             - to never be deleted, not in any hashes */
1466         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1467         /*  - and make it look like a confirmed connection */
1468         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1469
1470         return ret;
1471
1472 err_free_conntrack_slab:
1473         kmem_cache_destroy(ip_conntrack_cachep);
1474 err_free_hash:
1475         free_conntrack_hash();
1476 err_unreg_sockopt:
1477         nf_unregister_sockopt(&so_getorigdst);
1478
1479         return -ENOMEM;
1480 }