1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43    registrations, conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89  * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96                                     ecache->ct);
97         ecache->events = 0;
98         ip_conntrack_put(ecache->ct);
99         ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103  * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache;
107         
108         local_bh_disable();
109         ecache = &__get_cpu_var(ip_conntrack_ecache);
110         if (ecache->ct == ct)
111                 __ip_ct_deliver_cached_events(ecache);
112         local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117         struct ip_conntrack_ecache *ecache;
118
119         /* take care of delivering potentially old events */
120         ecache = &__get_cpu_var(ip_conntrack_ecache);
121         BUG_ON(ecache->ct == ct);
122         if (ecache->ct)
123                 __ip_ct_deliver_cached_events(ecache);
124         /* initialize for this conntrack/packet */
125         ecache->ct = ct;
126         nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called while
130  * packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133         struct ip_conntrack_ecache *ecache;
134         int cpu;
135
136         for_each_cpu(cpu) {
137                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138                 if (ecache->ct)
139                         ip_conntrack_put(ecache->ct);
140         }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
151 static u_int32_t
152 hash_conntrack(const struct ip_conntrack_tuple *tuple)
153 {
154 #if 0
155         dump_tuple(tuple);
156 #endif
157         return (jhash_3words(tuple->src.ip,
158                              (tuple->dst.ip ^ tuple->dst.protonum),
159                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
160                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161 }
162
163 int
164 ip_ct_get_tuple(const struct iphdr *iph,
165                 const struct sk_buff *skb,
166                 unsigned int dataoff,
167                 struct ip_conntrack_tuple *tuple,
168                 const struct ip_conntrack_protocol *protocol)
169 {
170         /* Never happens */
171         if (iph->frag_off & htons(IP_OFFSET)) {
172                 printk("ip_conntrack_core: Frag of proto %u.\n",
173                        iph->protocol);
174                 return 0;
175         }
176
177         tuple->src.ip = iph->saddr;
178         tuple->dst.ip = iph->daddr;
179         tuple->dst.protonum = iph->protocol;
180         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182         return protocol->pkt_to_tuple(skb, dataoff, tuple);
183 }
184
185 int
186 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187                    const struct ip_conntrack_tuple *orig,
188                    const struct ip_conntrack_protocol *protocol)
189 {
190         inverse->src.ip = orig->dst.ip;
191         inverse->dst.ip = orig->src.ip;
192         inverse->dst.protonum = orig->dst.protonum;
193         inverse->dst.dir = !orig->dst.dir;
194
195         return protocol->invert_tuple(inverse, orig);
196 }
197
198
199 /* ip_conntrack_expect helper functions */
200 static void unlink_expect(struct ip_conntrack_expect *exp)
201 {
202         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203         IP_NF_ASSERT(!timer_pending(&exp->timeout));
204         list_del(&exp->list);
205         CONNTRACK_STAT_INC(expect_delete);
206         exp->master->expecting--;
207         ip_conntrack_expect_put(exp);
208 }
209
210 void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
211 {
212         unlink_expect(exp);
213         ip_conntrack_expect_put(exp);
214 }
215
216 static void expectation_timed_out(unsigned long ul_expect)
217 {
218         struct ip_conntrack_expect *exp = (void *)ul_expect;
219
220         write_lock_bh(&ip_conntrack_lock);
221         unlink_expect(exp);
222         write_unlock_bh(&ip_conntrack_lock);
223         ip_conntrack_expect_put(exp);
224 }
225
226 struct ip_conntrack_expect *
227 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
228 {
229         struct ip_conntrack_expect *i;
230         
231         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
232                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
233                         atomic_inc(&i->use);
234                         return i;
235                 }
236         }
237         return NULL;
238 }
239
240 /* Just find an expectation corresponding to a tuple. */
241 struct ip_conntrack_expect *
242 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
243 {
244         struct ip_conntrack_expect *i;
245         
246         read_lock_bh(&ip_conntrack_lock);
247         i = __ip_conntrack_expect_find(tuple);
248         read_unlock_bh(&ip_conntrack_lock);
249
250         return i;
251 }
252
253 /* If an expectation for this connection is found, it gets deleted from
254  * the global list and then returned. */
255 static struct ip_conntrack_expect *
256 find_expectation(const struct ip_conntrack_tuple *tuple)
257 {
258         struct ip_conntrack_expect *i;
259
260         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
261                 /* If master is not in hash table yet (ie. packet hasn't left
262                    this machine yet), how can other end know about expected?
263                    Hence these are not the droids you are looking for (if
264                    master ct never got confirmed, we'd hold a reference to it
265                    and weird things would happen to future packets). */
266                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
267                     && is_confirmed(i->master)) {
268                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
269                                 atomic_inc(&i->use);
270                                 return i;
271                         } else if (del_timer(&i->timeout)) {
272                                 unlink_expect(i);
273                                 return i;
274                         }
275                 }
276         }
277         return NULL;
278 }
279
280 /* delete all expectations for this conntrack */
281 void ip_ct_remove_expectations(struct ip_conntrack *ct)
282 {
283         struct ip_conntrack_expect *i, *tmp;
284
285         /* Optimization: most connections never expect any others. */
286         if (ct->expecting == 0)
287                 return;
288
289         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
290                 if (i->master == ct && del_timer(&i->timeout)) {
291                         unlink_expect(i);
292                         ip_conntrack_expect_put(i);
293                 }
294         }
295 }
296
297 static void
298 clean_from_lists(struct ip_conntrack *ct)
299 {
300         unsigned int ho, hr;
301         
302         DEBUGP("clean_from_lists(%p)\n", ct);
303         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
304
305         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
306         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
307         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
308         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
309
310         /* Destroy all pending expectations */
311         ip_ct_remove_expectations(ct);
312 }
313
314 static void
315 destroy_conntrack(struct nf_conntrack *nfct)
316 {
317         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
318         struct ip_conntrack_protocol *proto;
319
320         DEBUGP("destroy_conntrack(%p)\n", ct);
321         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
322         IP_NF_ASSERT(!timer_pending(&ct->timeout));
323
324         ip_conntrack_event(IPCT_DESTROY, ct);
325         set_bit(IPS_DYING_BIT, &ct->status);
326
327         /* To make sure we don't get any weird locking issues here:
328          * destroy_conntrack() MUST NOT be called with a write lock
329          * to ip_conntrack_lock!!! -HW */
330         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
331         if (proto && proto->destroy)
332                 proto->destroy(ct);
333
334         if (ip_conntrack_destroyed)
335                 ip_conntrack_destroyed(ct);
336
337         write_lock_bh(&ip_conntrack_lock);
338         /* Expectations will have been removed in clean_from_lists,
339          * except TFTP can create an expectation on the first packet,
340          * before connection is in the list, so we need to clean here,
341          * too. */
342         ip_ct_remove_expectations(ct);
343
344         /* We overload first tuple to link into unconfirmed list. */
345         if (!is_confirmed(ct)) {
346                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
347                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
348         }
349
350         CONNTRACK_STAT_INC(delete);
351         write_unlock_bh(&ip_conntrack_lock);
352
353         if (ct->master)
354                 ip_conntrack_put(ct->master);
355
356         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
357         ip_conntrack_free(ct);
358 }
359
360 static void death_by_timeout(unsigned long ul_conntrack)
361 {
362         struct ip_conntrack *ct = (void *)ul_conntrack;
363
364         write_lock_bh(&ip_conntrack_lock);
365         /* Inside lock so preempt is disabled on module removal path.
366          * Otherwise we can get spurious warnings. */
367         CONNTRACK_STAT_INC(delete_list);
368         clean_from_lists(ct);
369         write_unlock_bh(&ip_conntrack_lock);
370         ip_conntrack_put(ct);
371 }
372
373 static inline int
374 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
375                     const struct ip_conntrack_tuple *tuple,
376                     const struct ip_conntrack *ignored_conntrack)
377 {
378         ASSERT_READ_LOCK(&ip_conntrack_lock);
379         return tuplehash_to_ctrack(i) != ignored_conntrack
380                 && ip_ct_tuple_equal(tuple, &i->tuple);
381 }
382
383 struct ip_conntrack_tuple_hash *
384 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
385                     const struct ip_conntrack *ignored_conntrack)
386 {
387         struct ip_conntrack_tuple_hash *h;
388         unsigned int hash = hash_conntrack(tuple);
389
390         ASSERT_READ_LOCK(&ip_conntrack_lock);
391         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
392                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
393                         CONNTRACK_STAT_INC(found);
394                         return h;
395                 }
396                 CONNTRACK_STAT_INC(searched);
397         }
398
399         return NULL;
400 }
401
402 /* Find a connection corresponding to a tuple. */
403 struct ip_conntrack_tuple_hash *
404 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
405                       const struct ip_conntrack *ignored_conntrack)
406 {
407         struct ip_conntrack_tuple_hash *h;
408
409         read_lock_bh(&ip_conntrack_lock);
410         h = __ip_conntrack_find(tuple, ignored_conntrack);
411         if (h)
412                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
413         read_unlock_bh(&ip_conntrack_lock);
414
415         return h;
416 }
417
418 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
419                                         unsigned int hash,
420                                         unsigned int repl_hash) 
421 {
422         ct->id = ++ip_conntrack_next_id;
423         list_prepend(&ip_conntrack_hash[hash],
424                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
425         list_prepend(&ip_conntrack_hash[repl_hash],
426                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
427 }
428
429 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
430 {
431         unsigned int hash, repl_hash;
432
433         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
434         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
435
436         write_lock_bh(&ip_conntrack_lock);
437         __ip_conntrack_hash_insert(ct, hash, repl_hash);
438         write_unlock_bh(&ip_conntrack_lock);
439 }
440
441 /* Confirm a connection given skb; places it in hash table */
442 int
443 __ip_conntrack_confirm(struct sk_buff **pskb)
444 {
445         unsigned int hash, repl_hash;
446         struct ip_conntrack *ct;
447         enum ip_conntrack_info ctinfo;
448
449         ct = ip_conntrack_get(*pskb, &ctinfo);
450
451         /* ipt_REJECT uses ip_conntrack_attach to attach related
452            ICMP/TCP RST packets in other direction.  Actual packet
453            which created connection will be IP_CT_NEW or for an
454            expected connection, IP_CT_RELATED. */
455         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
456                 return NF_ACCEPT;
457
458         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
459         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
460
461         /* We're not in hash table, and we refuse to set up related
462            connections for unconfirmed conns.  But packet copies and
463            REJECT will give spurious warnings here. */
464         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
465
466         /* No external references means no one else could have
467            confirmed us. */
468         IP_NF_ASSERT(!is_confirmed(ct));
469         DEBUGP("Confirming conntrack %p\n", ct);
470
471         write_lock_bh(&ip_conntrack_lock);
472
473         /* See if there's one in the list already, including reverse:
474            NAT could have grabbed it without realizing, since we're
475            not in the hash.  If there is, we lost the race. */
476         if (!LIST_FIND(&ip_conntrack_hash[hash],
477                        conntrack_tuple_cmp,
478                        struct ip_conntrack_tuple_hash *,
479                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
480             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
481                           conntrack_tuple_cmp,
482                           struct ip_conntrack_tuple_hash *,
483                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
484                 /* Remove from unconfirmed list */
485                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
486
487                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
488                 /* Timer relative to confirmation time, not original
489                    setting time, otherwise we'd get timer wrap in
490                    weird delay cases. */
491                 ct->timeout.expires += jiffies;
492                 add_timer(&ct->timeout);
493                 atomic_inc(&ct->ct_general.use);
494                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
495                 CONNTRACK_STAT_INC(insert);
496                 write_unlock_bh(&ip_conntrack_lock);
497                 if (ct->helper)
498                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
499 #ifdef CONFIG_IP_NF_NAT_NEEDED
500                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
501                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
502                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
503 #endif
504                 ip_conntrack_event_cache(master_ct(ct) ?
505                                          IPCT_RELATED : IPCT_NEW, *pskb);
506
507                 return NF_ACCEPT;
508         }
509
510         CONNTRACK_STAT_INC(insert_failed);
511         write_unlock_bh(&ip_conntrack_lock);
512
513         return NF_DROP;
514 }
515
516 /* Returns true if a connection corresponds to the tuple (required
517    for NAT). */
518 int
519 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
520                          const struct ip_conntrack *ignored_conntrack)
521 {
522         struct ip_conntrack_tuple_hash *h;
523
524         read_lock_bh(&ip_conntrack_lock);
525         h = __ip_conntrack_find(tuple, ignored_conntrack);
526         read_unlock_bh(&ip_conntrack_lock);
527
528         return h != NULL;
529 }
530
531 /* There's a small race here where we may free a just-assured
532    connection.  Too bad: we're in trouble anyway. */
533 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
534 {
535         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
536 }
537
538 static int early_drop(struct list_head *chain)
539 {
540         /* Traverse backwards: gives us oldest, which is roughly LRU */
541         struct ip_conntrack_tuple_hash *h;
542         struct ip_conntrack *ct = NULL;
543         int dropped = 0;
544
545         read_lock_bh(&ip_conntrack_lock);
546         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
547         if (h) {
548                 ct = tuplehash_to_ctrack(h);
549                 atomic_inc(&ct->ct_general.use);
550         }
551         read_unlock_bh(&ip_conntrack_lock);
552
553         if (!ct)
554                 return dropped;
555
556         if (del_timer(&ct->timeout)) {
557                 death_by_timeout((unsigned long)ct);
558                 dropped = 1;
559                 CONNTRACK_STAT_INC(early_drop);
560         }
561         ip_conntrack_put(ct);
562         return dropped;
563 }
564
565 static inline int helper_cmp(const struct ip_conntrack_helper *i,
566                              const struct ip_conntrack_tuple *rtuple)
567 {
568         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
569 }
570
571 static struct ip_conntrack_helper *
572 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
573 {
574         return LIST_FIND(&helpers, helper_cmp,
575                          struct ip_conntrack_helper *,
576                          tuple);
577 }
578
579 struct ip_conntrack_helper *
580 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
581 {
582         struct ip_conntrack_helper *helper;
583
584         /* need ip_conntrack_lock to assure that helper exists until
585          * try_module_get() is called */
586         read_lock_bh(&ip_conntrack_lock);
587
588         helper = __ip_conntrack_helper_find(tuple);
589         if (helper) {
590                 /* need to increase module usage count to assure helper will
591                  * not go away while the caller is e.g. busy putting a
592                  * conntrack in the hash that uses the helper */
593                 if (!try_module_get(helper->me))
594                         helper = NULL;
595         }
596
597         read_unlock_bh(&ip_conntrack_lock);
598
599         return helper;
600 }
601
602 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
603 {
604         module_put(helper->me);
605 }
606
607 struct ip_conntrack_protocol *
608 __ip_conntrack_proto_find(u_int8_t protocol)
609 {
610         return ip_ct_protos[protocol];
611 }
612
613 /* this is guaranteed to always return a valid protocol helper, since
614  * it falls back to generic_protocol */
615 struct ip_conntrack_protocol *
616 ip_conntrack_proto_find_get(u_int8_t protocol)
617 {
618         struct ip_conntrack_protocol *p;
619
620         preempt_disable();
621         p = __ip_conntrack_proto_find(protocol);
622         if (p) {
623                 if (!try_module_get(p->me))
624                         p = &ip_conntrack_generic_protocol;
625         }
626         preempt_enable();
627         
628         return p;
629 }
630
631 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
632 {
633         module_put(p->me);
634 }
635
636 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
637                                         struct ip_conntrack_tuple *repl)
638 {
639         struct ip_conntrack *conntrack;
640
641         if (!ip_conntrack_hash_rnd_initted) {
642                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
643                 ip_conntrack_hash_rnd_initted = 1;
644         }
645
646         if (ip_conntrack_max
647             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
648                 unsigned int hash = hash_conntrack(orig);
649                 /* Try dropping from this hash chain. */
650                 if (!early_drop(&ip_conntrack_hash[hash])) {
651                         if (net_ratelimit())
652                                 printk(KERN_WARNING
653                                        "ip_conntrack: table full, dropping"
654                                        " packet.\n");
655                         return ERR_PTR(-ENOMEM);
656                 }
657         }
658
659         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
660         if (!conntrack) {
661                 DEBUGP("Can't allocate conntrack.\n");
662                 return ERR_PTR(-ENOMEM);
663         }
664
665         memset(conntrack, 0, sizeof(*conntrack));
666         atomic_set(&conntrack->ct_general.use, 1);
667         conntrack->ct_general.destroy = destroy_conntrack;
668         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
669         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
670         /* Don't set timer yet: wait for confirmation */
671         init_timer(&conntrack->timeout);
672         conntrack->timeout.data = (unsigned long)conntrack;
673         conntrack->timeout.function = death_by_timeout;
674
675         atomic_inc(&ip_conntrack_count);
676
677         return conntrack;
678 }
679
680 void
681 ip_conntrack_free(struct ip_conntrack *conntrack)
682 {
683         atomic_dec(&ip_conntrack_count);
684         kmem_cache_free(ip_conntrack_cachep, conntrack);
685 }
686
687 /* Allocate a new conntrack: we return -ENOMEM if classification
688  * failed due to stress; otherwise it really is unclassifiable. */
689 static struct ip_conntrack_tuple_hash *
690 init_conntrack(struct ip_conntrack_tuple *tuple,
691                struct ip_conntrack_protocol *protocol,
692                struct sk_buff *skb)
693 {
694         struct ip_conntrack *conntrack;
695         struct ip_conntrack_tuple repl_tuple;
696         struct ip_conntrack_expect *exp;
697
698         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
699                 DEBUGP("Can't invert tuple.\n");
700                 return NULL;
701         }
702
703         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
704         if (conntrack == NULL || IS_ERR(conntrack))
705                 return (struct ip_conntrack_tuple_hash *)conntrack;
706
707         if (!protocol->new(conntrack, skb)) {
708                 ip_conntrack_free(conntrack);
709                 return NULL;
710         }
711
712         write_lock_bh(&ip_conntrack_lock);
713         exp = find_expectation(tuple);
714
715         if (exp) {
716                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
717                         conntrack, exp);
718                 /* Welcome, Mr. Bond.  We've been expecting you... */
719                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
720                 conntrack->master = exp->master;
721 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
722                 conntrack->mark = exp->master->mark;
723 #endif
724 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
725     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
726                 /* this is ugly, but there is no other place to put it */
727                 conntrack->nat.masq_index = exp->master->nat.masq_index;
728 #endif
729                 nf_conntrack_get(&conntrack->master->ct_general);
730                 CONNTRACK_STAT_INC(expect_new);
731         } else {
732                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
733
734                 CONNTRACK_STAT_INC(new);
735         }
736
737         /* Overload tuple linked list to put us in unconfirmed list. */
738         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
739
740         write_unlock_bh(&ip_conntrack_lock);
741
742         if (exp) {
743                 if (exp->expectfn)
744                         exp->expectfn(conntrack, exp);
745                 ip_conntrack_expect_put(exp);
746         }
747
748         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
749 }
750
751 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
752 static inline struct ip_conntrack *
753 resolve_normal_ct(struct sk_buff *skb,
754                   struct ip_conntrack_protocol *proto,
755                   int *set_reply,
756                   unsigned int hooknum,
757                   enum ip_conntrack_info *ctinfo)
758 {
759         struct ip_conntrack_tuple tuple;
760         struct ip_conntrack_tuple_hash *h;
761         struct ip_conntrack *ct;
762
763         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
764
765         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
766                                 &tuple,proto))
767                 return NULL;
768
769         /* look for tuple match */
770         h = ip_conntrack_find_get(&tuple, NULL);
771         if (!h) {
772                 h = init_conntrack(&tuple, proto, skb);
773                 if (!h)
774                         return NULL;
775                 if (IS_ERR(h))
776                         return (void *)h;
777         }
778         ct = tuplehash_to_ctrack(h);
779
780         /* It exists; we have (non-exclusive) reference. */
781         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
782                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
783                 /* Please set the reply bit if this packet is OK */
784                 *set_reply = 1;
785         } else {
786                 /* Once we've had two way comms, always ESTABLISHED. */
787                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
788                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
789                                ct);
790                         *ctinfo = IP_CT_ESTABLISHED;
791                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
792                         DEBUGP("ip_conntrack_in: related packet for %p\n",
793                                ct);
794                         *ctinfo = IP_CT_RELATED;
795                 } else {
796                         DEBUGP("ip_conntrack_in: new packet for %p\n",
797                                ct);
798                         *ctinfo = IP_CT_NEW;
799                 }
800                 *set_reply = 0;
801         }
802         skb->nfct = &ct->ct_general;
803         skb->nfctinfo = *ctinfo;
804         return ct;
805 }
806
807 /* Netfilter hook itself. */
808 unsigned int ip_conntrack_in(unsigned int hooknum,
809                              struct sk_buff **pskb,
810                              const struct net_device *in,
811                              const struct net_device *out,
812                              int (*okfn)(struct sk_buff *))
813 {
814         struct ip_conntrack *ct;
815         enum ip_conntrack_info ctinfo;
816         struct ip_conntrack_protocol *proto;
817         int set_reply = 0;
818         int ret;
819
820         /* Previously seen (loopback or untracked)?  Ignore. */
821         if ((*pskb)->nfct) {
822                 CONNTRACK_STAT_INC(ignore);
823                 return NF_ACCEPT;
824         }
825
826         /* Never happens */
827         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
828                 if (net_ratelimit()) {
829                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
830                                (*pskb)->nh.iph->protocol, hooknum);
831                 }
832                 return NF_DROP;
833         }
834
835 /* Doesn't cover locally-generated broadcast, so not worth it. */
836 #if 0
837         /* Ignore broadcast: no `connection'. */
838         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
839                 printk("Broadcast packet!\n");
840                 return NF_ACCEPT;
841         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
842                    == htonl(0x000000FF)) {
843                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
844                        NIPQUAD((*pskb)->nh.iph->saddr),
845                        NIPQUAD((*pskb)->nh.iph->daddr),
846                        (*pskb)->sk, (*pskb)->pkt_type);
847         }
848 #endif
849
850         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
851
852         /* It may be a special packet, error, unclean...
853          * The inverse of the return code tells the netfilter
854          * core what to do with the packet. */
855         if (proto->error != NULL 
856             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
857                 CONNTRACK_STAT_INC(error);
858                 CONNTRACK_STAT_INC(invalid);
859                 return -ret;
860         }
861
862         if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
863                 /* Not valid part of a connection */
864                 CONNTRACK_STAT_INC(invalid);
865                 return NF_ACCEPT;
866         }
867
868         if (IS_ERR(ct)) {
869                 /* Too stressed to deal. */
870                 CONNTRACK_STAT_INC(drop);
871                 return NF_DROP;
872         }
873
874         IP_NF_ASSERT((*pskb)->nfct);
875
876         ret = proto->packet(ct, *pskb, ctinfo);
877         if (ret < 0) {
878                 /* Invalid: the inverse of the return code tells
879                  * the netfilter core what to do */
880                 nf_conntrack_put((*pskb)->nfct);
881                 (*pskb)->nfct = NULL;
882                 CONNTRACK_STAT_INC(invalid);
883                 return -ret;
884         }
885
886         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
887                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
888
889         return ret;
890 }
891
892 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
893                    const struct ip_conntrack_tuple *orig)
894 {
895         return ip_ct_invert_tuple(inverse, orig, 
896                                   __ip_conntrack_proto_find(orig->dst.protonum));
897 }
898
899 /* Would two expected things clash? */
900 static inline int expect_clash(const struct ip_conntrack_expect *a,
901                                const struct ip_conntrack_expect *b)
902 {
903         /* Part covered by intersection of masks must be unequal,
904            otherwise they clash */
905         struct ip_conntrack_tuple intersect_mask
906                 = { { a->mask.src.ip & b->mask.src.ip,
907                       { a->mask.src.u.all & b->mask.src.u.all } },
908                     { a->mask.dst.ip & b->mask.dst.ip,
909                       { a->mask.dst.u.all & b->mask.dst.u.all },
910                       a->mask.dst.protonum & b->mask.dst.protonum } };
911
912         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
913 }
914
915 static inline int expect_matches(const struct ip_conntrack_expect *a,
916                                  const struct ip_conntrack_expect *b)
917 {
918         return a->master == b->master
919                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
920                 && ip_ct_tuple_equal(&a->mask, &b->mask);
921 }
922
923 /* Generally a bad idea to call this: could have matched already. */
924 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
925 {
926         struct ip_conntrack_expect *i;
927
928         write_lock_bh(&ip_conntrack_lock);
929         /* choose the oldest expectation to evict */
930         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
931                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
932                         unlink_expect(i);
933                         write_unlock_bh(&ip_conntrack_lock);
934                         ip_conntrack_expect_put(i);
935                         return;
936                 }
937         }
938         write_unlock_bh(&ip_conntrack_lock);
939 }
940
941 /* We don't increase the master conntrack refcount for non-fulfilled
942  * expectations. During conntrack destruction, the expectations are
943  * always killed before the conntrack itself. */
944 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
945 {
946         struct ip_conntrack_expect *new;
947
948         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
949         if (!new) {
950                 DEBUGP("expect_related: OOM allocating expect\n");
951                 return NULL;
952         }
953         new->master = me;
954         atomic_set(&new->use, 1);
955         return new;
956 }
957
958 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
959 {
960         if (atomic_dec_and_test(&exp->use))
961                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
962 }
963
964 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
965 {
966         atomic_inc(&exp->use);
967         exp->master->expecting++;
968         list_add(&exp->list, &ip_conntrack_expect_list);
969
970         init_timer(&exp->timeout);
971         exp->timeout.data = (unsigned long)exp;
972         exp->timeout.function = expectation_timed_out;
973         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
974         add_timer(&exp->timeout);
975
976         exp->id = ++ip_conntrack_expect_next_id;
977         atomic_inc(&exp->use);
978         CONNTRACK_STAT_INC(expect_create);
979 }
980
981 /* Race with expectations being used means we could have none to find; OK. */
982 static void evict_oldest_expect(struct ip_conntrack *master)
983 {
984         struct ip_conntrack_expect *i;
985
986         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
987                 if (i->master == master) {
988                         if (del_timer(&i->timeout)) {
989                                 unlink_expect(i);
990                                 ip_conntrack_expect_put(i);
991                         }
992                         break;
993                 }
994         }
995 }
996
997 static inline int refresh_timer(struct ip_conntrack_expect *i)
998 {
999         if (!del_timer(&i->timeout))
1000                 return 0;
1001
1002         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1003         add_timer(&i->timeout);
1004         return 1;
1005 }
1006
1007 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1008 {
1009         struct ip_conntrack_expect *i;
1010         int ret;
1011
1012         DEBUGP("ip_conntrack_expect_related %p\n", related_to);
1013         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1014         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1015
1016         write_lock_bh(&ip_conntrack_lock);
1017         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1018                 if (expect_matches(i, expect)) {
1019                         /* Refresh timer: if it's dying, ignore.. */
1020                         if (refresh_timer(i)) {
1021                                 ret = 0;
1022                                 goto out;
1023                         }
1024                 } else if (expect_clash(i, expect)) {
1025                         ret = -EBUSY;
1026                         goto out;
1027                 }
1028         }
1029
1030         /* Will be over limit? */
1031         if (expect->master->helper->max_expected && 
1032             expect->master->expecting >= expect->master->helper->max_expected)
1033                 evict_oldest_expect(expect->master);
1034
1035         ip_conntrack_expect_insert(expect);
1036         ip_conntrack_expect_event(IPEXP_NEW, expect);
1037         ret = 0;
1038 out:
1039         write_unlock_bh(&ip_conntrack_lock);
1040         return ret;
1041 }
1042
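/*
 * Usage sketch: roughly how a conntrack helper of this era (e.g. the FTP
 * helper) drives the expectation API above.  This is illustrative only and
 * not copied from any particular helper; data_port, ct and ret are assumed
 * to come from the calling helper's context.
 *
 *	struct ip_conntrack_expect *exp;
 *
 *	exp = ip_conntrack_expect_alloc(ct);
 *	if (exp == NULL)
 *		return NF_DROP;
 *
 *	exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
 *	exp->tuple.src.u.tcp.port = 0;
 *	exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
 *	exp->tuple.dst.u.tcp.port = htons(data_port);
 *	exp->tuple.dst.protonum = IPPROTO_TCP;
 *
 *	exp->mask.src.ip = 0xFFFFFFFF;
 *	exp->mask.src.u.tcp.port = 0;
 *	exp->mask.dst.ip = 0xFFFFFFFF;
 *	exp->mask.dst.u.tcp.port = 0xFFFF;
 *	exp->mask.dst.protonum = 0xFF;
 *
 *	exp->expectfn = NULL;
 *	exp->flags = 0;
 *
 *	if (ip_conntrack_expect_related(exp) != 0)
 *		ret = NF_DROP;
 *	ip_conntrack_expect_put(exp);
 */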
1043 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1044    implicitly racy: see __ip_conntrack_confirm */
1045 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1046                               const struct ip_conntrack_tuple *newreply)
1047 {
1048         write_lock_bh(&ip_conntrack_lock);
1049         /* Should be unconfirmed, so not in hash table yet */
1050         IP_NF_ASSERT(!is_confirmed(conntrack));
1051
1052         DEBUGP("Altering reply tuple of %p to ", conntrack);
1053         DUMP_TUPLE(newreply);
1054
1055         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1056         if (!conntrack->master && conntrack->expecting == 0)
1057                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1058         write_unlock_bh(&ip_conntrack_lock);
1059 }
1060
1061 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1062 {
1063         BUG_ON(me->timeout == 0);
1064         write_lock_bh(&ip_conntrack_lock);
1065         list_prepend(&helpers, me);
1066         write_unlock_bh(&ip_conntrack_lock);
1067
1068         return 0;
1069 }
1070
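/*
 * Usage sketch: how a helper module typically fills in and registers a
 * struct ip_conntrack_helper.  Modelled on helpers such as ip_conntrack_ftp;
 * the port, timeout and names below are illustrative only.
 *
 *	static struct ip_conntrack_helper example;
 *
 *	example.tuple.src.u.tcp.port = htons(21);
 *	example.tuple.dst.protonum = IPPROTO_TCP;
 *	example.mask.src.u.tcp.port = 0xFFFF;
 *	example.mask.dst.protonum = 0xFF;
 *	example.max_expected = 1;
 *	example.timeout = 5 * 60;		(seconds)
 *	example.me = THIS_MODULE;
 *	example.name = "example";
 *	example.help = example_help;		(the helper's packet handler)
 *
 *	ip_conntrack_helper_register(&example);		(from module_init)
 *	...
 *	ip_conntrack_helper_unregister(&example);	(from module_exit)
 */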
1071 struct ip_conntrack_helper *
1072 __ip_conntrack_helper_find_byname(const char *name)
1073 {
1074         struct ip_conntrack_helper *h;
1075
1076         list_for_each_entry(h, &helpers, list) {
1077                 if (!strcmp(h->name, name))
1078                         return h;
1079         }
1080
1081         return NULL;
1082 }
1083
1084 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1085                          const struct ip_conntrack_helper *me)
1086 {
1087         if (tuplehash_to_ctrack(i)->helper == me) {
1088                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1089                 tuplehash_to_ctrack(i)->helper = NULL;
1090         }
1091         return 0;
1092 }
1093
1094 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1095 {
1096         unsigned int i;
1097         struct ip_conntrack_expect *exp, *tmp;
1098
1099         /* Need write lock here, to delete helper. */
1100         write_lock_bh(&ip_conntrack_lock);
1101         LIST_DELETE(&helpers, me);
1102
1103         /* Get rid of expectations */
1104         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1105                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1106                         unlink_expect(exp);
1107                         ip_conntrack_expect_put(exp);
1108                 }
1109         }
1110         /* Set the helper to NULL on all remaining (unconfirmed and hashed) conntracks. */
1111         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1112         for (i = 0; i < ip_conntrack_htable_size; i++)
1113                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1114                             struct ip_conntrack_tuple_hash *, me);
1115         write_unlock_bh(&ip_conntrack_lock);
1116
1117         /* Someone could be still looking at the helper in a bh. */
1118         synchronize_net();
1119 }
1120
1121 static inline void ct_add_counters(struct ip_conntrack *ct,
1122                                    enum ip_conntrack_info ctinfo,
1123                                    const struct sk_buff *skb)
1124 {
1125 #ifdef CONFIG_IP_NF_CT_ACCT
1126         if (skb) {
1127                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1128                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1129                                         ntohs(skb->nh.iph->tot_len);
1130         }
1131 #endif
1132 }
1133
1134 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1135 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1136                         enum ip_conntrack_info ctinfo,
1137                         const struct sk_buff *skb,
1138                         unsigned long extra_jiffies)
1139 {
1140         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1141
1142         /* If not in hash table, timer will not be active yet */
1143         if (!is_confirmed(ct)) {
1144                 ct->timeout.expires = extra_jiffies;
1145                 ct_add_counters(ct, ctinfo, skb);
1146         } else {
1147                 write_lock_bh(&ip_conntrack_lock);
1148                 /* Need del_timer for race avoidance (may already be dying). */
1149                 if (del_timer(&ct->timeout)) {
1150                         ct->timeout.expires = jiffies + extra_jiffies;
1151                         add_timer(&ct->timeout);
1152                         ip_conntrack_event_cache(IPCT_REFRESH, skb);
1153                 }
1154                 ct_add_counters(ct, ctinfo, skb);
1155                 write_unlock_bh(&ip_conntrack_lock);
1156         }
1157 }
1158
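/*
 * Usage sketch: protocol trackers call this from their ->packet() handler to
 * extend the timeout and account the skb.  Illustrative example modelled on
 * the UDP tracker; the timeout symbols live in ip_conntrack_proto_udp.c.
 *
 *	if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))
 *		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout_stream);
 *	else
 *		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
 */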
1159 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1160     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1161 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1162  * in ip_conntrack_core, since we don't want the protocols to autoload
1163  * or depend on ctnetlink */
1164 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1165                                const struct ip_conntrack_tuple *tuple)
1166 {
1167         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1168                 &tuple->src.u.tcp.port);
1169         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1170                 &tuple->dst.u.tcp.port);
1171         return 0;
1172
1173 nfattr_failure:
1174         return -1;
1175 }
1176
1177 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1178                                struct ip_conntrack_tuple *t)
1179 {
1180         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1181                 return -EINVAL;
1182
1183         t->src.u.tcp.port =
1184                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1185         t->dst.u.tcp.port =
1186                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1187
1188         return 0;
1189 }
1190 #endif
1191
1192 /* Returns new sk_buff, or NULL */
1193 struct sk_buff *
1194 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1195 {
1196         skb_orphan(skb);
1197
1198         local_bh_disable(); 
1199         skb = ip_defrag(skb, user);
1200         local_bh_enable();
1201
1202         if (skb)
1203                 ip_send_check(skb->nh.iph);
1204         return skb;
1205 }
1206
1207 /* Used by ipt_REJECT. */
1208 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1209 {
1210         struct ip_conntrack *ct;
1211         enum ip_conntrack_info ctinfo;
1212
1213         /* This ICMP is in reverse direction to the packet which caused it */
1214         ct = ip_conntrack_get(skb, &ctinfo);
1215         
1216         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1217                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1218         else
1219                 ctinfo = IP_CT_RELATED;
1220
1221         /* Attach to new skbuff, and increment count */
1222         nskb->nfct = &ct->ct_general;
1223         nskb->nfctinfo = ctinfo;
1224         nf_conntrack_get(nskb->nfct);
1225 }
1226
1227 static inline int
1228 do_iter(const struct ip_conntrack_tuple_hash *i,
1229         int (*iter)(struct ip_conntrack *i, void *data),
1230         void *data)
1231 {
1232         return iter(tuplehash_to_ctrack(i), data);
1233 }
1234
1235 /* Bring out ya dead! */
1236 static struct ip_conntrack_tuple_hash *
1237 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1238                 void *data, unsigned int *bucket)
1239 {
1240         struct ip_conntrack_tuple_hash *h = NULL;
1241
1242         write_lock_bh(&ip_conntrack_lock);
1243         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1244                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1245                                 struct ip_conntrack_tuple_hash *, iter, data);
1246                 if (h)
1247                         break;
1248         }
1249         if (!h)
1250                 h = LIST_FIND_W(&unconfirmed, do_iter,
1251                                 struct ip_conntrack_tuple_hash *, iter, data);
1252         if (h)
1253                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1254         write_unlock_bh(&ip_conntrack_lock);
1255
1256         return h;
1257 }
1258
1259 void
1260 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1261 {
1262         struct ip_conntrack_tuple_hash *h;
1263         unsigned int bucket = 0;
1264
1265         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1266                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1267                 /* Time to push up daisies... */
1268                 if (del_timer(&ct->timeout))
1269                         death_by_timeout((unsigned long)ct);
1270                 /* ... else the timer will get him soon. */
1271
1272                 ip_conntrack_put(ct);
1273         }
1274 }
1275
1276 /* Fast function for those who don't want to parse /proc (and I don't
1277    blame them). */
1278 /* Reversing the socket's dst/src point of view gives us the reply
1279    mapping. */
1280 static int
1281 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1282 {
1283         struct inet_sock *inet = inet_sk(sk);
1284         struct ip_conntrack_tuple_hash *h;
1285         struct ip_conntrack_tuple tuple;
1286         
1287         IP_CT_TUPLE_U_BLANK(&tuple);
1288         tuple.src.ip = inet->rcv_saddr;
1289         tuple.src.u.tcp.port = inet->sport;
1290         tuple.dst.ip = inet->daddr;
1291         tuple.dst.u.tcp.port = inet->dport;
1292         tuple.dst.protonum = IPPROTO_TCP;
1293
1294         /* We only do TCP at the moment: is there a better way? */
1295         if (strcmp(sk->sk_prot->name, "TCP")) {
1296                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1297                 return -ENOPROTOOPT;
1298         }
1299
1300         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1301                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1302                        *len, sizeof(struct sockaddr_in));
1303                 return -EINVAL;
1304         }
1305
1306         h = ip_conntrack_find_get(&tuple, NULL);
1307         if (h) {
1308                 struct sockaddr_in sin;
1309                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1310
1311                 sin.sin_family = AF_INET;
1312                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1313                         .tuple.dst.u.tcp.port;
1314                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1315                         .tuple.dst.ip;
1316
1317                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1318                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1319                 ip_conntrack_put(ct);
1320                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1321                         return -EFAULT;
1322                 else
1323                         return 0;
1324         }
1325         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1326                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1327                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1328         return -ENOENT;
1329 }
1330
1331 static struct nf_sockopt_ops so_getorigdst = {
1332         .pf             = PF_INET,
1333         .get_optmin     = SO_ORIGINAL_DST,
1334         .get_optmax     = SO_ORIGINAL_DST+1,
1335         .get            = &getorigdst,
1336 };
1337
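/*
 * Userspace side of SO_ORIGINAL_DST, as used by transparent proxies.
 * Illustrative sketch only (error handling omitted); SO_ORIGINAL_DST is
 * defined in <linux/netfilter_ipv4.h>, and client_fd is an accepted,
 * REDIRECTed TCP connection.
 *
 *	struct sockaddr_in dst;
 *	socklen_t dstlen = sizeof(dst);
 *
 *	if (getsockopt(client_fd, SOL_IP, SO_ORIGINAL_DST, &dst, &dstlen) == 0)
 *		printf("original destination %s:%u\n",
 *		       inet_ntoa(dst.sin_addr), ntohs(dst.sin_port));
 */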
1338 static int kill_all(struct ip_conntrack *i, void *data)
1339 {
1340         return 1;
1341 }
1342
1343 static void free_conntrack_hash(void)
1344 {
1345         if (ip_conntrack_vmalloc)
1346                 vfree(ip_conntrack_hash);
1347         else
1348                 free_pages((unsigned long)ip_conntrack_hash, 
1349                            get_order(sizeof(struct list_head)
1350                                      * ip_conntrack_htable_size));
1351 }
1352
1353 void ip_conntrack_flush(void)
1354 {
1355         /* This makes sure all current packets have passed through
1356            netfilter framework.  Roll on, two-stage module
1357            delete... */
1358         synchronize_net();
1359
1360         ip_ct_event_cache_flush();
1361  i_see_dead_people:
1362         ip_ct_iterate_cleanup(kill_all, NULL);
1363         if (atomic_read(&ip_conntrack_count) != 0) {
1364                 schedule();
1365                 goto i_see_dead_people;
1366         }
1367         /* wait until all references to ip_conntrack_untracked are dropped */
1368         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1369                 schedule();
1370 }
1371
1372 /* Mishearing the voices in his head, our hero wonders how he's
1373    supposed to kill the mall. */
1374 void ip_conntrack_cleanup(void)
1375 {
1376         ip_ct_attach = NULL;
1377         ip_conntrack_flush();
1378         kmem_cache_destroy(ip_conntrack_cachep);
1379         kmem_cache_destroy(ip_conntrack_expect_cachep);
1380         free_conntrack_hash();
1381         nf_unregister_sockopt(&so_getorigdst);
1382 }
1383
1384 static int hashsize;
1385 module_param(hashsize, int, 0400);
1386
1387 int __init ip_conntrack_init(void)
1388 {
1389         unsigned int i;
1390         int ret;
1391
1392         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1393          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1394         if (hashsize) {
1395                 ip_conntrack_htable_size = hashsize;
1396         } else {
1397                 ip_conntrack_htable_size
1398                         = (((num_physpages << PAGE_SHIFT) / 16384)
1399                            / sizeof(struct list_head));
1400                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1401                         ip_conntrack_htable_size = 8192;
1402                 if (ip_conntrack_htable_size < 16)
1403                         ip_conntrack_htable_size = 16;
1404         }
1405         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1406
1407         printk("ip_conntrack version %s (%u buckets, %d max)"
1408                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1409                ip_conntrack_htable_size, ip_conntrack_max,
1410                sizeof(struct ip_conntrack));
1411
1412         ret = nf_register_sockopt(&so_getorigdst);
1413         if (ret != 0) {
1414                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1415                 return ret;
1416         }
1417
1418         /* AK: the hash table is twice as big as needed because it
1419            uses list_head.  It would be much friendlier to the caches
1420            to use a single-pointer list head here. */
1421         ip_conntrack_vmalloc = 0; 
1422         ip_conntrack_hash
1423                 = (void *)__get_free_pages(GFP_KERNEL,
1424                                          get_order(sizeof(struct list_head)
1425                                                    *ip_conntrack_htable_size));
1426         if (!ip_conntrack_hash) { 
1427                 ip_conntrack_vmalloc = 1;
1428                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1429                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1430                                             * ip_conntrack_htable_size);
1431         }
1432         if (!ip_conntrack_hash) {
1433                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1434                 goto err_unreg_sockopt;
1435         }
1436
1437         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1438                                                 sizeof(struct ip_conntrack), 0,
1439                                                 0, NULL, NULL);
1440         if (!ip_conntrack_cachep) {
1441                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1442                 goto err_free_hash;
1443         }
1444
1445         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1446                                         sizeof(struct ip_conntrack_expect),
1447                                         0, 0, NULL, NULL);
1448         if (!ip_conntrack_expect_cachep) {
1449                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1450                 goto err_free_conntrack_slab;
1451         }
1452
1453         /* Don't NEED lock here, but good form anyway. */
1454         write_lock_bh(&ip_conntrack_lock);
1455         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1456                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1457         /* Sew in builtin protocols. */
1458         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1459         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1460         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1461         write_unlock_bh(&ip_conntrack_lock);
1462
1463         for (i = 0; i < ip_conntrack_htable_size; i++)
1464                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1465
1466         /* For use by ipt_REJECT */
1467         ip_ct_attach = ip_conntrack_attach;
1468
1469         /* Set up fake conntrack:
1470             - to never be deleted, not in any hashes */
1471         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1472         /*  - and make it look like a confirmed connection */
1473         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1474
1475         return ret;
1476
1477 err_free_conntrack_slab:
1478         kmem_cache_destroy(ip_conntrack_cachep);
1479 err_free_hash:
1480         free_conntrack_hash();
1481 err_unreg_sockopt:
1482         nf_unregister_sockopt(&so_getorigdst);
1483
1484         return -ENOMEM;
1485 }