1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/
/* Lock-assertion hooks: expand to nothing in this build; kept purely as
 * in-code annotations of which lock a function expects to be held. */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
52 #define IP_CONNTRACK_VERSION "2.1"
/* Debug printout macro: compiled out (empty) in this configuration. */
57 #define DEBUGP(format, args...)
60 DEFINE_RWLOCK(ip_conntrack_lock);
62 /* ip_conntrack_standalone needs this */
63 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
/* Optional hook invoked from destroy_conntrack(); set by the NAT layer. */
65 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
66 LIST_HEAD(ip_conntrack_expect_list);
67 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
68 static LIST_HEAD(helpers);
/* Bucket count; 0 until ip_conntrack_init() computes it (or hashsize param). */
69 unsigned int ip_conntrack_htable_size = 0;
71 struct list_head *ip_conntrack_hash;
72 static kmem_cache_t *ip_conntrack_cachep;
73 static kmem_cache_t *ip_conntrack_expect_cachep;
/* Dummy conntrack attached to untracked packets; never hashed, never freed. */
74 struct ip_conntrack ip_conntrack_untracked;
75 unsigned int ip_ct_log_invalid;
/* Conntracks allocated but not yet confirmed into the hash table; linked
 * through the ORIGINAL-direction tuplehash (see __ip_conntrack_confirm). */
76 static LIST_HEAD(unconfirmed);
/* Nonzero if ip_conntrack_hash came from vmalloc rather than free pages. */
77 static int ip_conntrack_vmalloc;
79 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
/* Drop one reference to @ct; the entry is destroyed when the count
 * reaches zero (via the generic nf_conntrack refcounting). */
82 ip_conntrack_put(struct ip_conntrack *ct)
85 nf_conntrack_put(&ct->ct_general);
/* Random hash seed, initialised lazily on first conntrack allocation
 * (see init_conntrack) to resist hash-collision attacks. */
88 static int ip_conntrack_hash_rnd_initted;
89 static unsigned int ip_conntrack_hash_rnd;
/* Map a tuple to its hash bucket: jhash over src IP, dst IP xor protocol,
 * and both l4 ports packed into one word, keyed by the random seed. */
92 hash_conntrack(const struct ip_conntrack_tuple *tuple)
97 return (jhash_3words(tuple->src.ip,
98 (tuple->dst.ip ^ tuple->dst.protonum),
99 (tuple->src.u.all | (tuple->dst.u.all << 16)),
100 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
/* Fill @tuple (ORIGINAL direction) from the IP header; the l4 part is
 * filled in by @protocol->pkt_to_tuple() starting at @dataoff.
 * Non-head fragments are rejected — they carry no l4 header. */
104 ip_ct_get_tuple(const struct iphdr *iph,
105 const struct sk_buff *skb,
106 unsigned int dataoff,
107 struct ip_conntrack_tuple *tuple,
108 const struct ip_conntrack_protocol *protocol)
/* Never happen: callers must defragment first. */
111 if (iph->frag_off & htons(IP_OFFSET)) {
112 printk("ip_conntrack_core: Frag of proto %u.\n",
117 tuple->src.ip = iph->saddr;
118 tuple->dst.ip = iph->daddr;
119 tuple->dst.protonum = iph->protocol;
120 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
122 return protocol->pkt_to_tuple(skb, dataoff, tuple);
/* Build the reply-direction tuple for @orig: swap the IP endpoints, flip
 * the direction flag, then let the protocol invert the l4 part. */
126 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
127 const struct ip_conntrack_tuple *orig,
128 const struct ip_conntrack_protocol *protocol)
130 inverse->src.ip = orig->dst.ip;
131 inverse->dst.ip = orig->src.ip;
132 inverse->dst.protonum = orig->dst.protonum;
133 inverse->dst.dir = !orig->dst.dir;
135 return protocol->invert_tuple(inverse, orig);
139 /* ip_conntrack_expect helper functions */
/* Remove @exp from the global expectation list and account for it.
 * Caller holds the write lock and has already stopped the timer. */
140 static void unlink_expect(struct ip_conntrack_expect *exp)
142 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
144 list_del(&exp->list);
145 CONNTRACK_STAT_INC(expect_delete);
146 exp->master->expecting--;
/* Timer callback: the expectation expired without being matched —
 * unlink it under the lock and drop the timer's reference. */
149 static void expectation_timed_out(unsigned long ul_expect)
151 struct ip_conntrack_expect *exp = (void *)ul_expect;
153 write_lock_bh(&ip_conntrack_lock);
155 write_unlock_bh(&ip_conntrack_lock);
156 ip_conntrack_expect_put(exp);
159 /* If an expectation for this connection is found, it gets deleted from
160 * global list then returned. */
161 static struct ip_conntrack_expect *
162 find_expectation(const struct ip_conntrack_tuple *tuple)
164 struct ip_conntrack_expect *i;
166 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
167 /* If master is not in hash table yet (ie. packet hasn't left
168 this machine yet), how can other end know about expected?
169 Hence these are not the droids you are looking for (if
170 master ct never got confirmed, we'd hold a reference to it
171 and weird things would happen to future packets). */
/* del_timer() success means we beat the expiry race and own it. */
172 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
173 && is_confirmed(i->master)
174 && del_timer(&i->timeout)) {
182 /* delete all expectations for this conntrack */
183 static void remove_expectations(struct ip_conntrack *ct)
185 struct ip_conntrack_expect *i, *tmp;
187 /* Optimization: most connections never expect any others. */
188 if (ct->expecting == 0)
/* Only drop expectations whose timer we actually managed to stop. */
191 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
192 if (i->master == ct && del_timer(&i->timeout)) {
194 ip_conntrack_expect_put(i);
/* Unhash @ct from both direction buckets and kill its pending
 * expectations.  Caller holds the conntrack write lock. */
200 clean_from_lists(struct ip_conntrack *ct)
204 DEBUGP("clean_from_lists(%p)\n", ct);
205 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
207 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
208 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
209 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
210 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
212 /* Destroy all pending expectations */
213 remove_expectations(ct);
/* nf_conntrack destructor, called when the refcount hits zero: run the
 * protocol/NAT teardown hooks, clean up list state, free the slab object. */
217 destroy_conntrack(struct nf_conntrack *nfct)
219 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
220 struct ip_conntrack_protocol *proto;
222 DEBUGP("destroy_conntrack(%p)\n", ct);
223 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
224 IP_NF_ASSERT(!timer_pending(&ct->timeout));
226 /* To make sure we don't get any weird locking issues here:
227 * destroy_conntrack() MUST NOT be called with a write lock
228 * to ip_conntrack_lock!!! -HW */
229 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
230 if (proto && proto->destroy)
233 if (ip_conntrack_destroyed)
234 ip_conntrack_destroyed(ct);
236 write_lock_bh(&ip_conntrack_lock);
237 /* Expectations will have been removed in clean_from_lists,
238 * except TFTP can create an expectation on the first packet,
239 * before connection is in the list, so we need to clean here,
241 remove_expectations(ct);
243 /* We overload first tuple to link into unconfirmed list. */
244 if (!is_confirmed(ct)) {
245 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
246 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
249 CONNTRACK_STAT_INC(delete);
250 write_unlock_bh(&ip_conntrack_lock);
/* Release the reference held on our master (expected connections only). */
253 ip_conntrack_put(ct->master);
255 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
256 kmem_cache_free(ip_conntrack_cachep, ct);
257 atomic_dec(&ip_conntrack_count);
/* Timer callback: connection timed out — unhash it under the write lock
 * and drop the hash table's reference. */
260 static void death_by_timeout(unsigned long ul_conntrack)
262 struct ip_conntrack *ct = (void *)ul_conntrack;
264 write_lock_bh(&ip_conntrack_lock);
265 /* Inside lock so preempt is disabled on module removal path.
266 * Otherwise we can get spurious warnings. */
267 CONNTRACK_STAT_INC(delete_list);
268 clean_from_lists(ct);
269 write_unlock_bh(&ip_conntrack_lock);
270 ip_conntrack_put(ct);
/* Match predicate for hash lookups: entry @i equals @tuple and does not
 * belong to @ignored_conntrack (used to exclude self during NAT checks). */
274 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
275 const struct ip_conntrack_tuple *tuple,
276 const struct ip_conntrack *ignored_conntrack)
278 ASSERT_READ_LOCK(&ip_conntrack_lock);
279 return tuplehash_to_ctrack(i) != ignored_conntrack
280 && ip_ct_tuple_equal(tuple, &i->tuple);
/* Walk the bucket for @tuple and return the matching tuplehash, or fall
 * through when nothing matches.  Caller holds at least the read lock;
 * no reference is taken here. */
283 static struct ip_conntrack_tuple_hash *
284 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
285 const struct ip_conntrack *ignored_conntrack)
287 struct ip_conntrack_tuple_hash *h;
288 unsigned int hash = hash_conntrack(tuple);
290 ASSERT_READ_LOCK(&ip_conntrack_lock);
291 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
292 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
293 CONNTRACK_STAT_INC(found);
296 CONNTRACK_STAT_INC(searched);
302 /* Find a connection corresponding to a tuple. */
/* Like __ip_conntrack_find(), but takes the lock itself and, on a hit,
 * grabs a reference the caller must later drop with ip_conntrack_put(). */
303 struct ip_conntrack_tuple_hash *
304 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
305 const struct ip_conntrack *ignored_conntrack)
307 struct ip_conntrack_tuple_hash *h;
309 read_lock_bh(&ip_conntrack_lock);
310 h = __ip_conntrack_find(tuple, ignored_conntrack);
312 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
313 read_unlock_bh(&ip_conntrack_lock);
318 /* Confirm a connection given skb; places it in hash table */
/* Moves the skb's conntrack off the unconfirmed list into both hash
 * buckets, starts its timeout timer, and marks it IPS_CONFIRMED —
 * unless a racing entry (e.g. inserted by NAT) already claimed a slot. */
320 __ip_conntrack_confirm(struct sk_buff **pskb)
322 unsigned int hash, repl_hash;
323 struct ip_conntrack *ct;
324 enum ip_conntrack_info ctinfo;
326 ct = ip_conntrack_get(*pskb, &ctinfo);
328 /* ipt_REJECT uses ip_conntrack_attach to attach related
329 ICMP/TCP RST packets in other direction. Actual packet
330 which created connection will be IP_CT_NEW or for an
331 expected connection, IP_CT_RELATED. */
332 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
335 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
336 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
338 /* We're not in hash table, and we refuse to set up related
339 connections for unconfirmed conns. But packet copies and
340 REJECT will give spurious warnings here. */
341 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
343 /* No external references means noone else could have
345 IP_NF_ASSERT(!is_confirmed(ct));
346 DEBUGP("Confirming conntrack %p\n", ct);
348 write_lock_bh(&ip_conntrack_lock);
350 /* See if there's one in the list already, including reverse:
351 NAT could have grabbed it without realizing, since we're
352 not in the hash. If there is, we lost race. */
353 if (!LIST_FIND(&ip_conntrack_hash[hash],
355 struct ip_conntrack_tuple_hash *,
356 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
357 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
359 struct ip_conntrack_tuple_hash *,
360 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
361 /* Remove from unconfirmed list */
362 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
364 list_prepend(&ip_conntrack_hash[hash],
365 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
366 list_prepend(&ip_conntrack_hash[repl_hash],
367 &ct->tuplehash[IP_CT_DIR_REPLY]);
368 /* Timer relative to confirmation time, not original
369 setting time, otherwise we'd get timer wrap in
370 weird delay cases. */
371 ct->timeout.expires += jiffies;
372 add_timer(&ct->timeout);
/* Extra reference held by the hash table itself. */
373 atomic_inc(&ct->ct_general.use);
374 set_bit(IPS_CONFIRMED_BIT, &ct->status);
375 CONNTRACK_STAT_INC(insert);
376 write_unlock_bh(&ip_conntrack_lock);
/* Lost the race: someone else inserted a matching entry first. */
380 CONNTRACK_STAT_INC(insert_failed);
381 write_unlock_bh(&ip_conntrack_lock);
386 /* Returns true if a connection corresponds to the tuple (required
/* Lookup-only probe used by NAT to test for tuple collisions; takes no
 * reference on the entry it finds. */
389 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
390 const struct ip_conntrack *ignored_conntrack)
392 struct ip_conntrack_tuple_hash *h;
394 read_lock_bh(&ip_conntrack_lock);
395 h = __ip_conntrack_find(tuple, ignored_conntrack);
396 read_unlock_bh(&ip_conntrack_lock);
401 /* There's a small race here where we may free a just-assured
402 connection. Too bad: we're in trouble anyway. */
/* Eviction predicate: a connection that never achieved ASSURED status
 * is fair game for early_drop(). */
403 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
405 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
/* Table is full: evict the oldest non-assured conntrack on @chain by
 * forcing its timeout, so a new connection can be tracked. */
408 static int early_drop(struct list_head *chain)
410 /* Traverse backwards: gives us oldest, which is roughly LRU */
411 struct ip_conntrack_tuple_hash *h;
412 struct ip_conntrack *ct = NULL;
415 read_lock_bh(&ip_conntrack_lock);
416 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
418 ct = tuplehash_to_ctrack(h);
419 atomic_inc(&ct->ct_general.use);
421 read_unlock_bh(&ip_conntrack_lock);
/* Only the thread that stops the timer gets to kill the entry. */
426 if (del_timer(&ct->timeout)) {
427 death_by_timeout((unsigned long)ct);
429 CONNTRACK_STAT_INC(early_drop);
431 ip_conntrack_put(ct);
/* True if helper @i's tuple/mask pattern matches the reply tuple. */
435 static inline int helper_cmp(const struct ip_conntrack_helper *i,
436 const struct ip_conntrack_tuple *rtuple)
438 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
/* Find the first registered helper whose pattern matches @tuple. */
441 static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
443 return LIST_FIND(&helpers, helper_cmp,
444 struct ip_conntrack_helper *,
448 /* Allocate a new conntrack: we return -ENOMEM if classification
449 failed due to stress. Otherwise it really is unclassifiable. */
450 static struct ip_conntrack_tuple_hash *
451 init_conntrack(const struct ip_conntrack_tuple *tuple,
452 struct ip_conntrack_protocol *protocol,
455 struct ip_conntrack *conntrack;
456 struct ip_conntrack_tuple repl_tuple;
458 struct ip_conntrack_expect *exp;
/* Seed the hash lazily, once, before first use. */
460 if (!ip_conntrack_hash_rnd_initted) {
461 get_random_bytes(&ip_conntrack_hash_rnd, 4);
462 ip_conntrack_hash_rnd_initted = 1;
465 hash = hash_conntrack(tuple);
/* Over the limit: try to make room by early-dropping from this bucket. */
468 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
469 /* Try dropping from this hash chain. */
470 if (!early_drop(&ip_conntrack_hash[hash])) {
473 "ip_conntrack: table full, dropping"
475 return ERR_PTR(-ENOMEM);
479 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
480 DEBUGP("Can't invert tuple.\n");
484 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
486 DEBUGP("Can't allocate conntrack.\n");
487 return ERR_PTR(-ENOMEM);
490 memset(conntrack, 0, sizeof(*conntrack));
491 atomic_set(&conntrack->ct_general.use, 1);
492 conntrack->ct_general.destroy = destroy_conntrack;
493 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
494 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
/* Let the l4 protocol vet the first packet; reject on failure. */
495 if (!protocol->new(conntrack, skb)) {
496 kmem_cache_free(ip_conntrack_cachep, conntrack);
499 /* Don't set timer yet: wait for confirmation */
500 init_timer(&conntrack->timeout);
501 conntrack->timeout.data = (unsigned long)conntrack;
502 conntrack->timeout.function = death_by_timeout;
504 write_lock_bh(&ip_conntrack_lock);
505 exp = find_expectation(tuple);
508 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
510 /* Welcome, Mr. Bond. We've been expecting you... */
511 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
512 conntrack->master = exp->master;
513 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
/* Expected connections inherit the master's mark. */
514 conntrack->mark = exp->master->mark;
516 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
517 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
518 /* this is ugly, but there is no other place where to put it */
519 conntrack->nat.masq_index = exp->master->nat.masq_index;
/* Hold a reference to the master for the lifetime of this conntrack. */
521 nf_conntrack_get(&conntrack->master->ct_general);
522 CONNTRACK_STAT_INC(expect_new);
524 conntrack->helper = ip_ct_find_helper(&repl_tuple);
526 CONNTRACK_STAT_INC(new);
529 /* Overload tuple linked list to put us in unconfirmed list. */
530 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
532 atomic_inc(&ip_conntrack_count);
533 write_unlock_bh(&ip_conntrack_lock);
/* Run the expectation's callback outside the lock, then drop its ref. */
537 exp->expectfn(conntrack, exp);
538 ip_conntrack_expect_put(exp);
541 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
544 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
/* Extract the packet's tuple, look it up (creating a new conntrack via
 * init_conntrack if needed), classify it into a ctinfo state, and attach
 * the conntrack reference to the skb. */
545 static inline struct ip_conntrack *
546 resolve_normal_ct(struct sk_buff *skb,
547 struct ip_conntrack_protocol *proto,
549 unsigned int hooknum,
550 enum ip_conntrack_info *ctinfo)
552 struct ip_conntrack_tuple tuple;
553 struct ip_conntrack_tuple_hash *h;
554 struct ip_conntrack *ct;
556 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
558 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
562 /* look for tuple match */
563 h = ip_conntrack_find_get(&tuple, NULL);
565 h = init_conntrack(&tuple, proto, skb);
571 ct = tuplehash_to_ctrack(h);
573 /* It exists; we have (non-exclusive) reference. */
574 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
575 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
576 /* Please set reply bit if this packet OK */
579 /* Once we've had two way comms, always ESTABLISHED. */
580 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
581 DEBUGP("ip_conntrack_in: normal packet for %p\n",
583 *ctinfo = IP_CT_ESTABLISHED;
584 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
585 DEBUGP("ip_conntrack_in: related packet for %p\n",
587 *ctinfo = IP_CT_RELATED;
589 DEBUGP("ip_conntrack_in: new packet for %p\n",
/* Hand the reference we took over to the skb. */
595 skb->nfct = &ct->ct_general;
596 skb->nfctinfo = *ctinfo;
600 /* Netfilter hook itself. */
/* Main conntrack entry point: classify the packet, resolve or create its
 * conntrack, run the protocol state machine, and record reply sightings. */
601 unsigned int ip_conntrack_in(unsigned int hooknum,
602 struct sk_buff **pskb,
603 const struct net_device *in,
604 const struct net_device *out,
605 int (*okfn)(struct sk_buff *))
607 struct ip_conntrack *ct;
608 enum ip_conntrack_info ctinfo;
609 struct ip_conntrack_protocol *proto;
613 /* Previously seen (loopback or untracked)? Ignore. */
615 CONNTRACK_STAT_INC(ignore);
/* Fragments must have been reassembled before this hook runs. */
620 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
621 if (net_ratelimit()) {
622 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
623 (*pskb)->nh.iph->protocol, hooknum);
628 /* FIXME: Do this right please. --RR */
629 (*pskb)->nfcache |= NFC_UNKNOWN;
631 /* Doesn't cover locally-generated broadcast, so not worth it. */
633 /* Ignore broadcast: no `connection'. */
634 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
635 printk("Broadcast packet!\n");
637 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
638 == htonl(0x000000FF)) {
639 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
640 NIPQUAD((*pskb)->nh.iph->saddr),
641 NIPQUAD((*pskb)->nh.iph->daddr),
642 (*pskb)->sk, (*pskb)->pkt_type);
646 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
648 /* It may be an special packet, error, unclean...
649 * inverse of the return code tells to the netfilter
650 * core what to do with the packet. */
651 if (proto->error != NULL
652 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
653 CONNTRACK_STAT_INC(error);
654 CONNTRACK_STAT_INC(invalid);
658 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
659 /* Not valid part of a connection */
660 CONNTRACK_STAT_INC(invalid);
665 /* Too stressed to deal. */
666 CONNTRACK_STAT_INC(drop);
670 IP_NF_ASSERT((*pskb)->nfct);
/* Run the l4 state machine; a negative verdict invalidates the packet. */
672 ret = proto->packet(ct, *pskb, ctinfo);
674 /* Invalid: inverse of the return code tells
675 * the netfilter core what to do*/
676 nf_conntrack_put((*pskb)->nfct);
677 (*pskb)->nfct = NULL;
678 CONNTRACK_STAT_INC(invalid);
/* First packet seen in the reply direction: remember it. */
683 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
/* Convenience wrapper: invert @orig using its own protocol handler. */
688 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
689 const struct ip_conntrack_tuple *orig)
691 return ip_ct_invert_tuple(inverse, orig,
692 ip_ct_find_proto(orig->dst.protonum));
695 /* Would two expected things clash? */
696 static inline int expect_clash(const struct ip_conntrack_expect *a,
697 const struct ip_conntrack_expect *b)
699 /* Part covered by intersection of masks must be unequal,
700 otherwise they clash */
701 struct ip_conntrack_tuple intersect_mask
702 = { { a->mask.src.ip & b->mask.src.ip,
703 { a->mask.src.u.all & b->mask.src.u.all } },
704 { a->mask.dst.ip & b->mask.dst.ip,
705 { a->mask.dst.u.all & b->mask.dst.u.all },
706 a->mask.dst.protonum & b->mask.dst.protonum } };
708 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
/* Two expectations are duplicates: same master, tuple and mask. */
711 static inline int expect_matches(const struct ip_conntrack_expect *a,
712 const struct ip_conntrack_expect *b)
714 return a->master == b->master
715 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
716 && ip_ct_tuple_equal(&a->mask, &b->mask);
719 /* Generally a bad idea to call this: could have matched already. */
720 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
722 struct ip_conntrack_expect *i;
724 write_lock_bh(&ip_conntrack_lock);
725 /* choose the oldest expectation to evict */
726 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
/* Stopping the timer means we own the removal. */
727 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
729 write_unlock_bh(&ip_conntrack_lock);
730 ip_conntrack_expect_put(i);
734 write_unlock_bh(&ip_conntrack_lock);
/* Allocate an expectation owned by master conntrack @me; pins the master
 * with an extra reference and starts the expectation's own refcount at 1. */
737 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
739 struct ip_conntrack_expect *new;
741 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
743 DEBUGP("expect_related: OOM allocating expect\n");
747 atomic_inc(&new->master->ct_general.use);
748 atomic_set(&new->use, 1);
/* Drop a reference to @exp; on the last one, release the master and free. */
752 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
754 if (atomic_dec_and_test(&exp->use)) {
755 ip_conntrack_put(exp->master);
756 kmem_cache_free(ip_conntrack_expect_cachep, exp);
/* Add @exp to the global list and arm its timeout using the master
 * helper's timeout value.  Caller holds the write lock. */
760 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
/* The timer holds its own reference, dropped in expectation_timed_out(). */
762 atomic_inc(&exp->use);
763 exp->master->expecting++;
764 list_add(&exp->list, &ip_conntrack_expect_list);
766 init_timer(&exp->timeout);
767 exp->timeout.data = (unsigned long)exp;
768 exp->timeout.function = expectation_timed_out;
769 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
770 add_timer(&exp->timeout);
772 CONNTRACK_STAT_INC(expect_create);
775 /* Race with expectations being used means we could have none to find; OK. */
/* Walk newest-to-oldest so the last match is @master's oldest expectation,
 * and evict it to stay under the helper's max_expected limit. */
776 static void evict_oldest_expect(struct ip_conntrack *master)
778 struct ip_conntrack_expect *i;
780 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
781 if (i->master == master) {
782 if (del_timer(&i->timeout)) {
784 ip_conntrack_expect_put(i);
/* Restart @i's timeout from now; fails (and does nothing) if the timer
 * already fired, i.e. the expectation is dying. */
791 static inline int refresh_timer(struct ip_conntrack_expect *i)
793 if (!del_timer(&i->timeout))
796 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
797 add_timer(&i->timeout);
/* Register @expect: refresh an identical existing one, reject on clash,
 * otherwise insert (evicting the master's oldest if over its limit). */
801 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
803 struct ip_conntrack_expect *i;
806 DEBUGP("ip_conntrack_expect_related %p\n", related_to);
807 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
808 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
810 write_lock_bh(&ip_conntrack_lock);
811 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
812 if (expect_matches(i, expect)) {
813 /* Refresh timer: if it's dying, ignore.. */
814 if (refresh_timer(i)) {
818 } else if (expect_clash(i, expect)) {
824 /* Will be over limit? */
825 if (expect->master->helper->max_expected &&
826 expect->master->expecting >= expect->master->helper->max_expected)
827 evict_oldest_expect(expect->master);
829 ip_conntrack_expect_insert(expect);
832 write_unlock_bh(&ip_conntrack_lock);
836 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
837 implicitly racy: see __ip_conntrack_confirm */
838 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
839 const struct ip_conntrack_tuple *newreply)
841 write_lock_bh(&ip_conntrack_lock);
842 /* Should be unconfirmed, so not in hash table yet */
843 IP_NF_ASSERT(!is_confirmed(conntrack));
845 DEBUGP("Altering reply tuple of %p to ", conntrack);
846 DUMP_TUPLE(newreply);
848 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
/* Only re-pick the helper for plain conns — expected/expecting ones
 * keep the helper chosen by their master relationship. */
849 if (!conntrack->master && conntrack->expecting == 0)
850 conntrack->helper = ip_ct_find_helper(newreply);
851 write_unlock_bh(&ip_conntrack_lock);
/* Register a protocol helper; a zero timeout would break expectation
 * timers, so it is forbidden outright. */
854 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
856 BUG_ON(me->timeout == 0);
857 write_lock_bh(&ip_conntrack_lock);
858 list_prepend(&helpers, me);
859 write_unlock_bh(&ip_conntrack_lock);
/* Per-entry callback for helper unregistration: detach helper @me from
 * any conntrack still pointing at it. */
864 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
865 const struct ip_conntrack_helper *me)
867 if (tuplehash_to_ctrack(i)->helper == me)
868 tuplehash_to_ctrack(i)->helper = NULL;
/* Unregister helper @me: drop it from the list, kill its expectations,
 * and NULL out every conntrack's pointer to it (hashed and unconfirmed). */
872 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
875 struct ip_conntrack_expect *exp, *tmp;
877 /* Need write lock here, to delete helper. */
878 write_lock_bh(&ip_conntrack_lock);
879 LIST_DELETE(&helpers, me);
881 /* Get rid of expectations */
882 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
883 if (exp->master->helper == me && del_timer(&exp->timeout)) {
885 ip_conntrack_expect_put(exp);
888 /* Get rid of expecteds, set helpers to NULL. */
889 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
890 for (i = 0; i < ip_conntrack_htable_size; i++)
891 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
892 struct ip_conntrack_tuple_hash *, me);
893 write_unlock_bh(&ip_conntrack_lock);
895 /* Someone could be still looking at the helper in a bh. */
/* Per-direction packet/byte accounting; compiled out unless
 * CONFIG_IP_NF_CT_ACCT is set. */
899 static inline void ct_add_counters(struct ip_conntrack *ct,
900 enum ip_conntrack_info ctinfo,
901 const struct sk_buff *skb)
903 #ifdef CONFIG_IP_NF_CT_ACCT
905 ct->counters[CTINFO2DIR(ctinfo)].packets++;
906 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
907 ntohs(skb->nh.iph->tot_len);
912 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
913 void ip_ct_refresh_acct(struct ip_conntrack *ct,
914 enum ip_conntrack_info ctinfo,
915 const struct sk_buff *skb,
916 unsigned long extra_jiffies)
918 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
920 /* If not in hash table, timer will not be active yet */
921 if (!is_confirmed(ct)) {
/* Stored as a relative value; __ip_conntrack_confirm adds jiffies. */
922 ct->timeout.expires = extra_jiffies;
923 ct_add_counters(ct, ctinfo, skb);
925 write_lock_bh(&ip_conntrack_lock);
926 /* Need del_timer for race avoidance (may already be dying). */
927 if (del_timer(&ct->timeout)) {
928 ct->timeout.expires = jiffies + extra_jiffies;
929 add_timer(&ct->timeout);
931 ct_add_counters(ct, ctinfo, skb);
932 write_unlock_bh(&ip_conntrack_lock);
936 /* Returns new sk_buff, or NULL */
/* Reassemble an IP fragment train, refresh the header checksum, and mark
 * the skb as altered so later hooks know it changed. */
938 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
943 skb = ip_defrag(skb, user);
947 ip_send_check(skb->nh.iph);
948 skb->nfcache |= NFC_ALTERED;
953 /* Used by ipt_REJECT. */
/* Attach @skb's conntrack to the generated @nskb (ICMP error / TCP RST),
 * classed as RELATED in the opposite direction. */
954 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
956 struct ip_conntrack *ct;
957 enum ip_conntrack_info ctinfo;
959 /* This ICMP is in reverse direction to the packet which caused it */
960 ct = ip_conntrack_get(skb, &ctinfo);
962 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
963 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
965 ctinfo = IP_CT_RELATED;
967 /* Attach to new skbuff, and increment count */
968 nskb->nfct = &ct->ct_general;
969 nskb->nfctinfo = ctinfo;
970 nf_conntrack_get(nskb->nfct);
/* Adapter: run the user's per-conntrack @iter on a tuplehash entry. */
974 do_iter(const struct ip_conntrack_tuple_hash *i,
975 int (*iter)(struct ip_conntrack *i, void *data),
978 return iter(tuplehash_to_ctrack(i), data);
981 /* Bring out ya dead! */
/* Scan buckets from *bucket onward (then the unconfirmed list) for the
 * next conntrack @iter selects; returns it with a reference taken.
 * *bucket records scan progress across calls. */
982 static struct ip_conntrack_tuple_hash *
983 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
984 void *data, unsigned int *bucket)
986 struct ip_conntrack_tuple_hash *h = NULL;
988 write_lock_bh(&ip_conntrack_lock);
989 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
990 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
991 struct ip_conntrack_tuple_hash *, iter, data);
996 h = LIST_FIND_W(&unconfirmed, do_iter,
997 struct ip_conntrack_tuple_hash *, iter, data);
999 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1000 write_unlock_bh(&ip_conntrack_lock);
/* Kill every conntrack @iter selects by forcing its timeout; entries
 * whose timer already fired will be reaped by the timer path itself. */
1006 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1008 struct ip_conntrack_tuple_hash *h;
1009 unsigned int bucket = 0;
1011 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1012 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1013 /* Time to push up daises... */
1014 if (del_timer(&ct->timeout))
1015 death_by_timeout((unsigned long)ct);
1016 /* ... else the timer will get him soon. */
/* Drop the reference taken by get_next_corpse(). */
1018 ip_conntrack_put(ct);
1022 /* Fast function for those who don't want to parse /proc (and I don't
1024 /* Reversing the socket's dst/src point of view gives us the reply
/* SO_ORIGINAL_DST getsockopt handler: look up the reply-direction tuple
 * of a TCP socket and return the pre-NAT original destination. */
1027 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1029 struct inet_sock *inet = inet_sk(sk);
1030 struct ip_conntrack_tuple_hash *h;
1031 struct ip_conntrack_tuple tuple;
1033 IP_CT_TUPLE_U_BLANK(&tuple);
1034 tuple.src.ip = inet->rcv_saddr;
1035 tuple.src.u.tcp.port = inet->sport;
1036 tuple.dst.ip = inet->daddr;
1037 tuple.dst.u.tcp.port = inet->dport;
1038 tuple.dst.protonum = IPPROTO_TCP;
1040 /* We only do TCP at the moment: is there a better way? */
1041 if (strcmp(sk->sk_prot->name, "TCP")) {
1042 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1043 return -ENOPROTOOPT;
1046 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1047 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1048 *len, sizeof(struct sockaddr_in));
1052 h = ip_conntrack_find_get(&tuple, NULL);
1054 struct sockaddr_in sin;
1055 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1057 sin.sin_family = AF_INET;
/* The ORIGINAL direction of the reply lookup holds the pre-NAT dest. */
1058 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1059 .tuple.dst.u.tcp.port;
1060 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1063 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1064 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1065 ip_conntrack_put(ct);
1066 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1071 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1072 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1073 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
/* Sockopt registration covering exactly the SO_ORIGINAL_DST option. */
1077 static struct nf_sockopt_ops so_getorigdst = {
1079 .get_optmin = SO_ORIGINAL_DST,
1080 .get_optmax = SO_ORIGINAL_DST+1,
/* Iterator predicate selecting every conntrack (used at cleanup). */
1084 static int kill_all(struct ip_conntrack *i, void *data)
/* Release the hash table with the allocator that created it:
 * vfree if it was vmalloc'd, free_pages otherwise. */
1089 static void free_conntrack_hash(void)
1091 if (ip_conntrack_vmalloc)
1092 vfree(ip_conntrack_hash);
1094 free_pages((unsigned long)ip_conntrack_hash,
1095 get_order(sizeof(struct list_head)
1096 * ip_conntrack_htable_size));
1099 /* Mishearing the voices in his head, our hero wonders how he's
1100 supposed to kill the mall. */
/* Module teardown: detach hooks, kill every conntrack (retrying until the
 * count drains), wait out the untracked entry, then free caches/table. */
1101 void ip_conntrack_cleanup(void)
1103 ip_ct_attach = NULL;
1104 /* This makes sure all current packets have passed through
1105 netfilter framework. Roll on, two-stage module
/* Loop back until every conntrack is gone (refs may be in flight). */
1110 ip_ct_iterate_cleanup(kill_all, NULL);
1111 if (atomic_read(&ip_conntrack_count) != 0) {
1113 goto i_see_dead_people;
1115 /* wait until all references to ip_conntrack_untracked are dropped */
1116 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1119 kmem_cache_destroy(ip_conntrack_cachep);
1120 kmem_cache_destroy(ip_conntrack_expect_cachep);
1121 free_conntrack_hash();
1122 nf_unregister_sockopt(&so_getorigdst);
/* Optional boot-time override of the hash table size (read-only param). */
1125 static int hashsize;
1126 module_param(hashsize, int, 0400);
/* Module init: size and allocate the hash table, create slab caches,
 * register sockopt and builtin protocols, set up the untracked entry.
 * Error paths unwind in reverse order via the labels at the bottom. */
1128 int __init ip_conntrack_init(void)
1133 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1134 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1136 ip_conntrack_htable_size = hashsize;
1138 ip_conntrack_htable_size
1139 = (((num_physpages << PAGE_SHIFT) / 16384)
1140 / sizeof(struct list_head));
1141 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1142 ip_conntrack_htable_size = 8192;
1143 if (ip_conntrack_htable_size < 16)
1144 ip_conntrack_htable_size = 16;
1146 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1148 printk("ip_conntrack version %s (%u buckets, %d max)"
1149 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1150 ip_conntrack_htable_size, ip_conntrack_max,
1151 sizeof(struct ip_conntrack));
1153 ret = nf_register_sockopt(&so_getorigdst);
1155 printk(KERN_ERR "Unable to register netfilter socket option\n");
1159 /* AK: the hash table is twice as big than needed because it
1160 uses list_head. it would be much nicer to caches to use a
1161 single pointer list head here. */
1162 ip_conntrack_vmalloc = 0;
/* Prefer page allocation; fall back to vmalloc on failure. */
1164 =(void*)__get_free_pages(GFP_KERNEL,
1165 get_order(sizeof(struct list_head)
1166 *ip_conntrack_htable_size));
1167 if (!ip_conntrack_hash) {
1168 ip_conntrack_vmalloc = 1;
1169 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1170 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1171 * ip_conntrack_htable_size);
1173 if (!ip_conntrack_hash) {
1174 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1175 goto err_unreg_sockopt;
1178 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1179 sizeof(struct ip_conntrack), 0,
1181 if (!ip_conntrack_cachep) {
1182 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1186 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1187 sizeof(struct ip_conntrack_expect),
1189 if (!ip_conntrack_expect_cachep) {
1190 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1191 goto err_free_conntrack_slab;
1194 /* Don't NEED lock here, but good form anyway. */
1195 write_lock_bh(&ip_conntrack_lock);
/* Default every protocol slot to the generic handler first. */
1196 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1197 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1198 /* Sew in builtin protocols. */
1199 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1200 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1201 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1202 write_unlock_bh(&ip_conntrack_lock);
1204 for (i = 0; i < ip_conntrack_htable_size; i++)
1205 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1207 /* For use by ipt_REJECT */
1208 ip_ct_attach = ip_conntrack_attach;
1210 /* Set up fake conntrack:
1211 - to never be deleted, not in any hashes */
1212 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1213 /* - and look it like as a confirmed connection */
1214 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
/* Error unwind: free in reverse order of setup. */
1218 err_free_conntrack_slab:
1219 kmem_cache_destroy(ip_conntrack_cachep);
1221 free_conntrack_hash();
1223 nf_unregister_sockopt(&so_getorigdst);