X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=net%2Fipv4%2Fip_fragment.c;h=a2e92f9709db8459bd573e82c6842b6d9ea1d715;hb=7524d7d6de5d5d3f081de8cf5479819fad339661;hp=443b3f89192f5841f78e7783874f9c8b9b479020;hpb=e457f790d8b05977853aa238bbc667b3bb375671;p=linux-2.6 diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 443b3f8919..a2e92f9709 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -50,7 +50,7 @@ * as well. Or notify me, at least. --ANK */ -int sysctl_ipfrag_max_dist __read_mostly = 64; +static int sysctl_ipfrag_max_dist __read_mostly = 64; struct ipfrag_skb_cb { @@ -74,40 +74,26 @@ struct ipq { struct inet_peer *peer; }; -struct inet_frags_ctl ip4_frags_ctl __read_mostly = { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. - */ - .high_thresh = 256 * 1024, - .low_thresh = 192 * 1024, - - /* - * Important NOTE! Fragment queue must be destroyed before MSL expires. - * RFC791 is wrong proposing to prolongate timer each fragment arrival - * by TTL. - */ - .timeout = IP_FRAG_TIME, - .secret_interval = 10 * 60 * HZ, -}; - static struct inet_frags ip4_frags; -int ip_frag_nqueues(void) +int ip_frag_nqueues(struct net *net) { - return ip4_frags.nqueues; + return net->ipv4.frags.nqueues; } -int ip_frag_mem(void) +int ip_frag_mem(struct net *net) { - return atomic_read(&ip4_frags.mem); + return atomic_read(&net->ipv4.frags.mem); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); +struct ip4_create_arg { + struct iphdr *iph; + u32 user; +}; + static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) { return jhash_3words((__force u32)id << 16 | prot, @@ -123,15 +109,43 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q) return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } +static int ip4_frag_match(struct inet_frag_queue *q, void *a) +{ + struct ipq *qp; + struct ip4_create_arg *arg = a; + + qp = container_of(q, struct ipq, q); + return (qp->id == arg->iph->id && + qp->saddr == arg->iph->saddr && + qp->daddr == arg->iph->daddr && + qp->protocol == arg->iph->protocol && + qp->user == arg->user); +} + /* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) +static __inline__ void frag_kfree_skb(struct netns_frags *nf, + struct sk_buff *skb, int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &ip4_frags.mem); + atomic_sub(skb->truesize, &nf->mem); kfree_skb(skb); } +static void ip4_frag_init(struct inet_frag_queue *q, void *a) +{ + struct ipq *qp = container_of(q, struct ipq, q); + struct ip4_create_arg *arg = a; + + qp->protocol = arg->iph->protocol; + qp->id = arg->iph->id; + qp->saddr = arg->iph->saddr; + qp->daddr = arg->iph->daddr; + qp->user = arg->user; + qp->peer = sysctl_ipfrag_max_dist ? + inet_getpeer(arg->iph->saddr, 1) : NULL; +} + static __inline__ void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; @@ -139,17 +153,6 @@ static __inline__ void ip4_frag_free(struct inet_frag_queue *q) qp = container_of(q, struct ipq, q); if (qp->peer) inet_putpeer(qp->peer); - kfree(qp); -} - -static __inline__ struct ipq *frag_alloc_queue(void) -{ - struct ipq *qp = kzalloc(sizeof(struct ipq), GFP_ATOMIC); - - if (!qp) - return NULL; - atomic_add(sizeof(struct ipq), &ip4_frags.mem); - return qp; } @@ -171,11 +174,11 @@ static void ipq_kill(struct ipq *ipq) /* Memory limiting on fragments. Evictor trashes the oldest * fragment queue until we are back under the threshold. */ -static void ip_evictor(void) +static void ip_evictor(struct net *net) { int evicted; - evicted = inet_frag_evictor(&ip4_frags); + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); if (evicted) IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); } @@ -185,7 +188,9 @@ static void ip_evictor(void) */ static void ip_expire(unsigned long arg) { - struct ipq *qp = (struct ipq *) arg; + struct ipq *qp; + + qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); spin_lock(&qp->q.lock); @@ -210,112 +215,30 @@ out: ipq_put(qp); } -/* Creation primitives. */ - -static struct ipq *ip_frag_intern(struct ipq *qp_in) +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and create new one, if nothing is found. + */ +static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { - struct ipq *qp; -#ifdef CONFIG_SMP - struct hlist_node *n; -#endif + struct inet_frag_queue *q; + struct ip4_create_arg arg; unsigned int hash; - write_lock(&ip4_frags.lock); - hash = ipqhashfn(qp_in->id, qp_in->saddr, qp_in->daddr, - qp_in->protocol); -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could be created on other cpu, while we - * promoted read lock to write lock. - */ - hlist_for_each_entry(qp, n, &ip4_frags.hash[hash], q.list) { - if (qp->id == qp_in->id && - qp->saddr == qp_in->saddr && - qp->daddr == qp_in->daddr && - qp->protocol == qp_in->protocol && - qp->user == qp_in->user) { - atomic_inc(&qp->q.refcnt); - write_unlock(&ip4_frags.lock); - qp_in->q.last_in |= COMPLETE; - ipq_put(qp_in); - return qp; - } - } -#endif - qp = qp_in; - - if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) - atomic_inc(&qp->q.refcnt); + arg.iph = iph; + arg.user = user; + hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - atomic_inc(&qp->q.refcnt); - hlist_add_head(&qp->q.list, &ip4_frags.hash[hash]); - INIT_LIST_HEAD(&qp->q.lru_list); - list_add_tail(&qp->q.lru_list, &ip4_frags.lru_list); - ip4_frags.nqueues++; - write_unlock(&ip4_frags.lock); - return qp; -} - -/* Add an entry to the 'ipq' queue for a newly received IP datagram. */ -static struct ipq *ip_frag_create(struct iphdr *iph, u32 user) -{ - struct ipq *qp; - - if ((qp = frag_alloc_queue()) == NULL) + q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); + if (q == NULL) goto out_nomem; - qp->protocol = iph->protocol; - qp->id = iph->id; - qp->saddr = iph->saddr; - qp->daddr = iph->daddr; - qp->user = user; - qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; - - /* Initialize a timer for this entry. */ - init_timer(&qp->q.timer); - qp->q.timer.data = (unsigned long) qp; /* pointer to queue */ - qp->q.timer.function = ip_expire; /* expire function */ - spin_lock_init(&qp->q.lock); - atomic_set(&qp->q.refcnt, 1); - - return ip_frag_intern(qp); + return container_of(q, struct ipq, q); out_nomem: LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); return NULL; } -/* Find the correct entry in the "incomplete datagrams" queue for - * this IP datagram, and create new one, if nothing is found. - */ -static inline struct ipq *ip_find(struct iphdr *iph, u32 user) -{ - __be16 id = iph->id; - __be32 saddr = iph->saddr; - __be32 daddr = iph->daddr; - __u8 protocol = iph->protocol; - unsigned int hash; - struct ipq *qp; - struct hlist_node *n; - - read_lock(&ip4_frags.lock); - hash = ipqhashfn(id, saddr, daddr, protocol); - hlist_for_each_entry(qp, n, &ip4_frags.hash[hash], q.list) { - if (qp->id == id && - qp->saddr == saddr && - qp->daddr == daddr && - qp->protocol == protocol && - qp->user == user) { - atomic_inc(&qp->q.refcnt); - read_unlock(&ip4_frags.lock); - return qp; - } - } - read_unlock(&ip4_frags.lock); - - return ip_frag_create(iph, user); -} - /* Is the fragment too far ahead to be part of ipq? */ static inline int ip_frag_too_far(struct ipq *qp) { @@ -345,7 +268,7 @@ static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; - if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); return -ETIMEDOUT; } @@ -353,7 +276,7 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp, NULL); + frag_kfree_skb(qp->q.net, fp, NULL); fp = xp; } while (fp); @@ -490,7 +413,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); + frag_kfree_skb(qp->q.net, free_it, NULL); } } @@ -510,7 +433,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; - atomic_add(skb->truesize, &ip4_frags.mem); + atomic_add(skb->truesize, &qp->q.net->mem); if (offset == 0) qp->q.last_in |= FIRST_IN; @@ -518,7 +441,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) return ip_frag_reasm(qp, prev, dev); write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &ip4_frags.lru_list); + list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); write_unlock(&ip4_frags.lock); return -EINPROGRESS; @@ -545,7 +468,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, if (prev) { head = prev->next; fp = skb_clone(head, GFP_ATOMIC); - if (!fp) goto out_nomem; @@ -571,7 +493,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - err = -ENOMEM; if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) goto out_nomem; @@ -595,12 +516,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &ip4_frags.mem); + atomic_add(clone->truesize, &qp->q.net->mem); } skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &ip4_frags.mem); + atomic_sub(head->truesize, &qp->q.net->mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -610,7 +531,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &ip4_frags.mem); + atomic_sub(fp->truesize, &qp->q.net->mem); } head->next = NULL; @@ -627,6 +548,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, out_nomem: LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " "queue %p\n", qp); + err = -ENOMEM; goto out_fail; out_oversize: if (net_ratelimit()) @@ -642,15 +564,17 @@ out_fail: int ip_defrag(struct sk_buff *skb, u32 user) { struct ipq *qp; + struct net *net; IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + net = skb->dev->nd_net; /* Start by cleaning up the memory. */ - if (atomic_read(&ip4_frags.mem) > ip4_frags_ctl.high_thresh) - ip_evictor(); + if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) + ip_evictor(net); /* Lookup (or create) queue header */ - if ((qp = ip_find(ip_hdr(skb), user)) != NULL) { + if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; spin_lock(&qp->q.lock); @@ -667,13 +591,150 @@ int ip_defrag(struct sk_buff *skb, u32 user) return -ENOMEM; } +#ifdef CONFIG_SYSCTL +static int zero; + +static struct ctl_table ip4_frags_ctl_table[] = { + { + .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, + .procname = "ipfrag_high_thresh", + .data = &init_net.ipv4.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, + .procname = "ipfrag_low_thresh", + .data = &init_net.ipv4.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_TIME, + .procname = "ipfrag_time", + .data = &init_net.ipv4.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, + .procname = "ipfrag_secret_interval", + .data = &ip4_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; + +static int ip4_frags_ctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip4_frags_ctl_table; + if (net != &init_net) { + table = kmemdup(table, sizeof(ip4_frags_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv4.frags.high_thresh; + table[1].data = &net->ipv4.frags.low_thresh; + table[2].data = &net->ipv4.frags.timeout; + table[3].mode &= ~0222; + table[4].mode &= ~0222; + } + + hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv4.frags_hdr = hdr; + return 0; + +err_reg: + if (net != &init_net) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void ip4_frags_ctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv4.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.frags_hdr); + kfree(table); +} +#else +static inline int ip4_frags_ctl_register(struct net *net) +{ + return 0; +} + +static inline void ip4_frags_ctl_unregister(struct net *net) +{ +} +#endif + +static int ipv4_frags_init_net(struct net *net) +{ + /* + * Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to + * measurably harm machine performance. + */ + net->ipv4.frags.high_thresh = 256 * 1024; + net->ipv4.frags.low_thresh = 192 * 1024; + /* + * Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival + * by TTL. + */ + net->ipv4.frags.timeout = IP_FRAG_TIME; + + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ctl_register(net); +} + +static void ipv4_frags_exit_net(struct net *net) +{ + ip4_frags_ctl_unregister(net); + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); +} + +static struct pernet_operations ip4_frags_ops = { + .init = ipv4_frags_init_net, + .exit = ipv4_frags_exit_net, +}; + void __init ipfrag_init(void) { - ip4_frags.ctl = &ip4_frags_ctl; + register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; + ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.skb_free = NULL; ip4_frags.qsize = sizeof(struct ipq); + ip4_frags.match = ip4_frag_match; + ip4_frags.frag_expire = ip_expire; + ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); }