X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_ipv4.c;h=c50dd1793643ea16db263f2b33ec6301cf03dace;hb=9427c4b36b8fe652df1d7c89eae678948e1f4b32;hp=77c1939a2b0d50b2dee673fa450a410cd82bc9c9;hpb=cec03afcb62fbbb0eaf943f6349ade61b89d7d40;p=linux-2.6 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77c1939a2b..c50dd17936 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -88,9 +88,6 @@ int sysctl_tcp_low_latency __read_mostly; /* Check TCP sequence numbers in ICMP packets. */ #define ICMP_MIN_LENGTH 8 -/* Socket used for sending RSTs */ -static struct socket *tcp_socket __read_mostly; - void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); #ifdef CONFIG_TCP_MD5SIG @@ -108,22 +105,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), }; -static int tcp_v4_get_port(struct sock *sk, unsigned short snum) -{ - return inet_csk_get_port(&tcp_hashinfo, sk, snum, - inet_csk_bind_conflict); -} - -static void tcp_v4_hash(struct sock *sk) -{ - inet_hash(&tcp_hashinfo, sk); -} - -void tcp_unhash(struct sock *sk) -{ - inet_unhash(&tcp_hashinfo, sk); -} - static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, @@ -369,7 +350,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } - sk = inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->daddr, th->dest, + sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); @@ -568,7 +549,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (th->rst) return; - if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL) + if (skb->rtable->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. */ @@ -614,7 +595,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); + ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); @@ -709,7 +691,8 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, if (twsk) arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if; - ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); + ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); } @@ -735,12 +718,12 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, } /* - * Send a SYN-ACK after having received an ACK. + * Send a SYN-ACK after having received a SYN. * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, - struct dst_entry *dst) +static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -748,7 +731,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) - goto out; + return -1; skb = tcp_make_synack(sk, dst, req); @@ -767,11 +750,15 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, err = net_xmit_eval(err); } -out: dst_release(dst); return err; } +static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) +{ + return __tcp_v4_send_synack(sk, req, NULL); +} + /* * IPv4 request_sock destructor. */ @@ -1274,8 +1261,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) #endif /* Never answer to SYNs send to broadcast or multicast */ - if (((struct rtable *)skb->dst)->rt_flags & - (RTCF_BROADCAST | RTCF_MULTICAST)) + if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; /* TW buckets are converted to open requests without @@ -1313,10 +1299,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_parse_options(skb, &tmp_opt, 0); - if (want_cookie) { + if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); - tmp_opt.saw_tstamp = 0; - } if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) { /* Some OSes (unknown ones, but I see them on web server, which @@ -1344,6 +1328,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie) { #ifdef CONFIG_SYN_COOKIES syn_flood_warning(skb); + req->cookie_ts = tmp_opt.tstamp_ok; #endif isn = cookie_v4_init_sequence(sk, skb, &req->mss); } else if (!isn) { @@ -1367,8 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); - dst_release(dst); - goto drop_and_free; + goto drop_and_release; } } /* Kill the following clause, if you dislike this way. */ @@ -1388,24 +1372,21 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) "request from %u.%u.%u.%u/%u\n", NIPQUAD(saddr), ntohs(tcp_hdr(skb)->source)); - dst_release(dst); - goto drop_and_free; + goto drop_and_release; } isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; - if (tcp_v4_send_synack(sk, req, dst)) + if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) goto drop_and_free; - if (want_cookie) { - reqsk_free(req); - } else { - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; +drop_and_release: + dst_release(dst); drop_and_free: reqsk_free(req); drop: @@ -1478,8 +1459,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif - __inet_hash_nolisten(&tcp_hashinfo, newsk); - __inet_inherit_port(&tcp_hashinfo, sk, newsk); + __inet_hash_nolisten(newsk); + __inet_inherit_port(sk, newsk); return newsk; @@ -1503,7 +1484,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); - nsk = inet_lookup_established(sk->sk_net, &tcp_hashinfo, iph->saddr, + nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (nsk) { @@ -1661,7 +1642,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr, + sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1735,7 +1716,7 @@ do_time_wait: } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(skb->dev->nd_net, + struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, inet_iif(skb)); @@ -1827,6 +1808,7 @@ struct inet_connection_sock_af_ops ipv4_specific = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), + .bind_conflict = inet_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, @@ -1926,7 +1908,7 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) - inet_put_port(&tcp_hashinfo, sk); + inet_put_port(sk); /* * If sendmsg cached page exists, toss it. @@ -1936,6 +1918,14 @@ int tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + if (tp->defer_tcp_accept.request) { + reqsk_free(tp->defer_tcp_accept.request); + sock_put(tp->defer_tcp_accept.listen_sk); + sock_put(sk); + tp->defer_tcp_accept.listen_sk = NULL; + tp->defer_tcp_accept.request = NULL; + } + atomic_dec(&tcp_sockets_allocated); return 0; @@ -1964,6 +1954,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; + struct net *net = seq_file_net(seq); if (!sk) { st->bucket = 0; @@ -1980,7 +1971,8 @@ static void *listening_get_next(struct seq_file *seq, void *cur) req = req->dl_next; while (1) { while (req) { - if (req->rsk_ops->family == st->family) { + if (req->rsk_ops->family == st->family && + net_eq(sock_net(req->sk), net)) { cur = req; goto out; } @@ -2004,7 +1996,7 @@ get_req: } get_sk: sk_for_each_from(sk, node) { - if (sk->sk_family == st->family) { + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { cur = sk; goto out; } @@ -2043,6 +2035,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state* st = seq->private; + struct net *net = seq_file_net(seq); void *rc = NULL; for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { @@ -2053,7 +2046,8 @@ static void *established_get_first(struct seq_file *seq) read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if (sk->sk_family != st->family) { + if (sk->sk_family != st->family || + !net_eq(sock_net(sk), net)) { continue; } rc = sk; @@ -2062,7 +2056,8 @@ static void *established_get_first(struct seq_file *seq) st->state = TCP_SEQ_STATE_TIME_WAIT; inet_twsk_for_each(tw, node, &tcp_hashinfo.ehash[st->bucket].twchain) { - if (tw->tw_family != st->family) { + if (tw->tw_family != st->family || + !net_eq(twsk_net(tw), net)) { continue; } rc = tw; @@ -2081,6 +2076,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; + struct net *net = seq_file_net(seq); ++st->num; @@ -2088,7 +2084,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) tw = cur; tw = tw_next(tw); get_tw: - while (tw && tw->tw_family != st->family) { + while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { tw = tw_next(tw); } if (tw) { @@ -2109,7 +2105,7 @@ get_tw: sk = sk_next(sk); sk_for_each_from(sk, node) { - if (sk->sk_family == st->family) + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) goto found; } @@ -2217,6 +2213,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file) struct tcp_seq_afinfo *afinfo = PDE(inode)->data; struct seq_file *seq; struct tcp_iter_state *s; + struct net *net; int rc; if (unlikely(afinfo == NULL)) @@ -2225,38 +2222,54 @@ static int tcp_seq_open(struct inode *inode, struct file *file) s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; + + rc = -ENXIO; + net = get_proc_net(inode); + if (!net) + goto out_kfree; + s->family = afinfo->family; - s->seq_ops.start = tcp_seq_start; - s->seq_ops.next = tcp_seq_next; - s->seq_ops.show = afinfo->seq_show; - s->seq_ops.stop = tcp_seq_stop; + s->p.net = net; - rc = seq_open(file, &s->seq_ops); + rc = seq_open(file, &afinfo->seq_ops); if (rc) - goto out_kfree; - seq = file->private_data; + goto out_put_net; + seq = file->private_data; seq->private = s; out: return rc; +out_put_net: + put_net(net); out_kfree: kfree(s); goto out; } -int tcp_proc_register(struct tcp_seq_afinfo *afinfo) +static int tcp_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + put_net(seq_file_net(seq)); + seq_release_private(inode, file); + return 0; +} + +int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) { int rc = 0; struct proc_dir_entry *p; - if (!afinfo) - return -EINVAL; afinfo->seq_fops->owner = afinfo->owner; afinfo->seq_fops->open = tcp_seq_open; afinfo->seq_fops->read = seq_read; afinfo->seq_fops->llseek = seq_lseek; - afinfo->seq_fops->release = seq_release_private; + afinfo->seq_fops->release = tcp_seq_release; - p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); + afinfo->seq_ops.start = tcp_seq_start; + afinfo->seq_ops.next = tcp_seq_next; + afinfo->seq_ops.stop = tcp_seq_stop; + + p = proc_net_fops_create(net, afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -2264,11 +2277,9 @@ int tcp_proc_register(struct tcp_seq_afinfo *afinfo) return rc; } -void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo) +void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) { - if (!afinfo) - return; - proc_net_remove(&init_net, afinfo->name); + proc_net_remove(net, afinfo->name); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } @@ -2403,23 +2414,38 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = { .owner = THIS_MODULE, .name = "tcp", .family = AF_INET, - .seq_show = tcp4_seq_show, .seq_fops = &tcp4_seq_fops, + .seq_ops = { + .show = tcp4_seq_show, + }, +}; + +static int tcp4_proc_init_net(struct net *net) +{ + return tcp_proc_register(net, &tcp4_seq_afinfo); +} + +static void tcp4_proc_exit_net(struct net *net) +{ + tcp_proc_unregister(net, &tcp4_seq_afinfo); +} + +static struct pernet_operations tcp4_net_ops = { + .init = tcp4_proc_init_net, + .exit = tcp4_proc_exit_net, }; int __init tcp4_proc_init(void) { - return tcp_proc_register(&tcp4_seq_afinfo); + return register_pernet_subsys(&tcp4_net_ops); } void tcp4_proc_exit(void) { - tcp_proc_unregister(&tcp4_seq_afinfo); + unregister_pernet_subsys(&tcp4_net_ops); } #endif /* CONFIG_PROC_FS */ -DEFINE_PROTO_INUSE(tcp) - struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -2435,9 +2461,9 @@ struct proto tcp_prot = { .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .backlog_rcv = tcp_v4_do_rcv, - .hash = tcp_v4_hash, - .unhash = tcp_unhash, - .get_port = tcp_v4_get_port, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, @@ -2450,24 +2476,39 @@ struct proto tcp_prot = { .obj_size = sizeof(struct tcp_sock), .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, + .h.hashinfo = &tcp_hashinfo, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif - REF_PROTO_INUSE(tcp) }; -void __init tcp_v4_init(struct net_proto_family *ops) + +static int __net_init tcp_sk_init(struct net *net) +{ + return inet_ctl_sock_create(&net->ipv4.tcp_sock, + PF_INET, SOCK_RAW, IPPROTO_TCP, net); +} + +static void __net_exit tcp_sk_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv4.tcp_sock); +} + +static struct pernet_operations __net_initdata tcp_sk_ops = { + .init = tcp_sk_init, + .exit = tcp_sk_exit, +}; + +void __init tcp_v4_init(void) { - if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW, - IPPROTO_TCP) < 0) + if (register_pernet_device(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); } EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_prot); -EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_connect); EXPORT_SYMBOL(tcp_v4_do_rcv);