/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen	:	Fix new listen.
 *		Andi Kleen	:	Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *		Alexey Kuznetsov:		which allows both IPv4 and IPv6
 *						sockets to bind a single port at
 *						the same time.
 */
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, unsigned int tcplen);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
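
/*
 * As a sketch of what secure_tcp_sequence_number() computes (an RFC 1948
 * style scheme; the exact internals live elsewhere and may differ):
 *
 *	isn = keyed_hash(saddr, daddr, sport, dport, boot_secret) + clock;
 *
 * The per-4-tuple keyed hash makes ISNs unpredictable to off-path
 * attackers, while the clock term keeps them monotonic for successive
 * incarnations of the same connection.
 */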

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: the timestamp cache is held
	   not per host, but per port pair, and the TW bucket is used as the
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
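
/*
 * tcp_twsk_unique() is the hook behind net.ipv4.tcp_tw_reuse: a new
 * outgoing connection may take over a TIME-WAIT port pair when recent
 * timestamps prove the old incarnation's segments can be disambiguated.
 * A minimal userspace sketch of enabling it (standard procfs path):
 *
 *	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");
 *	if (f) {
 *		fputs("1", f);
 *		fclose(f);
 *	}
 *
 * This is distinct from SO_REUSEADDR, which only relaxes bind-time
 * conflicts on the local address.
 */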

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering
		 * TIME-WAIT state, and initialize rx_opt.ts_recent
		 * from it when trying a new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
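
/*
 * Everything above runs under a single connect(2). A minimal userspace
 * sketch that exercises this path (illustrative destination from the
 * 192.0.2.0/24 documentation range):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 *
 * Note the kernel chooses the source port inside connect(), after the
 * socket has already moved to SYN-SENT, exactly as the comment above
 * describes.
 */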

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big messages
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
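
/*
 * The per-socket pmtudisc mode consulted above is set from userspace with
 * IP_MTU_DISCOVER, and on a connected socket the kernel's current estimate
 * can be read back with IP_MTU. A hedged sketch:
 *
 *	int val = IP_PMTUDISC_DO;	(always set DF; rely on RFC 1191)
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 *	int mtu;
 *	socklen_t len = sizeof(mtu);
 *	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
 *		printf("path mtu: %d\n", mtu);
 */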

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, for example, if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;
			sk->sk_error_report(sk);
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
	 * considered hard errors (well, FRAG_FAILED too, but that is
	 * obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as ordered by their masters, even these two messages have
	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
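
/*
 * Whether an ICMP-derived error above becomes a hard error (sk_err) or
 * only a soft one (sk_err_soft) on an established connection hinges on
 * inet->recverr, i.e. the IP_RECVERR socket option. A hedged sketch:
 *
 *	int one = 1;
 *	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one));
 *
 * With it set, errors such as EHOSTUNREACH surface on the next socket
 * call (and via MSG_ERRQUEUE); without it, an established connection
 * keeps retransmitting until its own timers give up.
 */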

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(len, inet->saddr,
					  inet->daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
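
/*
 * For reference, the value being prepared above is the standard ones'
 * complement checksum over a pseudo-header (saddr, daddr, zero byte,
 * IPPROTO_TCP, TCP length) plus the TCP header and payload. A standalone
 * RFC 1071 style folding sketch, illustrative rather than the kernel's
 * optimized csum helpers:
 *
 *	static uint16_t csum_fold16(uint32_t sum)
 *	{
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (uint16_t)~sum;
 *	}
 *
 * Summing all 16-bit words of the pseudo-header and segment and folding
 * the carries as above yields the value stored in th->check.
 */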

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, and if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's
 *		TCP. So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb->rtable->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
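
/*
 * The seq/ack selection above is the RFC 793 reset rule: if the offending
 * segment carried an ACK, the RST reuses that value as its own sequence
 * number (seq = SEG.ACK); otherwise the RST acknowledges exactly what the
 * segment occupied:
 *
 *	ack_seq = SEG.SEQ + SEG.LEN, with SYN and FIN each counting as one,
 *
 * which is the "th->syn + th->fin + skb->len - (th->doff << 2)" arithmetic
 * above.
 */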

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;

	ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw)
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
				struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
	return __tcp_v4_send_synack(sk, req, NULL);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(tcp_hdr(skb)->dest));
	}
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
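
/*
 * Userspace installs these per-peer keys with the TCP_MD5SIG socket
 * option from <netinet/tcp.h>. A hedged sketch for a listening socket
 * (illustrative peer address and key):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.7", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *		perror("TCP_MD5SIG");
 *
 * That request lands in tcp_v4_parse_md5_keys() below, which calls
 * tcp_v4_md5_do_add()/tcp_v4_md5_do_del() on this per-socket list.
 */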

/* Find the Key structure for an address. */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		}
		if (tcp_alloc_md5sig_pool() == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the set of keys, the crypto element,
	 * and then decrement our hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return -EINVAL;

		tp->md5sig_info = p;
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}

static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th,
				   unsigned int tcplen)
{
	struct tcp_md5sig_pool *hp;
	struct tcp4_pseudohdr *bp;
	int err;

	/*
	 * Okay, so RFC2385 is turned on for this connection,
	 * so we need to generate the MD5 hash for the packet now.
	 */
	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;

	bp = &hp->md5_blk.ip4;

	/*
	 * The TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = htons(tcplen);

	err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
				th, tcplen, hp);
	if (err)
		goto clear_hash;

	/* Free up the crypto pool */
	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
			 struct sock *sk,
			 struct dst_entry *dst,
			 struct request_sock *req,
			 struct tcphdr *th,
			 unsigned int tcplen)
{
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->saddr;
		daddr = inet_sk(sk)->daddr;
	} else {
		struct rtable *rt = (struct rtable *)dst;
		BUG_ON(!rt);
		saddr = rt->rt_src;
		daddr = rt->rt_dst;
	}
	return tcp_v4_do_calc_md5_hash(md5_hash, key,
				       saddr, daddr,
				       th, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	if (!hash_expected && hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_do_calc_md5_hash(newhash,
					  hash_expected,
					  iph->saddr, iph->daddr,
					  th, skb->len);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop this request. It is better than
	 * clogging the syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers
		 * that carry information interesting only to Windows
		 * users) do not send their stamp in SYN. It is the easy
		 * case; we simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
		req->cookie_ts = tmp_opt.tstamp_ok;
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * TIME-WAIT state, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies, the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered at the
			 * moment of the SYN flood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from " NIPQUAD_FMT "/%u\n",
				       NIPQUAD(saddr),
				       ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
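
/*
 * Both pressure valves above are tunable from userspace: syncookies and
 * the SYN queue size via procfs, the accept queue via listen(2). A hedged
 * sketch of typical server-side tuning:
 *
 *	echo 1    > /proc/sys/net/ipv4/tcp_syncookies
 *	echo 1024 > /proc/sys/net/ipv4/tcp_max_syn_backlog
 *
 * The limit checked by sk_acceptq_is_full() is the listen(2) backlog
 * argument, itself clamped by net.core.somaxconn.
 */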

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
					  newkey, key->keylen);
	}
#endif

	__inet_hash_nolisten(newsk);
	__inet_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
			   th->source, iph->daddr, th->dest, inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = get_softnet_dma();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

/* VJ's idea. Save the last timestamp seen from this destination, and hold
 * it at least for the normal timewait interval, to use for duplicate
 * segment detection in subsequent connections, before they enter the
 * synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.remember_stamp	   = tcp_v4_remember_stamp,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_calc_md5_hash,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
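
/*
 * The 536 assigned to mss_cache above (and to rx_opt.mss_clamp in
 * tcp_v4_connect()) is the RFC 1122 default: every IPv4 host must accept
 * a 576-byte datagram, so the safe default MSS is
 *
 *	576 - 20 (IP header) - 20 (TCP header) = 536 bytes,
 *
 * until a real MSS is learned from the peer's SYN options.
 */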

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}
	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family &&
				    net_eq(sock_net(req->sk), net)) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;
		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		read_lock_bh(lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	return 0;
}

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}

static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		      " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->daddr;
	__be32 src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
					     (tp->rcv_nxt - tp->copied_seq),
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
		len);
}

static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
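
/*
 * The output above is the classic /proc/net/tcp layout: hex "addr:port"
 * pairs, a hex state, then queue, timer and accounting fields. A hedged
 * userspace sketch of pulling one line's endpoints back out (field order
 * per the seq_printf format strings above):
 *
 *	unsigned int sl, saddr, sport, daddr, dport, state;
 *	sscanf(line, "%u: %x:%x %x:%x %x",
 *	       &sl, &saddr, &sport, &daddr, &dport, &state);
 *
 * saddr/daddr come back as the raw __be32 values printed with %08X, and
 * state compares numerically against TCP_ESTABLISHED and friends.
 */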

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};

static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init = tcp_sk_init,
	.exit = tcp_sk_exit,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_device(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);