/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/inet_hashtables.h>
#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
	.port_rover	= 1024 - 1,
};
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
}

static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct inet_timewait_sock **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity.  Even without PAWS it is
			   safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80 Mbit/sec.

			   Actually, the idea is close to VJ's, except
			   that the timestamp cache is held not per host
			   but per port pair, and the TW bucket is used
			   as the state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
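			/* Editor's illustration (an assumption, not part of
			 * the original file): at 80 Mbit/s (~10 MB/s) the
			 * 2^32-byte sequence space wraps in roughly
			 * 2^32 / 10^7 ~= 430 seconds, comfortably longer
			 * than TIME-WAIT lifetimes, so an old duplicate
			 * cannot be mistaken for new data even without
			 * PAWS timestamps.
			 */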
			if (tcptw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tcptw->tw_ts_recent_stamp > 1))) {
				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
				if (tp->write_seq == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now.  Otherwise we will see
	 * in the hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly.  Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
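/*
 * Editor's note (illustrative, not in the original): in this kernel the
 * established hash table is 2 * ehash_size buckets long.  Bucket i holds
 * established sockets, while bucket i + ehash_size holds the TIME-WAIT
 * sockets hashing to the same slot, which is why the loop above walks
 * (head + tcp_hashinfo.ehash_size)->chain for TIME-WAIT entries.
 */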
static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}

/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__inet_hash(&tcp_hashinfo, sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__inet_hash(&tcp_hashinfo, sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to the established hash table. */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
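/*
 * Editor's illustration (an assumption, not part of the original): the port
 * search above probes every port in [low, low + range) exactly once,
 * starting at a per-destination offset so that concurrent connects to
 * different peers do not all contend for the same ports.  For example, with
 * low = 1024, high = 4999 (range = 3975) and offset = 12345, the probes for
 * i = 1, 2, 3 are 1024 + 12346 % 3975 = 1445, then 1446, 1447, ...,
 * wrapping modulo the range.
 */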
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea.  We save the last timestamp seen from
		 * the destination in the peer table when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it when
		 * trying a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization afterwards.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

static inline int inet_iif(const struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests
	 * (SYN-ACKs sent out by Linux are always < 576 bytes, so they
	 * should go through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry whether PMTU discovery is
	 * forbidden on this route.  We just assume that no packet-too-big
	 * packets are sent back when PMTU discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to go wrong... Remember the soft error
	 * for the case that this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped.  This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
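/*
 * Editor's illustration (an assumption, not part of the original): when a
 * "fragmentation needed" ICMP reports a smaller link MTU, the usable MSS is
 * roughly that MTU minus the fixed headers.  For a PPPoE-sized mtu = 1492
 * with no IP or TCP options:
 *
 *	mss = 1492 - 20 (iphdr) - 20 (tcphdr) = 1452 bytes
 *
 * tcp_sync_mss() performs this clamping (also accounting for options)
 * against the cached path MTU in tp->pmtu_cookie.
 */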
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic".  When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct sock *sk;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct tcp_sock *tp;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
			 th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put((struct inet_timewait_sock *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
	 * to be treated as hard errors (well, FRAG_FAILED too, but
	 * that is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as ordered by their masters, even these two messages
	 * have finally lost their original sense (even Linux sends
	 * invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
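/*
 * Editor's sketch (not part of the original file): the arithmetic behind
 * tcp_v4_check()/csum_partial() above, written out portably.  The Internet
 * checksum is the one's complement of the one's-complement sum of the
 * 12-byte pseudo header (source address, destination address, zero byte,
 * protocol, TCP length) followed by the TCP header and payload.  The
 * function name and the host-byte-order convention are illustrative
 * assumptions; the kernel helpers do the same sum with arch-specific
 * folding, so this stays compiled out.
 */
#if 0
static unsigned short tcp_csum_sketch(unsigned int saddr, unsigned int daddr,
				      const unsigned char *tcp, unsigned int len)
{
	unsigned long sum = 0;
	unsigned int i;

	/* Pseudo header: addresses and length taken in host byte order. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += IPPROTO_TCP;	/* zero byte + protocol number (6) */
	sum += len;		/* length of TCP header + data */

	/* TCP header and data, as 16-bit big-endian words. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (tcp[i] << 8) | tcp[i + 1];
	if (len & 1)		/* pad an odd trailing byte with zero */
		sum += tcp[len - 1] << 8;

	while (sum >> 16)	/* fold the carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned short)~sum;	/* one's complement of the sum */
}
#endif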
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation.  We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
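/*
 * Editor's note (illustrative, not in the original): the seq/ack choice
 * above follows RFC 793's "Reset Generation" rule: if the offending segment
 * carried an ACK, the reset is sent with SEQ = SEG.ACK and no ACK bit;
 * otherwise it is sent with SEQ = 0 and ACK = SEG.SEQ + SEG.LEN, where SYN
 * and FIN each count as one octet of sequence space.
 */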
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly.  What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	if (inet_rsk(req)->opt)
		kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
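/*
 * Editor's note (illustrative, not in the original): under SYN cookies,
 * cookie_v4_init_sequence() encodes a coarse timestamp and an index into a
 * small table of common MSS values inside a keyed hash of the connection
 * 4-tuple, and returns the result as the SYN-ACK's initial sequence number.
 * The listener keeps no request_sock; when the final ACK arrives, its
 * acknowledgment number (ISN + 1) is validated and decoded instead.
 */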
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

struct request_sock_ops tcp_request_sock_ops = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.rtx_syn_ack	= tcp_v4_send_synack,
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast addresses. */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitation: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* The accept backlog is full.  If we have already queued enough
	 * warm entries in the syn queue, drop the request.  That is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers
		 * carrying information interesting only for Windows users)
		 * do not send their stamp in SYN.  It is an easy case:
		 * we simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);

	TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea.  We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check it against the timestamp
		 * before accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered by the
			 * moment of the synflood.
			 */
			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
					      "request from %u.%u."
					      "%u.%u/%u\n",
					      NIPQUAD(saddr),
					      ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
/*
 * The three-way handshake has completed - we got a valid SYN-ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__inet_hash(&tcp_hashinfo, newsk, 0);
	__inet_inherit_port(&tcp_hashinfo, sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
					th->source, skb->nh.iph->daddr,
					ntohs(th->dest), inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put((struct inet_timewait_sock *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
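/*
 * Editor's note (illustrative, not in the original): CHECKSUM_HW means the
 * NIC already summed the packet and left the result in skb->csum, so only
 * the pseudo-header fold remains; CHECKSUM_NONE forces full software
 * verification.  Short packets (<= 76 bytes) are verified immediately,
 * while for longer ones only the pseudo-header sum is seeded into
 * skb->csum so the copy to user space can finish the check incrementally.
 */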
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here.  If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here.  This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;
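	/* Editor's note (illustrative, not in the original): SYN and FIN
	 * each occupy one unit of sequence space, so for a pure SYN
	 * carrying no data, end_seq computed above is simply seq + 1.
	 */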
	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, ntohs(th->dest),
			   inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
					   skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
							skb->nh.iph->daddr,
							ntohs(th->dest),
							inet_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct inet_timewait_sock *)sk);
			inet_twsk_put((struct inet_timewait_sock *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family	     = AF_INET;
	sin->sin_addr.s_addr = inet->daddr;
	sin->sin_port	     = inet->dport;
}

/* VJ's idea.  Save the last timestamp seen from this destination and hold
 * it at least for the normal timewait interval, to use for duplicate
 * segment detection in subsequent connections, before they enter the
 * synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct tcp_func ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= inet_sk_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= v4_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things are set to zero explicitly by the call to
 *	 sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	tp->ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
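	/* Editor's note (illustrative, not in the original): the tcp_wmem
	 * and tcp_rmem sysctls are min/default/max triplets; index [1]
	 * picks the default per-socket send and receive buffer sizes.
	 */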
	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean up the prequeue; it should already be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	const struct inet_connection_sock *icsk = inet_csk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sp),
		icsk->icsk_probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
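/*
 * Editor's note: a hypothetical /proc/net/tcp line produced by the code
 * above (all values illustrative only) looks like
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * i.e. a socket listening on 127.0.0.1:22.  Addresses and ports are hex,
 * with addresses printed in network byte order (so 127.0.0.1 reads as
 * 0100007F on little-endian machines), and state 0A is TCP_LISTEN.
 */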
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it; we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);