2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
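/* The first 8 bytes of the embedded TCP header carry the source port,
 * destination port and sequence number -- enough for tcp_v4_err() to
 * look up the socket and sanity-check the sequence number, which is
 * why only that much of the original segment is required.
 */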
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this range. */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
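/* Illustrative only: the range can be widened at run time without
 * rebuilding, e.g. on a busy proxy box:
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * (the values above are just an example, not a recommendation).
 */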
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
109 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
114 sk_for_each_bound(sk2, node, &tb->owners) {
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
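/* In short: a new bind conflicts with an existing owner of the port
 * only if the two sockets could receive the same traffic: neither is
 * bound to a different device, their receive addresses overlap (one
 * is a wildcard or they are equal), and SO_REUSEADDR does not rescue
 * them (both must have set it, and the existing socket must not be
 * listening).
 */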
132 /* Obtain a reference to a local port for the given sock,
133 * if snum is zero, select any available local port. */
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
153 rover = tcp_hashinfo.port_rover;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
177 if (unlikely(remaining <= 0))
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its lock. */
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
202 if (tcp_bind_conflict(sk, tb))
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
225 spin_unlock(&head->lock);
231 static void tcp_v4_hash(struct sock *sk)
233 if (sk->sk_state != TCP_CLOSE) {
235 __inet_hash(&tcp_hashinfo, sk, 1);
240 void tcp_unhash(struct sock *sk)
247 if (sk->sk_state == TCP_LISTEN) {
249 inet_listen_wlock(&tcp_hashinfo);
250 lock = &tcp_hashinfo.lhash_lock;
252 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent];
254 write_lock_bh(&head->lock);
257 if (__sk_del_node_init(sk))
258 sock_prot_dec_use(sk->sk_prot);
259 write_unlock_bh(lock);
262 if (sk->sk_state == TCP_LISTEN)
263 wake_up(&tcp_hashinfo.lhash_wait);
266 /* Don't inline this cruft. There are some nice properties to
267 * exploit here. The BSD API does not allow a listening TCP
268 * to specify the remote port nor the remote address for the
269 * connection. So always assume those are both wildcarded
270 * during the search since they can never be otherwise.
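/* Scoring sketch: a listener only stays in the running if any address
 * or device it is bound to matches the incoming packet; among the
 * survivors, more specific bindings beat wildcards and a plain AF_INET
 * socket is preferred over an IPv6 socket accepting v4-mapped traffic,
 * so the most specific listener wins.
 */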
272 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
274 const unsigned short hnum,
277 struct sock *result = NULL, *sk;
278 struct hlist_node *node;
282 sk_for_each(sk, node, head) {
283 struct inet_sock *inet = inet_sk(sk);
285 if (inet->num == hnum && !ipv6_only_sock(sk)) {
286 __u32 rcv_saddr = inet->rcv_saddr;
288 score = (sk->sk_family == PF_INET ? 1 : 0);
290 if (rcv_saddr != daddr)
294 if (sk->sk_bound_dev_if) {
295 if (sk->sk_bound_dev_if != dif)
301 if (score > hiscore) {
310 /* Optimize the common listener case. */
311 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
312 const unsigned short hnum,
315 struct sock *sk = NULL;
316 struct hlist_head *head;
318 read_lock(&tcp_hashinfo.lhash_lock);
319 head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
320 if (!hlist_empty(head)) {
321 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
323 if (inet->num == hnum && !sk->sk_node.next &&
324 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
325 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
326 !sk->sk_bound_dev_if)
328 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
334 read_unlock(&tcp_hashinfo.lhash_lock);
338 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
339 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
341 * Local BH must be disabled here.
344 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
350 struct inet_ehash_bucket *head;
351 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
352 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
354 struct hlist_node *node;
355 /* Optimize here for direct hit, only listening connections can
356 * have wildcards anyway. */
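/* The established hash table is really two tables in one allocation:
 * bucket [hash] holds the established sockets, while bucket
 * [hash + ehash_size] holds the TIME_WAIT sockets for the same hash
 * value, covered by the same per-bucket lock.
 */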
358 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
359 head = &tcp_hashinfo.ehash[hash];
360 read_lock(&head->lock);
361 sk_for_each(sk, node, &head->chain) {
362 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
363 goto hit; /* You sunk my battleship! */
366 /* Must check for a TIME_WAIT'er before going to listener hash. */
367 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
368 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
373 read_unlock(&head->lock);
380 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
381 u32 daddr, u16 hnum, int dif)
383 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
386 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
389 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
395 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
401 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
403 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
405 return secure_tcp_sequence_number(skb->nh.iph->daddr,
411 /* called with local bh disabled */
412 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
413 struct tcp_tw_bucket **twp)
415 struct inet_sock *inet = inet_sk(sk);
416 u32 daddr = inet->rcv_saddr;
417 u32 saddr = inet->daddr;
418 int dif = sk->sk_bound_dev_if;
419 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
420 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
421 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
422 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
424 struct hlist_node *node;
425 struct tcp_tw_bucket *tw;
427 write_lock(&head->lock);
429 /* Check TIME-WAIT sockets first. */
430 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
431 tw = (struct tcp_tw_bucket *)sk2;
433 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
434 struct tcp_sock *tp = tcp_sk(sk);
436 /* With PAWS, it is safe from the viewpoint
437 of data integrity. Even without PAWS it
438 is safe provided sequence spaces do not
439 overlap i.e. at data rates <= 80Mbit/sec.
441 Actually, the idea is close to VJ's one,
442 only timestamp cache is held not per host,
443 but per port pair and TW bucket is used
446 If TW bucket has been already destroyed we
447 fall back to VJ's scheme and use initial
448 timestamp retrieved from peer table.
450 if (tw->tw_ts_recent_stamp &&
451 (!twp || (sysctl_tcp_tw_reuse &&
453 tw->tw_ts_recent_stamp > 1))) {
455 tw->tw_snd_nxt + 65535 + 2) == 0)
457 tp->rx_opt.ts_recent = tw->tw_ts_recent;
458 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
467 /* And established part... */
468 sk_for_each(sk2, node, &head->chain) {
469 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
474 /* Must record num and sport now. Otherwise we will see
475 * a socket with a funny identity in the hash table. */
477 inet->sport = htons(lport);
478 sk->sk_hashent = hash;
479 BUG_TRAP(sk_unhashed(sk));
480 __sk_add_node(sk, &head->chain);
481 sock_prot_inc_use(sk->sk_prot);
482 write_unlock(&head->lock);
486 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
488 /* Silly. Should hash-dance instead... */
489 tcp_tw_deschedule(tw);
490 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
498 write_unlock(&head->lock);
499 return -EADDRNOTAVAIL;
502 static inline u32 connect_port_offset(const struct sock *sk)
504 const struct inet_sock *inet = inet_sk(sk);
506 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
511 * Bind a port for a connect operation and hash it.
513 static inline int tcp_v4_hash_connect(struct sock *sk)
515 const unsigned short snum = inet_sk(sk)->num;
516 struct inet_bind_hashbucket *head;
517 struct inet_bind_bucket *tb;
521 int low = sysctl_local_port_range[0];
522 int high = sysctl_local_port_range[1];
523 int range = high - low;
527 u32 offset = hint + connect_port_offset(sk);
528 struct hlist_node *node;
529 struct tcp_tw_bucket *tw = NULL;
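/* Sketch of the search below: start at a pseudo-random offset derived
 * from the connection's addresses (connect_port_offset()), so parallel
 * connects spread over the ephemeral range, then walk the range once.
 * Bind buckets created by explicit bind() (fastreuse >= 0) are skipped;
 * for the remaining candidates only the established/TIME-WAIT hash has
 * to confirm that the full 4-tuple is unique.
 */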
532 for (i = 1; i <= range; i++) {
533 port = low + (i + offset) % range;
534 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
535 spin_lock(&head->lock);
537 /* Does not bother with rcv_saddr checks,
538 * because the established check is already unique enough. */
541 inet_bind_bucket_for_each(tb, node, &head->chain) {
542 if (tb->port == port) {
543 BUG_TRAP(!hlist_empty(&tb->owners));
544 if (tb->fastreuse >= 0)
546 if (!__tcp_v4_check_established(sk,
554 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
556 spin_unlock(&head->lock);
563 spin_unlock(&head->lock);
567 return -EADDRNOTAVAIL;
572 /* Head lock still held and bh's disabled */
573 inet_bind_hash(sk, tb, port);
574 if (sk_unhashed(sk)) {
575 inet_sk(sk)->sport = htons(port);
576 __inet_hash(&tcp_hashinfo, sk, 0);
578 spin_unlock(&head->lock);
581 tcp_tw_deschedule(tw);
589 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
590 tb = inet_sk(sk)->bind_hash;
591 spin_lock_bh(&head->lock);
592 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
593 __inet_hash(&tcp_hashinfo, sk, 0);
594 spin_unlock_bh(&head->lock);
597 spin_unlock(&head->lock);
598 /* No definite answer... Walk the established hash table. */
599 ret = __tcp_v4_check_established(sk, snum, NULL);
606 /* This will initiate an outgoing connection. */
607 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
609 struct inet_sock *inet = inet_sk(sk);
610 struct tcp_sock *tp = tcp_sk(sk);
611 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
617 if (addr_len < sizeof(struct sockaddr_in))
620 if (usin->sin_family != AF_INET)
621 return -EAFNOSUPPORT;
623 nexthop = daddr = usin->sin_addr.s_addr;
624 if (inet->opt && inet->opt->srr) {
627 nexthop = inet->opt->faddr;
630 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
631 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
633 inet->sport, usin->sin_port, sk);
637 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
642 if (!inet->opt || !inet->opt->srr)
646 inet->saddr = rt->rt_src;
647 inet->rcv_saddr = inet->saddr;
649 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
650 /* Reset inherited state */
651 tp->rx_opt.ts_recent = 0;
652 tp->rx_opt.ts_recent_stamp = 0;
656 if (sysctl_tcp_tw_recycle &&
657 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
658 struct inet_peer *peer = rt_get_peer(rt);
660 /* VJ's idea. We save the last timestamp seen from
661 * the destination in the peer table when entering TIME-WAIT state,
662 * and initialize rx_opt.ts_recent from it when trying a new connection. */
665 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
666 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
667 tp->rx_opt.ts_recent = peer->tcp_ts;
671 inet->dport = usin->sin_port;
674 tp->ext_header_len = 0;
676 tp->ext_header_len = inet->opt->optlen;
678 tp->rx_opt.mss_clamp = 536;
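/* 536 is the classic default MSS from RFC 1122: the 576 byte minimum
 * reassembly buffer minus 40 bytes of IPv4 + TCP headers.
 */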
680 /* Socket identity is still unknown (sport may be zero).
681 * However we set the state to SYN-SENT and, without releasing the socket
682 * lock, we select a source port, enter ourselves into the hash tables and
683 * complete initialization after this. */
685 tcp_set_state(sk, TCP_SYN_SENT);
686 err = tcp_v4_hash_connect(sk);
690 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
694 /* OK, now commit destination to socket. */
695 sk_setup_caps(sk, &rt->u.dst);
698 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
703 inet->id = tp->write_seq ^ jiffies;
705 err = tcp_connect(sk);
713 /* This unhashes the socket and releases the local port, if necessary. */
714 tcp_set_state(sk, TCP_CLOSE);
716 sk->sk_route_caps = 0;
721 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
723 return ((struct rtable *)skb->dst)->rt_iif;
726 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
728 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
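/* Note: the mask above requires TCP_SYNQ_HSIZE to be a power of two.
 * Mixing the remote address/port with the listener's private hash_rnd
 * keeps the bucket placement unpredictable to a remote attacker.
 */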
731 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
732 struct request_sock ***prevp,
734 __u32 raddr, __u32 laddr)
736 struct listen_sock *lopt = tp->accept_queue.listen_opt;
737 struct request_sock *req, **prev;
739 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
740 (req = *prev) != NULL;
741 prev = &req->dl_next) {
742 const struct inet_request_sock *ireq = inet_rsk(req);
744 if (ireq->rmt_port == rport &&
745 ireq->rmt_addr == raddr &&
746 ireq->loc_addr == laddr &&
747 TCP_INET_FAMILY(req->rsk_ops->family)) {
757 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
759 struct tcp_sock *tp = tcp_sk(sk);
760 struct listen_sock *lopt = tp->accept_queue.listen_opt;
761 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
763 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
769 * This routine does path mtu discovery as defined in RFC1191.
771 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
774 struct dst_entry *dst;
775 struct inet_sock *inet = inet_sk(sk);
776 struct tcp_sock *tp = tcp_sk(sk);
778 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
779 * sent out by Linux are always < 576 bytes, so they should go through unfragmented). */
782 if (sk->sk_state == TCP_LISTEN)
785 /* We don't check in the dst entry if pmtu discovery is forbidden
786 * on this route. We just assume that no packet-too-big packets
787 * are sent back when pmtu discovery is not active.
788 * There is a small race when the user changes this flag in the
789 * route, but I think that's acceptable.
791 if ((dst = __sk_dst_check(sk, 0)) == NULL)
794 dst->ops->update_pmtu(dst, mtu);
796 /* Something is about to be wrong... Remember soft error
797 * for the case that this connection will not be able to recover. */
799 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
800 sk->sk_err_soft = EMSGSIZE;
804 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
805 tp->pmtu_cookie > mtu) {
806 tcp_sync_mss(sk, mtu);
808 /* Resend the TCP packet because it's
809 * clear that the old packet has been
810 * dropped. This is the new "fast" path mtu discovery. */
813 tcp_simple_retransmit(sk);
814 } /* else let the usual retransmit timer handle it */
818 * This routine is called by the ICMP module when it gets some
819 * sort of error condition. If err < 0 then the socket should
820 * be closed and the error returned to the user. If err > 0
821 * it's just the icmp type << 8 | icmp code. After adjustment
822 * header points to the first 8 bytes of the tcp header. We need
823 * to find the appropriate port.
825 * The locking strategy used here is very "optimistic". When
826 * someone else accesses the socket the ICMP is just dropped
827 * and for some paths there is no check at all.
828 * A more general error queue to queue errors for later handling
829 * is probably better.
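/* Worked example of the encoding above: a port-unreachable ICMP error
 * arrives as type ICMP_DEST_UNREACH (3), code ICMP_PORT_UNREACH (3),
 * i.e. err == (3 << 8) | 3 == 0x0303, which icmp_err_convert[] later
 * maps to ECONNREFUSED.
 */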
833 void tcp_v4_err(struct sk_buff *skb, u32 info)
835 struct iphdr *iph = (struct iphdr *)skb->data;
836 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
838 struct inet_sock *inet;
839 int type = skb->h.icmph->type;
840 int code = skb->h.icmph->code;
845 if (skb->len < (iph->ihl << 2) + 8) {
846 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
850 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
851 th->source, tcp_v4_iif(skb));
853 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
856 if (sk->sk_state == TCP_TIME_WAIT) {
857 tcp_tw_put((struct tcp_tw_bucket *)sk);
862 /* If too many ICMPs get dropped on busy
863 * servers this needs to be solved differently.
865 if (sock_owned_by_user(sk))
866 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
868 if (sk->sk_state == TCP_CLOSE)
872 seq = ntohl(th->seq);
873 if (sk->sk_state != TCP_LISTEN &&
874 !between(seq, tp->snd_una, tp->snd_nxt)) {
875 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
880 case ICMP_SOURCE_QUENCH:
881 /* Just silently ignore these. */
883 case ICMP_PARAMETERPROB:
886 case ICMP_DEST_UNREACH:
887 if (code > NR_ICMP_UNREACH)
890 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
891 if (!sock_owned_by_user(sk))
892 do_pmtu_discovery(sk, iph, info);
896 err = icmp_err_convert[code].errno;
898 case ICMP_TIME_EXCEEDED:
905 switch (sk->sk_state) {
906 struct request_sock *req, **prev;
908 if (sock_owned_by_user(sk))
911 req = tcp_v4_search_req(tp, &prev, th->dest,
912 iph->daddr, iph->saddr);
916 /* ICMPs are not backlogged, hence we cannot get
917 an established socket here.
921 if (seq != tcp_rsk(req)->snt_isn) {
922 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
927 * Still in SYN_RECV, just remove it silently.
928 * There is no good way to pass the error to the newly
929 * created socket, and POSIX does not want network
930 * errors returned from accept().
932 tcp_synq_drop(sk, req, prev);
936 case TCP_SYN_RECV: /* Cannot happen.
937 It can happen, e.g., if SYNs crossed. */
939 if (!sock_owned_by_user(sk)) {
940 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
943 sk->sk_error_report(sk);
947 sk->sk_err_soft = err;
952 /* If we've already connected we will keep trying
953 * until we time out, or the user gives up.
955 * rfc1122 4.2.3.9 allows to consider as hard errors
956 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
957 * but it is obsoleted by pmtu discovery).
959 * Note that in the modern internet, where routing is unreliable
960 * and broken firewalls sit in every dark corner sending random
961 * errors ordered by their masters, even these two messages finally lose
962 * their original sense (even Linux sends invalid PORT_UNREACHs)
964 * Now we are in compliance with RFCs.
969 if (!sock_owned_by_user(sk) && inet->recverr) {
971 sk->sk_error_report(sk);
972 } else { /* Only an error on timeout */
973 sk->sk_err_soft = err;
981 /* This routine computes an IPv4 TCP checksum. */
982 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
985 struct inet_sock *inet = inet_sk(sk);
987 if (skb->ip_summed == CHECKSUM_HW) {
988 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
989 skb->csum = offsetof(struct tcphdr, check);
991 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
992 csum_partial((char *)th,
999 * This routine will send an RST to the other tcp.
1001 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1003 * Answer: if a packet caused RST, it is not for a socket
1004 * existing in our system, if it is matched to a socket,
1005 * it is just duplicate segment or bug in other side's TCP.
1006 * So we build the reply based only on the parameters
1007 * that arrived with the segment.
1008 * Exception: precedence violation. We do not implement it in any case.
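/* The reply below follows the RFC 793 reset rules: if the offending
 * segment carried an ACK, the RST takes its sequence number from that
 * ACK field and carries no ACK of its own; otherwise the RST uses
 * sequence number 0 and ACKs everything the segment occupied
 * (SEG.SEQ + SYN + FIN + data length).
 */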
1011 static void tcp_v4_send_reset(struct sk_buff *skb)
1013 struct tcphdr *th = skb->h.th;
1015 struct ip_reply_arg arg;
1017 /* Never send a reset in response to a reset. */
1021 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1024 /* Swap the send and the receive. */
1025 memset(&rth, 0, sizeof(struct tcphdr));
1026 rth.dest = th->source;
1027 rth.source = th->dest;
1028 rth.doff = sizeof(struct tcphdr) / 4;
1032 rth.seq = th->ack_seq;
1035 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1036 skb->len - (th->doff << 2));
1039 memset(&arg, 0, sizeof arg);
1040 arg.iov[0].iov_base = (unsigned char *)&rth;
1041 arg.iov[0].iov_len = sizeof rth;
1042 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1043 skb->nh.iph->saddr, /*XXX*/
1044 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1045 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1047 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1049 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1050 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1053 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1054 outside socket context, is certainly ugly. What can I do? */
1057 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1060 struct tcphdr *th = skb->h.th;
1065 struct ip_reply_arg arg;
1067 memset(&rep.th, 0, sizeof(struct tcphdr));
1068 memset(&arg, 0, sizeof arg);
1070 arg.iov[0].iov_base = (unsigned char *)&rep;
1071 arg.iov[0].iov_len = sizeof(rep.th);
1073 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1074 (TCPOPT_TIMESTAMP << 8) |
1076 rep.tsopt[1] = htonl(tcp_time_stamp);
1077 rep.tsopt[2] = htonl(ts);
1078 arg.iov[0].iov_len = sizeof(rep);
1081 /* Swap the send and the receive. */
1082 rep.th.dest = th->source;
1083 rep.th.source = th->dest;
1084 rep.th.doff = arg.iov[0].iov_len / 4;
1085 rep.th.seq = htonl(seq);
1086 rep.th.ack_seq = htonl(ack);
1088 rep.th.window = htons(win);
1090 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1091 skb->nh.iph->saddr, /*XXX*/
1092 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1093 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1095 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1097 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1100 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1102 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1104 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1105 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1110 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1112 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1116 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1117 struct request_sock *req)
1120 const struct inet_request_sock *ireq = inet_rsk(req);
1121 struct ip_options *opt = inet_rsk(req)->opt;
1122 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1124 { .daddr = ((opt && opt->srr) ?
1127 .saddr = ireq->loc_addr,
1128 .tos = RT_CONN_FLAGS(sk) } },
1129 .proto = IPPROTO_TCP,
1131 { .sport = inet_sk(sk)->sport,
1132 .dport = ireq->rmt_port } } };
1134 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1135 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1138 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1140 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1147 * Send a SYN-ACK after having received an ACK.
1148 * This still operates on a request_sock only, not on a big
1151 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1152 struct dst_entry *dst)
1154 const struct inet_request_sock *ireq = inet_rsk(req);
1156 struct sk_buff * skb;
1158 /* First, grab a route. */
1159 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1162 skb = tcp_make_synack(sk, dst, req);
1165 struct tcphdr *th = skb->h.th;
1167 th->check = tcp_v4_check(th, skb->len,
1170 csum_partial((char *)th, skb->len,
1173 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1176 if (err == NET_XMIT_CN)
1186 * IPv4 request_sock destructor.
1188 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1190 if (inet_rsk(req)->opt)
1191 kfree(inet_rsk(req)->opt);
1194 static inline void syn_flood_warning(struct sk_buff *skb)
1196 static unsigned long warntime;
1198 if (time_after(jiffies, (warntime + HZ * 60))) {
1201 "possible SYN flooding on port %d. Sending cookies.\n",
1202 ntohs(skb->h.th->dest));
1207 * Save and compile IPv4 options into the request_sock if needed.
1209 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1210 struct sk_buff *skb)
1212 struct ip_options *opt = &(IPCB(skb)->opt);
1213 struct ip_options *dopt = NULL;
1215 if (opt && opt->optlen) {
1216 int opt_size = optlength(opt);
1217 dopt = kmalloc(opt_size, GFP_ATOMIC);
1219 if (ip_options_echo(dopt, skb)) {
1228 struct request_sock_ops tcp_request_sock_ops = {
1230 .obj_size = sizeof(struct tcp_request_sock),
1231 .rtx_syn_ack = tcp_v4_send_synack,
1232 .send_ack = tcp_v4_reqsk_send_ack,
1233 .destructor = tcp_v4_reqsk_destructor,
1234 .send_reset = tcp_v4_send_reset,
1237 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1239 struct inet_request_sock *ireq;
1240 struct tcp_options_received tmp_opt;
1241 struct request_sock *req;
1242 __u32 saddr = skb->nh.iph->saddr;
1243 __u32 daddr = skb->nh.iph->daddr;
1244 __u32 isn = TCP_SKB_CB(skb)->when;
1245 struct dst_entry *dst = NULL;
1246 #ifdef CONFIG_SYN_COOKIES
1247 int want_cookie = 0;
1249 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1252 /* Never answer to SYNs sent to broadcast or multicast */
1253 if (((struct rtable *)skb->dst)->rt_flags &
1254 (RTCF_BROADCAST | RTCF_MULTICAST))
1257 /* TW buckets are converted to open requests without
1258 * limitations; they conserve resources and the peer is
1259 * evidently a real one. */
1261 if (tcp_synq_is_full(sk) && !isn) {
1262 #ifdef CONFIG_SYN_COOKIES
1263 if (sysctl_tcp_syncookies) {
1270 /* Accept backlog is full. If we have already queued enough
1271 * of warm entries in the syn queue, drop the request. It is better than
1272 * clogging the syn queue with openreqs with exponentially increasing timeout. */
1275 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1278 req = reqsk_alloc(&tcp_request_sock_ops);
1282 tcp_clear_options(&tmp_opt);
1283 tmp_opt.mss_clamp = 536;
1284 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1286 tcp_parse_options(skb, &tmp_opt, 0);
1289 tcp_clear_options(&tmp_opt);
1290 tmp_opt.saw_tstamp = 0;
1293 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1294 /* Some OSes (unknown ones, but I see them on a web server which
1295 * contains information interesting only for Windows
1296 * users) do not send their timestamp in the SYN. It is an easy case:
1297 * we simply do not advertise TS support. */
1299 tmp_opt.saw_tstamp = 0;
1300 tmp_opt.tstamp_ok = 0;
1302 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1304 tcp_openreq_init(req, &tmp_opt, skb);
1306 ireq = inet_rsk(req);
1307 ireq->loc_addr = daddr;
1308 ireq->rmt_addr = saddr;
1309 ireq->opt = tcp_v4_save_options(sk, skb);
1311 TCP_ECN_create_request(req, skb->h.th);
1314 #ifdef CONFIG_SYN_COOKIES
1315 syn_flood_warning(skb);
1317 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1319 struct inet_peer *peer = NULL;
1321 /* VJ's idea. We save the last timestamp seen
1322 * from the destination in the peer table when entering
1323 * TIME-WAIT state, and check against it before
1324 * accepting a new connection request.
1326 * If "isn" is not zero, this request hit an alive
1327 * timewait bucket, so all the necessary checks
1328 * are made in the function processing the timewait state. */
1330 if (tmp_opt.saw_tstamp &&
1331 sysctl_tcp_tw_recycle &&
1332 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1333 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1334 peer->v4daddr == saddr) {
1335 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1336 (s32)(peer->tcp_ts - req->ts_recent) >
1338 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1343 /* Kill the following clause, if you dislike this way. */
1344 else if (!sysctl_tcp_syncookies &&
1345 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1346 (sysctl_max_syn_backlog >> 2)) &&
1347 (!peer || !peer->tcp_ts_stamp) &&
1348 (!dst || !dst_metric(dst, RTAX_RTT))) {
1349 /* Without syncookies the last quarter of the
1350 * backlog is filled only with destinations
1351 * proven to be alive.
1352 * It means that we continue to communicate
1353 * with destinations already remembered
1354 * at the moment of the synflood. */
1356 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1357 "request from %u.%u."
1360 ntohs(skb->h.th->source)));
1365 isn = tcp_v4_init_sequence(sk, skb);
1367 tcp_rsk(req)->snt_isn = isn;
1369 if (tcp_v4_send_synack(sk, req, dst))
1375 tcp_v4_synq_add(sk, req);
1382 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1388 * The three way handshake has completed - we got a valid synack -
1389 * now create the new socket.
1391 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1392 struct request_sock *req,
1393 struct dst_entry *dst)
1395 struct inet_request_sock *ireq;
1396 struct inet_sock *newinet;
1397 struct tcp_sock *newtp;
1400 if (sk_acceptq_is_full(sk))
1403 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1406 newsk = tcp_create_openreq_child(sk, req, skb);
1410 sk_setup_caps(newsk, dst);
1412 newtp = tcp_sk(newsk);
1413 newinet = inet_sk(newsk);
1414 ireq = inet_rsk(req);
1415 newinet->daddr = ireq->rmt_addr;
1416 newinet->rcv_saddr = ireq->loc_addr;
1417 newinet->saddr = ireq->loc_addr;
1418 newinet->opt = ireq->opt;
1420 newinet->mc_index = tcp_v4_iif(skb);
1421 newinet->mc_ttl = skb->nh.iph->ttl;
1422 newtp->ext_header_len = 0;
1424 newtp->ext_header_len = newinet->opt->optlen;
1425 newinet->id = newtp->write_seq ^ jiffies;
1427 tcp_sync_mss(newsk, dst_mtu(dst));
1428 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1429 tcp_initialize_rcv_mss(newsk);
1431 __inet_hash(&tcp_hashinfo, newsk, 0);
1432 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1437 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1439 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1444 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1446 struct tcphdr *th = skb->h.th;
1447 struct iphdr *iph = skb->nh.iph;
1448 struct tcp_sock *tp = tcp_sk(sk);
1450 struct request_sock **prev;
1451 /* Find possible connection requests. */
1452 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1453 iph->saddr, iph->daddr);
1455 return tcp_check_req(sk, skb, req, prev);
1457 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1464 if (nsk->sk_state != TCP_TIME_WAIT) {
1468 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1472 #ifdef CONFIG_SYN_COOKIES
1473 if (!th->rst && !th->syn && th->ack)
1474 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1479 static int tcp_v4_checksum_init(struct sk_buff *skb)
1481 if (skb->ip_summed == CHECKSUM_HW) {
1482 skb->ip_summed = CHECKSUM_UNNECESSARY;
1483 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1484 skb->nh.iph->daddr, skb->csum))
1487 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1488 skb->ip_summed = CHECKSUM_NONE;
1490 if (skb->len <= 76) {
1491 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1493 skb_checksum(skb, 0, skb->len, 0)))
1495 skb->ip_summed = CHECKSUM_UNNECESSARY;
1497 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1499 skb->nh.iph->daddr, 0);
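/* Rationale for the 76-byte cutoff above: very short segments are
 * cheap enough to verify in software immediately; for longer ones we
 * only prime skb->csum with the pseudo-header sum and defer the final
 * verification until the data is checksummed while being copied.
 */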
1505 /* The socket must have its spinlock held when we get here.
1508 * We have a potential double-lock case here, so even when
1509 * doing backlog processing we use the BH locking scheme.
1510 * This is because we cannot sleep with the original spinlock
1513 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1515 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1516 TCP_CHECK_TIMER(sk);
1517 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1519 TCP_CHECK_TIMER(sk);
1523 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1526 if (sk->sk_state == TCP_LISTEN) {
1527 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1532 if (tcp_child_process(sk, nsk, skb))
1538 TCP_CHECK_TIMER(sk);
1539 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1541 TCP_CHECK_TIMER(sk);
1545 tcp_v4_send_reset(skb);
1548 /* Be careful here. If this function gets more complicated and
1549 * gcc suffers from register pressure on the x86, sk (in %ebx)
1550 * might be destroyed here. This current version compiles correctly,
1551 * but you have been warned.
1556 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1564 int tcp_v4_rcv(struct sk_buff *skb)
1570 if (skb->pkt_type != PACKET_HOST)
1573 /* Count it even if it's bad */
1574 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1576 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1581 if (th->doff < sizeof(struct tcphdr) / 4)
1583 if (!pskb_may_pull(skb, th->doff * 4))
1586 /* An explanation is required here, I think.
1587 * Packet length and doff are validated by header prediction,
1588 * provided the case of th->doff==0 is eliminated.
1589 * So, we defer the checks. */
1590 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1591 tcp_v4_checksum_init(skb) < 0))
1595 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1596 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1597 skb->len - th->doff * 4);
1598 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1599 TCP_SKB_CB(skb)->when = 0;
1600 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1601 TCP_SKB_CB(skb)->sacked = 0;
1603 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1604 skb->nh.iph->daddr, ntohs(th->dest),
1611 if (sk->sk_state == TCP_TIME_WAIT)
1614 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1615 goto discard_and_relse;
1617 if (sk_filter(sk, skb, 0))
1618 goto discard_and_relse;
1624 if (!sock_owned_by_user(sk)) {
1625 if (!tcp_prequeue(sk, skb))
1626 ret = tcp_v4_do_rcv(sk, skb);
1628 sk_add_backlog(sk, skb);
1636 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1639 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1641 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1643 tcp_v4_send_reset(skb);
1647 /* Discard frame. */
1656 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1657 tcp_tw_put((struct tcp_tw_bucket *) sk);
1661 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1662 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1663 tcp_tw_put((struct tcp_tw_bucket *) sk);
1666 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1667 skb, th, skb->len)) {
1669 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1673 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1674 tcp_tw_put((struct tcp_tw_bucket *)sk);
1678 /* Fall through to ACK */
1681 tcp_v4_timewait_ack(sk, skb);
1685 case TCP_TW_SUCCESS:;
1690 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1692 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1693 struct inet_sock *inet = inet_sk(sk);
1695 sin->sin_family = AF_INET;
1696 sin->sin_addr.s_addr = inet->daddr;
1697 sin->sin_port = inet->dport;
1700 /* VJ's idea. Save the last timestamp seen from this destination
1701 * and hold it at least for the normal timewait interval, to use for duplicate
1702 * segment detection in subsequent connections before they enter the synchronized state. */
1706 int tcp_v4_remember_stamp(struct sock *sk)
1708 struct inet_sock *inet = inet_sk(sk);
1709 struct tcp_sock *tp = tcp_sk(sk);
1710 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1711 struct inet_peer *peer = NULL;
1714 if (!rt || rt->rt_dst != inet->daddr) {
1715 peer = inet_getpeer(inet->daddr, 1);
1719 rt_bind_peer(rt, 1);
1724 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1725 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1726 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1727 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1728 peer->tcp_ts = tp->rx_opt.ts_recent;
1738 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1740 struct inet_peer *peer = NULL;
1742 peer = inet_getpeer(tw->tw_daddr, 1);
1745 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1746 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1747 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1748 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1749 peer->tcp_ts = tw->tw_ts_recent;
1758 struct tcp_func ipv4_specific = {
1759 .queue_xmit = ip_queue_xmit,
1760 .send_check = tcp_v4_send_check,
1761 .rebuild_header = inet_sk_rebuild_header,
1762 .conn_request = tcp_v4_conn_request,
1763 .syn_recv_sock = tcp_v4_syn_recv_sock,
1764 .remember_stamp = tcp_v4_remember_stamp,
1765 .net_header_len = sizeof(struct iphdr),
1766 .setsockopt = ip_setsockopt,
1767 .getsockopt = ip_getsockopt,
1768 .addr2sockaddr = v4_addr2sockaddr,
1769 .sockaddr_len = sizeof(struct sockaddr_in),
1772 /* NOTE: A lot of things are set to zero explicitly by the call to
1773 * sk_alloc(), so they need not be done here. */
1775 static int tcp_v4_init_sock(struct sock *sk)
1777 struct tcp_sock *tp = tcp_sk(sk);
1779 skb_queue_head_init(&tp->out_of_order_queue);
1780 tcp_init_xmit_timers(sk);
1781 tcp_prequeue_init(tp);
1783 tp->rto = TCP_TIMEOUT_INIT;
1784 tp->mdev = TCP_TIMEOUT_INIT;
1786 /* So many TCP implementations out there (incorrectly) count the
1787 * initial SYN frame in their delayed-ACK and congestion control
1788 * algorithms that we must have the following bandaid to talk
1789 * efficiently to them. -DaveM
1793 /* See draft-stevens-tcpca-spec-01 for discussion of the
1794 * initialization of these values.
1796 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1797 tp->snd_cwnd_clamp = ~0;
1798 tp->mss_cache = 536;
1800 tp->reordering = sysctl_tcp_reordering;
1801 tp->ca_ops = &tcp_init_congestion_ops;
1803 sk->sk_state = TCP_CLOSE;
1805 sk->sk_write_space = sk_stream_write_space;
1806 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1808 tp->af_specific = &ipv4_specific;
1810 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1811 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1813 atomic_inc(&tcp_sockets_allocated);
1818 int tcp_v4_destroy_sock(struct sock *sk)
1820 struct tcp_sock *tp = tcp_sk(sk);
1822 tcp_clear_xmit_timers(sk);
1824 tcp_cleanup_congestion_control(tp);
1826 /* Clean up the write buffer. */
1827 sk_stream_writequeue_purge(sk);
1829 /* Cleans up our, hopefully empty, out_of_order_queue. */
1830 __skb_queue_purge(&tp->out_of_order_queue);
1832 /* Clean the prequeue; it really must be empty. */
1833 __skb_queue_purge(&tp->ucopy.prequeue);
1835 /* Clean up a referenced TCP bind bucket. */
1836 if (inet_sk(sk)->bind_hash)
1837 inet_put_port(&tcp_hashinfo, sk);
1840 * If sendmsg cached page exists, toss it.
1842 if (sk->sk_sndmsg_page) {
1843 __free_page(sk->sk_sndmsg_page);
1844 sk->sk_sndmsg_page = NULL;
1847 atomic_dec(&tcp_sockets_allocated);
1852 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1854 #ifdef CONFIG_PROC_FS
1855 /* Proc filesystem TCP sock list dumping. */
1857 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1859 return hlist_empty(head) ? NULL :
1860 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1863 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1865 return tw->tw_node.next ?
1866 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1869 static void *listening_get_next(struct seq_file *seq, void *cur)
1871 struct tcp_sock *tp;
1872 struct hlist_node *node;
1873 struct sock *sk = cur;
1874 struct tcp_iter_state* st = seq->private;
1878 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1884 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1885 struct request_sock *req = cur;
1887 tp = tcp_sk(st->syn_wait_sk);
1891 if (req->rsk_ops->family == st->family) {
1897 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1900 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1902 sk = sk_next(st->syn_wait_sk);
1903 st->state = TCP_SEQ_STATE_LISTENING;
1904 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1907 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1908 if (reqsk_queue_len(&tp->accept_queue))
1910 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1914 sk_for_each_from(sk, node) {
1915 if (sk->sk_family == st->family) {
1920 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1921 if (reqsk_queue_len(&tp->accept_queue)) {
1923 st->uid = sock_i_uid(sk);
1924 st->syn_wait_sk = sk;
1925 st->state = TCP_SEQ_STATE_OPENREQ;
1929 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1931 if (++st->bucket < INET_LHTABLE_SIZE) {
1932 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1940 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1942 void *rc = listening_get_next(seq, NULL);
1944 while (rc && *pos) {
1945 rc = listening_get_next(seq, rc);
1951 static void *established_get_first(struct seq_file *seq)
1953 struct tcp_iter_state* st = seq->private;
1956 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1958 struct hlist_node *node;
1959 struct tcp_tw_bucket *tw;
1961 /* We can reschedule _before_ having picked the target: */
1962 cond_resched_softirq();
1964 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1965 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1966 if (sk->sk_family != st->family) {
1972 st->state = TCP_SEQ_STATE_TIME_WAIT;
1973 tw_for_each(tw, node,
1974 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1975 if (tw->tw_family != st->family) {
1981 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1982 st->state = TCP_SEQ_STATE_ESTABLISHED;
1988 static void *established_get_next(struct seq_file *seq, void *cur)
1990 struct sock *sk = cur;
1991 struct tcp_tw_bucket *tw;
1992 struct hlist_node *node;
1993 struct tcp_iter_state* st = seq->private;
1997 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2001 while (tw && tw->tw_family != st->family) {
2008 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2009 st->state = TCP_SEQ_STATE_ESTABLISHED;
2011 /* We can reschedule between buckets: */
2012 cond_resched_softirq();
2014 if (++st->bucket < tcp_hashinfo.ehash_size) {
2015 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2016 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2024 sk_for_each_from(sk, node) {
2025 if (sk->sk_family == st->family)
2029 st->state = TCP_SEQ_STATE_TIME_WAIT;
2030 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2038 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2040 void *rc = established_get_first(seq);
2043 rc = established_get_next(seq, rc);
2049 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2052 struct tcp_iter_state* st = seq->private;
2054 inet_listen_lock(&tcp_hashinfo);
2055 st->state = TCP_SEQ_STATE_LISTENING;
2056 rc = listening_get_idx(seq, &pos);
2059 inet_listen_unlock(&tcp_hashinfo);
2061 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062 rc = established_get_idx(seq, pos);
2068 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2070 struct tcp_iter_state* st = seq->private;
2071 st->state = TCP_SEQ_STATE_LISTENING;
2073 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2076 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2079 struct tcp_iter_state* st;
2081 if (v == SEQ_START_TOKEN) {
2082 rc = tcp_get_idx(seq, 0);
2087 switch (st->state) {
2088 case TCP_SEQ_STATE_OPENREQ:
2089 case TCP_SEQ_STATE_LISTENING:
2090 rc = listening_get_next(seq, v);
2092 inet_listen_unlock(&tcp_hashinfo);
2094 st->state = TCP_SEQ_STATE_ESTABLISHED;
2095 rc = established_get_first(seq);
2098 case TCP_SEQ_STATE_ESTABLISHED:
2099 case TCP_SEQ_STATE_TIME_WAIT:
2100 rc = established_get_next(seq, v);
2108 static void tcp_seq_stop(struct seq_file *seq, void *v)
2110 struct tcp_iter_state* st = seq->private;
2112 switch (st->state) {
2113 case TCP_SEQ_STATE_OPENREQ:
2115 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2116 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2118 case TCP_SEQ_STATE_LISTENING:
2119 if (v != SEQ_START_TOKEN)
2120 inet_listen_unlock(&tcp_hashinfo);
2122 case TCP_SEQ_STATE_TIME_WAIT:
2123 case TCP_SEQ_STATE_ESTABLISHED:
2125 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2131 static int tcp_seq_open(struct inode *inode, struct file *file)
2133 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2134 struct seq_file *seq;
2135 struct tcp_iter_state *s;
2138 if (unlikely(afinfo == NULL))
2141 s = kmalloc(sizeof(*s), GFP_KERNEL);
2144 memset(s, 0, sizeof(*s));
2145 s->family = afinfo->family;
2146 s->seq_ops.start = tcp_seq_start;
2147 s->seq_ops.next = tcp_seq_next;
2148 s->seq_ops.show = afinfo->seq_show;
2149 s->seq_ops.stop = tcp_seq_stop;
2151 rc = seq_open(file, &s->seq_ops);
2154 seq = file->private_data;
2163 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2166 struct proc_dir_entry *p;
2170 afinfo->seq_fops->owner = afinfo->owner;
2171 afinfo->seq_fops->open = tcp_seq_open;
2172 afinfo->seq_fops->read = seq_read;
2173 afinfo->seq_fops->llseek = seq_lseek;
2174 afinfo->seq_fops->release = seq_release_private;
2176 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2184 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2188 proc_net_remove(afinfo->name);
2189 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2192 static void get_openreq4(struct sock *sk, struct request_sock *req,
2193 char *tmpbuf, int i, int uid)
2195 const struct inet_request_sock *ireq = inet_rsk(req);
2196 int ttd = req->expires - jiffies;
2198 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2199 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2202 ntohs(inet_sk(sk)->sport),
2204 ntohs(ireq->rmt_port),
2206 0, 0, /* could print option size, but that is af dependent. */
2207 1, /* timers active (only the expire timer) */
2208 jiffies_to_clock_t(ttd),
2211 0, /* non standard timer */
2212 0, /* open_requests have no inode */
2213 atomic_read(&sk->sk_refcnt),
2217 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2220 unsigned long timer_expires;
2221 struct tcp_sock *tp = tcp_sk(sp);
2222 struct inet_sock *inet = inet_sk(sp);
2223 unsigned int dest = inet->daddr;
2224 unsigned int src = inet->rcv_saddr;
2225 __u16 destp = ntohs(inet->dport);
2226 __u16 srcp = ntohs(inet->sport);
2228 if (tp->pending == TCP_TIME_RETRANS) {
2230 timer_expires = tp->timeout;
2231 } else if (tp->pending == TCP_TIME_PROBE0) {
2233 timer_expires = tp->timeout;
2234 } else if (timer_pending(&sp->sk_timer)) {
2236 timer_expires = sp->sk_timer.expires;
2239 timer_expires = jiffies;
2242 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2243 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2244 i, src, srcp, dest, destp, sp->sk_state,
2245 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2247 jiffies_to_clock_t(timer_expires - jiffies),
2252 atomic_read(&sp->sk_refcnt), sp,
2253 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2255 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2258 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2260 unsigned int dest, src;
2262 int ttd = tw->tw_ttd - jiffies;
2267 dest = tw->tw_daddr;
2268 src = tw->tw_rcv_saddr;
2269 destp = ntohs(tw->tw_dport);
2270 srcp = ntohs(tw->tw_sport);
2272 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2273 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2274 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2275 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2276 atomic_read(&tw->tw_refcnt), tw);
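/* For reference, a /proc/net/tcp entry produced by the helpers above
 * looks roughly like this for a listener on 0.0.0.0:22 (addresses and
 * ports are hex, state 0A == TCP_LISTEN):
 *
 *   0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 */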
2281 static int tcp4_seq_show(struct seq_file *seq, void *v)
2283 struct tcp_iter_state* st;
2284 char tmpbuf[TMPSZ + 1];
2286 if (v == SEQ_START_TOKEN) {
2287 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2288 " sl local_address rem_address st tx_queue "
2289 "rx_queue tr tm->when retrnsmt uid timeout "
2295 switch (st->state) {
2296 case TCP_SEQ_STATE_LISTENING:
2297 case TCP_SEQ_STATE_ESTABLISHED:
2298 get_tcp4_sock(v, tmpbuf, st->num);
2300 case TCP_SEQ_STATE_OPENREQ:
2301 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2303 case TCP_SEQ_STATE_TIME_WAIT:
2304 get_timewait4_sock(v, tmpbuf, st->num);
2307 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2312 static struct file_operations tcp4_seq_fops;
2313 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2314 .owner = THIS_MODULE,
2317 .seq_show = tcp4_seq_show,
2318 .seq_fops = &tcp4_seq_fops,
2321 int __init tcp4_proc_init(void)
2323 return tcp_proc_register(&tcp4_seq_afinfo);
2326 void tcp4_proc_exit(void)
2328 tcp_proc_unregister(&tcp4_seq_afinfo);
2330 #endif /* CONFIG_PROC_FS */
2332 struct proto tcp_prot = {
2334 .owner = THIS_MODULE,
2336 .connect = tcp_v4_connect,
2337 .disconnect = tcp_disconnect,
2338 .accept = tcp_accept,
2340 .init = tcp_v4_init_sock,
2341 .destroy = tcp_v4_destroy_sock,
2342 .shutdown = tcp_shutdown,
2343 .setsockopt = tcp_setsockopt,
2344 .getsockopt = tcp_getsockopt,
2345 .sendmsg = tcp_sendmsg,
2346 .recvmsg = tcp_recvmsg,
2347 .backlog_rcv = tcp_v4_do_rcv,
2348 .hash = tcp_v4_hash,
2349 .unhash = tcp_unhash,
2350 .get_port = tcp_v4_get_port,
2351 .enter_memory_pressure = tcp_enter_memory_pressure,
2352 .sockets_allocated = &tcp_sockets_allocated,
2353 .memory_allocated = &tcp_memory_allocated,
2354 .memory_pressure = &tcp_memory_pressure,
2355 .sysctl_mem = sysctl_tcp_mem,
2356 .sysctl_wmem = sysctl_tcp_wmem,
2357 .sysctl_rmem = sysctl_tcp_rmem,
2358 .max_header = MAX_TCP_HEADER,
2359 .obj_size = sizeof(struct tcp_sock),
2360 .rsk_prot = &tcp_request_sock_ops,
2365 void __init tcp_v4_init(struct net_proto_family *ops)
2367 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2369 panic("Failed to create the TCP control socket.\n");
2370 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2371 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2373 /* Unhash it so that IP input processing does not even
2374 * see it; we do not wish this socket to see incoming packets. */
2377 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2380 EXPORT_SYMBOL(ipv4_specific);
2381 EXPORT_SYMBOL(inet_bind_bucket_create);
2382 EXPORT_SYMBOL(tcp_hashinfo);
2383 EXPORT_SYMBOL(tcp_prot);
2384 EXPORT_SYMBOL(tcp_unhash);
2385 EXPORT_SYMBOL(tcp_v4_conn_request);
2386 EXPORT_SYMBOL(tcp_v4_connect);
2387 EXPORT_SYMBOL(tcp_v4_do_rcv);
2388 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2389 EXPORT_SYMBOL(tcp_v4_send_check);
2390 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2392 #ifdef CONFIG_PROC_FS
2393 EXPORT_SYMBOL(tcp_proc_register);
2394 EXPORT_SYMBOL(tcp_proc_unregister);
2396 EXPORT_SYMBOL(sysctl_local_port_range);
2397 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2398 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);