2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind to
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
105 int sysctl_local_port_range[2] = { 1024, 4999 };
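/* This range is what the net.ipv4.ip_local_port_range sysctl adjusts;
 * for example (assuming the usual sysctl wiring of this array):
 *
 *   echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */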
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
109 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
114 sk_for_each_bound(sk2, node, &tb->owners) {
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
132 /* Obtain a reference to a local port for the given sock;
133 * if snum is zero, select any available local port.
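 * Roughly: when snum is zero we walk the bind hash starting at
 * port_rover looking for a port that nobody owns yet; for an explicit
 * snum we take (or create) that port's bucket and run
 * tcp_bind_conflict() against its current owners before sharing it.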
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
153 rover = tcp_hashinfo.port_rover;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
177 if (unlikely(remaining <= 0))
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its lock.
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
202 if (tcp_bind_conflict(sk, tb))
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
225 spin_unlock(&head->lock);
231 /* Sleeping here without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP.
232 * Look: when several writers sleep and the reader wakes them up, all but one
233 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
234 * this, _but_ remember that it adds useless work on UP machines (a wakeup on each
235 * exclusive lock release). It should really be ifdef'ed.
238 void tcp_listen_wlock(void)
240 write_lock(&tcp_hashinfo.lhash_lock);
242 if (atomic_read(&tcp_hashinfo.lhash_users)) {
246 prepare_to_wait_exclusive(&tcp_hashinfo.lhash_wait,
247 &wait, TASK_UNINTERRUPTIBLE);
248 if (!atomic_read(&tcp_hashinfo.lhash_users))
250 write_unlock_bh(&tcp_hashinfo.lhash_lock);
252 write_lock_bh(&tcp_hashinfo.lhash_lock);
255 finish_wait(&tcp_hashinfo.lhash_wait, &wait);
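/* __tcp_v4_hash() adds an unhashed socket either to the listening hash
 * (when listen_possible and the socket is in TCP_LISTEN) or to the
 * established hash chain selected by inet_sk_ehashfn(); waiters on the
 * listening lock are woken afterwards.
 */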
259 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
261 struct hlist_head *list;
264 BUG_TRAP(sk_unhashed(sk));
265 if (listen_possible && sk->sk_state == TCP_LISTEN) {
266 list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
267 lock = &tcp_hashinfo.lhash_lock;
270 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_hashinfo.ehash_size);
271 list = &tcp_hashinfo.ehash[sk->sk_hashent].chain;
272 lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock;
275 __sk_add_node(sk, list);
276 sock_prot_inc_use(sk->sk_prot);
278 if (listen_possible && sk->sk_state == TCP_LISTEN)
279 wake_up(&tcp_hashinfo.lhash_wait);
282 static void tcp_v4_hash(struct sock *sk)
284 if (sk->sk_state != TCP_CLOSE) {
286 __tcp_v4_hash(sk, 1);
291 void tcp_unhash(struct sock *sk)
298 if (sk->sk_state == TCP_LISTEN) {
301 lock = &tcp_hashinfo.lhash_lock;
303 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent];
305 write_lock_bh(&head->lock);
308 if (__sk_del_node_init(sk))
309 sock_prot_dec_use(sk->sk_prot);
310 write_unlock_bh(lock);
313 if (sk->sk_state == TCP_LISTEN)
314 wake_up(&tcp_hashinfo.lhash_wait);
317 /* Don't inline this cruft. There are some nice properties to
318 * exploit here. The BSD API does not allow a listening TCP
319 * to specify the remote port or the remote address for the
320 * connection, so always assume both are wildcarded
321 * during the search since they can never be otherwise.
323 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
325 const unsigned short hnum,
328 struct sock *result = NULL, *sk;
329 struct hlist_node *node;
333 sk_for_each(sk, node, head) {
334 struct inet_sock *inet = inet_sk(sk);
336 if (inet->num == hnum && !ipv6_only_sock(sk)) {
337 __u32 rcv_saddr = inet->rcv_saddr;
339 score = (sk->sk_family == PF_INET ? 1 : 0);
341 if (rcv_saddr != daddr)
345 if (sk->sk_bound_dev_if) {
346 if (sk->sk_bound_dev_if != dif)
352 if (score > hiscore) {
361 /* Optimize the common listener case. */
362 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
363 const unsigned short hnum,
366 struct sock *sk = NULL;
367 struct hlist_head *head;
369 read_lock(&tcp_hashinfo.lhash_lock);
370 head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
371 if (!hlist_empty(head)) {
372 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
374 if (inet->num == hnum && !sk->sk_node.next &&
375 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
376 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
377 !sk->sk_bound_dev_if)
379 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
385 read_unlock(&tcp_hashinfo.lhash_lock);
389 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
390 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
392 * Local BH must be disabled here.
395 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
401 struct inet_ehash_bucket *head;
402 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
403 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
405 struct hlist_node *node;
406 /* Optimize here for a direct hit; only listening connections can
407 * have wildcards anyway.
409 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
410 head = &tcp_hashinfo.ehash[hash];
411 read_lock(&head->lock);
412 sk_for_each(sk, node, &head->chain) {
413 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
414 goto hit; /* You sunk my battleship! */
417 /* Must check for a TIME_WAIT'er before going to listener hash. */
418 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
419 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
424 read_unlock(&head->lock);
431 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
432 u32 daddr, u16 hnum, int dif)
434 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
437 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
440 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
446 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
452 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
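/* tcp_v4_init_sequence() below derives the initial sequence number for
 * an incoming connection from the addresses and ports of the segment
 * via secure_tcp_sequence_number(), in the spirit of RFC 1948, so ISNs
 * are hard for an attacker to predict.
 */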
454 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
456 return secure_tcp_sequence_number(skb->nh.iph->daddr,
462 /* called with local bh disabled */
463 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
464 struct tcp_tw_bucket **twp)
466 struct inet_sock *inet = inet_sk(sk);
467 u32 daddr = inet->rcv_saddr;
468 u32 saddr = inet->daddr;
469 int dif = sk->sk_bound_dev_if;
470 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
471 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
472 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
473 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
475 struct hlist_node *node;
476 struct tcp_tw_bucket *tw;
478 write_lock(&head->lock);
480 /* Check TIME-WAIT sockets first. */
481 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
482 tw = (struct tcp_tw_bucket *)sk2;
484 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
485 struct tcp_sock *tp = tcp_sk(sk);
487 /* With PAWS, it is safe from the viewpoint
488 of data integrity. Even without PAWS it
489 is safe provided the sequence spaces do not
490 overlap, i.e. at data rates <= 80 Mbit/sec.
492 Actually, the idea is close to VJ's,
493 only the timestamp cache is held not per host
494 but per port pair, and the TW bucket is used
497 If the TW bucket has already been destroyed, we
498 fall back to VJ's scheme and use the initial
499 timestamp retrieved from the peer table.
501 if (tw->tw_ts_recent_stamp &&
502 (!twp || (sysctl_tcp_tw_reuse &&
504 tw->tw_ts_recent_stamp > 1))) {
506 tw->tw_snd_nxt + 65535 + 2) == 0)
508 tp->rx_opt.ts_recent = tw->tw_ts_recent;
509 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
518 /* And established part... */
519 sk_for_each(sk2, node, &head->chain) {
520 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
525 /* Must record num and sport now. Otherwise we will see
526 * a socket with a funny identity in the hash table. */
528 inet->sport = htons(lport);
529 sk->sk_hashent = hash;
530 BUG_TRAP(sk_unhashed(sk));
531 __sk_add_node(sk, &head->chain);
532 sock_prot_inc_use(sk->sk_prot);
533 write_unlock(&head->lock);
537 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
539 /* Silly. Should hash-dance instead... */
540 tcp_tw_deschedule(tw);
541 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
549 write_unlock(&head->lock);
550 return -EADDRNOTAVAIL;
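/* connect_port_offset() below hashes the socket's local and remote
 * addresses (and, per the usual secure_tcp_port_ephemeral() signature,
 * the remote port) into a starting offset for ephemeral port selection,
 * so different destinations probe the port range in different,
 * hard-to-predict orders.
 */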
553 static inline u32 connect_port_offset(const struct sock *sk)
555 const struct inet_sock *inet = inet_sk(sk);
557 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
562 * Bind a port for a connect operation and hash it.
564 static inline int tcp_v4_hash_connect(struct sock *sk)
566 const unsigned short snum = inet_sk(sk)->num;
567 struct inet_bind_hashbucket *head;
568 struct inet_bind_bucket *tb;
572 int low = sysctl_local_port_range[0];
573 int high = sysctl_local_port_range[1];
574 int range = high - low;
578 u32 offset = hint + connect_port_offset(sk);
579 struct hlist_node *node;
580 struct tcp_tw_bucket *tw = NULL;
583 for (i = 1; i <= range; i++) {
584 port = low + (i + offset) % range;
585 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
586 spin_lock(&head->lock);
588 /* Does not bother with rcv_saddr checks,
589 * because the established check is already unique enough.
592 inet_bind_bucket_for_each(tb, node, &head->chain) {
593 if (tb->port == port) {
594 BUG_TRAP(!hlist_empty(&tb->owners));
595 if (tb->fastreuse >= 0)
597 if (!__tcp_v4_check_established(sk,
605 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
607 spin_unlock(&head->lock);
614 spin_unlock(&head->lock);
618 return -EADDRNOTAVAIL;
623 /* Head lock still held and bh's disabled */
624 inet_bind_hash(sk, tb, port);
625 if (sk_unhashed(sk)) {
626 inet_sk(sk)->sport = htons(port);
627 __tcp_v4_hash(sk, 0);
629 spin_unlock(&head->lock);
632 tcp_tw_deschedule(tw);
640 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
641 tb = inet_sk(sk)->bind_hash;
642 spin_lock_bh(&head->lock);
643 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
644 __tcp_v4_hash(sk, 0);
645 spin_unlock_bh(&head->lock);
648 spin_unlock(&head->lock);
649 /* No definite answer... Walk the established hash table */
650 ret = __tcp_v4_check_established(sk, snum, NULL);
657 /* This will initiate an outgoing connection. */
658 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
660 struct inet_sock *inet = inet_sk(sk);
661 struct tcp_sock *tp = tcp_sk(sk);
662 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
668 if (addr_len < sizeof(struct sockaddr_in))
671 if (usin->sin_family != AF_INET)
672 return -EAFNOSUPPORT;
674 nexthop = daddr = usin->sin_addr.s_addr;
675 if (inet->opt && inet->opt->srr) {
678 nexthop = inet->opt->faddr;
681 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
682 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
684 inet->sport, usin->sin_port, sk);
688 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
693 if (!inet->opt || !inet->opt->srr)
697 inet->saddr = rt->rt_src;
698 inet->rcv_saddr = inet->saddr;
700 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
701 /* Reset inherited state */
702 tp->rx_opt.ts_recent = 0;
703 tp->rx_opt.ts_recent_stamp = 0;
707 if (sysctl_tcp_tw_recycle &&
708 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
709 struct inet_peer *peer = rt_get_peer(rt);
711 /* VJ's idea. We save the last timestamp seen from
712 * the destination in the peer table when entering state TIME-WAIT,
713 * and initialize rx_opt.ts_recent from it when trying a new connection.
716 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
717 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
718 tp->rx_opt.ts_recent = peer->tcp_ts;
722 inet->dport = usin->sin_port;
725 tp->ext_header_len = 0;
727 tp->ext_header_len = inet->opt->optlen;
729 tp->rx_opt.mss_clamp = 536;
731 /* Socket identity is still unknown (sport may be zero).
732 * However, we set the state to SYN-SENT and, without releasing the socket
733 * lock, select a source port, enter ourselves into the hash tables and
734 * complete initialization after this.
736 tcp_set_state(sk, TCP_SYN_SENT);
737 err = tcp_v4_hash_connect(sk);
741 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
745 /* OK, now commit destination to socket. */
746 sk_setup_caps(sk, &rt->u.dst);
749 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
754 inet->id = tp->write_seq ^ jiffies;
756 err = tcp_connect(sk);
764 /* This unhashes the socket and releases the local port, if necessary. */
765 tcp_set_state(sk, TCP_CLOSE);
767 sk->sk_route_caps = 0;
772 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
774 return ((struct rtable *)skb->dst)->rt_iif;
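/* tcp_v4_synq_hash() spreads pending connection requests over the
 * listener's SYN table: a jhash of the remote address and port, salted
 * with a per-listener random value so remote hosts cannot aim for a
 * single chain, masked down to TCP_SYNQ_HSIZE buckets.
 */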
777 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
779 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
782 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
783 struct request_sock ***prevp,
785 __u32 raddr, __u32 laddr)
787 struct listen_sock *lopt = tp->accept_queue.listen_opt;
788 struct request_sock *req, **prev;
790 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
791 (req = *prev) != NULL;
792 prev = &req->dl_next) {
793 const struct inet_request_sock *ireq = inet_rsk(req);
795 if (ireq->rmt_port == rport &&
796 ireq->rmt_addr == raddr &&
797 ireq->loc_addr == laddr &&
798 TCP_INET_FAMILY(req->rsk_ops->family)) {
808 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
810 struct tcp_sock *tp = tcp_sk(sk);
811 struct listen_sock *lopt = tp->accept_queue.listen_opt;
812 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
814 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
820 * This routine does path mtu discovery as defined in RFC1191.
822 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
825 struct dst_entry *dst;
826 struct inet_sock *inet = inet_sk(sk);
827 struct tcp_sock *tp = tcp_sk(sk);
829 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
830 * sent out by Linux are always < 576 bytes, so they should go through
833 if (sk->sk_state == TCP_LISTEN)
836 /* We don't check in the dst entry whether pmtu discovery is forbidden
837 * on this route. We just assume that no packet-too-big packets
838 * are sent back when pmtu discovery is not active.
839 * There is a small race when the user changes this flag in the
840 * route, but I think that's acceptable.
842 if ((dst = __sk_dst_check(sk, 0)) == NULL)
845 dst->ops->update_pmtu(dst, mtu);
847 /* Something is about to go wrong... Remember the soft error
848 * in case this connection is not able to recover.
850 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
851 sk->sk_err_soft = EMSGSIZE;
855 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
856 tp->pmtu_cookie > mtu) {
857 tcp_sync_mss(sk, mtu);
859 /* Resend the TCP packet because it's
860 * clear that the old packet has been
861 * dropped. This is the new "fast" path mtu discovery.
864 tcp_simple_retransmit(sk);
865 } /* else let the usual retransmit timer handle it */
869 * This routine is called by the ICMP module when it gets some
870 * sort of error condition. If err < 0 then the socket should
871 * be closed and the error returned to the user. If err > 0
872 * it's just the icmp type << 8 | icmp code. After adjustment
873 * header points to the first 8 bytes of the tcp header. We need
874 * to find the appropriate port.
876 * The locking strategy used here is very "optimistic". When
877 * someone else accesses the socket the ICMP is just dropped
878 * and for some paths there is no check at all.
879 * A more general error queue to queue errors for later handling
880 * is probably better.
884 void tcp_v4_err(struct sk_buff *skb, u32 info)
886 struct iphdr *iph = (struct iphdr *)skb->data;
887 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
889 struct inet_sock *inet;
890 int type = skb->h.icmph->type;
891 int code = skb->h.icmph->code;
896 if (skb->len < (iph->ihl << 2) + 8) {
897 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
901 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
902 th->source, tcp_v4_iif(skb));
904 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
907 if (sk->sk_state == TCP_TIME_WAIT) {
908 tcp_tw_put((struct tcp_tw_bucket *)sk);
913 /* If too many ICMPs get dropped on busy
914 * servers this needs to be solved differently.
916 if (sock_owned_by_user(sk))
917 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
919 if (sk->sk_state == TCP_CLOSE)
923 seq = ntohl(th->seq);
924 if (sk->sk_state != TCP_LISTEN &&
925 !between(seq, tp->snd_una, tp->snd_nxt)) {
926 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
931 case ICMP_SOURCE_QUENCH:
932 /* Just silently ignore these. */
934 case ICMP_PARAMETERPROB:
937 case ICMP_DEST_UNREACH:
938 if (code > NR_ICMP_UNREACH)
941 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
942 if (!sock_owned_by_user(sk))
943 do_pmtu_discovery(sk, iph, info);
947 err = icmp_err_convert[code].errno;
949 case ICMP_TIME_EXCEEDED:
956 switch (sk->sk_state) {
957 struct request_sock *req, **prev;
959 if (sock_owned_by_user(sk))
962 req = tcp_v4_search_req(tp, &prev, th->dest,
963 iph->daddr, iph->saddr);
967 /* ICMPs are not backlogged, hence we cannot get
968 an established socket here.
972 if (seq != tcp_rsk(req)->snt_isn) {
973 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
978 * Still in SYN_RECV, just remove it silently.
979 * There is no good way to pass the error to the newly
980 * created socket, and POSIX does not want network
981 * errors returned from accept().
983 tcp_synq_drop(sk, req, prev);
987 case TCP_SYN_RECV: /* Cannot happen.
988 It can, e.g., if SYNs crossed.
990 if (!sock_owned_by_user(sk)) {
991 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
994 sk->sk_error_report(sk);
998 sk->sk_err_soft = err;
1003 /* If we've already connected we will keep trying
1004 * until we time out, or the user gives up.
1006 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1007 * to be considered hard errors (well, FRAG_FAILED too,
1008 * but it is obsoleted by pmtu discovery).
1010 * Note that in the modern internet, where routing is unreliable
1011 * and broken firewalls sit in every dark corner sending random
1012 * errors ordered by their masters, even these two messages finally lose
1013 * their original sense (even Linux sends invalid PORT_UNREACHs).
1015 * Now we are in compliance with the RFCs.
1020 if (!sock_owned_by_user(sk) && inet->recverr) {
1022 sk->sk_error_report(sk);
1023 } else { /* Only an error on timeout */
1024 sk->sk_err_soft = err;
1032 /* This routine computes an IPv4 TCP checksum. */
1033 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1034 struct sk_buff *skb)
1036 struct inet_sock *inet = inet_sk(sk);
1038 if (skb->ip_summed == CHECKSUM_HW) {
1039 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1040 skb->csum = offsetof(struct tcphdr, check);
1042 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1043 csum_partial((char *)th,
1050 * This routine will send an RST to the other TCP.
1052 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1054 * Answer: if a packet caused an RST, it is not for a socket
1055 * existing in our system; if it is matched to a socket,
1056 * it is just a duplicate segment or a bug in the other side's TCP.
1057 * So we build the reply based only on parameters
1058 * arriving with the segment.
1059 * Exception: precedence violation. We do not implement it in any case.
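 *
 * The sequence numbers chosen below follow the RFC 793 reset generation
 * rules: if the offending segment carried an ACK, the RST's seq is taken
 * from that ack_seq; otherwise the RST's ack_seq acknowledges everything
 * the segment occupied (seq + SYN + FIN + payload length).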
1062 static void tcp_v4_send_reset(struct sk_buff *skb)
1064 struct tcphdr *th = skb->h.th;
1066 struct ip_reply_arg arg;
1068 /* Never send a reset in response to a reset. */
1072 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1075 /* Swap the send and the receive. */
1076 memset(&rth, 0, sizeof(struct tcphdr));
1077 rth.dest = th->source;
1078 rth.source = th->dest;
1079 rth.doff = sizeof(struct tcphdr) / 4;
1083 rth.seq = th->ack_seq;
1086 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1087 skb->len - (th->doff << 2));
1090 memset(&arg, 0, sizeof arg);
1091 arg.iov[0].iov_base = (unsigned char *)&rth;
1092 arg.iov[0].iov_len = sizeof rth;
1093 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1094 skb->nh.iph->saddr, /*XXX*/
1095 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1096 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1098 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1100 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1101 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1104 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1105 outside socket context, is certainly ugly. What can I do?
1108 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1111 struct tcphdr *th = skb->h.th;
1116 struct ip_reply_arg arg;
1118 memset(&rep.th, 0, sizeof(struct tcphdr));
1119 memset(&arg, 0, sizeof arg);
1121 arg.iov[0].iov_base = (unsigned char *)&rep;
1122 arg.iov[0].iov_len = sizeof(rep.th);
1124 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1125 (TCPOPT_TIMESTAMP << 8) |
1127 rep.tsopt[1] = htonl(tcp_time_stamp);
1128 rep.tsopt[2] = htonl(ts);
1129 arg.iov[0].iov_len = sizeof(rep);
1132 /* Swap the send and the receive. */
1133 rep.th.dest = th->source;
1134 rep.th.source = th->dest;
1135 rep.th.doff = arg.iov[0].iov_len / 4;
1136 rep.th.seq = htonl(seq);
1137 rep.th.ack_seq = htonl(ack);
1139 rep.th.window = htons(win);
1141 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1142 skb->nh.iph->saddr, /*XXX*/
1143 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1144 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1146 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1148 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1151 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1153 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1155 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1156 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1161 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1163 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1167 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1168 struct request_sock *req)
1171 const struct inet_request_sock *ireq = inet_rsk(req);
1172 struct ip_options *opt = inet_rsk(req)->opt;
1173 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1175 { .daddr = ((opt && opt->srr) ?
1178 .saddr = ireq->loc_addr,
1179 .tos = RT_CONN_FLAGS(sk) } },
1180 .proto = IPPROTO_TCP,
1182 { .sport = inet_sk(sk)->sport,
1183 .dport = ireq->rmt_port } } };
1185 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1186 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1189 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1191 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1198 * Send a SYN-ACK after having received a SYN.
1199 * This still operates on a request_sock only, not on a big socket.
1202 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1203 struct dst_entry *dst)
1205 const struct inet_request_sock *ireq = inet_rsk(req);
1207 struct sk_buff * skb;
1209 /* First, grab a route. */
1210 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1213 skb = tcp_make_synack(sk, dst, req);
1216 struct tcphdr *th = skb->h.th;
1218 th->check = tcp_v4_check(th, skb->len,
1221 csum_partial((char *)th, skb->len,
1224 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1227 if (err == NET_XMIT_CN)
1237 * IPv4 request_sock destructor.
1239 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1241 if (inet_rsk(req)->opt)
1242 kfree(inet_rsk(req)->opt);
1245 static inline void syn_flood_warning(struct sk_buff *skb)
1247 static unsigned long warntime;
1249 if (time_after(jiffies, (warntime + HZ * 60))) {
1252 "possible SYN flooding on port %d. Sending cookies.\n",
1253 ntohs(skb->h.th->dest));
1258 * Save and compile IPv4 options into the request_sock if needed.
1260 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1261 struct sk_buff *skb)
1263 struct ip_options *opt = &(IPCB(skb)->opt);
1264 struct ip_options *dopt = NULL;
1266 if (opt && opt->optlen) {
1267 int opt_size = optlength(opt);
1268 dopt = kmalloc(opt_size, GFP_ATOMIC);
1270 if (ip_options_echo(dopt, skb)) {
1279 struct request_sock_ops tcp_request_sock_ops = {
1281 .obj_size = sizeof(struct tcp_request_sock),
1282 .rtx_syn_ack = tcp_v4_send_synack,
1283 .send_ack = tcp_v4_reqsk_send_ack,
1284 .destructor = tcp_v4_reqsk_destructor,
1285 .send_reset = tcp_v4_send_reset,
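/* tcp_v4_conn_request() handles an incoming SYN on a listening socket:
 * it allocates a request_sock, parses the TCP options, falls back to
 * syncookies when the SYN queue is full (if compiled in), applies the
 * TIME-WAIT recycling / peer-timestamp checks, picks an ISN and sends
 * the SYN-ACK, then queues the request on the SYN table.
 */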
1288 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1290 struct inet_request_sock *ireq;
1291 struct tcp_options_received tmp_opt;
1292 struct request_sock *req;
1293 __u32 saddr = skb->nh.iph->saddr;
1294 __u32 daddr = skb->nh.iph->daddr;
1295 __u32 isn = TCP_SKB_CB(skb)->when;
1296 struct dst_entry *dst = NULL;
1297 #ifdef CONFIG_SYN_COOKIES
1298 int want_cookie = 0;
1300 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1303 /* Never answer SYNs sent to broadcast or multicast */
1304 if (((struct rtable *)skb->dst)->rt_flags &
1305 (RTCF_BROADCAST | RTCF_MULTICAST))
1308 /* TW buckets are converted to open requests without
1309 * limitation; they conserve resources and the peer is
1310 * evidently a real one.
1312 if (tcp_synq_is_full(sk) && !isn) {
1313 #ifdef CONFIG_SYN_COOKIES
1314 if (sysctl_tcp_syncookies) {
1321 /* Accept backlog is full. If we have already queued enough
1322 * warm entries in the syn queue, drop the request. It is better than
1323 * clogging the syn queue with openreqs with exponentially increasing timeout.
1326 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1329 req = reqsk_alloc(&tcp_request_sock_ops);
1333 tcp_clear_options(&tmp_opt);
1334 tmp_opt.mss_clamp = 536;
1335 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1337 tcp_parse_options(skb, &tmp_opt, 0);
1340 tcp_clear_options(&tmp_opt);
1341 tmp_opt.saw_tstamp = 0;
1344 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1345 /* Some OSes (unknown ones, but I see them on web servers which
1346 * contain information interesting only for Windows
1347 * users) do not send their timestamp in the SYN. It is an easy case:
1348 * we simply do not advertise TS support.
1350 tmp_opt.saw_tstamp = 0;
1351 tmp_opt.tstamp_ok = 0;
1353 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1355 tcp_openreq_init(req, &tmp_opt, skb);
1357 ireq = inet_rsk(req);
1358 ireq->loc_addr = daddr;
1359 ireq->rmt_addr = saddr;
1360 ireq->opt = tcp_v4_save_options(sk, skb);
1362 TCP_ECN_create_request(req, skb->h.th);
1365 #ifdef CONFIG_SYN_COOKIES
1366 syn_flood_warning(skb);
1368 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1370 struct inet_peer *peer = NULL;
1372 /* VJ's idea. We save the last timestamp seen
1373 * from the destination in the peer table when entering
1374 * state TIME-WAIT, and check against it before
1375 * accepting a new connection request.
1377 * If "isn" is not zero, this request hit a live
1378 * timewait bucket, so all the necessary checks
1379 * are made in the function processing the timewait state.
1381 if (tmp_opt.saw_tstamp &&
1382 sysctl_tcp_tw_recycle &&
1383 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1384 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1385 peer->v4daddr == saddr) {
1386 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1387 (s32)(peer->tcp_ts - req->ts_recent) >
1389 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1394 /* Kill the following clause, if you dislike this way. */
1395 else if (!sysctl_tcp_syncookies &&
1396 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1397 (sysctl_max_syn_backlog >> 2)) &&
1398 (!peer || !peer->tcp_ts_stamp) &&
1399 (!dst || !dst_metric(dst, RTAX_RTT))) {
1400 /* Without syncookies, the last quarter of
1401 * the backlog is filled with destinations
1402 * proven to be alive.
1403 * It means that we continue to communicate
1404 * with destinations already remembered
1405 * up to the moment of the synflood.
1407 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1408 "request from %u.%u."
1411 ntohs(skb->h.th->source)));
1416 isn = tcp_v4_init_sequence(sk, skb);
1418 tcp_rsk(req)->snt_isn = isn;
1420 if (tcp_v4_send_synack(sk, req, dst))
1426 tcp_v4_synq_add(sk, req);
1433 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1439 * The three way handshake has completed - we got a valid ACK -
1440 * now create the new socket.
1442 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1443 struct request_sock *req,
1444 struct dst_entry *dst)
1446 struct inet_request_sock *ireq;
1447 struct inet_sock *newinet;
1448 struct tcp_sock *newtp;
1451 if (sk_acceptq_is_full(sk))
1454 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1457 newsk = tcp_create_openreq_child(sk, req, skb);
1461 sk_setup_caps(newsk, dst);
1463 newtp = tcp_sk(newsk);
1464 newinet = inet_sk(newsk);
1465 ireq = inet_rsk(req);
1466 newinet->daddr = ireq->rmt_addr;
1467 newinet->rcv_saddr = ireq->loc_addr;
1468 newinet->saddr = ireq->loc_addr;
1469 newinet->opt = ireq->opt;
1471 newinet->mc_index = tcp_v4_iif(skb);
1472 newinet->mc_ttl = skb->nh.iph->ttl;
1473 newtp->ext_header_len = 0;
1475 newtp->ext_header_len = newinet->opt->optlen;
1476 newinet->id = newtp->write_seq ^ jiffies;
1478 tcp_sync_mss(newsk, dst_mtu(dst));
1479 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1480 tcp_initialize_rcv_mss(newsk);
1482 __tcp_v4_hash(newsk, 0);
1483 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1488 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1490 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1495 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1497 struct tcphdr *th = skb->h.th;
1498 struct iphdr *iph = skb->nh.iph;
1499 struct tcp_sock *tp = tcp_sk(sk);
1501 struct request_sock **prev;
1502 /* Find possible connection requests. */
1503 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1504 iph->saddr, iph->daddr);
1506 return tcp_check_req(sk, skb, req, prev);
1508 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1515 if (nsk->sk_state != TCP_TIME_WAIT) {
1519 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1523 #ifdef CONFIG_SYN_COOKIES
1524 if (!th->rst && !th->syn && th->ack)
1525 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1530 static int tcp_v4_checksum_init(struct sk_buff *skb)
1532 if (skb->ip_summed == CHECKSUM_HW) {
1533 skb->ip_summed = CHECKSUM_UNNECESSARY;
1534 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1535 skb->nh.iph->daddr, skb->csum))
1538 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1539 skb->ip_summed = CHECKSUM_NONE;
1541 if (skb->len <= 76) {
1542 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1544 skb_checksum(skb, 0, skb->len, 0)))
1546 skb->ip_summed = CHECKSUM_UNNECESSARY;
1548 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1550 skb->nh.iph->daddr, 0);
1556 /* The socket must have its spinlock held when we get
1559 * We have a potential double-lock case here, so even when
1560 * doing backlog processing we use the BH locking scheme.
1561 * This is because we cannot sleep with the original spinlock
1564 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1566 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1567 TCP_CHECK_TIMER(sk);
1568 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1570 TCP_CHECK_TIMER(sk);
1574 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1577 if (sk->sk_state == TCP_LISTEN) {
1578 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1583 if (tcp_child_process(sk, nsk, skb))
1589 TCP_CHECK_TIMER(sk);
1590 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1592 TCP_CHECK_TIMER(sk);
1596 tcp_v4_send_reset(skb);
1599 /* Be careful here. If this function gets more complicated and
1600 * gcc suffers from register pressure on the x86, sk (in %ebx)
1601 * might be destroyed here. This current version compiles correctly,
1602 * but you have been warned.
1607 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1615 int tcp_v4_rcv(struct sk_buff *skb)
1621 if (skb->pkt_type != PACKET_HOST)
1624 /* Count it even if it's bad */
1625 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1627 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1632 if (th->doff < sizeof(struct tcphdr) / 4)
1634 if (!pskb_may_pull(skb, th->doff * 4))
1637 /* An explanation is required here, I think.
1638 * Packet length and doff are validated by header prediction,
1639 * provided the case of th->doff == 0 is eliminated.
1640 * So, we defer the checks. */
1641 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1642 tcp_v4_checksum_init(skb) < 0))
1646 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1647 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1648 skb->len - th->doff * 4);
1649 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1650 TCP_SKB_CB(skb)->when = 0;
1651 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1652 TCP_SKB_CB(skb)->sacked = 0;
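/* Look the segment up: first in the established hash (whose upper half
 * holds the TIME-WAIT buckets), then, failing that, in the listening
 * hash; see __tcp_v4_lookup() above.
 */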
1654 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1655 skb->nh.iph->daddr, ntohs(th->dest),
1662 if (sk->sk_state == TCP_TIME_WAIT)
1665 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1666 goto discard_and_relse;
1668 if (sk_filter(sk, skb, 0))
1669 goto discard_and_relse;
1675 if (!sock_owned_by_user(sk)) {
1676 if (!tcp_prequeue(sk, skb))
1677 ret = tcp_v4_do_rcv(sk, skb);
1679 sk_add_backlog(sk, skb);
1687 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1690 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1692 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1694 tcp_v4_send_reset(skb);
1698 /* Discard frame. */
1707 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1708 tcp_tw_put((struct tcp_tw_bucket *) sk);
1712 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1713 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1714 tcp_tw_put((struct tcp_tw_bucket *) sk);
1717 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1718 skb, th, skb->len)) {
1720 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1724 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1725 tcp_tw_put((struct tcp_tw_bucket *)sk);
1729 /* Fall through to ACK */
1732 tcp_v4_timewait_ack(sk, skb);
1736 case TCP_TW_SUCCESS:;
1741 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1743 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1744 struct inet_sock *inet = inet_sk(sk);
1746 sin->sin_family = AF_INET;
1747 sin->sin_addr.s_addr = inet->daddr;
1748 sin->sin_port = inet->dport;
1751 /* VJ's idea. Save the last timestamp seen from this destination
1752 * and hold it at least for the normal timewait interval, to use for duplicate
1753 * segment detection in subsequent connections before they enter the synchronized state.
1757 int tcp_v4_remember_stamp(struct sock *sk)
1759 struct inet_sock *inet = inet_sk(sk);
1760 struct tcp_sock *tp = tcp_sk(sk);
1761 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1762 struct inet_peer *peer = NULL;
1765 if (!rt || rt->rt_dst != inet->daddr) {
1766 peer = inet_getpeer(inet->daddr, 1);
1770 rt_bind_peer(rt, 1);
1775 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1776 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1777 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1778 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1779 peer->tcp_ts = tp->rx_opt.ts_recent;
1789 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1791 struct inet_peer *peer = NULL;
1793 peer = inet_getpeer(tw->tw_daddr, 1);
1796 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1797 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1798 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1799 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1800 peer->tcp_ts = tw->tw_ts_recent;
1809 struct tcp_func ipv4_specific = {
1810 .queue_xmit = ip_queue_xmit,
1811 .send_check = tcp_v4_send_check,
1812 .rebuild_header = inet_sk_rebuild_header,
1813 .conn_request = tcp_v4_conn_request,
1814 .syn_recv_sock = tcp_v4_syn_recv_sock,
1815 .remember_stamp = tcp_v4_remember_stamp,
1816 .net_header_len = sizeof(struct iphdr),
1817 .setsockopt = ip_setsockopt,
1818 .getsockopt = ip_getsockopt,
1819 .addr2sockaddr = v4_addr2sockaddr,
1820 .sockaddr_len = sizeof(struct sockaddr_in),
1823 /* NOTE: A lot of things are set to zero explicitly by the call to
1824 * sk_alloc(), so they need not be done here.
1826 static int tcp_v4_init_sock(struct sock *sk)
1828 struct tcp_sock *tp = tcp_sk(sk);
1830 skb_queue_head_init(&tp->out_of_order_queue);
1831 tcp_init_xmit_timers(sk);
1832 tcp_prequeue_init(tp);
1834 tp->rto = TCP_TIMEOUT_INIT;
1835 tp->mdev = TCP_TIMEOUT_INIT;
1837 /* So many TCP implementations out there (incorrectly) count the
1838 * initial SYN frame in their delayed-ACK and congestion control
1839 * algorithms that we must have the following bandaid to talk
1840 * efficiently to them. -DaveM
1844 /* See draft-stevens-tcpca-spec-01 for discussion of the
1845 * initialization of these values.
1847 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1848 tp->snd_cwnd_clamp = ~0;
1849 tp->mss_cache = 536;
1851 tp->reordering = sysctl_tcp_reordering;
1852 tp->ca_ops = &tcp_init_congestion_ops;
1854 sk->sk_state = TCP_CLOSE;
1856 sk->sk_write_space = sk_stream_write_space;
1857 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1859 tp->af_specific = &ipv4_specific;
1861 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1862 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1864 atomic_inc(&tcp_sockets_allocated);
1869 int tcp_v4_destroy_sock(struct sock *sk)
1871 struct tcp_sock *tp = tcp_sk(sk);
1873 tcp_clear_xmit_timers(sk);
1875 tcp_cleanup_congestion_control(tp);
1877 /* Clean up the write buffer. */
1878 sk_stream_writequeue_purge(sk);
1880 /* Cleans up our, hopefully empty, out_of_order_queue. */
1881 __skb_queue_purge(&tp->out_of_order_queue);
1883 /* Clean the prequeue; it really must be empty */
1884 __skb_queue_purge(&tp->ucopy.prequeue);
1886 /* Clean up a referenced TCP bind bucket. */
1887 if (inet_sk(sk)->bind_hash)
1888 inet_put_port(&tcp_hashinfo, sk);
1891 * If sendmsg cached page exists, toss it.
1893 if (sk->sk_sndmsg_page) {
1894 __free_page(sk->sk_sndmsg_page);
1895 sk->sk_sndmsg_page = NULL;
1898 atomic_dec(&tcp_sockets_allocated);
1903 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1905 #ifdef CONFIG_PROC_FS
1906 /* Proc filesystem TCP sock list dumping. */
1908 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1910 return hlist_empty(head) ? NULL :
1911 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1914 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1916 return tw->tw_node.next ?
1917 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
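/* /proc/net/tcp dumping walks the tables in a fixed order: listening
 * sockets (with their pending open requests) first, then established
 * sockets, then the TIME-WAIT half of each established bucket; st->state
 * in struct tcp_iter_state tracks where the iterator currently is.
 */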
1920 static void *listening_get_next(struct seq_file *seq, void *cur)
1922 struct tcp_sock *tp;
1923 struct hlist_node *node;
1924 struct sock *sk = cur;
1925 struct tcp_iter_state* st = seq->private;
1929 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1935 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1936 struct request_sock *req = cur;
1938 tp = tcp_sk(st->syn_wait_sk);
1942 if (req->rsk_ops->family == st->family) {
1948 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1951 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1953 sk = sk_next(st->syn_wait_sk);
1954 st->state = TCP_SEQ_STATE_LISTENING;
1955 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1958 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1959 if (reqsk_queue_len(&tp->accept_queue))
1961 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1965 sk_for_each_from(sk, node) {
1966 if (sk->sk_family == st->family) {
1971 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1972 if (reqsk_queue_len(&tp->accept_queue)) {
1974 st->uid = sock_i_uid(sk);
1975 st->syn_wait_sk = sk;
1976 st->state = TCP_SEQ_STATE_OPENREQ;
1980 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1982 if (++st->bucket < INET_LHTABLE_SIZE) {
1983 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1991 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1993 void *rc = listening_get_next(seq, NULL);
1995 while (rc && *pos) {
1996 rc = listening_get_next(seq, rc);
2002 static void *established_get_first(struct seq_file *seq)
2004 struct tcp_iter_state* st = seq->private;
2007 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2009 struct hlist_node *node;
2010 struct tcp_tw_bucket *tw;
2012 /* We can reschedule _before_ having picked the target: */
2013 cond_resched_softirq();
2015 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2016 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2017 if (sk->sk_family != st->family) {
2023 st->state = TCP_SEQ_STATE_TIME_WAIT;
2024 tw_for_each(tw, node,
2025 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
2026 if (tw->tw_family != st->family) {
2032 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2033 st->state = TCP_SEQ_STATE_ESTABLISHED;
2039 static void *established_get_next(struct seq_file *seq, void *cur)
2041 struct sock *sk = cur;
2042 struct tcp_tw_bucket *tw;
2043 struct hlist_node *node;
2044 struct tcp_iter_state* st = seq->private;
2048 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2052 while (tw && tw->tw_family != st->family) {
2059 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2060 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062 /* We can reschedule between buckets: */
2063 cond_resched_softirq();
2065 if (++st->bucket < tcp_hashinfo.ehash_size) {
2066 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2067 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2075 sk_for_each_from(sk, node) {
2076 if (sk->sk_family == st->family)
2080 st->state = TCP_SEQ_STATE_TIME_WAIT;
2081 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2089 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2091 void *rc = established_get_first(seq);
2094 rc = established_get_next(seq, rc);
2100 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2103 struct tcp_iter_state* st = seq->private;
2106 st->state = TCP_SEQ_STATE_LISTENING;
2107 rc = listening_get_idx(seq, &pos);
2110 tcp_listen_unlock();
2112 st->state = TCP_SEQ_STATE_ESTABLISHED;
2113 rc = established_get_idx(seq, pos);
2119 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2121 struct tcp_iter_state* st = seq->private;
2122 st->state = TCP_SEQ_STATE_LISTENING;
2124 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2127 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2130 struct tcp_iter_state* st;
2132 if (v == SEQ_START_TOKEN) {
2133 rc = tcp_get_idx(seq, 0);
2138 switch (st->state) {
2139 case TCP_SEQ_STATE_OPENREQ:
2140 case TCP_SEQ_STATE_LISTENING:
2141 rc = listening_get_next(seq, v);
2143 tcp_listen_unlock();
2145 st->state = TCP_SEQ_STATE_ESTABLISHED;
2146 rc = established_get_first(seq);
2149 case TCP_SEQ_STATE_ESTABLISHED:
2150 case TCP_SEQ_STATE_TIME_WAIT:
2151 rc = established_get_next(seq, v);
2159 static void tcp_seq_stop(struct seq_file *seq, void *v)
2161 struct tcp_iter_state* st = seq->private;
2163 switch (st->state) {
2164 case TCP_SEQ_STATE_OPENREQ:
2166 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2167 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2169 case TCP_SEQ_STATE_LISTENING:
2170 if (v != SEQ_START_TOKEN)
2171 tcp_listen_unlock();
2173 case TCP_SEQ_STATE_TIME_WAIT:
2174 case TCP_SEQ_STATE_ESTABLISHED:
2176 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2182 static int tcp_seq_open(struct inode *inode, struct file *file)
2184 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2185 struct seq_file *seq;
2186 struct tcp_iter_state *s;
2189 if (unlikely(afinfo == NULL))
2192 s = kmalloc(sizeof(*s), GFP_KERNEL);
2195 memset(s, 0, sizeof(*s));
2196 s->family = afinfo->family;
2197 s->seq_ops.start = tcp_seq_start;
2198 s->seq_ops.next = tcp_seq_next;
2199 s->seq_ops.show = afinfo->seq_show;
2200 s->seq_ops.stop = tcp_seq_stop;
2202 rc = seq_open(file, &s->seq_ops);
2205 seq = file->private_data;
2214 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2217 struct proc_dir_entry *p;
2221 afinfo->seq_fops->owner = afinfo->owner;
2222 afinfo->seq_fops->open = tcp_seq_open;
2223 afinfo->seq_fops->read = seq_read;
2224 afinfo->seq_fops->llseek = seq_lseek;
2225 afinfo->seq_fops->release = seq_release_private;
2227 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2235 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2239 proc_net_remove(afinfo->name);
2240 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2243 static void get_openreq4(struct sock *sk, struct request_sock *req,
2244 char *tmpbuf, int i, int uid)
2246 const struct inet_request_sock *ireq = inet_rsk(req);
2247 int ttd = req->expires - jiffies;
2249 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2250 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2253 ntohs(inet_sk(sk)->sport),
2255 ntohs(ireq->rmt_port),
2257 0, 0, /* could print option size, but that is af dependent. */
2258 1, /* timers active (only the expire timer) */
2259 jiffies_to_clock_t(ttd),
2262 0, /* non standard timer */
2263 0, /* open_requests have no inode */
2264 atomic_read(&sk->sk_refcnt),
2268 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2271 unsigned long timer_expires;
2272 struct tcp_sock *tp = tcp_sk(sp);
2273 struct inet_sock *inet = inet_sk(sp);
2274 unsigned int dest = inet->daddr;
2275 unsigned int src = inet->rcv_saddr;
2276 __u16 destp = ntohs(inet->dport);
2277 __u16 srcp = ntohs(inet->sport);
2279 if (tp->pending == TCP_TIME_RETRANS) {
2281 timer_expires = tp->timeout;
2282 } else if (tp->pending == TCP_TIME_PROBE0) {
2284 timer_expires = tp->timeout;
2285 } else if (timer_pending(&sp->sk_timer)) {
2287 timer_expires = sp->sk_timer.expires;
2290 timer_expires = jiffies;
2293 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2294 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2295 i, src, srcp, dest, destp, sp->sk_state,
2296 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2298 jiffies_to_clock_t(timer_expires - jiffies),
2303 atomic_read(&sp->sk_refcnt), sp,
2304 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2306 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2309 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2311 unsigned int dest, src;
2313 int ttd = tw->tw_ttd - jiffies;
2318 dest = tw->tw_daddr;
2319 src = tw->tw_rcv_saddr;
2320 destp = ntohs(tw->tw_dport);
2321 srcp = ntohs(tw->tw_sport);
2323 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2324 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2325 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2326 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2327 atomic_read(&tw->tw_refcnt), tw);
2332 static int tcp4_seq_show(struct seq_file *seq, void *v)
2334 struct tcp_iter_state* st;
2335 char tmpbuf[TMPSZ + 1];
2337 if (v == SEQ_START_TOKEN) {
2338 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2339 " sl local_address rem_address st tx_queue "
2340 "rx_queue tr tm->when retrnsmt uid timeout "
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_LISTENING:
2348 case TCP_SEQ_STATE_ESTABLISHED:
2349 get_tcp4_sock(v, tmpbuf, st->num);
2351 case TCP_SEQ_STATE_OPENREQ:
2352 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2354 case TCP_SEQ_STATE_TIME_WAIT:
2355 get_timewait4_sock(v, tmpbuf, st->num);
2358 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2363 static struct file_operations tcp4_seq_fops;
2364 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2365 .owner = THIS_MODULE,
2368 .seq_show = tcp4_seq_show,
2369 .seq_fops = &tcp4_seq_fops,
2372 int __init tcp4_proc_init(void)
2374 return tcp_proc_register(&tcp4_seq_afinfo);
2377 void tcp4_proc_exit(void)
2379 tcp_proc_unregister(&tcp4_seq_afinfo);
2381 #endif /* CONFIG_PROC_FS */
2383 struct proto tcp_prot = {
2385 .owner = THIS_MODULE,
2387 .connect = tcp_v4_connect,
2388 .disconnect = tcp_disconnect,
2389 .accept = tcp_accept,
2391 .init = tcp_v4_init_sock,
2392 .destroy = tcp_v4_destroy_sock,
2393 .shutdown = tcp_shutdown,
2394 .setsockopt = tcp_setsockopt,
2395 .getsockopt = tcp_getsockopt,
2396 .sendmsg = tcp_sendmsg,
2397 .recvmsg = tcp_recvmsg,
2398 .backlog_rcv = tcp_v4_do_rcv,
2399 .hash = tcp_v4_hash,
2400 .unhash = tcp_unhash,
2401 .get_port = tcp_v4_get_port,
2402 .enter_memory_pressure = tcp_enter_memory_pressure,
2403 .sockets_allocated = &tcp_sockets_allocated,
2404 .memory_allocated = &tcp_memory_allocated,
2405 .memory_pressure = &tcp_memory_pressure,
2406 .sysctl_mem = sysctl_tcp_mem,
2407 .sysctl_wmem = sysctl_tcp_wmem,
2408 .sysctl_rmem = sysctl_tcp_rmem,
2409 .max_header = MAX_TCP_HEADER,
2410 .obj_size = sizeof(struct tcp_sock),
2411 .rsk_prot = &tcp_request_sock_ops,
2416 void __init tcp_v4_init(struct net_proto_family *ops)
2418 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2420 panic("Failed to create the TCP control socket.\n");
2421 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2422 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2424 /* Unhash it so that IP input processing does not even
2425 * see it; we do not wish this socket to see incoming
2428 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2431 EXPORT_SYMBOL(ipv4_specific);
2432 EXPORT_SYMBOL(inet_bind_bucket_create);
2433 EXPORT_SYMBOL(tcp_hashinfo);
2434 EXPORT_SYMBOL(tcp_listen_wlock);
2435 EXPORT_SYMBOL(tcp_prot);
2436 EXPORT_SYMBOL(tcp_unhash);
2437 EXPORT_SYMBOL(tcp_v4_conn_request);
2438 EXPORT_SYMBOL(tcp_v4_connect);
2439 EXPORT_SYMBOL(tcp_v4_do_rcv);
2440 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2441 EXPORT_SYMBOL(tcp_v4_send_check);
2442 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2444 #ifdef CONFIG_PROC_FS
2445 EXPORT_SYMBOL(tcp_proc_register);
2446 EXPORT_SYMBOL(tcp_proc_unregister);
2448 EXPORT_SYMBOL(sysctl_local_port_range);
2449 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2450 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);