2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
94 .__tcp_lhash_users = ATOMIC_INIT(0),
96 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106 int tcp_port_rover = 1024 - 1;
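/* For example (illustrative, not part of the original source): this range is
 * exposed as a sysctl, so on a high-usage system an administrator could widen
 * it at runtime with something like
 *	sysctl -w net.ipv4.ip_local_port_range="32768 61000"
 * which rewrites the two integers in sysctl_local_port_range above.
 */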
108 /* Allocate and initialize a new TCP local port bind bucket.
109 * The bindhash mutex for snum's hash chain must be held here.
111 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
114 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
119 INIT_HLIST_HEAD(&tb->owners);
120 hlist_add_head(&tb->node, &head->chain);
125 /* Caller must hold hashbucket lock for this tb with local BH disabled */
126 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
128 if (hlist_empty(&tb->owners)) {
129 __hlist_del(&tb->node);
130 kmem_cache_free(tcp_bucket_cachep, tb);
134 /* Caller must disable local BH processing. */
135 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
137 struct tcp_bind_hashbucket *head =
138 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
139 struct tcp_bind_bucket *tb;
141 spin_lock(&head->lock);
142 tb = tcp_sk(sk)->bind_hash;
143 sk_add_bind_node(child, &tb->owners);
144 tcp_sk(child)->bind_hash = tb;
145 spin_unlock(&head->lock);
148 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
151 __tcp_inherit_port(sk, child);
155 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
158 inet_sk(sk)->num = snum;
159 sk_add_bind_node(sk, &tb->owners);
160 tcp_sk(sk)->bind_hash = tb;
163 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
165 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
167 struct hlist_node *node;
168 int reuse = sk->sk_reuse;
170 sk_for_each_bound(sk2, node, &tb->owners) {
172 !tcp_v6_ipv6only(sk2) &&
173 (!sk->sk_bound_dev_if ||
174 !sk2->sk_bound_dev_if ||
175 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
176 if (!reuse || !sk2->sk_reuse ||
177 sk2->sk_state == TCP_LISTEN) {
178 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
179 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
180 sk2_rcv_saddr == sk_rcv_saddr)
188 /* Obtain a reference to a local port for the given sock,
189 * if snum is zero it means select any available local port.
191 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
193 struct tcp_bind_hashbucket *head;
194 struct hlist_node *node;
195 struct tcp_bind_bucket *tb;
200 int low = sysctl_local_port_range[0];
201 int high = sysctl_local_port_range[1];
202 int remaining = (high - low) + 1;
205 spin_lock(&tcp_portalloc_lock);
206 if (tcp_port_rover < low)
209 rover = tcp_port_rover;
214 head = &tcp_bhash[tcp_bhashfn(rover)];
215 spin_lock(&head->lock);
216 tb_for_each(tb, node, &head->chain)
217 if (tb->port == rover)
221 spin_unlock(&head->lock);
222 } while (--remaining > 0);
223 tcp_port_rover = rover;
224 spin_unlock(&tcp_portalloc_lock);
226 /* Exhausted local port range during search? It is not
227 * possible for us to be holding one of the bind hash
228 * locks if this test triggers, because if 'remaining'
229 * drops to zero, we broke out of the do/while loop at
230 * the top level, not from the 'break;' statement.
233 if (unlikely(remaining <= 0))
236 /* OK, here is the one we will use. HEAD is
237 * non-NULL and we hold its mutex.
241 head = &tcp_bhash[tcp_bhashfn(snum)];
242 spin_lock(&head->lock);
243 tb_for_each(tb, node, &head->chain)
244 if (tb->port == snum)
250 if (!hlist_empty(&tb->owners)) {
251 if (sk->sk_reuse > 1)
253 if (tb->fastreuse > 0 &&
254 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
258 if (tcp_bind_conflict(sk, tb))
264 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
266 if (hlist_empty(&tb->owners)) {
267 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
271 } else if (tb->fastreuse &&
272 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
275 if (!tcp_sk(sk)->bind_hash)
276 tcp_bind_hash(sk, tb, snum);
277 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
281 spin_unlock(&head->lock);
287 /* Get rid of any references to a local port held by the
290 static void __tcp_put_port(struct sock *sk)
292 struct inet_sock *inet = inet_sk(sk);
293 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
294 struct tcp_bind_bucket *tb;
296 spin_lock(&head->lock);
297 tb = tcp_sk(sk)->bind_hash;
298 __sk_del_bind_node(sk);
299 tcp_sk(sk)->bind_hash = NULL;
301 tcp_bucket_destroy(tb);
302 spin_unlock(&head->lock);
305 void tcp_put_port(struct sock *sk)
312 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
313 * Look, when several writers sleep and reader wakes them up, all but one
314 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
315 * this, _but_ remember, it adds useless work on UP machines (wake up each
316 * exclusive lock release). It should be ifdefed really.
319 void tcp_listen_wlock(void)
321 write_lock(&tcp_lhash_lock);
323 if (atomic_read(&tcp_lhash_users)) {
327 prepare_to_wait_exclusive(&tcp_lhash_wait,
328 &wait, TASK_UNINTERRUPTIBLE);
329 if (!atomic_read(&tcp_lhash_users))
331 write_unlock_bh(&tcp_lhash_lock);
333 write_lock_bh(&tcp_lhash_lock);
336 finish_wait(&tcp_lhash_wait, &wait);
340 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
342 struct hlist_head *list;
345 BUG_TRAP(sk_unhashed(sk));
346 if (listen_possible && sk->sk_state == TCP_LISTEN) {
347 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
348 lock = &tcp_lhash_lock;
351 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
352 list = &tcp_ehash[sk->sk_hashent].chain;
353 lock = &tcp_ehash[sk->sk_hashent].lock;
356 __sk_add_node(sk, list);
357 sock_prot_inc_use(sk->sk_prot);
359 if (listen_possible && sk->sk_state == TCP_LISTEN)
360 wake_up(&tcp_lhash_wait);
363 static void tcp_v4_hash(struct sock *sk)
365 if (sk->sk_state != TCP_CLOSE) {
367 __tcp_v4_hash(sk, 1);
372 void tcp_unhash(struct sock *sk)
379 if (sk->sk_state == TCP_LISTEN) {
382 lock = &tcp_lhash_lock;
384 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
386 write_lock_bh(&head->lock);
389 if (__sk_del_node_init(sk))
390 sock_prot_dec_use(sk->sk_prot);
391 write_unlock_bh(lock);
394 if (sk->sk_state == TCP_LISTEN)
395 wake_up(&tcp_lhash_wait);
398 /* Don't inline this cruft. There are some nice properties to
399 * exploit here. The BSD API does not allow a listening TCP
400 * to specify the remote port nor the remote address for the
401 * connection. So always assume those are both wildcarded
402 * during the search since they can never be otherwise.
404 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
405 unsigned short hnum, int dif)
407 struct sock *result = NULL, *sk;
408 struct hlist_node *node;
412 sk_for_each(sk, node, head) {
413 struct inet_sock *inet = inet_sk(sk);
415 if (inet->num == hnum && !ipv6_only_sock(sk)) {
416 __u32 rcv_saddr = inet->rcv_saddr;
418 score = (sk->sk_family == PF_INET ? 1 : 0);
420 if (rcv_saddr != daddr)
424 if (sk->sk_bound_dev_if) {
425 if (sk->sk_bound_dev_if != dif)
431 if (score > hiscore) {
440 /* Optimize the common listener case. */
441 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
442 unsigned short hnum, int dif)
444 struct sock *sk = NULL;
445 struct hlist_head *head;
447 read_lock(&tcp_lhash_lock);
448 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
449 if (!hlist_empty(head)) {
450 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
452 if (inet->num == hnum && !sk->sk_node.next &&
453 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
454 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
455 !sk->sk_bound_dev_if)
457 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
463 read_unlock(&tcp_lhash_lock);
467 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
468 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
470 * Local BH must be disabled here.
473 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
477 struct tcp_ehash_bucket *head;
478 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
479 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
481 struct hlist_node *node;
482 /* Optimize here for direct hit, only listening connections can
483 * have wildcards anyways.
485 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
486 head = &tcp_ehash[hash];
487 read_lock(&head->lock);
488 sk_for_each(sk, node, &head->chain) {
489 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
490 goto hit; /* You sunk my battleship! */
493 /* Must check for a TIME_WAIT'er before going to listener hash. */
494 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
495 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
500 read_unlock(&head->lock);
507 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
508 u32 daddr, u16 hnum, int dif)
510 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
513 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
516 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
522 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
528 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
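/* A rough usage sketch for external callers of tcp_v4_lookup(): the lookup
 * takes a reference on any socket it returns, so the caller is expected to
 * drop it with sock_put() when done, roughly:
 *
 *	struct sock *sk = tcp_v4_lookup(saddr, sport, daddr, dport, dif);
 *	if (sk) {
 *		... use the socket ...
 *		sock_put(sk);
 *	}
 */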
530 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
532 return secure_tcp_sequence_number(skb->nh.iph->daddr,
538 /* called with local bh disabled */
539 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
540 struct tcp_tw_bucket **twp)
542 struct inet_sock *inet = inet_sk(sk);
543 u32 daddr = inet->rcv_saddr;
544 u32 saddr = inet->daddr;
545 int dif = sk->sk_bound_dev_if;
546 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
547 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
548 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
549 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
551 struct hlist_node *node;
552 struct tcp_tw_bucket *tw;
554 write_lock(&head->lock);
556 /* Check TIME-WAIT sockets first. */
557 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
558 tw = (struct tcp_tw_bucket *)sk2;
560 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
561 struct tcp_sock *tp = tcp_sk(sk);
563 /* With PAWS, it is safe from the viewpoint
564 of data integrity. Even without PAWS it
565 is safe provided sequence spaces do not
566 overlap i.e. at data rates <= 80Mbit/sec.
568 Actually, the idea is close to VJ's one,
569 only timestamp cache is held not per host,
570 but per port pair and TW bucket is used
573 If TW bucket has been already destroyed we
574 fall back to VJ's scheme and use initial
575 timestamp retrieved from peer table.
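/* Rough arithmetic behind the 80 Mbit/sec figure above: wrapping the 32-bit
 * sequence space at that rate takes about 2^32 * 8 / 80e6 ~= 430 seconds
 * (roughly 7 minutes), comfortably longer than the roughly one-minute
 * TIME-WAIT period, so old and new sequence spaces cannot overlap.
 */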
577 if (tw->tw_ts_recent_stamp &&
578 (!twp || (sysctl_tcp_tw_reuse &&
580 tw->tw_ts_recent_stamp > 1))) {
582 tw->tw_snd_nxt + 65535 + 2) == 0)
584 tp->rx_opt.ts_recent = tw->tw_ts_recent;
585 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
594 /* And established part... */
595 sk_for_each(sk2, node, &head->chain) {
596 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
601 /* Must record num and sport now. Otherwise we will see
602 * a socket with a funny identity in the hash table. */
604 inet->sport = htons(lport);
605 sk->sk_hashent = hash;
606 BUG_TRAP(sk_unhashed(sk));
607 __sk_add_node(sk, &head->chain);
608 sock_prot_inc_use(sk->sk_prot);
609 write_unlock(&head->lock);
613 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
615 /* Silly. Should hash-dance instead... */
616 tcp_tw_deschedule(tw);
617 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
625 write_unlock(&head->lock);
626 return -EADDRNOTAVAIL;
629 static inline u32 connect_port_offset(const struct sock *sk)
631 const struct inet_sock *inet = inet_sk(sk);
633 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
638 * Bind a port for a connect operation and hash it.
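/* In outline (derived from the code below): when no local port is bound yet,
 * walk up to 'range' candidate ports starting at a per-connection hash
 * offset, skip candidates whose existing bind bucket has fastreuse >= 0, and
 * take the first one that passes __tcp_v4_check_established(); a conflicting
 * TIME-WAIT bucket found there may be recycled. When a port is already
 * bound, the established-hash check is only needed if the bind bucket is
 * shared with other sockets.
 */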
640 static inline int tcp_v4_hash_connect(struct sock *sk)
642 unsigned short snum = inet_sk(sk)->num;
643 struct tcp_bind_hashbucket *head;
644 struct tcp_bind_bucket *tb;
648 int low = sysctl_local_port_range[0];
649 int high = sysctl_local_port_range[1];
650 int range = high - low;
654 u32 offset = hint + connect_port_offset(sk);
655 struct hlist_node *node;
656 struct tcp_tw_bucket *tw = NULL;
659 for (i = 1; i <= range; i++) {
660 port = low + (i + offset) % range;
661 head = &tcp_bhash[tcp_bhashfn(port)];
662 spin_lock(&head->lock);
664 /* Does not bother with rcv_saddr checks,
665 * because the established check is already
668 tb_for_each(tb, node, &head->chain) {
669 if (tb->port == port) {
670 BUG_TRAP(!hlist_empty(&tb->owners));
671 if (tb->fastreuse >= 0)
673 if (!__tcp_v4_check_established(sk,
681 tb = tcp_bucket_create(head, port);
683 spin_unlock(&head->lock);
690 spin_unlock(&head->lock);
694 return -EADDRNOTAVAIL;
699 /* Head lock still held and bh's disabled */
700 tcp_bind_hash(sk, tb, port);
701 if (sk_unhashed(sk)) {
702 inet_sk(sk)->sport = htons(port);
703 __tcp_v4_hash(sk, 0);
705 spin_unlock(&head->lock);
708 tcp_tw_deschedule(tw);
716 head = &tcp_bhash[tcp_bhashfn(snum)];
717 tb = tcp_sk(sk)->bind_hash;
718 spin_lock_bh(&head->lock);
719 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
720 __tcp_v4_hash(sk, 0);
721 spin_unlock_bh(&head->lock);
724 spin_unlock(&head->lock);
725 /* No definite answer... Walk to established hash table */
726 ret = __tcp_v4_check_established(sk, snum, NULL);
733 /* This will initiate an outgoing connection. */
734 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
736 struct inet_sock *inet = inet_sk(sk);
737 struct tcp_sock *tp = tcp_sk(sk);
738 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
744 if (addr_len < sizeof(struct sockaddr_in))
747 if (usin->sin_family != AF_INET)
748 return -EAFNOSUPPORT;
750 nexthop = daddr = usin->sin_addr.s_addr;
751 if (inet->opt && inet->opt->srr) {
754 nexthop = inet->opt->faddr;
757 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
758 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
760 inet->sport, usin->sin_port, sk);
764 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
769 if (!inet->opt || !inet->opt->srr)
773 inet->saddr = rt->rt_src;
774 inet->rcv_saddr = inet->saddr;
776 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
777 /* Reset inherited state */
778 tp->rx_opt.ts_recent = 0;
779 tp->rx_opt.ts_recent_stamp = 0;
783 if (sysctl_tcp_tw_recycle &&
784 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
785 struct inet_peer *peer = rt_get_peer(rt);
787 /* VJ's idea. We save last timestamp seen from
788 * the destination in peer table, when entering state TIME-WAIT
789 * and initialize rx_opt.ts_recent from it, when trying new connection.
792 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
793 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
794 tp->rx_opt.ts_recent = peer->tcp_ts;
798 inet->dport = usin->sin_port;
801 tp->ext_header_len = 0;
803 tp->ext_header_len = inet->opt->optlen;
805 tp->rx_opt.mss_clamp = 536;
807 /* Socket identity is still unknown (sport may be zero).
808 * However we set state to SYN-SENT and, without releasing the socket
809 * lock, select a source port, enter ourselves into the hash tables and
810 * complete initialization after this.
812 tcp_set_state(sk, TCP_SYN_SENT);
813 err = tcp_v4_hash_connect(sk);
817 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
821 /* OK, now commit destination to socket. */
822 sk_setup_caps(sk, &rt->u.dst);
825 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
830 inet->id = tp->write_seq ^ jiffies;
832 err = tcp_connect(sk);
840 /* This unhashes the socket and releases the local port, if necessary. */
841 tcp_set_state(sk, TCP_CLOSE);
843 sk->sk_route_caps = 0;
848 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
850 return ((struct rtable *)skb->dst)->rt_iif;
853 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
855 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
858 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
859 struct request_sock ***prevp,
861 __u32 raddr, __u32 laddr)
863 struct listen_sock *lopt = tp->accept_queue.listen_opt;
864 struct request_sock *req, **prev;
866 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
867 (req = *prev) != NULL;
868 prev = &req->dl_next) {
869 const struct inet_request_sock *ireq = inet_rsk(req);
871 if (ireq->rmt_port == rport &&
872 ireq->rmt_addr == raddr &&
873 ireq->loc_addr == laddr &&
874 TCP_INET_FAMILY(req->rsk_ops->family)) {
884 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
886 struct tcp_sock *tp = tcp_sk(sk);
887 struct listen_sock *lopt = tp->accept_queue.listen_opt;
888 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
890 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
896 * This routine does path mtu discovery as defined in RFC1191.
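/* The next-hop MTU arrives in the ICMP Fragmentation Needed message and is
 * handed to us via tcp_v4_err()'s 'info' argument; here we push it into the
 * cached route and, since the caller only invokes this when the socket is
 * not locked by the user, shrink the MSS and retransmit right away instead
 * of waiting for the retransmit timer.
 */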
898 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
901 struct dst_entry *dst;
902 struct inet_sock *inet = inet_sk(sk);
903 struct tcp_sock *tp = tcp_sk(sk);
905 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
906 * sent out by Linux are always < 576 bytes, so they should go through
909 if (sk->sk_state == TCP_LISTEN)
912 /* We don't check in the dst entry if pmtu discovery is forbidden
913 * on this route. We just assume that no packet-too-big packets
914 * are sent back when pmtu discovery is not active.
915 * There is a small race when the user changes this flag in the
916 * route, but I think that's acceptable.
918 if ((dst = __sk_dst_check(sk, 0)) == NULL)
921 dst->ops->update_pmtu(dst, mtu);
923 /* Something is about to be wrong... Remember soft error
924 * for the case that this connection will not be able to recover.
926 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
927 sk->sk_err_soft = EMSGSIZE;
931 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
932 tp->pmtu_cookie > mtu) {
933 tcp_sync_mss(sk, mtu);
935 /* Resend the TCP packet because it's
936 * clear that the old packet has been
937 * dropped. This is the new "fast" path mtu
940 tcp_simple_retransmit(sk);
941 } /* else let the usual retransmit timer handle it */
945 * This routine is called by the ICMP module when it gets some
946 * sort of error condition. If err < 0 then the socket should
947 * be closed and the error returned to the user. If err > 0
948 * it's just the icmp type << 8 | icmp code. After adjustment
949 * header points to the first 8 bytes of the tcp header. We need
950 * to find the appropriate port.
952 * The locking strategy used here is very "optimistic". When
953 * someone else accesses the socket the ICMP is just dropped
954 * and for some paths there is no check at all.
955 * A more general error queue to queue errors for later handling
956 * is probably better.
960 void tcp_v4_err(struct sk_buff *skb, u32 info)
962 struct iphdr *iph = (struct iphdr *)skb->data;
963 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
965 struct inet_sock *inet;
966 int type = skb->h.icmph->type;
967 int code = skb->h.icmph->code;
972 if (skb->len < (iph->ihl << 2) + 8) {
973 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
977 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
978 th->source, tcp_v4_iif(skb));
980 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
983 if (sk->sk_state == TCP_TIME_WAIT) {
984 tcp_tw_put((struct tcp_tw_bucket *)sk);
989 /* If too many ICMPs get dropped on busy
990 * servers this needs to be solved differently.
992 if (sock_owned_by_user(sk))
993 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
995 if (sk->sk_state == TCP_CLOSE)
999 seq = ntohl(th->seq);
1000 if (sk->sk_state != TCP_LISTEN &&
1001 !between(seq, tp->snd_una, tp->snd_nxt)) {
1002 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1007 case ICMP_SOURCE_QUENCH:
1008 /* Just silently ignore these. */
1010 case ICMP_PARAMETERPROB:
1013 case ICMP_DEST_UNREACH:
1014 if (code > NR_ICMP_UNREACH)
1017 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1018 if (!sock_owned_by_user(sk))
1019 do_pmtu_discovery(sk, iph, info);
1023 err = icmp_err_convert[code].errno;
1025 case ICMP_TIME_EXCEEDED:
1032 switch (sk->sk_state) {
1033 struct request_sock *req, **prev;
1035 if (sock_owned_by_user(sk))
1038 req = tcp_v4_search_req(tp, &prev, th->dest,
1039 iph->daddr, iph->saddr);
1043 /* ICMPs are not backlogged, hence we cannot get
1044 an established socket here.
1048 if (seq != tcp_rsk(req)->snt_isn) {
1049 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1054 * Still in SYN_RECV, just remove it silently.
1055 * There is no good way to pass the error to the newly
1056 * created socket, and POSIX does not want network
1057 * errors returned from accept().
1059 tcp_synq_drop(sk, req, prev);
1063 case TCP_SYN_RECV: /* Cannot happen.
1064 It can, for example, if SYNs crossed.
1066 if (!sock_owned_by_user(sk)) {
1067 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1070 sk->sk_error_report(sk);
1074 sk->sk_err_soft = err;
1079 /* If we've already connected we will keep trying
1080 * until we time out, or the user gives up.
1082 * rfc1122 4.2.3.9 allows us to consider as hard errors
1083 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1084 * but it is obsoleted by pmtu discovery).
1086 * Note that in the modern internet, where routing is unreliable
1087 * and broken firewalls sit in each dark corner, sending random
1088 * errors ordered by their masters, even these two messages finally lose
1089 * their original sense (even Linux sends invalid PORT_UNREACHs)
1091 * Now we are in compliance with RFCs.
1096 if (!sock_owned_by_user(sk) && inet->recverr) {
1098 sk->sk_error_report(sk);
1099 } else { /* Only an error on timeout */
1100 sk->sk_err_soft = err;
1108 /* This routine computes an IPv4 TCP checksum. */
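/* With hardware checksum offload (CHECKSUM_HW) only the pseudo-header sum is
 * filled in here and skb->csum records the offset of the check field for the
 * device to complete; otherwise the full checksum over header and payload is
 * computed in software.
 */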
1109 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1110 struct sk_buff *skb)
1112 struct inet_sock *inet = inet_sk(sk);
1114 if (skb->ip_summed == CHECKSUM_HW) {
1115 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1116 skb->csum = offsetof(struct tcphdr, check);
1118 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1119 csum_partial((char *)th,
1126 * This routine will send an RST to the other tcp.
1128 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1130 * Answer: if a packet caused an RST, it is not for a socket
1131 * existing in our system; if it is matched to a socket,
1132 * it is just a duplicate segment or a bug in the other side's TCP.
1133 * So we build the reply based only on the parameters
1134 * that arrived with the segment.
1135 * Exception: precedence violation. We do not implement it in any case.
1138 static void tcp_v4_send_reset(struct sk_buff *skb)
1140 struct tcphdr *th = skb->h.th;
1142 struct ip_reply_arg arg;
1144 /* Never send a reset in response to a reset. */
1148 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1151 /* Swap the send and the receive. */
1152 memset(&rth, 0, sizeof(struct tcphdr));
1153 rth.dest = th->source;
1154 rth.source = th->dest;
1155 rth.doff = sizeof(struct tcphdr) / 4;
1159 rth.seq = th->ack_seq;
1162 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1163 skb->len - (th->doff << 2));
1166 memset(&arg, 0, sizeof arg);
1167 arg.iov[0].iov_base = (unsigned char *)&rth;
1168 arg.iov[0].iov_len = sizeof rth;
1169 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1170 skb->nh.iph->saddr, /*XXX*/
1171 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1172 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1174 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1176 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1177 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1180 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1181 outside socket context, is certainly ugly. What can I do?
1184 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1187 struct tcphdr *th = skb->h.th;
1192 struct ip_reply_arg arg;
1194 memset(&rep.th, 0, sizeof(struct tcphdr));
1195 memset(&arg, 0, sizeof arg);
1197 arg.iov[0].iov_base = (unsigned char *)&rep;
1198 arg.iov[0].iov_len = sizeof(rep.th);
1200 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1201 (TCPOPT_TIMESTAMP << 8) |
1203 rep.tsopt[1] = htonl(tcp_time_stamp);
1204 rep.tsopt[2] = htonl(ts);
1205 arg.iov[0].iov_len = sizeof(rep);
1208 /* Swap the send and the receive. */
1209 rep.th.dest = th->source;
1210 rep.th.source = th->dest;
1211 rep.th.doff = arg.iov[0].iov_len / 4;
1212 rep.th.seq = htonl(seq);
1213 rep.th.ack_seq = htonl(ack);
1215 rep.th.window = htons(win);
1217 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1218 skb->nh.iph->saddr, /*XXX*/
1219 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1220 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1222 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1224 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1227 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1229 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1231 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1232 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1237 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1239 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1243 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1244 struct request_sock *req)
1247 const struct inet_request_sock *ireq = inet_rsk(req);
1248 struct ip_options *opt = inet_rsk(req)->opt;
1249 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1251 { .daddr = ((opt && opt->srr) ?
1254 .saddr = ireq->loc_addr,
1255 .tos = RT_CONN_FLAGS(sk) } },
1256 .proto = IPPROTO_TCP,
1258 { .sport = inet_sk(sk)->sport,
1259 .dport = ireq->rmt_port } } };
1261 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1262 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1265 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1267 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1274 * Send a SYN-ACK after having received an ACK.
1275 * This still operates on a request_sock only, not on a big
1278 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1279 struct dst_entry *dst)
1281 const struct inet_request_sock *ireq = inet_rsk(req);
1283 struct sk_buff * skb;
1285 /* First, grab a route. */
1286 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1289 skb = tcp_make_synack(sk, dst, req);
1292 struct tcphdr *th = skb->h.th;
1294 th->check = tcp_v4_check(th, skb->len,
1297 csum_partial((char *)th, skb->len,
1300 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1303 if (err == NET_XMIT_CN)
1313 * IPv4 request_sock destructor.
1315 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1317 if (inet_rsk(req)->opt)
1318 kfree(inet_rsk(req)->opt);
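/* Rate-limited warning: at most one message per minute (HZ * 60 jiffies)
 * when SYN cookies are being sent in response to a suspected flood.
 */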
1321 static inline void syn_flood_warning(struct sk_buff *skb)
1323 static unsigned long warntime;
1325 if (time_after(jiffies, (warntime + HZ * 60))) {
1328 "possible SYN flooding on port %d. Sending cookies.\n",
1329 ntohs(skb->h.th->dest));
1334 * Save and compile IPv4 options into the request_sock if needed.
1336 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1337 struct sk_buff *skb)
1339 struct ip_options *opt = &(IPCB(skb)->opt);
1340 struct ip_options *dopt = NULL;
1342 if (opt && opt->optlen) {
1343 int opt_size = optlength(opt);
1344 dopt = kmalloc(opt_size, GFP_ATOMIC);
1346 if (ip_options_echo(dopt, skb)) {
1355 struct request_sock_ops tcp_request_sock_ops = {
1357 .obj_size = sizeof(struct tcp_request_sock),
1358 .rtx_syn_ack = tcp_v4_send_synack,
1359 .send_ack = tcp_v4_reqsk_send_ack,
1360 .destructor = tcp_v4_reqsk_destructor,
1361 .send_reset = tcp_v4_send_reset,
1364 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1366 struct inet_request_sock *ireq;
1367 struct tcp_options_received tmp_opt;
1368 struct request_sock *req;
1369 __u32 saddr = skb->nh.iph->saddr;
1370 __u32 daddr = skb->nh.iph->daddr;
1371 __u32 isn = TCP_SKB_CB(skb)->when;
1372 struct dst_entry *dst = NULL;
1373 #ifdef CONFIG_SYN_COOKIES
1374 int want_cookie = 0;
1376 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1379 /* Never answer SYNs sent to broadcast or multicast */
1380 if (((struct rtable *)skb->dst)->rt_flags &
1381 (RTCF_BROADCAST | RTCF_MULTICAST))
1384 /* TW buckets are converted to open requests without
1385 * limitations; they conserve resources and the peer is
1386 * evidently a real one.
1388 if (tcp_synq_is_full(sk) && !isn) {
1389 #ifdef CONFIG_SYN_COOKIES
1390 if (sysctl_tcp_syncookies) {
1397 /* Accept backlog is full. If we have already queued enough
1398 * warm entries in the syn queue, drop the request. It is better than
1399 * clogging the syn queue with openreqs with exponentially increasing
1402 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1405 req = reqsk_alloc(&tcp_request_sock_ops);
1409 tcp_clear_options(&tmp_opt);
1410 tmp_opt.mss_clamp = 536;
1411 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1413 tcp_parse_options(skb, &tmp_opt, 0);
1416 tcp_clear_options(&tmp_opt);
1417 tmp_opt.saw_tstamp = 0;
1420 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1421 /* Some OSes (unknown ones, but I see them on a web server, which
1422 * contains information interesting only for windows
1423 * users) do not send their stamp in SYN. It is an easy case.
1424 * We simply do not advertise TS support.
1426 tmp_opt.saw_tstamp = 0;
1427 tmp_opt.tstamp_ok = 0;
1429 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1431 tcp_openreq_init(req, &tmp_opt, skb);
1433 ireq = inet_rsk(req);
1434 ireq->loc_addr = daddr;
1435 ireq->rmt_addr = saddr;
1436 ireq->opt = tcp_v4_save_options(sk, skb);
1438 TCP_ECN_create_request(req, skb->h.th);
1441 #ifdef CONFIG_SYN_COOKIES
1442 syn_flood_warning(skb);
1444 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1446 struct inet_peer *peer = NULL;
1448 /* VJ's idea. We save last timestamp seen
1449 * from the destination in peer table, when entering
1450 * state TIME-WAIT, and check against it before
1451 * accepting new connection request.
1453 * If "isn" is not zero, this request hit alive
1454 * timewait bucket, so that all the necessary checks
1455 * are made in the function processing timewait state.
1457 if (tmp_opt.saw_tstamp &&
1458 sysctl_tcp_tw_recycle &&
1459 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1460 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1461 peer->v4daddr == saddr) {
1462 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1463 (s32)(peer->tcp_ts - req->ts_recent) >
1465 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1470 /* Kill the following clause, if you dislike this way. */
1471 else if (!sysctl_tcp_syncookies &&
1472 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1473 (sysctl_max_syn_backlog >> 2)) &&
1474 (!peer || !peer->tcp_ts_stamp) &&
1475 (!dst || !dst_metric(dst, RTAX_RTT))) {
1476 /* Without syncookies the last quarter of
1477 * the backlog is filled with destinations
1478 * proven to be alive.
1479 * It means that we continue to communicate
1480 * with destinations already remembered
1481 * at the moment of the synflood.
1483 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1484 "request from %u.%u."
1487 ntohs(skb->h.th->source)));
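/* Worked example of the clause above: with sysctl_max_syn_backlog at, say,
 * 256, the last 256 >> 2 = 64 request slots are reserved for peers we
 * already know something about (a cached timestamp or route RTT), so a
 * SYN flood cannot completely crowd out destinations proven to be alive.
 */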
1492 isn = tcp_v4_init_sequence(sk, skb);
1494 tcp_rsk(req)->snt_isn = isn;
1496 if (tcp_v4_send_synack(sk, req, dst))
1502 tcp_v4_synq_add(sk, req);
1509 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1515 * The three way handshake has completed - we got a valid synack -
1516 * now create the new socket.
1518 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1519 struct request_sock *req,
1520 struct dst_entry *dst)
1522 struct inet_request_sock *ireq;
1523 struct inet_sock *newinet;
1524 struct tcp_sock *newtp;
1527 if (sk_acceptq_is_full(sk))
1530 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1533 newsk = tcp_create_openreq_child(sk, req, skb);
1537 sk_setup_caps(newsk, dst);
1539 newtp = tcp_sk(newsk);
1540 newinet = inet_sk(newsk);
1541 ireq = inet_rsk(req);
1542 newinet->daddr = ireq->rmt_addr;
1543 newinet->rcv_saddr = ireq->loc_addr;
1544 newinet->saddr = ireq->loc_addr;
1545 newinet->opt = ireq->opt;
1547 newinet->mc_index = tcp_v4_iif(skb);
1548 newinet->mc_ttl = skb->nh.iph->ttl;
1549 newtp->ext_header_len = 0;
1551 newtp->ext_header_len = newinet->opt->optlen;
1552 newinet->id = newtp->write_seq ^ jiffies;
1554 tcp_sync_mss(newsk, dst_mtu(dst));
1555 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1556 tcp_initialize_rcv_mss(newsk);
1558 __tcp_v4_hash(newsk, 0);
1559 __tcp_inherit_port(sk, newsk);
1564 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1566 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1571 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1573 struct tcphdr *th = skb->h.th;
1574 struct iphdr *iph = skb->nh.iph;
1575 struct tcp_sock *tp = tcp_sk(sk);
1577 struct request_sock **prev;
1578 /* Find possible connection requests. */
1579 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1580 iph->saddr, iph->daddr);
1582 return tcp_check_req(sk, skb, req, prev);
1584 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1591 if (nsk->sk_state != TCP_TIME_WAIT) {
1595 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1599 #ifdef CONFIG_SYN_COOKIES
1600 if (!th->rst && !th->syn && th->ack)
1601 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
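/* Checksum strategy for the receive path: a packet with a valid hardware
 * checksum is accepted outright; a very short packet (<= 76 bytes) is
 * verified in full immediately; for anything else only the pseudo-header
 * sum is folded into skb->csum so the copy-to-user path can finish the
 * verification later.
 */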
1606 static int tcp_v4_checksum_init(struct sk_buff *skb)
1608 if (skb->ip_summed == CHECKSUM_HW) {
1609 skb->ip_summed = CHECKSUM_UNNECESSARY;
1610 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1611 skb->nh.iph->daddr, skb->csum))
1614 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1615 skb->ip_summed = CHECKSUM_NONE;
1617 if (skb->len <= 76) {
1618 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1620 skb_checksum(skb, 0, skb->len, 0)))
1622 skb->ip_summed = CHECKSUM_UNNECESSARY;
1624 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1626 skb->nh.iph->daddr, 0);
1632 /* The socket must have its spinlock held when we get
1635 * We have a potential double-lock case here, so even when
1636 * doing backlog processing we use the BH locking scheme.
1637 * This is because we cannot sleep with the original spinlock
1640 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1642 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1643 TCP_CHECK_TIMER(sk);
1644 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1646 TCP_CHECK_TIMER(sk);
1650 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1653 if (sk->sk_state == TCP_LISTEN) {
1654 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1659 if (tcp_child_process(sk, nsk, skb))
1665 TCP_CHECK_TIMER(sk);
1666 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1668 TCP_CHECK_TIMER(sk);
1672 tcp_v4_send_reset(skb);
1675 /* Be careful here. If this function gets more complicated and
1676 * gcc suffers from register pressure on the x86, sk (in %ebx)
1677 * might be destroyed here. This current version compiles correctly,
1678 * but you have been warned.
1683 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1691 int tcp_v4_rcv(struct sk_buff *skb)
1697 if (skb->pkt_type != PACKET_HOST)
1700 /* Count it even if it's bad */
1701 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1703 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1708 if (th->doff < sizeof(struct tcphdr) / 4)
1710 if (!pskb_may_pull(skb, th->doff * 4))
1713 /* An explanation is required here, I think.
1714 * Packet length and doff are validated by header prediction,
1715 * provided the case of th->doff == 0 is eliminated.
1716 * So, we defer the checks. */
1717 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1718 tcp_v4_checksum_init(skb) < 0))
1722 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1723 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1724 skb->len - th->doff * 4);
1725 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1726 TCP_SKB_CB(skb)->when = 0;
1727 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1728 TCP_SKB_CB(skb)->sacked = 0;
1730 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1731 skb->nh.iph->daddr, ntohs(th->dest),
1738 if (sk->sk_state == TCP_TIME_WAIT)
1741 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1742 goto discard_and_relse;
1744 if (sk_filter(sk, skb, 0))
1745 goto discard_and_relse;
1751 if (!sock_owned_by_user(sk)) {
1752 if (!tcp_prequeue(sk, skb))
1753 ret = tcp_v4_do_rcv(sk, skb);
1755 sk_add_backlog(sk, skb);
1763 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1766 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1768 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1770 tcp_v4_send_reset(skb);
1774 /* Discard frame. */
1783 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1784 tcp_tw_put((struct tcp_tw_bucket *) sk);
1788 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1789 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1790 tcp_tw_put((struct tcp_tw_bucket *) sk);
1793 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1794 skb, th, skb->len)) {
1796 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1800 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1801 tcp_tw_put((struct tcp_tw_bucket *)sk);
1805 /* Fall through to ACK */
1808 tcp_v4_timewait_ack(sk, skb);
1812 case TCP_TW_SUCCESS:;
1817 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1819 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1820 struct inet_sock *inet = inet_sk(sk);
1822 sin->sin_family = AF_INET;
1823 sin->sin_addr.s_addr = inet->daddr;
1824 sin->sin_port = inet->dport;
1827 /* VJ's idea. Save last timestamp seen from this destination
1828 * and hold it at least for the normal timewait interval, to use for duplicate
1829 * segment detection in subsequent connections, before they enter synchronized
1833 int tcp_v4_remember_stamp(struct sock *sk)
1835 struct inet_sock *inet = inet_sk(sk);
1836 struct tcp_sock *tp = tcp_sk(sk);
1837 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1838 struct inet_peer *peer = NULL;
1841 if (!rt || rt->rt_dst != inet->daddr) {
1842 peer = inet_getpeer(inet->daddr, 1);
1846 rt_bind_peer(rt, 1);
1851 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1852 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1853 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1854 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1855 peer->tcp_ts = tp->rx_opt.ts_recent;
1865 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1867 struct inet_peer *peer = NULL;
1869 peer = inet_getpeer(tw->tw_daddr, 1);
1872 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1873 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1874 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1875 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1876 peer->tcp_ts = tw->tw_ts_recent;
1885 struct tcp_func ipv4_specific = {
1886 .queue_xmit = ip_queue_xmit,
1887 .send_check = tcp_v4_send_check,
1888 .rebuild_header = inet_sk_rebuild_header,
1889 .conn_request = tcp_v4_conn_request,
1890 .syn_recv_sock = tcp_v4_syn_recv_sock,
1891 .remember_stamp = tcp_v4_remember_stamp,
1892 .net_header_len = sizeof(struct iphdr),
1893 .setsockopt = ip_setsockopt,
1894 .getsockopt = ip_getsockopt,
1895 .addr2sockaddr = v4_addr2sockaddr,
1896 .sockaddr_len = sizeof(struct sockaddr_in),
1899 /* NOTE: A lot of things are set to zero explicitly by the call to
1900 * sk_alloc(), so they need not be done here.
1902 static int tcp_v4_init_sock(struct sock *sk)
1904 struct tcp_sock *tp = tcp_sk(sk);
1906 skb_queue_head_init(&tp->out_of_order_queue);
1907 tcp_init_xmit_timers(sk);
1908 tcp_prequeue_init(tp);
1910 tp->rto = TCP_TIMEOUT_INIT;
1911 tp->mdev = TCP_TIMEOUT_INIT;
1913 /* So many TCP implementations out there (incorrectly) count the
1914 * initial SYN frame in their delayed-ACK and congestion control
1915 * algorithms that we must have the following bandaid to talk
1916 * efficiently to them. -DaveM
1920 /* See draft-stevens-tcpca-spec-01 for discussion of the
1921 * initialization of these values.
1923 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1924 tp->snd_cwnd_clamp = ~0;
1925 tp->mss_cache = 536;
1927 tp->reordering = sysctl_tcp_reordering;
1928 tp->ca_ops = &tcp_init_congestion_ops;
1930 sk->sk_state = TCP_CLOSE;
1932 sk->sk_write_space = sk_stream_write_space;
1933 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1935 tp->af_specific = &ipv4_specific;
1937 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1938 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1940 atomic_inc(&tcp_sockets_allocated);
1945 int tcp_v4_destroy_sock(struct sock *sk)
1947 struct tcp_sock *tp = tcp_sk(sk);
1949 tcp_clear_xmit_timers(sk);
1951 tcp_cleanup_congestion_control(tp);
1953 /* Clean up the write buffer. */
1954 sk_stream_writequeue_purge(sk);
1956 /* Cleans up our, hopefully empty, out_of_order_queue. */
1957 __skb_queue_purge(&tp->out_of_order_queue);
1959 /* Clean the prequeue; it really must be empty */
1960 __skb_queue_purge(&tp->ucopy.prequeue);
1962 /* Clean up a referenced TCP bind bucket. */
1967 * If sendmsg cached page exists, toss it.
1969 if (sk->sk_sndmsg_page) {
1970 __free_page(sk->sk_sndmsg_page);
1971 sk->sk_sndmsg_page = NULL;
1974 atomic_dec(&tcp_sockets_allocated);
1979 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1981 #ifdef CONFIG_PROC_FS
1982 /* Proc filesystem TCP sock list dumping. */
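/* The iterator below walks the listening hash first, descending into each
 * listener's pending SYN queue (TCP_SEQ_STATE_OPENREQ), then each
 * established-hash bucket followed by its TIME-WAIT chain, so the /proc
 * output shows listeners, open requests, live connections and timewait
 * sockets in that order.
 */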
1984 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1986 return hlist_empty(head) ? NULL :
1987 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1990 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1992 return tw->tw_node.next ?
1993 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1996 static void *listening_get_next(struct seq_file *seq, void *cur)
1998 struct tcp_sock *tp;
1999 struct hlist_node *node;
2000 struct sock *sk = cur;
2001 struct tcp_iter_state* st = seq->private;
2005 sk = sk_head(&tcp_listening_hash[0]);
2011 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2012 struct request_sock *req = cur;
2014 tp = tcp_sk(st->syn_wait_sk);
2018 if (req->rsk_ops->family == st->family) {
2024 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2027 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2029 sk = sk_next(st->syn_wait_sk);
2030 st->state = TCP_SEQ_STATE_LISTENING;
2031 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2034 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2035 if (reqsk_queue_len(&tp->accept_queue))
2037 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2041 sk_for_each_from(sk, node) {
2042 if (sk->sk_family == st->family) {
2047 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2048 if (reqsk_queue_len(&tp->accept_queue)) {
2050 st->uid = sock_i_uid(sk);
2051 st->syn_wait_sk = sk;
2052 st->state = TCP_SEQ_STATE_OPENREQ;
2056 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2058 if (++st->bucket < TCP_LHTABLE_SIZE) {
2059 sk = sk_head(&tcp_listening_hash[st->bucket]);
2067 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2069 void *rc = listening_get_next(seq, NULL);
2071 while (rc && *pos) {
2072 rc = listening_get_next(seq, rc);
2078 static void *established_get_first(struct seq_file *seq)
2080 struct tcp_iter_state* st = seq->private;
2083 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2085 struct hlist_node *node;
2086 struct tcp_tw_bucket *tw;
2088 /* We can reschedule _before_ having picked the target: */
2089 cond_resched_softirq();
2091 read_lock(&tcp_ehash[st->bucket].lock);
2092 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2093 if (sk->sk_family != st->family) {
2099 st->state = TCP_SEQ_STATE_TIME_WAIT;
2100 tw_for_each(tw, node,
2101 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2102 if (tw->tw_family != st->family) {
2108 read_unlock(&tcp_ehash[st->bucket].lock);
2109 st->state = TCP_SEQ_STATE_ESTABLISHED;
2115 static void *established_get_next(struct seq_file *seq, void *cur)
2117 struct sock *sk = cur;
2118 struct tcp_tw_bucket *tw;
2119 struct hlist_node *node;
2120 struct tcp_iter_state* st = seq->private;
2124 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2128 while (tw && tw->tw_family != st->family) {
2135 read_unlock(&tcp_ehash[st->bucket].lock);
2136 st->state = TCP_SEQ_STATE_ESTABLISHED;
2138 /* We can reschedule between buckets: */
2139 cond_resched_softirq();
2141 if (++st->bucket < tcp_ehash_size) {
2142 read_lock(&tcp_ehash[st->bucket].lock);
2143 sk = sk_head(&tcp_ehash[st->bucket].chain);
2151 sk_for_each_from(sk, node) {
2152 if (sk->sk_family == st->family)
2156 st->state = TCP_SEQ_STATE_TIME_WAIT;
2157 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2165 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2167 void *rc = established_get_first(seq);
2170 rc = established_get_next(seq, rc);
2176 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2179 struct tcp_iter_state* st = seq->private;
2182 st->state = TCP_SEQ_STATE_LISTENING;
2183 rc = listening_get_idx(seq, &pos);
2186 tcp_listen_unlock();
2188 st->state = TCP_SEQ_STATE_ESTABLISHED;
2189 rc = established_get_idx(seq, pos);
2195 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2197 struct tcp_iter_state* st = seq->private;
2198 st->state = TCP_SEQ_STATE_LISTENING;
2200 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2203 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2206 struct tcp_iter_state* st;
2208 if (v == SEQ_START_TOKEN) {
2209 rc = tcp_get_idx(seq, 0);
2214 switch (st->state) {
2215 case TCP_SEQ_STATE_OPENREQ:
2216 case TCP_SEQ_STATE_LISTENING:
2217 rc = listening_get_next(seq, v);
2219 tcp_listen_unlock();
2221 st->state = TCP_SEQ_STATE_ESTABLISHED;
2222 rc = established_get_first(seq);
2225 case TCP_SEQ_STATE_ESTABLISHED:
2226 case TCP_SEQ_STATE_TIME_WAIT:
2227 rc = established_get_next(seq, v);
2235 static void tcp_seq_stop(struct seq_file *seq, void *v)
2237 struct tcp_iter_state* st = seq->private;
2239 switch (st->state) {
2240 case TCP_SEQ_STATE_OPENREQ:
2242 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2243 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2245 case TCP_SEQ_STATE_LISTENING:
2246 if (v != SEQ_START_TOKEN)
2247 tcp_listen_unlock();
2249 case TCP_SEQ_STATE_TIME_WAIT:
2250 case TCP_SEQ_STATE_ESTABLISHED:
2252 read_unlock(&tcp_ehash[st->bucket].lock);
2258 static int tcp_seq_open(struct inode *inode, struct file *file)
2260 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2261 struct seq_file *seq;
2262 struct tcp_iter_state *s;
2265 if (unlikely(afinfo == NULL))
2268 s = kmalloc(sizeof(*s), GFP_KERNEL);
2271 memset(s, 0, sizeof(*s));
2272 s->family = afinfo->family;
2273 s->seq_ops.start = tcp_seq_start;
2274 s->seq_ops.next = tcp_seq_next;
2275 s->seq_ops.show = afinfo->seq_show;
2276 s->seq_ops.stop = tcp_seq_stop;
2278 rc = seq_open(file, &s->seq_ops);
2281 seq = file->private_data;
2290 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2293 struct proc_dir_entry *p;
2297 afinfo->seq_fops->owner = afinfo->owner;
2298 afinfo->seq_fops->open = tcp_seq_open;
2299 afinfo->seq_fops->read = seq_read;
2300 afinfo->seq_fops->llseek = seq_lseek;
2301 afinfo->seq_fops->release = seq_release_private;
2303 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2311 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2315 proc_net_remove(afinfo->name);
2316 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
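/* The dump routines below print addresses and ports as raw hex, so for
 * example on a little-endian machine 0100007F:0016 in /proc/net/tcp is
 * 127.0.0.1 port 22.
 */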
2319 static void get_openreq4(struct sock *sk, struct request_sock *req,
2320 char *tmpbuf, int i, int uid)
2322 const struct inet_request_sock *ireq = inet_rsk(req);
2323 int ttd = req->expires - jiffies;
2325 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2326 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2329 ntohs(inet_sk(sk)->sport),
2331 ntohs(ireq->rmt_port),
2333 0, 0, /* could print option size, but that is af dependent. */
2334 1, /* timers active (only the expire timer) */
2335 jiffies_to_clock_t(ttd),
2338 0, /* non standard timer */
2339 0, /* open_requests have no inode */
2340 atomic_read(&sk->sk_refcnt),
2344 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2347 unsigned long timer_expires;
2348 struct tcp_sock *tp = tcp_sk(sp);
2349 struct inet_sock *inet = inet_sk(sp);
2350 unsigned int dest = inet->daddr;
2351 unsigned int src = inet->rcv_saddr;
2352 __u16 destp = ntohs(inet->dport);
2353 __u16 srcp = ntohs(inet->sport);
2355 if (tp->pending == TCP_TIME_RETRANS) {
2357 timer_expires = tp->timeout;
2358 } else if (tp->pending == TCP_TIME_PROBE0) {
2360 timer_expires = tp->timeout;
2361 } else if (timer_pending(&sp->sk_timer)) {
2363 timer_expires = sp->sk_timer.expires;
2366 timer_expires = jiffies;
2369 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2370 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2371 i, src, srcp, dest, destp, sp->sk_state,
2372 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2374 jiffies_to_clock_t(timer_expires - jiffies),
2379 atomic_read(&sp->sk_refcnt), sp,
2380 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2382 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2385 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2387 unsigned int dest, src;
2389 int ttd = tw->tw_ttd - jiffies;
2394 dest = tw->tw_daddr;
2395 src = tw->tw_rcv_saddr;
2396 destp = ntohs(tw->tw_dport);
2397 srcp = ntohs(tw->tw_sport);
2399 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2400 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2401 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2402 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2403 atomic_read(&tw->tw_refcnt), tw);
2408 static int tcp4_seq_show(struct seq_file *seq, void *v)
2410 struct tcp_iter_state* st;
2411 char tmpbuf[TMPSZ + 1];
2413 if (v == SEQ_START_TOKEN) {
2414 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2415 " sl local_address rem_address st tx_queue "
2416 "rx_queue tr tm->when retrnsmt uid timeout "
2422 switch (st->state) {
2423 case TCP_SEQ_STATE_LISTENING:
2424 case TCP_SEQ_STATE_ESTABLISHED:
2425 get_tcp4_sock(v, tmpbuf, st->num);
2427 case TCP_SEQ_STATE_OPENREQ:
2428 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2430 case TCP_SEQ_STATE_TIME_WAIT:
2431 get_timewait4_sock(v, tmpbuf, st->num);
2434 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2439 static struct file_operations tcp4_seq_fops;
2440 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2441 .owner = THIS_MODULE,
2444 .seq_show = tcp4_seq_show,
2445 .seq_fops = &tcp4_seq_fops,
2448 int __init tcp4_proc_init(void)
2450 return tcp_proc_register(&tcp4_seq_afinfo);
2453 void tcp4_proc_exit(void)
2455 tcp_proc_unregister(&tcp4_seq_afinfo);
2457 #endif /* CONFIG_PROC_FS */
2459 struct proto tcp_prot = {
2461 .owner = THIS_MODULE,
2463 .connect = tcp_v4_connect,
2464 .disconnect = tcp_disconnect,
2465 .accept = tcp_accept,
2467 .init = tcp_v4_init_sock,
2468 .destroy = tcp_v4_destroy_sock,
2469 .shutdown = tcp_shutdown,
2470 .setsockopt = tcp_setsockopt,
2471 .getsockopt = tcp_getsockopt,
2472 .sendmsg = tcp_sendmsg,
2473 .recvmsg = tcp_recvmsg,
2474 .backlog_rcv = tcp_v4_do_rcv,
2475 .hash = tcp_v4_hash,
2476 .unhash = tcp_unhash,
2477 .get_port = tcp_v4_get_port,
2478 .enter_memory_pressure = tcp_enter_memory_pressure,
2479 .sockets_allocated = &tcp_sockets_allocated,
2480 .memory_allocated = &tcp_memory_allocated,
2481 .memory_pressure = &tcp_memory_pressure,
2482 .sysctl_mem = sysctl_tcp_mem,
2483 .sysctl_wmem = sysctl_tcp_wmem,
2484 .sysctl_rmem = sysctl_tcp_rmem,
2485 .max_header = MAX_TCP_HEADER,
2486 .obj_size = sizeof(struct tcp_sock),
2487 .rsk_prot = &tcp_request_sock_ops,
2492 void __init tcp_v4_init(struct net_proto_family *ops)
2494 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2496 panic("Failed to create the TCP control socket.\n");
2497 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2498 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2500 /* Unhash it so that IP input processing does not even
2501 * see it, we do not wish this socket to see incoming
2504 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
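/* This kernel-internal socket is the one handed to ip_send_reply() by
 * tcp_v4_send_reset() and tcp_v4_send_ack() above, which is why it must
 * exist even though it never carries user data.
 */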
2507 EXPORT_SYMBOL(ipv4_specific);
2508 EXPORT_SYMBOL(tcp_bind_hash);
2509 EXPORT_SYMBOL(tcp_bucket_create);
2510 EXPORT_SYMBOL(tcp_hashinfo);
2511 EXPORT_SYMBOL(tcp_inherit_port);
2512 EXPORT_SYMBOL(tcp_listen_wlock);
2513 EXPORT_SYMBOL(tcp_port_rover);
2514 EXPORT_SYMBOL(tcp_prot);
2515 EXPORT_SYMBOL(tcp_put_port);
2516 EXPORT_SYMBOL(tcp_unhash);
2517 EXPORT_SYMBOL(tcp_v4_conn_request);
2518 EXPORT_SYMBOL(tcp_v4_connect);
2519 EXPORT_SYMBOL(tcp_v4_do_rcv);
2520 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2521 EXPORT_SYMBOL(tcp_v4_send_check);
2522 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2524 #ifdef CONFIG_PROC_FS
2525 EXPORT_SYMBOL(tcp_proc_register);
2526 EXPORT_SYMBOL(tcp_proc_unregister);
2528 EXPORT_SYMBOL(sysctl_local_port_range);
2529 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2530 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);