2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * open_request handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
69 #include <net/inet_common.h>
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
78 extern int sysctl_ip_dynaddr;
79 int sysctl_tcp_tw_reuse;
80 int sysctl_tcp_low_latency;
82 /* Check TCP sequence numbers in ICMP packets. */
83 #define ICMP_MIN_LENGTH 8
85 /* Socket used for sending RSTs */
86 static struct socket *tcp_socket;
88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
113 return h & (tcp_ehash_size - 1);
116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
124 return tcp_hashfn(laddr, lport, faddr, fport);
127 /* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
153 /* Caller must disable local BH processing. */
154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
170 __tcp_inherit_port(sk, child);
174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
189 sk_for_each_bound(sk2, node, &tb->owners) {
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
207 /* Obtain a reference to a local port for the given sock,
208 * if snum is zero it means select any available local port.
210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
224 spin_lock(&tcp_portalloc_lock);
225 rover = tcp_port_rover;
228 if (rover < low || rover > high)
230 head = &tcp_bhash[tcp_bhashfn(rover)];
231 spin_lock(&head->lock);
232 tb_for_each(tb, node, &head->chain)
233 if (tb->port == rover)
237 spin_unlock(&head->lock);
238 } while (--remaining > 0);
239 tcp_port_rover = rover;
240 spin_unlock(&tcp_portalloc_lock);
242 /* Exhausted local port range during search? */
247 /* OK, here is the one we will use. HEAD is
248 * non-NULL and we hold it's mutex.
252 head = &tcp_bhash[tcp_bhashfn(snum)];
253 spin_lock(&head->lock);
254 tb_for_each(tb, node, &head->chain)
255 if (tb->port == snum)
261 if (!hlist_empty(&tb->owners)) {
262 if (sk->sk_reuse > 1)
264 if (tb->fastreuse > 0 &&
265 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
269 if (tcp_bind_conflict(sk, tb))
275 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
277 if (hlist_empty(&tb->owners)) {
278 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
282 } else if (tb->fastreuse &&
283 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
286 if (!tcp_sk(sk)->bind_hash)
287 tcp_bind_hash(sk, tb, snum);
288 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
292 spin_unlock(&head->lock);
298 /* Get rid of any references to a local port held by the
301 static void __tcp_put_port(struct sock *sk)
303 struct inet_sock *inet = inet_sk(sk);
304 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
305 struct tcp_bind_bucket *tb;
307 spin_lock(&head->lock);
308 tb = tcp_sk(sk)->bind_hash;
309 __sk_del_bind_node(sk);
310 tcp_sk(sk)->bind_hash = NULL;
312 tcp_bucket_destroy(tb);
313 spin_unlock(&head->lock);
316 void tcp_put_port(struct sock *sk)
323 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
324 * Look, when several writers sleep and reader wakes them up, all but one
325 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
326 * this, _but_ remember, it adds useless work on UP machines (wake up each
327 * exclusive lock release). It should be ifdefed really.
330 void tcp_listen_wlock(void)
332 write_lock(&tcp_lhash_lock);
334 if (atomic_read(&tcp_lhash_users)) {
338 prepare_to_wait_exclusive(&tcp_lhash_wait,
339 &wait, TASK_UNINTERRUPTIBLE);
340 if (!atomic_read(&tcp_lhash_users))
342 write_unlock_bh(&tcp_lhash_lock);
344 write_lock_bh(&tcp_lhash_lock);
347 finish_wait(&tcp_lhash_wait, &wait);
351 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
353 struct hlist_head *list;
356 BUG_TRAP(sk_unhashed(sk));
357 if (listen_possible && sk->sk_state == TCP_LISTEN) {
358 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
359 lock = &tcp_lhash_lock;
362 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
363 lock = &tcp_ehash[sk->sk_hashent].lock;
366 __sk_add_node(sk, list);
367 sock_prot_inc_use(sk->sk_prot);
369 if (listen_possible && sk->sk_state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
373 static void tcp_v4_hash(struct sock *sk)
375 if (sk->sk_state != TCP_CLOSE) {
377 __tcp_v4_hash(sk, 1);
382 void tcp_unhash(struct sock *sk)
389 if (sk->sk_state == TCP_LISTEN) {
392 lock = &tcp_lhash_lock;
394 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
396 write_lock_bh(&head->lock);
399 if (__sk_del_node_init(sk))
400 sock_prot_dec_use(sk->sk_prot);
401 write_unlock_bh(lock);
404 if (sk->sk_state == TCP_LISTEN)
405 wake_up(&tcp_lhash_wait);
408 /* Don't inline this cruft. Here are some nice properties to
409 * exploit here. The BSD API does not allow a listening TCP
410 * to specify the remote port nor the remote address for the
411 * connection. So always assume those are both wildcarded
412 * during the search since they can never be otherwise.
414 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
415 unsigned short hnum, int dif)
417 struct sock *result = NULL, *sk;
418 struct hlist_node *node;
422 sk_for_each(sk, node, head) {
423 struct inet_sock *inet = inet_sk(sk);
425 if (inet->num == hnum && !ipv6_only_sock(sk)) {
426 __u32 rcv_saddr = inet->rcv_saddr;
428 score = (sk->sk_family == PF_INET ? 1 : 0);
430 if (rcv_saddr != daddr)
434 if (sk->sk_bound_dev_if) {
435 if (sk->sk_bound_dev_if != dif)
441 if (score > hiscore) {
450 /* Optimize the common listener case. */
451 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
452 unsigned short hnum, int dif)
454 struct sock *sk = NULL;
455 struct hlist_head *head;
457 read_lock(&tcp_lhash_lock);
458 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
459 if (!hlist_empty(head)) {
460 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
462 if (inet->num == hnum && !sk->sk_node.next &&
463 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
464 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
465 !sk->sk_bound_dev_if)
467 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
473 read_unlock(&tcp_lhash_lock);
477 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
480 * Local BH must be disabled here.
483 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
487 struct tcp_ehash_bucket *head;
488 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
489 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
491 struct hlist_node *node;
492 /* Optimize here for direct hit, only listening connections can
493 * have wildcards anyways.
495 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
496 head = &tcp_ehash[hash];
497 read_lock(&head->lock);
498 sk_for_each(sk, node, &head->chain) {
499 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
500 goto hit; /* You sunk my battleship! */
503 /* Must check for a TIME_WAIT'er before going to listener hash. */
504 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
505 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
510 read_unlock(&head->lock);
517 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
518 u32 daddr, u16 hnum, int dif)
520 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
523 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
526 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
532 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
538 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
540 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
542 return secure_tcp_sequence_number(skb->nh.iph->daddr,
548 /* called with local bh disabled */
549 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
550 struct tcp_tw_bucket **twp)
552 struct inet_sock *inet = inet_sk(sk);
553 u32 daddr = inet->rcv_saddr;
554 u32 saddr = inet->daddr;
555 int dif = sk->sk_bound_dev_if;
556 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
557 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
558 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
559 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
561 struct hlist_node *node;
562 struct tcp_tw_bucket *tw;
564 write_lock(&head->lock);
566 /* Check TIME-WAIT sockets first. */
567 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
568 tw = (struct tcp_tw_bucket *)sk2;
570 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
571 struct tcp_sock *tp = tcp_sk(sk);
573 /* With PAWS, it is safe from the viewpoint
574 of data integrity. Even without PAWS it
575 is safe provided sequence spaces do not
576 overlap i.e. at data rates <= 80Mbit/sec.
578 Actually, the idea is close to VJ's one,
579 only timestamp cache is held not per host,
580 but per port pair and TW bucket is used
583 If TW bucket has been already destroyed we
584 fall back to VJ's scheme and use initial
585 timestamp retrieved from peer table.
587 if (tw->tw_ts_recent_stamp &&
588 (!twp || (sysctl_tcp_tw_reuse &&
590 tw->tw_ts_recent_stamp > 1))) {
592 tw->tw_snd_nxt + 65535 + 2) == 0)
594 tp->rx_opt.ts_recent = tw->tw_ts_recent;
595 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
604 /* And established part... */
605 sk_for_each(sk2, node, &head->chain) {
606 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
611 /* Must record num and sport now. Otherwise we will see
612 * in hash table socket with a funny identity. */
614 inet->sport = htons(lport);
615 sk->sk_hashent = hash;
616 BUG_TRAP(sk_unhashed(sk));
617 __sk_add_node(sk, &head->chain);
618 sock_prot_inc_use(sk->sk_prot);
619 write_unlock(&head->lock);
623 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
625 /* Silly. Should hash-dance instead... */
626 tcp_tw_deschedule(tw);
627 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
635 write_unlock(&head->lock);
636 return -EADDRNOTAVAIL;
639 static inline u32 connect_port_offset(const struct sock *sk)
641 const struct inet_sock *inet = inet_sk(sk);
643 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
648 * Bind a port for a connect operation and hash it.
650 static inline int tcp_v4_hash_connect(struct sock *sk)
652 unsigned short snum = inet_sk(sk)->num;
653 struct tcp_bind_hashbucket *head;
654 struct tcp_bind_bucket *tb;
658 int low = sysctl_local_port_range[0];
659 int high = sysctl_local_port_range[1];
660 int range = high - low;
664 u32 offset = hint + connect_port_offset(sk);
665 struct hlist_node *node;
666 struct tcp_tw_bucket *tw = NULL;
669 for (i = 1; i <= range; i++) {
670 port = low + (i + offset) % range;
671 head = &tcp_bhash[tcp_bhashfn(port)];
672 spin_lock(&head->lock);
674 /* Does not bother with rcv_saddr checks,
675 * because the established check is already
678 tb_for_each(tb, node, &head->chain) {
679 if (tb->port == port) {
680 BUG_TRAP(!hlist_empty(&tb->owners));
681 if (tb->fastreuse >= 0)
683 if (!__tcp_v4_check_established(sk,
691 tb = tcp_bucket_create(head, port);
693 spin_unlock(&head->lock);
700 spin_unlock(&head->lock);
704 return -EADDRNOTAVAIL;
709 /* Head lock still held and bh's disabled */
710 tcp_bind_hash(sk, tb, port);
711 if (sk_unhashed(sk)) {
712 inet_sk(sk)->sport = htons(port);
713 __tcp_v4_hash(sk, 0);
715 spin_unlock(&head->lock);
718 tcp_tw_deschedule(tw);
726 head = &tcp_bhash[tcp_bhashfn(snum)];
727 tb = tcp_sk(sk)->bind_hash;
728 spin_lock_bh(&head->lock);
729 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
730 __tcp_v4_hash(sk, 0);
731 spin_unlock_bh(&head->lock);
734 spin_unlock(&head->lock);
735 /* No definite answer... Walk to established hash table */
736 ret = __tcp_v4_check_established(sk, snum, NULL);
743 /* This will initiate an outgoing connection. */
744 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
746 struct inet_sock *inet = inet_sk(sk);
747 struct tcp_sock *tp = tcp_sk(sk);
748 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
754 if (addr_len < sizeof(struct sockaddr_in))
757 if (usin->sin_family != AF_INET)
758 return -EAFNOSUPPORT;
760 nexthop = daddr = usin->sin_addr.s_addr;
761 if (inet->opt && inet->opt->srr) {
764 nexthop = inet->opt->faddr;
767 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
768 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
770 inet->sport, usin->sin_port, sk);
774 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
779 if (!inet->opt || !inet->opt->srr)
783 inet->saddr = rt->rt_src;
784 inet->rcv_saddr = inet->saddr;
786 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
787 /* Reset inherited state */
788 tp->rx_opt.ts_recent = 0;
789 tp->rx_opt.ts_recent_stamp = 0;
793 if (sysctl_tcp_tw_recycle &&
794 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
795 struct inet_peer *peer = rt_get_peer(rt);
797 /* VJ's idea. We save last timestamp seen from
798 * the destination in peer table, when entering state TIME-WAIT
799 * and initialize rx_opt.ts_recent from it, when trying new connection.
802 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
803 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
804 tp->rx_opt.ts_recent = peer->tcp_ts;
808 inet->dport = usin->sin_port;
811 tp->ext_header_len = 0;
813 tp->ext_header_len = inet->opt->optlen;
815 tp->rx_opt.mss_clamp = 536;
817 /* Socket identity is still unknown (sport may be zero).
818 * However we set state to SYN-SENT and not releasing socket
819 * lock select source port, enter ourselves into the hash tables and
820 * complete initialization after this.
822 tcp_set_state(sk, TCP_SYN_SENT);
823 err = tcp_v4_hash_connect(sk);
827 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
831 /* OK, now commit destination to socket. */
832 __sk_dst_set(sk, &rt->u.dst);
833 tcp_v4_setup_caps(sk, &rt->u.dst);
836 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
841 inet->id = tp->write_seq ^ jiffies;
843 err = tcp_connect(sk);
851 /* This unhashes the socket and releases the local port, if necessary. */
852 tcp_set_state(sk, TCP_CLOSE);
854 sk->sk_route_caps = 0;
859 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
861 return ((struct rtable *)skb->dst)->rt_iif;
864 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
866 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
869 static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
870 struct open_request ***prevp,
872 __u32 raddr, __u32 laddr)
874 struct tcp_listen_opt *lopt = tp->listen_opt;
875 struct open_request *req, **prev;
877 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
878 (req = *prev) != NULL;
879 prev = &req->dl_next) {
880 if (req->rmt_port == rport &&
881 req->af.v4_req.rmt_addr == raddr &&
882 req->af.v4_req.loc_addr == laddr &&
883 TCP_INET_FAMILY(req->class->family)) {
893 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
895 struct tcp_sock *tp = tcp_sk(sk);
896 struct tcp_listen_opt *lopt = tp->listen_opt;
897 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
899 req->expires = jiffies + TCP_TIMEOUT_INIT;
902 req->dl_next = lopt->syn_table[h];
904 write_lock(&tp->syn_wait_lock);
905 lopt->syn_table[h] = req;
906 write_unlock(&tp->syn_wait_lock);
913 * This routine does path mtu discovery as defined in RFC1191.
915 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
918 struct dst_entry *dst;
919 struct inet_sock *inet = inet_sk(sk);
920 struct tcp_sock *tp = tcp_sk(sk);
922 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
923 * send out by Linux are always <576bytes so they should go through
926 if (sk->sk_state == TCP_LISTEN)
929 /* We don't check in the destentry if pmtu discovery is forbidden
930 * on this route. We just assume that no packet_to_big packets
931 * are send back when pmtu discovery is not active.
932 * There is a small race when the user changes this flag in the
933 * route, but I think that's acceptable.
935 if ((dst = __sk_dst_check(sk, 0)) == NULL)
938 dst->ops->update_pmtu(dst, mtu);
940 /* Something is about to be wrong... Remember soft error
941 * for the case, if this connection will not able to recover.
943 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
944 sk->sk_err_soft = EMSGSIZE;
948 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
949 tp->pmtu_cookie > mtu) {
950 tcp_sync_mss(sk, mtu);
952 /* Resend the TCP packet because it's
953 * clear that the old packet has been
954 * dropped. This is the new "fast" path mtu
957 tcp_simple_retransmit(sk);
958 } /* else let the usual retransmit timer handle it */
962 * This routine is called by the ICMP module when it gets some
963 * sort of error condition. If err < 0 then the socket should
964 * be closed and the error returned to the user. If err > 0
965 * it's just the icmp type << 8 | icmp code. After adjustment
966 * header points to the first 8 bytes of the tcp header. We need
967 * to find the appropriate port.
969 * The locking strategy used here is very "optimistic". When
970 * someone else accesses the socket the ICMP is just dropped
971 * and for some paths there is no check at all.
972 * A more general error queue to queue errors for later handling
973 * is probably better.
977 void tcp_v4_err(struct sk_buff *skb, u32 info)
979 struct iphdr *iph = (struct iphdr *)skb->data;
980 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
982 struct inet_sock *inet;
983 int type = skb->h.icmph->type;
984 int code = skb->h.icmph->code;
989 if (skb->len < (iph->ihl << 2) + 8) {
990 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
994 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
995 th->source, tcp_v4_iif(skb));
997 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1000 if (sk->sk_state == TCP_TIME_WAIT) {
1001 tcp_tw_put((struct tcp_tw_bucket *)sk);
1006 /* If too many ICMPs get dropped on busy
1007 * servers this needs to be solved differently.
1009 if (sock_owned_by_user(sk))
1010 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1012 if (sk->sk_state == TCP_CLOSE)
1016 seq = ntohl(th->seq);
1017 if (sk->sk_state != TCP_LISTEN &&
1018 !between(seq, tp->snd_una, tp->snd_nxt)) {
1019 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1024 case ICMP_SOURCE_QUENCH:
1025 /* Just silently ignore these. */
1027 case ICMP_PARAMETERPROB:
1030 case ICMP_DEST_UNREACH:
1031 if (code > NR_ICMP_UNREACH)
1034 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1035 if (!sock_owned_by_user(sk))
1036 do_pmtu_discovery(sk, iph, info);
1040 err = icmp_err_convert[code].errno;
1042 case ICMP_TIME_EXCEEDED:
1049 switch (sk->sk_state) {
1050 struct open_request *req, **prev;
1052 if (sock_owned_by_user(sk))
1055 req = tcp_v4_search_req(tp, &prev, th->dest,
1056 iph->daddr, iph->saddr);
1060 /* ICMPs are not backlogged, hence we cannot get
1061 an established socket here.
1065 if (seq != req->snt_isn) {
1066 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1071 * Still in SYN_RECV, just remove it silently.
1072 * There is no good way to pass the error to the newly
1073 * created socket, and POSIX does not want network
1074 * errors returned from accept().
1076 tcp_synq_drop(sk, req, prev);
1080 case TCP_SYN_RECV: /* Cannot happen.
1081 It can f.e. if SYNs crossed.
1083 if (!sock_owned_by_user(sk)) {
1084 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1087 sk->sk_error_report(sk);
1091 sk->sk_err_soft = err;
1096 /* If we've already connected we will keep trying
1097 * until we time out, or the user gives up.
1099 * rfc1122 4.2.3.9 allows to consider as hard errors
1100 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1101 * but it is obsoleted by pmtu discovery).
1103 * Note, that in modern internet, where routing is unreliable
1104 * and in each dark corner broken firewalls sit, sending random
1105 * errors ordered by their masters even this two messages finally lose
1106 * their original sense (even Linux sends invalid PORT_UNREACHs)
1108 * Now we are in compliance with RFCs.
1113 if (!sock_owned_by_user(sk) && inet->recverr) {
1115 sk->sk_error_report(sk);
1116 } else { /* Only an error on timeout */
1117 sk->sk_err_soft = err;
1125 /* This routine computes an IPv4 TCP checksum. */
1126 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1127 struct sk_buff *skb)
1129 struct inet_sock *inet = inet_sk(sk);
1131 if (skb->ip_summed == CHECKSUM_HW) {
1132 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1133 skb->csum = offsetof(struct tcphdr, check);
1135 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1136 csum_partial((char *)th,
1143 * This routine will send an RST to the other tcp.
1145 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1147 * Answer: if a packet caused RST, it is not for a socket
1148 * existing in our system, if it is matched to a socket,
1149 * it is just duplicate segment or bug in other side's TCP.
1150 * So that we build reply only basing on parameters
1151 * arrived with segment.
1152 * Exception: precedence violation. We do not implement it in any case.
1155 static void tcp_v4_send_reset(struct sk_buff *skb)
1157 struct tcphdr *th = skb->h.th;
1159 struct ip_reply_arg arg;
1161 /* Never send a reset in response to a reset. */
1165 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1168 /* Swap the send and the receive. */
1169 memset(&rth, 0, sizeof(struct tcphdr));
1170 rth.dest = th->source;
1171 rth.source = th->dest;
1172 rth.doff = sizeof(struct tcphdr) / 4;
1176 rth.seq = th->ack_seq;
1179 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1180 skb->len - (th->doff << 2));
1183 memset(&arg, 0, sizeof arg);
1184 arg.iov[0].iov_base = (unsigned char *)&rth;
1185 arg.iov[0].iov_len = sizeof rth;
1186 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1187 skb->nh.iph->saddr, /*XXX*/
1188 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1189 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1191 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1193 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1194 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1197 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1198 outside socket context is ugly, certainly. What can I do?
1201 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1204 struct tcphdr *th = skb->h.th;
1209 struct ip_reply_arg arg;
1211 memset(&rep.th, 0, sizeof(struct tcphdr));
1212 memset(&arg, 0, sizeof arg);
1214 arg.iov[0].iov_base = (unsigned char *)&rep;
1215 arg.iov[0].iov_len = sizeof(rep.th);
1217 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1218 (TCPOPT_TIMESTAMP << 8) |
1220 rep.tsopt[1] = htonl(tcp_time_stamp);
1221 rep.tsopt[2] = htonl(ts);
1222 arg.iov[0].iov_len = sizeof(rep);
1225 /* Swap the send and the receive. */
1226 rep.th.dest = th->source;
1227 rep.th.source = th->dest;
1228 rep.th.doff = arg.iov[0].iov_len / 4;
1229 rep.th.seq = htonl(seq);
1230 rep.th.ack_seq = htonl(ack);
1232 rep.th.window = htons(win);
1234 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1235 skb->nh.iph->saddr, /*XXX*/
1236 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1237 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1239 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1241 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1244 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1246 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1248 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1249 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1254 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1256 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1260 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1261 struct open_request *req)
1264 struct ip_options *opt = req->af.v4_req.opt;
1265 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1267 { .daddr = ((opt && opt->srr) ?
1269 req->af.v4_req.rmt_addr),
1270 .saddr = req->af.v4_req.loc_addr,
1271 .tos = RT_CONN_FLAGS(sk) } },
1272 .proto = IPPROTO_TCP,
1274 { .sport = inet_sk(sk)->sport,
1275 .dport = req->rmt_port } } };
1277 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1278 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1281 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1283 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1290 * Send a SYN-ACK after having received an ACK.
1291 * This still operates on a open_request only, not on a big
1294 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1295 struct dst_entry *dst)
1298 struct sk_buff * skb;
1300 /* First, grab a route. */
1301 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1304 skb = tcp_make_synack(sk, dst, req);
1307 struct tcphdr *th = skb->h.th;
1309 th->check = tcp_v4_check(th, skb->len,
1310 req->af.v4_req.loc_addr,
1311 req->af.v4_req.rmt_addr,
1312 csum_partial((char *)th, skb->len,
1315 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1316 req->af.v4_req.rmt_addr,
1317 req->af.v4_req.opt);
1318 if (err == NET_XMIT_CN)
1328 * IPv4 open_request destructor.
1330 static void tcp_v4_or_free(struct open_request *req)
1332 if (req->af.v4_req.opt)
1333 kfree(req->af.v4_req.opt);
1336 static inline void syn_flood_warning(struct sk_buff *skb)
1338 static unsigned long warntime;
1340 if (time_after(jiffies, (warntime + HZ * 60))) {
1343 "possible SYN flooding on port %d. Sending cookies.\n",
1344 ntohs(skb->h.th->dest));
1349 * Save and compile IPv4 options into the open_request if needed.
1351 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1352 struct sk_buff *skb)
1354 struct ip_options *opt = &(IPCB(skb)->opt);
1355 struct ip_options *dopt = NULL;
1357 if (opt && opt->optlen) {
1358 int opt_size = optlength(opt);
1359 dopt = kmalloc(opt_size, GFP_ATOMIC);
1361 if (ip_options_echo(dopt, skb)) {
1371 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1372 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1373 * It would be better to replace it with a global counter for all sockets
1374 * but then some measure against one socket starving all other sockets
1377 * It was 128 by default. Experiments with real servers show, that
1378 * it is absolutely not enough even at 100conn/sec. 256 cures most
1379 * of problems. This value is adjusted to 128 for very small machines
1380 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1381 * Further increasing requires to change hash table size.
1383 int sysctl_max_syn_backlog = 256;
1385 struct or_calltable or_ipv4 = {
1387 .rtx_syn_ack = tcp_v4_send_synack,
1388 .send_ack = tcp_v4_or_send_ack,
1389 .destructor = tcp_v4_or_free,
1390 .send_reset = tcp_v4_send_reset,
/*
 * tcp_v4_conn_request - handle an incoming SYN on a LISTEN socket.
 * Allocates and queues an open_request, parses TCP options, applies
 * syncookie / TIME-WAIT-recycling defenses, then sends the SYN-ACK.
 * Returns 0 in all visible paths (drops are silent per TCP rules).
 * NOTE(review): excerpt is missing lines (labels, drop paths, closing
 * braces); comments below only describe what is visible here.
 */
1393 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1395 struct tcp_options_received tmp_opt;
1396 struct open_request *req;
1397 __u32 saddr = skb->nh.iph->saddr;
1398 __u32 daddr = skb->nh.iph->daddr;
/* "when" was stashed by the caller; non-zero isn means the SYN hit a
 * live TIME-WAIT bucket and its checks were already done there. */
1399 __u32 isn = TCP_SKB_CB(skb)->when;
1400 struct dst_entry *dst = NULL;
1401 #ifdef CONFIG_SYN_COOKIES
1402 int want_cookie = 0;
/* Without syncookie support, fold the flag to a compile-time 0. */
1404 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1407 /* Never answer to SYNs sent to broadcast or multicast */
1408 if (((struct rtable *)skb->dst)->rt_flags &
1409 (RTCF_BROADCAST | RTCF_MULTICAST))
1412 /* TW buckets are converted to open requests without
1413 * limitations, they conserve resources and peer is
1414 * evidently real one.
1416 if (tcp_synq_is_full(sk) && !isn) {
1417 #ifdef CONFIG_SYN_COOKIES
1418 if (sysctl_tcp_syncookies) {
1425 /* Accept backlog is full. If we have already queued enough
1426 * of warm entries in syn queue, drop request. It is better than
1427 * clogging syn queue with openreqs with exponentially increasing
1430 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1433 req = tcp_openreq_alloc();
1437 tcp_clear_options(&tmp_opt);
/* 536 is the protocol-minimum default MSS clamp before options parse. */
1438 tmp_opt.mss_clamp = 536;
1439 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1441 tcp_parse_options(skb, &tmp_opt, 0);
1444 tcp_clear_options(&tmp_opt);
1445 tmp_opt.saw_tstamp = 0;
1448 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1449 /* Some OSes (unknown ones, but I see them on web server, which
1450 * contains information interesting only for windows'
1451 * users) do not send their stamp in SYN. It is easy case.
1452 * We simply do not advertise TS support.
1454 tmp_opt.saw_tstamp = 0;
1455 tmp_opt.tstamp_ok = 0;
1457 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1459 tcp_openreq_init(req, &tmp_opt, skb);
1461 req->af.v4_req.loc_addr = daddr;
1462 req->af.v4_req.rmt_addr = saddr;
/* Copy IP options out of the skb; they must survive until the child
 * socket is created. */
1463 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1464 req->class = &or_ipv4;
1466 TCP_ECN_create_request(req, skb->h.th);
1469 #ifdef CONFIG_SYN_COOKIES
1470 syn_flood_warning(skb);
1472 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1474 struct inet_peer *peer = NULL;
1476 /* VJ's idea. We save last timestamp seen
1477 * from the destination in peer table, when entering
1478 * state TIME-WAIT, and check against it before
1479 * accepting new connection request.
1481 * If "isn" is not zero, this request hit alive
1482 * timewait bucket, so that all the necessary checks
1483 * are made in the function processing timewait state.
1485 if (tmp_opt.saw_tstamp &&
1486 sysctl_tcp_tw_recycle &&
1487 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1488 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1489 peer->v4daddr == saddr) {
/* PAWS-style rejection: a recent, newer timestamp from this peer
 * means this SYN is stale or spoofed. */
1490 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1491 (s32)(peer->tcp_ts - req->ts_recent) >
1493 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1498 /* Kill the following clause, if you dislike this way. */
1499 else if (!sysctl_tcp_syncookies &&
1500 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1501 (sysctl_max_syn_backlog >> 2)) &&
1502 (!peer || !peer->tcp_ts_stamp) &&
1503 (!dst || !dst_metric(dst, RTAX_RTT))) {
1504 /* Without syncookies last quarter of
1505 * backlog is filled with destinations,
1506 * proven to be alive.
1507 * It means that we continue to communicate
1508 * to destinations, already remembered
1509 * to the moment of synflood.
1511 NETDEBUG(if (net_ratelimit()) \
1512 printk(KERN_DEBUG "TCP: drop open "
1513 "request from %u.%u."
1516 ntohs(skb->h.th->source)));
1521 isn = tcp_v4_init_sequence(sk, skb);
1525 if (tcp_v4_send_synack(sk, req, dst))
/* SYN-ACK transmit failed: discard the request. */
1529 tcp_openreq_free(req);
1531 tcp_v4_synq_add(sk, req);
1536 tcp_openreq_free(req);
1538 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
/*
 * NOTE(review): excerpt is missing lines (NULL checks after allocation,
 * error labels); comments only describe visible statements.
 */
1544 * The three way handshake has completed - we got a valid synack -
1545 * now create the new socket.
1547 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1548 struct open_request *req,
1549 struct dst_entry *dst)
1551 struct inet_sock *newinet;
1552 struct tcp_sock *newtp;
/* Refuse if the listener's accept backlog is already full. */
1555 if (sk_acceptq_is_full(sk))
1558 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1561 newsk = tcp_create_openreq_child(sk, req, skb);
1565 newsk->sk_dst_cache = dst;
1566 tcp_v4_setup_caps(newsk, dst);
/* Copy the 4-tuple and IP options from the open_request into the
 * freshly created child socket. */
1568 newtp = tcp_sk(newsk);
1569 newinet = inet_sk(newsk);
1570 newinet->daddr = req->af.v4_req.rmt_addr;
1571 newinet->rcv_saddr = req->af.v4_req.loc_addr;
1572 newinet->saddr = req->af.v4_req.loc_addr;
1573 newinet->opt = req->af.v4_req.opt;
/* Ownership of the options moved to newinet; clear so the request's
 * destructor does not free them. */
1574 req->af.v4_req.opt = NULL;
1575 newinet->mc_index = tcp_v4_iif(skb);
1576 newinet->mc_ttl = skb->nh.iph->ttl;
1577 newtp->ext_header_len = 0;
1579 newtp->ext_header_len = newinet->opt->optlen;
1580 newinet->id = newtp->write_seq ^ jiffies;
1582 tcp_sync_mss(newsk, dst_mtu(dst));
1583 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1584 tcp_initialize_rcv_mss(newsk);
/* Insert the child into the established hash and inherit the port. */
1586 __tcp_v4_hash(newsk, 0);
1587 __tcp_inherit_port(sk, newsk);
1592 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1594 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
/*
 * tcp_v4_hnd_req - for a packet arriving on a LISTEN socket, find the
 * socket that should really process it: a pending open_request, an
 * established socket on the same 4-tuple, or (for syncookies) a child
 * reconstructed from a valid cookie ACK.
 */
1599 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1601 struct tcphdr *th = skb->h.th;
1602 struct iphdr *iph = skb->nh.iph;
1603 struct tcp_sock *tp = tcp_sk(sk);
1605 struct open_request **prev;
1606 /* Find possible connection requests. */
1607 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1608 iph->saddr, iph->daddr);
1610 return tcp_check_req(sk, skb, req, prev);
1612 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1619 if (nsk->sk_state != TCP_TIME_WAIT) {
/* A TIME-WAIT match is of no use here; release its reference. */
1623 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1627 #ifdef CONFIG_SYN_COOKIES
/* Bare ACK (no SYN/RST) may carry a valid syncookie. */
1628 if (!th->rst && !th->syn && th->ack)
1629 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
/*
 * tcp_v4_checksum_init - validate or defer the TCP checksum of an
 * incoming segment. Hardware-verified sums are accepted directly;
 * short packets (<=76 bytes) are verified in software now; larger ones
 * have a partial sum stored for later completion.
 */
1634 static int tcp_v4_checksum_init(struct sk_buff *skb)
1636 if (skb->ip_summed == CHECKSUM_HW) {
1637 skb->ip_summed = CHECKSUM_UNNECESSARY;
1638 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1639 skb->nh.iph->daddr, skb->csum))
/* Hardware claimed a good sum but software disagrees: distrust it
 * and fall back to CHECKSUM_NONE. */
1642 NETDEBUG(if (net_ratelimit())
1643 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1644 skb->ip_summed = CHECKSUM_NONE;
1646 if (skb->len <= 76) {
1647 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1649 skb_checksum(skb, 0, skb->len, 0)))
1651 skb->ip_summed = CHECKSUM_UNNECESSARY;
/* Larger packet: store the folded pseudo-header sum for deferred
 * verification during copy-to-user. */
1653 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1655 skb->nh.iph->daddr, 0);
1661 /* The socket must have its spinlock held when we get
1664 * We have a potential double-lock case here, so even when
1665 * doing backlog processing we use the BH locking scheme.
1666 * This is because we cannot sleep with the original spinlock
1669 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
/* ESTABLISHED segments take the header-prediction fast path. */
1671 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1672 TCP_CHECK_TIMER(sk);
1673 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1675 TCP_CHECK_TIMER(sk);
/* Slow path: first finish the checksum/length checks deferred by
 * tcp_v4_checksum_init(). */
1679 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1682 if (sk->sk_state == TCP_LISTEN) {
1683 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1688 if (tcp_child_process(sk, nsk, skb))
1694 TCP_CHECK_TIMER(sk);
1695 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1697 TCP_CHECK_TIMER(sk);
1701 tcp_v4_send_reset(skb);
1704 /* Be careful here. If this function gets more complicated and
1705 * gcc suffers from register pressure on the x86, sk (in %ebx)
1706 * might be destroyed here. This current version compiles correctly,
1707 * but you have been warned.
1712 TCP_INC_STATS_BH(TCP_MIB_INERRS);
/*
 * tcp_v4_rcv - main entry point for TCP segments from the IPv4 layer.
 * Validates header/checksum, fills the skb control block, looks up the
 * owning socket and dispatches: direct receive, prequeue, or backlog;
 * TIME-WAIT sockets and no-socket cases handled at the tail.
 * NOTE(review): excerpt is missing lines (labels, lock/unlock calls);
 * comments only describe visible statements.
 */
1720 int tcp_v4_rcv(struct sk_buff *skb)
/* Only process packets addressed to this host. */
1726 if (skb->pkt_type != PACKET_HOST)
1729 /* Count it even if it's bad */
1730 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1732 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
/* doff is in 32-bit words; must cover at least the base header. */
1737 if (th->doff < sizeof(struct tcphdr) / 4)
1739 if (!pskb_may_pull(skb, th->doff * 4))
1742 /* An explanation is required here, I think.
1743 * Packet length and doff are validated by header prediction,
1744 * provided case of th->doff==0 is eliminated.
1745 * So, we defer the checks. */
1746 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1747 tcp_v4_checksum_init(skb) < 0))
/* Cache sequence-space fields in the skb control block; SYN and FIN
 * each consume one sequence number. */
1751 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1752 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1753 skb->len - th->doff * 4);
1754 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1755 TCP_SKB_CB(skb)->when = 0;
1756 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1757 TCP_SKB_CB(skb)->sacked = 0;
1759 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1760 skb->nh.iph->daddr, ntohs(th->dest),
1767 if (sk->sk_state == TCP_TIME_WAIT)
1770 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1771 goto discard_and_relse;
1773 if (sk_filter(sk, skb, 0))
1774 goto discard_and_relse;
/* If no user process holds the socket, try prequeue then direct
 * receive; otherwise defer to the socket backlog. */
1780 if (!sock_owned_by_user(sk)) {
1781 if (!tcp_prequeue(sk, skb))
1782 ret = tcp_v4_do_rcv(sk, skb);
1784 sk_add_backlog(sk, skb);
/* No matching socket: police, verify checksum, then send a reset. */
1792 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1795 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1797 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1799 tcp_v4_send_reset(skb);
1803 /* Discard frame. */
/* TIME-WAIT handling: validate, then let the state machine decide. */
1812 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1813 tcp_tw_put((struct tcp_tw_bucket *) sk);
1817 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1818 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1819 tcp_tw_put((struct tcp_tw_bucket *) sk);
1822 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1823 skb, th, skb->len)) {
/* A new SYN during TIME-WAIT: hand off to a listener if one exists. */
1825 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1829 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1830 tcp_tw_put((struct tcp_tw_bucket *)sk);
1834 /* Fall through to ACK */
1837 tcp_v4_timewait_ack(sk, skb);
1841 case TCP_TW_SUCCESS:;
1846 /* With per-bucket locks this operation is not-atomic, so that
1847 * this version is not worse.
/* Re-insert the socket after its identity (source address) changed:
 * remove from, then re-add to, the protocol hash tables. */
1849 static void __tcp_v4_rehash(struct sock *sk)
1851 sk->sk_prot->unhash(sk);
1852 sk->sk_prot->hash(sk);
/*
 * tcp_v4_reselect_saddr - re-route the connection and, if the preferred
 * source address changed (ip_dynaddr), adopt it and rehash the socket.
 * Called from tcp_v4_rebuild_header() when routing has failed.
 */
1855 static int tcp_v4_reselect_saddr(struct sock *sk)
1857 struct inet_sock *inet = inet_sk(sk);
1860 __u32 old_saddr = inet->saddr;
1862 __u32 daddr = inet->daddr;
/* With strict source routing, route towards the first hop instead. */
1864 if (inet->opt && inet->opt->srr)
1865 daddr = inet->opt->faddr;
1867 /* Query new route. */
1868 err = ip_route_connect(&rt, daddr, 0,
1870 sk->sk_bound_dev_if,
1872 inet->sport, inet->dport, sk);
1876 __sk_dst_set(sk, &rt->u.dst);
1877 tcp_v4_setup_caps(sk, &rt->u.dst);
1879 new_saddr = rt->rt_src;
1881 if (new_saddr == old_saddr)
1884 if (sysctl_ip_dynaddr > 1) {
1885 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1886 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1888 NIPQUAD(new_saddr));
1891 inet->saddr = new_saddr;
1892 inet->rcv_saddr = new_saddr;
1894 /* XXX The only one ugly spot where we need to
1895 * XXX really change the sockets identity after
1896 * XXX it has entered the hashes. -DaveM
1898 * Besides that, it does not check for connection
1899 * uniqueness. Wait for troubles.
1901 __tcp_v4_rehash(sk);
/*
 * tcp_v4_rebuild_header - ensure the socket has a valid cached route,
 * re-querying routing if the cached dst has been invalidated. On
 * routing failure, optionally re-select the source address (ip_dynaddr
 * for SYN_SENT sockets without a locked bind address).
 */
1905 int tcp_v4_rebuild_header(struct sock *sk)
1907 struct inet_sock *inet = inet_sk(sk);
1908 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1912 /* Route is OK, nothing to do. */
1917 daddr = inet->daddr;
1918 if (inet->opt && inet->opt->srr)
1919 daddr = inet->opt->faddr;
1922 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1925 .saddr = inet->saddr,
1926 .tos = RT_CONN_FLAGS(sk) } },
1927 .proto = IPPROTO_TCP,
1929 { .sport = inet->sport,
1930 .dport = inet->dport } } };
1932 err = ip_route_output_flow(&rt, &fl, sk, 0);
1935 __sk_dst_set(sk, &rt->u.dst);
1936 tcp_v4_setup_caps(sk, &rt->u.dst);
1940 /* Routing failed... */
1941 sk->sk_route_caps = 0;
/* Only attempt a source-address reselect when ip_dynaddr is enabled,
 * the connection is still being established, and the user has not
 * explicitly bound the local address. */
1943 if (!sysctl_ip_dynaddr ||
1944 sk->sk_state != TCP_SYN_SENT ||
1945 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1946 (err = tcp_v4_reselect_saddr(sk)) != 0)
1947 sk->sk_err_soft = -err;
/*
 * v4_addr2sockaddr - fill a sockaddr_in with the peer's address and
 * port from the socket (used for getpeername-style queries).
 */
1952 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1954 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1955 struct inet_sock *inet = inet_sk(sk);
1957 sin->sin_family = AF_INET;
/* daddr/dport are already stored in network byte order. */
1958 sin->sin_addr.s_addr = inet->daddr;
1959 sin->sin_port = inet->dport;
1962 /* VJ's idea. Save last timestamp seen from this destination
1963 * and hold it at least for normal timewait interval to use for duplicate
1964 * segment detection in subsequent connections, before they enter synchronized
1968 int tcp_v4_remember_stamp(struct sock *sk)
1970 struct inet_sock *inet = inet_sk(sk);
1971 struct tcp_sock *tp = tcp_sk(sk);
1972 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1973 struct inet_peer *peer = NULL;
/* If the cached route does not match the peer, look the peer up
 * directly; otherwise bind the peer entry to the route. */
1976 if (!rt || rt->rt_dst != inet->daddr) {
1977 peer = inet_getpeer(inet->daddr, 1);
1981 rt_bind_peer(rt, 1);
/* Update the stored timestamp only if ours is newer, or the stored
 * one has aged past TCP_PAWS_MSL. */
1986 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1987 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1988 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1989 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1990 peer->tcp_ts = tp->rx_opt.ts_recent;
/*
 * tcp_v4_tw_remember_stamp - same timestamp-saving logic as
 * tcp_v4_remember_stamp(), but for a TIME-WAIT bucket instead of a
 * full socket.
 */
2000 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2002 struct inet_peer *peer = NULL;
2004 peer = inet_getpeer(tw->tw_daddr, 1);
2007 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2008 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2009 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2010 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2011 peer->tcp_ts = tw->tw_ts_recent;
/*
 * AF_INET-specific operations vector consumed by the AF-independent
 * TCP code (tp->af_specific); the IPv6 stack provides its own.
 */
2020 struct tcp_func ipv4_specific = {
2021 .queue_xmit = ip_queue_xmit,
2022 .send_check = tcp_v4_send_check,
2023 .rebuild_header = tcp_v4_rebuild_header,
2024 .conn_request = tcp_v4_conn_request,
2025 .syn_recv_sock = tcp_v4_syn_recv_sock,
2026 .remember_stamp = tcp_v4_remember_stamp,
2027 .net_header_len = sizeof(struct iphdr),
2028 .setsockopt = ip_setsockopt,
2029 .getsockopt = ip_getsockopt,
2030 .addr2sockaddr = v4_addr2sockaddr,
2031 .sockaddr_len = sizeof(struct sockaddr_in),
2034 /* NOTE: A lot of things set to zero explicitly by call to
2035 * sk_alloc() so need not be done here.
2037 static int tcp_v4_init_sock(struct sock *sk)
2039 struct tcp_sock *tp = tcp_sk(sk);
2041 skb_queue_head_init(&tp->out_of_order_queue);
2042 tcp_init_xmit_timers(sk);
2043 tcp_prequeue_init(tp);
/* Initial RTO and its mean deviation before any RTT samples exist. */
2045 tp->rto = TCP_TIMEOUT_INIT;
2046 tp->mdev = TCP_TIMEOUT_INIT;
2048 /* So many TCP implementations out there (incorrectly) count the
2049 * initial SYN frame in their delayed-ACK and congestion control
2050 * algorithms that we must have the following bandaid to talk
2051 * efficiently to them. -DaveM
2055 /* See draft-stevens-tcpca-spec-01 for discussion of the
2056 * initialization of these values.
2058 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2059 tp->snd_cwnd_clamp = ~0;
/* 536 = protocol-minimum default MSS until negotiated. */
2060 tp->mss_cache_std = tp->mss_cache = 536;
2062 tp->reordering = sysctl_tcp_reordering;
2064 sk->sk_state = TCP_CLOSE;
2066 sk->sk_write_space = sk_stream_write_space;
2067 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2069 tp->af_specific = &ipv4_specific;
2071 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2072 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2074 atomic_inc(&tcp_sockets_allocated);
/*
 * tcp_v4_destroy_sock - release everything a TCP socket still owns:
 * timers, queued skbs, bind bucket, cached sendmsg page, and the
 * global socket count.
 */
2079 int tcp_v4_destroy_sock(struct sock *sk)
2081 struct tcp_sock *tp = tcp_sk(sk);
2083 tcp_clear_xmit_timers(sk);
2085 /* Cleanup up the write buffer. */
2086 sk_stream_writequeue_purge(sk);
2088 /* Cleans up our, hopefully empty, out_of_order_queue. */
2089 __skb_queue_purge(&tp->out_of_order_queue);
2091 /* Clean prequeue, it must be empty really */
2092 __skb_queue_purge(&tp->ucopy.prequeue);
2094 /* Clean up a referenced TCP bind bucket. */
2099 * If sendmsg cached page exists, toss it.
2101 if (sk->sk_sndmsg_page) {
2102 __free_page(sk->sk_sndmsg_page);
2103 sk->sk_sndmsg_page = NULL;
2106 atomic_dec(&tcp_sockets_allocated);
2111 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2113 #ifdef CONFIG_PROC_FS
2114 /* Proc filesystem TCP sock list dumping. */
/* First TIME-WAIT bucket on a hash chain, or NULL if the chain is
 * empty. */
2116 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2118 return hlist_empty(head) ? NULL :
2119 list_entry(head->first, struct tcp_tw_bucket, tw_node);
/* Next TIME-WAIT bucket on the chain, or NULL at the end. */
2122 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2124 return tw->tw_node.next ?
2125 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
/*
 * listening_get_next - /proc iterator step over the listening hash,
 * including each listener's SYN queue (open_requests). The iterator
 * state machine toggles between TCP_SEQ_STATE_LISTENING and
 * TCP_SEQ_STATE_OPENREQ, holding syn_wait_lock while inside a SYN
 * queue. NOTE(review): excerpt is missing lines; comments describe
 * only the visible control flow.
 */
2128 static void *listening_get_next(struct seq_file *seq, void *cur)
2130 struct tcp_sock *tp;
2131 struct hlist_node *node;
2132 struct sock *sk = cur;
2133 struct tcp_iter_state* st = seq->private;
/* cur == NULL: start from the first listening-hash bucket. */
2137 sk = sk_head(&tcp_listening_hash[0]);
2143 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2144 struct open_request *req = cur;
2146 tp = tcp_sk(st->syn_wait_sk);
2150 if (req->class->family == st->family) {
/* Advance through the listener's SYN-queue hash buckets. */
2156 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2159 req = tp->listen_opt->syn_table[st->sbucket];
/* SYN queue exhausted: move on to the next listening socket. */
2161 sk = sk_next(st->syn_wait_sk);
2162 st->state = TCP_SEQ_STATE_LISTENING;
2163 read_unlock_bh(&tp->syn_wait_lock);
2166 read_lock_bh(&tp->syn_wait_lock);
2167 if (tp->listen_opt && tp->listen_opt->qlen)
2169 read_unlock_bh(&tp->syn_wait_lock);
2173 sk_for_each_from(sk, node) {
2174 if (sk->sk_family == st->family) {
/* Listener has pending open_requests: descend into its SYN queue. */
2179 read_lock_bh(&tp->syn_wait_lock);
2180 if (tp->listen_opt && tp->listen_opt->qlen) {
2182 st->uid = sock_i_uid(sk);
2183 st->syn_wait_sk = sk;
2184 st->state = TCP_SEQ_STATE_OPENREQ;
2188 read_unlock_bh(&tp->syn_wait_lock);
2190 if (++st->bucket < TCP_LHTABLE_SIZE) {
2191 sk = sk_head(&tcp_listening_hash[st->bucket]);
/* Seek to the pos-th entry of the listening-socket iteration. */
2199 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2201 void *rc = listening_get_next(seq, NULL);
2203 while (rc && *pos) {
2204 rc = listening_get_next(seq, rc);
/*
 * established_get_first - find the first socket of the requested family
 * in the established hash; each bucket's second half (offset by
 * tcp_ehash_size) holds TIME-WAIT buckets, scanned after the live ones.
 */
2210 static void *established_get_first(struct seq_file *seq)
2212 struct tcp_iter_state* st = seq->private;
2215 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2217 struct hlist_node *node;
2218 struct tcp_tw_bucket *tw;
2220 /* We can reschedule _before_ having picked the target: */
2221 cond_resched_softirq();
2223 read_lock(&tcp_ehash[st->bucket].lock);
2224 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2225 if (sk->sk_family != st->family) {
2231 st->state = TCP_SEQ_STATE_TIME_WAIT;
2232 tw_for_each(tw, node,
2233 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2234 if (tw->tw_family != st->family) {
2240 read_unlock(&tcp_ehash[st->bucket].lock);
2241 st->state = TCP_SEQ_STATE_ESTABLISHED;
/*
 * established_get_next - advance the established/TIME-WAIT iterator by
 * one entry, crossing bucket boundaries (dropping one bucket lock and
 * taking the next) as needed.
 */
2247 static void *established_get_next(struct seq_file *seq, void *cur)
2249 struct sock *sk = cur;
2250 struct tcp_tw_bucket *tw;
2251 struct hlist_node *node;
2252 struct tcp_iter_state* st = seq->private;
2256 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
/* Skip TIME-WAIT entries of other address families. */
2260 while (tw && tw->tw_family != st->family) {
2267 read_unlock(&tcp_ehash[st->bucket].lock);
2268 st->state = TCP_SEQ_STATE_ESTABLISHED;
2270 /* We can reschedule between buckets: */
2271 cond_resched_softirq();
2273 if (++st->bucket < tcp_ehash_size) {
2274 read_lock(&tcp_ehash[st->bucket].lock);
2275 sk = sk_head(&tcp_ehash[st->bucket].chain);
2283 sk_for_each_from(sk, node) {
2284 if (sk->sk_family == st->family)
/* End of live sockets in this bucket: switch to its TIME-WAIT half. */
2288 st->state = TCP_SEQ_STATE_TIME_WAIT;
2289 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
/* Seek to the pos-th entry of the established-socket iteration. */
2297 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2299 void *rc = established_get_first(seq);
2302 rc = established_get_next(seq, rc);
/*
 * tcp_get_idx - position the seq_file iterator at absolute index pos:
 * first walk the listening table, then fall through to the established
 * table if pos lies beyond it.
 */
2308 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2311 struct tcp_iter_state* st = seq->private;
2314 st->state = TCP_SEQ_STATE_LISTENING;
2315 rc = listening_get_idx(seq, &pos);
2318 tcp_listen_unlock();
2320 st->state = TCP_SEQ_STATE_ESTABLISHED;
2321 rc = established_get_idx(seq, pos);
/* seq_file .start: *pos == 0 yields the header token, otherwise seek
 * to entry (*pos - 1). */
2327 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2329 struct tcp_iter_state* st = seq->private;
2330 st->state = TCP_SEQ_STATE_LISTENING;
2332 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file .next: advance the iterator one entry, transitioning from
 * the listening walk to the established walk when the former runs out.
 */
2335 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2338 struct tcp_iter_state* st;
2340 if (v == SEQ_START_TOKEN) {
2341 rc = tcp_get_idx(seq, 0);
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_OPENREQ:
2348 case TCP_SEQ_STATE_LISTENING:
2349 rc = listening_get_next(seq, v);
/* Listening table exhausted: release its lock and start on the
 * established table. */
2351 tcp_listen_unlock();
2353 st->state = TCP_SEQ_STATE_ESTABLISHED;
2354 rc = established_get_first(seq);
2357 case TCP_SEQ_STATE_ESTABLISHED:
2358 case TCP_SEQ_STATE_TIME_WAIT:
2359 rc = established_get_next(seq, v);
/*
 * seq_file .stop: drop whichever lock the iterator still holds for its
 * current state (syn_wait_lock, listen lock, or an ehash bucket lock).
 */
2367 static void tcp_seq_stop(struct seq_file *seq, void *v)
2369 struct tcp_iter_state* st = seq->private;
2371 switch (st->state) {
2372 case TCP_SEQ_STATE_OPENREQ:
2374 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2375 read_unlock_bh(&tp->syn_wait_lock);
2377 case TCP_SEQ_STATE_LISTENING:
/* The header token is emitted before any lock is taken. */
2378 if (v != SEQ_START_TOKEN)
2379 tcp_listen_unlock();
2381 case TCP_SEQ_STATE_TIME_WAIT:
2382 case TCP_SEQ_STATE_ESTABLISHED:
2384 read_unlock(&tcp_ehash[st->bucket].lock);
/*
 * tcp_seq_open - open handler shared by the per-family /proc files:
 * allocate per-reader iterator state, wire in the seq_operations, and
 * hand the state to seq_open().
 */
2390 static int tcp_seq_open(struct inode *inode, struct file *file)
/* Per-family parameters were stashed in the proc entry's ->data. */
2392 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2393 struct seq_file *seq;
2394 struct tcp_iter_state *s;
2397 if (unlikely(afinfo == NULL))
2400 s = kmalloc(sizeof(*s), GFP_KERNEL);
2403 memset(s, 0, sizeof(*s));
2404 s->family = afinfo->family;
2405 s->seq_ops.start = tcp_seq_start;
2406 s->seq_ops.next = tcp_seq_next;
2407 s->seq_ops.show = afinfo->seq_show;
2408 s->seq_ops.stop = tcp_seq_stop;
2410 rc = seq_open(file, &s->seq_ops);
2413 seq = file->private_data;
/*
 * tcp_proc_register - fill in the caller-supplied file_operations and
 * create the /proc/net entry for one address family's TCP listing.
 */
2422 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2425 struct proc_dir_entry *p;
2429 afinfo->seq_fops->owner = afinfo->owner;
2430 afinfo->seq_fops->open = tcp_seq_open;
2431 afinfo->seq_fops->read = seq_read;
2432 afinfo->seq_fops->llseek = seq_lseek;
/* seq_release_private also frees the tcp_iter_state allocated in
 * tcp_seq_open(). */
2433 afinfo->seq_fops->release = seq_release_private;
2435 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
/* Remove the /proc/net entry and scrub the shared file_operations. */
2443 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2447 proc_net_remove(afinfo->name);
2448 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
/*
 * get_openreq4 - format one SYN_RECV open_request as a /proc/net/tcp
 * line into tmpbuf (field layout matches get_tcp4_sock()).
 */
2451 static void get_openreq4(struct sock *sk, struct open_request *req,
2452 char *tmpbuf, int i, int uid)
/* Remaining time until the request's retransmit/expiry timer fires. */
2454 int ttd = req->expires - jiffies;
2456 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2457 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2459 req->af.v4_req.loc_addr,
2460 ntohs(inet_sk(sk)->sport),
2461 req->af.v4_req.rmt_addr,
2462 ntohs(req->rmt_port),
2464 0, 0, /* could print option size, but that is af dependent. */
2465 1, /* timers active (only the expire timer) */
2466 jiffies_to_clock_t(ttd),
2469 0, /* non standard timer */
2470 0, /* open_requests have no inode */
2471 atomic_read(&sk->sk_refcnt),
/*
 * get_tcp4_sock - format one full TCP socket as a /proc/net/tcp line
 * into tmpbuf, including which timer (retransmit, probe, keepalive) is
 * currently pending and when it expires.
 */
2475 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2478 unsigned long timer_expires;
2479 struct tcp_sock *tp = tcp_sk(sp);
2480 struct inet_sock *inet = inet_sk(sp);
2481 unsigned int dest = inet->daddr;
2482 unsigned int src = inet->rcv_saddr;
2483 __u16 destp = ntohs(inet->dport);
2484 __u16 srcp = ntohs(inet->sport);
/* Pick the active timer for the "tm->when" column; the assignments
 * setting the timer-type code are not visible in this excerpt. */
2486 if (tp->pending == TCP_TIME_RETRANS) {
2488 timer_expires = tp->timeout;
2489 } else if (tp->pending == TCP_TIME_PROBE0) {
2491 timer_expires = tp->timeout;
2492 } else if (timer_pending(&sp->sk_timer)) {
2494 timer_expires = sp->sk_timer.expires;
2497 timer_expires = jiffies;
2500 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2501 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2502 i, src, srcp, dest, destp, sp->sk_state,
2503 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2505 jiffies_to_clock_t(timer_expires - jiffies),
2510 atomic_read(&sp->sk_refcnt), sp,
2511 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
/* ssthresh >= 0xFFFF is treated as "not yet set" and shown as -1. */
2513 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
/*
 * get_timewait4_sock - format one TIME-WAIT bucket as a /proc/net/tcp
 * line into tmpbuf; most columns are zero since a TIME-WAIT bucket
 * carries no queues or congestion state.
 */
2516 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2518 unsigned int dest, src;
/* Remaining TIME-WAIT lifetime in jiffies. */
2520 int ttd = tw->tw_ttd - jiffies;
2525 dest = tw->tw_daddr;
2526 src = tw->tw_rcv_saddr;
2527 destp = ntohs(tw->tw_dport);
2528 srcp = ntohs(tw->tw_sport);
2530 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2531 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2532 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2533 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2534 atomic_read(&tw->tw_refcnt), tw);
/*
 * tcp4_seq_show - seq_file .show for /proc/net/tcp: emit the header
 * row for the start token, otherwise dispatch on iterator state to the
 * matching formatter and print the resulting line.
 */
2539 static int tcp4_seq_show(struct seq_file *seq, void *v)
2541 struct tcp_iter_state* st;
2542 char tmpbuf[TMPSZ + 1];
2544 if (v == SEQ_START_TOKEN) {
2545 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2546 " sl local_address rem_address st tx_queue "
2547 "rx_queue tr tm->when retrnsmt uid timeout "
2553 switch (st->state) {
2554 case TCP_SEQ_STATE_LISTENING:
2555 case TCP_SEQ_STATE_ESTABLISHED:
2556 get_tcp4_sock(v, tmpbuf, st->num);
2558 case TCP_SEQ_STATE_OPENREQ:
2559 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2561 case TCP_SEQ_STATE_TIME_WAIT:
2562 get_timewait4_sock(v, tmpbuf, st->num);
/* Pad every line to the fixed TMPSZ-1 width expected by readers. */
2565 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
/* AF_INET instance of the /proc/net TCP listing; the fops struct is
 * filled in by tcp_proc_register(). */
2570 static struct file_operations tcp4_seq_fops;
2571 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2572 .owner = THIS_MODULE,
2575 .seq_show = tcp4_seq_show,
2576 .seq_fops = &tcp4_seq_fops,
/* Register /proc/net/tcp at boot. */
2579 int __init tcp4_proc_init(void)
2581 return tcp_proc_register(&tcp4_seq_afinfo);
/* Tear down /proc/net/tcp. */
2584 void tcp4_proc_exit(void)
2586 tcp_proc_unregister(&tcp4_seq_afinfo);
2588 #endif /* CONFIG_PROC_FS */
/*
 * tcp_prot - the struct proto registered for IPv4 TCP: the operations
 * and memory-accounting knobs the generic socket layer calls into.
 */
2590 struct proto tcp_prot = {
2592 .owner = THIS_MODULE,
2594 .connect = tcp_v4_connect,
2595 .disconnect = tcp_disconnect,
2596 .accept = tcp_accept,
2598 .init = tcp_v4_init_sock,
2599 .destroy = tcp_v4_destroy_sock,
2600 .shutdown = tcp_shutdown,
2601 .setsockopt = tcp_setsockopt,
2602 .getsockopt = tcp_getsockopt,
2603 .sendmsg = tcp_sendmsg,
2604 .recvmsg = tcp_recvmsg,
2605 .backlog_rcv = tcp_v4_do_rcv,
2606 .hash = tcp_v4_hash,
2607 .unhash = tcp_unhash,
2608 .get_port = tcp_v4_get_port,
2609 .enter_memory_pressure = tcp_enter_memory_pressure,
2610 .sockets_allocated = &tcp_sockets_allocated,
2611 .memory_allocated = &tcp_memory_allocated,
2612 .memory_pressure = &tcp_memory_pressure,
2613 .sysctl_mem = sysctl_tcp_mem,
2614 .sysctl_wmem = sysctl_tcp_wmem,
2615 .sysctl_rmem = sysctl_tcp_rmem,
2616 .max_header = MAX_TCP_HEADER,
2617 .obj_size = sizeof(struct tcp_sock),
/*
 * tcp_v4_init - boot-time setup: create the internal control socket
 * used for sending resets/ACKs not tied to a user socket, and keep it
 * out of the hash tables so it never receives traffic.
 */
2622 void __init tcp_v4_init(struct net_proto_family *ops)
2624 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2626 panic("Failed to create the TCP control socket.\n");
/* Control-socket transmissions may occur in atomic context. */
2627 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2628 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2630 /* Unhash it so that IP input processing does not even
2631 * see it, we do not wish this socket to see incoming
2634 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
/* Symbols exported for use by other parts of the kernel and by
 * loadable modules. */
2637 EXPORT_SYMBOL(ipv4_specific);
2638 EXPORT_SYMBOL(tcp_bind_hash);
2639 EXPORT_SYMBOL(tcp_bucket_create);
2640 EXPORT_SYMBOL(tcp_hashinfo);
2641 EXPORT_SYMBOL(tcp_inherit_port);
2642 EXPORT_SYMBOL(tcp_listen_wlock);
2643 EXPORT_SYMBOL(tcp_port_rover);
2644 EXPORT_SYMBOL(tcp_prot);
2645 EXPORT_SYMBOL(tcp_put_port);
2646 EXPORT_SYMBOL(tcp_unhash);
2647 EXPORT_SYMBOL(tcp_v4_conn_request);
2648 EXPORT_SYMBOL(tcp_v4_connect);
2649 EXPORT_SYMBOL(tcp_v4_do_rcv);
2650 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2651 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2652 EXPORT_SYMBOL(tcp_v4_send_check);
2653 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2655 #ifdef CONFIG_PROC_FS
2656 EXPORT_SYMBOL(tcp_proc_register);
2657 EXPORT_SYMBOL(tcp_proc_unregister);
2659 EXPORT_SYMBOL(sysctl_local_port_range);
2660 EXPORT_SYMBOL(sysctl_max_syn_backlog);
2661 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2662 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);