[INET]: Just rename the TCP hashtable functions/structs to inet_
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      to a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 };
98
99 /*
100  * This array holds the first and last local port number.
101  * For high-usage systems, use sysctl to change this to
102  * 32768-61000
103  */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
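/* sysctl_local_port_range[] is what the net.ipv4.ip_local_port_range sysctl
 * reads and writes, so the change suggested above is typically just
 *
 *     echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * tcp_port_rover remembers where the last ephemeral-port search stopped, so
 * the next search continues from there instead of rescanning from 'low'.
 */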
106
107 /* Allocate and initialize a new local port bind bucket.
108  * The bindhash mutex for snum's hash chain must be held here.
109  */
110 struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
111                                                  struct inet_bind_hashbucket *head,
112                                                  const unsigned short snum)
113 {
114         struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
115         if (tb) {
116                 tb->port = snum;
117                 tb->fastreuse = 0;
118                 INIT_HLIST_HEAD(&tb->owners);
119                 hlist_add_head(&tb->node, &head->chain);
120         }
121         return tb;
122 }
123
124 /* Caller must hold hashbucket lock for this tb with local BH disabled */
125 void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
126 {
127         if (hlist_empty(&tb->owners)) {
128                 __hlist_del(&tb->node);
129                 kmem_cache_free(cachep, tb);
130         }
131 }
132
133 /* Caller must disable local BH processing. */
134 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
135 {
136         struct inet_bind_hashbucket *head =
137                                 &tcp_bhash[inet_bhashfn(inet_sk(child)->num,
138                                                         tcp_bhash_size)];
139         struct inet_bind_bucket *tb;
140
141         spin_lock(&head->lock);
142         tb = tcp_sk(sk)->bind_hash;
143         sk_add_bind_node(child, &tb->owners);
144         tcp_sk(child)->bind_hash = tb;
145         spin_unlock(&head->lock);
146 }
147
148 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
149 {
150         local_bh_disable();
151         __tcp_inherit_port(sk, child);
152         local_bh_enable();
153 }
154
155 void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
156                    const unsigned short snum)
157 {
158         inet_sk(sk)->num = snum;
159         sk_add_bind_node(sk, &tb->owners);
160         tcp_sk(sk)->bind_hash = tb;
161 }
162
163 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
164 {
165         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
166         struct sock *sk2;
167         struct hlist_node *node;
168         int reuse = sk->sk_reuse;
169
170         sk_for_each_bound(sk2, node, &tb->owners) {
171                 if (sk != sk2 &&
172                     !tcp_v6_ipv6only(sk2) &&
173                     (!sk->sk_bound_dev_if ||
174                      !sk2->sk_bound_dev_if ||
175                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
176                         if (!reuse || !sk2->sk_reuse ||
177                             sk2->sk_state == TCP_LISTEN) {
178                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
179                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
180                                     sk2_rcv_saddr == sk_rcv_saddr)
181                                         break;
182                         }
183                 }
184         }
185         return node != NULL;
186 }
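/* Rough summary of the loop above, for illustration only (device binding
 * aside): a second socket on the same port does NOT conflict when it is
 * bound to a different device, or when both sockets set SO_REUSEADDR and
 * the existing one is not listening, or when both are bound to specific,
 * different addresses.  E.g. with SO_REUSEADDR on both and no listener
 * involved,
 *
 *     bind(192.168.0.1:80)  after  bind(0.0.0.0:80)   ->  allowed
 *
 * while binding against a listener on the same port with a wildcard or
 * equal address is refused.
 */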
187
188 /* Obtain a reference to a local port for the given sock,
189  * if snum is zero it means select any available local port.
190  */
191 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
192 {
193         struct inet_bind_hashbucket *head;
194         struct hlist_node *node;
195         struct inet_bind_bucket *tb;
196         int ret;
197
198         local_bh_disable();
199         if (!snum) {
200                 int low = sysctl_local_port_range[0];
201                 int high = sysctl_local_port_range[1];
202                 int remaining = (high - low) + 1;
203                 int rover;
204
205                 spin_lock(&tcp_portalloc_lock);
206                 if (tcp_port_rover < low)
207                         rover = low;
208                 else
209                         rover = tcp_port_rover;
210                 do {
211                         rover++;
212                         if (rover > high)
213                                 rover = low;
214                         head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
215                         spin_lock(&head->lock);
216                         inet_bind_bucket_for_each(tb, node, &head->chain)
217                                 if (tb->port == rover)
218                                         goto next;
219                         break;
220                 next:
221                         spin_unlock(&head->lock);
222                 } while (--remaining > 0);
223                 tcp_port_rover = rover;
224                 spin_unlock(&tcp_portalloc_lock);
225
226                 /* Exhausted local port range during search?  It is not
227                  * possible for us to be holding one of the bind hash
228                  * locks if this test triggers, because if 'remaining'
229                  * drops to zero, we broke out of the do/while loop at
230                  * the top level, not from the 'break;' statement.
231                  */
232                 ret = 1;
233                 if (unlikely(remaining <= 0))
234                         goto fail;
235
236                 /* OK, here is the one we will use.  HEAD is
237                  * non-NULL and we hold its mutex.
238                  */
239                 snum = rover;
240         } else {
241                 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
242                 spin_lock(&head->lock);
243                 inet_bind_bucket_for_each(tb, node, &head->chain)
244                         if (tb->port == snum)
245                                 goto tb_found;
246         }
247         tb = NULL;
248         goto tb_not_found;
249 tb_found:
250         if (!hlist_empty(&tb->owners)) {
251                 if (sk->sk_reuse > 1)
252                         goto success;
253                 if (tb->fastreuse > 0 &&
254                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
255                         goto success;
256                 } else {
257                         ret = 1;
258                         if (tcp_bind_conflict(sk, tb))
259                                 goto fail_unlock;
260                 }
261         }
262 tb_not_found:
263         ret = 1;
264         if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
265                 goto fail_unlock;
266         if (hlist_empty(&tb->owners)) {
267                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
268                         tb->fastreuse = 1;
269                 else
270                         tb->fastreuse = 0;
271         } else if (tb->fastreuse &&
272                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
273                 tb->fastreuse = 0;
274 success:
275         if (!tcp_sk(sk)->bind_hash)
276                 tcp_bind_hash(sk, tb, snum);
277         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
278         ret = 0;
279
280 fail_unlock:
281         spin_unlock(&head->lock);
282 fail:
283         local_bh_enable();
284         return ret;
285 }
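/* For illustration, the snum == 0 branch above is what services an explicit
 * autobind from userspace (a sketch, not part of this file):
 *
 *     struct sockaddr_in a = { .sin_family = AF_INET };   (sin_port == 0)
 *     bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 * inet_bind() calls sk->sk_prot->get_port(), i.e. tcp_v4_get_port(), with
 * snum == 0; the rover then walks sysctl_local_port_range looking for a
 * bucket with no owners, and the chosen port ends up in inet_sk(sk)->num.
 */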
286
287 /* Get rid of any references to a local port held by the
288  * given sock.
289  */
290 static void __tcp_put_port(struct sock *sk)
291 {
292         struct inet_sock *inet = inet_sk(sk);
293         struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num,
294                                                                     tcp_bhash_size)];
295         struct inet_bind_bucket *tb;
296
297         spin_lock(&head->lock);
298         tb = tcp_sk(sk)->bind_hash;
299         __sk_del_bind_node(sk);
300         tcp_sk(sk)->bind_hash = NULL;
301         inet->num = 0;
302         inet_bind_bucket_destroy(tcp_bucket_cachep, tb);
303         spin_unlock(&head->lock);
304 }
305
306 void tcp_put_port(struct sock *sk)
307 {
308         local_bh_disable();
309         __tcp_put_port(sk);
310         local_bh_enable();
311 }
312
313 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
314  * Look, when several writers sleep and reader wakes them up, all but one
315  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
316  * this, _but_ remember, it adds useless work on UP machines (wake up each
317  * exclusive lock release). It should be ifdefed really.
318  */
319
320 void tcp_listen_wlock(void)
321 {
322         write_lock(&tcp_lhash_lock);
323
324         if (atomic_read(&tcp_lhash_users)) {
325                 DEFINE_WAIT(wait);
326
327                 for (;;) {
328                         prepare_to_wait_exclusive(&tcp_lhash_wait,
329                                                 &wait, TASK_UNINTERRUPTIBLE);
330                         if (!atomic_read(&tcp_lhash_users))
331                                 break;
332                         write_unlock_bh(&tcp_lhash_lock);
333                         schedule();
334                         write_lock_bh(&tcp_lhash_lock);
335                 }
336
337                 finish_wait(&tcp_lhash_wait, &wait);
338         }
339 }
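/* A rough sketch of the protocol above: lookups register themselves in
 * lhash_users (via the listen-lock helpers in the TCP headers) instead of
 * holding lhash_lock across the whole walk, so the writer first takes the
 * rwlock and then, if any such readers are still active, parks itself
 * exclusively on lhash_wait until the count drains to zero; the last
 * reader is expected to issue the wake_up.
 */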
340
341 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
342 {
343         struct hlist_head *list;
344         rwlock_t *lock;
345
346         BUG_TRAP(sk_unhashed(sk));
347         if (listen_possible && sk->sk_state == TCP_LISTEN) {
348                 list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
349                 lock = &tcp_lhash_lock;
350                 tcp_listen_wlock();
351         } else {
352                 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
353                 list = &tcp_ehash[sk->sk_hashent].chain;
354                 lock = &tcp_ehash[sk->sk_hashent].lock;
355                 write_lock(lock);
356         }
357         __sk_add_node(sk, list);
358         sock_prot_inc_use(sk->sk_prot);
359         write_unlock(lock);
360         if (listen_possible && sk->sk_state == TCP_LISTEN)
361                 wake_up(&tcp_lhash_wait);
362 }
363
364 static void tcp_v4_hash(struct sock *sk)
365 {
366         if (sk->sk_state != TCP_CLOSE) {
367                 local_bh_disable();
368                 __tcp_v4_hash(sk, 1);
369                 local_bh_enable();
370         }
371 }
372
373 void tcp_unhash(struct sock *sk)
374 {
375         rwlock_t *lock;
376
377         if (sk_unhashed(sk))
378                 goto ende;
379
380         if (sk->sk_state == TCP_LISTEN) {
381                 local_bh_disable();
382                 tcp_listen_wlock();
383                 lock = &tcp_lhash_lock;
384         } else {
385                 struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
386                 lock = &head->lock;
387                 write_lock_bh(&head->lock);
388         }
389
390         if (__sk_del_node_init(sk))
391                 sock_prot_dec_use(sk->sk_prot);
392         write_unlock_bh(lock);
393
394  ende:
395         if (sk->sk_state == TCP_LISTEN)
396                 wake_up(&tcp_lhash_wait);
397 }
398
399 /* Don't inline this cruft.  There are some nice properties to
400  * exploit here.  The BSD API does not allow a listening TCP
401  * to specify the remote port nor the remote address for the
402  * connection.  So always assume those are both wildcarded
403  * during the search since they can never be otherwise.
404  */
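/* The scoring used below, spelled out: a candidate starts at 1 if it is a
 * plain PF_INET socket, gains 2 more for a matching bound address and 2 for
 * a matching bound device, so a fully specified IPv4 listener scores
 * 1 + 2 + 2 = 5 and is returned on the spot; otherwise the highest score
 * (the most specific listener) wins.  A mismatch on a set field disqualifies
 * the candidate entirely, while unset fields simply add nothing.
 */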
405 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
406                                              const u32 daddr,
407                                              const unsigned short hnum,
408                                              const int dif)
409 {
410         struct sock *result = NULL, *sk;
411         struct hlist_node *node;
412         int score, hiscore;
413
414         hiscore=-1;
415         sk_for_each(sk, node, head) {
416                 struct inet_sock *inet = inet_sk(sk);
417
418                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
419                         __u32 rcv_saddr = inet->rcv_saddr;
420
421                         score = (sk->sk_family == PF_INET ? 1 : 0);
422                         if (rcv_saddr) {
423                                 if (rcv_saddr != daddr)
424                                         continue;
425                                 score+=2;
426                         }
427                         if (sk->sk_bound_dev_if) {
428                                 if (sk->sk_bound_dev_if != dif)
429                                         continue;
430                                 score+=2;
431                         }
432                         if (score == 5)
433                                 return sk;
434                         if (score > hiscore) {
435                                 hiscore = score;
436                                 result = sk;
437                         }
438                 }
439         }
440         return result;
441 }
442
443 /* Optimize the common listener case. */
444 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
445                                                   const unsigned short hnum,
446                                                   const int dif)
447 {
448         struct sock *sk = NULL;
449         struct hlist_head *head;
450
451         read_lock(&tcp_lhash_lock);
452         head = &tcp_listening_hash[inet_lhashfn(hnum)];
453         if (!hlist_empty(head)) {
454                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
455
456                 if (inet->num == hnum && !sk->sk_node.next &&
457                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
458                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
459                     !sk->sk_bound_dev_if)
460                         goto sherry_cache;
461                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
462         }
463         if (sk) {
464 sherry_cache:
465                 sock_hold(sk);
466         }
467         read_unlock(&tcp_lhash_lock);
468         return sk;
469 }
470
471 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
472  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
473  *
474  * Local BH must be disabled here.
475  */
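/* Demux order implied by the helpers below: the established half of
 * tcp_ehash is searched first, then the TIME_WAIT half that lives
 * tcp_ehash_size buckets further along, and only if both miss does
 * __tcp_v4_lookup() fall back to the listening hash via
 * tcp_v4_lookup_listener().
 */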
476
477 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
478                                                        const u16 sport,
479                                                        const u32 daddr,
480                                                        const u16 hnum,
481                                                        const int dif)
482 {
483         struct inet_ehash_bucket *head;
484         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
485         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
486         struct sock *sk;
487         struct hlist_node *node;
488         /* Optimize here for direct hit, only listening connections can
489          * have wildcards anyways.
490          */
491         const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
492         head = &tcp_ehash[hash];
493         read_lock(&head->lock);
494         sk_for_each(sk, node, &head->chain) {
495                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
496                         goto hit; /* You sunk my battleship! */
497         }
498
499         /* Must check for a TIME_WAIT'er before going to listener hash. */
500         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
501                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
502                         goto hit;
503         }
504         sk = NULL;
505 out:
506         read_unlock(&head->lock);
507         return sk;
508 hit:
509         sock_hold(sk);
510         goto out;
511 }
512
513 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
514                                            u32 daddr, u16 hnum, int dif)
515 {
516         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
517                                                       daddr, hnum, dif);
518
519         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
520 }
521
522 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
523                                   u16 dport, int dif)
524 {
525         struct sock *sk;
526
527         local_bh_disable();
528         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
529         local_bh_enable();
530
531         return sk;
532 }
533
534 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
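/* Example of the exported lookup, lifted from tcp_v4_err() below (shown
 * here purely for illustration):
 *
 *     sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
 *                        th->source, tcp_v4_iif(skb));
 *
 * There the headers come from our own packet echoed back inside an ICMP
 * error, so its destination address/port play the "saddr"/"sport" role of
 * the lookup.  Note that dport is passed in network byte order; the ntohs()
 * happens inside tcp_v4_lookup().
 */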
535
536 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
537 {
538         return secure_tcp_sequence_number(skb->nh.iph->daddr,
539                                           skb->nh.iph->saddr,
540                                           skb->h.th->dest,
541                                           skb->h.th->source);
542 }
543
544 /* called with local bh disabled */
545 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
546                                       struct tcp_tw_bucket **twp)
547 {
548         struct inet_sock *inet = inet_sk(sk);
549         u32 daddr = inet->rcv_saddr;
550         u32 saddr = inet->daddr;
551         int dif = sk->sk_bound_dev_if;
552         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
553         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
554         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
555         struct inet_ehash_bucket *head = &tcp_ehash[hash];
556         struct sock *sk2;
557         struct hlist_node *node;
558         struct tcp_tw_bucket *tw;
559
560         write_lock(&head->lock);
561
562         /* Check TIME-WAIT sockets first. */
563         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
564                 tw = (struct tcp_tw_bucket *)sk2;
565
566                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
567                         struct tcp_sock *tp = tcp_sk(sk);
568
569                         /* With PAWS, it is safe from the viewpoint
570                            of data integrity. Even without PAWS it
571                            is safe provided sequence spaces do not
572                            overlap i.e. at data rates <= 80Mbit/sec.
573
574                            Actually, the idea is close to VJ's one,
575                            only timestamp cache is held not per host,
576                            but per port pair and TW bucket is used
577                            as state holder.
578
579                            If TW bucket has been already destroyed we
580                            fall back to VJ's scheme and use initial
581                            timestamp retrieved from peer table.
582                          */
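                        /* The "+ 65535 + 2" used below starts the new
                         * connection's sequence space beyond anything the old
                         * TIME_WAIT pair could still deliver (65535 being the
                         * largest unscaled window), so stray old segments are
                         * not mistaken for new data.  Rough rationale only;
                         * the constant is exactly what the code uses.
                         */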
583                         if (tw->tw_ts_recent_stamp &&
584                             (!twp || (sysctl_tcp_tw_reuse &&
585                                       xtime.tv_sec -
586                                       tw->tw_ts_recent_stamp > 1))) {
587                                 if ((tp->write_seq =
588                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
589                                         tp->write_seq = 1;
590                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
591                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
592                                 sock_hold(sk2);
593                                 goto unique;
594                         } else
595                                 goto not_unique;
596                 }
597         }
598         tw = NULL;
599
600         /* And established part... */
601         sk_for_each(sk2, node, &head->chain) {
602                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
603                         goto not_unique;
604         }
605
606 unique:
607         /* Must record num and sport now. Otherwise we will see
608          * a socket with a funny identity in the hash table. */
609         inet->num = lport;
610         inet->sport = htons(lport);
611         sk->sk_hashent = hash;
612         BUG_TRAP(sk_unhashed(sk));
613         __sk_add_node(sk, &head->chain);
614         sock_prot_inc_use(sk->sk_prot);
615         write_unlock(&head->lock);
616
617         if (twp) {
618                 *twp = tw;
619                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
620         } else if (tw) {
621                 /* Silly. Should hash-dance instead... */
622                 tcp_tw_deschedule(tw);
623                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
624
625                 tcp_tw_put(tw);
626         }
627
628         return 0;
629
630 not_unique:
631         write_unlock(&head->lock);
632         return -EADDRNOTAVAIL;
633 }
634
635 static inline u32 connect_port_offset(const struct sock *sk)
636 {
637         const struct inet_sock *inet = inet_sk(sk);
638
639         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
640                                          inet->dport);
641 }
642
643 /*
644  * Bind a port for a connect operation and hash it.
645  */
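/* The ephemeral search below in one line: with offset derived from a keyed
 * hash of (rcv_saddr, daddr, dport) plus a static 'hint', candidate i is
 *
 *     port = low + (i + offset) % range;
 *
 * so different four-tuples probe the range in different orders, and the
 * hint (advanced by i on success) keeps successive connects from always
 * restarting at the same spot.
 */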
646 static inline int tcp_v4_hash_connect(struct sock *sk)
647 {
648         const unsigned short snum = inet_sk(sk)->num;
649         struct inet_bind_hashbucket *head;
650         struct inet_bind_bucket *tb;
651         int ret;
652
653         if (!snum) {
654                 int low = sysctl_local_port_range[0];
655                 int high = sysctl_local_port_range[1];
656                 int range = high - low;
657                 int i;
658                 int port;
659                 static u32 hint;
660                 u32 offset = hint + connect_port_offset(sk);
661                 struct hlist_node *node;
662                 struct tcp_tw_bucket *tw = NULL;
663
664                 local_bh_disable();
665                 for (i = 1; i <= range; i++) {
666                         port = low + (i + offset) % range;
667                         head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
668                         spin_lock(&head->lock);
669
670                         /* Does not bother with rcv_saddr checks,
671                          * because the established check is already
672                          * unique enough.
673                          */
674                         inet_bind_bucket_for_each(tb, node, &head->chain) {
675                                 if (tb->port == port) {
676                                         BUG_TRAP(!hlist_empty(&tb->owners));
677                                         if (tb->fastreuse >= 0)
678                                                 goto next_port;
679                                         if (!__tcp_v4_check_established(sk,
680                                                                         port,
681                                                                         &tw))
682                                                 goto ok;
683                                         goto next_port;
684                                 }
685                         }
686
687                         tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
688                         if (!tb) {
689                                 spin_unlock(&head->lock);
690                                 break;
691                         }
692                         tb->fastreuse = -1;
693                         goto ok;
694
695                 next_port:
696                         spin_unlock(&head->lock);
697                 }
698                 local_bh_enable();
699
700                 return -EADDRNOTAVAIL;
701
702 ok:
703                 hint += i;
704
705                 /* Head lock still held and bh's disabled */
706                 tcp_bind_hash(sk, tb, port);
707                 if (sk_unhashed(sk)) {
708                         inet_sk(sk)->sport = htons(port);
709                         __tcp_v4_hash(sk, 0);
710                 }
711                 spin_unlock(&head->lock);
712
713                 if (tw) {
714                         tcp_tw_deschedule(tw);
715                         tcp_tw_put(tw);
716                 }
717
718                 ret = 0;
719                 goto out;
720         }
721
722         head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
723         tb  = tcp_sk(sk)->bind_hash;
724         spin_lock_bh(&head->lock);
725         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
726                 __tcp_v4_hash(sk, 0);
727                 spin_unlock_bh(&head->lock);
728                 return 0;
729         } else {
730                 spin_unlock(&head->lock);
731                 /* No definite answer... Walk to established hash table */
732                 ret = __tcp_v4_check_established(sk, snum, NULL);
733 out:
734                 local_bh_enable();
735                 return ret;
736         }
737 }
738
739 /* This will initiate an outgoing connection. */
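/* For orientation (a sketch of the call path, nothing more): a userspace
 * connect() on a TCP socket reaches this function roughly as
 *
 *     connect(fd, ...) -> sys_connect() -> inet_stream_connect()
 *                      -> tcp_v4_connect()
 *
 * with the socket lock already held and the destination copied in as uaddr.
 */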
740 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
741 {
742         struct inet_sock *inet = inet_sk(sk);
743         struct tcp_sock *tp = tcp_sk(sk);
744         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
745         struct rtable *rt;
746         u32 daddr, nexthop;
747         int tmp;
748         int err;
749
750         if (addr_len < sizeof(struct sockaddr_in))
751                 return -EINVAL;
752
753         if (usin->sin_family != AF_INET)
754                 return -EAFNOSUPPORT;
755
756         nexthop = daddr = usin->sin_addr.s_addr;
757         if (inet->opt && inet->opt->srr) {
758                 if (!daddr)
759                         return -EINVAL;
760                 nexthop = inet->opt->faddr;
761         }
762
763         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
764                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
765                                IPPROTO_TCP,
766                                inet->sport, usin->sin_port, sk);
767         if (tmp < 0)
768                 return tmp;
769
770         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
771                 ip_rt_put(rt);
772                 return -ENETUNREACH;
773         }
774
775         if (!inet->opt || !inet->opt->srr)
776                 daddr = rt->rt_dst;
777
778         if (!inet->saddr)
779                 inet->saddr = rt->rt_src;
780         inet->rcv_saddr = inet->saddr;
781
782         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
783                 /* Reset inherited state */
784                 tp->rx_opt.ts_recent       = 0;
785                 tp->rx_opt.ts_recent_stamp = 0;
786                 tp->write_seq              = 0;
787         }
788
789         if (sysctl_tcp_tw_recycle &&
790             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
791                 struct inet_peer *peer = rt_get_peer(rt);
792
793                 /* VJ's idea. We save the last timestamp seen from
794                  * the destination in the peer table when entering state TIME-WAIT,
795                  * and initialize rx_opt.ts_recent from it when trying a new connection.
796                  */
797
798                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
799                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
800                         tp->rx_opt.ts_recent = peer->tcp_ts;
801                 }
802         }
803
804         inet->dport = usin->sin_port;
805         inet->daddr = daddr;
806
807         tp->ext_header_len = 0;
808         if (inet->opt)
809                 tp->ext_header_len = inet->opt->optlen;
810
811         tp->rx_opt.mss_clamp = 536;
812
813         /* Socket identity is still unknown (sport may be zero).
814          * However, we set the state to SYN-SENT and, without releasing the
815          * socket lock, select a source port, enter ourselves into the hash
816          * tables and complete initialization after this.
817          */
818         tcp_set_state(sk, TCP_SYN_SENT);
819         err = tcp_v4_hash_connect(sk);
820         if (err)
821                 goto failure;
822
823         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
824         if (err)
825                 goto failure;
826
827         /* OK, now commit destination to socket.  */
828         sk_setup_caps(sk, &rt->u.dst);
829
830         if (!tp->write_seq)
831                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
832                                                            inet->daddr,
833                                                            inet->sport,
834                                                            usin->sin_port);
835
836         inet->id = tp->write_seq ^ jiffies;
837
838         err = tcp_connect(sk);
839         rt = NULL;
840         if (err)
841                 goto failure;
842
843         return 0;
844
845 failure:
846         /* This unhashes the socket and releases the local port, if necessary. */
847         tcp_set_state(sk, TCP_CLOSE);
848         ip_rt_put(rt);
849         sk->sk_route_caps = 0;
850         inet->dport = 0;
851         return err;
852 }
853
854 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
855 {
856         return ((struct rtable *)skb->dst)->rt_iif;
857 }
858
859 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
860 {
861         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
862 }
863
864 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
865                                               struct request_sock ***prevp,
866                                               __u16 rport,
867                                               __u32 raddr, __u32 laddr)
868 {
869         struct listen_sock *lopt = tp->accept_queue.listen_opt;
870         struct request_sock *req, **prev;
871
872         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
873              (req = *prev) != NULL;
874              prev = &req->dl_next) {
875                 const struct inet_request_sock *ireq = inet_rsk(req);
876
877                 if (ireq->rmt_port == rport &&
878                     ireq->rmt_addr == raddr &&
879                     ireq->loc_addr == laddr &&
880                     TCP_INET_FAMILY(req->rsk_ops->family)) {
881                         BUG_TRAP(!req->sk);
882                         *prevp = prev;
883                         break;
884                 }
885         }
886
887         return req;
888 }
889
890 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
891 {
892         struct tcp_sock *tp = tcp_sk(sk);
893         struct listen_sock *lopt = tp->accept_queue.listen_opt;
894         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
895
896         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
897         tcp_synq_added(sk);
898 }
899
900
901 /*
902  * This routine does path mtu discovery as defined in RFC1191.
903  */
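/* In short: on ICMP_FRAG_NEEDED we push the reported value into the cached
 * route via dst->ops->update_pmtu(), remember a soft EMSGSIZE when we insist
 * on DF but the route would not come down that far, and if our pmtu_cookie
 * is now larger than the route MTU we re-sync the MSS and retransmit the
 * lost segment straight away (the "fast" path MTU discovery mentioned in
 * the changelog above).
 */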
904 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
905                                      u32 mtu)
906 {
907         struct dst_entry *dst;
908         struct inet_sock *inet = inet_sk(sk);
909         struct tcp_sock *tp = tcp_sk(sk);
910
911         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
912          * sent out by Linux are always < 576 bytes, so they should go through
913          * unfragmented).
914          */
915         if (sk->sk_state == TCP_LISTEN)
916                 return;
917
918         /* We don't check in the dst entry whether pmtu discovery is forbidden
919          * on this route. We just assume that no packet-too-big packets
920          * are sent back when pmtu discovery is not active.
921          * There is a small race when the user changes this flag in the
922          * route, but I think that's acceptable.
923          */
924         if ((dst = __sk_dst_check(sk, 0)) == NULL)
925                 return;
926
927         dst->ops->update_pmtu(dst, mtu);
928
929         /* Something is about to go wrong... Remember the soft error
930          * in case this connection is not able to recover.
931          */
932         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
933                 sk->sk_err_soft = EMSGSIZE;
934
935         mtu = dst_mtu(dst);
936
937         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
938             tp->pmtu_cookie > mtu) {
939                 tcp_sync_mss(sk, mtu);
940
941                 /* Resend the TCP packet because it's
942                  * clear that the old packet has been
943                  * dropped. This is the new "fast" path mtu
944                  * discovery.
945                  */
946                 tcp_simple_retransmit(sk);
947         } /* else let the usual retransmit timer handle it */
948 }
949
950 /*
951  * This routine is called by the ICMP module when it gets some
952  * sort of error condition.  If err < 0 then the socket should
953  * be closed and the error returned to the user.  If err > 0
954  * it's just the icmp type << 8 | icmp code.  After adjustment
955  * header points to the first 8 bytes of the tcp header.  We need
956  * to find the appropriate port.
957  *
958  * The locking strategy used here is very "optimistic". When
959  * someone else accesses the socket the ICMP is just dropped
960  * and for some paths there is no check at all.
961  * A more general error queue to queue errors for later handling
962  * is probably better.
963  *
964  */
965
966 void tcp_v4_err(struct sk_buff *skb, u32 info)
967 {
968         struct iphdr *iph = (struct iphdr *)skb->data;
969         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
970         struct tcp_sock *tp;
971         struct inet_sock *inet;
972         int type = skb->h.icmph->type;
973         int code = skb->h.icmph->code;
974         struct sock *sk;
975         __u32 seq;
976         int err;
977
978         if (skb->len < (iph->ihl << 2) + 8) {
979                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
980                 return;
981         }
982
983         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
984                            th->source, tcp_v4_iif(skb));
985         if (!sk) {
986                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
987                 return;
988         }
989         if (sk->sk_state == TCP_TIME_WAIT) {
990                 tcp_tw_put((struct tcp_tw_bucket *)sk);
991                 return;
992         }
993
994         bh_lock_sock(sk);
995         /* If too many ICMPs get dropped on busy
996          * servers this needs to be solved differently.
997          */
998         if (sock_owned_by_user(sk))
999                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1000
1001         if (sk->sk_state == TCP_CLOSE)
1002                 goto out;
1003
1004         tp = tcp_sk(sk);
1005         seq = ntohl(th->seq);
1006         if (sk->sk_state != TCP_LISTEN &&
1007             !between(seq, tp->snd_una, tp->snd_nxt)) {
1008                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1009                 goto out;
1010         }
1011
1012         switch (type) {
1013         case ICMP_SOURCE_QUENCH:
1014                 /* Just silently ignore these. */
1015                 goto out;
1016         case ICMP_PARAMETERPROB:
1017                 err = EPROTO;
1018                 break;
1019         case ICMP_DEST_UNREACH:
1020                 if (code > NR_ICMP_UNREACH)
1021                         goto out;
1022
1023                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1024                         if (!sock_owned_by_user(sk))
1025                                 do_pmtu_discovery(sk, iph, info);
1026                         goto out;
1027                 }
1028
1029                 err = icmp_err_convert[code].errno;
1030                 break;
1031         case ICMP_TIME_EXCEEDED:
1032                 err = EHOSTUNREACH;
1033                 break;
1034         default:
1035                 goto out;
1036         }
1037
1038         switch (sk->sk_state) {
1039                 struct request_sock *req, **prev;
1040         case TCP_LISTEN:
1041                 if (sock_owned_by_user(sk))
1042                         goto out;
1043
1044                 req = tcp_v4_search_req(tp, &prev, th->dest,
1045                                         iph->daddr, iph->saddr);
1046                 if (!req)
1047                         goto out;
1048
1049                 /* ICMPs are not backlogged, hence we cannot get
1050                    an established socket here.
1051                  */
1052                 BUG_TRAP(!req->sk);
1053
1054                 if (seq != tcp_rsk(req)->snt_isn) {
1055                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1056                         goto out;
1057                 }
1058
1059                 /*
1060                  * Still in SYN_RECV, just remove it silently.
1061                  * There is no good way to pass the error to the newly
1062                  * created socket, and POSIX does not want network
1063                  * errors returned from accept().
1064                  */
1065                 tcp_synq_drop(sk, req, prev);
1066                 goto out;
1067
1068         case TCP_SYN_SENT:
1069         case TCP_SYN_RECV:  /* Cannot happen.
1070                                It can, e.g., if SYNs crossed.
1071                              */
1072                 if (!sock_owned_by_user(sk)) {
1073                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1074                         sk->sk_err = err;
1075
1076                         sk->sk_error_report(sk);
1077
1078                         tcp_done(sk);
1079                 } else {
1080                         sk->sk_err_soft = err;
1081                 }
1082                 goto out;
1083         }
1084
1085         /* If we've already connected we will keep trying
1086          * until we time out, or the user gives up.
1087          *
1088          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
1089          * considered hard errors (well, FRAG_FAILED too,
1090          * but it is obsoleted by pmtu discovery).
1091          *
1092          * Note that in the modern internet, where routing is unreliable
1093          * and broken firewalls sit in every dark corner sending random
1094          * errors ordered by their masters, even these two messages finally
1095          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
1096          *
1097          * Now we are in compliance with RFCs.
1098          *                                                      --ANK (980905)
1099          */
1100
1101         inet = inet_sk(sk);
1102         if (!sock_owned_by_user(sk) && inet->recverr) {
1103                 sk->sk_err = err;
1104                 sk->sk_error_report(sk);
1105         } else  { /* Only an error on timeout */
1106                 sk->sk_err_soft = err;
1107         }
1108
1109 out:
1110         bh_unlock_sock(sk);
1111         sock_put(sk);
1112 }
1113
1114 /* This routine computes an IPv4 TCP checksum. */
1115 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1116                        struct sk_buff *skb)
1117 {
1118         struct inet_sock *inet = inet_sk(sk);
1119
1120         if (skb->ip_summed == CHECKSUM_HW) {
1121                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1122                 skb->csum = offsetof(struct tcphdr, check);
1123         } else {
1124                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1125                                          csum_partial((char *)th,
1126                                                       th->doff << 2,
1127                                                       skb->csum));
1128         }
1129 }
1130
1131 /*
1132  *      This routine will send an RST to the other tcp.
1133  *
1134  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1135  *                    for the reset?
1136  *      Answer: if a packet caused the RST, it is not for a socket
1137  *              existing in our system; if it is matched to a socket,
1138  *              it is just a duplicate segment or a bug in the other side's TCP.
1139  *              So we build the reply based only on the parameters
1140  *              that arrived with the segment.
1141  *      Exception: precedence violation. We do not implement it in any case.
1142  */
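/* What the code below actually builds: an RST answering only segments that
 * were addressed to this host, with the ports swapped, and with either the
 * sequence number taken straight from the offending segment's ACK field or,
 * if there was no ACK, seq 0 plus an ACK covering everything the segment
 * occupied (SYN + FIN + payload length).
 */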
1143
1144 static void tcp_v4_send_reset(struct sk_buff *skb)
1145 {
1146         struct tcphdr *th = skb->h.th;
1147         struct tcphdr rth;
1148         struct ip_reply_arg arg;
1149
1150         /* Never send a reset in response to a reset. */
1151         if (th->rst)
1152                 return;
1153
1154         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1155                 return;
1156
1157         /* Swap the send and the receive. */
1158         memset(&rth, 0, sizeof(struct tcphdr));
1159         rth.dest   = th->source;
1160         rth.source = th->dest;
1161         rth.doff   = sizeof(struct tcphdr) / 4;
1162         rth.rst    = 1;
1163
1164         if (th->ack) {
1165                 rth.seq = th->ack_seq;
1166         } else {
1167                 rth.ack = 1;
1168                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1169                                     skb->len - (th->doff << 2));
1170         }
1171
1172         memset(&arg, 0, sizeof arg);
1173         arg.iov[0].iov_base = (unsigned char *)&rth;
1174         arg.iov[0].iov_len  = sizeof rth;
1175         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1176                                       skb->nh.iph->saddr, /*XXX*/
1177                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1178         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1179
1180         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1181
1182         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1183         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1184 }
1185
1186 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1187    outside of socket context, is certainly ugly. What can I do?
1188  */
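/* Wire format of the stand-alone ACK built below: a bare tcphdr, optionally
 * followed by one aligned 12-byte timestamp option,
 *
 *     NOP, NOP, TIMESTAMP, 10, TSval = tcp_time_stamp, TSecr = ts
 *
 * which is why rep.tsopt[] is exactly three 32-bit words and doff is simply
 * arg.iov[0].iov_len / 4.
 */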
1189
1190 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1191                             u32 win, u32 ts)
1192 {
1193         struct tcphdr *th = skb->h.th;
1194         struct {
1195                 struct tcphdr th;
1196                 u32 tsopt[3];
1197         } rep;
1198         struct ip_reply_arg arg;
1199
1200         memset(&rep.th, 0, sizeof(struct tcphdr));
1201         memset(&arg, 0, sizeof arg);
1202
1203         arg.iov[0].iov_base = (unsigned char *)&rep;
1204         arg.iov[0].iov_len  = sizeof(rep.th);
1205         if (ts) {
1206                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1207                                      (TCPOPT_TIMESTAMP << 8) |
1208                                      TCPOLEN_TIMESTAMP);
1209                 rep.tsopt[1] = htonl(tcp_time_stamp);
1210                 rep.tsopt[2] = htonl(ts);
1211                 arg.iov[0].iov_len = sizeof(rep);
1212         }
1213
1214         /* Swap the send and the receive. */
1215         rep.th.dest    = th->source;
1216         rep.th.source  = th->dest;
1217         rep.th.doff    = arg.iov[0].iov_len / 4;
1218         rep.th.seq     = htonl(seq);
1219         rep.th.ack_seq = htonl(ack);
1220         rep.th.ack     = 1;
1221         rep.th.window  = htons(win);
1222
1223         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1224                                       skb->nh.iph->saddr, /*XXX*/
1225                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1226         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1227
1228         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1229
1230         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1231 }
1232
1233 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1234 {
1235         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1236
1237         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1238                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1239
1240         tcp_tw_put(tw);
1241 }
1242
1243 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1244 {
1245         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1246                         req->ts_recent);
1247 }
1248
1249 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1250                                           struct request_sock *req)
1251 {
1252         struct rtable *rt;
1253         const struct inet_request_sock *ireq = inet_rsk(req);
1254         struct ip_options *opt = inet_rsk(req)->opt;
1255         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1256                             .nl_u = { .ip4_u =
1257                                       { .daddr = ((opt && opt->srr) ?
1258                                                   opt->faddr :
1259                                                   ireq->rmt_addr),
1260                                         .saddr = ireq->loc_addr,
1261                                         .tos = RT_CONN_FLAGS(sk) } },
1262                             .proto = IPPROTO_TCP,
1263                             .uli_u = { .ports =
1264                                        { .sport = inet_sk(sk)->sport,
1265                                          .dport = ireq->rmt_port } } };
1266
1267         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1268                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1269                 return NULL;
1270         }
1271         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1272                 ip_rt_put(rt);
1273                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1274                 return NULL;
1275         }
1276         return &rt->u.dst;
1277 }
1278
1279 /*
1280  *      Send a SYN-ACK after having received an ACK.
1281  *      This still operates on a request_sock only, not on a big
1282  *      socket.
1283  */
1284 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1285                               struct dst_entry *dst)
1286 {
1287         const struct inet_request_sock *ireq = inet_rsk(req);
1288         int err = -1;
1289         struct sk_buff * skb;
1290
1291         /* First, grab a route. */
1292         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1293                 goto out;
1294
1295         skb = tcp_make_synack(sk, dst, req);
1296
1297         if (skb) {
1298                 struct tcphdr *th = skb->h.th;
1299
1300                 th->check = tcp_v4_check(th, skb->len,
1301                                          ireq->loc_addr,
1302                                          ireq->rmt_addr,
1303                                          csum_partial((char *)th, skb->len,
1304                                                       skb->csum));
1305
1306                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1307                                             ireq->rmt_addr,
1308                                             ireq->opt);
1309                 if (err == NET_XMIT_CN)
1310                         err = 0;
1311         }
1312
1313 out:
1314         dst_release(dst);
1315         return err;
1316 }
1317
1318 /*
1319  *      IPv4 request_sock destructor.
1320  */
1321 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1322 {
1323         if (inet_rsk(req)->opt)
1324                 kfree(inet_rsk(req)->opt);
1325 }
1326
1327 static inline void syn_flood_warning(struct sk_buff *skb)
1328 {
1329         static unsigned long warntime;
1330
1331         if (time_after(jiffies, (warntime + HZ * 60))) {
1332                 warntime = jiffies;
1333                 printk(KERN_INFO
1334                        "possible SYN flooding on port %d. Sending cookies.\n",
1335                        ntohs(skb->h.th->dest));
1336         }
1337 }
1338
1339 /*
1340  * Save and compile IPv4 options into the request_sock if needed.
1341  */
1342 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1343                                                      struct sk_buff *skb)
1344 {
1345         struct ip_options *opt = &(IPCB(skb)->opt);
1346         struct ip_options *dopt = NULL;
1347
1348         if (opt && opt->optlen) {
1349                 int opt_size = optlength(opt);
1350                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1351                 if (dopt) {
1352                         if (ip_options_echo(dopt, skb)) {
1353                                 kfree(dopt);
1354                                 dopt = NULL;
1355                         }
1356                 }
1357         }
1358         return dopt;
1359 }
1360
1361 struct request_sock_ops tcp_request_sock_ops = {
1362         .family         =       PF_INET,
1363         .obj_size       =       sizeof(struct tcp_request_sock),
1364         .rtx_syn_ack    =       tcp_v4_send_synack,
1365         .send_ack       =       tcp_v4_reqsk_send_ack,
1366         .destructor     =       tcp_v4_reqsk_destructor,
1367         .send_reset     =       tcp_v4_send_reset,
1368 };
1369
1370 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1371 {
1372         struct inet_request_sock *ireq;
1373         struct tcp_options_received tmp_opt;
1374         struct request_sock *req;
1375         __u32 saddr = skb->nh.iph->saddr;
1376         __u32 daddr = skb->nh.iph->daddr;
1377         __u32 isn = TCP_SKB_CB(skb)->when;
1378         struct dst_entry *dst = NULL;
1379 #ifdef CONFIG_SYN_COOKIES
1380         int want_cookie = 0;
1381 #else
1382 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1383 #endif
1384
1385         /* Never answer SYNs sent to broadcast or multicast */
1386         if (((struct rtable *)skb->dst)->rt_flags &
1387             (RTCF_BROADCAST | RTCF_MULTICAST))
1388                 goto drop;
1389
1390         /* TW buckets are converted to open requests without
1391          * limitations; they conserve resources and the peer is
1392          * evidently a real one.
1393          */
1394         if (tcp_synq_is_full(sk) && !isn) {
1395 #ifdef CONFIG_SYN_COOKIES
1396                 if (sysctl_tcp_syncookies) {
1397                         want_cookie = 1;
1398                 } else
1399 #endif
1400                 goto drop;
1401         }
1402
1403         /* Accept backlog is full. If we have already queued enough
1404          * warm entries in the syn queue, drop the request. It is better than
1405          * clogging the syn queue with openreqs whose timeouts increase
1406          * exponentially.
1407          */
1408         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1409                 goto drop;
1410
1411         req = reqsk_alloc(&tcp_request_sock_ops);
1412         if (!req)
1413                 goto drop;
1414
1415         tcp_clear_options(&tmp_opt);
1416         tmp_opt.mss_clamp = 536;
1417         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1418
1419         tcp_parse_options(skb, &tmp_opt, 0);
1420
1421         if (want_cookie) {
1422                 tcp_clear_options(&tmp_opt);
1423                 tmp_opt.saw_tstamp = 0;
1424         }
1425
1426         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1427                 /* Some OSes (unknown ones, but I see them on a web server that
1428                  * contains information interesting only for Windows
1429                  * users) do not send their stamp in the SYN. It is an easy case.
1430                  * We simply do not advertise TS support.
1431                  */
1432                 tmp_opt.saw_tstamp = 0;
1433                 tmp_opt.tstamp_ok  = 0;
1434         }
1435         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1436
1437         tcp_openreq_init(req, &tmp_opt, skb);
1438
1439         ireq = inet_rsk(req);
1440         ireq->loc_addr = daddr;
1441         ireq->rmt_addr = saddr;
1442         ireq->opt = tcp_v4_save_options(sk, skb);
1443         if (!want_cookie)
1444                 TCP_ECN_create_request(req, skb->h.th);
1445
1446         if (want_cookie) {
1447 #ifdef CONFIG_SYN_COOKIES
1448                 syn_flood_warning(skb);
1449 #endif
1450                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1451         } else if (!isn) {
1452                 struct inet_peer *peer = NULL;
1453
1454                 /* VJ's idea. We save the last timestamp seen
1455                  * from the destination in the peer table when entering
1456                  * TIME-WAIT state, and check against it before
1457                  * accepting a new connection request.
1458                  *
1459                  * If "isn" is not zero, this request hit a live
1460                  * timewait bucket, so all the necessary checks
1461                  * were made in the function processing the timewait state.
1462                  */
1463                 if (tmp_opt.saw_tstamp &&
1464                     sysctl_tcp_tw_recycle &&
1465                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1466                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1467                     peer->v4daddr == saddr) {
1468                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1469                             (s32)(peer->tcp_ts - req->ts_recent) >
1470                                                         TCP_PAWS_WINDOW) {
1471                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1472                                 dst_release(dst);
1473                                 goto drop_and_free;
1474                         }
1475                 }
1476                 /* Kill the following clause if you dislike this approach. */
1477                 else if (!sysctl_tcp_syncookies &&
1478                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1479                           (sysctl_max_syn_backlog >> 2)) &&
1480                          (!peer || !peer->tcp_ts_stamp) &&
1481                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1482                         /* Without syncookies, the last quarter of the
1483                          * backlog is reserved for destinations proven
1484                          * to be alive.
1485                          * That means we keep communicating with
1486                          * destinations already remembered at the moment
1487                          * the SYN flood started.
1488                          */
1489                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1490                                               "request from %u.%u."
1491                                               "%u.%u/%u\n",
1492                                               NIPQUAD(saddr),
1493                                               ntohs(skb->h.th->source)));
1494                         dst_release(dst);
1495                         goto drop_and_free;
1496                 }
1497
1498                 isn = tcp_v4_init_sequence(sk, skb);
1499         }
1500         tcp_rsk(req)->snt_isn = isn;
1501
1502         if (tcp_v4_send_synack(sk, req, dst))
1503                 goto drop_and_free;
1504
1505         if (want_cookie) {
1506                 reqsk_free(req);
1507         } else {
1508                 tcp_v4_synq_add(sk, req);
1509         }
1510         return 0;
1511
1512 drop_and_free:
1513         reqsk_free(req);
1514 drop:
1515         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1516         return 0;
1517 }
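
/*
 * Illustrative user-space sketch, not part of the kernel sources: the cookie
 * fallback above is only taken when sysctl_tcp_syncookies is set, and that
 * knob is normally toggled through /proc/sys/net/ipv4/tcp_syncookies (root
 * privileges and CONFIG_SYN_COOKIES assumed; error handling kept minimal).
 */
#include <stdio.h>

static int set_tcp_syncookies(int on)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

        if (!f)
                return -1;              /* no proc entry or no permission */
        fprintf(f, "%d\n", on ? 1 : 0); /* "1" enables the cookie fallback */
        return fclose(f);
}

int main(void)
{
        return set_tcp_syncookies(1) ? 1 : 0;
}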
1518
1519
1520 /*
1521  * The three-way handshake has completed - we got a valid ACK from the
1522  * peer - now create the new socket.
1523  */
1524 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1525                                   struct request_sock *req,
1526                                   struct dst_entry *dst)
1527 {
1528         struct inet_request_sock *ireq;
1529         struct inet_sock *newinet;
1530         struct tcp_sock *newtp;
1531         struct sock *newsk;
1532
1533         if (sk_acceptq_is_full(sk))
1534                 goto exit_overflow;
1535
1536         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1537                 goto exit;
1538
1539         newsk = tcp_create_openreq_child(sk, req, skb);
1540         if (!newsk)
1541                 goto exit;
1542
1543         sk_setup_caps(newsk, dst);
1544
1545         newtp                 = tcp_sk(newsk);
1546         newinet               = inet_sk(newsk);
1547         ireq                  = inet_rsk(req);
1548         newinet->daddr        = ireq->rmt_addr;
1549         newinet->rcv_saddr    = ireq->loc_addr;
1550         newinet->saddr        = ireq->loc_addr;
1551         newinet->opt          = ireq->opt;
1552         ireq->opt             = NULL;
1553         newinet->mc_index     = tcp_v4_iif(skb);
1554         newinet->mc_ttl       = skb->nh.iph->ttl;
1555         newtp->ext_header_len = 0;
1556         if (newinet->opt)
1557                 newtp->ext_header_len = newinet->opt->optlen;
1558         newinet->id = newtp->write_seq ^ jiffies;
1559
1560         tcp_sync_mss(newsk, dst_mtu(dst));
1561         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1562         tcp_initialize_rcv_mss(newsk);
1563
1564         __tcp_v4_hash(newsk, 0);
1565         __tcp_inherit_port(sk, newsk);
1566
1567         return newsk;
1568
1569 exit_overflow:
1570         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1571 exit:
1572         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1573         dst_release(dst);
1574         return NULL;
1575 }
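
/*
 * Illustrative user-space counterpart, not part of the kernel sources: the
 * function above is what ultimately manufactures the child socket that
 * accept(2) hands back once the handshake completes.  Port and backlog below
 * are arbitrary examples.
 */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int accept_one(void)
{
        struct sockaddr_in addr;
        int lfd, cfd;

        lfd = socket(AF_INET, SOCK_STREAM, 0);
        if (lfd < 0)
                return -1;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family      = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port        = htons(8080);        /* example port */

        if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(lfd, 128) < 0) {                /* example backlog */
                close(lfd);
                return -1;
        }

        /* Blocks until a connection has completed the three-way handshake;
         * the returned descriptor refers to the freshly created child socket.
         */
        cfd = accept(lfd, NULL, NULL);
        close(lfd);
        return cfd;
}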
1576
1577 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1578 {
1579         struct tcphdr *th = skb->h.th;
1580         struct iphdr *iph = skb->nh.iph;
1581         struct tcp_sock *tp = tcp_sk(sk);
1582         struct sock *nsk;
1583         struct request_sock **prev;
1584         /* Find possible connection requests. */
1585         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1586                                                      iph->saddr, iph->daddr);
1587         if (req)
1588                 return tcp_check_req(sk, skb, req, prev);
1589
1590         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1591                                           th->source,
1592                                           skb->nh.iph->daddr,
1593                                           ntohs(th->dest),
1594                                           tcp_v4_iif(skb));
1595
1596         if (nsk) {
1597                 if (nsk->sk_state != TCP_TIME_WAIT) {
1598                         bh_lock_sock(nsk);
1599                         return nsk;
1600                 }
1601                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1602                 return NULL;
1603         }
1604
1605 #ifdef CONFIG_SYN_COOKIES
1606         if (!th->rst && !th->syn && th->ack)
1607                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1608 #endif
1609         return sk;
1610 }
1611
1612 static int tcp_v4_checksum_init(struct sk_buff *skb)
1613 {
1614         if (skb->ip_summed == CHECKSUM_HW) {
1615                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1616                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1617                                   skb->nh.iph->daddr, skb->csum))
1618                         return 0;
1619
1620                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1621                 skb->ip_summed = CHECKSUM_NONE;
1622         }
1623         if (skb->len <= 76) {
1624                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1625                                  skb->nh.iph->daddr,
1626                                  skb_checksum(skb, 0, skb->len, 0)))
1627                         return -1;
1628                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1629         } else {
1630                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1631                                           skb->nh.iph->saddr,
1632                                           skb->nh.iph->daddr, 0);
1633         }
1634         return 0;
1635 }
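
/*
 * Illustrative user-space sketch, not part of the kernel sources: the 16-bit
 * one's-complement checksum that tcp_v4_check() verifies above, computed over
 * the IPv4 pseudo-header followed by the TCP segment.  Addresses are expected
 * in network byte order; a received segment (checksum field included)
 * verifies when the result is 0.  To compute a checksum for transmission,
 * zero the header's checksum field first and store the result as a
 * big-endian 16-bit value.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t sum_be16_words(const void *buf, size_t len, uint32_t sum)
{
        const uint8_t *p = buf;

        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];   /* big-endian 16-bit words */
                p += 2;
                len -= 2;
        }
        if (len)                                     /* odd trailing byte */
                sum += (uint32_t)p[0] << 8;
        return sum;
}

uint16_t tcp4_csum(uint32_t saddr, uint32_t daddr,
                   const void *segment, size_t seglen)
{
        uint8_t pseudo[12];
        uint32_t sum;

        memcpy(&pseudo[0], &saddr, 4);               /* source address      */
        memcpy(&pseudo[4], &daddr, 4);               /* destination address */
        pseudo[8]  = 0;                              /* zero padding        */
        pseudo[9]  = 6;                              /* IPPROTO_TCP         */
        pseudo[10] = seglen >> 8;                    /* TCP length, high    */
        pseudo[11] = seglen & 0xff;                  /* TCP length, low     */

        sum = sum_be16_words(pseudo, sizeof(pseudo), 0);
        sum = sum_be16_words(segment, seglen, sum);
        while (sum >> 16)                            /* fold the carries    */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}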
1636
1637
1638 /* The socket must have its spinlock held when we get
1639  * here.
1640  *
1641  * We have a potential double-lock case here, so even when
1642  * doing backlog processing we use the BH locking scheme.
1643  * This is because we cannot sleep with the original spinlock
1644  * held.
1645  */
1646 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1647 {
1648         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1649                 TCP_CHECK_TIMER(sk);
1650                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1651                         goto reset;
1652                 TCP_CHECK_TIMER(sk);
1653                 return 0;
1654         }
1655
1656         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1657                 goto csum_err;
1658
1659         if (sk->sk_state == TCP_LISTEN) {
1660                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1661                 if (!nsk)
1662                         goto discard;
1663
1664                 if (nsk != sk) {
1665                         if (tcp_child_process(sk, nsk, skb))
1666                                 goto reset;
1667                         return 0;
1668                 }
1669         }
1670
1671         TCP_CHECK_TIMER(sk);
1672         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1673                 goto reset;
1674         TCP_CHECK_TIMER(sk);
1675         return 0;
1676
1677 reset:
1678         tcp_v4_send_reset(skb);
1679 discard:
1680         kfree_skb(skb);
1681         /* Be careful here. If this function gets more complicated and
1682          * gcc suffers from register pressure on the x86, sk (in %ebx)
1683          * might be destroyed here. This current version compiles correctly,
1684          * but you have been warned.
1685          */
1686         return 0;
1687
1688 csum_err:
1689         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1690         goto discard;
1691 }
1692
1693 /*
1694  *      From tcp_input.c
1695  */
1696
1697 int tcp_v4_rcv(struct sk_buff *skb)
1698 {
1699         struct tcphdr *th;
1700         struct sock *sk;
1701         int ret;
1702
1703         if (skb->pkt_type != PACKET_HOST)
1704                 goto discard_it;
1705
1706         /* Count it even if it's bad */
1707         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1708
1709         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1710                 goto discard_it;
1711
1712         th = skb->h.th;
1713
1714         if (th->doff < sizeof(struct tcphdr) / 4)
1715                 goto bad_packet;
1716         if (!pskb_may_pull(skb, th->doff * 4))
1717                 goto discard_it;
1718
1719         /* An explanation is required here, I think.
1720          * Packet length and doff are validated by header prediction,
1721          * provided the case of th->doff == 0 is eliminated.
1722          * So, we defer the checks. */
1723         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1724              tcp_v4_checksum_init(skb) < 0))
1725                 goto bad_packet;
1726
1727         th = skb->h.th;
1728         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1729         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1730                                     skb->len - th->doff * 4);
1731         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1732         TCP_SKB_CB(skb)->when    = 0;
1733         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1734         TCP_SKB_CB(skb)->sacked  = 0;
1735
1736         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1737                              skb->nh.iph->daddr, ntohs(th->dest),
1738                              tcp_v4_iif(skb));
1739
1740         if (!sk)
1741                 goto no_tcp_socket;
1742
1743 process:
1744         if (sk->sk_state == TCP_TIME_WAIT)
1745                 goto do_time_wait;
1746
1747         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1748                 goto discard_and_relse;
1749
1750         if (sk_filter(sk, skb, 0))
1751                 goto discard_and_relse;
1752
1753         skb->dev = NULL;
1754
1755         bh_lock_sock(sk);
1756         ret = 0;
1757         if (!sock_owned_by_user(sk)) {
1758                 if (!tcp_prequeue(sk, skb))
1759                         ret = tcp_v4_do_rcv(sk, skb);
1760         } else
1761                 sk_add_backlog(sk, skb);
1762         bh_unlock_sock(sk);
1763
1764         sock_put(sk);
1765
1766         return ret;
1767
1768 no_tcp_socket:
1769         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1770                 goto discard_it;
1771
1772         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1773 bad_packet:
1774                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1775         } else {
1776                 tcp_v4_send_reset(skb);
1777         }
1778
1779 discard_it:
1780         /* Discard frame. */
1781         kfree_skb(skb);
1782         return 0;
1783
1784 discard_and_relse:
1785         sock_put(sk);
1786         goto discard_it;
1787
1788 do_time_wait:
1789         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1790                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1791                 goto discard_it;
1792         }
1793
1794         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1795                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1796                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1797                 goto discard_it;
1798         }
1799         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1800                                            skb, th, skb->len)) {
1801         case TCP_TW_SYN: {
1802                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1803                                                           ntohs(th->dest),
1804                                                           tcp_v4_iif(skb));
1805                 if (sk2) {
1806                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1807                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1808                         sk = sk2;
1809                         goto process;
1810                 }
1811                 /* Fall through to ACK */
1812         }
1813         case TCP_TW_ACK:
1814                 tcp_v4_timewait_ack(sk, skb);
1815                 break;
1816         case TCP_TW_RST:
1817                 goto no_tcp_socket;
1818         case TCP_TW_SUCCESS:;
1819         }
1820         goto discard_it;
1821 }
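
/*
 * Illustrative standalone sketch, not part of the kernel sources: the
 * end_seq arithmetic used when tcp_v4_rcv() fills in TCP_SKB_CB(skb) above.
 * SYN and FIN each consume one sequence number in addition to the payload.
 */
#include <stdint.h>

uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
                     uint32_t skb_len, uint32_t doff_words)
{
        uint32_t payload = skb_len - doff_words * 4;   /* data past the header */

        /* Sequence arithmetic is modulo 2^32, which uint32_t provides. */
        return seq + (syn ? 1 : 0) + (fin ? 1 : 0) + payload;
}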
1822
1823 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1824 {
1825         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1826         struct inet_sock *inet = inet_sk(sk);
1827
1828         sin->sin_family         = AF_INET;
1829         sin->sin_addr.s_addr    = inet->daddr;
1830         sin->sin_port           = inet->dport;
1831 }
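
/*
 * Illustrative user-space view of the helper above, not part of the kernel
 * sources: getpeername(2) on a connected TCP socket returns the same
 * daddr/dport pair, already in network byte order, in a struct sockaddr_in.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int print_peer(int fd)
{
        struct sockaddr_in sin;
        socklen_t len = sizeof(sin);
        char buf[INET_ADDRSTRLEN];

        if (getpeername(fd, (struct sockaddr *)&sin, &len) < 0)
                return -1;

        printf("peer %s:%u\n",
               inet_ntop(AF_INET, &sin.sin_addr, buf, sizeof(buf)),
               ntohs(sin.sin_port));
        return 0;
}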
1832
1833 /* VJ's idea. Save the last timestamp seen from this destination
1834  * and hold it for at least the normal timewait interval, to use for
1835  * duplicate-segment detection in subsequent connections before they enter
1836  * the synchronized state.
1837  */
1838
1839 int tcp_v4_remember_stamp(struct sock *sk)
1840 {
1841         struct inet_sock *inet = inet_sk(sk);
1842         struct tcp_sock *tp = tcp_sk(sk);
1843         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1844         struct inet_peer *peer = NULL;
1845         int release_it = 0;
1846
1847         if (!rt || rt->rt_dst != inet->daddr) {
1848                 peer = inet_getpeer(inet->daddr, 1);
1849                 release_it = 1;
1850         } else {
1851                 if (!rt->peer)
1852                         rt_bind_peer(rt, 1);
1853                 peer = rt->peer;
1854         }
1855
1856         if (peer) {
1857                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1858                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1859                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1860                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1861                         peer->tcp_ts = tp->rx_opt.ts_recent;
1862                 }
1863                 if (release_it)
1864                         inet_putpeer(peer);
1865                 return 1;
1866         }
1867
1868         return 0;
1869 }
1870
1871 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1872 {
1873         struct inet_peer *peer = NULL;
1874
1875         peer = inet_getpeer(tw->tw_daddr, 1);
1876
1877         if (peer) {
1878                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1879                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1880                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1881                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1882                         peer->tcp_ts = tw->tw_ts_recent;
1883                 }
1884                 inet_putpeer(peer);
1885                 return 1;
1886         }
1887
1888         return 0;
1889 }
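
/*
 * Illustrative standalone sketch, not part of the kernel sources: the
 * signed-difference trick used in the two helpers above and in the PAWS
 * check in tcp_v4_conn_request().  Casting the 32-bit difference to s32
 * compares timestamps correctly even across wraparound, as long as the two
 * values are less than 2^31 ticks apart.
 */
#include <assert.h>
#include <stdint.h>

static int ts_before_or_equal(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) <= 0;   /* "a is not newer than b" */
}

int main(void)
{
        assert(ts_before_or_equal(100, 200));             /* plain case */
        assert(!ts_before_or_equal(200, 100));
        /* 0xfffffff0 is "older" than 0x00000010 once the counter wraps. */
        assert(ts_before_or_equal(0xfffffff0u, 0x00000010u));
        return 0;
}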
1890
1891 struct tcp_func ipv4_specific = {
1892         .queue_xmit     =       ip_queue_xmit,
1893         .send_check     =       tcp_v4_send_check,
1894         .rebuild_header =       inet_sk_rebuild_header,
1895         .conn_request   =       tcp_v4_conn_request,
1896         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1897         .remember_stamp =       tcp_v4_remember_stamp,
1898         .net_header_len =       sizeof(struct iphdr),
1899         .setsockopt     =       ip_setsockopt,
1900         .getsockopt     =       ip_getsockopt,
1901         .addr2sockaddr  =       v4_addr2sockaddr,
1902         .sockaddr_len   =       sizeof(struct sockaddr_in),
1903 };
1904
1905 /* NOTE: A lot of fields are set to zero explicitly by the call to
1906  *       sk_alloc(), so they need not be initialized here.
1907  */
1908 static int tcp_v4_init_sock(struct sock *sk)
1909 {
1910         struct tcp_sock *tp = tcp_sk(sk);
1911
1912         skb_queue_head_init(&tp->out_of_order_queue);
1913         tcp_init_xmit_timers(sk);
1914         tcp_prequeue_init(tp);
1915
1916         tp->rto  = TCP_TIMEOUT_INIT;
1917         tp->mdev = TCP_TIMEOUT_INIT;
1918
1919         /* So many TCP implementations out there (incorrectly) count the
1920          * initial SYN frame in their delayed-ACK and congestion control
1921          * algorithms that we must have the following bandaid to talk
1922          * efficiently to them.  -DaveM
1923          */
1924         tp->snd_cwnd = 2;
1925
1926         /* See draft-stevens-tcpca-spec-01 for discussion of the
1927          * initialization of these values.
1928          */
1929         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1930         tp->snd_cwnd_clamp = ~0;
1931         tp->mss_cache = 536;
1932
1933         tp->reordering = sysctl_tcp_reordering;
1934         tp->ca_ops = &tcp_init_congestion_ops;
1935
1936         sk->sk_state = TCP_CLOSE;
1937
1938         sk->sk_write_space = sk_stream_write_space;
1939         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1940
1941         tp->af_specific = &ipv4_specific;
1942
1943         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1944         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1945
1946         atomic_inc(&tcp_sockets_allocated);
1947
1948         return 0;
1949 }
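
/*
 * Illustrative user-space sketch, not part of the kernel sources: peeking at
 * the MSS corresponding to the mss_cache default initialized above.  On an
 * unconnected socket, getsockopt(TCP_MAXSEG) typically reports the protocol
 * default (536 here); the exact value is kernel-dependent.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int show_default_mss(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int mss = 0;
        socklen_t len = sizeof(mss);

        if (fd < 0)
                return -1;
        if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len) == 0)
                printf("default mss: %d\n", mss);
        close(fd);
        return 0;
}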
1950
1951 int tcp_v4_destroy_sock(struct sock *sk)
1952 {
1953         struct tcp_sock *tp = tcp_sk(sk);
1954
1955         tcp_clear_xmit_timers(sk);
1956
1957         tcp_cleanup_congestion_control(tp);
1958
1959         /* Clean up the write buffer. */
1960         sk_stream_writequeue_purge(sk);
1961
1962         /* Clean up our, hopefully empty, out_of_order_queue. */
1963         __skb_queue_purge(&tp->out_of_order_queue);
1964
1965         /* Clean the prequeue; it really must be empty. */
1966         __skb_queue_purge(&tp->ucopy.prequeue);
1967
1968         /* Clean up a referenced TCP bind bucket. */
1969         if (tp->bind_hash)
1970                 tcp_put_port(sk);
1971
1972         /*
1973          * If sendmsg cached page exists, toss it.
1974          */
1975         if (sk->sk_sndmsg_page) {
1976                 __free_page(sk->sk_sndmsg_page);
1977                 sk->sk_sndmsg_page = NULL;
1978         }
1979
1980         atomic_dec(&tcp_sockets_allocated);
1981
1982         return 0;
1983 }
1984
1985 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1986
1987 #ifdef CONFIG_PROC_FS
1988 /* Proc filesystem TCP sock list dumping. */
1989
1990 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1991 {
1992         return hlist_empty(head) ? NULL :
1993                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1994 }
1995
1996 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1997 {
1998         return tw->tw_node.next ?
1999                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2000 }
2001
2002 static void *listening_get_next(struct seq_file *seq, void *cur)
2003 {
2004         struct tcp_sock *tp;
2005         struct hlist_node *node;
2006         struct sock *sk = cur;
2007         struct tcp_iter_state* st = seq->private;
2008
2009         if (!sk) {
2010                 st->bucket = 0;
2011                 sk = sk_head(&tcp_listening_hash[0]);
2012                 goto get_sk;
2013         }
2014
2015         ++st->num;
2016
2017         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2018                 struct request_sock *req = cur;
2019
2020                 tp = tcp_sk(st->syn_wait_sk);
2021                 req = req->dl_next;
2022                 while (1) {
2023                         while (req) {
2024                                 if (req->rsk_ops->family == st->family) {
2025                                         cur = req;
2026                                         goto out;
2027                                 }
2028                                 req = req->dl_next;
2029                         }
2030                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2031                                 break;
2032 get_req:
2033                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2034                 }
2035                 sk        = sk_next(st->syn_wait_sk);
2036                 st->state = TCP_SEQ_STATE_LISTENING;
2037                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2038         } else {
2039                 tp = tcp_sk(sk);
2040                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2041                 if (reqsk_queue_len(&tp->accept_queue))
2042                         goto start_req;
2043                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2044                 sk = sk_next(sk);
2045         }
2046 get_sk:
2047         sk_for_each_from(sk, node) {
2048                 if (sk->sk_family == st->family) {
2049                         cur = sk;
2050                         goto out;
2051                 }
2052                 tp = tcp_sk(sk);
2053                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2054                 if (reqsk_queue_len(&tp->accept_queue)) {
2055 start_req:
2056                         st->uid         = sock_i_uid(sk);
2057                         st->syn_wait_sk = sk;
2058                         st->state       = TCP_SEQ_STATE_OPENREQ;
2059                         st->sbucket     = 0;
2060                         goto get_req;
2061                 }
2062                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2063         }
2064         if (++st->bucket < INET_LHTABLE_SIZE) {
2065                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2066                 goto get_sk;
2067         }
2068         cur = NULL;
2069 out:
2070         return cur;
2071 }
2072
2073 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2074 {
2075         void *rc = listening_get_next(seq, NULL);
2076
2077         while (rc && *pos) {
2078                 rc = listening_get_next(seq, rc);
2079                 --*pos;
2080         }
2081         return rc;
2082 }
2083
2084 static void *established_get_first(struct seq_file *seq)
2085 {
2086         struct tcp_iter_state* st = seq->private;
2087         void *rc = NULL;
2088
2089         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2090                 struct sock *sk;
2091                 struct hlist_node *node;
2092                 struct tcp_tw_bucket *tw;
2093
2094                 /* We can reschedule _before_ having picked the target: */
2095                 cond_resched_softirq();
2096
2097                 read_lock(&tcp_ehash[st->bucket].lock);
2098                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2099                         if (sk->sk_family != st->family) {
2100                                 continue;
2101                         }
2102                         rc = sk;
2103                         goto out;
2104                 }
2105                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2106                 tw_for_each(tw, node,
2107                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2108                         if (tw->tw_family != st->family) {
2109                                 continue;
2110                         }
2111                         rc = tw;
2112                         goto out;
2113                 }
2114                 read_unlock(&tcp_ehash[st->bucket].lock);
2115                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2116         }
2117 out:
2118         return rc;
2119 }
2120
2121 static void *established_get_next(struct seq_file *seq, void *cur)
2122 {
2123         struct sock *sk = cur;
2124         struct tcp_tw_bucket *tw;
2125         struct hlist_node *node;
2126         struct tcp_iter_state* st = seq->private;
2127
2128         ++st->num;
2129
2130         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2131                 tw = cur;
2132                 tw = tw_next(tw);
2133 get_tw:
2134                 while (tw && tw->tw_family != st->family) {
2135                         tw = tw_next(tw);
2136                 }
2137                 if (tw) {
2138                         cur = tw;
2139                         goto out;
2140                 }
2141                 read_unlock(&tcp_ehash[st->bucket].lock);
2142                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2143
2144                 /* We can reschedule between buckets: */
2145                 cond_resched_softirq();
2146
2147                 if (++st->bucket < tcp_ehash_size) {
2148                         read_lock(&tcp_ehash[st->bucket].lock);
2149                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2150                 } else {
2151                         cur = NULL;
2152                         goto out;
2153                 }
2154         } else
2155                 sk = sk_next(sk);
2156
2157         sk_for_each_from(sk, node) {
2158                 if (sk->sk_family == st->family)
2159                         goto found;
2160         }
2161
2162         st->state = TCP_SEQ_STATE_TIME_WAIT;
2163         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2164         goto get_tw;
2165 found:
2166         cur = sk;
2167 out:
2168         return cur;
2169 }
2170
2171 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2172 {
2173         void *rc = established_get_first(seq);
2174
2175         while (rc && pos) {
2176                 rc = established_get_next(seq, rc);
2177                 --pos;
2178         }
2179         return rc;
2180 }
2181
2182 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2183 {
2184         void *rc;
2185         struct tcp_iter_state* st = seq->private;
2186
2187         tcp_listen_lock();
2188         st->state = TCP_SEQ_STATE_LISTENING;
2189         rc        = listening_get_idx(seq, &pos);
2190
2191         if (!rc) {
2192                 tcp_listen_unlock();
2193                 local_bh_disable();
2194                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2195                 rc        = established_get_idx(seq, pos);
2196         }
2197
2198         return rc;
2199 }
2200
2201 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2202 {
2203         struct tcp_iter_state* st = seq->private;
2204         st->state = TCP_SEQ_STATE_LISTENING;
2205         st->num = 0;
2206         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2207 }
2208
2209 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2210 {
2211         void *rc = NULL;
2212         struct tcp_iter_state* st;
2213
2214         if (v == SEQ_START_TOKEN) {
2215                 rc = tcp_get_idx(seq, 0);
2216                 goto out;
2217         }
2218         st = seq->private;
2219
2220         switch (st->state) {
2221         case TCP_SEQ_STATE_OPENREQ:
2222         case TCP_SEQ_STATE_LISTENING:
2223                 rc = listening_get_next(seq, v);
2224                 if (!rc) {
2225                         tcp_listen_unlock();
2226                         local_bh_disable();
2227                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2228                         rc        = established_get_first(seq);
2229                 }
2230                 break;
2231         case TCP_SEQ_STATE_ESTABLISHED:
2232         case TCP_SEQ_STATE_TIME_WAIT:
2233                 rc = established_get_next(seq, v);
2234                 break;
2235         }
2236 out:
2237         ++*pos;
2238         return rc;
2239 }
2240
2241 static void tcp_seq_stop(struct seq_file *seq, void *v)
2242 {
2243         struct tcp_iter_state* st = seq->private;
2244
2245         switch (st->state) {
2246         case TCP_SEQ_STATE_OPENREQ:
2247                 if (v) {
2248                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2249                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2250                 }
2251         case TCP_SEQ_STATE_LISTENING:
2252                 if (v != SEQ_START_TOKEN)
2253                         tcp_listen_unlock();
2254                 break;
2255         case TCP_SEQ_STATE_TIME_WAIT:
2256         case TCP_SEQ_STATE_ESTABLISHED:
2257                 if (v)
2258                         read_unlock(&tcp_ehash[st->bucket].lock);
2259                 local_bh_enable();
2260                 break;
2261         }
2262 }
2263
2264 static int tcp_seq_open(struct inode *inode, struct file *file)
2265 {
2266         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2267         struct seq_file *seq;
2268         struct tcp_iter_state *s;
2269         int rc;
2270
2271         if (unlikely(afinfo == NULL))
2272                 return -EINVAL;
2273
2274         s = kmalloc(sizeof(*s), GFP_KERNEL);
2275         if (!s)
2276                 return -ENOMEM;
2277         memset(s, 0, sizeof(*s));
2278         s->family               = afinfo->family;
2279         s->seq_ops.start        = tcp_seq_start;
2280         s->seq_ops.next         = tcp_seq_next;
2281         s->seq_ops.show         = afinfo->seq_show;
2282         s->seq_ops.stop         = tcp_seq_stop;
2283
2284         rc = seq_open(file, &s->seq_ops);
2285         if (rc)
2286                 goto out_kfree;
2287         seq          = file->private_data;
2288         seq->private = s;
2289 out:
2290         return rc;
2291 out_kfree:
2292         kfree(s);
2293         goto out;
2294 }
2295
2296 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2297 {
2298         int rc = 0;
2299         struct proc_dir_entry *p;
2300
2301         if (!afinfo)
2302                 return -EINVAL;
2303         afinfo->seq_fops->owner         = afinfo->owner;
2304         afinfo->seq_fops->open          = tcp_seq_open;
2305         afinfo->seq_fops->read          = seq_read;
2306         afinfo->seq_fops->llseek        = seq_lseek;
2307         afinfo->seq_fops->release       = seq_release_private;
2308
2309         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2310         if (p)
2311                 p->data = afinfo;
2312         else
2313                 rc = -ENOMEM;
2314         return rc;
2315 }
2316
2317 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2318 {
2319         if (!afinfo)
2320                 return;
2321         proc_net_remove(afinfo->name);
2322         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2323 }
2324
2325 static void get_openreq4(struct sock *sk, struct request_sock *req,
2326                          char *tmpbuf, int i, int uid)
2327 {
2328         const struct inet_request_sock *ireq = inet_rsk(req);
2329         int ttd = req->expires - jiffies;
2330
2331         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2332                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2333                 i,
2334                 ireq->loc_addr,
2335                 ntohs(inet_sk(sk)->sport),
2336                 ireq->rmt_addr,
2337                 ntohs(ireq->rmt_port),
2338                 TCP_SYN_RECV,
2339                 0, 0, /* could print option size, but that is af dependent. */
2340                 1,    /* timers active (only the expire timer) */
2341                 jiffies_to_clock_t(ttd),
2342                 req->retrans,
2343                 uid,
2344                 0,  /* non standard timer */
2345                 0, /* open_requests have no inode */
2346                 atomic_read(&sk->sk_refcnt),
2347                 req);
2348 }
2349
2350 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2351 {
2352         int timer_active;
2353         unsigned long timer_expires;
2354         struct tcp_sock *tp = tcp_sk(sp);
2355         struct inet_sock *inet = inet_sk(sp);
2356         unsigned int dest = inet->daddr;
2357         unsigned int src = inet->rcv_saddr;
2358         __u16 destp = ntohs(inet->dport);
2359         __u16 srcp = ntohs(inet->sport);
2360
2361         if (tp->pending == TCP_TIME_RETRANS) {
2362                 timer_active    = 1;
2363                 timer_expires   = tp->timeout;
2364         } else if (tp->pending == TCP_TIME_PROBE0) {
2365                 timer_active    = 4;
2366                 timer_expires   = tp->timeout;
2367         } else if (timer_pending(&sp->sk_timer)) {
2368                 timer_active    = 2;
2369                 timer_expires   = sp->sk_timer.expires;
2370         } else {
2371                 timer_active    = 0;
2372                 timer_expires = jiffies;
2373         }
2374
2375         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2376                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2377                 i, src, srcp, dest, destp, sp->sk_state,
2378                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2379                 timer_active,
2380                 jiffies_to_clock_t(timer_expires - jiffies),
2381                 tp->retransmits,
2382                 sock_i_uid(sp),
2383                 tp->probes_out,
2384                 sock_i_ino(sp),
2385                 atomic_read(&sp->sk_refcnt), sp,
2386                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2387                 tp->snd_cwnd,
2388                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2389 }
2390
2391 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2392 {
2393         unsigned int dest, src;
2394         __u16 destp, srcp;
2395         int ttd = tw->tw_ttd - jiffies;
2396
2397         if (ttd < 0)
2398                 ttd = 0;
2399
2400         dest  = tw->tw_daddr;
2401         src   = tw->tw_rcv_saddr;
2402         destp = ntohs(tw->tw_dport);
2403         srcp  = ntohs(tw->tw_sport);
2404
2405         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2406                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2407                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2408                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2409                 atomic_read(&tw->tw_refcnt), tw);
2410 }
2411
2412 #define TMPSZ 150
2413
2414 static int tcp4_seq_show(struct seq_file *seq, void *v)
2415 {
2416         struct tcp_iter_state* st;
2417         char tmpbuf[TMPSZ + 1];
2418
2419         if (v == SEQ_START_TOKEN) {
2420                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2421                            "  sl  local_address rem_address   st tx_queue "
2422                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2423                            "inode");
2424                 goto out;
2425         }
2426         st = seq->private;
2427
2428         switch (st->state) {
2429         case TCP_SEQ_STATE_LISTENING:
2430         case TCP_SEQ_STATE_ESTABLISHED:
2431                 get_tcp4_sock(v, tmpbuf, st->num);
2432                 break;
2433         case TCP_SEQ_STATE_OPENREQ:
2434                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2435                 break;
2436         case TCP_SEQ_STATE_TIME_WAIT:
2437                 get_timewait4_sock(v, tmpbuf, st->num);
2438                 break;
2439         }
2440         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2441 out:
2442         return 0;
2443 }
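
/*
 * Illustrative user-space sketch, not part of the kernel sources: a reader
 * for the /proc/net/tcp format emitted by get_tcp4_sock() above.  Only the
 * endpoint columns and the state byte are parsed.  Addresses are printed by
 * the kernel as raw 32-bit values (network byte order in memory), so on
 * little-endian hosts the hex digits appear byte-swapped relative to
 * dotted-quad notation; ports are already converted with ntohs().
 */
#include <stdio.h>

int dump_proc_net_tcp(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return -1;

        if (!fgets(line, sizeof(line), f)) {    /* skip the header line */
                fclose(f);
                return -1;
        }
        while (fgets(line, sizeof(line), f)) {
                unsigned int laddr, raddr, lport, rport, state;

                if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
                           &laddr, &lport, &raddr, &rport, &state) != 5)
                        continue;
                printf("%08X:%u -> %08X:%u state %u\n",
                       laddr, lport, raddr, rport, state);
        }
        fclose(f);
        return 0;
}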
2444
2445 static struct file_operations tcp4_seq_fops;
2446 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2447         .owner          = THIS_MODULE,
2448         .name           = "tcp",
2449         .family         = AF_INET,
2450         .seq_show       = tcp4_seq_show,
2451         .seq_fops       = &tcp4_seq_fops,
2452 };
2453
2454 int __init tcp4_proc_init(void)
2455 {
2456         return tcp_proc_register(&tcp4_seq_afinfo);
2457 }
2458
2459 void tcp4_proc_exit(void)
2460 {
2461         tcp_proc_unregister(&tcp4_seq_afinfo);
2462 }
2463 #endif /* CONFIG_PROC_FS */
2464
2465 struct proto tcp_prot = {
2466         .name                   = "TCP",
2467         .owner                  = THIS_MODULE,
2468         .close                  = tcp_close,
2469         .connect                = tcp_v4_connect,
2470         .disconnect             = tcp_disconnect,
2471         .accept                 = tcp_accept,
2472         .ioctl                  = tcp_ioctl,
2473         .init                   = tcp_v4_init_sock,
2474         .destroy                = tcp_v4_destroy_sock,
2475         .shutdown               = tcp_shutdown,
2476         .setsockopt             = tcp_setsockopt,
2477         .getsockopt             = tcp_getsockopt,
2478         .sendmsg                = tcp_sendmsg,
2479         .recvmsg                = tcp_recvmsg,
2480         .backlog_rcv            = tcp_v4_do_rcv,
2481         .hash                   = tcp_v4_hash,
2482         .unhash                 = tcp_unhash,
2483         .get_port               = tcp_v4_get_port,
2484         .enter_memory_pressure  = tcp_enter_memory_pressure,
2485         .sockets_allocated      = &tcp_sockets_allocated,
2486         .memory_allocated       = &tcp_memory_allocated,
2487         .memory_pressure        = &tcp_memory_pressure,
2488         .sysctl_mem             = sysctl_tcp_mem,
2489         .sysctl_wmem            = sysctl_tcp_wmem,
2490         .sysctl_rmem            = sysctl_tcp_rmem,
2491         .max_header             = MAX_TCP_HEADER,
2492         .obj_size               = sizeof(struct tcp_sock),
2493         .rsk_prot               = &tcp_request_sock_ops,
2494 };
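
/*
 * Illustrative user-space sketch, not part of the kernel sources: the BSD
 * socket calls that end up dispatching through tcp_prot above - socket(2)
 * selects the proto, connect(2) reaches .connect (tcp_v4_connect), and
 * close(2) reaches .close (tcp_close).  Address and port are arbitrary
 * examples (192.0.2.1 is a TEST-NET address).
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int tcp_connect_example(void)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

        if (fd < 0)
                return -1;

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port   = htons(80);                     /* example port    */
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr); /* example address */

        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
                close(fd);
                return -1;
        }
        close(fd);
        return 0;
}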
2495
2496
2497
2498 void __init tcp_v4_init(struct net_proto_family *ops)
2499 {
2500         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2501         if (err < 0)
2502                 panic("Failed to create the TCP control socket.\n");
2503         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2504         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2505
2506         /* Unhash it so that IP input processing does not even
2507          * see it; we do not wish this socket to receive incoming
2508          * packets.
2509          */
2510         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2511 }
2512
2513 EXPORT_SYMBOL(ipv4_specific);
2514 EXPORT_SYMBOL(tcp_bind_hash);
2515 EXPORT_SYMBOL(inet_bind_bucket_create);
2516 EXPORT_SYMBOL(tcp_hashinfo);
2517 EXPORT_SYMBOL(tcp_inherit_port);
2518 EXPORT_SYMBOL(tcp_listen_wlock);
2519 EXPORT_SYMBOL(tcp_port_rover);
2520 EXPORT_SYMBOL(tcp_prot);
2521 EXPORT_SYMBOL(tcp_put_port);
2522 EXPORT_SYMBOL(tcp_unhash);
2523 EXPORT_SYMBOL(tcp_v4_conn_request);
2524 EXPORT_SYMBOL(tcp_v4_connect);
2525 EXPORT_SYMBOL(tcp_v4_do_rcv);
2526 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2527 EXPORT_SYMBOL(tcp_v4_send_check);
2528 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2529
2530 #ifdef CONFIG_PROC_FS
2531 EXPORT_SYMBOL(tcp_proc_register);
2532 EXPORT_SYMBOL(tcp_proc_unregister);
2533 #endif
2534 EXPORT_SYMBOL(sysctl_local_port_range);
2535 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2536 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2537