2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind to
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
105 int sysctl_local_port_range[2] = { 1024, 4999 };
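/* This range is what the net.ipv4.ip_local_port_range sysctl adjusts;
 * for example (assuming the usual sysctl wiring of this array):
 *
 *   echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */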
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
109 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
114 sk_for_each_bound(sk2, node, &tb->owners) {
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
132 /* Obtain a reference to a local port for the given sock;
133 * if snum is zero, select any available local port.
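 * Roughly: when snum is zero we walk the bind hash starting at
 * port_rover looking for a port that nobody owns yet; for an explicit
 * snum we take (or create) that port's bucket and run
 * tcp_bind_conflict() against its current owners before sharing it.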
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
153 rover = tcp_hashinfo.port_rover;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
177 if (unlikely(remaining <= 0))
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its lock.
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
202 if (tcp_bind_conflict(sk, tb))
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
225 spin_unlock(&head->lock);
231 /* Sleeping here without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP.
232 * Look: when several writers sleep and the reader wakes them up, all but one
233 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
234 * this, _but_ remember that it adds useless work on UP machines (a wakeup on each
235 * exclusive lock release). It should really be ifdef'ed.
238 void tcp_listen_wlock(void)
240 write_lock(&tcp_hashinfo.lhash_lock);
242 if (atomic_read(&tcp_hashinfo.lhash_users)) {
246 prepare_to_wait_exclusive(&tcp_hashinfo.lhash_wait,
247 &wait, TASK_UNINTERRUPTIBLE);
248 if (!atomic_read(&tcp_hashinfo.lhash_users))
250 write_unlock_bh(&tcp_hashinfo.lhash_lock);
252 write_lock_bh(&tcp_hashinfo.lhash_lock);
255 finish_wait(&tcp_hashinfo.lhash_wait, &wait);
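/* __tcp_v4_hash() adds an unhashed socket either to the listening hash
 * (when listen_possible and the socket is in TCP_LISTEN) or to the
 * established hash chain selected by inet_sk_ehashfn(); waiters on the
 * listening lock are woken afterwards.
 */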
259 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
261 struct hlist_head *list;
264 BUG_TRAP(sk_unhashed(sk));
265 if (listen_possible && sk->sk_state == TCP_LISTEN) {
266 list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
267 lock = &tcp_hashinfo.lhash_lock;
270 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_hashinfo.ehash_size);
271 list = &tcp_hashinfo.ehash[sk->sk_hashent].chain;
272 lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock;
275 __sk_add_node(sk, list);
276 sock_prot_inc_use(sk->sk_prot);
278 if (listen_possible && sk->sk_state == TCP_LISTEN)
279 wake_up(&tcp_hashinfo.lhash_wait);
282 static void tcp_v4_hash(struct sock *sk)
284 if (sk->sk_state != TCP_CLOSE) {
286 __tcp_v4_hash(sk, 1);
291 void tcp_unhash(struct sock *sk)
298 if (sk->sk_state == TCP_LISTEN) {
301 lock = &tcp_hashinfo.lhash_lock;
303 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent];
305 write_lock_bh(&head->lock);
308 if (__sk_del_node_init(sk))
309 sock_prot_dec_use(sk->sk_prot);
310 write_unlock_bh(lock);
313 if (sk->sk_state == TCP_LISTEN)
314 wake_up(&tcp_hashinfo.lhash_wait);
317 /* Don't inline this cruft. There are some nice properties to
318 * exploit here. The BSD API does not allow a listening TCP
319 * to specify the remote port or the remote address for the
320 * connection, so always assume both are wildcarded
321 * during the search since they can never be otherwise.
323 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
325 const unsigned short hnum,
328 struct sock *result = NULL, *sk;
329 struct hlist_node *node;
333 sk_for_each(sk, node, head) {
334 struct inet_sock *inet = inet_sk(sk);
336 if (inet->num == hnum && !ipv6_only_sock(sk)) {
337 __u32 rcv_saddr = inet->rcv_saddr;
339 score = (sk->sk_family == PF_INET ? 1 : 0);
341 if (rcv_saddr != daddr)
345 if (sk->sk_bound_dev_if) {
346 if (sk->sk_bound_dev_if != dif)
352 if (score > hiscore) {
361 /* Optimize the common listener case. */
362 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
363 const unsigned short hnum,
366 struct sock *sk = NULL;
367 struct hlist_head *head;
369 read_lock(&tcp_hashinfo.lhash_lock);
370 head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
371 if (!hlist_empty(head)) {
372 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
374 if (inet->num == hnum && !sk->sk_node.next &&
375 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
376 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
377 !sk->sk_bound_dev_if)
379 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
385 read_unlock(&tcp_hashinfo.lhash_lock);
389 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
390 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
392 * Local BH must be disabled here.
395 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
401 struct inet_ehash_bucket *head;
402 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
403 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
405 struct hlist_node *node;
406 /* Optimize here for a direct hit; only listening connections can
407 * have wildcards anyway.
409 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
410 head = &tcp_hashinfo.ehash[hash];
411 read_lock(&head->lock);
412 sk_for_each(sk, node, &head->chain) {
413 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
414 goto hit; /* You sunk my battleship! */
417 /* Must check for a TIME_WAIT'er before going to listener hash. */
418 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
419 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
424 read_unlock(&head->lock);
431 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
432 u32 daddr, u16 hnum, int dif)
434 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
437 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
440 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
446 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
452 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
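/* tcp_v4_init_sequence() below derives the initial sequence number for
 * an incoming connection from the addresses and ports of the segment
 * via secure_tcp_sequence_number(), in the spirit of RFC 1948, so ISNs
 * are hard for an attacker to predict.
 */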
454 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
456 return secure_tcp_sequence_number(skb->nh.iph->daddr,
462 /* called with local bh disabled */
463 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
464 struct tcp_tw_bucket **twp)
466 struct inet_sock *inet = inet_sk(sk);
467 u32 daddr = inet->rcv_saddr;
468 u32 saddr = inet->daddr;
469 int dif = sk->sk_bound_dev_if;
470 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
471 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
472 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
473 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
475 struct hlist_node *node;
476 struct tcp_tw_bucket *tw;
478 write_lock(&head->lock);
480 /* Check TIME-WAIT sockets first. */
481 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
482 tw = (struct tcp_tw_bucket *)sk2;
484 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
485 struct tcp_sock *tp = tcp_sk(sk);
487 /* With PAWS, it is safe from the viewpoint
488 of data integrity. Even without PAWS it
489 is safe provided the sequence spaces do not
490 overlap, i.e. at data rates <= 80 Mbit/sec.
492 Actually, the idea is close to VJ's,
493 only the timestamp cache is held not per host
494 but per port pair, and the TW bucket is used
497 If the TW bucket has already been destroyed, we
498 fall back to VJ's scheme and use the initial
499 timestamp retrieved from the peer table.
501 if (tw->tw_ts_recent_stamp &&
502 (!twp || (sysctl_tcp_tw_reuse &&
504 tw->tw_ts_recent_stamp > 1))) {
506 tw->tw_snd_nxt + 65535 + 2) == 0)
508 tp->rx_opt.ts_recent = tw->tw_ts_recent;
509 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
518 /* And established part... */
519 sk_for_each(sk2, node, &head->chain) {
520 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
525 /* Must record num and sport now. Otherwise we will see
526 * a socket with a funny identity in the hash table. */
528 inet->sport = htons(lport);
529 sk->sk_hashent = hash;
530 BUG_TRAP(sk_unhashed(sk));
531 __sk_add_node(sk, &head->chain);
532 sock_prot_inc_use(sk->sk_prot);
533 write_unlock(&head->lock);
537 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
539 /* Silly. Should hash-dance instead... */
540 tcp_tw_deschedule(tw);
541 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
549 write_unlock(&head->lock);
550 return -EADDRNOTAVAIL;
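/* connect_port_offset() below hashes the socket's local and remote
 * addresses (and, per the usual secure_tcp_port_ephemeral() signature,
 * the remote port) into a starting offset for ephemeral port selection,
 * so different destinations probe the port range in different,
 * hard-to-predict orders.
 */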
553 static inline u32 connect_port_offset(const struct sock *sk)
555 const struct inet_sock *inet = inet_sk(sk);
557 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
562 * Bind a port for a connect operation and hash it.
564 static inline int tcp_v4_hash_connect(struct sock *sk)
566 const unsigned short snum = inet_sk(sk)->num;
567 struct inet_bind_hashbucket *head;
568 struct inet_bind_bucket *tb;
572 int low = sysctl_local_port_range[0];
573 int high = sysctl_local_port_range[1];
574 int range = high - low;
578 u32 offset = hint + connect_port_offset(sk);
579 struct hlist_node *node;
580 struct tcp_tw_bucket *tw = NULL;
583 for (i = 1; i <= range; i++) {
584 port = low + (i + offset) % range;
585 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
586 spin_lock(&head->lock);
588 /* Does not bother with rcv_saddr checks,
589 * because the established check is already unique enough.
592 inet_bind_bucket_for_each(tb, node, &head->chain) {
593 if (tb->port == port) {
594 BUG_TRAP(!hlist_empty(&tb->owners));
595 if (tb->fastreuse >= 0)
597 if (!__tcp_v4_check_established(sk,
605 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
607 spin_unlock(&head->lock);
614 spin_unlock(&head->lock);
618 return -EADDRNOTAVAIL;
623 /* Head lock still held and bh's disabled */
624 inet_bind_hash(sk, tb, port);
625 if (sk_unhashed(sk)) {
626 inet_sk(sk)->sport = htons(port);
627 __tcp_v4_hash(sk, 0);
629 spin_unlock(&head->lock);
632 tcp_tw_deschedule(tw);
640 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
641 tb = inet_sk(sk)->bind_hash;
642 spin_lock_bh(&head->lock);
643 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
644 __tcp_v4_hash(sk, 0);
645 spin_unlock_bh(&head->lock);
648 spin_unlock(&head->lock);
649 /* No definite answer... Walk the established hash table */
650 ret = __tcp_v4_check_established(sk, snum, NULL);
657 /* This will initiate an outgoing connection. */
658 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
660 struct inet_sock *inet = inet_sk(sk);
661 struct tcp_sock *tp = tcp_sk(sk);
662 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
668 if (addr_len < sizeof(struct sockaddr_in))
671 if (usin->sin_family != AF_INET)
672 return -EAFNOSUPPORT;
674 nexthop = daddr = usin->sin_addr.s_addr;
675 if (inet->opt && inet->opt->srr) {
678 nexthop = inet->opt->faddr;
681 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
682 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
684 inet->sport, usin->sin_port, sk);
688 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
693 if (!inet->opt || !inet->opt->srr)
697 inet->saddr = rt->rt_src;
698 inet->rcv_saddr = inet->saddr;
700 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
701 /* Reset inherited state */
702 tp->rx_opt.ts_recent = 0;
703 tp->rx_opt.ts_recent_stamp = 0;
707 if (sysctl_tcp_tw_recycle &&
708 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
709 struct inet_peer *peer = rt_get_peer(rt);
711 /* VJ's idea. We save the last timestamp seen from
712 * the destination in the peer table when entering state TIME-WAIT,
713 * and initialize rx_opt.ts_recent from it when trying a new connection.
716 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
717 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
718 tp->rx_opt.ts_recent = peer->tcp_ts;
722 inet->dport = usin->sin_port;
725 tp->ext_header_len = 0;
727 tp->ext_header_len = inet->opt->optlen;
729 tp->rx_opt.mss_clamp = 536;
731 /* Socket identity is still unknown (sport may be zero).
732 * However, we set the state to SYN-SENT and, without releasing the socket
733 * lock, select a source port, enter ourselves into the hash tables and
734 * complete initialization after this.
736 tcp_set_state(sk, TCP_SYN_SENT);
737 err = tcp_v4_hash_connect(sk);
741 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
745 /* OK, now commit destination to socket. */
746 sk_setup_caps(sk, &rt->u.dst);
749 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
754 inet->id = tp->write_seq ^ jiffies;
756 err = tcp_connect(sk);
764 /* This unhashes the socket and releases the local port, if necessary. */
765 tcp_set_state(sk, TCP_CLOSE);
767 sk->sk_route_caps = 0;
772 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
774 return ((struct rtable *)skb->dst)->rt_iif;
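/* tcp_v4_synq_hash() spreads pending connection requests over the
 * listener's SYN table: a jhash of the remote address and port, salted
 * with a per-listener random value so remote hosts cannot aim for a
 * single chain, masked down to TCP_SYNQ_HSIZE buckets.
 */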
777 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
779 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
782 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
783 struct request_sock ***prevp,
785 __u32 raddr, __u32 laddr)
787 struct listen_sock *lopt = tp->accept_queue.listen_opt;
788 struct request_sock *req, **prev;
790 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
791 (req = *prev) != NULL;
792 prev = &req->dl_next) {
793 const struct inet_request_sock *ireq = inet_rsk(req);
795 if (ireq->rmt_port == rport &&
796 ireq->rmt_addr == raddr &&
797 ireq->loc_addr == laddr &&
798 TCP_INET_FAMILY(req->rsk_ops->family)) {
808 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
810 struct tcp_sock *tp = tcp_sk(sk);
811 struct listen_sock *lopt = tp->accept_queue.listen_opt;
812 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
814 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
820 * This routine does path mtu discovery as defined in RFC1191.
822 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
825 struct dst_entry *dst;
826 struct inet_sock *inet = inet_sk(sk);
827 struct tcp_sock *tp = tcp_sk(sk);
829 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
830 * sent out by Linux are always < 576 bytes, so they should go through
833 if (sk->sk_state == TCP_LISTEN)
836 /* We don't check in the dst entry whether pmtu discovery is forbidden
837 * on this route. We just assume that no packet-too-big packets
838 * are sent back when pmtu discovery is not active.
839 * There is a small race when the user changes this flag in the
840 * route, but I think that's acceptable.
842 if ((dst = __sk_dst_check(sk, 0)) == NULL)
845 dst->ops->update_pmtu(dst, mtu);
847 /* Something is about to go wrong... Remember the soft error
848 * in case this connection is not able to recover.
850 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
851 sk->sk_err_soft = EMSGSIZE;
855 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
856 tp->pmtu_cookie > mtu) {
857 tcp_sync_mss(sk, mtu);
859 /* Resend the TCP packet because it's
860 * clear that the old packet has been
861 * dropped. This is the new "fast" path mtu discovery.
864 tcp_simple_retransmit(sk);
865 } /* else let the usual retransmit timer handle it */
869 * This routine is called by the ICMP module when it gets some
870 * sort of error condition. If err < 0 then the socket should
871 * be closed and the error returned to the user. If err > 0
872 * it's just the icmp type << 8 | icmp code. After adjustment
873 * header points to the first 8 bytes of the tcp header. We need
874 * to find the appropriate port.
876 * The locking strategy used here is very "optimistic". When
877 * someone else accesses the socket the ICMP is just dropped
878 * and for some paths there is no check at all.
879 * A more general error queue to queue errors for later handling
880 * is probably better.
884 void tcp_v4_err(struct sk_buff *skb, u32 info)
886 struct iphdr *iph = (struct iphdr *)skb->data;
887 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
889 struct inet_sock *inet;
890 int type = skb->h.icmph->type;
891 int code = skb->h.icmph->code;
896 if (skb->len < (iph->ihl << 2) + 8) {
897 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
901 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
902 th->source, tcp_v4_iif(skb));
904 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
907 if (sk->sk_state == TCP_TIME_WAIT) {
908 tcp_tw_put((struct tcp_tw_bucket *)sk);
913 /* If too many ICMPs get dropped on busy
914 * servers this needs to be solved differently.
916 if (sock_owned_by_user(sk))
917 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
919 if (sk->sk_state == TCP_CLOSE)
923 seq = ntohl(th->seq);
924 if (sk->sk_state != TCP_LISTEN &&
925 !between(seq, tp->snd_una, tp->snd_nxt)) {
926 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
931 case ICMP_SOURCE_QUENCH:
932 /* Just silently ignore these. */
934 case ICMP_PARAMETERPROB:
937 case ICMP_DEST_UNREACH:
938 if (code > NR_ICMP_UNREACH)
941 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
942 if (!sock_owned_by_user(sk))
943 do_pmtu_discovery(sk, iph, info);
947 err = icmp_err_convert[code].errno;
949 case ICMP_TIME_EXCEEDED:
956 switch (sk->sk_state) {
957 struct request_sock *req, **prev;
959 if (sock_owned_by_user(sk))
962 req = tcp_v4_search_req(tp, &prev, th->dest,
963 iph->daddr, iph->saddr);
967 /* ICMPs are not backlogged, hence we cannot get
968 an established socket here.
972 if (seq != tcp_rsk(req)->snt_isn) {
973 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
978 * Still in SYN_RECV, just remove it silently.
979 * There is no good way to pass the error to the newly
980 * created socket, and POSIX does not want network
981 * errors returned from accept().
983 tcp_synq_drop(sk, req, prev);
987 case TCP_SYN_RECV: /* Cannot happen.
988 It can, e.g., if SYNs crossed.
990 if (!sock_owned_by_user(sk)) {
991 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
994 sk->sk_error_report(sk);
998 sk->sk_err_soft = err;
1003 /* If we've already connected we will keep trying
1004 * until we time out, or the user gives up.
1006 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1007 * to be considered hard errors (well, FRAG_FAILED too,
1008 * but it is obsoleted by pmtu discovery).
1010 * Note that in the modern internet, where routing is unreliable
1011 * and broken firewalls sit in every dark corner sending random
1012 * errors ordered by their masters, even these two messages finally lose
1013 * their original sense (even Linux sends invalid PORT_UNREACHs).
1015 * Now we are in compliance with the RFCs.
1020 if (!sock_owned_by_user(sk) && inet->recverr) {
1022 sk->sk_error_report(sk);
1023 } else { /* Only an error on timeout */
1024 sk->sk_err_soft = err;
1032 /* This routine computes an IPv4 TCP checksum. */
1033 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1034 struct sk_buff *skb)
1036 struct inet_sock *inet = inet_sk(sk);
1038 if (skb->ip_summed == CHECKSUM_HW) {
1039 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1040 skb->csum = offsetof(struct tcphdr, check);
1042 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1043 csum_partial((char *)th,
1050 * This routine will send an RST to the other TCP.
1052 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1054 * Answer: if a packet caused an RST, it is not for a socket
1055 * existing in our system; if it is matched to a socket,
1056 * it is just a duplicate segment or a bug in the other side's TCP.
1057 * So we build the reply based only on parameters
1058 * arriving with the segment.
1059 * Exception: precedence violation. We do not implement it in any case.
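 *
 * The sequence numbers chosen below follow the RFC 793 reset generation
 * rules: if the offending segment carried an ACK, the RST's seq is taken
 * from that ack_seq; otherwise the RST's ack_seq acknowledges everything
 * the segment occupied (seq + SYN + FIN + payload length).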
1062 static void tcp_v4_send_reset(struct sk_buff *skb)
1064 struct tcphdr *th = skb->h.th;
1066 struct ip_reply_arg arg;
1068 /* Never send a reset in response to a reset. */
1072 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1075 /* Swap the send and the receive. */
1076 memset(&rth, 0, sizeof(struct tcphdr));
1077 rth.dest = th->source;
1078 rth.source = th->dest;
1079 rth.doff = sizeof(struct tcphdr) / 4;
1083 rth.seq = th->ack_seq;
1086 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1087 skb->len - (th->doff << 2));
1090 memset(&arg, 0, sizeof arg);
1091 arg.iov[0].iov_base = (unsigned char *)&rth;
1092 arg.iov[0].iov_len = sizeof rth;
1093 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1094 skb->nh.iph->saddr, /*XXX*/
1095 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1096 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1098 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1100 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1101 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1104 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1105 outside socket context, is certainly ugly. What can I do?
1108 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1111 struct tcphdr *th = skb->h.th;
1116 struct ip_reply_arg arg;
1118 memset(&rep.th, 0, sizeof(struct tcphdr));
1119 memset(&arg, 0, sizeof arg);
1121 arg.iov[0].iov_base = (unsigned char *)&rep;
1122 arg.iov[0].iov_len = sizeof(rep.th);
1124 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1125 (TCPOPT_TIMESTAMP << 8) |
1127 rep.tsopt[1] = htonl(tcp_time_stamp);
1128 rep.tsopt[2] = htonl(ts);
1129 arg.iov[0].iov_len = sizeof(rep);
1132 /* Swap the send and the receive. */
1133 rep.th.dest = th->source;
1134 rep.th.source = th->dest;
1135 rep.th.doff = arg.iov[0].iov_len / 4;
1136 rep.th.seq = htonl(seq);
1137 rep.th.ack_seq = htonl(ack);
1139 rep.th.window = htons(win);
1141 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1142 skb->nh.iph->saddr, /*XXX*/
1143 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1144 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1146 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1148 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1151 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1153 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1155 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1156 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1161 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1163 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1167 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1168 struct request_sock *req)
1171 const struct inet_request_sock *ireq = inet_rsk(req);
1172 struct ip_options *opt = inet_rsk(req)->opt;
1173 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1175 { .daddr = ((opt && opt->srr) ?
1178 .saddr = ireq->loc_addr,
1179 .tos = RT_CONN_FLAGS(sk) } },
1180 .proto = IPPROTO_TCP,
1182 { .sport = inet_sk(sk)->sport,
1183 .dport = ireq->rmt_port } } };
1185 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1186 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1189 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1191 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1198 * Send a SYN-ACK after having received a SYN.
1199 * This still operates on a request_sock only, not on a big socket.
1202 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1203 struct dst_entry *dst)
1205 const struct inet_request_sock *ireq = inet_rsk(req);
1207 struct sk_buff * skb;
1209 /* First, grab a route. */
1210 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1213 skb = tcp_make_synack(sk, dst, req);
1216 struct tcphdr *th = skb->h.th;
1218 th->check = tcp_v4_check(th, skb->len,
1221 csum_partial((char *)th, skb->len,
1224 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1227 if (err == NET_XMIT_CN)
1237 * IPv4 request_sock destructor.
1239 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1241 if (inet_rsk(req)->opt)
1242 kfree(inet_rsk(req)->opt);
1245 static inline void syn_flood_warning(struct sk_buff *skb)
1247 static unsigned long warntime;
1249 if (time_after(jiffies, (warntime + HZ * 60))) {
1252 "possible SYN flooding on port %d. Sending cookies.\n",
1253 ntohs(skb->h.th->dest));
1258 * Save and compile IPv4 options into the request_sock if needed.
1260 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1261 struct sk_buff *skb)
1263 struct ip_options *opt = &(IPCB(skb)->opt);
1264 struct ip_options *dopt = NULL;
1266 if (opt && opt->optlen) {
1267 int opt_size = optlength(opt);
1268 dopt = kmalloc(opt_size, GFP_ATOMIC);
1270 if (ip_options_echo(dopt, skb)) {
1279 struct request_sock_ops tcp_request_sock_ops = {
1281 .obj_size = sizeof(struct tcp_request_sock),
1282 .rtx_syn_ack = tcp_v4_send_synack,
1283 .send_ack = tcp_v4_reqsk_send_ack,
1284 .destructor = tcp_v4_reqsk_destructor,
1285 .send_reset = tcp_v4_send_reset,
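/* tcp_v4_conn_request() handles an incoming SYN on a listening socket:
 * it allocates a request_sock, parses the TCP options, falls back to
 * syncookies when the SYN queue is full (if compiled in), applies the
 * TIME-WAIT recycling / peer-timestamp checks, picks an ISN and sends
 * the SYN-ACK, then queues the request on the SYN table.
 */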
1288 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1290 struct inet_request_sock *ireq;
1291 struct tcp_options_received tmp_opt;
1292 struct request_sock *req;
1293 __u32 saddr = skb->nh.iph->saddr;
1294 __u32 daddr = skb->nh.iph->daddr;
1295 __u32 isn = TCP_SKB_CB(skb)->when;
1296 struct dst_entry *dst = NULL;
1297 #ifdef CONFIG_SYN_COOKIES
1298 int want_cookie = 0;
1300 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1303 /* Never answer SYNs sent to broadcast or multicast */
1304 if (((struct rtable *)skb->dst)->rt_flags &
1305 (RTCF_BROADCAST | RTCF_MULTICAST))
1308 /* TW buckets are converted to open requests without
1309 * limitation; they conserve resources and the peer is
1310 * evidently a real one.
1312 if (tcp_synq_is_full(sk) && !isn) {
1313 #ifdef CONFIG_SYN_COOKIES
1314 if (sysctl_tcp_syncookies) {
1321 /* Accept backlog is full. If we have already queued enough
1322 * warm entries in the syn queue, drop the request. It is better than
1323 * clogging the syn queue with openreqs with exponentially increasing timeout.
1326 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1329 req = reqsk_alloc(&tcp_request_sock_ops);
1333 tcp_clear_options(&tmp_opt);
1334 tmp_opt.mss_clamp = 536;
1335 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1337 tcp_parse_options(skb, &tmp_opt, 0);
1340 tcp_clear_options(&tmp_opt);
1341 tmp_opt.saw_tstamp = 0;
1344 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1345 /* Some OSes (unknown ones, but I see them on web servers which
1346 * contain information interesting only for Windows
1347 * users) do not send their timestamp in the SYN. It is an easy case:
1348 * we simply do not advertise TS support.
1350 tmp_opt.saw_tstamp = 0;
1351 tmp_opt.tstamp_ok = 0;
1353 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1355 tcp_openreq_init(req, &tmp_opt, skb);
1357 ireq = inet_rsk(req);
1358 ireq->loc_addr = daddr;
1359 ireq->rmt_addr = saddr;
1360 ireq->opt = tcp_v4_save_options(sk, skb);
1362 TCP_ECN_create_request(req, skb->h.th);
1365 #ifdef CONFIG_SYN_COOKIES
1366 syn_flood_warning(skb);
1368 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1370 struct inet_peer *peer = NULL;
1372 /* VJ's idea. We save the last timestamp seen
1373 * from the destination in the peer table when entering
1374 * state TIME-WAIT, and check against it before
1375 * accepting a new connection request.
1377 * If "isn" is not zero, this request hit a live
1378 * timewait bucket, so all the necessary checks
1379 * are made in the function processing the timewait state.
1381 if (tmp_opt.saw_tstamp &&
1382 sysctl_tcp_tw_recycle &&
1383 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1384 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1385 peer->v4daddr == saddr) {
1386 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1387 (s32)(peer->tcp_ts - req->ts_recent) >
1389 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1394 /* Kill the following clause, if you dislike this way. */
1395 else if (!sysctl_tcp_syncookies &&
1396 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1397 (sysctl_max_syn_backlog >> 2)) &&
1398 (!peer || !peer->tcp_ts_stamp) &&
1399 (!dst || !dst_metric(dst, RTAX_RTT))) {
1400 /* Without syncookies, the last quarter of
1401 * the backlog is filled with destinations
1402 * proven to be alive.
1403 * It means that we continue to communicate
1404 * with destinations already remembered
1405 * up to the moment of the synflood.
1407 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1408 "request from %u.%u."
1411 ntohs(skb->h.th->source)));
1416 isn = tcp_v4_init_sequence(sk, skb);
1418 tcp_rsk(req)->snt_isn = isn;
1420 if (tcp_v4_send_synack(sk, req, dst))
1426 tcp_v4_synq_add(sk, req);
1433 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1439 * The three way handshake has completed - we got a valid ACK -
1440 * now create the new socket.
1442 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1443 struct request_sock *req,
1444 struct dst_entry *dst)
1446 struct inet_request_sock *ireq;
1447 struct inet_sock *newinet;
1448 struct tcp_sock *newtp;
1451 if (sk_acceptq_is_full(sk))
1454 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1457 newsk = tcp_create_openreq_child(sk, req, skb);
1461 sk_setup_caps(newsk, dst);
1463 newtp = tcp_sk(newsk);
1464 newinet = inet_sk(newsk);
1465 ireq = inet_rsk(req);
1466 newinet->daddr = ireq->rmt_addr;
1467 newinet->rcv_saddr = ireq->loc_addr;
1468 newinet->saddr = ireq->loc_addr;
1469 newinet->opt = ireq->opt;
1471 newinet->mc_index = tcp_v4_iif(skb);
1472 newinet->mc_ttl = skb->nh.iph->ttl;
1473 newtp->ext_header_len = 0;
1475 newtp->ext_header_len = newinet->opt->optlen;
1476 newinet->id = newtp->write_seq ^ jiffies;
1478 tcp_sync_mss(newsk, dst_mtu(dst));
1479 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1480 tcp_initialize_rcv_mss(newsk);
1482 __tcp_v4_hash(newsk, 0);
1483 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1488 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1490 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1495 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1497 struct tcphdr *th = skb->h.th;
1498 struct iphdr *iph = skb->nh.iph;
1499 struct tcp_sock *tp = tcp_sk(sk);
1501 struct request_sock **prev;
1502 /* Find possible connection requests. */
1503 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1504 iph->saddr, iph->daddr);
1506 return tcp_check_req(sk, skb, req, prev);
1508 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1515 if (nsk->sk_state != TCP_TIME_WAIT) {
1519 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1523 #ifdef CONFIG_SYN_COOKIES
1524 if (!th->rst && !th->syn && th->ack)
1525 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1530 static int tcp_v4_checksum_init(struct sk_buff *skb)
1532 if (skb->ip_summed == CHECKSUM_HW) {
1533 skb->ip_summed = CHECKSUM_UNNECESSARY;
1534 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1535 skb->nh.iph->daddr, skb->csum))
1538 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1539 skb->ip_summed = CHECKSUM_NONE;
1541 if (skb->len <= 76) {
1542 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1544 skb_checksum(skb, 0, skb->len, 0)))
1546 skb->ip_summed = CHECKSUM_UNNECESSARY;
1548 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1550 skb->nh.iph->daddr, 0);
1556 /* The socket must have its spinlock held when we get
1559 * We have a potential double-lock case here, so even when
1560 * doing backlog processing we use the BH locking scheme.
1561 * This is because we cannot sleep with the original spinlock
1564 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1566 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1567 TCP_CHECK_TIMER(sk);
1568 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1570 TCP_CHECK_TIMER(sk);
1574 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1577 if (sk->sk_state == TCP_LISTEN) {
1578 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1583 if (tcp_child_process(sk, nsk, skb))
1589 TCP_CHECK_TIMER(sk);
1590 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1592 TCP_CHECK_TIMER(sk);
1596 tcp_v4_send_reset(skb);
1599 /* Be careful here. If this function gets more complicated and
1600 * gcc suffers from register pressure on the x86, sk (in %ebx)
1601 * might be destroyed here. This current version compiles correctly,
1602 * but you have been warned.
1607 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1615 int tcp_v4_rcv(struct sk_buff *skb)
1621 if (skb->pkt_type != PACKET_HOST)
1624 /* Count it even if it's bad */
1625 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1627 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1632 if (th->doff < sizeof(struct tcphdr) / 4)
1634 if (!pskb_may_pull(skb, th->doff * 4))
1637 /* An explanation is required here, I think.
1638 * Packet length and doff are validated by header prediction,
1639 * provided the case of th->doff == 0 is eliminated.
1640 * So, we defer the checks. */
1641 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1642 tcp_v4_checksum_init(skb) < 0))
1646 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1647 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1648 skb->len - th->doff * 4);
1649 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1650 TCP_SKB_CB(skb)->when = 0;
1651 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1652 TCP_SKB_CB(skb)->sacked = 0;
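/* Look the segment up: first in the established hash (whose upper half
 * holds the TIME-WAIT buckets), then, failing that, in the listening
 * hash; see __tcp_v4_lookup() above.
 */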
1654 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1655 skb->nh.iph->daddr, ntohs(th->dest),
1662 if (sk->sk_state == TCP_TIME_WAIT)
1665 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1666 goto discard_and_relse;
1668 if (sk_filter(sk, skb, 0))
1669 goto discard_and_relse;
1675 if (!sock_owned_by_user(sk)) {
1676 if (!tcp_prequeue(sk, skb))
1677 ret = tcp_v4_do_rcv(sk, skb);
1679 sk_add_backlog(sk, skb);
1687 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1690 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1692 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1694 tcp_v4_send_reset(skb);
1698 /* Discard frame. */
1707 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1708 tcp_tw_put((struct tcp_tw_bucket *) sk);
1712 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1713 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1714 tcp_tw_put((struct tcp_tw_bucket *) sk);
1717 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1718 skb, th, skb->len)) {
1720 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1724 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1725 tcp_tw_put((struct tcp_tw_bucket *)sk);
1729 /* Fall through to ACK */
1732 tcp_v4_timewait_ack(sk, skb);
1736 case TCP_TW_SUCCESS:;
1741 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1743 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1744 struct inet_sock *inet = inet_sk(sk);
1746 sin->sin_family = AF_INET;
1747 sin->sin_addr.s_addr = inet->daddr;
1748 sin->sin_port = inet->dport;
1751 /* VJ's idea. Save the last timestamp seen from this destination
1752 * and hold it at least for the normal timewait interval, to use for duplicate
1753 * segment detection in subsequent connections before they enter the synchronized state.
1757 int tcp_v4_remember_stamp(struct sock *sk)
1759 struct inet_sock *inet = inet_sk(sk);
1760 struct tcp_sock *tp = tcp_sk(sk);
1761 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1762 struct inet_peer *peer = NULL;
1765 if (!rt || rt->rt_dst != inet->daddr) {
1766 peer = inet_getpeer(inet->daddr, 1);
1770 rt_bind_peer(rt, 1);
1775 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1776 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1777 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1778 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1779 peer->tcp_ts = tp->rx_opt.ts_recent;
1789 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1791 struct inet_peer *peer = NULL;
1793 peer = inet_getpeer(tw->tw_daddr, 1);
1796 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1797 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1798 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1799 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1800 peer->tcp_ts = tw->tw_ts_recent;
1809 struct tcp_func ipv4_specific = {
1810 .queue_xmit = ip_queue_xmit,
1811 .send_check = tcp_v4_send_check,
1812 .rebuild_header = inet_sk_rebuild_header,
1813 .conn_request = tcp_v4_conn_request,
1814 .syn_recv_sock = tcp_v4_syn_recv_sock,
1815 .remember_stamp = tcp_v4_remember_stamp,
1816 .net_header_len = sizeof(struct iphdr),
1817 .setsockopt = ip_setsockopt,
1818 .getsockopt = ip_getsockopt,
1819 .addr2sockaddr = v4_addr2sockaddr,
1820 .sockaddr_len = sizeof(struct sockaddr_in),
1823 /* NOTE: A lot of things are set to zero explicitly by the call to
1824 * sk_alloc(), so they need not be done here.
1826 static int tcp_v4_init_sock(struct sock *sk)
1828 struct tcp_sock *tp = tcp_sk(sk);
1830 skb_queue_head_init(&tp->out_of_order_queue);
1831 tcp_init_xmit_timers(sk);
1832 tcp_prequeue_init(tp);
1834 tp->rto = TCP_TIMEOUT_INIT;
1835 tp->mdev = TCP_TIMEOUT_INIT;
1837 /* So many TCP implementations out there (incorrectly) count the
1838 * initial SYN frame in their delayed-ACK and congestion control
1839 * algorithms that we must have the following bandaid to talk
1840 * efficiently to them. -DaveM
1844 /* See draft-stevens-tcpca-spec-01 for discussion of the
1845 * initialization of these values.
1847 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1848 tp->snd_cwnd_clamp = ~0;
1849 tp->mss_cache = 536;
1851 tp->reordering = sysctl_tcp_reordering;
1852 tp->ca_ops = &tcp_init_congestion_ops;
1854 sk->sk_state = TCP_CLOSE;
1856 sk->sk_write_space = sk_stream_write_space;
1857 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1859 tp->af_specific = &ipv4_specific;
1861 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1862 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1864 atomic_inc(&tcp_sockets_allocated);
1869 int tcp_v4_destroy_sock(struct sock *sk)
1871 struct tcp_sock *tp = tcp_sk(sk);
1873 tcp_clear_xmit_timers(sk);
1875 tcp_cleanup_congestion_control(tp);
1877 /* Clean up the write buffer. */
1878 sk_stream_writequeue_purge(sk);
1880 /* Cleans up our, hopefully empty, out_of_order_queue. */
1881 __skb_queue_purge(&tp->out_of_order_queue);
1883 /* Clean the prequeue; it really must be empty */
1884 __skb_queue_purge(&tp->ucopy.prequeue);
1886 /* Clean up a referenced TCP bind bucket. */
1887 if (inet_sk(sk)->bind_hash)
1888 inet_put_port(&tcp_hashinfo, sk);
1891 * If sendmsg cached page exists, toss it.
1893 if (sk->sk_sndmsg_page) {
1894 __free_page(sk->sk_sndmsg_page);
1895 sk->sk_sndmsg_page = NULL;
1898 atomic_dec(&tcp_sockets_allocated);
1903 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1905 #ifdef CONFIG_PROC_FS
1906 /* Proc filesystem TCP sock list dumping. */
1908 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1910 return hlist_empty(head) ? NULL :
1911 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1914 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1916 return tw->tw_node.next ?
1917 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
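/* /proc/net/tcp dumping walks the tables in a fixed order: listening
 * sockets (with their pending open requests) first, then established
 * sockets, then the TIME-WAIT half of each established bucket; st->state
 * in struct tcp_iter_state tracks where the iterator currently is.
 */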
1920 static void *listening_get_next(struct seq_file *seq, void *cur)
1922 struct tcp_sock *tp;
1923 struct hlist_node *node;
1924 struct sock *sk = cur;
1925 struct tcp_iter_state* st = seq->private;
1929 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1935 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1936 struct request_sock *req = cur;
1938 tp = tcp_sk(st->syn_wait_sk);
1942 if (req->rsk_ops->family == st->family) {
1948 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1951 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1953 sk = sk_next(st->syn_wait_sk);
1954 st->state = TCP_SEQ_STATE_LISTENING;
1955 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1958 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1959 if (reqsk_queue_len(&tp->accept_queue))
1961 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1965 sk_for_each_from(sk, node) {
1966 if (sk->sk_family == st->family) {
1971 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1972 if (reqsk_queue_len(&tp->accept_queue)) {
1974 st->uid = sock_i_uid(sk);
1975 st->syn_wait_sk = sk;
1976 st->state = TCP_SEQ_STATE_OPENREQ;
1980 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1982 if (++st->bucket < INET_LHTABLE_SIZE) {
1983 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1991 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1993 void *rc = listening_get_next(seq, NULL);
1995 while (rc && *pos) {
1996 rc = listening_get_next(seq, rc);
2002 static void *established_get_first(struct seq_file *seq)
2004 struct tcp_iter_state* st = seq->private;
2007 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2009 struct hlist_node *node;
2010 struct tcp_tw_bucket *tw;
2012 /* We can reschedule _before_ having picked the target: */
2013 cond_resched_softirq();
2015 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2016 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2017 if (sk->sk_family != st->family) {
2023 st->state = TCP_SEQ_STATE_TIME_WAIT;
2024 tw_for_each(tw, node,
2025 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
2026 if (tw->tw_family != st->family) {
2032 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2033 st->state = TCP_SEQ_STATE_ESTABLISHED;
2039 static void *established_get_next(struct seq_file *seq, void *cur)
2041 struct sock *sk = cur;
2042 struct tcp_tw_bucket *tw;
2043 struct hlist_node *node;
2044 struct tcp_iter_state* st = seq->private;
2048 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2052 while (tw && tw->tw_family != st->family) {
2059 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2060 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062 /* We can reschedule between buckets: */
2063 cond_resched_softirq();
2065 if (++st->bucket < tcp_hashinfo.ehash_size) {
2066 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2067 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2075 sk_for_each_from(sk, node) {
2076 if (sk->sk_family == st->family)
2080 st->state = TCP_SEQ_STATE_TIME_WAIT;
2081 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2089 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2091 void *rc = established_get_first(seq);
2094 rc = established_get_next(seq, rc);
2100 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2103 struct tcp_iter_state* st = seq->private;
2106 st->state = TCP_SEQ_STATE_LISTENING;
2107 rc = listening_get_idx(seq, &pos);
2110 tcp_listen_unlock();
2112 st->state = TCP_SEQ_STATE_ESTABLISHED;
2113 rc = established_get_idx(seq, pos);
2119 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2121 struct tcp_iter_state* st = seq->private;
2122 st->state = TCP_SEQ_STATE_LISTENING;
2124 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2127 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2130 struct tcp_iter_state* st;
2132 if (v == SEQ_START_TOKEN) {
2133 rc = tcp_get_idx(seq, 0);
2138 switch (st->state) {
2139 case TCP_SEQ_STATE_OPENREQ:
2140 case TCP_SEQ_STATE_LISTENING:
2141 rc = listening_get_next(seq, v);
2143 tcp_listen_unlock();
2145 st->state = TCP_SEQ_STATE_ESTABLISHED;
2146 rc = established_get_first(seq);
2149 case TCP_SEQ_STATE_ESTABLISHED:
2150 case TCP_SEQ_STATE_TIME_WAIT:
2151 rc = established_get_next(seq, v);
2159 static void tcp_seq_stop(struct seq_file *seq, void *v)
2161 struct tcp_iter_state* st = seq->private;
2163 switch (st->state) {
2164 case TCP_SEQ_STATE_OPENREQ:
2166 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2167 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2169 case TCP_SEQ_STATE_LISTENING:
2170 if (v != SEQ_START_TOKEN)
2171 tcp_listen_unlock();
2173 case TCP_SEQ_STATE_TIME_WAIT:
2174 case TCP_SEQ_STATE_ESTABLISHED:
2176 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2182 static int tcp_seq_open(struct inode *inode, struct file *file)
2184 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2185 struct seq_file *seq;
2186 struct tcp_iter_state *s;
2189 if (unlikely(afinfo == NULL))
2192 s = kmalloc(sizeof(*s), GFP_KERNEL);
2195 memset(s, 0, sizeof(*s));
2196 s->family = afinfo->family;
2197 s->seq_ops.start = tcp_seq_start;
2198 s->seq_ops.next = tcp_seq_next;
2199 s->seq_ops.show = afinfo->seq_show;
2200 s->seq_ops.stop = tcp_seq_stop;
2202 rc = seq_open(file, &s->seq_ops);
2205 seq = file->private_data;
2214 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2217 struct proc_dir_entry *p;
2221 afinfo->seq_fops->owner = afinfo->owner;
2222 afinfo->seq_fops->open = tcp_seq_open;
2223 afinfo->seq_fops->read = seq_read;
2224 afinfo->seq_fops->llseek = seq_lseek;
2225 afinfo->seq_fops->release = seq_release_private;
2227 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2235 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2239 proc_net_remove(afinfo->name);
2240 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2243 static void get_openreq4(struct sock *sk, struct request_sock *req,
2244 char *tmpbuf, int i, int uid)
2246 const struct inet_request_sock *ireq = inet_rsk(req);
2247 int ttd = req->expires - jiffies;
2249 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2250 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2253 ntohs(inet_sk(sk)->sport),
2255 ntohs(ireq->rmt_port),
2257 0, 0, /* could print option size, but that is af dependent. */
2258 1, /* timers active (only the expire timer) */
2259 jiffies_to_clock_t(ttd),
2262 0, /* non standard timer */
2263 0, /* open_requests have no inode */
2264 atomic_read(&sk->sk_refcnt),
2268 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2271 unsigned long timer_expires;
2272 struct tcp_sock *tp = tcp_sk(sp);
2273 struct inet_sock *inet = inet_sk(sp);
2274 unsigned int dest = inet->daddr;
2275 unsigned int src = inet->rcv_saddr;
2276 __u16 destp = ntohs(inet->dport);
2277 __u16 srcp = ntohs(inet->sport);
2279 if (tp->pending == TCP_TIME_RETRANS) {
2281 timer_expires = tp->timeout;
2282 } else if (tp->pending == TCP_TIME_PROBE0) {
2284 timer_expires = tp->timeout;
2285 } else if (timer_pending(&sp->sk_timer)) {
2287 timer_expires = sp->sk_timer.expires;
2290 timer_expires = jiffies;
2293 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2294 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2295 i, src, srcp, dest, destp, sp->sk_state,
2296 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2298 jiffies_to_clock_t(timer_expires - jiffies),
2303 atomic_read(&sp->sk_refcnt), sp,
2304 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2306 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2309 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2311 unsigned int dest, src;
2313 int ttd = tw->tw_ttd - jiffies;
2318 dest = tw->tw_daddr;
2319 src = tw->tw_rcv_saddr;
2320 destp = ntohs(tw->tw_dport);
2321 srcp = ntohs(tw->tw_sport);
2323 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2324 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2325 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2326 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2327 atomic_read(&tw->tw_refcnt), tw);
2332 static int tcp4_seq_show(struct seq_file *seq, void *v)
2334 struct tcp_iter_state* st;
2335 char tmpbuf[TMPSZ + 1];
2337 if (v == SEQ_START_TOKEN) {
2338 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2339 " sl local_address rem_address st tx_queue "
2340 "rx_queue tr tm->when retrnsmt uid timeout "
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_LISTENING:
2348 case TCP_SEQ_STATE_ESTABLISHED:
2349 get_tcp4_sock(v, tmpbuf, st->num);
2351 case TCP_SEQ_STATE_OPENREQ:
2352 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2354 case TCP_SEQ_STATE_TIME_WAIT:
2355 get_timewait4_sock(v, tmpbuf, st->num);
2358 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2363 static struct file_operations tcp4_seq_fops;
2364 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2365 .owner = THIS_MODULE,
2368 .seq_show = tcp4_seq_show,
2369 .seq_fops = &tcp4_seq_fops,
2372 int __init tcp4_proc_init(void)
2374 return tcp_proc_register(&tcp4_seq_afinfo);
2377 void tcp4_proc_exit(void)
2379 tcp_proc_unregister(&tcp4_seq_afinfo);
2381 #endif /* CONFIG_PROC_FS */
2383 struct proto tcp_prot = {
2385 .owner = THIS_MODULE,
2387 .connect = tcp_v4_connect,
2388 .disconnect = tcp_disconnect,
2389 .accept = tcp_accept,
2391 .init = tcp_v4_init_sock,
2392 .destroy = tcp_v4_destroy_sock,
2393 .shutdown = tcp_shutdown,
2394 .setsockopt = tcp_setsockopt,
2395 .getsockopt = tcp_getsockopt,
2396 .sendmsg = tcp_sendmsg,
2397 .recvmsg = tcp_recvmsg,
2398 .backlog_rcv = tcp_v4_do_rcv,
2399 .hash = tcp_v4_hash,
2400 .unhash = tcp_unhash,
2401 .get_port = tcp_v4_get_port,
2402 .enter_memory_pressure = tcp_enter_memory_pressure,
2403 .sockets_allocated = &tcp_sockets_allocated,
2404 .memory_allocated = &tcp_memory_allocated,
2405 .memory_pressure = &tcp_memory_pressure,
2406 .sysctl_mem = sysctl_tcp_mem,
2407 .sysctl_wmem = sysctl_tcp_wmem,
2408 .sysctl_rmem = sysctl_tcp_rmem,
2409 .max_header = MAX_TCP_HEADER,
2410 .obj_size = sizeof(struct tcp_sock),
2411 .rsk_prot = &tcp_request_sock_ops,
2416 void __init tcp_v4_init(struct net_proto_family *ops)
2418 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2420 panic("Failed to create the TCP control socket.\n");
2421 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2422 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2424 /* Unhash it so that IP input processing does not even
2425 * see it; we do not wish this socket to see incoming
2428 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2431 EXPORT_SYMBOL(ipv4_specific);
2432 EXPORT_SYMBOL(inet_bind_bucket_create);
2433 EXPORT_SYMBOL(tcp_hashinfo);
2434 EXPORT_SYMBOL(tcp_listen_wlock);
2435 EXPORT_SYMBOL(tcp_prot);
2436 EXPORT_SYMBOL(tcp_unhash);
2437 EXPORT_SYMBOL(tcp_v4_conn_request);
2438 EXPORT_SYMBOL(tcp_v4_connect);
2439 EXPORT_SYMBOL(tcp_v4_do_rcv);
2440 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2441 EXPORT_SYMBOL(tcp_v4_send_check);
2442 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2444 #ifdef CONFIG_PROC_FS
2445 EXPORT_SYMBOL(tcp_proc_register);
2446 EXPORT_SYMBOL(tcp_proc_unregister);
2448 EXPORT_SYMBOL(sysctl_local_port_range);
2449 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2450 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);