2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
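/* The first 8 bytes of the embedded TCP header carry the source port,
 * destination port and sequence number -- enough for tcp_v4_err() to
 * look up the socket and sanity-check the sequence number, which is
 * why only that much of the original segment is required.
 */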
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this range. */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
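/* Illustrative only: the range can be widened at run time without
 * rebuilding, e.g. on a busy proxy box:
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 *
 * (the values above are just an example, not a recommendation).
 */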
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
109 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
114 sk_for_each_bound(sk2, node, &tb->owners) {
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
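/* In short: a new bind conflicts with an existing owner of the port
 * only if the two sockets could receive the same traffic: neither is
 * bound to a different device, their receive addresses overlap (one
 * is a wildcard or they are equal), and SO_REUSEADDR does not rescue
 * them (both must have set it, and the existing socket must not be
 * listening).
 */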
132 /* Obtain a reference to a local port for the given sock,
133 * if snum is zero, select any available local port. */
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
153 rover = tcp_hashinfo.port_rover;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
177 if (unlikely(remaining <= 0))
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its lock. */
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
202 if (tcp_bind_conflict(sk, tb))
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
225 spin_unlock(&head->lock);
231 static void tcp_v4_hash(struct sock *sk)
233 if (sk->sk_state != TCP_CLOSE) {
235 __inet_hash(&tcp_hashinfo, sk, 1);
240 void tcp_unhash(struct sock *sk)
247 if (sk->sk_state == TCP_LISTEN) {
249 inet_listen_wlock(&tcp_hashinfo);
250 lock = &tcp_hashinfo.lhash_lock;
252 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent];
254 write_lock_bh(&head->lock);
257 if (__sk_del_node_init(sk))
258 sock_prot_dec_use(sk->sk_prot);
259 write_unlock_bh(lock);
262 if (sk->sk_state == TCP_LISTEN)
263 wake_up(&tcp_hashinfo.lhash_wait);
266 /* Don't inline this cruft. There are some nice properties to
267 * exploit here. The BSD API does not allow a listening TCP
268 * to specify the remote port nor the remote address for the
269 * connection. So always assume those are both wildcarded
270 * during the search since they can never be otherwise.
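/* Scoring sketch: a listener only stays in the running if any address
 * or device it is bound to matches the incoming packet; among the
 * survivors, more specific bindings beat wildcards and a plain AF_INET
 * socket is preferred over an IPv6 socket accepting v4-mapped traffic,
 * so the most specific listener wins.
 */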
272 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
274 const unsigned short hnum,
277 struct sock *result = NULL, *sk;
278 struct hlist_node *node;
282 sk_for_each(sk, node, head) {
283 struct inet_sock *inet = inet_sk(sk);
285 if (inet->num == hnum && !ipv6_only_sock(sk)) {
286 __u32 rcv_saddr = inet->rcv_saddr;
288 score = (sk->sk_family == PF_INET ? 1 : 0);
290 if (rcv_saddr != daddr)
294 if (sk->sk_bound_dev_if) {
295 if (sk->sk_bound_dev_if != dif)
301 if (score > hiscore) {
310 /* Optimize the common listener case. */
311 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
312 const unsigned short hnum,
315 struct sock *sk = NULL;
316 struct hlist_head *head;
318 read_lock(&tcp_hashinfo.lhash_lock);
319 head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
320 if (!hlist_empty(head)) {
321 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
323 if (inet->num == hnum && !sk->sk_node.next &&
324 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
325 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
326 !sk->sk_bound_dev_if)
328 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
334 read_unlock(&tcp_hashinfo.lhash_lock);
338 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
339 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
341 * Local BH must be disabled here.
344 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
350 struct inet_ehash_bucket *head;
351 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
352 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
354 struct hlist_node *node;
355 /* Optimize here for direct hit, only listening connections can
356 * have wildcards anyway. */
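/* The established hash table is really two tables in one allocation:
 * bucket [hash] holds the established sockets, while bucket
 * [hash + ehash_size] holds the TIME_WAIT sockets for the same hash
 * value, covered by the same per-bucket lock.
 */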
358 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
359 head = &tcp_hashinfo.ehash[hash];
360 read_lock(&head->lock);
361 sk_for_each(sk, node, &head->chain) {
362 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
363 goto hit; /* You sunk my battleship! */
366 /* Must check for a TIME_WAIT'er before going to listener hash. */
367 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
368 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
373 read_unlock(&head->lock);
380 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
381 u32 daddr, u16 hnum, int dif)
383 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
386 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
389 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
395 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
401 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
403 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
405 return secure_tcp_sequence_number(skb->nh.iph->daddr,
411 /* called with local bh disabled */
412 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
413 struct tcp_tw_bucket **twp)
415 struct inet_sock *inet = inet_sk(sk);
416 u32 daddr = inet->rcv_saddr;
417 u32 saddr = inet->daddr;
418 int dif = sk->sk_bound_dev_if;
419 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
420 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
421 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
422 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
424 struct hlist_node *node;
425 struct tcp_tw_bucket *tw;
427 write_lock(&head->lock);
429 /* Check TIME-WAIT sockets first. */
430 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
431 tw = (struct tcp_tw_bucket *)sk2;
433 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
434 struct tcp_sock *tp = tcp_sk(sk);
436 /* With PAWS, it is safe from the viewpoint
437 of data integrity. Even without PAWS it
438 is safe provided sequence spaces do not
439 overlap i.e. at data rates <= 80Mbit/sec.
441 Actually, the idea is close to VJ's one,
442 only timestamp cache is held not per host,
443 but per port pair and TW bucket is used
446 If TW bucket has been already destroyed we
447 fall back to VJ's scheme and use initial
448 timestamp retrieved from peer table.
450 if (tw->tw_ts_recent_stamp &&
451 (!twp || (sysctl_tcp_tw_reuse &&
453 tw->tw_ts_recent_stamp > 1))) {
455 tw->tw_snd_nxt + 65535 + 2) == 0)
457 tp->rx_opt.ts_recent = tw->tw_ts_recent;
458 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
467 /* And established part... */
468 sk_for_each(sk2, node, &head->chain) {
469 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
474 /* Must record num and sport now. Otherwise we will see
475 * a socket with a funny identity in the hash table. */
477 inet->sport = htons(lport);
478 sk->sk_hashent = hash;
479 BUG_TRAP(sk_unhashed(sk));
480 __sk_add_node(sk, &head->chain);
481 sock_prot_inc_use(sk->sk_prot);
482 write_unlock(&head->lock);
486 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
488 /* Silly. Should hash-dance instead... */
489 tcp_tw_deschedule(tw);
490 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
498 write_unlock(&head->lock);
499 return -EADDRNOTAVAIL;
502 static inline u32 connect_port_offset(const struct sock *sk)
504 const struct inet_sock *inet = inet_sk(sk);
506 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
511 * Bind a port for a connect operation and hash it.
513 static inline int tcp_v4_hash_connect(struct sock *sk)
515 const unsigned short snum = inet_sk(sk)->num;
516 struct inet_bind_hashbucket *head;
517 struct inet_bind_bucket *tb;
521 int low = sysctl_local_port_range[0];
522 int high = sysctl_local_port_range[1];
523 int range = high - low;
527 u32 offset = hint + connect_port_offset(sk);
528 struct hlist_node *node;
529 struct tcp_tw_bucket *tw = NULL;
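/* Sketch of the search below: start at a pseudo-random offset derived
 * from the connection's addresses (connect_port_offset()), so parallel
 * connects spread over the ephemeral range, then walk the range once.
 * Bind buckets created by explicit bind() (fastreuse >= 0) are skipped;
 * for the remaining candidates only the established/TIME-WAIT hash has
 * to confirm that the full 4-tuple is unique.
 */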
532 for (i = 1; i <= range; i++) {
533 port = low + (i + offset) % range;
534 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
535 spin_lock(&head->lock);
537 /* Does not bother with rcv_saddr checks,
538 * because the established check is already unique enough. */
541 inet_bind_bucket_for_each(tb, node, &head->chain) {
542 if (tb->port == port) {
543 BUG_TRAP(!hlist_empty(&tb->owners));
544 if (tb->fastreuse >= 0)
546 if (!__tcp_v4_check_established(sk,
554 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
556 spin_unlock(&head->lock);
563 spin_unlock(&head->lock);
567 return -EADDRNOTAVAIL;
572 /* Head lock still held and bh's disabled */
573 inet_bind_hash(sk, tb, port);
574 if (sk_unhashed(sk)) {
575 inet_sk(sk)->sport = htons(port);
576 __inet_hash(&tcp_hashinfo, sk, 0);
578 spin_unlock(&head->lock);
581 tcp_tw_deschedule(tw);
589 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
590 tb = inet_sk(sk)->bind_hash;
591 spin_lock_bh(&head->lock);
592 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
593 __inet_hash(&tcp_hashinfo, sk, 0);
594 spin_unlock_bh(&head->lock);
597 spin_unlock(&head->lock);
598 /* No definite answer... Walk the established hash table. */
599 ret = __tcp_v4_check_established(sk, snum, NULL);
606 /* This will initiate an outgoing connection. */
607 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
609 struct inet_sock *inet = inet_sk(sk);
610 struct tcp_sock *tp = tcp_sk(sk);
611 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
617 if (addr_len < sizeof(struct sockaddr_in))
620 if (usin->sin_family != AF_INET)
621 return -EAFNOSUPPORT;
623 nexthop = daddr = usin->sin_addr.s_addr;
624 if (inet->opt && inet->opt->srr) {
627 nexthop = inet->opt->faddr;
630 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
631 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
633 inet->sport, usin->sin_port, sk);
637 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
642 if (!inet->opt || !inet->opt->srr)
646 inet->saddr = rt->rt_src;
647 inet->rcv_saddr = inet->saddr;
649 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
650 /* Reset inherited state */
651 tp->rx_opt.ts_recent = 0;
652 tp->rx_opt.ts_recent_stamp = 0;
656 if (sysctl_tcp_tw_recycle &&
657 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
658 struct inet_peer *peer = rt_get_peer(rt);
660 /* VJ's idea. We save the last timestamp seen from
661 * the destination in the peer table when entering TIME-WAIT state,
662 * and initialize rx_opt.ts_recent from it when trying a new connection. */
665 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
666 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
667 tp->rx_opt.ts_recent = peer->tcp_ts;
671 inet->dport = usin->sin_port;
674 tp->ext_header_len = 0;
676 tp->ext_header_len = inet->opt->optlen;
678 tp->rx_opt.mss_clamp = 536;
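/* 536 is the classic default MSS from RFC 1122: the 576 byte minimum
 * reassembly buffer minus 40 bytes of IPv4 + TCP headers.
 */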
680 /* Socket identity is still unknown (sport may be zero).
681 * However we set the state to SYN-SENT and, without releasing the socket
682 * lock, we select a source port, enter ourselves into the hash tables and
683 * complete initialization after this. */
685 tcp_set_state(sk, TCP_SYN_SENT);
686 err = tcp_v4_hash_connect(sk);
690 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
694 /* OK, now commit destination to socket. */
695 sk_setup_caps(sk, &rt->u.dst);
698 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
703 inet->id = tp->write_seq ^ jiffies;
705 err = tcp_connect(sk);
713 /* This unhashes the socket and releases the local port, if necessary. */
714 tcp_set_state(sk, TCP_CLOSE);
716 sk->sk_route_caps = 0;
721 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
723 return ((struct rtable *)skb->dst)->rt_iif;
726 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
728 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
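/* Note: the mask above requires TCP_SYNQ_HSIZE to be a power of two.
 * Mixing the remote address/port with the listener's private hash_rnd
 * keeps the bucket placement unpredictable to a remote attacker.
 */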
731 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
732 struct request_sock ***prevp,
734 __u32 raddr, __u32 laddr)
736 struct listen_sock *lopt = tp->accept_queue.listen_opt;
737 struct request_sock *req, **prev;
739 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
740 (req = *prev) != NULL;
741 prev = &req->dl_next) {
742 const struct inet_request_sock *ireq = inet_rsk(req);
744 if (ireq->rmt_port == rport &&
745 ireq->rmt_addr == raddr &&
746 ireq->loc_addr == laddr &&
747 TCP_INET_FAMILY(req->rsk_ops->family)) {
757 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
759 struct tcp_sock *tp = tcp_sk(sk);
760 struct listen_sock *lopt = tp->accept_queue.listen_opt;
761 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
763 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
769 * This routine does path mtu discovery as defined in RFC1191.
771 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
774 struct dst_entry *dst;
775 struct inet_sock *inet = inet_sk(sk);
776 struct tcp_sock *tp = tcp_sk(sk);
778 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
779 * sent out by Linux are always < 576 bytes, so they should go through unfragmented). */
782 if (sk->sk_state == TCP_LISTEN)
785 /* We don't check in the dst entry if pmtu discovery is forbidden
786 * on this route. We just assume that no packet-too-big packets
787 * are sent back when pmtu discovery is not active.
788 * There is a small race when the user changes this flag in the
789 * route, but I think that's acceptable.
791 if ((dst = __sk_dst_check(sk, 0)) == NULL)
794 dst->ops->update_pmtu(dst, mtu);
796 /* Something is about to be wrong... Remember soft error
797 * for the case that this connection will not be able to recover. */
799 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
800 sk->sk_err_soft = EMSGSIZE;
804 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
805 tp->pmtu_cookie > mtu) {
806 tcp_sync_mss(sk, mtu);
808 /* Resend the TCP packet because it's
809 * clear that the old packet has been
810 * dropped. This is the new "fast" path mtu discovery. */
813 tcp_simple_retransmit(sk);
814 } /* else let the usual retransmit timer handle it */
818 * This routine is called by the ICMP module when it gets some
819 * sort of error condition. If err < 0 then the socket should
820 * be closed and the error returned to the user. If err > 0
821 * it's just the icmp type << 8 | icmp code. After adjustment
822 * header points to the first 8 bytes of the tcp header. We need
823 * to find the appropriate port.
825 * The locking strategy used here is very "optimistic". When
826 * someone else accesses the socket the ICMP is just dropped
827 * and for some paths there is no check at all.
828 * A more general error queue to queue errors for later handling
829 * is probably better.
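/* Worked example of the encoding above: a port-unreachable ICMP error
 * arrives as type ICMP_DEST_UNREACH (3), code ICMP_PORT_UNREACH (3),
 * i.e. err == (3 << 8) | 3 == 0x0303, which icmp_err_convert[] later
 * maps to ECONNREFUSED.
 */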
833 void tcp_v4_err(struct sk_buff *skb, u32 info)
835 struct iphdr *iph = (struct iphdr *)skb->data;
836 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
838 struct inet_sock *inet;
839 int type = skb->h.icmph->type;
840 int code = skb->h.icmph->code;
845 if (skb->len < (iph->ihl << 2) + 8) {
846 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
850 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
851 th->source, tcp_v4_iif(skb));
853 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
856 if (sk->sk_state == TCP_TIME_WAIT) {
857 tcp_tw_put((struct tcp_tw_bucket *)sk);
862 /* If too many ICMPs get dropped on busy
863 * servers this needs to be solved differently.
865 if (sock_owned_by_user(sk))
866 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
868 if (sk->sk_state == TCP_CLOSE)
872 seq = ntohl(th->seq);
873 if (sk->sk_state != TCP_LISTEN &&
874 !between(seq, tp->snd_una, tp->snd_nxt)) {
875 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
880 case ICMP_SOURCE_QUENCH:
881 /* Just silently ignore these. */
883 case ICMP_PARAMETERPROB:
886 case ICMP_DEST_UNREACH:
887 if (code > NR_ICMP_UNREACH)
890 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
891 if (!sock_owned_by_user(sk))
892 do_pmtu_discovery(sk, iph, info);
896 err = icmp_err_convert[code].errno;
898 case ICMP_TIME_EXCEEDED:
905 switch (sk->sk_state) {
906 struct request_sock *req, **prev;
908 if (sock_owned_by_user(sk))
911 req = tcp_v4_search_req(tp, &prev, th->dest,
912 iph->daddr, iph->saddr);
916 /* ICMPs are not backlogged, hence we cannot get
917 an established socket here.
921 if (seq != tcp_rsk(req)->snt_isn) {
922 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
927 * Still in SYN_RECV, just remove it silently.
928 * There is no good way to pass the error to the newly
929 * created socket, and POSIX does not want network
930 * errors returned from accept().
932 tcp_synq_drop(sk, req, prev);
936 case TCP_SYN_RECV: /* Cannot happen.
937 It can happen, e.g., if SYNs crossed. */
939 if (!sock_owned_by_user(sk)) {
940 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
943 sk->sk_error_report(sk);
947 sk->sk_err_soft = err;
952 /* If we've already connected we will keep trying
953 * until we time out, or the user gives up.
955 * rfc1122 4.2.3.9 allows to consider as hard errors
956 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
957 * but it is obsoleted by pmtu discovery).
959 * Note that in the modern internet, where routing is unreliable
960 * and broken firewalls sit in every dark corner sending random
961 * errors ordered by their masters, even these two messages finally lose
962 * their original sense (even Linux sends invalid PORT_UNREACHs)
964 * Now we are in compliance with RFCs.
969 if (!sock_owned_by_user(sk) && inet->recverr) {
971 sk->sk_error_report(sk);
972 } else { /* Only an error on timeout */
973 sk->sk_err_soft = err;
981 /* This routine computes an IPv4 TCP checksum. */
982 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
985 struct inet_sock *inet = inet_sk(sk);
987 if (skb->ip_summed == CHECKSUM_HW) {
988 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
989 skb->csum = offsetof(struct tcphdr, check);
991 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
992 csum_partial((char *)th,
999 * This routine will send an RST to the other tcp.
1001 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1003 * Answer: if a packet caused RST, it is not for a socket
1004 * existing in our system, if it is matched to a socket,
1005 * it is just duplicate segment or bug in other side's TCP.
1006 * So we build the reply based only on the parameters
1007 * that arrived with the segment.
1008 * Exception: precedence violation. We do not implement it in any case.
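/* The reply below follows the RFC 793 reset rules: if the offending
 * segment carried an ACK, the RST takes its sequence number from that
 * ACK field and carries no ACK of its own; otherwise the RST uses
 * sequence number 0 and ACKs everything the segment occupied
 * (SEG.SEQ + SYN + FIN + data length).
 */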
1011 static void tcp_v4_send_reset(struct sk_buff *skb)
1013 struct tcphdr *th = skb->h.th;
1015 struct ip_reply_arg arg;
1017 /* Never send a reset in response to a reset. */
1021 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1024 /* Swap the send and the receive. */
1025 memset(&rth, 0, sizeof(struct tcphdr));
1026 rth.dest = th->source;
1027 rth.source = th->dest;
1028 rth.doff = sizeof(struct tcphdr) / 4;
1032 rth.seq = th->ack_seq;
1035 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1036 skb->len - (th->doff << 2));
1039 memset(&arg, 0, sizeof arg);
1040 arg.iov[0].iov_base = (unsigned char *)&rth;
1041 arg.iov[0].iov_len = sizeof rth;
1042 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1043 skb->nh.iph->saddr, /*XXX*/
1044 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1045 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1047 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1049 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1050 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1053 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1054 outside socket context, is certainly ugly. What can I do? */
1057 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1060 struct tcphdr *th = skb->h.th;
1065 struct ip_reply_arg arg;
1067 memset(&rep.th, 0, sizeof(struct tcphdr));
1068 memset(&arg, 0, sizeof arg);
1070 arg.iov[0].iov_base = (unsigned char *)&rep;
1071 arg.iov[0].iov_len = sizeof(rep.th);
1073 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1074 (TCPOPT_TIMESTAMP << 8) |
1076 rep.tsopt[1] = htonl(tcp_time_stamp);
1077 rep.tsopt[2] = htonl(ts);
1078 arg.iov[0].iov_len = sizeof(rep);
1081 /* Swap the send and the receive. */
1082 rep.th.dest = th->source;
1083 rep.th.source = th->dest;
1084 rep.th.doff = arg.iov[0].iov_len / 4;
1085 rep.th.seq = htonl(seq);
1086 rep.th.ack_seq = htonl(ack);
1088 rep.th.window = htons(win);
1090 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1091 skb->nh.iph->saddr, /*XXX*/
1092 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1093 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1095 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1097 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1100 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1102 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1104 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1105 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1110 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1112 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1116 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1117 struct request_sock *req)
1120 const struct inet_request_sock *ireq = inet_rsk(req);
1121 struct ip_options *opt = inet_rsk(req)->opt;
1122 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1124 { .daddr = ((opt && opt->srr) ?
1127 .saddr = ireq->loc_addr,
1128 .tos = RT_CONN_FLAGS(sk) } },
1129 .proto = IPPROTO_TCP,
1131 { .sport = inet_sk(sk)->sport,
1132 .dport = ireq->rmt_port } } };
1134 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1135 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1138 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1140 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1147 * Send a SYN-ACK after having received an ACK.
1148 * This still operates on a request_sock only, not on a big
1151 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1152 struct dst_entry *dst)
1154 const struct inet_request_sock *ireq = inet_rsk(req);
1156 struct sk_buff * skb;
1158 /* First, grab a route. */
1159 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1162 skb = tcp_make_synack(sk, dst, req);
1165 struct tcphdr *th = skb->h.th;
1167 th->check = tcp_v4_check(th, skb->len,
1170 csum_partial((char *)th, skb->len,
1173 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1176 if (err == NET_XMIT_CN)
1186 * IPv4 request_sock destructor.
1188 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1190 if (inet_rsk(req)->opt)
1191 kfree(inet_rsk(req)->opt);
1194 static inline void syn_flood_warning(struct sk_buff *skb)
1196 static unsigned long warntime;
1198 if (time_after(jiffies, (warntime + HZ * 60))) {
1201 "possible SYN flooding on port %d. Sending cookies.\n",
1202 ntohs(skb->h.th->dest));
1207 * Save and compile IPv4 options into the request_sock if needed.
1209 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1210 struct sk_buff *skb)
1212 struct ip_options *opt = &(IPCB(skb)->opt);
1213 struct ip_options *dopt = NULL;
1215 if (opt && opt->optlen) {
1216 int opt_size = optlength(opt);
1217 dopt = kmalloc(opt_size, GFP_ATOMIC);
1219 if (ip_options_echo(dopt, skb)) {
1228 struct request_sock_ops tcp_request_sock_ops = {
1230 .obj_size = sizeof(struct tcp_request_sock),
1231 .rtx_syn_ack = tcp_v4_send_synack,
1232 .send_ack = tcp_v4_reqsk_send_ack,
1233 .destructor = tcp_v4_reqsk_destructor,
1234 .send_reset = tcp_v4_send_reset,
1237 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1239 struct inet_request_sock *ireq;
1240 struct tcp_options_received tmp_opt;
1241 struct request_sock *req;
1242 __u32 saddr = skb->nh.iph->saddr;
1243 __u32 daddr = skb->nh.iph->daddr;
1244 __u32 isn = TCP_SKB_CB(skb)->when;
1245 struct dst_entry *dst = NULL;
1246 #ifdef CONFIG_SYN_COOKIES
1247 int want_cookie = 0;
1249 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1252 /* Never answer to SYNs sent to broadcast or multicast */
1253 if (((struct rtable *)skb->dst)->rt_flags &
1254 (RTCF_BROADCAST | RTCF_MULTICAST))
1257 /* TW buckets are converted to open requests without
1258 * limitations; they conserve resources and the peer is
1259 * evidently a real one. */
1261 if (tcp_synq_is_full(sk) && !isn) {
1262 #ifdef CONFIG_SYN_COOKIES
1263 if (sysctl_tcp_syncookies) {
1270 /* Accept backlog is full. If we have already queued enough
1271 * of warm entries in the syn queue, drop the request. It is better than
1272 * clogging the syn queue with openreqs with exponentially increasing timeout. */
1275 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1278 req = reqsk_alloc(&tcp_request_sock_ops);
1282 tcp_clear_options(&tmp_opt);
1283 tmp_opt.mss_clamp = 536;
1284 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1286 tcp_parse_options(skb, &tmp_opt, 0);
1289 tcp_clear_options(&tmp_opt);
1290 tmp_opt.saw_tstamp = 0;
1293 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1294 /* Some OSes (unknown ones, but I see them on a web server which
1295 * contains information interesting only for Windows
1296 * users) do not send their timestamp in the SYN. It is an easy case:
1297 * we simply do not advertise TS support. */
1299 tmp_opt.saw_tstamp = 0;
1300 tmp_opt.tstamp_ok = 0;
1302 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1304 tcp_openreq_init(req, &tmp_opt, skb);
1306 ireq = inet_rsk(req);
1307 ireq->loc_addr = daddr;
1308 ireq->rmt_addr = saddr;
1309 ireq->opt = tcp_v4_save_options(sk, skb);
1311 TCP_ECN_create_request(req, skb->h.th);
1314 #ifdef CONFIG_SYN_COOKIES
1315 syn_flood_warning(skb);
1317 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1319 struct inet_peer *peer = NULL;
1321 /* VJ's idea. We save the last timestamp seen
1322 * from the destination in the peer table when entering
1323 * TIME-WAIT state, and check against it before
1324 * accepting a new connection request.
1326 * If "isn" is not zero, this request hit an alive
1327 * timewait bucket, so all the necessary checks
1328 * are made in the function processing the timewait state. */
1330 if (tmp_opt.saw_tstamp &&
1331 sysctl_tcp_tw_recycle &&
1332 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1333 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1334 peer->v4daddr == saddr) {
1335 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1336 (s32)(peer->tcp_ts - req->ts_recent) >
1338 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1343 /* Kill the following clause, if you dislike this way. */
1344 else if (!sysctl_tcp_syncookies &&
1345 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1346 (sysctl_max_syn_backlog >> 2)) &&
1347 (!peer || !peer->tcp_ts_stamp) &&
1348 (!dst || !dst_metric(dst, RTAX_RTT))) {
1349 /* Without syncookies the last quarter of the
1350 * backlog is filled only with destinations
1351 * proven to be alive.
1352 * It means that we continue to communicate
1353 * with destinations already remembered
1354 * at the moment of the synflood. */
1356 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1357 "request from %u.%u."
1360 ntohs(skb->h.th->source)));
1365 isn = tcp_v4_init_sequence(sk, skb);
1367 tcp_rsk(req)->snt_isn = isn;
1369 if (tcp_v4_send_synack(sk, req, dst))
1375 tcp_v4_synq_add(sk, req);
1382 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1388 * The three way handshake has completed - we got a valid synack -
1389 * now create the new socket.
1391 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1392 struct request_sock *req,
1393 struct dst_entry *dst)
1395 struct inet_request_sock *ireq;
1396 struct inet_sock *newinet;
1397 struct tcp_sock *newtp;
1400 if (sk_acceptq_is_full(sk))
1403 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1406 newsk = tcp_create_openreq_child(sk, req, skb);
1410 sk_setup_caps(newsk, dst);
1412 newtp = tcp_sk(newsk);
1413 newinet = inet_sk(newsk);
1414 ireq = inet_rsk(req);
1415 newinet->daddr = ireq->rmt_addr;
1416 newinet->rcv_saddr = ireq->loc_addr;
1417 newinet->saddr = ireq->loc_addr;
1418 newinet->opt = ireq->opt;
1420 newinet->mc_index = tcp_v4_iif(skb);
1421 newinet->mc_ttl = skb->nh.iph->ttl;
1422 newtp->ext_header_len = 0;
1424 newtp->ext_header_len = newinet->opt->optlen;
1425 newinet->id = newtp->write_seq ^ jiffies;
1427 tcp_sync_mss(newsk, dst_mtu(dst));
1428 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1429 tcp_initialize_rcv_mss(newsk);
1431 __inet_hash(&tcp_hashinfo, newsk, 0);
1432 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1437 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1439 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1444 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1446 struct tcphdr *th = skb->h.th;
1447 struct iphdr *iph = skb->nh.iph;
1448 struct tcp_sock *tp = tcp_sk(sk);
1450 struct request_sock **prev;
1451 /* Find possible connection requests. */
1452 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1453 iph->saddr, iph->daddr);
1455 return tcp_check_req(sk, skb, req, prev);
1457 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1464 if (nsk->sk_state != TCP_TIME_WAIT) {
1468 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1472 #ifdef CONFIG_SYN_COOKIES
1473 if (!th->rst && !th->syn && th->ack)
1474 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1479 static int tcp_v4_checksum_init(struct sk_buff *skb)
1481 if (skb->ip_summed == CHECKSUM_HW) {
1482 skb->ip_summed = CHECKSUM_UNNECESSARY;
1483 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1484 skb->nh.iph->daddr, skb->csum))
1487 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1488 skb->ip_summed = CHECKSUM_NONE;
1490 if (skb->len <= 76) {
1491 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1493 skb_checksum(skb, 0, skb->len, 0)))
1495 skb->ip_summed = CHECKSUM_UNNECESSARY;
1497 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1499 skb->nh.iph->daddr, 0);
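/* Rationale for the 76-byte cutoff above: very short segments are
 * cheap enough to verify in software immediately; for longer ones we
 * only prime skb->csum with the pseudo-header sum and defer the final
 * verification until the data is checksummed while being copied.
 */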
1505 /* The socket must have its spinlock held when we get here.
1508 * We have a potential double-lock case here, so even when
1509 * doing backlog processing we use the BH locking scheme.
1510 * This is because we cannot sleep with the original spinlock
1513 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1515 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1516 TCP_CHECK_TIMER(sk);
1517 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1519 TCP_CHECK_TIMER(sk);
1523 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1526 if (sk->sk_state == TCP_LISTEN) {
1527 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1532 if (tcp_child_process(sk, nsk, skb))
1538 TCP_CHECK_TIMER(sk);
1539 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1541 TCP_CHECK_TIMER(sk);
1545 tcp_v4_send_reset(skb);
1548 /* Be careful here. If this function gets more complicated and
1549 * gcc suffers from register pressure on the x86, sk (in %ebx)
1550 * might be destroyed here. This current version compiles correctly,
1551 * but you have been warned.
1556 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1564 int tcp_v4_rcv(struct sk_buff *skb)
1570 if (skb->pkt_type != PACKET_HOST)
1573 /* Count it even if it's bad */
1574 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1576 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1581 if (th->doff < sizeof(struct tcphdr) / 4)
1583 if (!pskb_may_pull(skb, th->doff * 4))
1586 /* An explanation is required here, I think.
1587 * Packet length and doff are validated by header prediction,
1588 * provided the case of th->doff==0 is eliminated.
1589 * So, we defer the checks. */
1590 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1591 tcp_v4_checksum_init(skb) < 0))
1595 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1596 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1597 skb->len - th->doff * 4);
1598 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1599 TCP_SKB_CB(skb)->when = 0;
1600 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1601 TCP_SKB_CB(skb)->sacked = 0;
1603 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1604 skb->nh.iph->daddr, ntohs(th->dest),
1611 if (sk->sk_state == TCP_TIME_WAIT)
1614 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1615 goto discard_and_relse;
1617 if (sk_filter(sk, skb, 0))
1618 goto discard_and_relse;
1624 if (!sock_owned_by_user(sk)) {
1625 if (!tcp_prequeue(sk, skb))
1626 ret = tcp_v4_do_rcv(sk, skb);
1628 sk_add_backlog(sk, skb);
1636 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1639 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1641 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1643 tcp_v4_send_reset(skb);
1647 /* Discard frame. */
1656 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1657 tcp_tw_put((struct tcp_tw_bucket *) sk);
1661 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1662 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1663 tcp_tw_put((struct tcp_tw_bucket *) sk);
1666 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1667 skb, th, skb->len)) {
1669 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1673 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1674 tcp_tw_put((struct tcp_tw_bucket *)sk);
1678 /* Fall through to ACK */
1681 tcp_v4_timewait_ack(sk, skb);
1685 case TCP_TW_SUCCESS:;
1690 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1692 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1693 struct inet_sock *inet = inet_sk(sk);
1695 sin->sin_family = AF_INET;
1696 sin->sin_addr.s_addr = inet->daddr;
1697 sin->sin_port = inet->dport;
1700 /* VJ's idea. Save the last timestamp seen from this destination
1701 * and hold it at least for the normal timewait interval, to use for duplicate
1702 * segment detection in subsequent connections before they enter the synchronized state. */
1706 int tcp_v4_remember_stamp(struct sock *sk)
1708 struct inet_sock *inet = inet_sk(sk);
1709 struct tcp_sock *tp = tcp_sk(sk);
1710 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1711 struct inet_peer *peer = NULL;
1714 if (!rt || rt->rt_dst != inet->daddr) {
1715 peer = inet_getpeer(inet->daddr, 1);
1719 rt_bind_peer(rt, 1);
1724 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1725 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1726 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1727 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1728 peer->tcp_ts = tp->rx_opt.ts_recent;
1738 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1740 struct inet_peer *peer = NULL;
1742 peer = inet_getpeer(tw->tw_daddr, 1);
1745 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1746 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1747 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1748 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1749 peer->tcp_ts = tw->tw_ts_recent;
1758 struct tcp_func ipv4_specific = {
1759 .queue_xmit = ip_queue_xmit,
1760 .send_check = tcp_v4_send_check,
1761 .rebuild_header = inet_sk_rebuild_header,
1762 .conn_request = tcp_v4_conn_request,
1763 .syn_recv_sock = tcp_v4_syn_recv_sock,
1764 .remember_stamp = tcp_v4_remember_stamp,
1765 .net_header_len = sizeof(struct iphdr),
1766 .setsockopt = ip_setsockopt,
1767 .getsockopt = ip_getsockopt,
1768 .addr2sockaddr = v4_addr2sockaddr,
1769 .sockaddr_len = sizeof(struct sockaddr_in),
1772 /* NOTE: A lot of things are set to zero explicitly by the call to
1773 * sk_alloc(), so they need not be done here. */
1775 static int tcp_v4_init_sock(struct sock *sk)
1777 struct tcp_sock *tp = tcp_sk(sk);
1779 skb_queue_head_init(&tp->out_of_order_queue);
1780 tcp_init_xmit_timers(sk);
1781 tcp_prequeue_init(tp);
1783 tp->rto = TCP_TIMEOUT_INIT;
1784 tp->mdev = TCP_TIMEOUT_INIT;
1786 /* So many TCP implementations out there (incorrectly) count the
1787 * initial SYN frame in their delayed-ACK and congestion control
1788 * algorithms that we must have the following bandaid to talk
1789 * efficiently to them. -DaveM
1793 /* See draft-stevens-tcpca-spec-01 for discussion of the
1794 * initialization of these values.
1796 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1797 tp->snd_cwnd_clamp = ~0;
1798 tp->mss_cache = 536;
1800 tp->reordering = sysctl_tcp_reordering;
1801 tp->ca_ops = &tcp_init_congestion_ops;
1803 sk->sk_state = TCP_CLOSE;
1805 sk->sk_write_space = sk_stream_write_space;
1806 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1808 tp->af_specific = &ipv4_specific;
1810 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1811 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1813 atomic_inc(&tcp_sockets_allocated);
1818 int tcp_v4_destroy_sock(struct sock *sk)
1820 struct tcp_sock *tp = tcp_sk(sk);
1822 tcp_clear_xmit_timers(sk);
1824 tcp_cleanup_congestion_control(tp);
1826 /* Clean up the write buffer. */
1827 sk_stream_writequeue_purge(sk);
1829 /* Cleans up our, hopefully empty, out_of_order_queue. */
1830 __skb_queue_purge(&tp->out_of_order_queue);
1832 /* Clean the prequeue; it really must be empty. */
1833 __skb_queue_purge(&tp->ucopy.prequeue);
1835 /* Clean up a referenced TCP bind bucket. */
1836 if (inet_sk(sk)->bind_hash)
1837 inet_put_port(&tcp_hashinfo, sk);
1840 * If sendmsg cached page exists, toss it.
1842 if (sk->sk_sndmsg_page) {
1843 __free_page(sk->sk_sndmsg_page);
1844 sk->sk_sndmsg_page = NULL;
1847 atomic_dec(&tcp_sockets_allocated);
1852 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1854 #ifdef CONFIG_PROC_FS
1855 /* Proc filesystem TCP sock list dumping. */
1857 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1859 return hlist_empty(head) ? NULL :
1860 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1863 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1865 return tw->tw_node.next ?
1866 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1869 static void *listening_get_next(struct seq_file *seq, void *cur)
1871 struct tcp_sock *tp;
1872 struct hlist_node *node;
1873 struct sock *sk = cur;
1874 struct tcp_iter_state* st = seq->private;
1878 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1884 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1885 struct request_sock *req = cur;
1887 tp = tcp_sk(st->syn_wait_sk);
1891 if (req->rsk_ops->family == st->family) {
1897 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1900 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1902 sk = sk_next(st->syn_wait_sk);
1903 st->state = TCP_SEQ_STATE_LISTENING;
1904 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1907 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1908 if (reqsk_queue_len(&tp->accept_queue))
1910 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1914 sk_for_each_from(sk, node) {
1915 if (sk->sk_family == st->family) {
1920 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1921 if (reqsk_queue_len(&tp->accept_queue)) {
1923 st->uid = sock_i_uid(sk);
1924 st->syn_wait_sk = sk;
1925 st->state = TCP_SEQ_STATE_OPENREQ;
1929 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1931 if (++st->bucket < INET_LHTABLE_SIZE) {
1932 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1940 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1942 void *rc = listening_get_next(seq, NULL);
1944 while (rc && *pos) {
1945 rc = listening_get_next(seq, rc);
1951 static void *established_get_first(struct seq_file *seq)
1953 struct tcp_iter_state* st = seq->private;
1956 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1958 struct hlist_node *node;
1959 struct tcp_tw_bucket *tw;
1961 /* We can reschedule _before_ having picked the target: */
1962 cond_resched_softirq();
1964 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1965 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1966 if (sk->sk_family != st->family) {
1972 st->state = TCP_SEQ_STATE_TIME_WAIT;
1973 tw_for_each(tw, node,
1974 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1975 if (tw->tw_family != st->family) {
1981 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1982 st->state = TCP_SEQ_STATE_ESTABLISHED;
1988 static void *established_get_next(struct seq_file *seq, void *cur)
1990 struct sock *sk = cur;
1991 struct tcp_tw_bucket *tw;
1992 struct hlist_node *node;
1993 struct tcp_iter_state* st = seq->private;
1997 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2001 while (tw && tw->tw_family != st->family) {
2008 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2009 st->state = TCP_SEQ_STATE_ESTABLISHED;
2011 /* We can reschedule between buckets: */
2012 cond_resched_softirq();
2014 if (++st->bucket < tcp_hashinfo.ehash_size) {
2015 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2016 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2024 sk_for_each_from(sk, node) {
2025 if (sk->sk_family == st->family)
2029 st->state = TCP_SEQ_STATE_TIME_WAIT;
2030 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2038 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2040 void *rc = established_get_first(seq);
2043 rc = established_get_next(seq, rc);
2049 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2052 struct tcp_iter_state* st = seq->private;
2054 inet_listen_lock(&tcp_hashinfo);
2055 st->state = TCP_SEQ_STATE_LISTENING;
2056 rc = listening_get_idx(seq, &pos);
2059 inet_listen_unlock(&tcp_hashinfo);
2061 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062 rc = established_get_idx(seq, pos);
2068 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2070 struct tcp_iter_state* st = seq->private;
2071 st->state = TCP_SEQ_STATE_LISTENING;
2073 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2076 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2079 struct tcp_iter_state* st;
2081 if (v == SEQ_START_TOKEN) {
2082 rc = tcp_get_idx(seq, 0);
2087 switch (st->state) {
2088 case TCP_SEQ_STATE_OPENREQ:
2089 case TCP_SEQ_STATE_LISTENING:
2090 rc = listening_get_next(seq, v);
2092 inet_listen_unlock(&tcp_hashinfo);
2094 st->state = TCP_SEQ_STATE_ESTABLISHED;
2095 rc = established_get_first(seq);
2098 case TCP_SEQ_STATE_ESTABLISHED:
2099 case TCP_SEQ_STATE_TIME_WAIT:
2100 rc = established_get_next(seq, v);
2108 static void tcp_seq_stop(struct seq_file *seq, void *v)
2110 struct tcp_iter_state* st = seq->private;
2112 switch (st->state) {
2113 case TCP_SEQ_STATE_OPENREQ:
2115 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2116 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2118 case TCP_SEQ_STATE_LISTENING:
2119 if (v != SEQ_START_TOKEN)
2120 inet_listen_unlock(&tcp_hashinfo);
2122 case TCP_SEQ_STATE_TIME_WAIT:
2123 case TCP_SEQ_STATE_ESTABLISHED:
2125 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2131 static int tcp_seq_open(struct inode *inode, struct file *file)
2133 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2134 struct seq_file *seq;
2135 struct tcp_iter_state *s;
2138 if (unlikely(afinfo == NULL))
2141 s = kmalloc(sizeof(*s), GFP_KERNEL);
2144 memset(s, 0, sizeof(*s));
2145 s->family = afinfo->family;
2146 s->seq_ops.start = tcp_seq_start;
2147 s->seq_ops.next = tcp_seq_next;
2148 s->seq_ops.show = afinfo->seq_show;
2149 s->seq_ops.stop = tcp_seq_stop;
2151 rc = seq_open(file, &s->seq_ops);
2154 seq = file->private_data;
2163 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2166 struct proc_dir_entry *p;
2170 afinfo->seq_fops->owner = afinfo->owner;
2171 afinfo->seq_fops->open = tcp_seq_open;
2172 afinfo->seq_fops->read = seq_read;
2173 afinfo->seq_fops->llseek = seq_lseek;
2174 afinfo->seq_fops->release = seq_release_private;
2176 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2184 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2188 proc_net_remove(afinfo->name);
2189 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2192 static void get_openreq4(struct sock *sk, struct request_sock *req,
2193 char *tmpbuf, int i, int uid)
2195 const struct inet_request_sock *ireq = inet_rsk(req);
2196 int ttd = req->expires - jiffies;
2198 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2199 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2202 ntohs(inet_sk(sk)->sport),
2204 ntohs(ireq->rmt_port),
2206 0, 0, /* could print option size, but that is af dependent. */
2207 1, /* timers active (only the expire timer) */
2208 jiffies_to_clock_t(ttd),
2211 0, /* non standard timer */
2212 0, /* open_requests have no inode */
2213 atomic_read(&sk->sk_refcnt),
2217 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2220 unsigned long timer_expires;
2221 struct tcp_sock *tp = tcp_sk(sp);
2222 struct inet_sock *inet = inet_sk(sp);
2223 unsigned int dest = inet->daddr;
2224 unsigned int src = inet->rcv_saddr;
2225 __u16 destp = ntohs(inet->dport);
2226 __u16 srcp = ntohs(inet->sport);
2228 if (tp->pending == TCP_TIME_RETRANS) {
2230 timer_expires = tp->timeout;
2231 } else if (tp->pending == TCP_TIME_PROBE0) {
2233 timer_expires = tp->timeout;
2234 } else if (timer_pending(&sp->sk_timer)) {
2236 timer_expires = sp->sk_timer.expires;
2239 timer_expires = jiffies;
2242 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2243 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2244 i, src, srcp, dest, destp, sp->sk_state,
2245 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2247 jiffies_to_clock_t(timer_expires - jiffies),
2252 atomic_read(&sp->sk_refcnt), sp,
2253 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2255 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2258 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2260 unsigned int dest, src;
2262 int ttd = tw->tw_ttd - jiffies;
2267 dest = tw->tw_daddr;
2268 src = tw->tw_rcv_saddr;
2269 destp = ntohs(tw->tw_dport);
2270 srcp = ntohs(tw->tw_sport);
2272 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2273 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2274 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2275 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2276 atomic_read(&tw->tw_refcnt), tw);
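/* For reference, a /proc/net/tcp entry produced by the helpers above
 * looks roughly like this for a listener on 0.0.0.0:22 (addresses and
 * ports are hex, state 0A == TCP_LISTEN):
 *
 *   0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 */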
2281 static int tcp4_seq_show(struct seq_file *seq, void *v)
2283 struct tcp_iter_state* st;
2284 char tmpbuf[TMPSZ + 1];
2286 if (v == SEQ_START_TOKEN) {
2287 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2288 " sl local_address rem_address st tx_queue "
2289 "rx_queue tr tm->when retrnsmt uid timeout "
2295 switch (st->state) {
2296 case TCP_SEQ_STATE_LISTENING:
2297 case TCP_SEQ_STATE_ESTABLISHED:
2298 get_tcp4_sock(v, tmpbuf, st->num);
2300 case TCP_SEQ_STATE_OPENREQ:
2301 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2303 case TCP_SEQ_STATE_TIME_WAIT:
2304 get_timewait4_sock(v, tmpbuf, st->num);
2307 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2312 static struct file_operations tcp4_seq_fops;
2313 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2314 .owner = THIS_MODULE,
2317 .seq_show = tcp4_seq_show,
2318 .seq_fops = &tcp4_seq_fops,
2321 int __init tcp4_proc_init(void)
2323 return tcp_proc_register(&tcp4_seq_afinfo);
2326 void tcp4_proc_exit(void)
2328 tcp_proc_unregister(&tcp4_seq_afinfo);
2330 #endif /* CONFIG_PROC_FS */
2332 struct proto tcp_prot = {
2334 .owner = THIS_MODULE,
2336 .connect = tcp_v4_connect,
2337 .disconnect = tcp_disconnect,
2338 .accept = tcp_accept,
2340 .init = tcp_v4_init_sock,
2341 .destroy = tcp_v4_destroy_sock,
2342 .shutdown = tcp_shutdown,
2343 .setsockopt = tcp_setsockopt,
2344 .getsockopt = tcp_getsockopt,
2345 .sendmsg = tcp_sendmsg,
2346 .recvmsg = tcp_recvmsg,
2347 .backlog_rcv = tcp_v4_do_rcv,
2348 .hash = tcp_v4_hash,
2349 .unhash = tcp_unhash,
2350 .get_port = tcp_v4_get_port,
2351 .enter_memory_pressure = tcp_enter_memory_pressure,
2352 .sockets_allocated = &tcp_sockets_allocated,
2353 .memory_allocated = &tcp_memory_allocated,
2354 .memory_pressure = &tcp_memory_pressure,
2355 .sysctl_mem = sysctl_tcp_mem,
2356 .sysctl_wmem = sysctl_tcp_wmem,
2357 .sysctl_rmem = sysctl_tcp_rmem,
2358 .max_header = MAX_TCP_HEADER,
2359 .obj_size = sizeof(struct tcp_sock),
2360 .rsk_prot = &tcp_request_sock_ops,
2365 void __init tcp_v4_init(struct net_proto_family *ops)
2367 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2369 panic("Failed to create the TCP control socket.\n");
2370 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2371 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2373 /* Unhash it so that IP input processing does not even
2374 * see it; we do not wish this socket to see incoming packets. */
2377 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2380 EXPORT_SYMBOL(ipv4_specific);
2381 EXPORT_SYMBOL(inet_bind_bucket_create);
2382 EXPORT_SYMBOL(tcp_hashinfo);
2383 EXPORT_SYMBOL(tcp_prot);
2384 EXPORT_SYMBOL(tcp_unhash);
2385 EXPORT_SYMBOL(tcp_v4_conn_request);
2386 EXPORT_SYMBOL(tcp_v4_connect);
2387 EXPORT_SYMBOL(tcp_v4_do_rcv);
2388 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2389 EXPORT_SYMBOL(tcp_v4_send_check);
2390 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2392 #ifdef CONFIG_PROC_FS
2393 EXPORT_SYMBOL(tcp_proc_register);
2394 EXPORT_SYMBOL(tcp_proc_unregister);
2396 EXPORT_SYMBOL(sysctl_local_port_range);
2397 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2398 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);