err.no Git - linux-2.6/blob - net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/ipv6.h>
  70 #include <net/inet_common.h>
  71 #include <net/xfrm.h>
  72
  73 #include <linux/inet.h>
  74 #include <linux/ipv6.h>
  75 #include <linux/stddef.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/seq_file.h>
  78
  79 extern int sysctl_ip_dynaddr;
  80 int sysctl_tcp_tw_reuse;
  81 int sysctl_tcp_low_latency;
  82
  83 /* Check TCP sequence numbers in ICMP packets. */
  84 #define ICMP_MIN_LENGTH 8
  85
  86 /* Socket used for sending RSTs */
  87 static struct socket *tcp_socket;
  88
  89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  90                        struct sk_buff *skb);
  91
  92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
  93         .lhash_lock     = RW_LOCK_UNLOCKED,
  94         .lhash_users    = ATOMIC_INIT(0),
  95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
  96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
  97         .port_rover     = 1024 - 1,
  98 };
  99
 100 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 101 {
 102         return inet_csk_get_port(&tcp_hashinfo, sk, snum);
 103 }
 104
 105 static void tcp_v4_hash(struct sock *sk)
 106 {
 107         inet_hash(&tcp_hashinfo, sk);
 108 }
 109
 110 void tcp_unhash(struct sock *sk)
 111 {
 112         inet_unhash(&tcp_hashinfo, sk);
 113 }
 114
 115 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 116 {
 117         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 118                                           skb->nh.iph->saddr,
 119                                           skb->h.th->dest,
 120                                           skb->h.th->source);
 121 }
 122
 123 /* called with local bh disabled */
 124 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 125                                       struct inet_timewait_sock **twp)
 126 {
 127         struct inet_sock *inet = inet_sk(sk);
 128         u32 daddr = inet->rcv_saddr;
 129         u32 saddr = inet->daddr;
 130         int dif = sk->sk_bound_dev_if;
 131         INET_ADDR_COOKIE(acookie, saddr, daddr)
 132         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
 133         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
 134         struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
 135         struct sock *sk2;
 136         const struct hlist_node *node;
 137         struct inet_timewait_sock *tw;
 138
 139         write_lock(&head->lock);
 140
 141         /* Check TIME-WAIT sockets first. */
 142         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
 143                 tw = inet_twsk(sk2);
 144
 145                 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 146                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 147                         struct tcp_sock *tp = tcp_sk(sk);
 148
 149                         /* With PAWS, it is safe from the viewpoint
 150                            of data integrity. Even without PAWS it
 151                            is safe provided sequence spaces do not
 152                            overlap i.e. at data rates <= 80Mbit/sec.
 153
 154                            Actually, the idea is close to VJ's one,
 155                            only timestamp cache is held not per host,
 156                            but per port pair and TW bucket is used
 157                            as state holder.
 158
 159                            If TW bucket has been already destroyed we
 160                            fall back to VJ's scheme and use initial
 161                            timestamp retrieved from peer table.
 162                          */
 163                         if (tcptw->tw_ts_recent_stamp &&
 164                             (!twp || (sysctl_tcp_tw_reuse &&
 165                                       xtime.tv_sec -
 166                                       tcptw->tw_ts_recent_stamp > 1))) {
 167                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 168                                 if (tp->write_seq == 0)
 169                                         tp->write_seq = 1;
 170                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 171                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 172                                 sock_hold(sk2);
 173                                 goto unique;
 174                         } else
 175                                 goto not_unique;
 176                 }
 177         }
 178         tw = NULL;
 179
 180         /* And established part... */
 181         sk_for_each(sk2, node, &head->chain) {
 182                 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 183                         goto not_unique;
 184         }
 185
 186 unique:
 187         /* Must record num and sport now. Otherwise we will see
 188          * in hash table socket with a funny identity. */
 189         inet->num = lport;
 190         inet->sport = htons(lport);
 191         sk->sk_hashent = hash;
 192         BUG_TRAP(sk_unhashed(sk));
 193         __sk_add_node(sk, &head->chain);
 194         sock_prot_inc_use(sk->sk_prot);
 195         write_unlock(&head->lock);
 196
 197         if (twp) {
 198                 *twp = tw;
 199                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 200         } else if (tw) {
 201                 /* Silly. Should hash-dance instead... */
 202                 inet_twsk_deschedule(tw, &tcp_death_row);
 203                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 204
 205                 inet_twsk_put(tw);
 206         }
 207
 208         return 0;
 209
 210 not_unique:
 211         write_unlock(&head->lock);
 212         return -EADDRNOTAVAIL;
 213 }
 214
 215 static inline u32 connect_port_offset(const struct sock *sk)
 216 {
 217         const struct inet_sock *inet = inet_sk(sk);
 218
 219         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 220                                          inet->dport);
 221 }
 222
 223 /*
 224  * Bind a port for a connect operation and hash it.
 225  */
 226 static inline int tcp_v4_hash_connect(struct sock *sk)
 227 {
 228         const unsigned short snum = inet_sk(sk)->num;
 229         struct inet_bind_hashbucket *head;
 230         struct inet_bind_bucket *tb;
 231         int ret;
 232
 233         if (!snum) {
 234                 int low = sysctl_local_port_range[0];
 235                 int high = sysctl_local_port_range[1];
 236                 int range = high - low;
 237                 int i;
 238                 int port;
 239                 static u32 hint;
 240                 u32 offset = hint + connect_port_offset(sk);
 241                 struct hlist_node *node;
 242                 struct inet_timewait_sock *tw = NULL;
 243
 244                 local_bh_disable();
 245                 for (i = 1; i <= range; i++) {
 246                         port = low + (i + offset) % range;
 247                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
 248                         spin_lock(&head->lock);
 249
 250                         /* Does not bother with rcv_saddr checks,
 251                          * because the established check is already
 252                          * unique enough.
 253                          */
 254                         inet_bind_bucket_for_each(tb, node, &head->chain) {
 255                                 if (tb->port == port) {
 256                                         BUG_TRAP(!hlist_empty(&tb->owners));
 257                                         if (tb->fastreuse >= 0)
 258                                                 goto next_port;
 259                                         if (!__tcp_v4_check_established(sk,
 260                                                                         port,
 261                                                                         &tw))
 262                                                 goto ok;
 263                                         goto next_port;
 264                                 }
 265                         }
 266
 267                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
 268                         if (!tb) {
 269                                 spin_unlock(&head->lock);
 270                                 break;
 271                         }
 272                         tb->fastreuse = -1;
 273                         goto ok;
 274
 275                 next_port:
 276                         spin_unlock(&head->lock);
 277                 }
 278                 local_bh_enable();
 279
 280                 return -EADDRNOTAVAIL;
 281
 282 ok:
 283                 hint += i;
 284
 285                 /* Head lock still held and bh's disabled */
 286                 inet_bind_hash(sk, tb, port);
 287                 if (sk_unhashed(sk)) {
 288                         inet_sk(sk)->sport = htons(port);
 289                         __inet_hash(&tcp_hashinfo, sk, 0);
 290                 }
 291                 spin_unlock(&head->lock);
 292
 293                 if (tw) {
 294                         inet_twsk_deschedule(tw, &tcp_death_row);;
 295                         inet_twsk_put(tw);
 296                 }
 297
 298                 ret = 0;
 299                 goto out;
 300         }
 301
 302         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
 303         tb  = inet_csk(sk)->icsk_bind_hash;
 304         spin_lock_bh(&head->lock);
 305         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 306                 __inet_hash(&tcp_hashinfo, sk, 0);
 307                 spin_unlock_bh(&head->lock);
 308                 return 0;
 309         } else {
 310                 spin_unlock(&head->lock);
 311                 /* No definite answer... Walk to established hash table */
 312                 ret = __tcp_v4_check_established(sk, snum, NULL);
 313 out:
 314                 local_bh_enable();
 315                 return ret;
 316         }
 317 }
 318
 319 /* This will initiate an outgoing connection. */
 320 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 321 {
 322         struct inet_sock *inet = inet_sk(sk);
 323         struct tcp_sock *tp = tcp_sk(sk);
 324         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 325         struct rtable *rt;
 326         u32 daddr, nexthop;
 327         int tmp;
 328         int err;
 329
 330         if (addr_len < sizeof(struct sockaddr_in))
 331                 return -EINVAL;
 332
 333         if (usin->sin_family != AF_INET)
 334                 return -EAFNOSUPPORT;
 335
 336         nexthop = daddr = usin->sin_addr.s_addr;
 337         if (inet->opt && inet->opt->srr) {
 338                 if (!daddr)
 339                         return -EINVAL;
 340                 nexthop = inet->opt->faddr;
 341         }
 342
 343         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 344                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 345                                IPPROTO_TCP,
 346                                inet->sport, usin->sin_port, sk);
 347         if (tmp < 0)
 348                 return tmp;
 349
 350         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 351                 ip_rt_put(rt);
 352                 return -ENETUNREACH;
 353         }
 354
 355         if (!inet->opt || !inet->opt->srr)
 356                 daddr = rt->rt_dst;
 357
 358         if (!inet->saddr)
 359                 inet->saddr = rt->rt_src;
 360         inet->rcv_saddr = inet->saddr;
 361
 362         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 363                 /* Reset inherited state */
 364                 tp->rx_opt.ts_recent       = 0;
 365                 tp->rx_opt.ts_recent_stamp = 0;
 366                 tp->write_seq              = 0;
 367         }
 368
 369         if (tcp_death_row.sysctl_tw_recycle &&
 370             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 371                 struct inet_peer *peer = rt_get_peer(rt);
 372
 373                 /* VJ's idea. We save last timestamp seen from
 374                  * the destination in peer table, when entering state TIME-WAIT
 375                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 376                  */
 377
 378                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 379                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 380                         tp->rx_opt.ts_recent = peer->tcp_ts;
 381                 }
 382         }
 383
 384         inet->dport = usin->sin_port;
 385         inet->daddr = daddr;
 386
 387         tp->ext_header_len = 0;
 388         if (inet->opt)
 389                 tp->ext_header_len = inet->opt->optlen;
 390
 391         tp->rx_opt.mss_clamp = 536;
 392
 393         /* Socket identity is still unknown (sport may be zero).
 394          * However we set state to SYN-SENT and not releasing socket
 395          * lock select source port, enter ourselves into the hash tables and
 396          * complete initialization after this.
 397          */
 398         tcp_set_state(sk, TCP_SYN_SENT);
 399         err = tcp_v4_hash_connect(sk);
 400         if (err)
 401                 goto failure;
 402
 403         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 404         if (err)
 405                 goto failure;
 406
 407         /* OK, now commit destination to socket.  */
 408         sk_setup_caps(sk, &rt->u.dst);
 409
 410         if (!tp->write_seq)
 411                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 412                                                            inet->daddr,
 413                                                            inet->sport,
 414                                                            usin->sin_port);
 415
 416         inet->id = tp->write_seq ^ jiffies;
 417
 418         err = tcp_connect(sk);
 419         rt = NULL;
 420         if (err)
 421                 goto failure;
 422
 423         return 0;
 424
 425 failure:
 426         /* This unhashes the socket and releases the local port, if necessary. */
 427         tcp_set_state(sk, TCP_CLOSE);
 428         ip_rt_put(rt);
 429         sk->sk_route_caps = 0;
 430         inet->dport = 0;
 431         return err;
 432 }
 433
 434 /*
 435  * This routine does path mtu discovery as defined in RFC1191.
 436  */
 437 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 438                                      u32 mtu)
 439 {
 440         struct dst_entry *dst;
 441         struct inet_sock *inet = inet_sk(sk);
 442         struct tcp_sock *tp = tcp_sk(sk);
 443
 444         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 445          * send out by Linux are always <576bytes so they should go through
 446          * unfragmented).
 447          */
 448         if (sk->sk_state == TCP_LISTEN)
 449                 return;
 450
 451         /* We don't check in the destentry if pmtu discovery is forbidden
 452          * on this route. We just assume that no packet_to_big packets
 453          * are send back when pmtu discovery is not active.
 454          * There is a small race when the user changes this flag in the
 455          * route, but I think that's acceptable.
 456          */
 457         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 458                 return;
 459
 460         dst->ops->update_pmtu(dst, mtu);
 461
 462         /* Something is about to be wrong... Remember soft error
 463          * for the case, if this connection will not able to recover.
 464          */
 465         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 466                 sk->sk_err_soft = EMSGSIZE;
 467
 468         mtu = dst_mtu(dst);
 469
 470         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 471             tp->pmtu_cookie > mtu) {
 472                 tcp_sync_mss(sk, mtu);
 473
 474                 /* Resend the TCP packet because it's
 475                  * clear that the old packet has been
 476                  * dropped. This is the new "fast" path mtu
 477                  * discovery.
 478                  */
 479                 tcp_simple_retransmit(sk);
 480         } /* else let the usual retransmit timer handle it */
 481 }
 482
 483 /*
 484  * This routine is called by the ICMP module when it gets some
 485  * sort of error condition.  If err < 0 then the socket should
 486  * be closed and the error returned to the user.  If err > 0
 487  * it's just the icmp type << 8 | icmp code.  After adjustment
 488  * header points to the first 8 bytes of the tcp header.  We need
 489  * to find the appropriate port.
 490  *
 491  * The locking strategy used here is very "optimistic". When
 492  * someone else accesses the socket the ICMP is just dropped
 493  * and for some paths there is no check at all.
 494  * A more general error queue to queue errors for later handling
 495  * is probably better.
 496  *
 497  */
 498
 499 void tcp_v4_err(struct sk_buff *skb, u32 info)
 500 {
 501         struct iphdr *iph = (struct iphdr *)skb->data;
 502         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 503         struct tcp_sock *tp;
 504         struct inet_sock *inet;
 505         int type = skb->h.icmph->type;
 506         int code = skb->h.icmph->code;
 507         struct sock *sk;
 508         __u32 seq;
 509         int err;
 510
 511         if (skb->len < (iph->ihl << 2) + 8) {
 512                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 513                 return;
 514         }
 515
 516         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
 517                          th->source, inet_iif(skb));
 518         if (!sk) {
 519                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 520                 return;
 521         }
 522         if (sk->sk_state == TCP_TIME_WAIT) {
 523                 inet_twsk_put((struct inet_timewait_sock *)sk);
 524                 return;
 525         }
 526
 527         bh_lock_sock(sk);
 528         /* If too many ICMPs get dropped on busy
 529          * servers this needs to be solved differently.
 530          */
 531         if (sock_owned_by_user(sk))
 532                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
 533
 534         if (sk->sk_state == TCP_CLOSE)
 535                 goto out;
 536
 537         tp = tcp_sk(sk);
 538         seq = ntohl(th->seq);
 539         if (sk->sk_state != TCP_LISTEN &&
 540             !between(seq, tp->snd_una, tp->snd_nxt)) {
 541                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
 542                 goto out;
 543         }
 544
 545         switch (type) {
 546         case ICMP_SOURCE_QUENCH:
 547                 /* Just silently ignore these. */
 548                 goto out;
 549         case ICMP_PARAMETERPROB:
 550                 err = EPROTO;
 551                 break;
 552         case ICMP_DEST_UNREACH:
 553                 if (code > NR_ICMP_UNREACH)
 554                         goto out;
 555
 556                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 557                         if (!sock_owned_by_user(sk))
 558                                 do_pmtu_discovery(sk, iph, info);
 559                         goto out;
 560                 }
 561
 562                 err = icmp_err_convert[code].errno;
 563                 break;
 564         case ICMP_TIME_EXCEEDED:
 565                 err = EHOSTUNREACH;
 566                 break;
 567         default:
 568                 goto out;
 569         }
 570
 571         switch (sk->sk_state) {
 572                 struct request_sock *req, **prev;
 573         case TCP_LISTEN:
 574                 if (sock_owned_by_user(sk))
 575                         goto out;
 576
 577                 req = inet_csk_search_req(sk, &prev, th->dest,
 578                                           iph->daddr, iph->saddr);
 579                 if (!req)
 580                         goto out;
 581
 582                 /* ICMPs are not backlogged, hence we cannot get
 583                    an established socket here.
 584                  */
 585                 BUG_TRAP(!req->sk);
 586
 587                 if (seq != tcp_rsk(req)->snt_isn) {
 588                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 589                         goto out;
 590                 }
 591
 592                 /*
 593                  * Still in SYN_RECV, just remove it silently.
 594                  * There is no good way to pass the error to the newly
 595                  * created socket, and POSIX does not want network
 596                  * errors returned from accept().
 597                  */
 598                 inet_csk_reqsk_queue_drop(sk, req, prev);
 599                 goto out;
 600
 601         case TCP_SYN_SENT:
 602         case TCP_SYN_RECV:  /* Cannot happen.
 603                                It can f.e. if SYNs crossed.
 604                              */
 605                 if (!sock_owned_by_user(sk)) {
 606                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 607                         sk->sk_err = err;
 608
 609                         sk->sk_error_report(sk);
 610
 611                         tcp_done(sk);
 612                 } else {
 613                         sk->sk_err_soft = err;
 614                 }
 615                 goto out;
 616         }
 617
 618         /* If we've already connected we will keep trying
 619          * until we time out, or the user gives up.
 620          *
 621          * rfc1122 4.2.3.9 allows to consider as hard errors
 622          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 623          * but it is obsoleted by pmtu discovery).
 624          *
 625          * Note, that in modern internet, where routing is unreliable
 626          * and in each dark corner broken firewalls sit, sending random
 627          * errors ordered by their masters even this two messages finally lose
 628          * their original sense (even Linux sends invalid PORT_UNREACHs)
 629          *
 630          * Now we are in compliance with RFCs.
 631          *                                                      --ANK (980905)
 632          */
 633
 634         inet = inet_sk(sk);
 635         if (!sock_owned_by_user(sk) && inet->recverr) {
 636                 sk->sk_err = err;
 637                 sk->sk_error_report(sk);
 638         } else  { /* Only an error on timeout */
 639                 sk->sk_err_soft = err;
 640         }
 641
 642 out:
 643         bh_unlock_sock(sk);
 644         sock_put(sk);
 645 }
 646
 647 /* This routine computes an IPv4 TCP checksum. */
 648 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
 649                        struct sk_buff *skb)
 650 {
 651         struct inet_sock *inet = inet_sk(sk);
 652
 653         if (skb->ip_summed == CHECKSUM_HW) {
 654                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
 655                 skb->csum = offsetof(struct tcphdr, check);
 656         } else {
 657                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
 658                                          csum_partial((char *)th,
 659                                                       th->doff << 2,
 660                                                       skb->csum));
 661         }
 662 }
 663
 664 /*
 665  *      This routine will send an RST to the other tcp.
 666  *
 667  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 668  *                    for reset.
 669  *      Answer: if a packet caused RST, it is not for a socket
 670  *              existing in our system, if it is matched to a socket,
 671  *              it is just duplicate segment or bug in other side's TCP.
 672  *              So that we build reply only basing on parameters
 673  *              arrived with segment.
 674  *      Exception: precedence violation. We do not implement it in any case.
 675  */
 676
 677 static void tcp_v4_send_reset(struct sk_buff *skb)
 678 {
 679         struct tcphdr *th = skb->h.th;
 680         struct tcphdr rth;
 681         struct ip_reply_arg arg;
 682
 683         /* Never send a reset in response to a reset. */
 684         if (th->rst)
 685                 return;
 686
 687         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
 688                 return;
 689
 690         /* Swap the send and the receive. */
 691         memset(&rth, 0, sizeof(struct tcphdr));
 692         rth.dest   = th->source;
 693         rth.source = th->dest;
 694         rth.doff   = sizeof(struct tcphdr) / 4;
 695         rth.rst    = 1;
 696
 697         if (th->ack) {
 698                 rth.seq = th->ack_seq;
 699         } else {
 700                 rth.ack = 1;
 701                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 702                                     skb->len - (th->doff << 2));
 703         }
 704
 705         memset(&arg, 0, sizeof arg);
 706         arg.iov[0].iov_base = (unsigned char *)&rth;
 707         arg.iov[0].iov_len  = sizeof rth;
 708         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 709                                       skb->nh.iph->saddr, /*XXX*/
 710                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
 711         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 712
 713         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
 714
 715         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 716         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 717 }
 718
 719 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 720    outside socket context is ugly, certainly. What can I do?
 721  */
 722
 723 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 724                             u32 win, u32 ts)
 725 {
 726         struct tcphdr *th = skb->h.th;
 727         struct {
 728                 struct tcphdr th;
 729                 u32 tsopt[3];
 730         } rep;
 731         struct ip_reply_arg arg;
 732
 733         memset(&rep.th, 0, sizeof(struct tcphdr));
 734         memset(&arg, 0, sizeof arg);
 735
 736         arg.iov[0].iov_base = (unsigned char *)&rep;
 737         arg.iov[0].iov_len  = sizeof(rep.th);
 738         if (ts) {
 739                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 740                                      (TCPOPT_TIMESTAMP << 8) |
 741                                      TCPOLEN_TIMESTAMP);
 742                 rep.tsopt[1] = htonl(tcp_time_stamp);
 743                 rep.tsopt[2] = htonl(ts);
 744                 arg.iov[0].iov_len = sizeof(rep);
 745         }
 746
 747         /* Swap the send and the receive. */
 748         rep.th.dest    = th->source;
 749         rep.th.source  = th->dest;
 750         rep.th.doff    = arg.iov[0].iov_len / 4;
 751         rep.th.seq     = htonl(seq);
 752         rep.th.ack_seq = htonl(ack);
 753         rep.th.ack     = 1;
 754         rep.th.window  = htons(win);
 755
 756         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 757                                       skb->nh.iph->saddr, /*XXX*/
 758                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 759         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 760
 761         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 762
 763         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 764 }
 765
 766 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 767 {
 768         struct inet_timewait_sock *tw = inet_twsk(sk);
 769         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 770
 771         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 772                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
 773
 774         inet_twsk_put(tw);
 775 }
 776
 777 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 778 {
 779         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 780                         req->ts_recent);
 781 }
 782
 783 /*
 784  *      Send a SYN-ACK after having received an ACK.
 785  *      This still operates on a request_sock only, not on a big
 786  *      socket.
 787  */
 788 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 789                               struct dst_entry *dst)
 790 {
 791         const struct inet_request_sock *ireq = inet_rsk(req);
 792         int err = -1;
 793         struct sk_buff * skb;
 794
 795         /* First, grab a route. */
 796         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 797                 goto out;
 798
 799         skb = tcp_make_synack(sk, dst, req);
 800
 801         if (skb) {
 802                 struct tcphdr *th = skb->h.th;
 803
 804                 th->check = tcp_v4_check(th, skb->len,
 805                                          ireq->loc_addr,
 806                                          ireq->rmt_addr,
 807                                          csum_partial((char *)th, skb->len,
 808                                                       skb->csum));
 809
 810                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 811                                             ireq->rmt_addr,
 812                                             ireq->opt);
 813                 if (err == NET_XMIT_CN)
 814                         err = 0;
 815         }
 816
 817 out:
 818         dst_release(dst);
 819         return err;
 820 }
 821
 822 /*
 823  *      IPv4 request_sock destructor.
 824  */
 825 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 826 {
 827         if (inet_rsk(req)->opt)
 828                 kfree(inet_rsk(req)->opt);
 829 }
 830
 831 static inline void syn_flood_warning(struct sk_buff *skb)
 832 {
 833         static unsigned long warntime;
 834
 835         if (time_after(jiffies, (warntime + HZ * 60))) {
 836                 warntime = jiffies;
 837                 printk(KERN_INFO
 838                        "possible SYN flooding on port %d. Sending cookies.\n",
 839                        ntohs(skb->h.th->dest));
 840         }
 841 }
 842
 843 /*
 844  * Save and compile IPv4 options into the request_sock if needed.
 845  */
 846 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
 847                                                      struct sk_buff *skb)
 848 {
 849         struct ip_options *opt = &(IPCB(skb)->opt);
 850         struct ip_options *dopt = NULL;
 851
 852         if (opt && opt->optlen) {
 853                 int opt_size = optlength(opt);
 854                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 855                 if (dopt) {
 856                         if (ip_options_echo(dopt, skb)) {
 857                                 kfree(dopt);
 858                                 dopt = NULL;
 859                         }
 860                 }
 861         }
 862         return dopt;
 863 }
 864
 865 struct request_sock_ops tcp_request_sock_ops = {
 866         .family         =       PF_INET,
 867         .obj_size       =       sizeof(struct tcp_request_sock),
 868         .rtx_syn_ack    =       tcp_v4_send_synack,
 869         .send_ack       =       tcp_v4_reqsk_send_ack,
 870         .destructor     =       tcp_v4_reqsk_destructor,
 871         .send_reset     =       tcp_v4_send_reset,
 872 };
 873
 874 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 875 {
             /*
              * Handle a connection request carried by skb for socket sk.
              *
              * Allocates a request_sock, parses the TCP options of the
              * segment, records the addresses and IP options, chooses the
              * initial sequence number, sends the SYN-ACK through
              * tcp_v4_send_synack() and, unless a SYN cookie was used, adds
              * the request to the socket's request queue.
              *
              * Always returns 0.  Every drop path increments
              * TCP_MIB_ATTEMPTFAILS.
              */
 876         struct inet_request_sock *ireq;
 877         struct tcp_options_received tmp_opt;
 878         struct request_sock *req;
 879         __u32 saddr = skb->nh.iph->saddr;
 880         __u32 daddr = skb->nh.iph->daddr;
             /* Non-zero isn: see the "VJ's idea" comment further down. */
 881         __u32 isn = TCP_SKB_CB(skb)->when;
 882         struct dst_entry *dst = NULL;
 883 #ifdef CONFIG_SYN_COOKIES
 884         int want_cookie = 0;
 885 #else
 886 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
 887 #endif
 888
 889         /* Never answer to SYNs sent to broadcast or multicast */
 890         if (((struct rtable *)skb->dst)->rt_flags &
 891             (RTCF_BROADCAST | RTCF_MULTICAST))
 892                 goto drop;
 893
 894         /* TW buckets are converted to open requests without
 895          * limitations, they conserve resources and peer is
 896          * evidently real one.
 897          */
 898         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
             /* With syncookies built in and enabled, answer with a cookie
              * instead of dropping; otherwise the "else"/plain path drops.
              */
 899 #ifdef CONFIG_SYN_COOKIES
 900                 if (sysctl_tcp_syncookies) {
 901                         want_cookie = 1;
 902                 } else
 903 #endif
 904                 goto drop;
 905         }
 906
 907         /* Accept backlog is full. If we have already queued enough
 908          * of warm entries in syn queue, drop request. It is better than
 909          * clogging syn queue with openreqs with exponentially increasing
 910          * timeout.
 911          */
 912         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 913                 goto drop;
 914
 915         req = reqsk_alloc(&tcp_request_sock_ops);
 916         if (!req)
 917                 goto drop;
 918
             /* Defaults used before the options of the segment are parsed. */
 919         tcp_clear_options(&tmp_opt);
 920         tmp_opt.mss_clamp = 536;
 921         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
 922
 923         tcp_parse_options(skb, &tmp_opt, 0);
 924
             /* A cookie reply discards whatever options were just parsed. */
 925         if (want_cookie) {
 926                 tcp_clear_options(&tmp_opt);
 927                 tmp_opt.saw_tstamp = 0;
 928         }
 929
 930         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
 931                 /* Some OSes (unknown ones, but I see them on web server, which
 932                  * contains information interesting only for windows'
 933                  * users) do not send their stamp in SYN. It is easy case.
 934                  * We simply do not advertise TS support.
 935                  */
 936                 tmp_opt.saw_tstamp = 0;
                     /* NOTE(review): this store is overwritten by the
                      * assignment immediately after this block.
                      */
 937                 tmp_opt.tstamp_ok  = 0;
 938         }
 939         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 940
 941         tcp_openreq_init(req, &tmp_opt, skb);
 942
             /* The request stores the addresses from the listener's view:
              * local = destination of the SYN, remote = its source.
              */
 943         ireq = inet_rsk(req);
 944         ireq->loc_addr = daddr;
 945         ireq->rmt_addr = saddr;
 946         ireq->opt = tcp_v4_save_options(sk, skb);
 947         if (!want_cookie)
 948                 TCP_ECN_create_request(req, skb->h.th);
 949
 950         if (want_cookie) {
 951 #ifdef CONFIG_SYN_COOKIES
 952                 syn_flood_warning(skb);
 953 #endif
 954                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
 955         } else if (!isn) {
 956                 struct inet_peer *peer = NULL;
 957
 958                 /* VJ's idea. We save last timestamp seen
 959                  * from the destination in peer table, when entering
 960                  * state TIME-WAIT, and check against it before
 961                  * accepting new connection request.
 962                  *
 963                  * If "isn" is not zero, this request hit alive
 964                  * timewait bucket, so that all the necessary checks
 965                  * are made in the function processing timewait state.
 966                  */
                     /* Note that this condition may leave dst (and peer) set
                      * even when it evaluates false; the else-if below and the
                      * SYN-ACK transmission both rely on that.
                      */
 967                 if (tmp_opt.saw_tstamp &&
 968                     tcp_death_row.sysctl_tw_recycle &&
 969                     (dst = inet_csk_route_req(sk, req)) != NULL &&
 970                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 971                     peer->v4daddr == saddr) {
 972                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
 973                             (s32)(peer->tcp_ts - req->ts_recent) >
 974                                                         TCP_PAWS_WINDOW) {
 975                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
 976                                 dst_release(dst);
 977                                 goto drop_and_free;
 978                         }
 979                 }
 980                 /* Kill the following clause, if you dislike this way. */
 981                 else if (!sysctl_tcp_syncookies &&
 982                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 983                           (sysctl_max_syn_backlog >> 2)) &&
 984                          (!peer || !peer->tcp_ts_stamp) &&
 985                          (!dst || !dst_metric(dst, RTAX_RTT))) {
 986                         /* Without syncookies last quarter of
 987                          * backlog is filled with destinations,
 988                          * proven to be alive.
 989                          * It means that we continue to communicate
 990                          * to destinations, already remembered
 991                          * to the moment of synflood.
 992                          */
 993                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
 994                                        "request from %u.%u.%u.%u/%u\n",
 995                                        NIPQUAD(saddr),
 996                                        ntohs(skb->h.th->source));
 997                         dst_release(dst);
 998                         goto drop_and_free;
 999                 }
1000
1001                 isn = tcp_v4_init_sequence(sk, skb);
1002         }
1003         tcp_rsk(req)->snt_isn = isn;
1004
             /* tcp_v4_send_synack() calls dst_release() on dst itself, on
              * success and on failure, so dst is not released again here.
              */
1005         if (tcp_v4_send_synack(sk, req, dst))
1006                 goto drop_and_free;
1007
             /* A cookie request is not queued: it is freed once the SYN-ACK
              * has been sent.  Any other request is added to the request
              * queue with TCP_TIMEOUT_INIT as its timeout.
              */
1008         if (want_cookie) {
1009                 reqsk_free(req);
1010         } else {
1011                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1012         }
1013         return 0;
1014
     /* drop_and_free: a request_sock was allocated and must be freed.
      * drop: nothing to free; only the failure counter is bumped.
      */
1015 drop_and_free:
1016         reqsk_free(req);
1017 drop:
1018         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1019         return 0;
1020 }
1021
1022
1023 /*
1024  * The three way handshake has completed - we got a valid synack -
1025  * now create the new socket.
1026  */
1027 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1028                                   struct request_sock *req,
1029                                   struct dst_entry *dst)
1030 {
1031         struct inet_request_sock *ireq;
1032         struct inet_sock *newinet;
1033         struct tcp_sock *newtp;
1034         struct sock *newsk;
1035
1036         if (sk_acceptq_is_full(sk))
1037                 goto exit_overflow;
1038
1039         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1040                 goto exit;
1041
1042         newsk = tcp_create_openreq_child(sk, req, skb);
1043         if (!newsk)
1044                 goto exit;
1045
1046         sk_setup_caps(newsk, dst);
1047
1048         newtp                 = tcp_sk(newsk);
1049         newinet               = inet_sk(newsk);
1050         ireq                  = inet_rsk(req);
1051         newinet->daddr        = ireq->rmt_addr;
1052         newinet->rcv_saddr    = ireq->loc_addr;
1053         newinet->saddr        = ireq->loc_addr;
1054         newinet->opt          = ireq->opt;
1055         ireq->opt             = NULL;
1056         newinet->mc_index     = inet_iif(skb);
1057         newinet->mc_ttl       = skb->nh.iph->ttl;
1058         newtp->ext_header_len = 0;
1059         if (newinet->opt)
1060                 newtp->ext_header_len = newinet->opt->optlen;
1061         newinet->id = newtp->write_seq ^ jiffies;
1062
1063         tcp_sync_mss(newsk, dst_mtu(dst));
1064         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1065         tcp_initialize_rcv_mss(newsk);
1066
1067         __inet_hash(&tcp_hashinfo, newsk, 0);
1068         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1069
1070         return newsk;
1071
1072 exit_overflow:
1073         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1074 exit:
1075         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1076         dst_release(dst);
1077         return NULL;
1078 }
1079
1080 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1081 {
1082         struct tcphdr *th = skb->h.th;
1083         struct iphdr *iph = skb->nh.iph;
1084         struct sock *nsk;
1085         struct request_sock **prev;
1086         /* Find possible connection requests. */
1087         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1088                                                        iph->saddr, iph->daddr);
1089         if (req)
1090                 return tcp_check_req(sk, skb, req, prev);
1091
1092         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1093                                         th->source, skb->nh.iph->daddr,
1094                                         ntohs(th->dest), inet_iif(skb));
1095
1096         if (nsk) {
1097                 if (nsk->sk_state != TCP_TIME_WAIT) {
1098                         bh_lock_sock(nsk);
1099                         return nsk;
1100                 }
1101                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1102                 return NULL;
1103         }
1104
1105 #ifdef CONFIG_SYN_COOKIES
1106         if (!th->rst && !th->syn && th->ack)
1107                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1108 #endif
1109         return sk;
1110 }
1111
1112 static int tcp_v4_checksum_init(struct sk_buff *skb)
1113 {
1114         if (skb->ip_summed == CHECKSUM_HW) {
1115                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1116                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1117                                   skb->nh.iph->daddr, skb->csum))
1118                         return 0;
1119
1120                 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1121                 skb->ip_summed = CHECKSUM_NONE;
1122         }
1123         if (skb->len <= 76) {
1124                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1125                                  skb->nh.iph->daddr,
1126                                  skb_checksum(skb, 0, skb->len, 0)))
1127                         return -1;
1128                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1129         } else {
1130                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1131                                           skb->nh.iph->saddr,
1132                                           skb->nh.iph->daddr, 0);
1133         }
1134         return 0;
1135 }
1136
1137
1138 /* The socket must have it's spinlock held when we get
1139  * here.
1140  *
1141  * We have a potential double-lock case here, so even when
1142  * doing backlog processing we use the BH locking scheme.
1143  * This is because we cannot sleep with the original spinlock
1144  * held.
1145  */
1146 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1147 {
1148         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1149                 TCP_CHECK_TIMER(sk);
1150                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1151                         goto reset;
1152                 TCP_CHECK_TIMER(sk);
1153                 return 0;
1154         }
1155
1156         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1157                 goto csum_err;
1158
1159         if (sk->sk_state == TCP_LISTEN) {
1160                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1161                 if (!nsk)
1162                         goto discard;
1163
1164                 if (nsk != sk) {
1165                         if (tcp_child_process(sk, nsk, skb))
1166                                 goto reset;
1167                         return 0;
1168                 }
1169         }
1170
1171         TCP_CHECK_TIMER(sk);
1172         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1173                 goto reset;
1174         TCP_CHECK_TIMER(sk);
1175         return 0;
1176
1177 reset:
1178         tcp_v4_send_reset(skb);
1179 discard:
1180         kfree_skb(skb);
1181         /* Be careful here. If this function gets more complicated and
1182          * gcc suffers from register pressure on the x86, sk (in %ebx)
1183          * might be destroyed here. This current version compiles correctly,
1184          * but you have been warned.
1185          */
1186         return 0;
1187
1188 csum_err:
1189         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1190         goto discard;
1191 }
1192
1193 /*
1194  *      From tcp_input.c
1195  */
1196
1197 int tcp_v4_rcv(struct sk_buff *skb)
1198 {
             /*
              * Receive entry point for a TCP segment carried in skb.
              *
              * Checks the TCP header length and checksum, fills in the TCP
              * control block of the skb, looks up the socket and then, under
              * bh_lock_sock(), either processes the segment with
              * tcp_v4_do_rcv() (unless tcp_prequeue() returned non-zero) or
              * adds it to the socket backlog when the socket is owned by the
              * user.  Segments without a socket and segments for TIME-WAIT
              * sockets are handled at no_tcp_socket / do_time_wait.
              *
              * Returns the value of tcp_v4_do_rcv() when it was called,
              * otherwise 0.
              */
1199         struct tcphdr *th;
1200         struct sock *sk;
1201         int ret;
1202
1203         if (skb->pkt_type != PACKET_HOST)
1204                 goto discard_it;
1205
1206         /* Count it even if it's bad */
1207         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1208
1209         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1210                 goto discard_it;
1211
1212         th = skb->h.th;
1213
             /* doff is in 32-bit words; it must cover at least the fixed header. */
1214         if (th->doff < sizeof(struct tcphdr) / 4)
1215                 goto bad_packet;
1216         if (!pskb_may_pull(skb, th->doff * 4))
1217                 goto discard_it;
1218
1219         /* An explanation is required here, I think.
1220          * Packet length and doff are validated by header prediction,
1221          * provided case of th->doff==0 is eliminated.
1222          * So, we defer the checks. */
1223         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1224              tcp_v4_checksum_init(skb) < 0))
1225                 goto bad_packet;
1226
             /* th is read again from the skb after the pull/checksum calls. */
1227         th = skb->h.th;
1228         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
             /* SYN and FIN each add one to the end sequence number. */
1229         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1230                                     skb->len - th->doff * 4);
1231         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1232         TCP_SKB_CB(skb)->when    = 0;
1233         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1234         TCP_SKB_CB(skb)->sacked  = 0;
1235
1236         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1237                            skb->nh.iph->daddr, ntohs(th->dest),
1238                            inet_iif(skb));
1239
1240         if (!sk)
1241                 goto no_tcp_socket;
1242
     /* Also re-entered from the TCP_TW_SYN case with sk set to a listener. */
1243 process:
1244         if (sk->sk_state == TCP_TIME_WAIT)
1245                 goto do_time_wait;
1246
1247         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1248                 goto discard_and_relse;
1249
1250         if (sk_filter(sk, skb, 0))
1251                 goto discard_and_relse;
1252
1253         skb->dev = NULL;
1254
1255         bh_lock_sock(sk);
1256         ret = 0;
1257         if (!sock_owned_by_user(sk)) {
1258                 if (!tcp_prequeue(sk, skb))
1259                         ret = tcp_v4_do_rcv(sk, skb);
1260         } else
1261                 sk_add_backlog(sk, skb);
1262         bh_unlock_sock(sk);
1263
1264         sock_put(sk);
1265
1266         return ret;
1267
1268 no_tcp_socket:
1269         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1270                 goto discard_it;
1271
             /* bad_packet sits inside the if-body: it is reached both by this
              * test and by the gotos from the header checks above.  A segment
              * that is well-formed gets a reset instead of an error count.
              */
1272         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1273 bad_packet:
1274                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1275         } else {
1276                 tcp_v4_send_reset(skb);
1277         }
1278
1279 discard_it:
1280         /* Discard frame. */
1281         kfree_skb(skb);
1282         return 0;
1283
1284 discard_and_relse:
1285         sock_put(sk);
1286         goto discard_it;
1287
     /* sk is a timewait bucket here, hence the casts to inet_timewait_sock. */
1288 do_time_wait:
1289         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1290                 inet_twsk_put((struct inet_timewait_sock *) sk);
1291                 goto discard_it;
1292         }
1293
1294         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1295                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1296                 inet_twsk_put((struct inet_timewait_sock *) sk);
1297                 goto discard_it;
1298         }
1299         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1300                                            skb, th)) {
1301         case TCP_TW_SYN: {
                     /* If a listener exists for this address/port, retire the
                      * timewait bucket and process the SYN on the listener.
                      */
1302                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1303                                                         skb->nh.iph->daddr,
1304                                                         ntohs(th->dest),
1305                                                         inet_iif(skb));
1306                 if (sk2) {
1307                         inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1308                                              &tcp_death_row);
1309                         inet_twsk_put((struct inet_timewait_sock *)sk);
1310                         sk = sk2;
1311                         goto process;
1312                 }
1313                 /* Fall through to ACK */
1314         }
1315         case TCP_TW_ACK:
1316                 tcp_v4_timewait_ack(sk, skb);
1317                 break;
1318         case TCP_TW_RST:
1319                 goto no_tcp_socket;
1320         case TCP_TW_SUCCESS:;
1321         }
1322         goto discard_it;
1323 }
1324
1325 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1326 {
1327         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1328         struct inet_sock *inet = inet_sk(sk);
1329
1330         sin->sin_family         = AF_INET;
1331         sin->sin_addr.s_addr    = inet->daddr;
1332         sin->sin_port           = inet->dport;
1333 }
1334
1335 /* VJ's idea. Save last timestamp seen from this destination
1336  * and hold it at least for normal timewait interval to use for duplicate
1337  * segment detection in subsequent connections, before they enter synchronized
1338  * state.
1339  */
1340
1341 int tcp_v4_remember_stamp(struct sock *sk)
1342 {
1343         struct inet_sock *inet = inet_sk(sk);
1344         struct tcp_sock *tp = tcp_sk(sk);
1345         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1346         struct inet_peer *peer = NULL;
1347         int release_it = 0;
1348
1349         if (!rt || rt->rt_dst != inet->daddr) {
1350                 peer = inet_getpeer(inet->daddr, 1);
1351                 release_it = 1;
1352         } else {
1353                 if (!rt->peer)
1354                         rt_bind_peer(rt, 1);
1355                 peer = rt->peer;
1356         }
1357
1358         if (peer) {
1359                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1360                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1361                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1362                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1363                         peer->tcp_ts = tp->rx_opt.ts_recent;
1364                 }
1365                 if (release_it)
1366                         inet_putpeer(peer);
1367                 return 1;
1368         }
1369
1370         return 0;
1371 }
1372
1373 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1374 {
1375         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1376
1377         if (peer) {
1378                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1379
1380                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1381                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1382                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1383                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1384                         peer->tcp_ts       = tcptw->tw_ts_recent;
1385                 }
1386                 inet_putpeer(peer);
1387                 return 1;
1388         }
1389
1390         return 0;
1391 }
1392
1393 struct tcp_func ipv4_specific = {
1394         .queue_xmit     =       ip_queue_xmit,
1395         .send_check     =       tcp_v4_send_check,
1396         .rebuild_header =       inet_sk_rebuild_header,
1397         .conn_request   =       tcp_v4_conn_request,
1398         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1399         .remember_stamp =       tcp_v4_remember_stamp,
1400         .net_header_len =       sizeof(struct iphdr),
1401         .setsockopt     =       ip_setsockopt,
1402         .getsockopt     =       ip_getsockopt,
1403         .addr2sockaddr  =       v4_addr2sockaddr,
1404         .sockaddr_len   =       sizeof(struct sockaddr_in),
1405 };
1406
1407 /* NOTE: A lot of things set to zero explicitly by call to
1408  *       sk_alloc() so need not be done here.
1409  */
1410 static int tcp_v4_init_sock(struct sock *sk)
1411 {
1412         struct tcp_sock *tp = tcp_sk(sk);
1413
1414         skb_queue_head_init(&tp->out_of_order_queue);
1415         tcp_init_xmit_timers(sk);
1416         tcp_prequeue_init(tp);
1417
1418         inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
1419         tp->mdev = TCP_TIMEOUT_INIT;
1420
1421         /* So many TCP implementations out there (incorrectly) count the
1422          * initial SYN frame in their delayed-ACK and congestion control
1423          * algorithms that we must have the following bandaid to talk
1424          * efficiently to them.  -DaveM
1425          */
1426         tp->snd_cwnd = 2;
1427
1428         /* See draft-stevens-tcpca-spec-01 for discussion of the
1429          * initialization of these values.
1430          */
1431         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1432         tp->snd_cwnd_clamp = ~0;
1433         tp->mss_cache = 536;
1434
1435         tp->reordering = sysctl_tcp_reordering;
1436         tp->ca_ops = &tcp_init_congestion_ops;
1437
1438         sk->sk_state = TCP_CLOSE;
1439
1440         sk->sk_write_space = sk_stream_write_space;
1441         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1442
1443         tp->af_specific = &ipv4_specific;
1444
1445         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1446         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1447
1448         atomic_inc(&tcp_sockets_allocated);
1449
1450         return 0;
1451 }
1452
1453 int tcp_v4_destroy_sock(struct sock *sk)
1454 {
1455         struct tcp_sock *tp = tcp_sk(sk);
1456
1457         tcp_clear_xmit_timers(sk);
1458
1459         tcp_cleanup_congestion_control(tp);
1460
1461         /* Cleanup up the write buffer. */
1462         sk_stream_writequeue_purge(sk);
1463
1464         /* Cleans up our, hopefully empty, out_of_order_queue. */
1465         __skb_queue_purge(&tp->out_of_order_queue);
1466
1467         /* Clean prequeue, it must be empty really */
1468         __skb_queue_purge(&tp->ucopy.prequeue);
1469
1470         /* Clean up a referenced TCP bind bucket. */
1471         if (inet_csk(sk)->icsk_bind_hash)
1472                 inet_put_port(&tcp_hashinfo, sk);
1473
1474         /*
1475          * If sendmsg cached page exists, toss it.
1476          */
1477         if (sk->sk_sndmsg_page) {
1478                 __free_page(sk->sk_sndmsg_page);
1479                 sk->sk_sndmsg_page = NULL;
1480         }
1481
1482         atomic_dec(&tcp_sockets_allocated);
1483
1484         return 0;
1485 }
1486
1487 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1488
1489 #ifdef CONFIG_PROC_FS
1490 /* Proc filesystem TCP sock list dumping. */
1491
1492 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1493 {
1494         return hlist_empty(head) ? NULL :
1495                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1496 }
1497
1498 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1499 {
1500         return tw->tw_node.next ?
1501                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1502 }
1503
1504 static void *listening_get_next(struct seq_file *seq, void *cur)
1505 {
             /*
              * Advance the /proc iterator over the listening hash.
              *
              * cur is the previously returned entry, or NULL to start at
              * bucket 0.  The walk yields listening sockets whose family
              * matches st->family and, for sockets with a non-empty request
              * queue, the pending request_socks whose family matches.
              * st->state tells which of the two cur is
              * (TCP_SEQ_STATE_LISTENING or TCP_SEQ_STATE_OPENREQ).
              *
              * Returns the next entry, or NULL when every bucket has been
              * visited.  While the iterator is in the OPENREQ state the
              * syn_wait_lock of st->syn_wait_sk stays read-locked across
              * calls; it is dropped here when that socket's requests are
              * exhausted.
              */
1506         struct inet_connection_sock *icsk;
1507         struct hlist_node *node;
1508         struct sock *sk = cur;
1509         struct tcp_iter_state* st = seq->private;
1510
1511         if (!sk) {
1512                 st->bucket = 0;
1513                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1514                 goto get_sk;
1515         }
1516
1517         ++st->num;
1518
1519         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request_sock of st->syn_wait_sk. */
1520                 struct request_sock *req = cur;
1521
1522                 icsk = inet_csk(st->syn_wait_sk);
1523                 req = req->dl_next;
1524                 while (1) {
1525                         while (req) {
1526                                 if (req->rsk_ops->family == st->family) {
1527                                         cur = req;
1528                                         goto out;
1529                                 }
1530                                 req = req->dl_next;
1531                         }
1532                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1533                                 break;
     /* Entered from start_req below with the lock held and sbucket == 0. */
1534 get_req:
1535                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1536                 }
                     /* Requests exhausted: go back to walking sockets. */
1537                 sk        = sk_next(st->syn_wait_sk);
1538                 st->state = TCP_SEQ_STATE_LISTENING;
1539                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1540         } else {
                     /* cur is a listening socket: visit its requests first,
                      * if it has any, before moving to the next socket.
                      */
1541                 icsk = inet_csk(sk);
1542                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1543                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1544                         goto start_req;
1545                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1546                 sk = sk_next(sk);
1547         }
1548 get_sk:
1549         sk_for_each_from(sk, node) {
1550                 if (sk->sk_family == st->family) {
1551                         cur = sk;
1552                         goto out;
1553                 }
                     /* Socket of another family: its requests are still
                      * examined.
                      */
1554                 icsk = inet_csk(sk);
1555                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1556                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1557 start_req:
1558                         st->uid         = sock_i_uid(sk);
1559                         st->syn_wait_sk = sk;
1560                         st->state       = TCP_SEQ_STATE_OPENREQ;
1561                         st->sbucket     = 0;
1562                         goto get_req;
1563                 }
1564                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1565         }
             /* Chain finished: continue with the next listening bucket. */
1566         if (++st->bucket < INET_LHTABLE_SIZE) {
1567                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1568                 goto get_sk;
1569         }
1570         cur = NULL;
1571 out:
1572         return cur;
1573 }
1574
1575 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1576 {
1577         void *rc = listening_get_next(seq, NULL);
1578
1579         while (rc && *pos) {
1580                 rc = listening_get_next(seq, rc);
1581                 --*pos;
1582         }
1583         return rc;
1584 }
1585
1586 static void *established_get_first(struct seq_file *seq)
1587 {
1588         struct tcp_iter_state* st = seq->private;
1589         void *rc = NULL;
1590
1591         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1592                 struct sock *sk;
1593                 struct hlist_node *node;
1594                 struct inet_timewait_sock *tw;
1595
1596                 /* We can reschedule _before_ having picked the target: */
1597                 cond_resched_softirq();
1598
1599                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1600                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1601                         if (sk->sk_family != st->family) {
1602                                 continue;
1603                         }
1604                         rc = sk;
1605                         goto out;
1606                 }
1607                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1608                 inet_twsk_for_each(tw, node,
1609                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1610                         if (tw->tw_family != st->family) {
1611                                 continue;
1612                         }
1613                         rc = tw;
1614                         goto out;
1615                 }
1616                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1617                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1618         }
1619 out:
1620         return rc;
1621 }
1622
1623 static void *established_get_next(struct seq_file *seq, void *cur)
1624 {
1625         struct sock *sk = cur;
1626         struct inet_timewait_sock *tw;
1627         struct hlist_node *node;
1628         struct tcp_iter_state* st = seq->private;
1629
1630         ++st->num;
1631
1632         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1633                 tw = cur;
1634                 tw = tw_next(tw);
1635 get_tw:
1636                 while (tw && tw->tw_family != st->family) {
1637                         tw = tw_next(tw);
1638                 }
1639                 if (tw) {
1640                         cur = tw;
1641                         goto out;
1642                 }
1643                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1644                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1645
1646                 /* We can reschedule between buckets: */
1647                 cond_resched_softirq();
1648
1649                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1650                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1651                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1652                 } else {
1653                         cur = NULL;
1654                         goto out;
1655                 }
1656         } else
1657                 sk = sk_next(sk);
1658
1659         sk_for_each_from(sk, node) {
1660                 if (sk->sk_family == st->family)
1661                         goto found;
1662         }
1663
1664         st->state = TCP_SEQ_STATE_TIME_WAIT;
1665         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1666         goto get_tw;
1667 found:
1668         cur = sk;
1669 out:
1670         return cur;
1671 }
1672
1673 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1674 {
1675         void *rc = established_get_first(seq);
1676
1677         while (rc && pos) {
1678                 rc = established_get_next(seq, rc);
1679                 --pos;
1680         }
1681         return rc;
1682 }
1683
1684 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1685 {
1686         void *rc;
1687         struct tcp_iter_state* st = seq->private;
1688
1689         inet_listen_lock(&tcp_hashinfo);
1690         st->state = TCP_SEQ_STATE_LISTENING;
1691         rc        = listening_get_idx(seq, &pos);
1692
1693         if (!rc) {
1694                 inet_listen_unlock(&tcp_hashinfo);
1695                 local_bh_disable();
1696                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1697                 rc        = established_get_idx(seq, pos);
1698         }
1699
1700         return rc;
1701 }
1702
1703 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1704 {
1705         struct tcp_iter_state* st = seq->private;
1706         st->state = TCP_SEQ_STATE_LISTENING;
1707         st->num = 0;
1708         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1709 }
1710
1711 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1712 {
1713         void *rc = NULL;
1714         struct tcp_iter_state* st;
1715
1716         if (v == SEQ_START_TOKEN) {
1717                 rc = tcp_get_idx(seq, 0);
1718                 goto out;
1719         }
1720         st = seq->private;
1721
1722         switch (st->state) {
1723         case TCP_SEQ_STATE_OPENREQ:
1724         case TCP_SEQ_STATE_LISTENING:
1725                 rc = listening_get_next(seq, v);
1726                 if (!rc) {
1727                         inet_listen_unlock(&tcp_hashinfo);
1728                         local_bh_disable();
1729                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1730                         rc        = established_get_first(seq);
1731                 }
1732                 break;
1733         case TCP_SEQ_STATE_ESTABLISHED:
1734         case TCP_SEQ_STATE_TIME_WAIT:
1735                 rc = established_get_next(seq, v);
1736                 break;
1737         }
1738 out:
1739         ++*pos;
1740         return rc;
1741 }
1742
1743 static void tcp_seq_stop(struct seq_file *seq, void *v)
1744 {
1745         struct tcp_iter_state* st = seq->private;
1746
1747         switch (st->state) {
1748         case TCP_SEQ_STATE_OPENREQ:
1749                 if (v) {
1750                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1751                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1752                 }
1753         case TCP_SEQ_STATE_LISTENING:
1754                 if (v != SEQ_START_TOKEN)
1755                         inet_listen_unlock(&tcp_hashinfo);
1756                 break;
1757         case TCP_SEQ_STATE_TIME_WAIT:
1758         case TCP_SEQ_STATE_ESTABLISHED:
1759                 if (v)
1760                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1761                 local_bh_enable();
1762                 break;
1763         }
1764 }
1765
1766 static int tcp_seq_open(struct inode *inode, struct file *file)
1767 {
1768         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1769         struct seq_file *seq;
1770         struct tcp_iter_state *s;
1771         int rc;
1772
1773         if (unlikely(afinfo == NULL))
1774                 return -EINVAL;
1775
1776         s = kmalloc(sizeof(*s), GFP_KERNEL);
1777         if (!s)
1778                 return -ENOMEM;
1779         memset(s, 0, sizeof(*s));
1780         s->family               = afinfo->family;
1781         s->seq_ops.start        = tcp_seq_start;
1782         s->seq_ops.next         = tcp_seq_next;
1783         s->seq_ops.show         = afinfo->seq_show;
1784         s->seq_ops.stop         = tcp_seq_stop;
1785
1786         rc = seq_open(file, &s->seq_ops);
1787         if (rc)
1788                 goto out_kfree;
1789         seq          = file->private_data;
1790         seq->private = s;
1791 out:
1792         return rc;
1793 out_kfree:
1794         kfree(s);
1795         goto out;
1796 }
1797
1798 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1799 {
1800         int rc = 0;
1801         struct proc_dir_entry *p;
1802
1803         if (!afinfo)
1804                 return -EINVAL;
1805         afinfo->seq_fops->owner         = afinfo->owner;
1806         afinfo->seq_fops->open          = tcp_seq_open;
1807         afinfo->seq_fops->read          = seq_read;
1808         afinfo->seq_fops->llseek        = seq_lseek;
1809         afinfo->seq_fops->release       = seq_release_private;
1810
1811         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1812         if (p)
1813                 p->data = afinfo;
1814         else
1815                 rc = -ENOMEM;
1816         return rc;
1817 }
1818
1819 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1820 {
1821         if (!afinfo)
1822                 return;
1823         proc_net_remove(afinfo->name);
1824         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1825 }
1826
1827 static void get_openreq4(struct sock *sk, struct request_sock *req,
1828                          char *tmpbuf, int i, int uid)
1829 {
1830         const struct inet_request_sock *ireq = inet_rsk(req);
1831         int ttd = req->expires - jiffies;
1832
1833         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1834                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1835                 i,
1836                 ireq->loc_addr,
1837                 ntohs(inet_sk(sk)->sport),
1838                 ireq->rmt_addr,
1839                 ntohs(ireq->rmt_port),
1840                 TCP_SYN_RECV,
1841                 0, 0, /* could print option size, but that is af dependent. */
1842                 1,    /* timers active (only the expire timer) */
1843                 jiffies_to_clock_t(ttd),
1844                 req->retrans,
1845                 uid,
1846                 0,  /* non standard timer */
1847                 0, /* open_requests have no inode */
1848                 atomic_read(&sk->sk_refcnt),
1849                 req);
1850 }
1851
1852 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1853 {
1854         int timer_active;
1855         unsigned long timer_expires;
1856         struct tcp_sock *tp = tcp_sk(sp);
1857         const struct inet_connection_sock *icsk = inet_csk(sp);
1858         struct inet_sock *inet = inet_sk(sp);
1859         unsigned int dest = inet->daddr;
1860         unsigned int src = inet->rcv_saddr;
1861         __u16 destp = ntohs(inet->dport);
1862         __u16 srcp = ntohs(inet->sport);
1863
1864         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1865                 timer_active    = 1;
1866                 timer_expires   = icsk->icsk_timeout;
1867         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1868                 timer_active    = 4;
1869                 timer_expires   = icsk->icsk_timeout;
1870         } else if (timer_pending(&sp->sk_timer)) {
1871                 timer_active    = 2;
1872                 timer_expires   = sp->sk_timer.expires;
1873         } else {
1874                 timer_active    = 0;
1875                 timer_expires = jiffies;
1876         }
1877
1878         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1879                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1880                 i, src, srcp, dest, destp, sp->sk_state,
1881                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1882                 timer_active,
1883                 jiffies_to_clock_t(timer_expires - jiffies),
1884                 icsk->icsk_retransmits,
1885                 sock_i_uid(sp),
1886                 tp->probes_out,
1887                 sock_i_ino(sp),
1888                 atomic_read(&sp->sk_refcnt), sp,
1889                 icsk->icsk_rto,
1890                 icsk->icsk_ack.ato,
1891                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1892                 tp->snd_cwnd,
1893                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1894 }
1895
1896 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1897 {
1898         unsigned int dest, src;
1899         __u16 destp, srcp;
1900         int ttd = tw->tw_ttd - jiffies;
1901
1902         if (ttd < 0)
1903                 ttd = 0;
1904
1905         dest  = tw->tw_daddr;
1906         src   = tw->tw_rcv_saddr;
1907         destp = ntohs(tw->tw_dport);
1908         srcp  = ntohs(tw->tw_sport);
1909
1910         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1911                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1912                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1913                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1914                 atomic_read(&tw->tw_refcnt), tw);
1915 }
1916
1917 #define TMPSZ 150
1918
1919 static int tcp4_seq_show(struct seq_file *seq, void *v)
1920 {
1921         struct tcp_iter_state* st;
1922         char tmpbuf[TMPSZ + 1];
1923
1924         if (v == SEQ_START_TOKEN) {
1925                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1926                            "  sl  local_address rem_address   st tx_queue "
1927                            "rx_queue tr tm->when retrnsmt   uid  timeout "
1928                            "inode");
1929                 goto out;
1930         }
1931         st = seq->private;
1932
1933         switch (st->state) {
1934         case TCP_SEQ_STATE_LISTENING:
1935         case TCP_SEQ_STATE_ESTABLISHED:
1936                 get_tcp4_sock(v, tmpbuf, st->num);
1937                 break;
1938         case TCP_SEQ_STATE_OPENREQ:
1939                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1940                 break;
1941         case TCP_SEQ_STATE_TIME_WAIT:
1942                 get_timewait4_sock(v, tmpbuf, st->num);
1943                 break;
1944         }
1945         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1946 out:
1947         return 0;
1948 }
1949
/*
 * File operations for the IPv4 TCP proc file.  Left zero-initialised here;
 * tcp_proc_register() fills in the handlers and tcp_proc_unregister() clears
 * them again.
 */
static struct file_operations tcp4_seq_fops;

/*
 * Description of the IPv4 TCP proc file: entry name "tcp", sockets filtered
 * by AF_INET, lines rendered by tcp4_seq_show().
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        .owner          = THIS_MODULE,
        .name           = "tcp",
        .family         = AF_INET,
        .seq_show       = tcp4_seq_show,
        .seq_fops       = &tcp4_seq_fops,
};
1958
1959 int __init tcp4_proc_init(void)
1960 {
1961         return tcp_proc_register(&tcp4_seq_afinfo);
1962 }
1963
1964 void tcp4_proc_exit(void)
1965 {
1966         tcp_proc_unregister(&tcp4_seq_afinfo);
1967 }
1968 #endif /* CONFIG_PROC_FS */
1969
/*
 * Protocol descriptor for TCP over IPv4.
 *
 * Binds the generic socket operations to their TCP implementations, points
 * the memory-accounting hooks at the TCP-wide counters and sysctl limits,
 * and records the object sizes for full, TIME_WAIT and request sockets.
 */
struct proto tcp_prot = {
        .name                   = "TCP",
        .owner                  = THIS_MODULE,
        /* socket operations */
        .close                  = tcp_close,
        .connect                = tcp_v4_connect,
        .disconnect             = tcp_disconnect,
        .accept                 = inet_csk_accept,
        .ioctl                  = tcp_ioctl,
        .init                   = tcp_v4_init_sock,
        .destroy                = tcp_v4_destroy_sock,
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
        .sendmsg                = tcp_sendmsg,
        .recvmsg                = tcp_recvmsg,
        .backlog_rcv            = tcp_v4_do_rcv,
        /* hash table membership and port allocation */
        .hash                   = tcp_v4_hash,
        .unhash                 = tcp_unhash,
        .get_port               = tcp_v4_get_port,
        /* memory accounting */
        .enter_memory_pressure  = tcp_enter_memory_pressure,
        .sockets_allocated      = &tcp_sockets_allocated,
        .orphan_count           = &tcp_orphan_count,
        .memory_allocated       = &tcp_memory_allocated,
        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_mem             = sysctl_tcp_mem,
        .sysctl_wmem            = sysctl_tcp_wmem,
        .sysctl_rmem            = sysctl_tcp_rmem,
        /* sizes */
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp_sock),
        .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
        .rsk_prot               = &tcp_request_sock_ops,
};
2002
2003
2004
2005 void __init tcp_v4_init(struct net_proto_family *ops)
2006 {
2007         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2008         if (err < 0)
2009                 panic("Failed to create the TCP control socket.\n");
2010         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2011         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2012
2013         /* Unhash it so that IP input processing does not even
2014          * see it, we do not wish this socket to see incoming
2015          * packets.
2016          */
2017         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2018 }
2019
/* Symbols from this file made available to other kernel code. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The proc registration helpers only exist when procfs is configured. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2039