/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/inet_hashtables.h>
#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
	.port_rover	= 1024 - 1,
};
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
}

static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct inet_timewait_sock **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity.  Even without PAWS it is
			   safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80 Mbit/sec.

			   Actually, the idea is close to VJ's, except
			   that the timestamp cache is held not per host
			   but per port pair, and the TW bucket is used
			   as the state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
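			/* Editor's illustration (an assumption, not part of
			 * the original file): at 80 Mbit/s (~10 MB/s) the
			 * 2^32-byte sequence space wraps in roughly
			 * 2^32 / 10^7 ~= 430 seconds, comfortably longer
			 * than TIME-WAIT lifetimes, so an old duplicate
			 * cannot be mistaken for new data even without
			 * PAWS timestamps.
			 */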
			if (tcptw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tcptw->tw_ts_recent_stamp > 1))) {
				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
				if (tp->write_seq == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now.  Otherwise we will see
	 * in the hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly.  Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
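/*
 * Editor's note (illustrative, not in the original): in this kernel the
 * established hash table is 2 * ehash_size buckets long.  Bucket i holds
 * established sockets, while bucket i + ehash_size holds the TIME-WAIT
 * sockets hashing to the same slot, which is why the loop above walks
 * (head + tcp_hashinfo.ehash_size)->chain for TIME-WAIT entries.
 */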
static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}

/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__inet_hash(&tcp_hashinfo, sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__inet_hash(&tcp_hashinfo, sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to the established hash table. */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
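/*
 * Editor's illustration (an assumption, not part of the original): the port
 * search above probes every port in [low, low + range) exactly once,
 * starting at a per-destination offset so that concurrent connects to
 * different peers do not all contend for the same ports.  For example, with
 * low = 1024, high = 4999 (range = 3975) and offset = 12345, the probes for
 * i = 1, 2, 3 are 1024 + 12346 % 3975 = 1445, then 1446, 1447, ...,
 * wrapping modulo the range.
 */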
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea.  We save the last timestamp seen from
		 * the destination in the peer table when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it when
		 * trying a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization afterwards.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

static inline int inet_iif(const struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests
	 * (SYN-ACKs sent out by Linux are always < 576 bytes, so they
	 * should go through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry whether PMTU discovery is
	 * forbidden on this route.  We just assume that no packet-too-big
	 * packets are sent back when PMTU discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to go wrong... Remember the soft error
	 * for the case that this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped.  This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
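/*
 * Editor's illustration (an assumption, not part of the original): when a
 * "fragmentation needed" ICMP reports a smaller link MTU, the usable MSS is
 * roughly that MTU minus the fixed headers.  For a PPPoE-sized mtu = 1492
 * with no IP or TCP options:
 *
 *	mss = 1492 - 20 (iphdr) - 20 (tcphdr) = 1452 bytes
 *
 * tcp_sync_mss() performs this clamping (also accounting for options)
 * against the cached path MTU in tp->pmtu_cookie.
 */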
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic".  When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct sock *sk;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct tcp_sock *tp;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
			 th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put((struct inet_timewait_sock *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
	 * to be treated as hard errors (well, FRAG_FAILED too, but
	 * that is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as ordered by their masters, even these two messages
	 * have finally lost their original sense (even Linux sends
	 * invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
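/*
 * Editor's sketch (not part of the original file): the arithmetic behind
 * tcp_v4_check()/csum_partial() above, written out portably.  The Internet
 * checksum is the one's complement of the one's-complement sum of the
 * 12-byte pseudo header (source address, destination address, zero byte,
 * protocol, TCP length) followed by the TCP header and payload.  The
 * function name and the host-byte-order convention are illustrative
 * assumptions; the kernel helpers do the same sum with arch-specific
 * folding, so this stays compiled out.
 */
#if 0
static unsigned short tcp_csum_sketch(unsigned int saddr, unsigned int daddr,
				      const unsigned char *tcp, unsigned int len)
{
	unsigned long sum = 0;
	unsigned int i;

	/* Pseudo header: addresses and length taken in host byte order. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += IPPROTO_TCP;	/* zero byte + protocol number (6) */
	sum += len;		/* length of TCP header + data */

	/* TCP header and data, as 16-bit big-endian words. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (tcp[i] << 8) | tcp[i + 1];
	if (len & 1)		/* pad an odd trailing byte with zero */
		sum += tcp[len - 1] << 8;

	while (sum >> 16)	/* fold the carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned short)~sum;	/* one's complement of the sum */
}
#endif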
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation.  We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
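/*
 * Editor's note (illustrative, not in the original): the seq/ack choice
 * above follows RFC 793's "Reset Generation" rule: if the offending segment
 * carried an ACK, the reset is sent with SEQ = SEG.ACK and no ACK bit;
 * otherwise it is sent with SEQ = 0 and ACK = SEG.SEQ + SEG.LEN, where SYN
 * and FIN each count as one octet of sequence space.
 */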
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly.  What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	if (inet_rsk(req)->opt)
		kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
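/*
 * Editor's note (illustrative, not in the original): under SYN cookies,
 * cookie_v4_init_sequence() encodes a coarse timestamp and an index into a
 * small table of common MSS values inside a keyed hash of the connection
 * 4-tuple, and returns the result as the SYN-ACK's initial sequence number.
 * The listener keeps no request_sock; when the final ACK arrives, its
 * acknowledgment number (ISN + 1) is validated and decoded instead.
 */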
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

struct request_sock_ops tcp_request_sock_ops = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.rtx_syn_ack	= tcp_v4_send_synack,
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast addresses. */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitation: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* The accept backlog is full.  If we have already queued enough
	 * warm entries in the syn queue, drop the request.  That is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers
		 * carrying information interesting only for Windows users)
		 * do not send their stamp in SYN.  It is an easy case:
		 * we simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);

	TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea.  We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check it against the timestamp
		 * before accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered by the
			 * moment of the synflood.
			 */
			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
					      "request from %u.%u."
					      "%u.%u/%u\n",
					      NIPQUAD(saddr),
					      ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
/*
 * The three-way handshake has completed - we got a valid SYN-ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__inet_hash(&tcp_hashinfo, newsk, 0);
	__inet_inherit_port(&tcp_hashinfo, sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
					th->source, skb->nh.iph->daddr,
					ntohs(th->dest), inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put((struct inet_timewait_sock *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
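/*
 * Editor's note (illustrative, not in the original): CHECKSUM_HW means the
 * NIC already summed the packet and left the result in skb->csum, so only
 * the pseudo-header fold remains; CHECKSUM_NONE forces full software
 * verification.  Short packets (<= 76 bytes) are verified immediately,
 * while for longer ones only the pseudo-header sum is seeded into
 * skb->csum so the copy to user space can finish the check incrementally.
 */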
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here.  If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here.  This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;
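	/* Editor's note (illustrative, not in the original): SYN and FIN
	 * each occupy one unit of sequence space, so for a pure SYN
	 * carrying no data, end_seq computed above is simply seq + 1.
	 */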
	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, ntohs(th->dest),
			   inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
					   skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
							skb->nh.iph->daddr,
							ntohs(th->dest),
							inet_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct inet_timewait_sock *)sk);
			inet_twsk_put((struct inet_timewait_sock *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family	     = AF_INET;
	sin->sin_addr.s_addr = inet->daddr;
	sin->sin_port	     = inet->dport;
}

/* VJ's idea.  Save the last timestamp seen from this destination and hold
 * it at least for the normal timewait interval, to use for duplicate
 * segment detection in subsequent connections, before they enter the
 * synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct tcp_func ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= inet_sk_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= v4_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things are set to zero explicitly by the call to
 *	 sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	tp->ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
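	/* Editor's note (illustrative, not in the original): the tcp_wmem
	 * and tcp_rmem sysctls are min/default/max triplets; index [1]
	 * picks the default per-socket send and receive buffer sizes.
	 */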
	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean up the prequeue; it should already be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	const struct inet_connection_sock *icsk = inet_csk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sp),
		icsk->icsk_probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
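/*
 * Editor's note: a hypothetical /proc/net/tcp line produced by the code
 * above (all values illustrative only) looks like
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * i.e. a socket listening on 127.0.0.1:22.  Addresses and ports are hex,
 * with addresses printed in network byte order (so 127.0.0.1 reads as
 * 0100007F on little-endian machines), and state 0A is TCP_LISTEN.
 */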
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it; we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);