/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = RW_LOCK_UNLOCKED,
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
				 inet_csk_bind_conflict);
}

static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
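
/*
 * A note on the offset used above: the reincarnated connection starts
 * sending at tw_snd_nxt + 65535 + 2, i.e. one maximum unscaled window
 * (plus slack) beyond the last sequence the old connection could have
 * used, so old duplicates can never fall inside the new sequence space.
 * For example, if the TIME-WAIT socket recorded tw_snd_nxt == 1000, the
 * new connection's first byte carries sequence number 66537.
 */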
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct inet_timewait_sock **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;

	prefetch(head->chain.first);
	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hash = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, &tcp_death_row);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
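
/*
 * Layout note: the established table is twice ehash_size buckets wide.
 * Bucket "hash" in the first half holds established sockets, while the
 * bucket at "hash + ehash_size" holds the TIME-WAIT sockets for the same
 * hash value (hence the &(head + tcp_hashinfo.ehash_size)->chain walk
 * above), so both chains can be scanned under the one bucket lock.
 */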
static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}
/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__inet_hash(&tcp_hashinfo, sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__inet_hash(&tcp_hashinfo, sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
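
/*
 * The ephemeral search above walks every port in [low, low + range) in a
 * per-destination pseudo-random order: port = low + (i + offset) % range,
 * where offset is a secret hash of (saddr, daddr, dport) and the static
 * "hint" advances the starting point between successive connects. As an
 * illustration, with low == 32768, range == 28232 and offset % range ==
 * 11111, the probe order is 43880, 43881, ... wrapping back through 32768.
 */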
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize rx_opt.ts_recent from it, when trying new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
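
/*
 * The initial sequence number chosen above is RFC 1948 style:
 * secure_tcp_sequence_number() mixes the connection 4-tuple with a
 * secret and a clock, so sequence spaces of successive incarnations of
 * the same 4-tuple do not collide while remaining unpredictable to
 * off-path attackers. The IP ID counter (inet->id) is seeded from it too.
 */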
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
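
/*
 * Worked example: a connection over a 1500 byte path keeps pmtu_cookie at
 * 1500 and an MSS of 1460 (MTU minus 40 bytes of IP + TCP headers). If a
 * router on a 1400 byte link returns ICMP_FRAG_NEEDED with mtu == 1400,
 * tcp_sync_mss() drops the MSS to 1360 and tcp_simple_retransmit()
 * resends the dropped segment immediately instead of waiting for the
 * retransmit timer, which is what makes this path "fast".
 */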
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
			 th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put((struct inet_timewait_sock *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen?
			       It can, e.g. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcphdr *th = skb->h.th;

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
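
/*
 * The TCP checksum covers a pseudo-header (saddr, daddr, protocol and
 * TCP length) in addition to the TCP header and payload. In the
 * CHECKSUM_HW branch above only the (inverted) pseudo-header sum is
 * filled in and skb->csum records where the hardware must place the
 * final checksum; in the software branch csum_partial() folds the
 * segment and tcp_v4_check() adds the pseudo-header.
 */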
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why we NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset.
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it does match a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
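
/*
 * Sequence selection for the RST follows RFC 793: if the offending
 * segment carried an ACK, the reset is sent with seq = their ack_seq and
 * no ACK bit; otherwise seq is zero and the ACK bit is set, with ack_seq
 * covering everything the segment occupied, SYN and FIN included, so the
 * RST is acceptable to the other end.
 */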
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
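
/*
 * Note that the reply above is built entirely from the incoming skb and
 * the caller-supplied (seq, ack, win, ts) values; no full socket is
 * needed, which is what lets TIME-WAIT buckets and request socks answer
 * without allocating one. When ts is nonzero a 12-byte timestamp option
 * (two NOPs for alignment, then kind 8, length 10) is echoed back.
 */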
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}
/*
 *	Send a SYN-ACK after having received an ACK.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
struct request_sock_ops tcp_request_sock_ops = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.rtx_syn_ack	= tcp_v4_send_synack,
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better than
	 * clogging the syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers
		 * that contain information interesting only for windows'
		 * users) do not send their stamp in SYN. It is the easy case:
		 * we simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from %u.%u.%u.%u/%u\n",
				       NIPQUAD(saddr),
				       ntohs(skb->h.th->source));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
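
/*
 * Two queues govern the drop policy above: the SYN queue of embryonic
 * requests (bounded by sysctl_max_syn_backlog) and the accept queue of
 * established children waiting for accept(). When the SYN queue fills,
 * syncookies (if enabled) encode the connection parameters into the ISN
 * itself, so the request can be dropped and later reconstructed from the
 * returning ACK. When the accept queue fills, a new request is dropped
 * once more than one "young" (not yet retransmitted) entry is queued,
 * rather than retransmitting SYN-ACKs with ever-growing timeouts.
 */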
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__inet_hash(&tcp_hashinfo, newsk, 0);
	__inet_inherit_port(&tcp_hashinfo, sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
					th->source, skb->nh.iph->daddr,
					ntohs(th->dest), inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put((struct inet_timewait_sock *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}
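
/*
 * When the hardware could not verify the checksum, only the pseudo-header
 * sum is stored in skb->csum here; the expensive pass over the payload is
 * deferred to tcp_checksum_complete(), once we know the segment will
 * actually be processed. Short segments (<= 76 bytes) are verified
 * immediately, the deferral buying nothing for packets that small.
 */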
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb)))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, ntohs(th->dest),
			   inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put((struct inet_timewait_sock *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
					   skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
							skb->nh.iph->daddr,
							ntohs(th->dest),
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
					     &tcp_death_row);
			inet_twsk_put((struct inet_timewait_sock *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
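
/*
 * Delivery above takes one of three paths: if a process holds the socket
 * lock, the segment goes to the backlog and is replayed by the lock
 * owner via backlog_rcv (tcp_v4_do_rcv); if a reader is sleeping in
 * recvmsg(), tcp_prequeue() may queue it for processing in the reader's
 * context; otherwise tcp_v4_do_rcv() runs right here in softirq context.
 */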
/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= inet_sk_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= inet_csk_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
};
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
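
/*
 * The values above make a freshly created socket maximally permissive:
 * ssthresh at "infinity" so slow start governs the whole ramp, the cwnd
 * clamp unbounded, and mss_cache at the conservative RFC 1122 default of
 * 536 bytes until the route and the peer's advertised MSS are known.
 */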
int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		--*pos;
		rc = listening_get_next(seq, rc);
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner	  = afinfo->owner;
	afinfo->seq_fops->open	  = tcp_seq_open;
	afinfo->seq_fops->read	  = seq_read;
	afinfo->seq_fops->llseek  = seq_lseek;
	afinfo->seq_fops->release = seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	const struct inet_connection_sock *icsk = inet_csk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sp),
		icsk->icsk_probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);