err.no Git - linux-2.6/blob - net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75
  76 #include <linux/inet.h>
  77 #include <linux/ipv6.h>
  78 #include <linux/stddef.h>
  79 #include <linux/proc_fs.h>
  80 #include <linux/seq_file.h>
  81
  82 #include <linux/crypto.h>
  83 #include <linux/scatterlist.h>
  84
  85 int sysctl_tcp_tw_reuse __read_mostly;
  86 int sysctl_tcp_low_latency __read_mostly;
  87
  88 /* Check TCP sequence numbers in ICMP packets. */
  89 #define ICMP_MIN_LENGTH 8
  90
  91 /* Socket used for sending RSTs */
  92 static struct socket *tcp_socket __read_mostly;
  93
  94 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
  95
  96 #ifdef CONFIG_TCP_MD5SIG
  97 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  98                                                    __be32 addr);
  99 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
 100                                    __be32 saddr, __be32 daddr,
 101                                    struct tcphdr *th, int protocol,
 102                                    unsigned int tcplen);
 103 #endif
 104
 105 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 106         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
 107         .lhash_users = ATOMIC_INIT(0),
 108         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
 109 };
 110
 111 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 112 {
 113         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 114                                           ip_hdr(skb)->saddr,
 115                                           tcp_hdr(skb)->dest,
 116                                           tcp_hdr(skb)->source);
 117 }
 118
 119 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 120 {
 121         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 122         struct tcp_sock *tp = tcp_sk(sk);
 123
 124         /* With PAWS, it is safe from the viewpoint
 125            of data integrity. Even without PAWS it is safe provided sequence
 126            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 127
 128            Actually, the idea is close to VJ's one, only timestamp cache is
 129            held not per host, but per port pair and TW bucket is used as state
 130            holder.
 131
 132            If TW bucket has been already destroyed we fall back to VJ's scheme
 133            and use initial timestamp retrieved from peer table.
 134          */
 135         if (tcptw->tw_ts_recent_stamp &&
 136             (twp == NULL || (sysctl_tcp_tw_reuse &&
 137                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 138                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 139                 if (tp->write_seq == 0)
 140                         tp->write_seq = 1;
 141                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 142                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 143                 sock_hold(sktw);
 144                 return 1;
 145         }
 146
 147         return 0;
 148 }
 149
 150 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 151
 152 /* This will initiate an outgoing connection. */
 153 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 154 {
 155         struct inet_sock *inet = inet_sk(sk);
 156         struct tcp_sock *tp = tcp_sk(sk);
 157         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 158         struct rtable *rt;
 159         __be32 daddr, nexthop;
 160         int tmp;
 161         int err;
 162
 163         if (addr_len < sizeof(struct sockaddr_in))
 164                 return -EINVAL;
 165
 166         if (usin->sin_family != AF_INET)
 167                 return -EAFNOSUPPORT;
 168
 169         nexthop = daddr = usin->sin_addr.s_addr;
 170         if (inet->opt && inet->opt->srr) {
 171                 if (!daddr)
 172                         return -EINVAL;
 173                 nexthop = inet->opt->faddr;
 174         }
 175
 176         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 177                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 178                                IPPROTO_TCP,
 179                                inet->sport, usin->sin_port, sk, 1);
 180         if (tmp < 0) {
 181                 if (tmp == -ENETUNREACH)
 182                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
 183                 return tmp;
 184         }
 185
 186         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 187                 ip_rt_put(rt);
 188                 return -ENETUNREACH;
 189         }
 190
 191         if (!inet->opt || !inet->opt->srr)
 192                 daddr = rt->rt_dst;
 193
 194         if (!inet->saddr)
 195                 inet->saddr = rt->rt_src;
 196         inet->rcv_saddr = inet->saddr;
 197
 198         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 199                 /* Reset inherited state */
 200                 tp->rx_opt.ts_recent       = 0;
 201                 tp->rx_opt.ts_recent_stamp = 0;
 202                 tp->write_seq              = 0;
 203         }
 204
 205         if (tcp_death_row.sysctl_tw_recycle &&
 206             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 207                 struct inet_peer *peer = rt_get_peer(rt);
 208                 /*
 209                  * VJ's idea. We save last timestamp seen from
 210                  * the destination in peer table, when entering state
 211                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 212                  * when trying new connection.
 213                  */
 214                 if (peer != NULL &&
 215                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
 216                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 217                         tp->rx_opt.ts_recent = peer->tcp_ts;
 218                 }
 219         }
 220
 221         inet->dport = usin->sin_port;
 222         inet->daddr = daddr;
 223
 224         inet_csk(sk)->icsk_ext_hdr_len = 0;
 225         if (inet->opt)
 226                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 227
 228         tp->rx_opt.mss_clamp = 536;
 229
 230         /* Socket identity is still unknown (sport may be zero).
 231          * However we set state to SYN-SENT and not releasing socket
 232          * lock select source port, enter ourselves into the hash tables and
 233          * complete initialization after this.
 234          */
 235         tcp_set_state(sk, TCP_SYN_SENT);
 236         err = inet_hash_connect(&tcp_death_row, sk);
 237         if (err)
 238                 goto failure;
 239
 240         err = ip_route_newports(&rt, IPPROTO_TCP,
 241                                 inet->sport, inet->dport, sk);
 242         if (err)
 243                 goto failure;
 244
 245         /* OK, now commit destination to socket.  */
 246         sk->sk_gso_type = SKB_GSO_TCPV4;
 247         sk_setup_caps(sk, &rt->u.dst);
 248
 249         if (!tp->write_seq)
 250                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 251                                                            inet->daddr,
 252                                                            inet->sport,
 253                                                            usin->sin_port);
 254
 255         inet->id = tp->write_seq ^ jiffies;
 256
 257         err = tcp_connect(sk);
 258         rt = NULL;
 259         if (err)
 260                 goto failure;
 261
 262         return 0;
 263
 264 failure:
 265         /*
 266          * This unhashes the socket and releases the local port,
 267          * if necessary.
 268          */
 269         tcp_set_state(sk, TCP_CLOSE);
 270         ip_rt_put(rt);
 271         sk->sk_route_caps = 0;
 272         inet->dport = 0;
 273         return err;
 274 }
 275
 276 /*
 277  * This routine does path mtu discovery as defined in RFC1191.
 278  */
 279 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 280 {
 281         struct dst_entry *dst;
 282         struct inet_sock *inet = inet_sk(sk);
 283
 284         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 285          * send out by Linux are always <576bytes so they should go through
 286          * unfragmented).
 287          */
 288         if (sk->sk_state == TCP_LISTEN)
 289                 return;
 290
 291         /* We don't check in the destentry if pmtu discovery is forbidden
 292          * on this route. We just assume that no packet_to_big packets
 293          * are send back when pmtu discovery is not active.
 294          * There is a small race when the user changes this flag in the
 295          * route, but I think that's acceptable.
 296          */
 297         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 298                 return;
 299
 300         dst->ops->update_pmtu(dst, mtu);
 301
 302         /* Something is about to be wrong... Remember soft error
 303          * for the case, if this connection will not able to recover.
 304          */
 305         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 306                 sk->sk_err_soft = EMSGSIZE;
 307
 308         mtu = dst_mtu(dst);
 309
 310         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 311             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 312                 tcp_sync_mss(sk, mtu);
 313
 314                 /* Resend the TCP packet because it's
 315                  * clear that the old packet has been
 316                  * dropped. This is the new "fast" path mtu
 317                  * discovery.
 318                  */
 319                 tcp_simple_retransmit(sk);
 320         } /* else let the usual retransmit timer handle it */
 321 }
 322
 323 /*
 324  * This routine is called by the ICMP module when it gets some
 325  * sort of error condition.  If err < 0 then the socket should
 326  * be closed and the error returned to the user.  If err > 0
 327  * it's just the icmp type << 8 | icmp code.  After adjustment
 328  * header points to the first 8 bytes of the tcp header.  We need
 329  * to find the appropriate port.
 330  *
 331  * The locking strategy used here is very "optimistic". When
 332  * someone else accesses the socket the ICMP is just dropped
 333  * and for some paths there is no check at all.
 334  * A more general error queue to queue errors for later handling
 335  * is probably better.
 336  *
 337  */
 338
 339 void tcp_v4_err(struct sk_buff *skb, u32 info)
 340 {
 341         struct iphdr *iph = (struct iphdr *)skb->data;
 342         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 343         struct tcp_sock *tp;
 344         struct inet_sock *inet;
 345         const int type = icmp_hdr(skb)->type;
 346         const int code = icmp_hdr(skb)->code;
 347         struct sock *sk;
 348         __u32 seq;
 349         int err;
 350
 351         if (skb->len < (iph->ihl << 2) + 8) {
 352                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 353                 return;
 354         }
 355
 356         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
 357                         iph->saddr, th->source, inet_iif(skb));
 358         if (!sk) {
 359                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 360                 return;
 361         }
 362         if (sk->sk_state == TCP_TIME_WAIT) {
 363                 inet_twsk_put(inet_twsk(sk));
 364                 return;
 365         }
 366
 367         bh_lock_sock(sk);
 368         /* If too many ICMPs get dropped on busy
 369          * servers this needs to be solved differently.
 370          */
 371         if (sock_owned_by_user(sk))
 372                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
 373
 374         if (sk->sk_state == TCP_CLOSE)
 375                 goto out;
 376
 377         tp = tcp_sk(sk);
 378         seq = ntohl(th->seq);
 379         if (sk->sk_state != TCP_LISTEN &&
 380             !between(seq, tp->snd_una, tp->snd_nxt)) {
 381                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 382                 goto out;
 383         }
 384
 385         switch (type) {
 386         case ICMP_SOURCE_QUENCH:
 387                 /* Just silently ignore these. */
 388                 goto out;
 389         case ICMP_PARAMETERPROB:
 390                 err = EPROTO;
 391                 break;
 392         case ICMP_DEST_UNREACH:
 393                 if (code > NR_ICMP_UNREACH)
 394                         goto out;
 395
 396                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 397                         if (!sock_owned_by_user(sk))
 398                                 do_pmtu_discovery(sk, iph, info);
 399                         goto out;
 400                 }
 401
 402                 err = icmp_err_convert[code].errno;
 403                 break;
 404         case ICMP_TIME_EXCEEDED:
 405                 err = EHOSTUNREACH;
 406                 break;
 407         default:
 408                 goto out;
 409         }
 410
 411         switch (sk->sk_state) {
 412                 struct request_sock *req, **prev;
 413         case TCP_LISTEN:
 414                 if (sock_owned_by_user(sk))
 415                         goto out;
 416
 417                 req = inet_csk_search_req(sk, &prev, th->dest,
 418                                           iph->daddr, iph->saddr);
 419                 if (!req)
 420                         goto out;
 421
 422                 /* ICMPs are not backlogged, hence we cannot get
 423                    an established socket here.
 424                  */
 425                 BUG_TRAP(!req->sk);
 426
 427                 if (seq != tcp_rsk(req)->snt_isn) {
 428                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 429                         goto out;
 430                 }
 431
 432                 /*
 433                  * Still in SYN_RECV, just remove it silently.
 434                  * There is no good way to pass the error to the newly
 435                  * created socket, and POSIX does not want network
 436                  * errors returned from accept().
 437                  */
 438                 inet_csk_reqsk_queue_drop(sk, req, prev);
 439                 goto out;
 440
 441         case TCP_SYN_SENT:
 442         case TCP_SYN_RECV:  /* Cannot happen.
 443                                It can f.e. if SYNs crossed.
 444                              */
 445                 if (!sock_owned_by_user(sk)) {
 446                         sk->sk_err = err;
 447
 448                         sk->sk_error_report(sk);
 449
 450                         tcp_done(sk);
 451                 } else {
 452                         sk->sk_err_soft = err;
 453                 }
 454                 goto out;
 455         }
 456
 457         /* If we've already connected we will keep trying
 458          * until we time out, or the user gives up.
 459          *
 460          * rfc1122 4.2.3.9 allows to consider as hard errors
 461          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 462          * but it is obsoleted by pmtu discovery).
 463          *
 464          * Note, that in modern internet, where routing is unreliable
 465          * and in each dark corner broken firewalls sit, sending random
 466          * errors ordered by their masters even this two messages finally lose
 467          * their original sense (even Linux sends invalid PORT_UNREACHs)
 468          *
 469          * Now we are in compliance with RFCs.
 470          *                                                      --ANK (980905)
 471          */
 472
 473         inet = inet_sk(sk);
 474         if (!sock_owned_by_user(sk) && inet->recverr) {
 475                 sk->sk_err = err;
 476                 sk->sk_error_report(sk);
 477         } else  { /* Only an error on timeout */
 478                 sk->sk_err_soft = err;
 479         }
 480
 481 out:
 482         bh_unlock_sock(sk);
 483         sock_put(sk);
 484 }
 485
 486 /* This routine computes an IPv4 TCP checksum. */
 487 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 488 {
 489         struct inet_sock *inet = inet_sk(sk);
 490         struct tcphdr *th = tcp_hdr(skb);
 491
 492         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 493                 th->check = ~tcp_v4_check(len, inet->saddr,
 494                                           inet->daddr, 0);
 495                 skb->csum_start = skb_transport_header(skb) - skb->head;
 496                 skb->csum_offset = offsetof(struct tcphdr, check);
 497         } else {
 498                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
 499                                          csum_partial((char *)th,
 500                                                       th->doff << 2,
 501                                                       skb->csum));
 502         }
 503 }
 504
 505 int tcp_v4_gso_send_check(struct sk_buff *skb)
 506 {
 507         const struct iphdr *iph;
 508         struct tcphdr *th;
 509
 510         if (!pskb_may_pull(skb, sizeof(*th)))
 511                 return -EINVAL;
 512
 513         iph = ip_hdr(skb);
 514         th = tcp_hdr(skb);
 515
 516         th->check = 0;
 517         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 518         skb->csum_start = skb_transport_header(skb) - skb->head;
 519         skb->csum_offset = offsetof(struct tcphdr, check);
 520         skb->ip_summed = CHECKSUM_PARTIAL;
 521         return 0;
 522 }
 523
 524 /*
 525  *      This routine will send an RST to the other tcp.
 526  *
 527  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 528  *                    for reset.
 529  *      Answer: if a packet caused RST, it is not for a socket
 530  *              existing in our system, if it is matched to a socket,
 531  *              it is just duplicate segment or bug in other side's TCP.
 532  *              So that we build reply only basing on parameters
 533  *              arrived with segment.
 534  *      Exception: precedence violation. We do not implement it in any case.
 535  */
 536
 537 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 538 {
 539         struct tcphdr *th = tcp_hdr(skb);
 540         struct {
 541                 struct tcphdr th;
 542 #ifdef CONFIG_TCP_MD5SIG
 543                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 544 #endif
 545         } rep;
 546         struct ip_reply_arg arg;
 547 #ifdef CONFIG_TCP_MD5SIG
 548         struct tcp_md5sig_key *key;
 549 #endif
 550
 551         /* Never send a reset in response to a reset. */
 552         if (th->rst)
 553                 return;
 554
 555         if (skb->rtable->rt_type != RTN_LOCAL)
 556                 return;
 557
 558         /* Swap the send and the receive. */
 559         memset(&rep, 0, sizeof(rep));
 560         rep.th.dest   = th->source;
 561         rep.th.source = th->dest;
 562         rep.th.doff   = sizeof(struct tcphdr) / 4;
 563         rep.th.rst    = 1;
 564
 565         if (th->ack) {
 566                 rep.th.seq = th->ack_seq;
 567         } else {
 568                 rep.th.ack = 1;
 569                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 570                                        skb->len - (th->doff << 2));
 571         }
 572
 573         memset(&arg, 0, sizeof(arg));
 574         arg.iov[0].iov_base = (unsigned char *)&rep;
 575         arg.iov[0].iov_len  = sizeof(rep.th);
 576
 577 #ifdef CONFIG_TCP_MD5SIG
 578         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 579         if (key) {
 580                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 581                                    (TCPOPT_NOP << 16) |
 582                                    (TCPOPT_MD5SIG << 8) |
 583                                    TCPOLEN_MD5SIG);
 584                 /* Update length and the length the header thinks exists */
 585                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 586                 rep.th.doff = arg.iov[0].iov_len / 4;
 587
 588                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
 589                                         key,
 590                                         ip_hdr(skb)->daddr,
 591                                         ip_hdr(skb)->saddr,
 592                                         &rep.th, IPPROTO_TCP,
 593                                         arg.iov[0].iov_len);
 594         }
 595 #endif
 596         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 597                                       ip_hdr(skb)->saddr, /* XXX */
 598                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
 599         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 600
 601         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 602
 603         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 604         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 605 }
 606
 607 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 608    outside socket context is ugly, certainly. What can I do?
 609  */
 610
 611 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
 612                             struct sk_buff *skb, u32 seq, u32 ack,
 613                             u32 win, u32 ts)
 614 {
 615         struct tcphdr *th = tcp_hdr(skb);
 616         struct {
 617                 struct tcphdr th;
 618                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 619 #ifdef CONFIG_TCP_MD5SIG
 620                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 621 #endif
 622                         ];
 623         } rep;
 624         struct ip_reply_arg arg;
 625 #ifdef CONFIG_TCP_MD5SIG
 626         struct tcp_md5sig_key *key;
 627         struct tcp_md5sig_key tw_key;
 628 #endif
 629
 630         memset(&rep.th, 0, sizeof(struct tcphdr));
 631         memset(&arg, 0, sizeof(arg));
 632
 633         arg.iov[0].iov_base = (unsigned char *)&rep;
 634         arg.iov[0].iov_len  = sizeof(rep.th);
 635         if (ts) {
 636                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 637                                    (TCPOPT_TIMESTAMP << 8) |
 638                                    TCPOLEN_TIMESTAMP);
 639                 rep.opt[1] = htonl(tcp_time_stamp);
 640                 rep.opt[2] = htonl(ts);
 641                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 642         }
 643
 644         /* Swap the send and the receive. */
 645         rep.th.dest    = th->source;
 646         rep.th.source  = th->dest;
 647         rep.th.doff    = arg.iov[0].iov_len / 4;
 648         rep.th.seq     = htonl(seq);
 649         rep.th.ack_seq = htonl(ack);
 650         rep.th.ack     = 1;
 651         rep.th.window  = htons(win);
 652
 653 #ifdef CONFIG_TCP_MD5SIG
 654         /*
 655          * The SKB holds an imcoming packet, but may not have a valid ->sk
 656          * pointer. This is especially the case when we're dealing with a
 657          * TIME_WAIT ack, because the sk structure is long gone, and only
 658          * the tcp_timewait_sock remains. So the md5 key is stashed in that
 659          * structure, and we use it in preference.  I believe that (twsk ||
 660          * skb->sk) holds true, but we program defensively.
 661          */
 662         if (!twsk && skb->sk) {
 663                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
 664         } else if (twsk && twsk->tw_md5_keylen) {
 665                 tw_key.key = twsk->tw_md5_key;
 666                 tw_key.keylen = twsk->tw_md5_keylen;
 667                 key = &tw_key;
 668         } else
 669                 key = NULL;
 670
 671         if (key) {
 672                 int offset = (ts) ? 3 : 0;
 673
 674                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 675                                           (TCPOPT_NOP << 16) |
 676                                           (TCPOPT_MD5SIG << 8) |
 677                                           TCPOLEN_MD5SIG);
 678                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 679                 rep.th.doff = arg.iov[0].iov_len/4;
 680
 681                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
 682                                         key,
 683                                         ip_hdr(skb)->daddr,
 684                                         ip_hdr(skb)->saddr,
 685                                         &rep.th, IPPROTO_TCP,
 686                                         arg.iov[0].iov_len);
 687         }
 688 #endif
 689         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 690                                       ip_hdr(skb)->saddr, /* XXX */
 691                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 692         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 693         if (twsk)
 694                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
 695
 696         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 697
 698         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 699 }
 700
 701 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 702 {
 703         struct inet_timewait_sock *tw = inet_twsk(sk);
 704         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 705
 706         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 707                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 708                         tcptw->tw_ts_recent);
 709
 710         inet_twsk_put(tw);
 711 }
 712
 713 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
 714                                   struct request_sock *req)
 715 {
 716         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
 717                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 718                         req->ts_recent);
 719 }
 720
 721 /*
 722  *      Send a SYN-ACK after having received a SYN.
 723  *      This still operates on a request_sock only, not on a big
 724  *      socket.
 725  */
 726 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 727                                 struct dst_entry *dst)
 728 {
 729         const struct inet_request_sock *ireq = inet_rsk(req);
 730         int err = -1;
 731         struct sk_buff * skb;
 732
 733         /* First, grab a route. */
 734         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 735                 return -1;
 736
 737         skb = tcp_make_synack(sk, dst, req);
 738
 739         if (skb) {
 740                 struct tcphdr *th = tcp_hdr(skb);
 741
 742                 th->check = tcp_v4_check(skb->len,
 743                                          ireq->loc_addr,
 744                                          ireq->rmt_addr,
 745                                          csum_partial((char *)th, skb->len,
 746                                                       skb->csum));
 747
 748                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 749                                             ireq->rmt_addr,
 750                                             ireq->opt);
 751                 err = net_xmit_eval(err);
 752         }
 753
 754         dst_release(dst);
 755         return err;
 756 }
 757
 758 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
 759 {
 760         return __tcp_v4_send_synack(sk, req, NULL);
 761 }
 762
 763 /*
 764  *      IPv4 request_sock destructor.
 765  */
 766 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 767 {
 768         kfree(inet_rsk(req)->opt);
 769 }
 770
 771 #ifdef CONFIG_SYN_COOKIES
 772 static void syn_flood_warning(struct sk_buff *skb)
 773 {
 774         static unsigned long warntime;
 775
 776         if (time_after(jiffies, (warntime + HZ * 60))) {
 777                 warntime = jiffies;
 778                 printk(KERN_INFO
 779                        "possible SYN flooding on port %d. Sending cookies.\n",
 780                        ntohs(tcp_hdr(skb)->dest));
 781         }
 782 }
 783 #endif
 784
 785 /*
 786  * Save and compile IPv4 options into the request_sock if needed.
 787  */
 788 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 789                                               struct sk_buff *skb)
 790 {
 791         struct ip_options *opt = &(IPCB(skb)->opt);
 792         struct ip_options *dopt = NULL;
 793
 794         if (opt && opt->optlen) {
 795                 int opt_size = optlength(opt);
 796                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 797                 if (dopt) {
 798                         if (ip_options_echo(dopt, skb)) {
 799                                 kfree(dopt);
 800                                 dopt = NULL;
 801                         }
 802                 }
 803         }
 804         return dopt;
 805 }
 806
 807 #ifdef CONFIG_TCP_MD5SIG
 808 /*
 809  * RFC2385 MD5 checksumming requires a mapping of
 810  * IP address->MD5 Key.
 811  * We need to maintain these in the sk structure.
 812  */
 813
 814 /* Find the Key structure for an address.  */
 815 static struct tcp_md5sig_key *
 816                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 817 {
 818         struct tcp_sock *tp = tcp_sk(sk);
 819         int i;
 820
 821         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 822                 return NULL;
 823         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 824                 if (tp->md5sig_info->keys4[i].addr == addr)
 825                         return &tp->md5sig_info->keys4[i].base;
 826         }
 827         return NULL;
 828 }
 829
 830 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 831                                          struct sock *addr_sk)
 832 {
 833         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
 834 }
 835
 836 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 837
 838 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 839                                                       struct request_sock *req)
 840 {
 841         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 842 }
 843
 844 /* This can be called on a newly created socket, from other files */
 845 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 846                       u8 *newkey, u8 newkeylen)
 847 {
 848         /* Add Key to the list */
 849         struct tcp_md5sig_key *key;
 850         struct tcp_sock *tp = tcp_sk(sk);
 851         struct tcp4_md5sig_key *keys;
 852
 853         key = tcp_v4_md5_do_lookup(sk, addr);
 854         if (key) {
 855                 /* Pre-existing entry - just update that one. */
 856                 kfree(key->key);
 857                 key->key = newkey;
 858                 key->keylen = newkeylen;
 859         } else {
 860                 struct tcp_md5sig_info *md5sig;
 861
 862                 if (!tp->md5sig_info) {
 863                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 864                                                   GFP_ATOMIC);
 865                         if (!tp->md5sig_info) {
 866                                 kfree(newkey);
 867                                 return -ENOMEM;
 868                         }
 869                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 870                 }
 871                 if (tcp_alloc_md5sig_pool() == NULL) {
 872                         kfree(newkey);
 873                         return -ENOMEM;
 874                 }
 875                 md5sig = tp->md5sig_info;
 876
 877                 if (md5sig->alloced4 == md5sig->entries4) {
 878                         keys = kmalloc((sizeof(*keys) *
 879                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 880                         if (!keys) {
 881                                 kfree(newkey);
 882                                 tcp_free_md5sig_pool();
 883                                 return -ENOMEM;
 884                         }
 885
 886                         if (md5sig->entries4)
 887                                 memcpy(keys, md5sig->keys4,
 888                                        sizeof(*keys) * md5sig->entries4);
 889
 890                         /* Free old key list, and reference new one */
 891                         kfree(md5sig->keys4);
 892                         md5sig->keys4 = keys;
 893                         md5sig->alloced4++;
 894                 }
 895                 md5sig->entries4++;
 896                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 897                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 898                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 899         }
 900         return 0;
 901 }
 902
 903 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 904
 905 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 906                                u8 *newkey, u8 newkeylen)
 907 {
 908         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
 909                                  newkey, newkeylen);
 910 }
 911
 912 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 913 {
 914         struct tcp_sock *tp = tcp_sk(sk);
 915         int i;
 916
 917         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 918                 if (tp->md5sig_info->keys4[i].addr == addr) {
 919                         /* Free the key */
 920                         kfree(tp->md5sig_info->keys4[i].base.key);
 921                         tp->md5sig_info->entries4--;
 922
 923                         if (tp->md5sig_info->entries4 == 0) {
 924                                 kfree(tp->md5sig_info->keys4);
 925                                 tp->md5sig_info->keys4 = NULL;
 926                                 tp->md5sig_info->alloced4 = 0;
 927                         } else if (tp->md5sig_info->entries4 != i) {
 928                                 /* Need to do some manipulation */
 929                                 memmove(&tp->md5sig_info->keys4[i],
 930                                         &tp->md5sig_info->keys4[i+1],
 931                                         (tp->md5sig_info->entries4 - i) *
 932                                          sizeof(struct tcp4_md5sig_key));
 933                         }
 934                         tcp_free_md5sig_pool();
 935                         return 0;
 936                 }
 937         }
 938         return -ENOENT;
 939 }
 940
 941 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 942
 943 static void tcp_v4_clear_md5_list(struct sock *sk)
 944 {
 945         struct tcp_sock *tp = tcp_sk(sk);
 946
 947         /* Free each key, then the set of key keys,
 948          * the crypto element, and then decrement our
 949          * hold on the last resort crypto.
 950          */
 951         if (tp->md5sig_info->entries4) {
 952                 int i;
 953                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 954                         kfree(tp->md5sig_info->keys4[i].base.key);
 955                 tp->md5sig_info->entries4 = 0;
 956                 tcp_free_md5sig_pool();
 957         }
 958         if (tp->md5sig_info->keys4) {
 959                 kfree(tp->md5sig_info->keys4);
 960                 tp->md5sig_info->keys4 = NULL;
 961                 tp->md5sig_info->alloced4  = 0;
 962         }
 963 }
 964
 965 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 966                                  int optlen)
 967 {
 968         struct tcp_md5sig cmd;
 969         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 970         u8 *newkey;
 971
 972         if (optlen < sizeof(cmd))
 973                 return -EINVAL;
 974
 975         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 976                 return -EFAULT;
 977
 978         if (sin->sin_family != AF_INET)
 979                 return -EINVAL;
 980
 981         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
 982                 if (!tcp_sk(sk)->md5sig_info)
 983                         return -ENOENT;
 984                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
 985         }
 986
 987         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 988                 return -EINVAL;
 989
 990         if (!tcp_sk(sk)->md5sig_info) {
 991                 struct tcp_sock *tp = tcp_sk(sk);
 992                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
 993
 994                 if (!p)
 995                         return -EINVAL;
 996
 997                 tp->md5sig_info = p;
 998                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 999         }
1000
1001         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1002         if (!newkey)
1003                 return -ENOMEM;
1004         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1005                                  newkey, cmd.tcpm_keylen);
1006 }
1007
1008 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1009                                    __be32 saddr, __be32 daddr,
1010                                    struct tcphdr *th, int protocol,
1011                                    unsigned int tcplen)
1012 {
1013         struct scatterlist sg[4];
1014         __u16 data_len;
1015         int block = 0;
1016         __sum16 old_checksum;
1017         struct tcp_md5sig_pool *hp;
1018         struct tcp4_pseudohdr *bp;
1019         struct hash_desc *desc;
1020         int err;
1021         unsigned int nbytes = 0;
1022
1023         /*
1024          * Okay, so RFC2385 is turned on for this connection,
1025          * so we need to generate the MD5 hash for the packet now.
1026          */
1027
1028         hp = tcp_get_md5sig_pool();
1029         if (!hp)
1030                 goto clear_hash_noput;
1031
1032         bp = &hp->md5_blk.ip4;
1033         desc = &hp->md5_desc;
1034
1035         /*
1036          * 1. the TCP pseudo-header (in the order: source IP address,
1037          * destination IP address, zero-padded protocol number, and
1038          * segment length)
1039          */
1040         bp->saddr = saddr;
1041         bp->daddr = daddr;
1042         bp->pad = 0;
1043         bp->protocol = protocol;
1044         bp->len = htons(tcplen);
1045
1046         sg_init_table(sg, 4);
1047
1048         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1049         nbytes += sizeof(*bp);
1050
1051         /* 2. the TCP header, excluding options, and assuming a
1052          * checksum of zero/
1053          */
1054         old_checksum = th->check;
1055         th->check = 0;
1056         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1057         nbytes += sizeof(struct tcphdr);
1058
1059         /* 3. the TCP segment data (if any) */
1060         data_len = tcplen - (th->doff << 2);
1061         if (data_len > 0) {
1062                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1063                 sg_set_buf(&sg[block++], data, data_len);
1064                 nbytes += data_len;
1065         }
1066
1067         /* 4. an independently-specified key or password, known to both
1068          * TCPs and presumably connection-specific
1069          */
1070         sg_set_buf(&sg[block++], key->key, key->keylen);
1071         nbytes += key->keylen;
1072
1073         sg_mark_end(&sg[block - 1]);
1074
1075         /* Now store the Hash into the packet */
1076         err = crypto_hash_init(desc);
1077         if (err)
1078                 goto clear_hash;
1079         err = crypto_hash_update(desc, sg, nbytes);
1080         if (err)
1081                 goto clear_hash;
1082         err = crypto_hash_final(desc, md5_hash);
1083         if (err)
1084                 goto clear_hash;
1085
1086         /* Reset header, and free up the crypto */
1087         tcp_put_md5sig_pool();
1088         th->check = old_checksum;
1089
1090 out:
1091         return 0;
1092 clear_hash:
1093         tcp_put_md5sig_pool();
1094 clear_hash_noput:
1095         memset(md5_hash, 0, 16);
1096         goto out;
1097 }
1098
1099 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1100                          struct sock *sk,
1101                          struct dst_entry *dst,
1102                          struct request_sock *req,
1103                          struct tcphdr *th, int protocol,
1104                          unsigned int tcplen)
1105 {
1106         __be32 saddr, daddr;
1107
1108         if (sk) {
1109                 saddr = inet_sk(sk)->saddr;
1110                 daddr = inet_sk(sk)->daddr;
1111         } else {
1112                 struct rtable *rt = (struct rtable *)dst;
1113                 BUG_ON(!rt);
1114                 saddr = rt->rt_src;
1115                 daddr = rt->rt_dst;
1116         }
1117         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1118                                        saddr, daddr,
1119                                        th, protocol, tcplen);
1120 }
1121
1122 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1123
1124 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1125 {
1126         /*
1127          * This gets called for each TCP segment that arrives
1128          * so we want to be efficient.
1129          * We have 3 drop cases:
1130          * o No MD5 hash and one expected.
1131          * o MD5 hash and we're not expecting one.
1132          * o MD5 hash and its wrong.
1133          */
1134         __u8 *hash_location = NULL;
1135         struct tcp_md5sig_key *hash_expected;
1136         const struct iphdr *iph = ip_hdr(skb);
1137         struct tcphdr *th = tcp_hdr(skb);
1138         int length = (th->doff << 2) - sizeof(struct tcphdr);
1139         int genhash;
1140         unsigned char *ptr;
1141         unsigned char newhash[16];
1142
1143         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1144
1145         /*
1146          * If the TCP option length is less than the TCP_MD5SIG
1147          * option length, then we can shortcut
1148          */
1149         if (length < TCPOLEN_MD5SIG) {
1150                 if (hash_expected)
1151                         return 1;
1152                 else
1153                         return 0;
1154         }
1155
1156         /* Okay, we can't shortcut - we have to grub through the options */
1157         ptr = (unsigned char *)(th + 1);
1158         while (length > 0) {
1159                 int opcode = *ptr++;
1160                 int opsize;
1161
1162                 switch (opcode) {
1163                 case TCPOPT_EOL:
1164                         goto done_opts;
1165                 case TCPOPT_NOP:
1166                         length--;
1167                         continue;
1168                 default:
1169                         opsize = *ptr++;
1170                         if (opsize < 2)
1171                                 goto done_opts;
1172                         if (opsize > length)
1173                                 goto done_opts;
1174
1175                         if (opcode == TCPOPT_MD5SIG) {
1176                                 hash_location = ptr;
1177                                 goto done_opts;
1178                         }
1179                 }
1180                 ptr += opsize-2;
1181                 length -= opsize;
1182         }
1183 done_opts:
1184         /* We've parsed the options - do we have a hash? */
1185         if (!hash_expected && !hash_location)
1186                 return 0;
1187
1188         if (hash_expected && !hash_location) {
1189                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1190                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1191                                NIPQUAD(iph->saddr), ntohs(th->source),
1192                                NIPQUAD(iph->daddr), ntohs(th->dest));
1193                 return 1;
1194         }
1195
1196         if (!hash_expected && hash_location) {
1197                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1198                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1199                                NIPQUAD(iph->saddr), ntohs(th->source),
1200                                NIPQUAD(iph->daddr), ntohs(th->dest));
1201                 return 1;
1202         }
1203
1204         /* Okay, so this is hash_expected and hash_location -
1205          * so we need to calculate the checksum.
1206          */
1207         genhash = tcp_v4_do_calc_md5_hash(newhash,
1208                                           hash_expected,
1209                                           iph->saddr, iph->daddr,
1210                                           th, sk->sk_protocol,
1211                                           skb->len);
1212
1213         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1214                 if (net_ratelimit()) {
1215                         printk(KERN_INFO "MD5 Hash failed for "
1216                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1217                                NIPQUAD(iph->saddr), ntohs(th->source),
1218                                NIPQUAD(iph->daddr), ntohs(th->dest),
1219                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1220                 }
1221                 return 1;
1222         }
1223         return 0;
1224 }
1225
1226 #endif
1227
1228 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1229         .family         =       PF_INET,
1230         .obj_size       =       sizeof(struct tcp_request_sock),
1231         .rtx_syn_ack    =       tcp_v4_send_synack,
1232         .send_ack       =       tcp_v4_reqsk_send_ack,
1233         .destructor     =       tcp_v4_reqsk_destructor,
1234         .send_reset     =       tcp_v4_send_reset,
1235 };
1236
#ifdef CONFIG_TCP_MD5SIG
/* IPv4-specific request_sock hooks; only the MD5 key lookup is set. */
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup = tcp_v4_reqsk_md5_lookup,
};
#endif
1242
1243 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1244         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1245         .twsk_unique    = tcp_twsk_unique,
1246         .twsk_destructor= tcp_twsk_destructor,
1247 };
1248
1249 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1250 {
1251         struct inet_request_sock *ireq;
1252         struct tcp_options_received tmp_opt;
1253         struct request_sock *req;
1254         __be32 saddr = ip_hdr(skb)->saddr;
1255         __be32 daddr = ip_hdr(skb)->daddr;
1256         __u32 isn = TCP_SKB_CB(skb)->when;
1257         struct dst_entry *dst = NULL;
1258 #ifdef CONFIG_SYN_COOKIES
1259         int want_cookie = 0;
1260 #else
1261 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1262 #endif
1263
1264         /* Never answer to SYNs send to broadcast or multicast */
1265         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1266                 goto drop;
1267
1268         /* TW buckets are converted to open requests without
1269          * limitations, they conserve resources and peer is
1270          * evidently real one.
1271          */
1272         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1273 #ifdef CONFIG_SYN_COOKIES
1274                 if (sysctl_tcp_syncookies) {
1275                         want_cookie = 1;
1276                 } else
1277 #endif
1278                 goto drop;
1279         }
1280
1281         /* Accept backlog is full. If we have already queued enough
1282          * of warm entries in syn queue, drop request. It is better than
1283          * clogging syn queue with openreqs with exponentially increasing
1284          * timeout.
1285          */
1286         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1287                 goto drop;
1288
1289         req = reqsk_alloc(&tcp_request_sock_ops);
1290         if (!req)
1291                 goto drop;
1292
1293 #ifdef CONFIG_TCP_MD5SIG
1294         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1295 #endif
1296
1297         tcp_clear_options(&tmp_opt);
1298         tmp_opt.mss_clamp = 536;
1299         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1300
1301         tcp_parse_options(skb, &tmp_opt, 0);
1302
1303         if (want_cookie) {
1304                 tcp_clear_options(&tmp_opt);
1305                 tmp_opt.saw_tstamp = 0;
1306         }
1307
1308         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1309                 /* Some OSes (unknown ones, but I see them on web server, which
1310                  * contains information interesting only for windows'
1311                  * users) do not send their stamp in SYN. It is easy case.
1312                  * We simply do not advertise TS support.
1313                  */
1314                 tmp_opt.saw_tstamp = 0;
1315                 tmp_opt.tstamp_ok  = 0;
1316         }
1317         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1318
1319         tcp_openreq_init(req, &tmp_opt, skb);
1320
1321         if (security_inet_conn_request(sk, skb, req))
1322                 goto drop_and_free;
1323
1324         ireq = inet_rsk(req);
1325         ireq->loc_addr = daddr;
1326         ireq->rmt_addr = saddr;
1327         ireq->opt = tcp_v4_save_options(sk, skb);
1328         if (!want_cookie)
1329                 TCP_ECN_create_request(req, tcp_hdr(skb));
1330
1331         if (want_cookie) {
1332 #ifdef CONFIG_SYN_COOKIES
1333                 syn_flood_warning(skb);
1334 #endif
1335                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1336         } else if (!isn) {
1337                 struct inet_peer *peer = NULL;
1338
1339                 /* VJ's idea. We save last timestamp seen
1340                  * from the destination in peer table, when entering
1341                  * state TIME-WAIT, and check against it before
1342                  * accepting new connection request.
1343                  *
1344                  * If "isn" is not zero, this request hit alive
1345                  * timewait bucket, so that all the necessary checks
1346                  * are made in the function processing timewait state.
1347                  */
1348                 if (tmp_opt.saw_tstamp &&
1349                     tcp_death_row.sysctl_tw_recycle &&
1350                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1351                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1352                     peer->v4daddr == saddr) {
1353                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1354                             (s32)(peer->tcp_ts - req->ts_recent) >
1355                                                         TCP_PAWS_WINDOW) {
1356                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1357                                 goto drop_and_release;
1358                         }
1359                 }
1360                 /* Kill the following clause, if you dislike this way. */
1361                 else if (!sysctl_tcp_syncookies &&
1362                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1363                           (sysctl_max_syn_backlog >> 2)) &&
1364                          (!peer || !peer->tcp_ts_stamp) &&
1365                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1366                         /* Without syncookies last quarter of
1367                          * backlog is filled with destinations,
1368                          * proven to be alive.
1369                          * It means that we continue to communicate
1370                          * to destinations, already remembered
1371                          * to the moment of synflood.
1372                          */
1373                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1374                                        "request from %u.%u.%u.%u/%u\n",
1375                                        NIPQUAD(saddr),
1376                                        ntohs(tcp_hdr(skb)->source));
1377                         goto drop_and_release;
1378                 }
1379
1380                 isn = tcp_v4_init_sequence(skb);
1381         }
1382         tcp_rsk(req)->snt_isn = isn;
1383
1384         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1385                 goto drop_and_free;
1386
1387         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1388         return 0;
1389
1390 drop_and_release:
1391         dst_release(dst);
1392 drop_and_free:
1393         reqsk_free(req);
1394 drop:
1395         return 0;
1396 }
1397
1398
1399 /*
1400  * The three way handshake has completed - we got a valid synack -
1401  * now create the new socket.
1402  */
1403 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1404                                   struct request_sock *req,
1405                                   struct dst_entry *dst)
1406 {
1407         struct inet_request_sock *ireq;
1408         struct inet_sock *newinet;
1409         struct tcp_sock *newtp;
1410         struct sock *newsk;
1411 #ifdef CONFIG_TCP_MD5SIG
1412         struct tcp_md5sig_key *key;
1413 #endif
1414
1415         if (sk_acceptq_is_full(sk))
1416                 goto exit_overflow;
1417
1418         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1419                 goto exit;
1420
1421         newsk = tcp_create_openreq_child(sk, req, skb);
1422         if (!newsk)
1423                 goto exit;
1424
1425         newsk->sk_gso_type = SKB_GSO_TCPV4;
1426         sk_setup_caps(newsk, dst);
1427
1428         newtp                 = tcp_sk(newsk);
1429         newinet               = inet_sk(newsk);
1430         ireq                  = inet_rsk(req);
1431         newinet->daddr        = ireq->rmt_addr;
1432         newinet->rcv_saddr    = ireq->loc_addr;
1433         newinet->saddr        = ireq->loc_addr;
1434         newinet->opt          = ireq->opt;
1435         ireq->opt             = NULL;
1436         newinet->mc_index     = inet_iif(skb);
1437         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1438         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1439         if (newinet->opt)
1440                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1441         newinet->id = newtp->write_seq ^ jiffies;
1442
1443         tcp_mtup_init(newsk);
1444         tcp_sync_mss(newsk, dst_mtu(dst));
1445         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1446         tcp_initialize_rcv_mss(newsk);
1447
1448 #ifdef CONFIG_TCP_MD5SIG
1449         /* Copy over the MD5 key from the original socket */
1450         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1451                 /*
1452                  * We're using one, so create a matching key
1453                  * on the newsk structure. If we fail to get
1454                  * memory, then we end up not copying the key
1455                  * across. Shucks.
1456                  */
1457                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1458                 if (newkey != NULL)
1459                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1460                                           newkey, key->keylen);
1461         }
1462 #endif
1463
1464         __inet_hash_nolisten(newsk);
1465         __inet_inherit_port(sk, newsk);
1466
1467         return newsk;
1468
1469 exit_overflow:
1470         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1471 exit:
1472         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1473         dst_release(dst);
1474         return NULL;
1475 }
1476
1477 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1478 {
1479         struct tcphdr *th = tcp_hdr(skb);
1480         const struct iphdr *iph = ip_hdr(skb);
1481         struct sock *nsk;
1482         struct request_sock **prev;
1483         /* Find possible connection requests. */
1484         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1485                                                        iph->saddr, iph->daddr);
1486         if (req)
1487                 return tcp_check_req(sk, skb, req, prev);
1488
1489         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1490                         th->source, iph->daddr, th->dest, inet_iif(skb));
1491
1492         if (nsk) {
1493                 if (nsk->sk_state != TCP_TIME_WAIT) {
1494                         bh_lock_sock(nsk);
1495                         return nsk;
1496                 }
1497                 inet_twsk_put(inet_twsk(nsk));
1498                 return NULL;
1499         }
1500
1501 #ifdef CONFIG_SYN_COOKIES
1502         if (!th->rst && !th->syn && th->ack)
1503                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1504 #endif
1505         return sk;
1506 }
1507
1508 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1509 {
1510         const struct iphdr *iph = ip_hdr(skb);
1511
1512         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1513                 if (!tcp_v4_check(skb->len, iph->saddr,
1514                                   iph->daddr, skb->csum)) {
1515                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1516                         return 0;
1517                 }
1518         }
1519
1520         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1521                                        skb->len, IPPROTO_TCP, 0);
1522
1523         if (skb->len <= 76) {
1524                 return __skb_checksum_complete(skb);
1525         }
1526         return 0;
1527 }
1528
1529
1530 /* The socket must have it's spinlock held when we get
1531  * here.
1532  *
1533  * We have a potential double-lock case here, so even when
1534  * doing backlog processing we use the BH locking scheme.
1535  * This is because we cannot sleep with the original spinlock
1536  * held.
1537  */
1538 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1539 {
1540         struct sock *rsk;
1541 #ifdef CONFIG_TCP_MD5SIG
1542         /*
1543          * We really want to reject the packet as early as possible
1544          * if:
1545          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1546          *  o There is an MD5 option and we're not expecting one
1547          */
1548         if (tcp_v4_inbound_md5_hash(sk, skb))
1549                 goto discard;
1550 #endif
1551
1552         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1553                 TCP_CHECK_TIMER(sk);
1554                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1555                         rsk = sk;
1556                         goto reset;
1557                 }
1558                 TCP_CHECK_TIMER(sk);
1559                 return 0;
1560         }
1561
1562         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1563                 goto csum_err;
1564
1565         if (sk->sk_state == TCP_LISTEN) {
1566                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1567                 if (!nsk)
1568                         goto discard;
1569
1570                 if (nsk != sk) {
1571                         if (tcp_child_process(sk, nsk, skb)) {
1572                                 rsk = nsk;
1573                                 goto reset;
1574                         }
1575                         return 0;
1576                 }
1577         }
1578
1579         TCP_CHECK_TIMER(sk);
1580         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1581                 rsk = sk;
1582                 goto reset;
1583         }
1584         TCP_CHECK_TIMER(sk);
1585         return 0;
1586
1587 reset:
1588         tcp_v4_send_reset(rsk, skb);
1589 discard:
1590         kfree_skb(skb);
1591         /* Be careful here. If this function gets more complicated and
1592          * gcc suffers from register pressure on the x86, sk (in %ebx)
1593          * might be destroyed here. This current version compiles correctly,
1594          * but you have been warned.
1595          */
1596         return 0;
1597
1598 csum_err:
1599         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1600         goto discard;
1601 }
1602
1603 /*
1604  *      From tcp_input.c
1605  */
1606
/*
 * Receive entry point for one incoming IPv4 TCP segment.
 *
 * Checks the packet type and TCP header length, initialises the checksum
 * state, fills TCP_SKB_CB(skb) from the header and looks up the socket
 * for the segment's address/port tuple.
 *
 *  - Socket found and not owned by user context: the segment is either
 *    prequeued or handed to tcp_v4_do_rcv().
 *  - Socket found and owned by user context: the segment is put on the
 *    socket backlog.
 *  - Socket in TCP_TIME_WAIT: handled at do_time_wait.
 *  - No socket: handled at no_tcp_socket (count an error or send a reset).
 *
 * Returns the value of tcp_v4_do_rcv() when that is called, otherwise 0.
 */
1607 int tcp_v4_rcv(struct sk_buff *skb)
1608 {
1609         const struct iphdr *iph;
1610         struct tcphdr *th;
1611         struct sock *sk;
1612         int ret;
1613
1614         if (skb->pkt_type != PACKET_HOST)
1615                 goto discard_it;
1616
1617         /* Count it even if it's bad */
1618         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1619
1620         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1621                 goto discard_it;
1622
1623         th = tcp_hdr(skb);
1624
1625         if (th->doff < sizeof(struct tcphdr) / 4)
1626                 goto bad_packet;
1627         if (!pskb_may_pull(skb, th->doff * 4))
1628                 goto discard_it;
1629
1630         /* An explanation is required here, I think.
1631          * Packet length and doff are validated by header prediction,
1632          * provided case of th->doff==0 is eliminated.
1633          * So, we defer the checks. */
1634         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1635                 goto bad_packet;
1636
             /* Header pointers are fetched again after the pull above. */
1637         th = tcp_hdr(skb);
1638         iph = ip_hdr(skb);
1639         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1640         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1641                                     skb->len - th->doff * 4);
1642         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1643         TCP_SKB_CB(skb)->when    = 0;
1644         TCP_SKB_CB(skb)->flags   = iph->tos;
1645         TCP_SKB_CB(skb)->sacked  = 0;
1646
1647         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1648                         th->source, iph->daddr, th->dest, inet_iif(skb));
1649         if (!sk)
1650                 goto no_tcp_socket;
1651
             /* Also re-entered from do_time_wait with sk set to a listener. */
1652 process:
1653         if (sk->sk_state == TCP_TIME_WAIT)
1654                 goto do_time_wait;
1655
1656         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1657                 goto discard_and_relse;
1658         nf_reset(skb);
1659
1660         if (sk_filter(sk, skb))
1661                 goto discard_and_relse;
1662
1663         skb->dev = NULL;
1664
1665         bh_lock_sock_nested(sk);
1666         ret = 0;
1667         if (!sock_owned_by_user(sk)) {
1668 #ifdef CONFIG_NET_DMA
1669                 struct tcp_sock *tp = tcp_sk(sk);
1670                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1671                         tp->ucopy.dma_chan = get_softnet_dma();
1672                 if (tp->ucopy.dma_chan)
1673                         ret = tcp_v4_do_rcv(sk, skb);
1674                 else
1675 #endif
1676                 {
                             /*
                              * NOTE(review): despite its indentation, the
                              * tcp_v4_do_rcv() call below is the body of
                              * this if: it runs only when tcp_prequeue()
                              * returns 0.
                              */
1677                         if (!tcp_prequeue(sk, skb))
1678                         ret = tcp_v4_do_rcv(sk, skb);
1679                 }
1680         } else
1681                 sk_add_backlog(sk, skb);
1682         bh_unlock_sock(sk);
1683
1684         sock_put(sk);
1685
1686         return ret;
1687
1688 no_tcp_socket:
1689         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1690                 goto discard_it;
1691
             /*
              * bad_packet sits inside the if-body: the early header checks
              * jump straight to the error accounting, skipping the reset.
              */
1692         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1693 bad_packet:
1694                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1695         } else {
1696                 tcp_v4_send_reset(NULL, skb);
1697         }
1698
1699 discard_it:
1700         /* Discard frame. */
1701         kfree_skb(skb);
1702         return 0;
1703
1704 discard_and_relse:
1705         sock_put(sk);
1706         goto discard_it;
1707
             /* Here sk is really an inet_timewait_sock; use inet_twsk(sk). */
1708 do_time_wait:
1709         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1710                 inet_twsk_put(inet_twsk(sk));
1711                 goto discard_it;
1712         }
1713
1714         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1715                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1716                 inet_twsk_put(inet_twsk(sk));
1717                 goto discard_it;
1718         }
1719         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1720         case TCP_TW_SYN: {
1721                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1722                                                         &tcp_hashinfo,
1723                                                         iph->daddr, th->dest,
1724                                                         inet_iif(skb));
1725                 if (sk2) {
                             /* Retire the timewait entry and process via the listener. */
1726                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1727                         inet_twsk_put(inet_twsk(sk));
1728                         sk = sk2;
1729                         goto process;
1730                 }
1731                 /* Fall through to ACK */
1732         }
1733         case TCP_TW_ACK:
1734                 tcp_v4_timewait_ack(sk, skb);
1735                 break;
1736         case TCP_TW_RST:
1737                 goto no_tcp_socket;
1738         case TCP_TW_SUCCESS:;
1739         }
1740         goto discard_it;
1741 }
1742
1743 /* VJ's idea. Save last timestamp seen from this destination
1744  * and hold it at least for normal timewait interval to use for duplicate
1745  * segment detection in subsequent connections, before they enter synchronized
1746  * state.
1747  */
1748
1749 int tcp_v4_remember_stamp(struct sock *sk)
1750 {
1751         struct inet_sock *inet = inet_sk(sk);
1752         struct tcp_sock *tp = tcp_sk(sk);
1753         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1754         struct inet_peer *peer = NULL;
1755         int release_it = 0;
1756
1757         if (!rt || rt->rt_dst != inet->daddr) {
1758                 peer = inet_getpeer(inet->daddr, 1);
1759                 release_it = 1;
1760         } else {
1761                 if (!rt->peer)
1762                         rt_bind_peer(rt, 1);
1763                 peer = rt->peer;
1764         }
1765
1766         if (peer) {
1767                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1768                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1769                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1770                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1771                         peer->tcp_ts = tp->rx_opt.ts_recent;
1772                 }
1773                 if (release_it)
1774                         inet_putpeer(peer);
1775                 return 1;
1776         }
1777
1778         return 0;
1779 }
1780
1781 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1782 {
1783         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1784
1785         if (peer) {
1786                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1787
1788                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1789                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1790                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1791                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1792                         peer->tcp_ts       = tcptw->tw_ts_recent;
1793                 }
1794                 inet_putpeer(peer);
1795                 return 1;
1796         }
1797
1798         return 0;
1799 }
1800
/*
 * Address-family specific connection-socket operations for TCP over IPv4.
 * tcp_v4_init_sock() installs this table as icsk->icsk_af_ops.
 * The compat_* handlers are only present when CONFIG_COMPAT is set.
 */
1801 struct inet_connection_sock_af_ops ipv4_specific = {
1802         .queue_xmit        = ip_queue_xmit,
1803         .send_check        = tcp_v4_send_check,
1804         .rebuild_header    = inet_sk_rebuild_header,
1805         .conn_request      = tcp_v4_conn_request,
1806         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1807         .remember_stamp    = tcp_v4_remember_stamp,
1808         .net_header_len    = sizeof(struct iphdr),
1809         .setsockopt        = ip_setsockopt,
1810         .getsockopt        = ip_getsockopt,
1811         .addr2sockaddr     = inet_csk_addr2sockaddr,
1812         .sockaddr_len      = sizeof(struct sockaddr_in),
1813         .bind_conflict     = inet_csk_bind_conflict,
1814 #ifdef CONFIG_COMPAT
1815         .compat_setsockopt = compat_ip_setsockopt,
1816         .compat_getsockopt = compat_ip_getsockopt,
1817 #endif
1818 };
1819
1820 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 implementations of the TCP MD5 signature hooks.
 * tcp_v4_init_sock() installs this table as tp->af_specific.
 */
1821 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1822         .md5_lookup             = tcp_v4_md5_lookup,
1823         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1824         .md5_add                = tcp_v4_md5_add_func,
1825         .md5_parse              = tcp_v4_parse_md5_keys,
1826 };
1827 #endif
1828
1829 /* NOTE: A lot of things set to zero explicitly by call to
1830  *       sk_alloc() so need not be done here.
1831  */
1832 static int tcp_v4_init_sock(struct sock *sk)
1833 {
1834         struct inet_connection_sock *icsk = inet_csk(sk);
1835         struct tcp_sock *tp = tcp_sk(sk);
1836
1837         skb_queue_head_init(&tp->out_of_order_queue);
1838         tcp_init_xmit_timers(sk);
1839         tcp_prequeue_init(tp);
1840
1841         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1842         tp->mdev = TCP_TIMEOUT_INIT;
1843
1844         /* So many TCP implementations out there (incorrectly) count the
1845          * initial SYN frame in their delayed-ACK and congestion control
1846          * algorithms that we must have the following bandaid to talk
1847          * efficiently to them.  -DaveM
1848          */
1849         tp->snd_cwnd = 2;
1850
1851         /* See draft-stevens-tcpca-spec-01 for discussion of the
1852          * initialization of these values.
1853          */
1854         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1855         tp->snd_cwnd_clamp = ~0;
1856         tp->mss_cache = 536;
1857
1858         tp->reordering = sysctl_tcp_reordering;
1859         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1860
1861         sk->sk_state = TCP_CLOSE;
1862
1863         sk->sk_write_space = sk_stream_write_space;
1864         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1865
1866         icsk->icsk_af_ops = &ipv4_specific;
1867         icsk->icsk_sync_mss = tcp_sync_mss;
1868 #ifdef CONFIG_TCP_MD5SIG
1869         tp->af_specific = &tcp_sock_ipv4_specific;
1870 #endif
1871
1872         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1873         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1874
1875         atomic_inc(&tcp_sockets_allocated);
1876
1877         return 0;
1878 }
1879
/*
 * Protocol destroy hook for a TCP/IPv4 socket (see tcp_prot.destroy).
 *
 * Stops the transmit timers, releases congestion-control state, purges
 * every queue the socket owns and drops the resources it still holds
 * (MD5 keys, bind bucket, cached sendmsg page, deferred-accept request).
 * Decrements tcp_sockets_allocated, mirroring tcp_v4_init_sock().
 *
 * Always returns 0.
 */
1880 int tcp_v4_destroy_sock(struct sock *sk)
1881 {
1882         struct tcp_sock *tp = tcp_sk(sk);
1883
1884         tcp_clear_xmit_timers(sk);
1885
1886         tcp_cleanup_congestion_control(sk);
1887
1888         /* Clean up the write buffer. */
1889         tcp_write_queue_purge(sk);
1890
1891         /* Cleans up our, hopefully empty, out_of_order_queue. */
1892         __skb_queue_purge(&tp->out_of_order_queue);
1893
1894 #ifdef CONFIG_TCP_MD5SIG
1895         /* Clean up the MD5 key list, if any */
1896         if (tp->md5sig_info) {
1897                 tcp_v4_clear_md5_list(sk);
1898                 kfree(tp->md5sig_info);
1899                 tp->md5sig_info = NULL;
1900         }
1901 #endif
1902
1903 #ifdef CONFIG_NET_DMA
1904         /* Cleans up our sk_async_wait_queue */
1905         __skb_queue_purge(&sk->sk_async_wait_queue);
1906 #endif
1907
1908         /* Clean prequeue, it must be empty really */
1909         __skb_queue_purge(&tp->ucopy.prequeue);
1910
1911         /* Clean up a referenced TCP bind bucket. */
1912         if (inet_csk(sk)->icsk_bind_hash)
1913                 inet_put_port(sk);
1914
1915         /*
1916          * If sendmsg cached page exists, toss it.
1917          */
1918         if (sk->sk_sndmsg_page) {
1919                 __free_page(sk->sk_sndmsg_page);
1920                 sk->sk_sndmsg_page = NULL;
1921         }
1922
             /*
              * Free a pending deferred-accept request and drop one reference
              * each on its listener and on this socket.
              */
1923         if (tp->defer_tcp_accept.request) {
1924                 reqsk_free(tp->defer_tcp_accept.request);
1925                 sock_put(tp->defer_tcp_accept.listen_sk);
1926                 sock_put(sk);
1927                 tp->defer_tcp_accept.listen_sk = NULL;
1928                 tp->defer_tcp_accept.request = NULL;
1929         }
1930
1931         atomic_dec(&tcp_sockets_allocated);
1932
1933         return 0;
1934 }
1935
1936 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1937
1938 #ifdef CONFIG_PROC_FS
1939 /* Proc filesystem TCP sock list dumping. */
1940
1941 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1942 {
1943         return hlist_empty(head) ? NULL :
1944                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1945 }
1946
1947 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1948 {
1949         return tw->tw_node.next ?
1950                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1951 }
1952
1953 static void *listening_get_next(struct seq_file *seq, void *cur)
1954 {
1955         struct inet_connection_sock *icsk;
1956         struct hlist_node *node;
1957         struct sock *sk = cur;
1958         struct tcp_iter_state* st = seq->private;
1959         struct net *net = st->net;
1960
1961         if (!sk) {
1962                 st->bucket = 0;
1963                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1964                 goto get_sk;
1965         }
1966
1967         ++st->num;
1968
1969         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1970                 struct request_sock *req = cur;
1971
1972                 icsk = inet_csk(st->syn_wait_sk);
1973                 req = req->dl_next;
1974                 while (1) {
1975                         while (req) {
1976                                 if (req->rsk_ops->family == st->family &&
1977                                     net_eq(sock_net(req->sk), net)) {
1978                                         cur = req;
1979                                         goto out;
1980                                 }
1981                                 req = req->dl_next;
1982                         }
1983                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1984                                 break;
1985 get_req:
1986                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1987                 }
1988                 sk        = sk_next(st->syn_wait_sk);
1989                 st->state = TCP_SEQ_STATE_LISTENING;
1990                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1991         } else {
1992                 icsk = inet_csk(sk);
1993                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1994                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1995                         goto start_req;
1996                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1997                 sk = sk_next(sk);
1998         }
1999 get_sk:
2000         sk_for_each_from(sk, node) {
2001                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2002                         cur = sk;
2003                         goto out;
2004                 }
2005                 icsk = inet_csk(sk);
2006                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2007                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2008 start_req:
2009                         st->uid         = sock_i_uid(sk);
2010                         st->syn_wait_sk = sk;
2011                         st->state       = TCP_SEQ_STATE_OPENREQ;
2012                         st->sbucket     = 0;
2013                         goto get_req;
2014                 }
2015                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2016         }
2017         if (++st->bucket < INET_LHTABLE_SIZE) {
2018                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2019                 goto get_sk;
2020         }
2021         cur = NULL;
2022 out:
2023         return cur;
2024 }
2025
2026 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2027 {
2028         void *rc = listening_get_next(seq, NULL);
2029
2030         while (rc && *pos) {
2031                 rc = listening_get_next(seq, rc);
2032                 --*pos;
2033         }
2034         return rc;
2035 }
2036
/*
 * Find the first entry of the established-hash part of the /proc walk.
 *
 * Scans buckets from 0.  In each bucket the established chain is checked
 * first, then the TIME_WAIT chain, for an entry matching st->family and
 * st->net.
 *
 * Returns the entry found (a struct sock, or a struct
 * inet_timewait_sock with st->state set to TCP_SEQ_STATE_TIME_WAIT), or
 * NULL if no bucket holds a match.  On a non-NULL return the lock of
 * bucket st->bucket is still read-held; established_get_next() or
 * tcp_seq_stop() releases it.
 */
2037 static void *established_get_first(struct seq_file *seq)
2038 {
2039         struct tcp_iter_state* st = seq->private;
2040         struct net *net = st->net;
2041         void *rc = NULL;
2042
2043         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2044                 struct sock *sk;
2045                 struct hlist_node *node;
2046                 struct inet_timewait_sock *tw;
2047                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2048
2049                 read_lock_bh(lock);
2050                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2051                         if (sk->sk_family != st->family ||
2052                             !net_eq(sock_net(sk), net)) {
2053                                 continue;
2054                         }
2055                         rc = sk;
2056                         goto out;
2057                 }
2058                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2059                 inet_twsk_for_each(tw, node,
2060                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2061                         if (tw->tw_family != st->family ||
2062                             !net_eq(twsk_net(tw), net)) {
2063                                 continue;
2064                         }
2065                         rc = tw;
2066                         goto out;
2067                 }
2068                 read_unlock_bh(lock);
                     /* Nothing in this bucket: reset the state for the next one. */
2069                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2070         }
2071 out:
2072         return rc;
2073 }
2074
/*
 * Advance the established-hash part of the /proc walk by one entry.
 *
 * @cur is the entry returned previously: a struct inet_timewait_sock
 * when st->state is TCP_SEQ_STATE_TIME_WAIT, a struct sock otherwise.
 * The lock of bucket st->bucket is held on entry.
 *
 * Within a bucket the established chain is walked before the TIME_WAIT
 * chain.  When the TIME_WAIT chain runs out the bucket lock is dropped
 * and the next bucket is locked.  Returns the next matching entry with
 * its bucket lock held, or NULL (no lock held) after the last bucket.
 */
2075 static void *established_get_next(struct seq_file *seq, void *cur)
2076 {
2077         struct sock *sk = cur;
2078         struct inet_timewait_sock *tw;
2079         struct hlist_node *node;
2080         struct tcp_iter_state* st = seq->private;
2081         struct net *net = st->net;
2082
2083         ++st->num;
2084
2085         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2086                 tw = cur;
2087                 tw = tw_next(tw);
2088 get_tw:
2089                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2090                         tw = tw_next(tw);
2091                 }
2092                 if (tw) {
2093                         cur = tw;
2094                         goto out;
2095                 }
                     /* TIME_WAIT chain done: move on to the next bucket. */
2096                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2097                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2098
2099                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2100                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2101                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2102                 } else {
2103                         cur = NULL;
2104                         goto out;
2105                 }
2106         } else
2107                 sk = sk_next(sk);
2108
2109         sk_for_each_from(sk, node) {
2110                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2111                         goto found;
2112         }
2113
             /* Established chain done: continue with this bucket's TIME_WAIT chain. */
2114         st->state = TCP_SEQ_STATE_TIME_WAIT;
2115         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2116         goto get_tw;
2117 found:
2118         cur = sk;
2119 out:
2120         return cur;
2121 }
2122
2123 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2124 {
2125         void *rc = established_get_first(seq);
2126
2127         while (rc && pos) {
2128                 rc = established_get_next(seq, rc);
2129                 --pos;
2130         }
2131         return rc;
2132 }
2133
2134 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2135 {
2136         void *rc;
2137         struct tcp_iter_state* st = seq->private;
2138
2139         inet_listen_lock(&tcp_hashinfo);
2140         st->state = TCP_SEQ_STATE_LISTENING;
2141         rc        = listening_get_idx(seq, &pos);
2142
2143         if (!rc) {
2144                 inet_listen_unlock(&tcp_hashinfo);
2145                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2146                 rc        = established_get_idx(seq, pos);
2147         }
2148
2149         return rc;
2150 }
2151
2152 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2153 {
2154         struct tcp_iter_state* st = seq->private;
2155         st->state = TCP_SEQ_STATE_LISTENING;
2156         st->num = 0;
2157         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2158 }
2159
2160 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2161 {
2162         void *rc = NULL;
2163         struct tcp_iter_state* st;
2164
2165         if (v == SEQ_START_TOKEN) {
2166                 rc = tcp_get_idx(seq, 0);
2167                 goto out;
2168         }
2169         st = seq->private;
2170
2171         switch (st->state) {
2172         case TCP_SEQ_STATE_OPENREQ:
2173         case TCP_SEQ_STATE_LISTENING:
2174                 rc = listening_get_next(seq, v);
2175                 if (!rc) {
2176                         inet_listen_unlock(&tcp_hashinfo);
2177                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2178                         rc        = established_get_first(seq);
2179                 }
2180                 break;
2181         case TCP_SEQ_STATE_ESTABLISHED:
2182         case TCP_SEQ_STATE_TIME_WAIT:
2183                 rc = established_get_next(seq, v);
2184                 break;
2185         }
2186 out:
2187         ++*pos;
2188         return rc;
2189 }
2190
/*
 * seq_file ->stop: release whichever locks the walk still holds, as
 * indicated by st->state and the last entry v.
 */
2191 static void tcp_seq_stop(struct seq_file *seq, void *v)
2192 {
2193         struct tcp_iter_state* st = seq->private;
2194
2195         switch (st->state) {
2196         case TCP_SEQ_STATE_OPENREQ:
                     /* Stopped on a request: its listener's syn_wait_lock is held. */
2197                 if (v) {
2198                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2199                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2200                 }
                     /* Fall through: the listening lock is held as well. */
2201         case TCP_SEQ_STATE_LISTENING:
                     /* The header token is returned without taking any lock. */
2202                 if (v != SEQ_START_TOKEN)
2203                         inet_listen_unlock(&tcp_hashinfo);
2204                 break;
2205         case TCP_SEQ_STATE_TIME_WAIT:
2206         case TCP_SEQ_STATE_ESTABLISHED:
                     /* A non-NULL entry means bucket st->bucket is still locked. */
2207                 if (v)
2208                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2209                 break;
2210         }
2211 }
2212
2213 static int tcp_seq_open(struct inode *inode, struct file *file)
2214 {
2215         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2216         struct seq_file *seq;
2217         struct tcp_iter_state *s;
2218         struct net *net;
2219         int rc;
2220
2221         if (unlikely(afinfo == NULL))
2222                 return -EINVAL;
2223
2224         s = kzalloc(sizeof(*s), GFP_KERNEL);
2225         if (!s)
2226                 return -ENOMEM;
2227
2228         rc = -ENXIO;
2229         net = get_proc_net(inode);
2230         if (!net)
2231                 goto out_kfree;
2232
2233         s->family               = afinfo->family;
2234         s->seq_ops.start        = tcp_seq_start;
2235         s->seq_ops.next         = tcp_seq_next;
2236         s->seq_ops.show         = afinfo->seq_show;
2237         s->seq_ops.stop         = tcp_seq_stop;
2238         s->net                  = net;
2239
2240         rc = seq_open(file, &s->seq_ops);
2241         if (rc)
2242                 goto out_put_net;
2243         seq = file->private_data;
2244         seq->private = s;
2245 out:
2246         return rc;
2247 out_put_net:
2248         put_net(net);
2249 out_kfree:
2250         kfree(s);
2251         goto out;
2252 }
2253
2254 static int tcp_seq_release(struct inode *inode, struct file *file)
2255 {
2256         struct seq_file *seq = file->private_data;
2257         struct tcp_iter_state *s = seq->private;
2258
2259         put_net(s->net);
2260         seq_release_private(inode, file);
2261         return 0;
2262 }
2263
2264 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2265 {
2266         int rc = 0;
2267         struct proc_dir_entry *p;
2268
2269         if (!afinfo)
2270                 return -EINVAL;
2271         afinfo->seq_fops->owner         = afinfo->owner;
2272         afinfo->seq_fops->open          = tcp_seq_open;
2273         afinfo->seq_fops->read          = seq_read;
2274         afinfo->seq_fops->llseek        = seq_lseek;
2275         afinfo->seq_fops->release       = tcp_seq_release;
2276
2277         p = proc_net_fops_create(net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2278         if (p)
2279                 p->data = afinfo;
2280         else
2281                 rc = -ENOMEM;
2282         return rc;
2283 }
2284
2285 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2286 {
2287         if (!afinfo)
2288                 return;
2289         proc_net_remove(net, afinfo->name);
2290         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2291 }
2292
2293 static void get_openreq4(struct sock *sk, struct request_sock *req,
2294                          char *tmpbuf, int i, int uid)
2295 {
2296         const struct inet_request_sock *ireq = inet_rsk(req);
2297         int ttd = req->expires - jiffies;
2298
2299         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2300                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2301                 i,
2302                 ireq->loc_addr,
2303                 ntohs(inet_sk(sk)->sport),
2304                 ireq->rmt_addr,
2305                 ntohs(ireq->rmt_port),
2306                 TCP_SYN_RECV,
2307                 0, 0, /* could print option size, but that is af dependent. */
2308                 1,    /* timers active (only the expire timer) */
2309                 jiffies_to_clock_t(ttd),
2310                 req->retrans,
2311                 uid,
2312                 0,  /* non standard timer */
2313                 0, /* open_requests have no inode */
2314                 atomic_read(&sk->sk_refcnt),
2315                 req);
2316 }
2317
2318 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2319 {
2320         int timer_active;
2321         unsigned long timer_expires;
2322         struct tcp_sock *tp = tcp_sk(sk);
2323         const struct inet_connection_sock *icsk = inet_csk(sk);
2324         struct inet_sock *inet = inet_sk(sk);
2325         __be32 dest = inet->daddr;
2326         __be32 src = inet->rcv_saddr;
2327         __u16 destp = ntohs(inet->dport);
2328         __u16 srcp = ntohs(inet->sport);
2329
2330         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2331                 timer_active    = 1;
2332                 timer_expires   = icsk->icsk_timeout;
2333         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2334                 timer_active    = 4;
2335                 timer_expires   = icsk->icsk_timeout;
2336         } else if (timer_pending(&sk->sk_timer)) {
2337                 timer_active    = 2;
2338                 timer_expires   = sk->sk_timer.expires;
2339         } else {
2340                 timer_active    = 0;
2341                 timer_expires = jiffies;
2342         }
2343
2344         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2345                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2346                 i, src, srcp, dest, destp, sk->sk_state,
2347                 tp->write_seq - tp->snd_una,
2348                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2349                                              (tp->rcv_nxt - tp->copied_seq),
2350                 timer_active,
2351                 jiffies_to_clock_t(timer_expires - jiffies),
2352                 icsk->icsk_retransmits,
2353                 sock_i_uid(sk),
2354                 icsk->icsk_probes_out,
2355                 sock_i_ino(sk),
2356                 atomic_read(&sk->sk_refcnt), sk,
2357                 icsk->icsk_rto,
2358                 icsk->icsk_ack.ato,
2359                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2360                 tp->snd_cwnd,
2361                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2362 }
2363
2364 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2365                                char *tmpbuf, int i)
2366 {
2367         __be32 dest, src;
2368         __u16 destp, srcp;
2369         int ttd = tw->tw_ttd - jiffies;
2370
2371         if (ttd < 0)
2372                 ttd = 0;
2373
2374         dest  = tw->tw_daddr;
2375         src   = tw->tw_rcv_saddr;
2376         destp = ntohs(tw->tw_dport);
2377         srcp  = ntohs(tw->tw_sport);
2378
2379         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2380                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2381                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2382                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2383                 atomic_read(&tw->tw_refcnt), tw);
2384 }
2385
2386 #define TMPSZ 150
2387
2388 static int tcp4_seq_show(struct seq_file *seq, void *v)
2389 {
2390         struct tcp_iter_state* st;
2391         char tmpbuf[TMPSZ + 1];
2392
2393         if (v == SEQ_START_TOKEN) {
2394                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2395                            "  sl  local_address rem_address   st tx_queue "
2396                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2397                            "inode");
2398                 goto out;
2399         }
2400         st = seq->private;
2401
2402         switch (st->state) {
2403         case TCP_SEQ_STATE_LISTENING:
2404         case TCP_SEQ_STATE_ESTABLISHED:
2405                 get_tcp4_sock(v, tmpbuf, st->num);
2406                 break;
2407         case TCP_SEQ_STATE_OPENREQ:
2408                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2409                 break;
2410         case TCP_SEQ_STATE_TIME_WAIT:
2411                 get_timewait4_sock(v, tmpbuf, st->num);
2412                 break;
2413         }
2414         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2415 out:
2416         return 0;
2417 }
2418
/*
 * Description of the IPv4 "tcp" /proc/net file.  tcp4_seq_fops starts
 * out empty; tcp_proc_register() fills it in and tcp_proc_unregister()
 * clears it again.
 */
2419 static struct file_operations tcp4_seq_fops;
2420 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2421         .owner          = THIS_MODULE,
2422         .name           = "tcp",
2423         .family         = AF_INET,
2424         .seq_show       = tcp4_seq_show,
2425         .seq_fops       = &tcp4_seq_fops,
2426 };
2427
/* Per-namespace init: register the IPv4 "tcp" proc file in net. */
2428 static int tcp4_proc_init_net(struct net *net)
2429 {
2430         return tcp_proc_register(net, &tcp4_seq_afinfo);
2431 }
2432
/* Per-namespace exit: remove the IPv4 "tcp" proc file from net. */
2433 static void tcp4_proc_exit_net(struct net *net)
2434 {
2435         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2436 }
2437
/* Per-namespace hooks that create and remove the IPv4 "tcp" proc file. */
2438 static struct pernet_operations tcp4_net_ops = {
2439         .init = tcp4_proc_init_net,
2440         .exit = tcp4_proc_exit_net,
2441 };
2442
/*
 * Register the per-namespace proc hooks.
 * Returns the result of register_pernet_subsys().
 */
2443 int __init tcp4_proc_init(void)
2444 {
2445         return register_pernet_subsys(&tcp4_net_ops);
2446 }
2447
/* Unregister the per-namespace proc hooks set up by tcp4_proc_init(). */
2448 void tcp4_proc_exit(void)
2449 {
2450         unregister_pernet_subsys(&tcp4_net_ops);
2451 }
2452 #endif /* CONFIG_PROC_FS */
2453
2454 DEFINE_PROTO_INUSE(tcp)
2455
/*
 * Protocol operations and accounting hooks for TCP over IPv4.
 * Socket setup and teardown go through tcp_v4_init_sock() and
 * tcp_v4_destroy_sock(); segments queued on the socket backlog are
 * processed by tcp_v4_do_rcv().
 */
2456 struct proto tcp_prot = {
2457         .name                   = "TCP",
2458         .owner                  = THIS_MODULE,
2459         .close                  = tcp_close,
2460         .connect                = tcp_v4_connect,
2461         .disconnect             = tcp_disconnect,
2462         .accept                 = inet_csk_accept,
2463         .ioctl                  = tcp_ioctl,
2464         .init                   = tcp_v4_init_sock,
2465         .destroy                = tcp_v4_destroy_sock,
2466         .shutdown               = tcp_shutdown,
2467         .setsockopt             = tcp_setsockopt,
2468         .getsockopt             = tcp_getsockopt,
2469         .recvmsg                = tcp_recvmsg,
2470         .backlog_rcv            = tcp_v4_do_rcv,
2471         .hash                   = inet_hash,
2472         .unhash                 = inet_unhash,
2473         .get_port               = inet_csk_get_port,
2474         .enter_memory_pressure  = tcp_enter_memory_pressure,
2475         .sockets_allocated      = &tcp_sockets_allocated,
2476         .orphan_count           = &tcp_orphan_count,
2477         .memory_allocated       = &tcp_memory_allocated,
2478         .memory_pressure        = &tcp_memory_pressure,
2479         .sysctl_mem             = sysctl_tcp_mem,
2480         .sysctl_wmem            = sysctl_tcp_wmem,
2481         .sysctl_rmem            = sysctl_tcp_rmem,
2482         .max_header             = MAX_TCP_HEADER,
2483         .obj_size               = sizeof(struct tcp_sock),
2484         .twsk_prot              = &tcp_timewait_sock_ops,
2485         .rsk_prot               = &tcp_request_sock_ops,
2486         .h.hashinfo             = &tcp_hashinfo,
2487 #ifdef CONFIG_COMPAT
2488         .compat_setsockopt      = compat_tcp_setsockopt,
2489         .compat_getsockopt      = compat_tcp_getsockopt,
2490 #endif
2491         REF_PROTO_INUSE(tcp)
2492 };
2493
2494 void __init tcp_v4_init(void)
2495 {
             /*
              * Create the TCP control socket in tcp_socket, requesting
              * PF_INET / SOCK_RAW / IPPROTO_TCP.  A negative return from
              * inet_csk_ctl_sock_create() is treated as fatal: there is no
              * error return path, the function calls panic() instead.
              */
2496         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2497                                      IPPROTO_TCP) < 0)
2498                 panic("Failed to create the TCP control socket.\n");
2499 }
2500
2501 EXPORT_SYMBOL(ipv4_specific);      /* start of this file's export list */
2502 EXPORT_SYMBOL(tcp_hashinfo);
2503 EXPORT_SYMBOL(tcp_prot);
2504 EXPORT_SYMBOL(tcp_v4_conn_request);
2505 EXPORT_SYMBOL(tcp_v4_connect);
2506 EXPORT_SYMBOL(tcp_v4_do_rcv);
2507 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2508 EXPORT_SYMBOL(tcp_v4_send_check);
2509 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2510
     /*
      * The proc registration helpers are exported only when CONFIG_PROC_FS
      * is defined.
      */
2511 #ifdef CONFIG_PROC_FS
2512 EXPORT_SYMBOL(tcp_proc_register);
2513 EXPORT_SYMBOL(tcp_proc_unregister);
2514 #endif
2515 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2516