/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support the IPV6_V6ONLY socket option,
 *      Alexey Kuznetsov                which allows both IPv4 and IPv6 sockets
 *                                      to bind to a single port at the same
 *                                      time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   unsigned int tcplen);
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}
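
/*
 * Illustrative sketch, disabled from the build: conceptually, the ISN
 * returned above is a keyed hash of the connection 4-tuple plus a clock
 * component, so that sequence spaces of successive connections on the
 * same tuple do not collide. The helper and "example_isn_secret" below
 * are hypothetical; the real secure_tcp_sequence_number() lives in
 * drivers/char/random.c and uses a stronger construction.
 */
#if 0
static u32 example_isn(__be32 saddr, __be32 daddr,
                       __be16 sport, __be16 dport)
{
        extern u32 example_isn_secret;  /* made-up boot-time secret */
        u32 hash;

        /* jhash_3words() is a real helper from <linux/jhash.h>. */
        hash = jhash_3words((__force u32)saddr, (__force u32)daddr,
                            ((u32)ntohs(sport) << 16) | ntohs(dport),
                            example_isn_secret);
        return hash + (tcp_time_stamp << 6);    /* clock component */
}
#endif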

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's, only the timestamp cache is
           held not per host, but per port pair, and the TW bucket is used
           as the state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
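
/*
 * Illustrative sketch, disabled from the build: the reuse rule above,
 * pulled out as a standalone predicate. A TIME-WAIT socket may be taken
 * over by a new connection when it has a recent timestamp and either the
 * caller passed twp == NULL or tcp_tw_reuse is enabled and more than one
 * second has passed since the last timestamp was seen.
 */
#if 0
static int example_tw_reuse_ok(const struct tcp_timewait_sock *tcptw,
                               const void *twp)
{
        return tcptw->tw_ts_recent_stamp &&
               (twp == NULL ||
                (sysctl_tcp_tw_reuse &&
                 get_seconds() - tcptw->tw_ts_recent_stamp > 1));
}
#endif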

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk, 1);
        if (tmp < 0) {
                if (tmp == -ENETUNREACH)
                        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return tmp;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table, when entering
                 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
                 * when trying a new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set the state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the hash
         * tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst entry if pmtu discovery is forbidden
         * on this route. We just assume that no packet-too-big packets
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * for the case that this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet, because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
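
/*
 * Illustrative sketch, disabled from the build: the relationship
 * tcp_sync_mss() works from. Ignoring IP and TCP options and MTU probing
 * state, the MSS that fits a given IPv4 path MTU is roughly the MTU minus
 * the two fixed headers.
 */
#if 0
static int example_mss_for_mtu(int mtu)
{
        /* 20-byte IPv4 header + 20-byte TCP header; the real code also
         * subtracts option space and extension headers. */
        return mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
}
#endif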

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped,
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
                        iph->saddr, th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen normally.
                               It can, e.g., if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
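
/*
 * Illustrative sketch, disabled from the build: the "icmp type << 8 |
 * icmp code" convention mentioned in the comment above tcp_v4_err(),
 * shown as a pair of hypothetical pack/unpack helpers.
 */
#if 0
static int example_pack_icmp_err(int type, int code)
{
        return (type << 8) | code;
}

static void example_unpack_icmp_err(int err, int *type, int *code)
{
        *type = err >> 8;
        *code = err & 0xff;
}
#endif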

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(len, inet->saddr,
                                          inet->daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        th->check = 0;
        th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
        skb->ip_summed = CHECKSUM_PARTIAL;
        return 0;
}
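
/*
 * Illustrative sketch, disabled from the build: the IPv4 pseudo-header
 * that tcp_v4_check()/csum_tcpudp_nofold() fold into the TCP checksum
 * above. It is never transmitted; it only binds the checksum to the
 * addresses, protocol and segment length. The kernel's own layout for
 * this is struct tcp4_pseudohdr, used by the MD5 code further down.
 */
#if 0
struct example_tcp4_pseudohdr {
        __be32  saddr;          /* source IPv4 address */
        __be32  daddr;          /* destination IPv4 address */
        __u8    pad;            /* always zero */
        __u8    protocol;       /* IPPROTO_TCP */
        __be16  len;            /* TCP header + payload length */
};
#endif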

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (skb->rtable->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
                                        key,
                                        ip_hdr(skb)->daddr,
                                        ip_hdr(skb)->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
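
/*
 * Illustrative sketch, disabled from the build: the ack_seq arithmetic
 * used above when resetting a segment that carried no ACK. SYN and FIN
 * each occupy one sequence number, so the RST acknowledges everything
 * the offending segment consumed.
 */
#if 0
static u32 example_rst_ack_seq(const struct tcphdr *th, unsigned int skb_len)
{
        unsigned int data_len = skb_len - (th->doff << 2);

        return ntohl(th->seq) + th->syn + th->fin + data_len;
}
#endif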

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        struct tcp_md5sig_key tw_key;
#endif

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        /*
         * The SKB holds an incoming packet, but may not have a valid ->sk
         * pointer. This is especially the case when we're dealing with a
         * TIME_WAIT ack, because the sk structure is long gone, and only
         * the tcp_timewait_sock remains. So the md5 key is stashed in that
         * structure, and we use it in preference.  I believe that (twsk ||
         * skb->sk) holds true, but we program defensively.
         */
        if (!twsk && skb->sk) {
                key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
        } else if (twsk && twsk->tw_md5_keylen) {
                tw_key.key = twsk->tw_md5_key;
                tw_key.keylen = twsk->tw_md5_keylen;
                key = &tw_key;
        } else
                key = NULL;

        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
                                        key,
                                        ip_hdr(skb)->daddr,
                                        ip_hdr(skb)->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (twsk)
                arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;

        ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
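
/*
 * Illustrative sketch, disabled from the build: how the 32-bit option
 * words built with htonl() above decompose on the wire. One word carries
 * four option bytes, most significant byte first.
 */
#if 0
static void example_unpack_opt_word(__be32 word, u8 bytes[4])
{
        u32 v = ntohl(word);

        bytes[0] = v >> 24;     /* e.g. TCPOPT_NOP */
        bytes[1] = v >> 16;     /* e.g. TCPOPT_NOP */
        bytes[2] = v >> 8;      /* option kind, e.g. TCPOPT_TIMESTAMP */
        bytes[3] = v;           /* option length, e.g. TCPOLEN_TIMESTAMP */
}
#endif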

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                                struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = tcp_hdr(skb);

                th->check = tcp_v4_check(skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        dst_release(dst);
        return err;
}

static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
        return __tcp_v4_send_synack(sk, req, NULL);
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(tcp_hdr(skb)->dest));
        }
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return &tp->md5sig_info->keys4[i].base;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                }
                if (tcp_alloc_md5sig_pool() == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }
                md5sig = tp->md5sig_info;

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free old key list, and reference new one */
                        kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
        }
        return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].base.key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                                tp->md5sig_info->alloced4 = 0;
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Need to do some manipulation */
                                memmove(&tp->md5sig_info->keys4[i],
                                        &tp->md5sig_info->keys4[i+1],
                                        (tp->md5sig_info->entries4 - i) *
                                         sizeof(struct tcp4_md5sig_key));
                        }
                        tcp_free_md5sig_pool();
                        return 0;
                }
        }
        return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the set of keys,
         * the crypto element, and then decrement our
         * hold on the last resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].base.key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4  = 0;
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

                if (!p)
                        return -ENOMEM;

                tp->md5sig_info = p;
                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}

static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   unsigned int tcplen)
{
        struct scatterlist sg[4];
        __u16 data_len;
        int block = 0;
        __sum16 old_checksum;
        struct tcp_md5sig_pool *hp;
        struct tcp4_pseudohdr *bp;
        struct hash_desc *desc;
        int err;
        unsigned int nbytes = 0;

        /*
         * Okay, so RFC2385 is turned on for this connection,
         * so we need to generate the MD5 hash for the packet now.
         */

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;

        bp = &hp->md5_blk.ip4;
        desc = &hp->md5_desc;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = protocol;
        bp->len = htons(tcplen);

        sg_init_table(sg, 4);

        sg_set_buf(&sg[block++], bp, sizeof(*bp));
        nbytes += sizeof(*bp);

        /* 2. the TCP header, excluding options, and assuming a
         * checksum of zero.
         */
        old_checksum = th->check;
        th->check = 0;
        sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
        nbytes += sizeof(struct tcphdr);

        /* 3. the TCP segment data (if any) */
        data_len = tcplen - (th->doff << 2);
        if (data_len > 0) {
                unsigned char *data = (unsigned char *)th + (th->doff << 2);
                sg_set_buf(&sg[block++], data, data_len);
                nbytes += data_len;
        }

        /* 4. an independently-specified key or password, known to both
         * TCPs and presumably connection-specific
         */
        sg_set_buf(&sg[block++], key->key, key->keylen);
        nbytes += key->keylen;

        sg_mark_end(&sg[block - 1]);

        /* Now store the hash into the packet */
        err = crypto_hash_init(desc);
        if (err)
                goto clear_hash;
        err = crypto_hash_update(desc, sg, nbytes);
        if (err)
                goto clear_hash;
        err = crypto_hash_final(desc, md5_hash);
        if (err)
                goto clear_hash;

        /* Reset header, and free up the crypto */
        tcp_put_md5sig_pool();
        th->check = old_checksum;

out:
        return 0;
clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        goto out;
}
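
/*
 * Illustrative sketch, disabled from the build: the byte sequence the
 * function above feeds into MD5, per RFC 2385, flattened into one linear
 * buffer instead of a scatterlist. "buf" is assumed large enough; the
 * helper is hypothetical and only documents the hashing order.
 */
#if 0
static void example_md5_input_order(const struct tcp4_pseudohdr *bp,
                                    const struct tcphdr *th,
                                    unsigned int tcplen,
                                    const struct tcp_md5sig_key *key,
                                    u8 *buf)
{
        unsigned int data_len = tcplen - (th->doff << 2);
        u8 *p = buf;

        memcpy(p, bp, sizeof(*bp));     /* 1. pseudo-header */
        p += sizeof(*bp);
        memcpy(p, th, sizeof(*th));     /* 2. bare header, check == 0 */
        p += sizeof(*th);
        memcpy(p, (const u8 *)th + (th->doff << 2), data_len);
        p += data_len;                  /* 3. payload, if any */
        memcpy(p, key->key, key->keylen);       /* 4. the shared key */
}
#endif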

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                         struct sock *sk,
                         struct dst_entry *dst,
                         struct request_sock *req,
                         struct tcphdr *th, int protocol,
                         unsigned int tcplen)
{
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->saddr;
                daddr = inet_sk(sk)->daddr;
        } else {
                struct rtable *rt = (struct rtable *)dst;
                BUG_ON(!rt);
                saddr = rt->rt_src;
                daddr = rt->rt_dst;
        }
        return tcp_v4_do_calc_md5_hash(md5_hash, key,
                                       saddr, daddr,
                                       th, protocol, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives,
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);
        int length = (th->doff << 2) - sizeof(struct tcphdr);
        int genhash;
        unsigned char *ptr;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);

        /*
         * If the TCP option length is less than the TCP_MD5SIG
         * option length, then we can shortcut.
         */
        if (length < TCPOLEN_MD5SIG) {
                if (hash_expected)
                        return 1;
                else
                        return 0;
        }

        /* Okay, we can't shortcut - we have to grub through the options */
        ptr = (unsigned char *)(th + 1);
        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        goto done_opts;
                case TCPOPT_NOP:
                        length--;
                        continue;
                default:
                        opsize = *ptr++;
                        if (opsize < 2)
                                goto done_opts;
                        if (opsize > length)
                                goto done_opts;

                        if (opcode == TCPOPT_MD5SIG) {
                                hash_location = ptr;
                                goto done_opts;
                        }
                }
                ptr += opsize-2;
                length -= opsize;
        }
done_opts:
        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        if (!hash_expected && hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_do_calc_md5_hash(newhash,
                                          hash_expected,
                                          iph->saddr, iph->daddr,
                                          th, sk->sk_protocol,
                                          skb->len);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
                }
                return 1;
        }
        return 0;
}
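
/*
 * Illustrative sketch, disabled from the build: the decision table the
 * function above implements, reduced to a single hypothetical predicate.
 * Drop (return 1) when exactly one side has a key, or when both do but
 * the hashes disagree.
 */
#if 0
static int example_md5_verdict(int have_expected, int have_location,
                               int hashes_match)
{
        if (!have_expected && !have_location)
                return 0;               /* no MD5 in play */
        if (!have_expected || !have_location)
                return 1;               /* one-sided: drop */
        return hashes_match ? 0 : 1;    /* both sides: compare */
}
#endif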

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_send_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size   = sizeof(struct tcp_timewait_sock),
        .twsk_unique     = tcp_twsk_unique,
        .twsk_destructor = tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses. */
        if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations; they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * warm entries in the syn queue, drop the request. It is better
         * than clogging the syn queue with openreqs with exponentially
         * increasing timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on web servers,
                 * which contain information interesting only for windows'
                 * users) do not send their stamp in SYN. It is an easy case.
                 * We simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
                req->cookie_ts = tmp_opt.tstamp_ok;
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save the last timestamp seen
                 * from the destination in the peer table, when entering
                 * TIME-WAIT state, and check against it before
                 * accepting a new connection request.
                 *
                 * If "isn" is not zero, this request hit an alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies the last quarter of the
                         * backlog is filled with destinations
                         * proven to be alive.
                         * It means that we continue to communicate
                         * with destinations already remembered
                         * at the moment of synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from " NIPQUAD_FMT "/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
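
/*
 * Illustrative sketch, disabled from the build: the "last quarter"
 * heuristic applied above when syncookies are off. Once fewer than a
 * quarter of the SYN backlog slots remain, new requests from unproven
 * peers are dropped in favour of destinations already seen alive.
 */
#if 0
static int example_syn_backlog_pressure(const struct sock *sk)
{
        return sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
               (sysctl_max_syn_backlog >> 2);
}
#endif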
1393
1394
1395 /*
1396  * The three way handshake has completed - we got a valid synack -
1397  * now create the new socket.
1398  */
1399 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1400                                   struct request_sock *req,
1401                                   struct dst_entry *dst)
1402 {
1403         struct inet_request_sock *ireq;
1404         struct inet_sock *newinet;
1405         struct tcp_sock *newtp;
1406         struct sock *newsk;
1407 #ifdef CONFIG_TCP_MD5SIG
1408         struct tcp_md5sig_key *key;
1409 #endif
1410
1411         if (sk_acceptq_is_full(sk))
1412                 goto exit_overflow;
1413
1414         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1415                 goto exit;
1416
1417         newsk = tcp_create_openreq_child(sk, req, skb);
1418         if (!newsk)
1419                 goto exit;
1420
1421         newsk->sk_gso_type = SKB_GSO_TCPV4;
1422         sk_setup_caps(newsk, dst);
1423
1424         newtp                 = tcp_sk(newsk);
1425         newinet               = inet_sk(newsk);
1426         ireq                  = inet_rsk(req);
1427         newinet->daddr        = ireq->rmt_addr;
1428         newinet->rcv_saddr    = ireq->loc_addr;
1429         newinet->saddr        = ireq->loc_addr;
1430         newinet->opt          = ireq->opt;
1431         ireq->opt             = NULL;
1432         newinet->mc_index     = inet_iif(skb);
1433         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1434         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1435         if (newinet->opt)
1436                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1437         newinet->id = newtp->write_seq ^ jiffies;
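        /*
         * Note (roughly): the xor above just seeds the per-connection IP ID
         * counter with a cheap, hard-to-predict starting value; it is not
         * meant to be cryptographically strong.
         */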
1438
1439         tcp_mtup_init(newsk);
1440         tcp_sync_mss(newsk, dst_mtu(dst));
1441         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1442         tcp_initialize_rcv_mss(newsk);
1443
1444 #ifdef CONFIG_TCP_MD5SIG
1445         /* Copy over the MD5 key from the original socket */
1446         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1447                 /*
1448                  * We're using one, so create a matching key
1449                  * on the newsk structure. If we fail to get
1450                  * memory, then we end up not copying the key
1451                  * across. Shucks.
1452                  */
1453                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1454                 if (newkey != NULL)
1455                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1456                                           newkey, key->keylen);
1457         }
1458 #endif
1459
1460         __inet_hash_nolisten(newsk);
1461         __inet_inherit_port(sk, newsk);
1462
1463         return newsk;
1464
1465 exit_overflow:
1466         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1467 exit:
1468         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1469         dst_release(dst);
1470         return NULL;
1471 }
1472
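/*
 * Demultiplexing a segment that arrived on a listening socket, roughly:
 * first look for a pending open request matching the 4-tuple, then for
 * an established (or TIME_WAIT) socket created while we were not
 * looking, and finally, when CONFIG_SYN_COOKIES is set, try to validate
 * a bare ACK as a returning cookie.
 */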
1473 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1474 {
1475         struct tcphdr *th = tcp_hdr(skb);
1476         const struct iphdr *iph = ip_hdr(skb);
1477         struct sock *nsk;
1478         struct request_sock **prev;
1479         /* Find possible connection requests. */
1480         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1481                                                        iph->saddr, iph->daddr);
1482         if (req)
1483                 return tcp_check_req(sk, skb, req, prev);
1484
1485         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1486                         th->source, iph->daddr, th->dest, inet_iif(skb));
1487
1488         if (nsk) {
1489                 if (nsk->sk_state != TCP_TIME_WAIT) {
1490                         bh_lock_sock(nsk);
1491                         return nsk;
1492                 }
1493                 inet_twsk_put(inet_twsk(nsk));
1494                 return NULL;
1495         }
1496
1497 #ifdef CONFIG_SYN_COOKIES
1498         if (!th->rst && !th->syn && th->ack)
1499                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1500 #endif
1501         return sk;
1502 }
1503
1504 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1505 {
1506         const struct iphdr *iph = ip_hdr(skb);
1507
1508         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1509                 if (!tcp_v4_check(skb->len, iph->saddr,
1510                                   iph->daddr, skb->csum)) {
1511                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1512                         return 0;
1513                 }
1514         }
1515
1516         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1517                                        skb->len, IPPROTO_TCP, 0);
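        /*
         * For reference, the pseudo-header folded in above looks roughly
         * like this (an illustrative sketch, not a struct used here):
         *
         *      struct pseudo_hdr {
         *              __be32 saddr;           - iph->saddr
         *              __be32 daddr;           - iph->daddr
         *              __u8   zero;            - always 0
         *              __u8   protocol;        - IPPROTO_TCP (6)
         *              __be16 length;          - TCP header + payload
         *      };
         *
         * Folding the TCP segment itself is deferred: short packets are
         * checksummed immediately below, longer ones during the later
         * copy-and-checksum pass to user space.
         */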
1518
        if (skb->len <= 76)
                return __skb_checksum_complete(skb);
1522         return 0;
1523 }
1524
1525
/* The socket must have its spinlock held when we get
1527  * here.
1528  *
1529  * We have a potential double-lock case here, so even when
1530  * doing backlog processing we use the BH locking scheme.
1531  * This is because we cannot sleep with the original spinlock
1532  * held.
1533  */
1534 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1535 {
1536         struct sock *rsk;
1537 #ifdef CONFIG_TCP_MD5SIG
1538         /*
1539          * We really want to reject the packet as early as possible
1540          * if:
         *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1542          *  o There is an MD5 option and we're not expecting one
1543          */
1544         if (tcp_v4_inbound_md5_hash(sk, skb))
1545                 goto discard;
1546 #endif
1547
1548         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1549                 TCP_CHECK_TIMER(sk);
1550                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1551                         rsk = sk;
1552                         goto reset;
1553                 }
1554                 TCP_CHECK_TIMER(sk);
1555                 return 0;
1556         }
1557
1558         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1559                 goto csum_err;
1560
1561         if (sk->sk_state == TCP_LISTEN) {
1562                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1563                 if (!nsk)
1564                         goto discard;
1565
1566                 if (nsk != sk) {
1567                         if (tcp_child_process(sk, nsk, skb)) {
1568                                 rsk = nsk;
1569                                 goto reset;
1570                         }
1571                         return 0;
1572                 }
1573         }
1574
1575         TCP_CHECK_TIMER(sk);
1576         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1577                 rsk = sk;
1578                 goto reset;
1579         }
1580         TCP_CHECK_TIMER(sk);
1581         return 0;
1582
1583 reset:
1584         tcp_v4_send_reset(rsk, skb);
1585 discard:
1586         kfree_skb(skb);
1587         /* Be careful here. If this function gets more complicated and
1588          * gcc suffers from register pressure on the x86, sk (in %ebx)
1589          * might be destroyed here. This current version compiles correctly,
1590          * but you have been warned.
1591          */
1592         return 0;
1593
1594 csum_err:
1595         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1596         goto discard;
1597 }
1598
1599 /*
1600  *      From tcp_input.c
1601  */
1602
1603 int tcp_v4_rcv(struct sk_buff *skb)
1604 {
1605         const struct iphdr *iph;
1606         struct tcphdr *th;
1607         struct sock *sk;
1608         int ret;
1609
1610         if (skb->pkt_type != PACKET_HOST)
1611                 goto discard_it;
1612
1613         /* Count it even if it's bad */
1614         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1615
1616         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1617                 goto discard_it;
1618
1619         th = tcp_hdr(skb);
1620
1621         if (th->doff < sizeof(struct tcphdr) / 4)
1622                 goto bad_packet;
1623         if (!pskb_may_pull(skb, th->doff * 4))
1624                 goto discard_it;
1625
        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided the case of th->doff == 0 was eliminated above.
         * So, we defer the checks. */
1630         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1631                 goto bad_packet;
1632
1633         th = tcp_hdr(skb);
1634         iph = ip_hdr(skb);
1635         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1636         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1637                                     skb->len - th->doff * 4);
1638         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1639         TCP_SKB_CB(skb)->when    = 0;
1640         TCP_SKB_CB(skb)->flags   = iph->tos;
1641         TCP_SKB_CB(skb)->sacked  = 0;
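        /*
         * SYN and FIN each occupy one unit of sequence space, which is why
         * end_seq above adds th->syn + th->fin to the payload length: a
         * bare SYN carrying no data, for example, ends at seq + 1.
         */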
1642
1643         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1644                         th->source, iph->daddr, th->dest, inet_iif(skb));
1645         if (!sk)
1646                 goto no_tcp_socket;
1647
1648 process:
1649         if (sk->sk_state == TCP_TIME_WAIT)
1650                 goto do_time_wait;
1651
1652         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1653                 goto discard_and_relse;
1654         nf_reset(skb);
1655
1656         if (sk_filter(sk, skb))
1657                 goto discard_and_relse;
1658
1659         skb->dev = NULL;
1660
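        /*
         * Roughly three delivery paths from here:
         *  1. socket not owned by a user: process directly via
         *     tcp_v4_do_rcv(), possibly through a DMA copy channel when
         *     CONFIG_NET_DMA is enabled;
         *  2. socket not owned, but a reader is sleeping in recvmsg:
         *     tcp_prequeue() hands the skb to that task to process in
         *     process context;
         *  3. socket owned by a user: sk_add_backlog(), drained through
         *     tcp_v4_do_rcv() when the owner releases the lock.
         */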
1661         bh_lock_sock_nested(sk);
1662         ret = 0;
1663         if (!sock_owned_by_user(sk)) {
1664 #ifdef CONFIG_NET_DMA
1665                 struct tcp_sock *tp = tcp_sk(sk);
1666                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1667                         tp->ucopy.dma_chan = get_softnet_dma();
1668                 if (tp->ucopy.dma_chan)
1669                         ret = tcp_v4_do_rcv(sk, skb);
1670                 else
1671 #endif
                {
                        if (!tcp_prequeue(sk, skb))
                                ret = tcp_v4_do_rcv(sk, skb);
                }
1676         } else
1677                 sk_add_backlog(sk, skb);
1678         bh_unlock_sock(sk);
1679
1680         sock_put(sk);
1681
1682         return ret;
1683
1684 no_tcp_socket:
1685         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1686                 goto discard_it;
1687
1688         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1689 bad_packet:
1690                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1691         } else {
1692                 tcp_v4_send_reset(NULL, skb);
1693         }
1694
1695 discard_it:
1696         /* Discard frame. */
1697         kfree_skb(skb);
1698         return 0;
1699
1700 discard_and_relse:
1701         sock_put(sk);
1702         goto discard_it;
1703
1704 do_time_wait:
1705         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1706                 inet_twsk_put(inet_twsk(sk));
1707                 goto discard_it;
1708         }
1709
1710         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1711                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1712                 inet_twsk_put(inet_twsk(sk));
1713                 goto discard_it;
1714         }
1715         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1716         case TCP_TW_SYN: {
1717                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1718                                                         &tcp_hashinfo,
1719                                                         iph->daddr, th->dest,
1720                                                         inet_iif(skb));
1721                 if (sk2) {
1722                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1723                         inet_twsk_put(inet_twsk(sk));
1724                         sk = sk2;
1725                         goto process;
1726                 }
1727                 /* Fall through to ACK */
1728         }
1729         case TCP_TW_ACK:
1730                 tcp_v4_timewait_ack(sk, skb);
1731                 break;
1732         case TCP_TW_RST:
1733                 goto no_tcp_socket;
1734         case TCP_TW_SUCCESS:;
1735         }
1736         goto discard_it;
1737 }
1738
1739 /* VJ's idea. Save last timestamp seen from this destination
 * and hold it for at least the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections before they
 * enter the synchronized state.
1743  */
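/*
 * The (s32)(a - b) comparisons below are serial-number arithmetic over
 * the 32-bit timestamp space, so they stay correct across wraparound.
 * A rough worked example (illustrative values): with a == 1 and
 * b == 0xffffffff, a - b == 2 as a u32, so (s32)(a - b) > 0 and a is
 * still considered newer than b.
 */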
1744
1745 int tcp_v4_remember_stamp(struct sock *sk)
1746 {
1747         struct inet_sock *inet = inet_sk(sk);
1748         struct tcp_sock *tp = tcp_sk(sk);
1749         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1750         struct inet_peer *peer = NULL;
1751         int release_it = 0;
1752
1753         if (!rt || rt->rt_dst != inet->daddr) {
1754                 peer = inet_getpeer(inet->daddr, 1);
1755                 release_it = 1;
1756         } else {
1757                 if (!rt->peer)
1758                         rt_bind_peer(rt, 1);
1759                 peer = rt->peer;
1760         }
1761
1762         if (peer) {
1763                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1764                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1765                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1766                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1767                         peer->tcp_ts = tp->rx_opt.ts_recent;
1768                 }
1769                 if (release_it)
1770                         inet_putpeer(peer);
1771                 return 1;
1772         }
1773
1774         return 0;
1775 }
1776
1777 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1778 {
1779         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1780
1781         if (peer) {
1782                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1783
1784                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1785                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1786                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1787                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1788                         peer->tcp_ts       = tcptw->tw_ts_recent;
1789                 }
1790                 inet_putpeer(peer);
1791                 return 1;
1792         }
1793
1794         return 0;
1795 }
1796
1797 struct inet_connection_sock_af_ops ipv4_specific = {
1798         .queue_xmit        = ip_queue_xmit,
1799         .send_check        = tcp_v4_send_check,
1800         .rebuild_header    = inet_sk_rebuild_header,
1801         .conn_request      = tcp_v4_conn_request,
1802         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1803         .remember_stamp    = tcp_v4_remember_stamp,
1804         .net_header_len    = sizeof(struct iphdr),
1805         .setsockopt        = ip_setsockopt,
1806         .getsockopt        = ip_getsockopt,
1807         .addr2sockaddr     = inet_csk_addr2sockaddr,
1808         .sockaddr_len      = sizeof(struct sockaddr_in),
1809         .bind_conflict     = inet_csk_bind_conflict,
1810 #ifdef CONFIG_COMPAT
1811         .compat_setsockopt = compat_ip_setsockopt,
1812         .compat_getsockopt = compat_ip_getsockopt,
1813 #endif
1814 };
1815
1816 #ifdef CONFIG_TCP_MD5SIG
1817 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1818         .md5_lookup             = tcp_v4_md5_lookup,
1819         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1820         .md5_add                = tcp_v4_md5_add_func,
1821         .md5_parse              = tcp_v4_parse_md5_keys,
1822 };
1823 #endif
1824
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
1827  */
1828 static int tcp_v4_init_sock(struct sock *sk)
1829 {
1830         struct inet_connection_sock *icsk = inet_csk(sk);
1831         struct tcp_sock *tp = tcp_sk(sk);
1832
1833         skb_queue_head_init(&tp->out_of_order_queue);
1834         tcp_init_xmit_timers(sk);
1835         tcp_prequeue_init(tp);
1836
1837         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1838         tp->mdev = TCP_TIMEOUT_INIT;
1839
1840         /* So many TCP implementations out there (incorrectly) count the
1841          * initial SYN frame in their delayed-ACK and congestion control
1842          * algorithms that we must have the following bandaid to talk
1843          * efficiently to them.  -DaveM
1844          */
1845         tp->snd_cwnd = 2;
1846
1847         /* See draft-stevens-tcpca-spec-01 for discussion of the
1848          * initialization of these values.
1849          */
1850         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1851         tp->snd_cwnd_clamp = ~0;
1852         tp->mss_cache = 536;
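        /* 536 == 576 - 40: the classic default MSS (RFC 879), i.e. the
         * minimum IPv4 reassembly buffer minus IP and TCP header bytes. */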
1853
1854         tp->reordering = sysctl_tcp_reordering;
1855         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1856
1857         sk->sk_state = TCP_CLOSE;
1858
1859         sk->sk_write_space = sk_stream_write_space;
1860         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1861
1862         icsk->icsk_af_ops = &ipv4_specific;
1863         icsk->icsk_sync_mss = tcp_sync_mss;
1864 #ifdef CONFIG_TCP_MD5SIG
1865         tp->af_specific = &tcp_sock_ipv4_specific;
1866 #endif
1867
1868         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1869         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1870
1871         atomic_inc(&tcp_sockets_allocated);
1872
1873         return 0;
1874 }
1875
1876 int tcp_v4_destroy_sock(struct sock *sk)
1877 {
1878         struct tcp_sock *tp = tcp_sk(sk);
1879
1880         tcp_clear_xmit_timers(sk);
1881
1882         tcp_cleanup_congestion_control(sk);
1883
        /* Clean up the write buffer. */
1885         tcp_write_queue_purge(sk);
1886
1887         /* Cleans up our, hopefully empty, out_of_order_queue. */
1888         __skb_queue_purge(&tp->out_of_order_queue);
1889
1890 #ifdef CONFIG_TCP_MD5SIG
1891         /* Clean up the MD5 key list, if any */
1892         if (tp->md5sig_info) {
1893                 tcp_v4_clear_md5_list(sk);
1894                 kfree(tp->md5sig_info);
1895                 tp->md5sig_info = NULL;
1896         }
1897 #endif
1898
1899 #ifdef CONFIG_NET_DMA
1900         /* Cleans up our sk_async_wait_queue */
1901         __skb_queue_purge(&sk->sk_async_wait_queue);
1902 #endif
1903
        /* Clean up the prequeue; it really should be empty already. */
1905         __skb_queue_purge(&tp->ucopy.prequeue);
1906
1907         /* Clean up a referenced TCP bind bucket. */
1908         if (inet_csk(sk)->icsk_bind_hash)
1909                 inet_put_port(sk);
1910
1911         /*
1912          * If sendmsg cached page exists, toss it.
1913          */
1914         if (sk->sk_sndmsg_page) {
1915                 __free_page(sk->sk_sndmsg_page);
1916                 sk->sk_sndmsg_page = NULL;
1917         }
1918
1919         if (tp->defer_tcp_accept.request) {
1920                 reqsk_free(tp->defer_tcp_accept.request);
1921                 sock_put(tp->defer_tcp_accept.listen_sk);
1922                 sock_put(sk);
1923                 tp->defer_tcp_accept.listen_sk = NULL;
1924                 tp->defer_tcp_accept.request = NULL;
1925         }
1926
1927         atomic_dec(&tcp_sockets_allocated);
1928
1929         return 0;
1930 }
1931
1932 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1933
1934 #ifdef CONFIG_PROC_FS
1935 /* Proc filesystem TCP sock list dumping. */
1936
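/*
 * Note on the helpers below: list_entry() is container_of() under the
 * hood, so using it on an hlist_node is harmless here, though
 * hlist_entry() would express the intent more directly.
 */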
1937 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1938 {
1939         return hlist_empty(head) ? NULL :
1940                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1941 }
1942
1943 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1944 {
1945         return tw->tw_node.next ?
1946                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1947 }
1948
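/*
 * Iteration sketch for the listening side, roughly: walk each
 * listening_hash bucket; for every listening socket of the right family,
 * also walk its per-listener syn_table of pending open requests.
 * st->state records whether we are on a listener
 * (TCP_SEQ_STATE_LISTENING) or inside its request table
 * (TCP_SEQ_STATE_OPENREQ), st->syn_wait_sk remembers the listener, and
 * that listener's syn_wait_lock is read-held while we are inside.
 */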
1949 static void *listening_get_next(struct seq_file *seq, void *cur)
1950 {
1951         struct inet_connection_sock *icsk;
1952         struct hlist_node *node;
1953         struct sock *sk = cur;
        struct tcp_iter_state *st = seq->private;
1955         struct net *net = seq_file_net(seq);
1956
1957         if (!sk) {
1958                 st->bucket = 0;
1959                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1960                 goto get_sk;
1961         }
1962
1963         ++st->num;
1964
1965         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1966                 struct request_sock *req = cur;
1967
1968                 icsk = inet_csk(st->syn_wait_sk);
1969                 req = req->dl_next;
1970                 while (1) {
1971                         while (req) {
1972                                 if (req->rsk_ops->family == st->family &&
1973                                     net_eq(sock_net(req->sk), net)) {
1974                                         cur = req;
1975                                         goto out;
1976                                 }
1977                                 req = req->dl_next;
1978                         }
1979                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1980                                 break;
1981 get_req:
1982                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1983                 }
1984                 sk        = sk_next(st->syn_wait_sk);
1985                 st->state = TCP_SEQ_STATE_LISTENING;
1986                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1987         } else {
1988                 icsk = inet_csk(sk);
1989                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1990                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1991                         goto start_req;
1992                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1993                 sk = sk_next(sk);
1994         }
1995 get_sk:
1996         sk_for_each_from(sk, node) {
1997                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1998                         cur = sk;
1999                         goto out;
2000                 }
2001                 icsk = inet_csk(sk);
2002                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2003                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2004 start_req:
2005                         st->uid         = sock_i_uid(sk);
2006                         st->syn_wait_sk = sk;
2007                         st->state       = TCP_SEQ_STATE_OPENREQ;
2008                         st->sbucket     = 0;
2009                         goto get_req;
2010                 }
2011                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012         }
2013         if (++st->bucket < INET_LHTABLE_SIZE) {
2014                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2015                 goto get_sk;
2016         }
2017         cur = NULL;
2018 out:
2019         return cur;
2020 }
2021
2022 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2023 {
2024         void *rc = listening_get_next(seq, NULL);
2025
2026         while (rc && *pos) {
2027                 rc = listening_get_next(seq, rc);
2028                 --*pos;
2029         }
2030         return rc;
2031 }
2032
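/*
 * The established side: each ehash bucket carries two chains, one for
 * established sockets (chain) and one for TIME_WAIT sockets (twchain).
 * Both are scanned under the same per-bucket lock, established entries
 * first, then the timewait entries.
 */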
2033 static void *established_get_first(struct seq_file *seq)
2034 {
        struct tcp_iter_state *st = seq->private;
2036         struct net *net = seq_file_net(seq);
2037         void *rc = NULL;
2038
2039         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2040                 struct sock *sk;
2041                 struct hlist_node *node;
2042                 struct inet_timewait_sock *tw;
2043                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2044
2045                 read_lock_bh(lock);
2046                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2047                         if (sk->sk_family != st->family ||
2048                             !net_eq(sock_net(sk), net)) {
2049                                 continue;
2050                         }
2051                         rc = sk;
2052                         goto out;
2053                 }
2054                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2055                 inet_twsk_for_each(tw, node,
2056                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2057                         if (tw->tw_family != st->family ||
2058                             !net_eq(twsk_net(tw), net)) {
2059                                 continue;
2060                         }
2061                         rc = tw;
2062                         goto out;
2063                 }
2064                 read_unlock_bh(lock);
2065                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2066         }
2067 out:
2068         return rc;
2069 }
2070
2071 static void *established_get_next(struct seq_file *seq, void *cur)
2072 {
2073         struct sock *sk = cur;
2074         struct inet_timewait_sock *tw;
2075         struct hlist_node *node;
        struct tcp_iter_state *st = seq->private;
2077         struct net *net = seq_file_net(seq);
2078
2079         ++st->num;
2080
2081         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2082                 tw = cur;
2083                 tw = tw_next(tw);
2084 get_tw:
2085                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2086                         tw = tw_next(tw);
2087                 }
2088                 if (tw) {
2089                         cur = tw;
2090                         goto out;
2091                 }
2092                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2093                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2094
2095                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2096                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2097                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2098                 } else {
2099                         cur = NULL;
2100                         goto out;
2101                 }
2102         } else
2103                 sk = sk_next(sk);
2104
2105         sk_for_each_from(sk, node) {
2106                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2107                         goto found;
2108         }
2109
2110         st->state = TCP_SEQ_STATE_TIME_WAIT;
2111         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2112         goto get_tw;
2113 found:
2114         cur = sk;
2115 out:
2116         return cur;
2117 }
2118
2119 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2120 {
2121         void *rc = established_get_first(seq);
2122
2123         while (rc && pos) {
2124                 rc = established_get_next(seq, rc);
2125                 --pos;
2126         }
2127         return rc;
2128 }
2129
2130 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2131 {
2132         void *rc;
        struct tcp_iter_state *st = seq->private;
2134
2135         inet_listen_lock(&tcp_hashinfo);
2136         st->state = TCP_SEQ_STATE_LISTENING;
2137         rc        = listening_get_idx(seq, &pos);
2138
2139         if (!rc) {
2140                 inet_listen_unlock(&tcp_hashinfo);
2141                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2142                 rc        = established_get_idx(seq, pos);
2143         }
2144
2145         return rc;
2146 }
2147
2148 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2149 {
        struct tcp_iter_state *st = seq->private;
2151         st->state = TCP_SEQ_STATE_LISTENING;
2152         st->num = 0;
2153         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2154 }
2155
2156 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2157 {
2158         void *rc = NULL;
        struct tcp_iter_state *st;
2160
2161         if (v == SEQ_START_TOKEN) {
2162                 rc = tcp_get_idx(seq, 0);
2163                 goto out;
2164         }
2165         st = seq->private;
2166
2167         switch (st->state) {
2168         case TCP_SEQ_STATE_OPENREQ:
2169         case TCP_SEQ_STATE_LISTENING:
2170                 rc = listening_get_next(seq, v);
2171                 if (!rc) {
2172                         inet_listen_unlock(&tcp_hashinfo);
2173                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2174                         rc        = established_get_first(seq);
2175                 }
2176                 break;
2177         case TCP_SEQ_STATE_ESTABLISHED:
2178         case TCP_SEQ_STATE_TIME_WAIT:
2179                 rc = established_get_next(seq, v);
2180                 break;
2181         }
2182 out:
2183         ++*pos;
2184         return rc;
2185 }
2186
2187 static void tcp_seq_stop(struct seq_file *seq, void *v)
2188 {
        struct tcp_iter_state *st = seq->private;
2190
2191         switch (st->state) {
2192         case TCP_SEQ_STATE_OPENREQ:
2193                 if (v) {
2194                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2195                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2196                 }
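                /* fall through: the listening hash lock is dropped below */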
2197         case TCP_SEQ_STATE_LISTENING:
2198                 if (v != SEQ_START_TOKEN)
2199                         inet_listen_unlock(&tcp_hashinfo);
2200                 break;
2201         case TCP_SEQ_STATE_TIME_WAIT:
2202         case TCP_SEQ_STATE_ESTABLISHED:
2203                 if (v)
2204                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2205                 break;
2206         }
2207 }
2208
2209 static int tcp_seq_open(struct inode *inode, struct file *file)
2210 {
2211         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2212         struct tcp_iter_state *s;
2213         int err;
2214
2215         err = seq_open_net(inode, file, &afinfo->seq_ops,
2216                           sizeof(struct tcp_iter_state));
2217         if (err < 0)
2218                 return err;
2219
2220         s = ((struct seq_file *)file->private_data)->private;
2221         s->family               = afinfo->family;
2222         return 0;
2223 }
2224
2225 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2226 {
2227         int rc = 0;
2228         struct proc_dir_entry *p;
2229
2230         afinfo->seq_fops.open           = tcp_seq_open;
2231         afinfo->seq_fops.read           = seq_read;
2232         afinfo->seq_fops.llseek         = seq_lseek;
2233         afinfo->seq_fops.release        = seq_release_net;
2234
2235         afinfo->seq_ops.start           = tcp_seq_start;
2236         afinfo->seq_ops.next            = tcp_seq_next;
2237         afinfo->seq_ops.stop            = tcp_seq_stop;
2238
2239         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2240                              &afinfo->seq_fops, afinfo);
2241         if (!p)
2242                 rc = -ENOMEM;
2243         return rc;
2244 }
2245
2246 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2247 {
2248         proc_net_remove(net, afinfo->name);
2249 }
2250
2251 static void get_openreq4(struct sock *sk, struct request_sock *req,
2252                          struct seq_file *f, int i, int uid, int *len)
2253 {
2254         const struct inet_request_sock *ireq = inet_rsk(req);
2255         int ttd = req->expires - jiffies;
2256
2257         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2258                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2259                 i,
2260                 ireq->loc_addr,
2261                 ntohs(inet_sk(sk)->sport),
2262                 ireq->rmt_addr,
2263                 ntohs(ireq->rmt_port),
2264                 TCP_SYN_RECV,
2265                 0, 0, /* could print option size, but that is af dependent. */
2266                 1,    /* timers active (only the expire timer) */
2267                 jiffies_to_clock_t(ttd),
2268                 req->retrans,
2269                 uid,
2270                 0,  /* non standard timer */
2271                 0, /* open_requests have no inode */
2272                 atomic_read(&sk->sk_refcnt),
2273                 req,
2274                 len);
2275 }
2276
2277 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2278 {
2279         int timer_active;
2280         unsigned long timer_expires;
2281         struct tcp_sock *tp = tcp_sk(sk);
2282         const struct inet_connection_sock *icsk = inet_csk(sk);
2283         struct inet_sock *inet = inet_sk(sk);
2284         __be32 dest = inet->daddr;
2285         __be32 src = inet->rcv_saddr;
2286         __u16 destp = ntohs(inet->dport);
2287         __u16 srcp = ntohs(inet->sport);
2288
2289         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2290                 timer_active    = 1;
2291                 timer_expires   = icsk->icsk_timeout;
2292         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2293                 timer_active    = 4;
2294                 timer_expires   = icsk->icsk_timeout;
2295         } else if (timer_pending(&sk->sk_timer)) {
2296                 timer_active    = 2;
2297                 timer_expires   = sk->sk_timer.expires;
2298         } else {
2299                 timer_active    = 0;
2300                 timer_expires = jiffies;
2301         }
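        /*
         * "tr" timer codes emitted here and by the other dumpers:
         * 0 none, 1 retransmit (get_openreq4() also reports 1 for the
         * open-request expire timer), 2 keepalive via sk_timer,
         * 3 TIME_WAIT (get_timewait4_sock()), 4 zero-window probe.
         */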
2302
2303         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2304                         "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2305                 i, src, srcp, dest, destp, sk->sk_state,
2306                 tp->write_seq - tp->snd_una,
2307                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2308                                              (tp->rcv_nxt - tp->copied_seq),
2309                 timer_active,
2310                 jiffies_to_clock_t(timer_expires - jiffies),
2311                 icsk->icsk_retransmits,
2312                 sock_i_uid(sk),
2313                 icsk->icsk_probes_out,
2314                 sock_i_ino(sk),
2315                 atomic_read(&sk->sk_refcnt), sk,
2316                 icsk->icsk_rto,
2317                 icsk->icsk_ack.ato,
2318                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2319                 tp->snd_cwnd,
2320                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2321                 len);
2322 }
2323
2324 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2325                                struct seq_file *f, int i, int *len)
2326 {
2327         __be32 dest, src;
2328         __u16 destp, srcp;
2329         int ttd = tw->tw_ttd - jiffies;
2330
2331         if (ttd < 0)
2332                 ttd = 0;
2333
2334         dest  = tw->tw_daddr;
2335         src   = tw->tw_rcv_saddr;
2336         destp = ntohs(tw->tw_dport);
2337         srcp  = ntohs(tw->tw_sport);
2338
2339         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2340                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2341                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2342                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2343                 atomic_read(&tw->tw_refcnt), tw, len);
2344 }
2345
2346 #define TMPSZ 150
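/* Each /proc/net/tcp row is blank-padded to TMPSZ - 1 characters by the
 * trailing seq_printf() in tcp4_seq_show(), so the file presents
 * fixed-width records (which some parsers expect). */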
2347
2348 static int tcp4_seq_show(struct seq_file *seq, void *v)
2349 {
        struct tcp_iter_state *st;
2351         int len;
2352
2353         if (v == SEQ_START_TOKEN) {
2354                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2355                            "  sl  local_address rem_address   st tx_queue "
2356                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2357                            "inode");
2358                 goto out;
2359         }
2360         st = seq->private;
2361
2362         switch (st->state) {
2363         case TCP_SEQ_STATE_LISTENING:
2364         case TCP_SEQ_STATE_ESTABLISHED:
2365                 get_tcp4_sock(v, seq, st->num, &len);
2366                 break;
2367         case TCP_SEQ_STATE_OPENREQ:
2368                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2369                 break;
2370         case TCP_SEQ_STATE_TIME_WAIT:
2371                 get_timewait4_sock(v, seq, st->num, &len);
2372                 break;
2373         }
2374         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2375 out:
2376         return 0;
2377 }
2378
2379 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2380         .name           = "tcp",
2381         .family         = AF_INET,
2382         .seq_fops       = {
2383                 .owner          = THIS_MODULE,
2384         },
2385         .seq_ops        = {
2386                 .show           = tcp4_seq_show,
2387         },
2388 };
2389
2390 static int tcp4_proc_init_net(struct net *net)
2391 {
2392         return tcp_proc_register(net, &tcp4_seq_afinfo);
2393 }
2394
2395 static void tcp4_proc_exit_net(struct net *net)
2396 {
2397         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2398 }
2399
2400 static struct pernet_operations tcp4_net_ops = {
2401         .init = tcp4_proc_init_net,
2402         .exit = tcp4_proc_exit_net,
2403 };
2404
2405 int __init tcp4_proc_init(void)
2406 {
2407         return register_pernet_subsys(&tcp4_net_ops);
2408 }
2409
2410 void tcp4_proc_exit(void)
2411 {
2412         unregister_pernet_subsys(&tcp4_net_ops);
2413 }
2414 #endif /* CONFIG_PROC_FS */
2415
2416 struct proto tcp_prot = {
2417         .name                   = "TCP",
2418         .owner                  = THIS_MODULE,
2419         .close                  = tcp_close,
2420         .connect                = tcp_v4_connect,
2421         .disconnect             = tcp_disconnect,
2422         .accept                 = inet_csk_accept,
2423         .ioctl                  = tcp_ioctl,
2424         .init                   = tcp_v4_init_sock,
2425         .destroy                = tcp_v4_destroy_sock,
2426         .shutdown               = tcp_shutdown,
2427         .setsockopt             = tcp_setsockopt,
2428         .getsockopt             = tcp_getsockopt,
2429         .recvmsg                = tcp_recvmsg,
2430         .backlog_rcv            = tcp_v4_do_rcv,
2431         .hash                   = inet_hash,
2432         .unhash                 = inet_unhash,
2433         .get_port               = inet_csk_get_port,
2434         .enter_memory_pressure  = tcp_enter_memory_pressure,
2435         .sockets_allocated      = &tcp_sockets_allocated,
2436         .orphan_count           = &tcp_orphan_count,
2437         .memory_allocated       = &tcp_memory_allocated,
2438         .memory_pressure        = &tcp_memory_pressure,
2439         .sysctl_mem             = sysctl_tcp_mem,
2440         .sysctl_wmem            = sysctl_tcp_wmem,
2441         .sysctl_rmem            = sysctl_tcp_rmem,
2442         .max_header             = MAX_TCP_HEADER,
2443         .obj_size               = sizeof(struct tcp_sock),
2444         .twsk_prot              = &tcp_timewait_sock_ops,
2445         .rsk_prot               = &tcp_request_sock_ops,
2446         .h.hashinfo             = &tcp_hashinfo,
2447 #ifdef CONFIG_COMPAT
2448         .compat_setsockopt      = compat_tcp_setsockopt,
2449         .compat_getsockopt      = compat_tcp_getsockopt,
2450 #endif
2451 };
2452
2453
2454 static int __net_init tcp_sk_init(struct net *net)
2455 {
2456         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2457                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2458 }
2459
2460 static void __net_exit tcp_sk_exit(struct net *net)
2461 {
2462         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2463 }
2464
2465 static struct pernet_operations __net_initdata tcp_sk_ops = {
        .init = tcp_sk_init,
        .exit = tcp_sk_exit,
2468 };
2469
2470 void __init tcp_v4_init(void)
2471 {
2472         if (register_pernet_device(&tcp_sk_ops))
2473                 panic("Failed to create the TCP control socket.\n");
2474 }
2475
2476 EXPORT_SYMBOL(ipv4_specific);
2477 EXPORT_SYMBOL(tcp_hashinfo);
2478 EXPORT_SYMBOL(tcp_prot);
2479 EXPORT_SYMBOL(tcp_v4_conn_request);
2480 EXPORT_SYMBOL(tcp_v4_connect);
2481 EXPORT_SYMBOL(tcp_v4_do_rcv);
2482 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2483 EXPORT_SYMBOL(tcp_v4_send_check);
2484 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2485
2486 #ifdef CONFIG_PROC_FS
2487 EXPORT_SYMBOL(tcp_proc_register);
2488 EXPORT_SYMBOL(tcp_proc_unregister);
2489 #endif
2490 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2491