/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller :	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket __read_mostly;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   unsigned int tcplen);
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint of data integrity.
	   Even without PAWS it is safe provided sequence spaces do not
	   overlap, i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache
	   is held not per host but per port pair, and the TW bucket is
	   used as the state holder.

	   If the TW bucket has already been destroyed we fall back to
	   VJ's scheme and use the initial timestamp retrieved from the
	   peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
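
/*
 * A worked example of the sequence bump in tcp_twsk_unique(), assuming
 * the old incarnation left off at tw_snd_nxt == 1000: the new connection
 * starts at 1000 + 65535 + 2 = 66537, i.e. beyond any 64K window the
 * peer may still hold open for the old incarnation, so stray old
 * segments cannot be mistaken for new data.
 */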

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering
		 * TIME-WAIT state, and initialize rx_opt.ts_recent
		 * from it, when trying a new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
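
/*
 * For reference, the kernel path above is entered from userspace via a
 * plain connect(2) on a TCP socket. A minimal userspace sketch (not
 * kernel code; 192.0.2.1:80 is a documentation address used as a
 * made-up example):
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;	/* anything else yields -EAFNOSUPPORT */
	dst.sin_port   = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	/* Drives the route lookup, port selection and SYN in tcp_v4_connect() */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
#endif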

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	/* We are not interested in TCP_LISTEN and open_requests
	 * (SYN-ACKs sent out by Linux are always < 576 bytes, so they
	 * should go through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry whether pmtu discovery is
	 * forbidden on this route. We just assume that no packet-too-big
	 * packets are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
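
/*
 * Whether this path runs at all is governed per socket by inet->pmtudisc,
 * set from userspace with the IP_MTU_DISCOVER socket option. A hedged
 * userspace sketch of opting a single socket out of PMTU discovery:
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>

static void disable_pmtu_discovery(int fd)
{
	int val = IP_PMTUDISC_DONT;	/* don't set DF; ignore ICMP FRAG_NEEDED */

	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
}
#endif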

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       Well, it can, e.g. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
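
/*
 * Layout of the skb handed to tcp_v4_err(): skb->data points at the IP
 * header of the *original* outgoing packet as echoed inside the ICMP
 * error. RFC 792 guarantees the IP header plus at least the first 8
 * bytes of the transport header, which is exactly enough for the port
 * and sequence-number checks above:
 *
 *	+--------------------+-----------------------------------+
 *	| original IP header | first 8 bytes of original TCP hdr |
 *	| (iph->ihl << 2)    | source, dest, seq                 |
 *	+--------------------+-----------------------------------+
 */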

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(len, inet->saddr,
					  inet->daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;
	return 0;
}
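
/*
 * Both helpers above fill in the standard TCP pseudo-header checksum.
 * As a sketch (not kernel API), the full checksum is the folded 16-bit
 * one's-complement sum over:
 *
 *	saddr + daddr + htons(IPPROTO_TCP) + htons(len) + TCP header + data
 *
 * In the CHECKSUM_PARTIAL case only the pseudo-header part is computed
 * here; the device (or the software fallback) completes the rest using
 * csum_start/csum_offset.
 */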

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
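
/*
 * The seq/ack choice above follows RFC 793 "Reset Generation": if the
 * offending segment carried an ACK, the RST is sent with SEQ = SEG.ACK
 * and no ACK bit; otherwise the RST acknowledges everything the segment
 * occupied, ACK = SEG.SEQ + SEG.LEN (with SYN and FIN each counting for
 * one), so the other end can match the RST against its connection.
 */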

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_key tw_key;
#endif

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * The SKB holds an incoming packet, but may not have a valid ->sk
	 * pointer. This is especially the case when we're dealing with a
	 * TIME_WAIT ack, because the sk structure is long gone, and only
	 * the tcp_timewait_sock remains. So the md5 key is stashed in that
	 * structure, and we use it in preference.  I believe that (twsk ||
	 * skb->sk) holds true, but we program defensively.
	 */
	if (!twsk && skb->sk) {
		key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
	} else if (twsk && twsk->tw_md5_keylen) {
		tw_key.key = twsk->tw_md5_key;
		tw_key.keylen = twsk->tw_md5_keylen;
		key = &tw_key;
	} else
		key = NULL;

	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (twsk)
		arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
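
/*
 * The rep.opt[] words built above encode the RFC 1323 timestamp option,
 * NOP-padded to a 4-byte boundary; on the wire the 12 bytes look like:
 *
 *	01 01 08 0a <TSval:4> <TSecr:4>
 *	(NOP, NOP, kind = 8, len = 10)
 */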

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(tcp_hdr(skb)->dest));
	}
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		}
		if (tcp_alloc_md5sig_pool() == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr	= addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key	= newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);
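
/*
 * On the wire, a key installed here makes every segment of the
 * connection carry the RFC 2385 option: kind 19 (TCPOPT_MD5SIG),
 * length 18 (TCPOLEN_MD5SIG), followed by the 16-byte MD5 digest,
 * NOP-padded to 20 bytes (TCPOLEN_MD5SIG_ALIGNED).
 */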

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i + 1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the set of key entries,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4  = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return -EINVAL;

		tp->md5sig_info = p;
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}
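
/*
 * tcp_v4_parse_md5_keys() is reached through setsockopt(TCP_MD5SIG).
 * A userspace sketch (the peer address and password are made-up
 * examples; a zero-length key deletes an existing entry):
 */
#if 0
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */
#include <sys/socket.h>

static int set_md5_key(int fd, const char *peer, const char *pass)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;	/* anything else fails with -EINVAL */
	inet_pton(AF_INET, peer, &sin->sin_addr);
	md5.tcpm_keylen = strlen(pass);	/* must be <= TCP_MD5SIG_MAXKEYLEN */
	memcpy(md5.tcpm_key, pass, md5.tcpm_keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif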

static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   unsigned int tcplen)
{
	struct scatterlist sg[4];
	__u16 data_len;
	int block = 0;
	__sum16 old_checksum;
	struct tcp_md5sig_pool *hp;
	struct tcp4_pseudohdr *bp;
	struct hash_desc *desc;
	int err;
	unsigned int nbytes = 0;

	/*
	 * Okay, so RFC2385 is turned on for this connection,
	 * so we need to generate the MD5 hash for the packet now.
	 */

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;

	bp = &hp->md5_blk.ip4;
	desc = &hp->md5_desc;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = protocol;
	bp->len = htons(tcplen);

	sg_init_table(sg, 4);

	sg_set_buf(&sg[block++], bp, sizeof(*bp));
	nbytes += sizeof(*bp);

	/* 2. the TCP header, excluding options, and assuming a
	 * checksum of zero.
	 */
	old_checksum = th->check;
	th->check = 0;
	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
	nbytes += sizeof(struct tcphdr);

	/* 3. the TCP segment data (if any) */
	data_len = tcplen - (th->doff << 2);
	if (data_len > 0) {
		unsigned char *data = (unsigned char *)th + (th->doff << 2);
		sg_set_buf(&sg[block++], data, data_len);
		nbytes += data_len;
	}

	/* 4. an independently-specified key or password, known to both
	 * TCPs and presumably connection-specific
	 */
	sg_set_buf(&sg[block++], key->key, key->keylen);
	nbytes += key->keylen;

	sg_mark_end(&sg[block - 1]);

	/* Now store the Hash into the packet */
	err = crypto_hash_init(desc);
	if (err)
		goto clear_hash;
	err = crypto_hash_update(desc, sg, nbytes);
	if (err)
		goto clear_hash;
	err = crypto_hash_final(desc, md5_hash);
	if (err)
		goto clear_hash;

	/* Reset header, and free up the crypto */
	tcp_put_md5sig_pool();
	th->check = old_checksum;

out:
	return 0;
clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	goto out;
}

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
			 struct sock *sk,
			 struct dst_entry *dst,
			 struct request_sock *req,
			 struct tcphdr *th, int protocol,
			 unsigned int tcplen)
{
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->saddr;
		daddr = inet_sk(sk)->daddr;
	} else {
		struct rtable *rt = (struct rtable *)dst;
		BUG_ON(!rt);
		saddr = rt->rt_src;
		daddr = rt->rt_dst;
	}
	return tcp_v4_do_calc_md5_hash(md5_hash, key,
				       saddr, daddr,
				       th, protocol, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives,
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int length = (th->doff << 2) - sizeof(struct tcphdr);
	int genhash;
	unsigned char *ptr;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);

	/*
	 * If the TCP option length is less than the TCP_MD5SIG
	 * option length, then we can shortcut
	 */
	if (length < TCPOLEN_MD5SIG) {
		if (hash_expected)
			return 1;
		else
			return 0;
	}

	/* Okay, we can't shortcut - we have to grub through the options */
	ptr = (unsigned char *)(th + 1);
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			goto done_opts;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2)
				goto done_opts;
			if (opsize > length)
				goto done_opts;

			if (opcode == TCPOPT_MD5SIG) {
				hash_location = ptr;
				goto done_opts;
			}
		}
		ptr += opsize - 2;
		length -= opsize;
	}
done_opts:
	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	if (!hash_expected && hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_do_calc_md5_hash(newhash,
					  hash_expected,
					  iph->saddr, iph->daddr,
					  th, sk->sk_protocol,
					  skb->len);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}
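
/*
 * The option walk above follows the standard TCP option encoding: each
 * option is either a lone byte (EOL = 0, NOP = 1) or a TLV of
 * <kind:1> <len:1> <data:len-2>. A segment signed per RFC 2385
 * typically carries, among its options:
 *
 *	13 12 <16-byte digest>		(kind 19, len 18)
 */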

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
	/* Never answer SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop this request. That is
	 * better than clogging the syn queue with openreqs with
	 * exponentially increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on a web
		 * server, which contains information interesting only
		 * for Windows users) do not send their stamp in SYN.
		 * It is the easy case. We simply do not advertise TS
		 * support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * TIME-WAIT state, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive. It means that we continue to
			 * communicate with destinations already
			 * remembered at the moment of the synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from %u.%u.%u.%u/%u\n",
				       NIPQUAD(saddr),
				       ntohs(tcp_hdr(skb)->source));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
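
/*
 * Note the want_cookie path above: the request_sock is freed right
 * after the SYN-ACK goes out, because a SYN cookie encodes the minimal
 * connection state (the MSS choice plus a keyed hash over the 4-tuple
 * and a coarse timestamp) in the ISN itself; the returning ACK is then
 * validated by cookie_v4_check() with no per-connection memory held.
 */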


/*
 * The three-way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
1404 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1405                                   struct request_sock *req,
1406                                   struct dst_entry *dst)
1407 {
1408         struct inet_request_sock *ireq;
1409         struct inet_sock *newinet;
1410         struct tcp_sock *newtp;
1411         struct sock *newsk;
1412 #ifdef CONFIG_TCP_MD5SIG
1413         struct tcp_md5sig_key *key;
1414 #endif
1415
1416         if (sk_acceptq_is_full(sk))
1417                 goto exit_overflow;
1418
1419         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1420                 goto exit;
1421
1422         newsk = tcp_create_openreq_child(sk, req, skb);
1423         if (!newsk)
1424                 goto exit;
1425
1426         newsk->sk_gso_type = SKB_GSO_TCPV4;
1427         sk_setup_caps(newsk, dst);
1428
1429         newtp                 = tcp_sk(newsk);
1430         newinet               = inet_sk(newsk);
1431         ireq                  = inet_rsk(req);
1432         newinet->daddr        = ireq->rmt_addr;
1433         newinet->rcv_saddr    = ireq->loc_addr;
1434         newinet->saddr        = ireq->loc_addr;
1435         newinet->opt          = ireq->opt;
1436         ireq->opt             = NULL;
1437         newinet->mc_index     = inet_iif(skb);
1438         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1439         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1440         if (newinet->opt)
1441                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1442         newinet->id = newtp->write_seq ^ jiffies;
1443
1444         tcp_mtup_init(newsk);
1445         tcp_sync_mss(newsk, dst_mtu(dst));
1446         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1447         tcp_initialize_rcv_mss(newsk);
1448
1449 #ifdef CONFIG_TCP_MD5SIG
1450         /* Copy over the MD5 key from the original socket */
1451         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1452                 /*
1453                  * We're using one, so create a matching key
1454                  * on the newsk structure. If we fail to get
1455                  * memory, then we end up not copying the key
1456                  * across. Shucks.
1457                  */
1458                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1459                 if (newkey != NULL)
1460                         tcp_v4_md5_do_add(newsk, newinet->daddr,
1461                                           newkey, key->keylen);
1462         }
1463 #endif
1464
1465         __inet_hash_nolisten(newsk);
1466         __inet_inherit_port(sk, newsk);
1467
1468         return newsk;
1469
1470 exit_overflow:
1471         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1472 exit:
1473         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1474         dst_release(dst);
1475         return NULL;
1476 }
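/*
 * The sk_acceptq_is_full() check above bounds how many fully established
 * children may wait for accept(); that bound comes from the listen()
 * backlog requested by userspace (clamped by net.core.somaxconn). A
 * minimal userspace sketch of the knob (illustration only, not kernel
 * code):
 */
#if 0
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        addr.sin_port = 0;                      /* any free port */
        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
            listen(fd, 16))                     /* accept queue bound of ~16 */
                return 1;
        puts("completed connections now queue until accept() drains them");
        close(fd);
        return 0;
}
#endif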
1477
1478 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1479 {
1480         struct tcphdr *th = tcp_hdr(skb);
1481         const struct iphdr *iph = ip_hdr(skb);
1482         struct sock *nsk;
1483         struct request_sock **prev;
1484         /* Find possible connection requests. */
1485         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1486                                                        iph->saddr, iph->daddr);
1487         if (req)
1488                 return tcp_check_req(sk, skb, req, prev);
1489
1490         nsk = inet_lookup_established(sk->sk_net, &tcp_hashinfo, iph->saddr,
1491                         th->source, iph->daddr, th->dest, inet_iif(skb));
1492
1493         if (nsk) {
1494                 if (nsk->sk_state != TCP_TIME_WAIT) {
1495                         bh_lock_sock(nsk);
1496                         return nsk;
1497                 }
1498                 inet_twsk_put(inet_twsk(nsk));
1499                 return NULL;
1500         }
1501
1502 #ifdef CONFIG_SYN_COOKIES
1503         if (!th->rst && !th->syn && th->ack)
1504                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1505 #endif
1506         return sk;
1507 }
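/*
 * When the lookup above finds neither a pending request nor an
 * established socket, a bare ACK may still carry a valid syncookie, so
 * it is handed to cookie_v4_check(). Whether cookies are generated at
 * all is governed by the tcp_syncookies sysctl; a userspace sketch that
 * reads it (illustration only, not kernel code):
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");
        int val;

        if (!f)
                return 1;
        if (fscanf(f, "%d", &val) == 1)
                printf("tcp_syncookies = %d\n", val);
        fclose(f);
        return 0;
}
#endif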
1508
1509 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1510 {
1511         const struct iphdr *iph = ip_hdr(skb);
1512
1513         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1514                 if (!tcp_v4_check(skb->len, iph->saddr,
1515                                   iph->daddr, skb->csum)) {
1516                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1517                         return 0;
1518                 }
1519         }
1520
1521         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1522                                        skb->len, IPPROTO_TCP, 0);
1523
1524         if (skb->len <= 76) {
1525                 return __skb_checksum_complete(skb);
1526         }
1527         return 0;
1528 }
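/*
 * tcp_v4_checksum_init() seeds skb->csum with the pseudo-header sum and
 * verifies short packets immediately; the arithmetic underneath is the
 * RFC 1071 ones'-complement checksum. A standalone sketch of that sum
 * (illustration only; rfc1071_sum() is a local helper, the kernel uses
 * optimized per-arch implementations):
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t rfc1071_sum(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len > 1) {                       /* sum 16-bit words */
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)                                /* odd trailing byte */
                sum += (uint32_t)p[0] << 8;
        while (sum >> 16)                       /* fold carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        const uint8_t hdr[] = { 0x45, 0x00, 0x00, 0x1c };

        printf("checksum = 0x%04x\n", rfc1071_sum(hdr, sizeof(hdr)));
        return 0;
}
#endif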
1529
1530
1531 /* The socket must have its spinlock held when we get
1532  * here.
1533  *
1534  * We have a potential double-lock case here, so even when
1535  * doing backlog processing we use the BH locking scheme.
1536  * This is because we cannot sleep with the original spinlock
1537  * held.
1538  */
1539 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1540 {
1541         struct sock *rsk;
1542 #ifdef CONFIG_TCP_MD5SIG
1543         /*
1544          * We really want to reject the packet as early as possible
1545          * if:
1546          *  o We're expecting an MD5-signed packet and there is no MD5 TCP option
1547          *  o There is an MD5 option and we're not expecting one
1548          */
1549         if (tcp_v4_inbound_md5_hash(sk, skb))
1550                 goto discard;
1551 #endif
1552
1553         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1554                 TCP_CHECK_TIMER(sk);
1555                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1556                         rsk = sk;
1557                         goto reset;
1558                 }
1559                 TCP_CHECK_TIMER(sk);
1560                 return 0;
1561         }
1562
1563         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1564                 goto csum_err;
1565
1566         if (sk->sk_state == TCP_LISTEN) {
1567                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1568                 if (!nsk)
1569                         goto discard;
1570
1571                 if (nsk != sk) {
1572                         if (tcp_child_process(sk, nsk, skb)) {
1573                                 rsk = nsk;
1574                                 goto reset;
1575                         }
1576                         return 0;
1577                 }
1578         }
1579
1580         TCP_CHECK_TIMER(sk);
1581         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1582                 rsk = sk;
1583                 goto reset;
1584         }
1585         TCP_CHECK_TIMER(sk);
1586         return 0;
1587
1588 reset:
1589         tcp_v4_send_reset(rsk, skb);
1590 discard:
1591         kfree_skb(skb);
1592         /* Be careful here. If this function gets more complicated and
1593          * gcc suffers from register pressure on the x86, sk (in %ebx)
1594          * might be destroyed here. This current version compiles correctly,
1595          * but you have been warned.
1596          */
1597         return 0;
1598
1599 csum_err:
1600         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1601         goto discard;
1602 }
1603
1604 /*
1605  *      From tcp_input.c
1606  */
1607
1608 int tcp_v4_rcv(struct sk_buff *skb)
1609 {
1610         const struct iphdr *iph;
1611         struct tcphdr *th;
1612         struct sock *sk;
1613         int ret;
1614
1615         if (skb->pkt_type != PACKET_HOST)
1616                 goto discard_it;
1617
1618         /* Count it even if it's bad */
1619         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1620
1621         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1622                 goto discard_it;
1623
1624         th = tcp_hdr(skb);
1625
1626         if (th->doff < sizeof(struct tcphdr) / 4)
1627                 goto bad_packet;
1628         if (!pskb_may_pull(skb, th->doff * 4))
1629                 goto discard_it;
1630
1631         /* An explanation is required here, I think.
1632          * Packet length and doff are validated by header prediction,
1633          * provided the case of th->doff == 0 has been eliminated above.
1634          * So, we defer those checks. */
1635         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1636                 goto bad_packet;
1637
1638         th = tcp_hdr(skb);
1639         iph = ip_hdr(skb);
1640         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1641         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1642                                     skb->len - th->doff * 4);
1643         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1644         TCP_SKB_CB(skb)->when    = 0;
1645         TCP_SKB_CB(skb)->flags   = iph->tos;
1646         TCP_SKB_CB(skb)->sacked  = 0;
1647
1648         sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr,
1649                         th->source, iph->daddr, th->dest, inet_iif(skb));
1650         if (!sk)
1651                 goto no_tcp_socket;
1652
1653 process:
1654         if (sk->sk_state == TCP_TIME_WAIT)
1655                 goto do_time_wait;
1656
1657         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1658                 goto discard_and_relse;
1659         nf_reset(skb);
1660
1661         if (sk_filter(sk, skb))
1662                 goto discard_and_relse;
1663
1664         skb->dev = NULL;
1665
1666         bh_lock_sock_nested(sk);
1667         ret = 0;
1668         if (!sock_owned_by_user(sk)) {
1669 #ifdef CONFIG_NET_DMA
1670                 struct tcp_sock *tp = tcp_sk(sk);
1671                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1672                         tp->ucopy.dma_chan = get_softnet_dma();
1673                 if (tp->ucopy.dma_chan)
1674                         ret = tcp_v4_do_rcv(sk, skb);
1675                 else
1676 #endif
1677                 {
1678                         if (!tcp_prequeue(sk, skb))
1679                                 ret = tcp_v4_do_rcv(sk, skb);
1680                 }
1681         } else
1682                 sk_add_backlog(sk, skb);
1683         bh_unlock_sock(sk);
1684
1685         sock_put(sk);
1686
1687         return ret;
1688
1689 no_tcp_socket:
1690         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1691                 goto discard_it;
1692
1693         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1694 bad_packet:
1695                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1696         } else {
1697                 tcp_v4_send_reset(NULL, skb);
1698         }
1699
1700 discard_it:
1701         /* Discard frame. */
1702         kfree_skb(skb);
1703         return 0;
1704
1705 discard_and_relse:
1706         sock_put(sk);
1707         goto discard_it;
1708
1709 do_time_wait:
1710         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1711                 inet_twsk_put(inet_twsk(sk));
1712                 goto discard_it;
1713         }
1714
1715         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1716                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1717                 inet_twsk_put(inet_twsk(sk));
1718                 goto discard_it;
1719         }
1720         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1721         case TCP_TW_SYN: {
1722                 struct sock *sk2 = inet_lookup_listener(skb->dev->nd_net,
1723                                                         &tcp_hashinfo,
1724                                                         iph->daddr, th->dest,
1725                                                         inet_iif(skb));
1726                 if (sk2) {
1727                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1728                         inet_twsk_put(inet_twsk(sk));
1729                         sk = sk2;
1730                         goto process;
1731                 }
1732                 /* Fall through to ACK */
1733         }
1734         case TCP_TW_ACK:
1735                 tcp_v4_timewait_ack(sk, skb);
1736                 break;
1737         case TCP_TW_RST:
1738                 goto no_tcp_socket;
1739         case TCP_TW_SUCCESS:;
1740         }
1741         goto discard_it;
1742 }
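/*
 * Note how end_seq is derived above: SYN and FIN each consume one
 * sequence number, so end_seq = seq + syn + fin + payload length. A
 * standalone sketch of that computation (illustration only;
 * tcp_end_seq() is a hypothetical helper):
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
                            uint32_t skb_len, uint32_t doff_words)
{
        /* payload length is total length minus the header (doff * 4) */
        return seq + syn + fin + (skb_len - doff_words * 4);
}

int main(void)
{
        /* A bare SYN: 20-byte header, no payload, consumes one seqno. */
        printf("%u\n", tcp_end_seq(1000u, 1, 0, 20, 5));        /* 1001 */
        return 0;
}
#endif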
1743
1744 /* VJ's idea. Save the last timestamp seen from this destination
1745  * and hold it for at least the normal timewait interval, to use for
1746  * duplicate segment detection in subsequent connections before they
1747  * enter the synchronized state.
1748  */
1749
1750 int tcp_v4_remember_stamp(struct sock *sk)
1751 {
1752         struct inet_sock *inet = inet_sk(sk);
1753         struct tcp_sock *tp = tcp_sk(sk);
1754         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1755         struct inet_peer *peer = NULL;
1756         int release_it = 0;
1757
1758         if (!rt || rt->rt_dst != inet->daddr) {
1759                 peer = inet_getpeer(inet->daddr, 1);
1760                 release_it = 1;
1761         } else {
1762                 if (!rt->peer)
1763                         rt_bind_peer(rt, 1);
1764                 peer = rt->peer;
1765         }
1766
1767         if (peer) {
1768                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1769                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1770                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1771                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1772                         peer->tcp_ts = tp->rx_opt.ts_recent;
1773                 }
1774                 if (release_it)
1775                         inet_putpeer(peer);
1776                 return 1;
1777         }
1778
1779         return 0;
1780 }
1781
1782 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1783 {
1784         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1785
1786         if (peer) {
1787                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1788
1789                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1790                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1791                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1792                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1793                         peer->tcp_ts       = tcptw->tw_ts_recent;
1794                 }
1795                 inet_putpeer(peer);
1796                 return 1;
1797         }
1798
1799         return 0;
1800 }
1801
1802 struct inet_connection_sock_af_ops ipv4_specific = {
1803         .queue_xmit        = ip_queue_xmit,
1804         .send_check        = tcp_v4_send_check,
1805         .rebuild_header    = inet_sk_rebuild_header,
1806         .conn_request      = tcp_v4_conn_request,
1807         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1808         .remember_stamp    = tcp_v4_remember_stamp,
1809         .net_header_len    = sizeof(struct iphdr),
1810         .setsockopt        = ip_setsockopt,
1811         .getsockopt        = ip_getsockopt,
1812         .addr2sockaddr     = inet_csk_addr2sockaddr,
1813         .sockaddr_len      = sizeof(struct sockaddr_in),
1814         .bind_conflict     = inet_csk_bind_conflict,
1815 #ifdef CONFIG_COMPAT
1816         .compat_setsockopt = compat_ip_setsockopt,
1817         .compat_getsockopt = compat_ip_getsockopt,
1818 #endif
1819 };
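/*
 * ipv4_specific is a function-pointer ops table: the AF-independent TCP
 * core calls through icsk->icsk_af_ops so the same code serves IPv4 and
 * IPv6. A standalone sketch of the dispatch pattern (illustration only;
 * struct af_ops and v4_send_check() are hypothetical names):
 */
#if 0
#include <stdio.h>

struct af_ops {
        void (*send_check)(const char *who);
        int net_header_len;
};

static void v4_send_check(const char *who)
{
        printf("%s: IPv4 checksum path\n", who);
}

static const struct af_ops ipv4_ops = {
        .send_check     = v4_send_check,
        .net_header_len = 20,           /* sizeof(struct iphdr) */
};

int main(void)
{
        const struct af_ops *ops = &ipv4_ops;   /* selected per family */

        ops->send_check("tcp core");
        printf("net header len = %d\n", ops->net_header_len);
        return 0;
}
#endif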
1820
1821 #ifdef CONFIG_TCP_MD5SIG
1822 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1823         .md5_lookup             = tcp_v4_md5_lookup,
1824         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1825         .md5_add                = tcp_v4_md5_add_func,
1826         .md5_parse              = tcp_v4_parse_md5_keys,
1827 };
1828 #endif
1829
1830 /* NOTE: Many fields are zeroed explicitly by the call to
1831  *       sk_alloc(), so they need not be initialized here.
1832  */
1833 static int tcp_v4_init_sock(struct sock *sk)
1834 {
1835         struct inet_connection_sock *icsk = inet_csk(sk);
1836         struct tcp_sock *tp = tcp_sk(sk);
1837
1838         skb_queue_head_init(&tp->out_of_order_queue);
1839         tcp_init_xmit_timers(sk);
1840         tcp_prequeue_init(tp);
1841
1842         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1843         tp->mdev = TCP_TIMEOUT_INIT;
1844
1845         /* So many TCP implementations out there (incorrectly) count the
1846          * initial SYN frame in their delayed-ACK and congestion control
1847          * algorithms that we must have the following bandaid to talk
1848          * efficiently to them.  -DaveM
1849          */
1850         tp->snd_cwnd = 2;
1851
1852         /* See draft-stevens-tcpca-spec-01 for discussion of the
1853          * initialization of these values.
1854          */
1855         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1856         tp->snd_cwnd_clamp = ~0;
1857         tp->mss_cache = 536;
1858
1859         tp->reordering = sysctl_tcp_reordering;
1860         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1861
1862         sk->sk_state = TCP_CLOSE;
1863
1864         sk->sk_write_space = sk_stream_write_space;
1865         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1866
1867         icsk->icsk_af_ops = &ipv4_specific;
1868         icsk->icsk_sync_mss = tcp_sync_mss;
1869 #ifdef CONFIG_TCP_MD5SIG
1870         tp->af_specific = &tcp_sock_ipv4_specific;
1871 #endif
1872
1873         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1874         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1875
1876         atomic_inc(&tcp_sockets_allocated);
1877
1878         return 0;
1879 }
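/*
 * tcp_v4_init_sock() seeds sk_sndbuf and sk_rcvbuf from
 * sysctl_tcp_wmem[1] and sysctl_tcp_rmem[1]. A userspace sketch that
 * observes the resulting defaults on a fresh socket (illustration only,
 * not kernel code):
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int snd = 0, rcv = 0;
        socklen_t len = sizeof(snd);

        if (fd < 0)
                return 1;
        getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len);
        len = sizeof(rcv);
        getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len);
        printf("default sndbuf=%d rcvbuf=%d\n", snd, rcv);
        close(fd);
        return 0;
}
#endif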
1880
1881 int tcp_v4_destroy_sock(struct sock *sk)
1882 {
1883         struct tcp_sock *tp = tcp_sk(sk);
1884
1885         tcp_clear_xmit_timers(sk);
1886
1887         tcp_cleanup_congestion_control(sk);
1888
1889         /* Clean up the write buffer. */
1890         tcp_write_queue_purge(sk);
1891
1892         /* Cleans up our, hopefully empty, out_of_order_queue. */
1893         __skb_queue_purge(&tp->out_of_order_queue);
1894
1895 #ifdef CONFIG_TCP_MD5SIG
1896         /* Clean up the MD5 key list, if any */
1897         if (tp->md5sig_info) {
1898                 tcp_v4_clear_md5_list(sk);
1899                 kfree(tp->md5sig_info);
1900                 tp->md5sig_info = NULL;
1901         }
1902 #endif
1903
1904 #ifdef CONFIG_NET_DMA
1905         /* Cleans up our sk_async_wait_queue */
1906         __skb_queue_purge(&sk->sk_async_wait_queue);
1907 #endif
1908
1909         /* Clean the prequeue; it really should be empty by now. */
1910         __skb_queue_purge(&tp->ucopy.prequeue);
1911
1912         /* Clean up a referenced TCP bind bucket. */
1913         if (inet_csk(sk)->icsk_bind_hash)
1914                 inet_put_port(sk);
1915
1916         /*
1917          * If a cached sendmsg page exists, free it.
1918          */
1919         if (sk->sk_sndmsg_page) {
1920                 __free_page(sk->sk_sndmsg_page);
1921                 sk->sk_sndmsg_page = NULL;
1922         }
1923
1924         atomic_dec(&tcp_sockets_allocated);
1925
1926         return 0;
1927 }
1928
1929 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1930
1931 #ifdef CONFIG_PROC_FS
1932 /* Proc filesystem TCP sock list dumping. */
1933
1934 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1935 {
1936         return hlist_empty(head) ? NULL :
1937                 hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1938 }
1939
1940 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1941 {
1942         return tw->tw_node.next ?
1943                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1944 }
1945
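/* Walk the listening hash table. A socket of the requested family is
 * returned first and, on the following call, its pending connection
 * requests are walked under syn_wait_lock (TCP_SEQ_STATE_OPENREQ).
 * Request socks are matched by rsk_ops->family, so queues of listeners
 * of another family are scanned for matching requests as well.
 */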
1946 static void *listening_get_next(struct seq_file *seq, void *cur)
1947 {
1948         struct inet_connection_sock *icsk;
1949         struct hlist_node *node;
1950         struct sock *sk = cur;
1951         struct tcp_iter_state *st = seq->private;
1952
1953         if (!sk) {
1954                 st->bucket = 0;
1955                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1956                 goto get_sk;
1957         }
1958
1959         ++st->num;
1960
1961         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1962                 struct request_sock *req = cur;
1963
1964                 icsk = inet_csk(st->syn_wait_sk);
1965                 req = req->dl_next;
1966                 while (1) {
1967                         while (req) {
1968                                 if (req->rsk_ops->family == st->family) {
1969                                         cur = req;
1970                                         goto out;
1971                                 }
1972                                 req = req->dl_next;
1973                         }
1974                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1975                                 break;
1976 get_req:
1977                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1978                 }
1979                 sk        = sk_next(st->syn_wait_sk);
1980                 st->state = TCP_SEQ_STATE_LISTENING;
1981                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1982         } else {
1983                 icsk = inet_csk(sk);
1984                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1985                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1986                         goto start_req;
1987                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1988                 sk = sk_next(sk);
1989         }
1990 get_sk:
1991         sk_for_each_from(sk, node) {
1992                 if (sk->sk_family == st->family) {
1993                         cur = sk;
1994                         goto out;
1995                 }
1996                 icsk = inet_csk(sk);
1997                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1998                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1999 start_req:
2000                         st->uid         = sock_i_uid(sk);
2001                         st->syn_wait_sk = sk;
2002                         st->state       = TCP_SEQ_STATE_OPENREQ;
2003                         st->sbucket     = 0;
2004                         goto get_req;
2005                 }
2006                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2007         }
2008         if (++st->bucket < INET_LHTABLE_SIZE) {
2009                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2010                 goto get_sk;
2011         }
2012         cur = NULL;
2013 out:
2014         return cur;
2015 }
2016
2017 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2018 {
2019         void *rc = listening_get_next(seq, NULL);
2020
2021         while (rc && *pos) {
2022                 rc = listening_get_next(seq, rc);
2023                 --*pos;
2024         }
2025         return rc;
2026 }
2027
2028 static void *established_get_first(struct seq_file *seq)
2029 {
2030         struct tcp_iter_state *st = seq->private;
2031         void *rc = NULL;
2032
2033         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2034                 struct sock *sk;
2035                 struct hlist_node *node;
2036                 struct inet_timewait_sock *tw;
2037                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2038
2039                 read_lock_bh(lock);
2040                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2041                         if (sk->sk_family != st->family) {
2042                                 continue;
2043                         }
2044                         rc = sk;
2045                         goto out;
2046                 }
2047                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2048                 inet_twsk_for_each(tw, node,
2049                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2050                         if (tw->tw_family != st->family) {
2051                                 continue;
2052                         }
2053                         rc = tw;
2054                         goto out;
2055                 }
2056                 read_unlock_bh(lock);
2057                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2058         }
2059 out:
2060         return rc;
2061 }
2062
2063 static void *established_get_next(struct seq_file *seq, void *cur)
2064 {
2065         struct sock *sk = cur;
2066         struct inet_timewait_sock *tw;
2067         struct hlist_node *node;
2068         struct tcp_iter_state *st = seq->private;
2069
2070         ++st->num;
2071
2072         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2073                 tw = cur;
2074                 tw = tw_next(tw);
2075 get_tw:
2076                 while (tw && tw->tw_family != st->family) {
2077                         tw = tw_next(tw);
2078                 }
2079                 if (tw) {
2080                         cur = tw;
2081                         goto out;
2082                 }
2083                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2084                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2085
2086                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2087                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2088                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2089                 } else {
2090                         cur = NULL;
2091                         goto out;
2092                 }
2093         } else
2094                 sk = sk_next(sk);
2095
2096         sk_for_each_from(sk, node) {
2097                 if (sk->sk_family == st->family)
2098                         goto found;
2099         }
2100
2101         st->state = TCP_SEQ_STATE_TIME_WAIT;
2102         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2103         goto get_tw;
2104 found:
2105         cur = sk;
2106 out:
2107         return cur;
2108 }
2109
2110 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2111 {
2112         void *rc = established_get_first(seq);
2113
2114         while (rc && pos) {
2115                 rc = established_get_next(seq, rc);
2116                 --pos;
2117         }
2118         return rc;
2119 }
2120
2121 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2122 {
2123         void *rc;
2124         struct tcp_iter_state *st = seq->private;
2125
2126         inet_listen_lock(&tcp_hashinfo);
2127         st->state = TCP_SEQ_STATE_LISTENING;
2128         rc        = listening_get_idx(seq, &pos);
2129
2130         if (!rc) {
2131                 inet_listen_unlock(&tcp_hashinfo);
2132                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2133                 rc        = established_get_idx(seq, pos);
2134         }
2135
2136         return rc;
2137 }
2138
2139 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2140 {
2141         struct tcp_iter_state *st = seq->private;
2142         st->state = TCP_SEQ_STATE_LISTENING;
2143         st->num = 0;
2144         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2145 }
2146
2147 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2148 {
2149         void *rc = NULL;
2150         struct tcp_iter_state *st;
2151
2152         if (v == SEQ_START_TOKEN) {
2153                 rc = tcp_get_idx(seq, 0);
2154                 goto out;
2155         }
2156         st = seq->private;
2157
2158         switch (st->state) {
2159         case TCP_SEQ_STATE_OPENREQ:
2160         case TCP_SEQ_STATE_LISTENING:
2161                 rc = listening_get_next(seq, v);
2162                 if (!rc) {
2163                         inet_listen_unlock(&tcp_hashinfo);
2164                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2165                         rc        = established_get_first(seq);
2166                 }
2167                 break;
2168         case TCP_SEQ_STATE_ESTABLISHED:
2169         case TCP_SEQ_STATE_TIME_WAIT:
2170                 rc = established_get_next(seq, v);
2171                 break;
2172         }
2173 out:
2174         ++*pos;
2175         return rc;
2176 }
2177
2178 static void tcp_seq_stop(struct seq_file *seq, void *v)
2179 {
2180         struct tcp_iter_state *st = seq->private;
2181
2182         switch (st->state) {
2183         case TCP_SEQ_STATE_OPENREQ:
2184                 if (v) {
2185                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2186                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2187                 }
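                /* fall through: OPENREQ must also release the listen lock */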
2188         case TCP_SEQ_STATE_LISTENING:
2189                 if (v != SEQ_START_TOKEN)
2190                         inet_listen_unlock(&tcp_hashinfo);
2191                 break;
2192         case TCP_SEQ_STATE_TIME_WAIT:
2193         case TCP_SEQ_STATE_ESTABLISHED:
2194                 if (v)
2195                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2196                 break;
2197         }
2198 }
2199
2200 static int tcp_seq_open(struct inode *inode, struct file *file)
2201 {
2202         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2203         struct seq_file *seq;
2204         struct tcp_iter_state *s;
2205         int rc;
2206
2207         if (unlikely(afinfo == NULL))
2208                 return -EINVAL;
2209
2210         s = kzalloc(sizeof(*s), GFP_KERNEL);
2211         if (!s)
2212                 return -ENOMEM;
2213         s->family               = afinfo->family;
2214         s->seq_ops.start        = tcp_seq_start;
2215         s->seq_ops.next         = tcp_seq_next;
2216         s->seq_ops.show         = afinfo->seq_show;
2217         s->seq_ops.stop         = tcp_seq_stop;
2218
2219         rc = seq_open(file, &s->seq_ops);
2220         if (rc)
2221                 goto out_kfree;
2222         seq          = file->private_data;
2223         seq->private = s;
2224 out:
2225         return rc;
2226 out_kfree:
2227         kfree(s);
2228         goto out;
2229 }
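/*
 * tcp_seq_open() wires the seq_file protocol together: ->start resumes
 * iteration at a saved position, ->next advances it, ->show formats one
 * record, and ->stop drops any locks. A standalone userspace analogue
 * of that protocol over a plain array (illustration only; all names
 * here are hypothetical):
 */
#if 0
#include <stdio.h>

static const int table[] = { 10, 20, 30 };
#define TABLE_LEN ((long)(sizeof(table) / sizeof(table[0])))

static const int *it_start(long *pos)           /* like ->start */
{
        return *pos < TABLE_LEN ? &table[*pos] : NULL;
}

static const int *it_next(const int *v, long *pos)      /* like ->next */
{
        ++*pos;
        return *pos < TABLE_LEN ? &table[*pos] : NULL;
}

static void it_show(const int *v)               /* like ->show */
{
        printf("%d\n", *v);
}

int main(void)
{
        long pos = 0;
        const int *v;

        for (v = it_start(&pos); v; v = it_next(v, &pos))
                it_show(v);
        /* a real ->stop would release locks taken by ->start */
        return 0;
}
#endif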
2230
2231 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2232 {
2233         int rc = 0;
2234         struct proc_dir_entry *p;
2235
2236         if (!afinfo)
2237                 return -EINVAL;
2238         afinfo->seq_fops->owner         = afinfo->owner;
2239         afinfo->seq_fops->open          = tcp_seq_open;
2240         afinfo->seq_fops->read          = seq_read;
2241         afinfo->seq_fops->llseek        = seq_lseek;
2242         afinfo->seq_fops->release       = seq_release_private;
2243
2244         p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2245         if (p)
2246                 p->data = afinfo;
2247         else
2248                 rc = -ENOMEM;
2249         return rc;
2250 }
2251
2252 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2253 {
2254         if (!afinfo)
2255                 return;
2256         proc_net_remove(&init_net, afinfo->name);
2257         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2258 }
2259
2260 static void get_openreq4(struct sock *sk, struct request_sock *req,
2261                          char *tmpbuf, int i, int uid)
2262 {
2263         const struct inet_request_sock *ireq = inet_rsk(req);
2264         int ttd = req->expires - jiffies;
2265
2266         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2267                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2268                 i,
2269                 ireq->loc_addr,
2270                 ntohs(inet_sk(sk)->sport),
2271                 ireq->rmt_addr,
2272                 ntohs(ireq->rmt_port),
2273                 TCP_SYN_RECV,
2274                 0, 0, /* could print option size, but that is af dependent. */
2275                 1,    /* timers active (only the expire timer) */
2276                 jiffies_to_clock_t(ttd),
2277                 req->retrans,
2278                 uid,
2279                 0,  /* non standard timer */
2280                 0, /* open_requests have no inode */
2281                 atomic_read(&sk->sk_refcnt),
2282                 req);
2283 }
2284
2285 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2286 {
2287         int timer_active;
2288         unsigned long timer_expires;
2289         struct tcp_sock *tp = tcp_sk(sk);
2290         const struct inet_connection_sock *icsk = inet_csk(sk);
2291         struct inet_sock *inet = inet_sk(sk);
2292         __be32 dest = inet->daddr;
2293         __be32 src = inet->rcv_saddr;
2294         __u16 destp = ntohs(inet->dport);
2295         __u16 srcp = ntohs(inet->sport);
2296
2297         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2298                 timer_active    = 1;
2299                 timer_expires   = icsk->icsk_timeout;
2300         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2301                 timer_active    = 4;
2302                 timer_expires   = icsk->icsk_timeout;
2303         } else if (timer_pending(&sk->sk_timer)) {
2304                 timer_active    = 2;
2305                 timer_expires   = sk->sk_timer.expires;
2306         } else {
2307                 timer_active    = 0;
2308                 timer_expires = jiffies;
2309         }
2310
2311         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2312                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2313                 i, src, srcp, dest, destp, sk->sk_state,
2314                 tp->write_seq - tp->snd_una,
2315                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2316                                              (tp->rcv_nxt - tp->copied_seq),
2317                 timer_active,
2318                 jiffies_to_clock_t(timer_expires - jiffies),
2319                 icsk->icsk_retransmits,
2320                 sock_i_uid(sk),
2321                 icsk->icsk_probes_out,
2322                 sock_i_ino(sk),
2323                 atomic_read(&sk->sk_refcnt), sk,
2324                 icsk->icsk_rto,
2325                 icsk->icsk_ack.ato,
2326                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2327                 tp->snd_cwnd,
2328                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2329 }
2330
2331 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2332                                char *tmpbuf, int i)
2333 {
2334         __be32 dest, src;
2335         __u16 destp, srcp;
2336         int ttd = tw->tw_ttd - jiffies;
2337
2338         if (ttd < 0)
2339                 ttd = 0;
2340
2341         dest  = tw->tw_daddr;
2342         src   = tw->tw_rcv_saddr;
2343         destp = ntohs(tw->tw_dport);
2344         srcp  = ntohs(tw->tw_sport);
2345
2346         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2347                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2348                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2349                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2350                 atomic_read(&tw->tw_refcnt), tw);
2351 }
2352
2353 #define TMPSZ 150
2354
2355 static int tcp4_seq_show(struct seq_file *seq, void *v)
2356 {
2357         struct tcp_iter_state *st;
2358         char tmpbuf[TMPSZ + 1];
2359
2360         if (v == SEQ_START_TOKEN) {
2361                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2362                            "  sl  local_address rem_address   st tx_queue "
2363                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2364                            "inode");
2365                 goto out;
2366         }
2367         st = seq->private;
2368
2369         switch (st->state) {
2370         case TCP_SEQ_STATE_LISTENING:
2371         case TCP_SEQ_STATE_ESTABLISHED:
2372                 get_tcp4_sock(v, tmpbuf, st->num);
2373                 break;
2374         case TCP_SEQ_STATE_OPENREQ:
2375                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2376                 break;
2377         case TCP_SEQ_STATE_TIME_WAIT:
2378                 get_timewait4_sock(v, tmpbuf, st->num);
2379                 break;
2380         }
2381         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2382 out:
2383         return 0;
2384 }
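/*
 * tcp4_seq_show() emits the fixed-width /proc/net/tcp records built by
 * get_tcp4_sock() above, with addresses and ports as hex ("%08X:%04X").
 * A userspace sketch that parses those columns back (illustration only,
 * not kernel code):
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/net/tcp", "r");
        char line[256];
        unsigned int laddr, lport, raddr, rport, state;

        if (!f)
                return 1;
        fgets(line, sizeof(line), f);           /* skip the header row */
        while (fgets(line, sizeof(line), f)) {
                if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
                           &laddr, &lport, &raddr, &rport, &state) == 5)
                        printf("local %08X:%u state %02X\n",
                               laddr, lport, state);
        }
        fclose(f);
        return 0;
}
#endif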
2385
2386 static struct file_operations tcp4_seq_fops;
2387 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2388         .owner          = THIS_MODULE,
2389         .name           = "tcp",
2390         .family         = AF_INET,
2391         .seq_show       = tcp4_seq_show,
2392         .seq_fops       = &tcp4_seq_fops,
2393 };
2394
2395 int __init tcp4_proc_init(void)
2396 {
2397         return tcp_proc_register(&tcp4_seq_afinfo);
2398 }
2399
2400 void tcp4_proc_exit(void)
2401 {
2402         tcp_proc_unregister(&tcp4_seq_afinfo);
2403 }
2404 #endif /* CONFIG_PROC_FS */
2405
2406 DEFINE_PROTO_INUSE(tcp)
2407
2408 struct proto tcp_prot = {
2409         .name                   = "TCP",
2410         .owner                  = THIS_MODULE,
2411         .close                  = tcp_close,
2412         .connect                = tcp_v4_connect,
2413         .disconnect             = tcp_disconnect,
2414         .accept                 = inet_csk_accept,
2415         .ioctl                  = tcp_ioctl,
2416         .init                   = tcp_v4_init_sock,
2417         .destroy                = tcp_v4_destroy_sock,
2418         .shutdown               = tcp_shutdown,
2419         .setsockopt             = tcp_setsockopt,
2420         .getsockopt             = tcp_getsockopt,
2421         .recvmsg                = tcp_recvmsg,
2422         .backlog_rcv            = tcp_v4_do_rcv,
2423         .hash                   = inet_hash,
2424         .unhash                 = inet_unhash,
2425         .get_port               = inet_csk_get_port,
2426         .enter_memory_pressure  = tcp_enter_memory_pressure,
2427         .sockets_allocated      = &tcp_sockets_allocated,
2428         .orphan_count           = &tcp_orphan_count,
2429         .memory_allocated       = &tcp_memory_allocated,
2430         .memory_pressure        = &tcp_memory_pressure,
2431         .sysctl_mem             = sysctl_tcp_mem,
2432         .sysctl_wmem            = sysctl_tcp_wmem,
2433         .sysctl_rmem            = sysctl_tcp_rmem,
2434         .max_header             = MAX_TCP_HEADER,
2435         .obj_size               = sizeof(struct tcp_sock),
2436         .twsk_prot              = &tcp_timewait_sock_ops,
2437         .rsk_prot               = &tcp_request_sock_ops,
2438         .hashinfo               = &tcp_hashinfo,
2439 #ifdef CONFIG_COMPAT
2440         .compat_setsockopt      = compat_tcp_setsockopt,
2441         .compat_getsockopt      = compat_tcp_getsockopt,
2442 #endif
2443         REF_PROTO_INUSE(tcp)
2444 };
2445
2446 void __init tcp_v4_init(struct net_proto_family *ops)
2447 {
2448         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2449                                      IPPROTO_TCP) < 0)
2450                 panic("Failed to create the TCP control socket.\n");
2451 }
2452
2453 EXPORT_SYMBOL(ipv4_specific);
2454 EXPORT_SYMBOL(tcp_hashinfo);
2455 EXPORT_SYMBOL(tcp_prot);
2456 EXPORT_SYMBOL(tcp_v4_conn_request);
2457 EXPORT_SYMBOL(tcp_v4_connect);
2458 EXPORT_SYMBOL(tcp_v4_do_rcv);
2459 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2460 EXPORT_SYMBOL(tcp_v4_send_check);
2461 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2462
2463 #ifdef CONFIG_PROC_FS
2464 EXPORT_SYMBOL(tcp_proc_register);
2465 EXPORT_SYMBOL(tcp_proc_unregister);
2466 #endif
2467 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2468