1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87 /* Check TCP sequence numbers in ICMP packets. */
88 #define ICMP_MIN_LENGTH 8
89
90 /* Socket used for sending RSTs */
91 static struct socket *tcp_socket __read_mostly;
92
93 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
97                                                    __be32 addr);
98 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
99                                    __be32 saddr, __be32 daddr,
100                                    struct tcphdr *th, int protocol,
101                                    int tcplen);
102 #endif
103
104 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
105         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
106         .lhash_users = ATOMIC_INIT(0),
107         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
108 };
109
110 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
111 {
112         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
113                                  inet_csk_bind_conflict);
114 }
115
116 static void tcp_v4_hash(struct sock *sk)
117 {
118         inet_hash(&tcp_hashinfo, sk);
119 }
120
121 void tcp_unhash(struct sock *sk)
122 {
123         inet_unhash(&tcp_hashinfo, sk);
124 }
125
126 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127 {
128         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
129                                           ip_hdr(skb)->saddr,
130                                           tcp_hdr(skb)->dest,
131                                           tcp_hdr(skb)->source);
132 }
133
134 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
135 {
136         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
137         struct tcp_sock *tp = tcp_sk(sk);
138
139         /* With PAWS, it is safe from the viewpoint
140            of data integrity. Even without PAWS it is safe provided sequence
141            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142
143            Actually, the idea is close to VJ's: only the timestamp cache is
144            held not per host but per port pair, and the TW bucket is used as
145            the state holder.
146
147            If the TW bucket has already been destroyed, we fall back to VJ's
148            scheme and use the initial timestamp retrieved from the peer table.
149          */
150         if (tcptw->tw_ts_recent_stamp &&
151             (twp == NULL || (sysctl_tcp_tw_reuse &&
152                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
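                /* Pick the new connection's initial write_seq beyond the old
                 * incarnation's snd_nxt plus the largest unscaled window
                 * (65535), so stray segments from the previous connection on
                 * this port pair should not be mistaken for new data.
                 */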
153                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154                 if (tp->write_seq == 0)
155                         tp->write_seq = 1;
156                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
157                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
158                 sock_hold(sktw);
159                 return 1;
160         }
161
162         return 0;
163 }
164
165 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
167 /* This will initiate an outgoing connection. */
168 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
169 {
170         struct inet_sock *inet = inet_sk(sk);
171         struct tcp_sock *tp = tcp_sk(sk);
172         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
173         struct rtable *rt;
174         __be32 daddr, nexthop;
175         int tmp;
176         int err;
177
178         if (addr_len < sizeof(struct sockaddr_in))
179                 return -EINVAL;
180
181         if (usin->sin_family != AF_INET)
182                 return -EAFNOSUPPORT;
183
184         nexthop = daddr = usin->sin_addr.s_addr;
185         if (inet->opt && inet->opt->srr) {
186                 if (!daddr)
187                         return -EINVAL;
188                 nexthop = inet->opt->faddr;
189         }
190
191         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
192                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
193                                IPPROTO_TCP,
194                                inet->sport, usin->sin_port, sk, 1);
195         if (tmp < 0)
196                 return tmp;
197
198         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
199                 ip_rt_put(rt);
200                 return -ENETUNREACH;
201         }
202
203         if (!inet->opt || !inet->opt->srr)
204                 daddr = rt->rt_dst;
205
206         if (!inet->saddr)
207                 inet->saddr = rt->rt_src;
208         inet->rcv_saddr = inet->saddr;
209
210         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
211                 /* Reset inherited state */
212                 tp->rx_opt.ts_recent       = 0;
213                 tp->rx_opt.ts_recent_stamp = 0;
214                 tp->write_seq              = 0;
215         }
216
217         if (tcp_death_row.sysctl_tw_recycle &&
218             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
219                 struct inet_peer *peer = rt_get_peer(rt);
220                 /*
221                  * VJ's idea. We save last timestamp seen from
222                  * the destination in peer table, when entering state
223                  * TIME-WAIT, and initialize rx_opt.ts_recent from it,
224                  * when trying new connection.
225                  */
226                 if (peer != NULL &&
227                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
228                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
229                         tp->rx_opt.ts_recent = peer->tcp_ts;
230                 }
231         }
232
233         inet->dport = usin->sin_port;
234         inet->daddr = daddr;
235
236         inet_csk(sk)->icsk_ext_hdr_len = 0;
237         if (inet->opt)
238                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
239
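        /* 536 = 576 (the RFC 1122 minimum reassembly buffer size) minus 40
         * bytes of IPv4 + TCP headers: the conservative default MSS, updated
         * from the peer's MSS option during the handshake.
         */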
240         tp->rx_opt.mss_clamp = 536;
241
242         /* Socket identity is still unknown (sport may be zero).
243          * However, we set the state to SYN-SENT and, without releasing the
244          * socket lock, select a source port, enter ourselves into the hash
245          * tables and complete initialization after this.
246          */
247         tcp_set_state(sk, TCP_SYN_SENT);
248         err = inet_hash_connect(&tcp_death_row, sk);
249         if (err)
250                 goto failure;
251
252         err = ip_route_newports(&rt, IPPROTO_TCP,
253                                 inet->sport, inet->dport, sk);
254         if (err)
255                 goto failure;
256
257         /* OK, now commit destination to socket.  */
258         sk->sk_gso_type = SKB_GSO_TCPV4;
259         sk_setup_caps(sk, &rt->u.dst);
260
261         if (!tp->write_seq)
262                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
263                                                            inet->daddr,
264                                                            inet->sport,
265                                                            usin->sin_port);
266
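        /* Seed the IP identification counter from the initial sequence
         * number mixed with jiffies, so successive connections do not start
         * from an easily predictable IP ID.
         */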
267         inet->id = tp->write_seq ^ jiffies;
268
269         err = tcp_connect(sk);
270         rt = NULL;
271         if (err)
272                 goto failure;
273
274         return 0;
275
276 failure:
277         /*
278          * This unhashes the socket and releases the local port,
279          * if necessary.
280          */
281         tcp_set_state(sk, TCP_CLOSE);
282         ip_rt_put(rt);
283         sk->sk_route_caps = 0;
284         inet->dport = 0;
285         return err;
286 }
287
288 /*
289  * This routine does path mtu discovery as defined in RFC1191.
290  */
291 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
292 {
293         struct dst_entry *dst;
294         struct inet_sock *inet = inet_sk(sk);
295
296         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
297          * sent out by Linux are always < 576 bytes, so they should go through
298          * unfragmented).
299          */
300         if (sk->sk_state == TCP_LISTEN)
301                 return;
302
303         /* We don't check in the dst entry whether pmtu discovery is forbidden
304          * on this route. We just assume that no packet-too-big packets
305          * are sent back when pmtu discovery is not active.
306          * There is a small race when the user changes this flag in the
307          * route, but I think that's acceptable.
308          */
309         if ((dst = __sk_dst_check(sk, 0)) == NULL)
310                 return;
311
312         dst->ops->update_pmtu(dst, mtu);
313
314         /* Something is about to go wrong... Remember the soft error
315          * in case this connection is not able to recover.
316          */
317         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
318                 sk->sk_err_soft = EMSGSIZE;
319
320         mtu = dst_mtu(dst);
321
322         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
323             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
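                /* The route now reports a smaller path MTU than the one we
                 * cached in icsk_pmtu_cookie; recompute mss_cache from it.
                 */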
324                 tcp_sync_mss(sk, mtu);
325
326                 /* Resend the TCP packet because it's
327                  * clear that the old packet has been
328                  * dropped. This is the new "fast" path mtu
329                  * discovery.
330                  */
331                 tcp_simple_retransmit(sk);
332         } /* else let the usual retransmit timer handle it */
333 }
334
335 /*
336  * This routine is called by the ICMP module when it gets some
337  * sort of error condition.  If err < 0 then the socket should
338  * be closed and the error returned to the user.  If err > 0
339  * it's just the icmp type << 8 | icmp code.  After adjustment
340  * header points to the first 8 bytes of the tcp header.  We need
341  * to find the appropriate port.
342  *
343  * The locking strategy used here is very "optimistic". When
344  * someone else accesses the socket the ICMP is just dropped
345  * and for some paths there is no check at all.
346  * A more general error queue to queue errors for later handling
347  * is probably better.
348  *
349  */
350
351 void tcp_v4_err(struct sk_buff *skb, u32 info)
352 {
353         struct iphdr *iph = (struct iphdr *)skb->data;
354         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
355         struct tcp_sock *tp;
356         struct inet_sock *inet;
357         const int type = icmp_hdr(skb)->type;
358         const int code = icmp_hdr(skb)->code;
359         struct sock *sk;
360         __u32 seq;
361         int err;
362
363         if (skb->len < (iph->ihl << 2) + 8) {
364                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
365                 return;
366         }
367
368         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
369                          th->source, inet_iif(skb));
370         if (!sk) {
371                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
372                 return;
373         }
374         if (sk->sk_state == TCP_TIME_WAIT) {
375                 inet_twsk_put(inet_twsk(sk));
376                 return;
377         }
378
379         bh_lock_sock(sk);
380         /* If too many ICMPs get dropped on busy
381          * servers this needs to be solved differently.
382          */
383         if (sock_owned_by_user(sk))
384                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
385
386         if (sk->sk_state == TCP_CLOSE)
387                 goto out;
388
389         tp = tcp_sk(sk);
390         seq = ntohl(th->seq);
391         if (sk->sk_state != TCP_LISTEN &&
392             !between(seq, tp->snd_una, tp->snd_nxt)) {
393                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
394                 goto out;
395         }
396
397         switch (type) {
398         case ICMP_SOURCE_QUENCH:
399                 /* Just silently ignore these. */
400                 goto out;
401         case ICMP_PARAMETERPROB:
402                 err = EPROTO;
403                 break;
404         case ICMP_DEST_UNREACH:
405                 if (code > NR_ICMP_UNREACH)
406                         goto out;
407
408                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
409                         if (!sock_owned_by_user(sk))
410                                 do_pmtu_discovery(sk, iph, info);
411                         goto out;
412                 }
413
414                 err = icmp_err_convert[code].errno;
415                 break;
416         case ICMP_TIME_EXCEEDED:
417                 err = EHOSTUNREACH;
418                 break;
419         default:
420                 goto out;
421         }
422
423         switch (sk->sk_state) {
424                 struct request_sock *req, **prev;
425         case TCP_LISTEN:
426                 if (sock_owned_by_user(sk))
427                         goto out;
428
429                 req = inet_csk_search_req(sk, &prev, th->dest,
430                                           iph->daddr, iph->saddr);
431                 if (!req)
432                         goto out;
433
434                 /* ICMPs are not backlogged, hence we cannot get
435                    an established socket here.
436                  */
437                 BUG_TRAP(!req->sk);
438
439                 if (seq != tcp_rsk(req)->snt_isn) {
440                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
441                         goto out;
442                 }
443
444                 /*
445                  * Still in SYN_RECV, just remove it silently.
446                  * There is no good way to pass the error to the newly
447                  * created socket, and POSIX does not want network
448                  * errors returned from accept().
449                  */
450                 inet_csk_reqsk_queue_drop(sk, req, prev);
451                 goto out;
452
453         case TCP_SYN_SENT:
454         case TCP_SYN_RECV:  /* Cannot happen.
455                                It can, e.g., if SYNs crossed.
456                              */
457                 if (!sock_owned_by_user(sk)) {
458                         sk->sk_err = err;
459
460                         sk->sk_error_report(sk);
461
462                         tcp_done(sk);
463                 } else {
464                         sk->sk_err_soft = err;
465                 }
466                 goto out;
467         }
468
469         /* If we've already connected we will keep trying
470          * until we time out, or the user gives up.
471          *
472          * RFC 1122 4.2.3.9 allows us to consider as hard errors
473          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
474          * but it is obsoleted by pmtu discovery).
475          *
476          * Note that in the modern internet, where routing is unreliable
477          * and broken firewalls sit in every dark corner sending random
478          * errors ordered by their masters, even these two messages finally lose
479          * their original sense (even Linux sends invalid PORT_UNREACHs).
480          *
481          * Now we are in compliance with RFCs.
482          *                                                      --ANK (980905)
483          */
484
485         inet = inet_sk(sk);
486         if (!sock_owned_by_user(sk) && inet->recverr) {
487                 sk->sk_err = err;
488                 sk->sk_error_report(sk);
489         } else  { /* Only an error on timeout */
490                 sk->sk_err_soft = err;
491         }
492
493 out:
494         bh_unlock_sock(sk);
495         sock_put(sk);
496 }
497
498 /* This routine computes an IPv4 TCP checksum. */
499 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
500 {
501         struct inet_sock *inet = inet_sk(sk);
502         struct tcphdr *th = tcp_hdr(skb);
503
504         if (skb->ip_summed == CHECKSUM_PARTIAL) {
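                /* Hardware checksum offload: seed th->check with the
                 * complemented pseudo-header checksum only; the device
                 * completes the sum over the segment starting at csum_start
                 * and stores the result at csum_offset.
                 */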
505                 th->check = ~tcp_v4_check(len, inet->saddr,
506                                           inet->daddr, 0);
507                 skb->csum_start = skb_transport_header(skb) - skb->head;
508                 skb->csum_offset = offsetof(struct tcphdr, check);
509         } else {
510                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
511                                          csum_partial((char *)th,
512                                                       th->doff << 2,
513                                                       skb->csum));
514         }
515 }
516
517 int tcp_v4_gso_send_check(struct sk_buff *skb)
518 {
519         const struct iphdr *iph;
520         struct tcphdr *th;
521
522         if (!pskb_may_pull(skb, sizeof(*th)))
523                 return -EINVAL;
524
525         iph = ip_hdr(skb);
526         th = tcp_hdr(skb);
527
528         th->check = 0;
529         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
530         skb->csum_start = skb_transport_header(skb) - skb->head;
531         skb->csum_offset = offsetof(struct tcphdr, check);
532         skb->ip_summed = CHECKSUM_PARTIAL;
533         return 0;
534 }
535
536 /*
537  *      This routine will send an RST to the other tcp.
538  *
539  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
540  *                    for the reset?
541  *      Answer: if a packet caused an RST, it is not for a socket
542  *              existing in our system; if it is matched to a socket,
543  *              it is just a duplicate segment or a bug in the other side's TCP.
544  *              So we build the reply based only on the parameters
545  *              that arrived with the segment.
546  *      Exception: precedence violation. We do not implement it in any case.
547  */
548
549 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
550 {
551         struct tcphdr *th = tcp_hdr(skb);
552         struct {
553                 struct tcphdr th;
554 #ifdef CONFIG_TCP_MD5SIG
555                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
556 #endif
557         } rep;
558         struct ip_reply_arg arg;
559 #ifdef CONFIG_TCP_MD5SIG
560         struct tcp_md5sig_key *key;
561 #endif
562
563         /* Never send a reset in response to a reset. */
564         if (th->rst)
565                 return;
566
567         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
568                 return;
569
570         /* Swap the send and the receive. */
571         memset(&rep, 0, sizeof(rep));
572         rep.th.dest   = th->source;
573         rep.th.source = th->dest;
574         rep.th.doff   = sizeof(struct tcphdr) / 4;
575         rep.th.rst    = 1;
576
577         if (th->ack) {
578                 rep.th.seq = th->ack_seq;
579         } else {
580                 rep.th.ack = 1;
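                /* ACK exactly the sequence space the offending segment
                 * consumed: its payload length plus one each for the SYN and
                 * FIN flags, if set.
                 */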
581                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
582                                        skb->len - (th->doff << 2));
583         }
584
585         memset(&arg, 0, sizeof(arg));
586         arg.iov[0].iov_base = (unsigned char *)&rep;
587         arg.iov[0].iov_len  = sizeof(rep.th);
588
589 #ifdef CONFIG_TCP_MD5SIG
590         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
591         if (key) {
592                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
593                                    (TCPOPT_NOP << 16) |
594                                    (TCPOPT_MD5SIG << 8) |
595                                    TCPOLEN_MD5SIG);
596                 /* Update length and the length the header thinks exists */
597                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
598                 rep.th.doff = arg.iov[0].iov_len / 4;
599
600                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
601                                         key,
602                                         ip_hdr(skb)->daddr,
603                                         ip_hdr(skb)->saddr,
604                                         &rep.th, IPPROTO_TCP,
605                                         arg.iov[0].iov_len);
606         }
607 #endif
608         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
609                                       ip_hdr(skb)->saddr, /* XXX */
610                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
611         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
612
613         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
614
615         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
616         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
617 }
618
619 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
620    outside of socket context, is certainly ugly. What can I do?
621  */
622
623 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
624                             struct sk_buff *skb, u32 seq, u32 ack,
625                             u32 win, u32 ts)
626 {
627         struct tcphdr *th = tcp_hdr(skb);
628         struct {
629                 struct tcphdr th;
630                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
631 #ifdef CONFIG_TCP_MD5SIG
632                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
633 #endif
634                         ];
635         } rep;
636         struct ip_reply_arg arg;
637 #ifdef CONFIG_TCP_MD5SIG
638         struct tcp_md5sig_key *key;
639         struct tcp_md5sig_key tw_key;
640 #endif
641
642         memset(&rep.th, 0, sizeof(struct tcphdr));
643         memset(&arg, 0, sizeof(arg));
644
645         arg.iov[0].iov_base = (unsigned char *)&rep;
646         arg.iov[0].iov_len  = sizeof(rep.th);
647         if (ts) {
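                /* Timestamp option, word-aligned: two NOPs, then the
                 * kind/length bytes, followed by the 32-bit TSval and TSecr
                 * values in the next two option words.
                 */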
648                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
649                                    (TCPOPT_TIMESTAMP << 8) |
650                                    TCPOLEN_TIMESTAMP);
651                 rep.opt[1] = htonl(tcp_time_stamp);
652                 rep.opt[2] = htonl(ts);
653                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
654         }
655
656         /* Swap the send and the receive. */
657         rep.th.dest    = th->source;
658         rep.th.source  = th->dest;
659         rep.th.doff    = arg.iov[0].iov_len / 4;
660         rep.th.seq     = htonl(seq);
661         rep.th.ack_seq = htonl(ack);
662         rep.th.ack     = 1;
663         rep.th.window  = htons(win);
664
665 #ifdef CONFIG_TCP_MD5SIG
666         /*
667          * The SKB holds an incoming packet, but may not have a valid ->sk
668          * pointer. This is especially the case when we're dealing with a
669          * TIME_WAIT ack, because the sk structure is long gone, and only
670          * the tcp_timewait_sock remains. So the md5 key is stashed in that
671          * structure, and we use it in preference.  I believe that (twsk ||
672          * skb->sk) holds true, but we program defensively.
673          */
674         if (!twsk && skb->sk) {
675                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
676         } else if (twsk && twsk->tw_md5_keylen) {
677                 tw_key.key = twsk->tw_md5_key;
678                 tw_key.keylen = twsk->tw_md5_keylen;
679                 key = &tw_key;
680         } else
681                 key = NULL;
682
683         if (key) {
684                 int offset = (ts) ? 3 : 0;
685
686                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
687                                           (TCPOPT_NOP << 16) |
688                                           (TCPOPT_MD5SIG << 8) |
689                                           TCPOLEN_MD5SIG);
690                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
691                 rep.th.doff = arg.iov[0].iov_len/4;
692
693                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
694                                         key,
695                                         ip_hdr(skb)->daddr,
696                                         ip_hdr(skb)->saddr,
697                                         &rep.th, IPPROTO_TCP,
698                                         arg.iov[0].iov_len);
699         }
700 #endif
701         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
702                                       ip_hdr(skb)->saddr, /* XXX */
703                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
704         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
705
706         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
707
708         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
709 }
710
711 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
712 {
713         struct inet_timewait_sock *tw = inet_twsk(sk);
714         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
715
716         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
717                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
718                         tcptw->tw_ts_recent);
719
720         inet_twsk_put(tw);
721 }
722
723 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
724                                   struct request_sock *req)
725 {
726         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
727                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
728                         req->ts_recent);
729 }
730
731 /*
732  *      Send a SYN-ACK after having received an ACK.
733  *      This still operates on a request_sock only, not on a big
734  *      socket.
735  */
736 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
737                               struct dst_entry *dst)
738 {
739         const struct inet_request_sock *ireq = inet_rsk(req);
740         int err = -1;
741         struct sk_buff * skb;
742
743         /* First, grab a route. */
744         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
745                 goto out;
746
747         skb = tcp_make_synack(sk, dst, req);
748
749         if (skb) {
750                 struct tcphdr *th = tcp_hdr(skb);
751
752                 th->check = tcp_v4_check(skb->len,
753                                          ireq->loc_addr,
754                                          ireq->rmt_addr,
755                                          csum_partial((char *)th, skb->len,
756                                                       skb->csum));
757
758                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
759                                             ireq->rmt_addr,
760                                             ireq->opt);
761                 err = net_xmit_eval(err);
762         }
763
764 out:
765         dst_release(dst);
766         return err;
767 }
768
769 /*
770  *      IPv4 request_sock destructor.
771  */
772 static void tcp_v4_reqsk_destructor(struct request_sock *req)
773 {
774         kfree(inet_rsk(req)->opt);
775 }
776
777 #ifdef CONFIG_SYN_COOKIES
778 static void syn_flood_warning(struct sk_buff *skb)
779 {
780         static unsigned long warntime;
781
782         if (time_after(jiffies, (warntime + HZ * 60))) {
783                 warntime = jiffies;
784                 printk(KERN_INFO
785                        "possible SYN flooding on port %d. Sending cookies.\n",
786                        ntohs(tcp_hdr(skb)->dest));
787         }
788 }
789 #endif
790
791 /*
792  * Save and compile IPv4 options into the request_sock if needed.
793  */
794 static struct ip_options *tcp_v4_save_options(struct sock *sk,
795                                               struct sk_buff *skb)
796 {
797         struct ip_options *opt = &(IPCB(skb)->opt);
798         struct ip_options *dopt = NULL;
799
800         if (opt && opt->optlen) {
801                 int opt_size = optlength(opt);
802                 dopt = kmalloc(opt_size, GFP_ATOMIC);
803                 if (dopt) {
804                         if (ip_options_echo(dopt, skb)) {
805                                 kfree(dopt);
806                                 dopt = NULL;
807                         }
808                 }
809         }
810         return dopt;
811 }
812
813 #ifdef CONFIG_TCP_MD5SIG
814 /*
815  * RFC2385 MD5 checksumming requires a mapping of
816  * IP address->MD5 Key.
817  * We need to maintain these in the sk structure.
818  */
819
820 /* Find the Key structure for an address.  */
821 static struct tcp_md5sig_key *
822                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
823 {
824         struct tcp_sock *tp = tcp_sk(sk);
825         int i;
826
827         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
828                 return NULL;
829         for (i = 0; i < tp->md5sig_info->entries4; i++) {
830                 if (tp->md5sig_info->keys4[i].addr == addr)
831                         return (struct tcp_md5sig_key *)
832                                                 &tp->md5sig_info->keys4[i];
833         }
834         return NULL;
835 }
836
837 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
838                                          struct sock *addr_sk)
839 {
840         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
841 }
842
843 EXPORT_SYMBOL(tcp_v4_md5_lookup);
844
845 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
846                                                       struct request_sock *req)
847 {
848         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
849 }
850
851 /* This can be called on a newly created socket, from other files */
852 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
853                       u8 *newkey, u8 newkeylen)
854 {
855         /* Add Key to the list */
856         struct tcp4_md5sig_key *key;
857         struct tcp_sock *tp = tcp_sk(sk);
858         struct tcp4_md5sig_key *keys;
859
860         key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
861         if (key) {
862                 /* Pre-existing entry - just update that one. */
863                 kfree(key->key);
864                 key->key = newkey;
865                 key->keylen = newkeylen;
866         } else {
867                 struct tcp_md5sig_info *md5sig;
868
869                 if (!tp->md5sig_info) {
870                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
871                                                   GFP_ATOMIC);
872                         if (!tp->md5sig_info) {
873                                 kfree(newkey);
874                                 return -ENOMEM;
875                         }
876                 }
877                 if (tcp_alloc_md5sig_pool() == NULL) {
878                         kfree(newkey);
879                         return -ENOMEM;
880                 }
881                 md5sig = tp->md5sig_info;
882
883                 if (md5sig->alloced4 == md5sig->entries4) {
884                         keys = kmalloc((sizeof(*keys) *
885                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
886                         if (!keys) {
887                                 kfree(newkey);
888                                 tcp_free_md5sig_pool();
889                                 return -ENOMEM;
890                         }
891
892                         if (md5sig->entries4)
893                                 memcpy(keys, md5sig->keys4,
894                                        sizeof(*keys) * md5sig->entries4);
895
896                         /* Free old key list, and reference new one */
897                         if (md5sig->keys4)
898                                 kfree(md5sig->keys4);
899                         md5sig->keys4 = keys;
900                         md5sig->alloced4++;
901                 }
902                 md5sig->entries4++;
903                 md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
904                 md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
905                 md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
906         }
907         return 0;
908 }
909
910 EXPORT_SYMBOL(tcp_v4_md5_do_add);
911
912 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
913                                u8 *newkey, u8 newkeylen)
914 {
915         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
916                                  newkey, newkeylen);
917 }
918
919 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
920 {
921         struct tcp_sock *tp = tcp_sk(sk);
922         int i;
923
924         for (i = 0; i < tp->md5sig_info->entries4; i++) {
925                 if (tp->md5sig_info->keys4[i].addr == addr) {
926                         /* Free the key */
927                         kfree(tp->md5sig_info->keys4[i].key);
928                         tp->md5sig_info->entries4--;
929
930                         if (tp->md5sig_info->entries4 == 0) {
931                                 kfree(tp->md5sig_info->keys4);
932                                 tp->md5sig_info->keys4 = NULL;
933                                 tp->md5sig_info->alloced4 = 0;
934                         } else if (tp->md5sig_info->entries4 != i) {
935                                 /* Need to do some manipulation */
936                                 memcpy(&tp->md5sig_info->keys4[i],
937                                        &tp->md5sig_info->keys4[i+1],
938                                        (tp->md5sig_info->entries4 - i) *
939                                         sizeof(struct tcp4_md5sig_key));
940                         }
941                         tcp_free_md5sig_pool();
942                         return 0;
943                 }
944         }
945         return -ENOENT;
946 }
947
948 EXPORT_SYMBOL(tcp_v4_md5_do_del);
949
950 static void tcp_v4_clear_md5_list(struct sock *sk)
951 {
952         struct tcp_sock *tp = tcp_sk(sk);
953
954         /* Free each key, then the set of keys,
955          * the crypto element, and then decrement our
956          * hold on the last resort crypto.
957          */
958         if (tp->md5sig_info->entries4) {
959                 int i;
960                 for (i = 0; i < tp->md5sig_info->entries4; i++)
961                         kfree(tp->md5sig_info->keys4[i].key);
962                 tp->md5sig_info->entries4 = 0;
963                 tcp_free_md5sig_pool();
964         }
965         if (tp->md5sig_info->keys4) {
966                 kfree(tp->md5sig_info->keys4);
967                 tp->md5sig_info->keys4 = NULL;
968                 tp->md5sig_info->alloced4  = 0;
969         }
970 }
971
972 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
973                                  int optlen)
974 {
975         struct tcp_md5sig cmd;
976         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
977         u8 *newkey;
978
979         if (optlen < sizeof(cmd))
980                 return -EINVAL;
981
982         if (copy_from_user(&cmd, optval, sizeof(cmd)))
983                 return -EFAULT;
984
985         if (sin->sin_family != AF_INET)
986                 return -EINVAL;
987
988         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
989                 if (!tcp_sk(sk)->md5sig_info)
990                         return -ENOENT;
991                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
992         }
993
994         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
995                 return -EINVAL;
996
997         if (!tcp_sk(sk)->md5sig_info) {
998                 struct tcp_sock *tp = tcp_sk(sk);
999                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
1000
1001                 if (!p)
1002                         return -EINVAL;
1003
1004                 tp->md5sig_info = p;
1005
1006         }
1007
1008         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1009         if (!newkey)
1010                 return -ENOMEM;
1011         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1012                                  newkey, cmd.tcpm_keylen);
1013 }
1014
1015 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1016                                    __be32 saddr, __be32 daddr,
1017                                    struct tcphdr *th, int protocol,
1018                                    int tcplen)
1019 {
1020         struct scatterlist sg[4];
1021         __u16 data_len;
1022         int block = 0;
1023         __sum16 old_checksum;
1024         struct tcp_md5sig_pool *hp;
1025         struct tcp4_pseudohdr *bp;
1026         struct hash_desc *desc;
1027         int err;
1028         unsigned int nbytes = 0;
1029
1030         /*
1031          * Okay, so RFC2385 is turned on for this connection,
1032          * so we need to generate the MD5 hash for the packet now.
1033          */
1034
1035         hp = tcp_get_md5sig_pool();
1036         if (!hp)
1037                 goto clear_hash_noput;
1038
1039         bp = &hp->md5_blk.ip4;
1040         desc = &hp->md5_desc;
1041
1042         /*
1043          * 1. the TCP pseudo-header (in the order: source IP address,
1044          * destination IP address, zero-padded protocol number, and
1045          * segment length)
1046          */
1047         bp->saddr = saddr;
1048         bp->daddr = daddr;
1049         bp->pad = 0;
1050         bp->protocol = protocol;
1051         bp->len = htons(tcplen);
1052         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1053         nbytes += sizeof(*bp);
1054
1055         /* 2. the TCP header, excluding options, and assuming a
1056          * checksum of zero.
1057          */
1058         old_checksum = th->check;
1059         th->check = 0;
1060         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1061         nbytes += sizeof(struct tcphdr);
1062
1063         /* 3. the TCP segment data (if any) */
1064         data_len = tcplen - (th->doff << 2);
1065         if (data_len > 0) {
1066                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1067                 sg_set_buf(&sg[block++], data, data_len);
1068                 nbytes += data_len;
1069         }
1070
1071         /* 4. an independently-specified key or password, known to both
1072          * TCPs and presumably connection-specific
1073          */
1074         sg_set_buf(&sg[block++], key->key, key->keylen);
1075         nbytes += key->keylen;
1076
1077         /* Now store the Hash into the packet */
1078         err = crypto_hash_init(desc);
1079         if (err)
1080                 goto clear_hash;
1081         err = crypto_hash_update(desc, sg, nbytes);
1082         if (err)
1083                 goto clear_hash;
1084         err = crypto_hash_final(desc, md5_hash);
1085         if (err)
1086                 goto clear_hash;
1087
1088         /* Reset header, and free up the crypto */
1089         tcp_put_md5sig_pool();
1090         th->check = old_checksum;
1091
1092 out:
1093         return 0;
1094 clear_hash:
1095         tcp_put_md5sig_pool();
1096 clear_hash_noput:
1097         memset(md5_hash, 0, 16);
1098         goto out;
1099 }
1100
1101 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1102                          struct sock *sk,
1103                          struct dst_entry *dst,
1104                          struct request_sock *req,
1105                          struct tcphdr *th, int protocol,
1106                          int tcplen)
1107 {
1108         __be32 saddr, daddr;
1109
1110         if (sk) {
1111                 saddr = inet_sk(sk)->saddr;
1112                 daddr = inet_sk(sk)->daddr;
1113         } else {
1114                 struct rtable *rt = (struct rtable *)dst;
1115                 BUG_ON(!rt);
1116                 saddr = rt->rt_src;
1117                 daddr = rt->rt_dst;
1118         }
1119         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1120                                        saddr, daddr,
1121                                        th, protocol, tcplen);
1122 }
1123
1124 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1125
1126 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1127 {
1128         /*
1129          * This gets called for each TCP segment that arrives
1130          * so we want to be efficient.
1131          * We have 3 drop cases:
1132          * o No MD5 hash and one expected.
1133          * o MD5 hash and we're not expecting one.
1134          * o MD5 hash and it's wrong.
1135          */
1136         __u8 *hash_location = NULL;
1137         struct tcp_md5sig_key *hash_expected;
1138         const struct iphdr *iph = ip_hdr(skb);
1139         struct tcphdr *th = tcp_hdr(skb);
1140         int length = (th->doff << 2) - sizeof(struct tcphdr);
1141         int genhash;
1142         unsigned char *ptr;
1143         unsigned char newhash[16];
1144
1145         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1146
1147         /*
1148          * If the TCP option length is less than the TCP_MD5SIG
1149          * option length, then we can shortcut
1150          */
1151         if (length < TCPOLEN_MD5SIG) {
1152                 if (hash_expected)
1153                         return 1;
1154                 else
1155                         return 0;
1156         }
1157
1158         /* Okay, we can't shortcut - we have to grub through the options */
1159         ptr = (unsigned char *)(th + 1);
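        /* Each TCP option is a kind byte followed (except for EOL and NOP)
         * by a length byte that covers the whole option; EOL ends the list
         * and NOP is single-byte padding.
         */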
1160         while (length > 0) {
1161                 int opcode = *ptr++;
1162                 int opsize;
1163
1164                 switch (opcode) {
1165                 case TCPOPT_EOL:
1166                         goto done_opts;
1167                 case TCPOPT_NOP:
1168                         length--;
1169                         continue;
1170                 default:
1171                         opsize = *ptr++;
1172                         if (opsize < 2)
1173                                 goto done_opts;
1174                         if (opsize > length)
1175                                 goto done_opts;
1176
1177                         if (opcode == TCPOPT_MD5SIG) {
1178                                 hash_location = ptr;
1179                                 goto done_opts;
1180                         }
1181                 }
1182                 ptr += opsize-2;
1183                 length -= opsize;
1184         }
1185 done_opts:
1186         /* We've parsed the options - do we have a hash? */
1187         if (!hash_expected && !hash_location)
1188                 return 0;
1189
1190         if (hash_expected && !hash_location) {
1191                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1192                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1193                                NIPQUAD(iph->saddr), ntohs(th->source),
1194                                NIPQUAD(iph->daddr), ntohs(th->dest));
1195                 return 1;
1196         }
1197
1198         if (!hash_expected && hash_location) {
1199                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1200                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1201                                NIPQUAD(iph->saddr), ntohs(th->source),
1202                                NIPQUAD(iph->daddr), ntohs(th->dest));
1203                 return 1;
1204         }
1205
1206         /* Okay, so this is hash_expected and hash_location -
1207          * so we need to calculate the expected hash.
1208          */
1209         genhash = tcp_v4_do_calc_md5_hash(newhash,
1210                                           hash_expected,
1211                                           iph->saddr, iph->daddr,
1212                                           th, sk->sk_protocol,
1213                                           skb->len);
1214
1215         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1216                 if (net_ratelimit()) {
1217                         printk(KERN_INFO "MD5 Hash failed for "
1218                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1219                                NIPQUAD(iph->saddr), ntohs(th->source),
1220                                NIPQUAD(iph->daddr), ntohs(th->dest),
1221                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1222                 }
1223                 return 1;
1224         }
1225         return 0;
1226 }
1227
1228 #endif
1229
1230 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1231         .family         =       PF_INET,
1232         .obj_size       =       sizeof(struct tcp_request_sock),
1233         .rtx_syn_ack    =       tcp_v4_send_synack,
1234         .send_ack       =       tcp_v4_reqsk_send_ack,
1235         .destructor     =       tcp_v4_reqsk_destructor,
1236         .send_reset     =       tcp_v4_send_reset,
1237 };
1238
1239 #ifdef CONFIG_TCP_MD5SIG
1240 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1241         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1242 };
1243 #endif
1244
1245 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1246         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1247         .twsk_unique    = tcp_twsk_unique,
1248         .twsk_destructor= tcp_twsk_destructor,
1249 };
1250
1251 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1252 {
1253         struct inet_request_sock *ireq;
1254         struct tcp_options_received tmp_opt;
1255         struct request_sock *req;
1256         __be32 saddr = ip_hdr(skb)->saddr;
1257         __be32 daddr = ip_hdr(skb)->daddr;
1258         __u32 isn = TCP_SKB_CB(skb)->when;
1259         struct dst_entry *dst = NULL;
1260 #ifdef CONFIG_SYN_COOKIES
1261         int want_cookie = 0;
1262 #else
1263 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1264 #endif
1265
1266         /* Never answer SYNs sent to broadcast or multicast addresses. */
1267         if (((struct rtable *)skb->dst)->rt_flags &
1268             (RTCF_BROADCAST | RTCF_MULTICAST))
1269                 goto drop;
1270
1271         /* TW buckets are converted to open requests without
1272          * limitation; they conserve resources and the peer is
1273          * evidently a real one.
1274          */
1275         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1276 #ifdef CONFIG_SYN_COOKIES
1277                 if (sysctl_tcp_syncookies) {
1278                         want_cookie = 1;
1279                 } else
1280 #endif
1281                 goto drop;
1282         }
1283
1284         /* Accept backlog is full. If we have already queued enough
1285          * warm entries in the syn queue, drop the request. It is better than
1286          * clogging the syn queue with openreqs with exponentially increasing
1287          * timeout.
1288          */
1289         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1290                 goto drop;
1291
1292         req = reqsk_alloc(&tcp_request_sock_ops);
1293         if (!req)
1294                 goto drop;
1295
1296 #ifdef CONFIG_TCP_MD5SIG
1297         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1298 #endif
1299
1300         tcp_clear_options(&tmp_opt);
1301         tmp_opt.mss_clamp = 536;
1302         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1303
1304         tcp_parse_options(skb, &tmp_opt, 0);
1305
1306         if (want_cookie) {
1307                 tcp_clear_options(&tmp_opt);
1308                 tmp_opt.saw_tstamp = 0;
1309         }
1310
1311         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1312                 /* Some OSes (unknown ones, but I see them on web servers which
1313                  * contain information interesting only for Windows
1314                  * users) do not send their stamp in the SYN. It is an easy case.
1315                  * We simply do not advertise TS support.
1316                  */
1317                 tmp_opt.saw_tstamp = 0;
1318                 tmp_opt.tstamp_ok  = 0;
1319         }
1320         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1321
1322         tcp_openreq_init(req, &tmp_opt, skb);
1323
1324         if (security_inet_conn_request(sk, skb, req))
1325                 goto drop_and_free;
1326
1327         ireq = inet_rsk(req);
1328         ireq->loc_addr = daddr;
1329         ireq->rmt_addr = saddr;
1330         ireq->opt = tcp_v4_save_options(sk, skb);
1331         if (!want_cookie)
1332                 TCP_ECN_create_request(req, tcp_hdr(skb));
1333
1334         if (want_cookie) {
1335 #ifdef CONFIG_SYN_COOKIES
1336                 syn_flood_warning(skb);
1337 #endif
1338                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1339         } else if (!isn) {
1340                 struct inet_peer *peer = NULL;
1341
1342                 /* VJ's idea. We save last timestamp seen
1343                  * from the destination in peer table, when entering
1344                  * state TIME-WAIT, and check against it before
1345                  * accepting new connection request.
1346                  *
1347                  * If "isn" is not zero, this request hit an alive
1348                  * timewait bucket, so all the necessary checks
1349                  * were already made in the function processing the timewait state.
1350                  */
1351                 if (tmp_opt.saw_tstamp &&
1352                     tcp_death_row.sysctl_tw_recycle &&
1353                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1354                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1355                     peer->v4daddr == saddr) {
1356                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1357                             (s32)(peer->tcp_ts - req->ts_recent) >
1358                                                         TCP_PAWS_WINDOW) {
1359                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1360                                 dst_release(dst);
1361                                 goto drop_and_free;
1362                         }
1363                 }
1364                 /* Kill the following clause, if you dislike this way. */
1365                 else if (!sysctl_tcp_syncookies &&
1366                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1367                           (sysctl_max_syn_backlog >> 2)) &&
1368                          (!peer || !peer->tcp_ts_stamp) &&
1369                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1370                         /* Without syncookies, the last quarter of the
1371                          * backlog is filled with destinations
1372                          * proven to be alive.
1373                          * It means that we continue to communicate
1374                          * with destinations already remembered
1375                          * at the moment of the synflood.
1376                          */
1377                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1378                                        "request from %u.%u.%u.%u/%u\n",
1379                                        NIPQUAD(saddr),
1380                                        ntohs(tcp_hdr(skb)->source));
1381                         dst_release(dst);
1382                         goto drop_and_free;
1383                 }
1384
1385                 isn = tcp_v4_init_sequence(skb);
1386         }
1387         tcp_rsk(req)->snt_isn = isn;
1388
1389         if (tcp_v4_send_synack(sk, req, dst))
1390                 goto drop_and_free;
1391
1392         if (want_cookie) {
1393                 reqsk_free(req);
1394         } else {
1395                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1396         }
1397         return 0;
1398
1399 drop_and_free:
1400         reqsk_free(req);
1401 drop:
1402         return 0;
1403 }
1404
1405
1406 /*
1407  * The three way handshake has completed - we got a valid synack -
1408  * now create the new socket.
1409  */
1410 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1411                                   struct request_sock *req,
1412                                   struct dst_entry *dst)
1413 {
1414         struct inet_request_sock *ireq;
1415         struct inet_sock *newinet;
1416         struct tcp_sock *newtp;
1417         struct sock *newsk;
1418 #ifdef CONFIG_TCP_MD5SIG
1419         struct tcp_md5sig_key *key;
1420 #endif
1421
1422         if (sk_acceptq_is_full(sk))
1423                 goto exit_overflow;
1424
1425         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1426                 goto exit;
1427
1428         newsk = tcp_create_openreq_child(sk, req, skb);
1429         if (!newsk)
1430                 goto exit;
1431
1432         newsk->sk_gso_type = SKB_GSO_TCPV4;
1433         sk_setup_caps(newsk, dst);
1434
1435         newtp                 = tcp_sk(newsk);
1436         newinet               = inet_sk(newsk);
1437         ireq                  = inet_rsk(req);
1438         newinet->daddr        = ireq->rmt_addr;
1439         newinet->rcv_saddr    = ireq->loc_addr;
1440         newinet->saddr        = ireq->loc_addr;
1441         newinet->opt          = ireq->opt;
1442         ireq->opt             = NULL;
1443         newinet->mc_index     = inet_iif(skb);
1444         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1445         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1446         if (newinet->opt)
1447                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1448         newinet->id = newtp->write_seq ^ jiffies;
1449
1450         tcp_mtup_init(newsk);
1451         tcp_sync_mss(newsk, dst_mtu(dst));
1452         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1453         tcp_initialize_rcv_mss(newsk);
1454
1455 #ifdef CONFIG_TCP_MD5SIG
1456         /* Copy over the MD5 key from the original socket */
1457         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1458                 /*
1459                  * We're using one, so create a matching key
1460                  * on the newsk structure. If we fail to get
1461                  * memory, then we end up not copying the key
1462                  * across. Shucks.
1463                  */
1464                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1465                 if (newkey != NULL)
1466                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1467                                           newkey, key->keylen);
1468         }
1469 #endif
1470
1471         __inet_hash(&tcp_hashinfo, newsk, 0);
1472         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1473
1474         return newsk;
1475
1476 exit_overflow:
1477         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1478 exit:
1479         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1480         dst_release(dst);
1481         return NULL;
1482 }
1483
1484 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1485 {
1486         struct tcphdr *th = tcp_hdr(skb);
1487         const struct iphdr *iph = ip_hdr(skb);
1488         struct sock *nsk;
1489         struct request_sock **prev;
1490         /* Find possible connection requests. */
1491         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1492                                                        iph->saddr, iph->daddr);
1493         if (req)
1494                 return tcp_check_req(sk, skb, req, prev);
1495
1496         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1497                                       iph->daddr, th->dest, inet_iif(skb));
1498
1499         if (nsk) {
1500                 if (nsk->sk_state != TCP_TIME_WAIT) {
1501                         bh_lock_sock(nsk);
1502                         return nsk;
1503                 }
1504                 inet_twsk_put(inet_twsk(nsk));
1505                 return NULL;
1506         }
1507
1508 #ifdef CONFIG_SYN_COOKIES
1509         if (!th->rst && !th->syn && th->ack)
1510                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1511 #endif
1512         return sk;
1513 }
1514
1515 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1516 {
1517         const struct iphdr *iph = ip_hdr(skb);
1518
1519         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1520                 if (!tcp_v4_check(skb->len, iph->saddr,
1521                                   iph->daddr, skb->csum)) {
1522                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1523                         return 0;
1524                 }
1525         }
1526
1527         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1528                                        skb->len, IPPROTO_TCP, 0);
1529
1530         if (skb->len <= 76) {
1531                 return __skb_checksum_complete(skb);
1532         }
1533         return 0;
1534 }
1535
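/*
 * Illustration only, not part of the original file: csum_tcpudp_nofold()
 * and __skb_checksum_complete() above ultimately reduce to the standard
 * 16-bit one's-complement Internet checksum (RFC 1071). A portable sketch
 * of that computation over a byte buffer, folding carries at the end:
 */
static inline unsigned short csum_fold_sketch(const unsigned char *data,
                                              unsigned int len,
                                              unsigned long sum)
{
        while (len > 1) {
                sum += ((unsigned long)data[0] << 8) | data[1];
                data += 2;
                len -= 2;
        }
        if (len)                        /* odd trailing byte, zero padded */
                sum += (unsigned long)data[0] << 8;
        while (sum >> 16)               /* fold carries into the low 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;    /* one's complement of the sum */
}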
1536
1537 /* The socket must have its spinlock held when we get
1538  * here.
1539  *
1540  * We have a potential double-lock case here, so even when
1541  * doing backlog processing we use the BH locking scheme.
1542  * This is because we cannot sleep with the original spinlock
1543  * held.
1544  */
1545 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 {
1547         struct sock *rsk;
1548 #ifdef CONFIG_TCP_MD5SIG
1549         /*
1550          * We really want to reject the packet as early as possible
1551          * if:
1552          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1553          *  o There is an MD5 option and we're not expecting one
1554          */
1555         if (tcp_v4_inbound_md5_hash(sk, skb))
1556                 goto discard;
1557 #endif
1558
1559         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1560                 TCP_CHECK_TIMER(sk);
1561                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1562                         rsk = sk;
1563                         goto reset;
1564                 }
1565                 TCP_CHECK_TIMER(sk);
1566                 return 0;
1567         }
1568
1569         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1570                 goto csum_err;
1571
1572         if (sk->sk_state == TCP_LISTEN) {
1573                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1574                 if (!nsk)
1575                         goto discard;
1576
1577                 if (nsk != sk) {
1578                         if (tcp_child_process(sk, nsk, skb)) {
1579                                 rsk = nsk;
1580                                 goto reset;
1581                         }
1582                         return 0;
1583                 }
1584         }
1585
1586         TCP_CHECK_TIMER(sk);
1587         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1588                 rsk = sk;
1589                 goto reset;
1590         }
1591         TCP_CHECK_TIMER(sk);
1592         return 0;
1593
1594 reset:
1595         tcp_v4_send_reset(rsk, skb);
1596 discard:
1597         kfree_skb(skb);
1598         /* Be careful here. If this function gets more complicated and
1599          * gcc suffers from register pressure on the x86, sk (in %ebx)
1600          * might be destroyed here. This current version compiles correctly,
1601          * but you have been warned.
1602          */
1603         return 0;
1604
1605 csum_err:
1606         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1607         goto discard;
1608 }
1609
1610 /*
1611  *      From tcp_input.c
1612  */
1613
1614 int tcp_v4_rcv(struct sk_buff *skb)
1615 {
1616         const struct iphdr *iph;
1617         struct tcphdr *th;
1618         struct sock *sk;
1619         int ret;
1620
1621         if (skb->pkt_type != PACKET_HOST)
1622                 goto discard_it;
1623
1624         /* Count it even if it's bad */
1625         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1626
1627         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1628                 goto discard_it;
1629
1630         th = tcp_hdr(skb);
1631
1632         if (th->doff < sizeof(struct tcphdr) / 4)
1633                 goto bad_packet;
1634         if (!pskb_may_pull(skb, th->doff * 4))
1635                 goto discard_it;
1636
1637         /* An explanation is required here, I think.
1638          * Packet length and doff are validated by header prediction,
1639          * provided the case of th->doff == 0 is eliminated.
1640          * So, we defer the checks. */
1641         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1642              tcp_v4_checksum_init(skb)))
1643                 goto bad_packet;
1644
1645         th = tcp_hdr(skb);
1646         iph = ip_hdr(skb);
1647         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1648         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1649                                     skb->len - th->doff * 4);
1650         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1651         TCP_SKB_CB(skb)->when    = 0;
1652         TCP_SKB_CB(skb)->flags   = iph->tos;
1653         TCP_SKB_CB(skb)->sacked  = 0;
1654
1655         sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1656                            iph->daddr, th->dest, inet_iif(skb));
1657         if (!sk)
1658                 goto no_tcp_socket;
1659
1660 process:
1661         if (sk->sk_state == TCP_TIME_WAIT)
1662                 goto do_time_wait;
1663
1664         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1665                 goto discard_and_relse;
1666         nf_reset(skb);
1667
1668         if (sk_filter(sk, skb))
1669                 goto discard_and_relse;
1670
1671         skb->dev = NULL;
1672
1673         bh_lock_sock_nested(sk);
1674         ret = 0;
1675         if (!sock_owned_by_user(sk)) {
1676 #ifdef CONFIG_NET_DMA
1677                 struct tcp_sock *tp = tcp_sk(sk);
1678                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1679                         tp->ucopy.dma_chan = get_softnet_dma();
1680                 if (tp->ucopy.dma_chan)
1681                         ret = tcp_v4_do_rcv(sk, skb);
1682                 else
1683 #endif
1684                 {
1685                         if (!tcp_prequeue(sk, skb))
1686                                 ret = tcp_v4_do_rcv(sk, skb);
1687                 }
1688         } else
1689                 sk_add_backlog(sk, skb);
1690         bh_unlock_sock(sk);
1691
1692         sock_put(sk);
1693
1694         return ret;
1695
1696 no_tcp_socket:
1697         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1698                 goto discard_it;
1699
1700         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1701 bad_packet:
1702                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1703         } else {
1704                 tcp_v4_send_reset(NULL, skb);
1705         }
1706
1707 discard_it:
1708         /* Discard frame. */
1709         kfree_skb(skb);
1710         return 0;
1711
1712 discard_and_relse:
1713         sock_put(sk);
1714         goto discard_it;
1715
1716 do_time_wait:
1717         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1718                 inet_twsk_put(inet_twsk(sk));
1719                 goto discard_it;
1720         }
1721
1722         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1723                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1724                 inet_twsk_put(inet_twsk(sk));
1725                 goto discard_it;
1726         }
1727         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1728         case TCP_TW_SYN: {
1729                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1730                                                         iph->daddr, th->dest,
1731                                                         inet_iif(skb));
1732                 if (sk2) {
1733                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1734                         inet_twsk_put(inet_twsk(sk));
1735                         sk = sk2;
1736                         goto process;
1737                 }
1738                 /* Fall through to ACK */
1739         }
1740         case TCP_TW_ACK:
1741                 tcp_v4_timewait_ack(sk, skb);
1742                 break;
1743         case TCP_TW_RST:
1744                 goto no_tcp_socket;
1745         case TCP_TW_SUCCESS:;
1746         }
1747         goto discard_it;
1748 }
1749
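/*
 * Illustration only, not part of the original file: the end_seq computation
 * in tcp_v4_rcv() above reflects that SYN and FIN each consume one sequence
 * number in addition to the payload bytes. A minimal sketch:
 */
static inline unsigned int tcp_end_seq_sketch(unsigned int seq, int syn,
                                              int fin, unsigned int payload)
{
        return seq + syn + fin + payload;
}
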
1750 /* VJ's idea. Save the last timestamp seen from this destination
1751  * and hold it for at least the normal timewait interval, so it can be used
1752  * for duplicate segment detection in subsequent connections, before they
1753  * enter the synchronized state.
1754  */
1755
1756 int tcp_v4_remember_stamp(struct sock *sk)
1757 {
1758         struct inet_sock *inet = inet_sk(sk);
1759         struct tcp_sock *tp = tcp_sk(sk);
1760         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1761         struct inet_peer *peer = NULL;
1762         int release_it = 0;
1763
1764         if (!rt || rt->rt_dst != inet->daddr) {
1765                 peer = inet_getpeer(inet->daddr, 1);
1766                 release_it = 1;
1767         } else {
1768                 if (!rt->peer)
1769                         rt_bind_peer(rt, 1);
1770                 peer = rt->peer;
1771         }
1772
1773         if (peer) {
1774                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1775                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1776                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1777                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1778                         peer->tcp_ts = tp->rx_opt.ts_recent;
1779                 }
1780                 if (release_it)
1781                         inet_putpeer(peer);
1782                 return 1;
1783         }
1784
1785         return 0;
1786 }
1787
1788 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1789 {
1790         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1791
1792         if (peer) {
1793                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1794
1795                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1796                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1797                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1798                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1799                         peer->tcp_ts       = tcptw->tw_ts_recent;
1800                 }
1801                 inet_putpeer(peer);
1802                 return 1;
1803         }
1804
1805         return 0;
1806 }
1807
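/*
 * Illustration only, not part of the original file: the per-peer timestamp
 * cached by the two functions above is what the PAWS check near the top of
 * tcp_v4_conn_request() consults. A standalone sketch of that rejection
 * test, with hypothetical stand-ins for TCP_PAWS_MSL and TCP_PAWS_WINDOW:
 */
#define PAWS_MSL_SKETCH         60      /* seconds, assumed value */
#define PAWS_WINDOW_SKETCH      1       /* assumed value */

static inline int paws_reject_sketch(unsigned long now_seconds,
                                     unsigned long peer_ts_stamp,
                                     unsigned int peer_ts, unsigned int seg_ts)
{
        /* Reject while the cached stamp is still fresh and the incoming
         * timestamp is strictly older than the one we remembered. */
        return now_seconds < peer_ts_stamp + PAWS_MSL_SKETCH &&
               (int)(peer_ts - seg_ts) > PAWS_WINDOW_SKETCH;
}
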
1808 struct inet_connection_sock_af_ops ipv4_specific = {
1809         .queue_xmit        = ip_queue_xmit,
1810         .send_check        = tcp_v4_send_check,
1811         .rebuild_header    = inet_sk_rebuild_header,
1812         .conn_request      = tcp_v4_conn_request,
1813         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1814         .remember_stamp    = tcp_v4_remember_stamp,
1815         .net_header_len    = sizeof(struct iphdr),
1816         .setsockopt        = ip_setsockopt,
1817         .getsockopt        = ip_getsockopt,
1818         .addr2sockaddr     = inet_csk_addr2sockaddr,
1819         .sockaddr_len      = sizeof(struct sockaddr_in),
1820 #ifdef CONFIG_COMPAT
1821         .compat_setsockopt = compat_ip_setsockopt,
1822         .compat_getsockopt = compat_ip_getsockopt,
1823 #endif
1824 };
1825
1826 #ifdef CONFIG_TCP_MD5SIG
1827 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1828         .md5_lookup             = tcp_v4_md5_lookup,
1829         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1830         .md5_add                = tcp_v4_md5_add_func,
1831         .md5_parse              = tcp_v4_parse_md5_keys,
1832 };
1833 #endif
1834
1835 /* NOTE: A lot of things are set to zero explicitly by the call to
1836  *       sk_alloc(), so they need not be done here.
1837  */
1838 static int tcp_v4_init_sock(struct sock *sk)
1839 {
1840         struct inet_connection_sock *icsk = inet_csk(sk);
1841         struct tcp_sock *tp = tcp_sk(sk);
1842
1843         skb_queue_head_init(&tp->out_of_order_queue);
1844         tcp_init_xmit_timers(sk);
1845         tcp_prequeue_init(tp);
1846
1847         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1848         tp->mdev = TCP_TIMEOUT_INIT;
1849
1850         /* So many TCP implementations out there (incorrectly) count the
1851          * initial SYN frame in their delayed-ACK and congestion control
1852          * algorithms that we must have the following bandaid to talk
1853          * efficiently to them.  -DaveM
1854          */
1855         tp->snd_cwnd = 2;
1856
1857         /* See draft-stevens-tcpca-spec-01 for discussion of the
1858          * initialization of these values.
1859          */
1860         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1861         tp->snd_cwnd_clamp = ~0;
1862         tp->mss_cache = 536;
1863
1864         tp->reordering = sysctl_tcp_reordering;
1865         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1866
1867         sk->sk_state = TCP_CLOSE;
1868
1869         sk->sk_write_space = sk_stream_write_space;
1870         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1871
1872         icsk->icsk_af_ops = &ipv4_specific;
1873         icsk->icsk_sync_mss = tcp_sync_mss;
1874 #ifdef CONFIG_TCP_MD5SIG
1875         tp->af_specific = &tcp_sock_ipv4_specific;
1876 #endif
1877
1878         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1879         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1880
1881         atomic_inc(&tcp_sockets_allocated);
1882
1883         return 0;
1884 }
1885
1886 int tcp_v4_destroy_sock(struct sock *sk)
1887 {
1888         struct tcp_sock *tp = tcp_sk(sk);
1889
1890         tcp_clear_xmit_timers(sk);
1891
1892         tcp_cleanup_congestion_control(sk);
1893
1894         /* Clean up the write buffer. */
1895         tcp_write_queue_purge(sk);
1896
1897         /* Cleans up our, hopefully empty, out_of_order_queue. */
1898         __skb_queue_purge(&tp->out_of_order_queue);
1899
1900 #ifdef CONFIG_TCP_MD5SIG
1901         /* Clean up the MD5 key list, if any */
1902         if (tp->md5sig_info) {
1903                 tcp_v4_clear_md5_list(sk);
1904                 kfree(tp->md5sig_info);
1905                 tp->md5sig_info = NULL;
1906         }
1907 #endif
1908
1909 #ifdef CONFIG_NET_DMA
1910         /* Cleans up our sk_async_wait_queue */
1911         __skb_queue_purge(&sk->sk_async_wait_queue);
1912 #endif
1913
1914         /* Clean up the prequeue; it really should be empty by now. */
1915         __skb_queue_purge(&tp->ucopy.prequeue);
1916
1917         /* Clean up a referenced TCP bind bucket. */
1918         if (inet_csk(sk)->icsk_bind_hash)
1919                 inet_put_port(&tcp_hashinfo, sk);
1920
1921         /*
1922          * If sendmsg cached page exists, toss it.
1923          */
1924         if (sk->sk_sndmsg_page) {
1925                 __free_page(sk->sk_sndmsg_page);
1926                 sk->sk_sndmsg_page = NULL;
1927         }
1928
1929         atomic_dec(&tcp_sockets_allocated);
1930
1931         return 0;
1932 }
1933
1934 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1935
1936 #ifdef CONFIG_PROC_FS
1937 /* Proc filesystem TCP sock list dumping. */
1938
1939 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1940 {
1941         return hlist_empty(head) ? NULL :
1942                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1943 }
1944
1945 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1946 {
1947         return tw->tw_node.next ?
1948                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1949 }
1950
1951 static void *listening_get_next(struct seq_file *seq, void *cur)
1952 {
1953         struct inet_connection_sock *icsk;
1954         struct hlist_node *node;
1955         struct sock *sk = cur;
1956         struct tcp_iter_state *st = seq->private;
1957
1958         if (!sk) {
1959                 st->bucket = 0;
1960                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1961                 goto get_sk;
1962         }
1963
1964         ++st->num;
1965
1966         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1967                 struct request_sock *req = cur;
1968
1969                 icsk = inet_csk(st->syn_wait_sk);
1970                 req = req->dl_next;
1971                 while (1) {
1972                         while (req) {
1973                                 if (req->rsk_ops->family == st->family) {
1974                                         cur = req;
1975                                         goto out;
1976                                 }
1977                                 req = req->dl_next;
1978                         }
1979                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1980                                 break;
1981 get_req:
1982                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1983                 }
1984                 sk        = sk_next(st->syn_wait_sk);
1985                 st->state = TCP_SEQ_STATE_LISTENING;
1986                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1987         } else {
1988                 icsk = inet_csk(sk);
1989                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1990                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1991                         goto start_req;
1992                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1993                 sk = sk_next(sk);
1994         }
1995 get_sk:
1996         sk_for_each_from(sk, node) {
1997                 if (sk->sk_family == st->family) {
1998                         cur = sk;
1999                         goto out;
2000                 }
2001                 icsk = inet_csk(sk);
2002                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2003                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2004 start_req:
2005                         st->uid         = sock_i_uid(sk);
2006                         st->syn_wait_sk = sk;
2007                         st->state       = TCP_SEQ_STATE_OPENREQ;
2008                         st->sbucket     = 0;
2009                         goto get_req;
2010                 }
2011                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012         }
2013         if (++st->bucket < INET_LHTABLE_SIZE) {
2014                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2015                 goto get_sk;
2016         }
2017         cur = NULL;
2018 out:
2019         return cur;
2020 }
2021
2022 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2023 {
2024         void *rc = listening_get_next(seq, NULL);
2025
2026         while (rc && *pos) {
2027                 rc = listening_get_next(seq, rc);
2028                 --*pos;
2029         }
2030         return rc;
2031 }
2032
2033 static void *established_get_first(struct seq_file *seq)
2034 {
2035         struct tcp_iter_state *st = seq->private;
2036         void *rc = NULL;
2037
2038         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2039                 struct sock *sk;
2040                 struct hlist_node *node;
2041                 struct inet_timewait_sock *tw;
2042
2043                 /* We can reschedule _before_ having picked the target: */
2044                 cond_resched_softirq();
2045
2046                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2047                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2048                         if (sk->sk_family != st->family) {
2049                                 continue;
2050                         }
2051                         rc = sk;
2052                         goto out;
2053                 }
2054                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2055                 inet_twsk_for_each(tw, node,
2056                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2057                         if (tw->tw_family != st->family) {
2058                                 continue;
2059                         }
2060                         rc = tw;
2061                         goto out;
2062                 }
2063                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2064                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2065         }
2066 out:
2067         return rc;
2068 }
2069
2070 static void *established_get_next(struct seq_file *seq, void *cur)
2071 {
2072         struct sock *sk = cur;
2073         struct inet_timewait_sock *tw;
2074         struct hlist_node *node;
2075         struct tcp_iter_state *st = seq->private;
2076
2077         ++st->num;
2078
2079         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2080                 tw = cur;
2081                 tw = tw_next(tw);
2082 get_tw:
2083                 while (tw && tw->tw_family != st->family) {
2084                         tw = tw_next(tw);
2085                 }
2086                 if (tw) {
2087                         cur = tw;
2088                         goto out;
2089                 }
2090                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2091                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2092
2093                 /* We can reschedule between buckets: */
2094                 cond_resched_softirq();
2095
2096                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2097                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2098                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2099                 } else {
2100                         cur = NULL;
2101                         goto out;
2102                 }
2103         } else
2104                 sk = sk_next(sk);
2105
2106         sk_for_each_from(sk, node) {
2107                 if (sk->sk_family == st->family)
2108                         goto found;
2109         }
2110
2111         st->state = TCP_SEQ_STATE_TIME_WAIT;
2112         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2113         goto get_tw;
2114 found:
2115         cur = sk;
2116 out:
2117         return cur;
2118 }
2119
2120 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2121 {
2122         void *rc = established_get_first(seq);
2123
2124         while (rc && pos) {
2125                 rc = established_get_next(seq, rc);
2126                 --pos;
2127         }
2128         return rc;
2129 }
2130
2131 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2132 {
2133         void *rc;
2134         struct tcp_iter_state *st = seq->private;
2135
2136         inet_listen_lock(&tcp_hashinfo);
2137         st->state = TCP_SEQ_STATE_LISTENING;
2138         rc        = listening_get_idx(seq, &pos);
2139
2140         if (!rc) {
2141                 inet_listen_unlock(&tcp_hashinfo);
2142                 local_bh_disable();
2143                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2144                 rc        = established_get_idx(seq, pos);
2145         }
2146
2147         return rc;
2148 }
2149
2150 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2151 {
2152         struct tcp_iter_state *st = seq->private;
2153         st->state = TCP_SEQ_STATE_LISTENING;
2154         st->num = 0;
2155         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2156 }
2157
2158 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2159 {
2160         void *rc = NULL;
2161         struct tcp_iter_state *st;
2162
2163         if (v == SEQ_START_TOKEN) {
2164                 rc = tcp_get_idx(seq, 0);
2165                 goto out;
2166         }
2167         st = seq->private;
2168
2169         switch (st->state) {
2170         case TCP_SEQ_STATE_OPENREQ:
2171         case TCP_SEQ_STATE_LISTENING:
2172                 rc = listening_get_next(seq, v);
2173                 if (!rc) {
2174                         inet_listen_unlock(&tcp_hashinfo);
2175                         local_bh_disable();
2176                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2177                         rc        = established_get_first(seq);
2178                 }
2179                 break;
2180         case TCP_SEQ_STATE_ESTABLISHED:
2181         case TCP_SEQ_STATE_TIME_WAIT:
2182                 rc = established_get_next(seq, v);
2183                 break;
2184         }
2185 out:
2186         ++*pos;
2187         return rc;
2188 }
2189
2190 static void tcp_seq_stop(struct seq_file *seq, void *v)
2191 {
2192         struct tcp_iter_state *st = seq->private;
2193
2194         switch (st->state) {
2195         case TCP_SEQ_STATE_OPENREQ:
2196                 if (v) {
2197                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2198                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2199                 }
2200         case TCP_SEQ_STATE_LISTENING:
2201                 if (v != SEQ_START_TOKEN)
2202                         inet_listen_unlock(&tcp_hashinfo);
2203                 break;
2204         case TCP_SEQ_STATE_TIME_WAIT:
2205         case TCP_SEQ_STATE_ESTABLISHED:
2206                 if (v)
2207                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2208                 local_bh_enable();
2209                 break;
2210         }
2211 }
2212
2213 static int tcp_seq_open(struct inode *inode, struct file *file)
2214 {
2215         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2216         struct seq_file *seq;
2217         struct tcp_iter_state *s;
2218         int rc;
2219
2220         if (unlikely(afinfo == NULL))
2221                 return -EINVAL;
2222
2223         s = kzalloc(sizeof(*s), GFP_KERNEL);
2224         if (!s)
2225                 return -ENOMEM;
2226         s->family               = afinfo->family;
2227         s->seq_ops.start        = tcp_seq_start;
2228         s->seq_ops.next         = tcp_seq_next;
2229         s->seq_ops.show         = afinfo->seq_show;
2230         s->seq_ops.stop         = tcp_seq_stop;
2231
2232         rc = seq_open(file, &s->seq_ops);
2233         if (rc)
2234                 goto out_kfree;
2235         seq          = file->private_data;
2236         seq->private = s;
2237 out:
2238         return rc;
2239 out_kfree:
2240         kfree(s);
2241         goto out;
2242 }
2243
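/*
 * Illustration only, not part of the original file: tcp_seq_open() above
 * wires the per-family iterator into the generic seq_file
 * start/next/show/stop protocol. A minimal, hypothetical iterator over a
 * fixed array follows the same pattern (assumes <linux/seq_file.h> and
 * <linux/kernel.h> for seq_printf() and ARRAY_SIZE()):
 */
#include <linux/seq_file.h>
#include <linux/kernel.h>

static const char *demo_items[] = { "one", "two", "three" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
        return *pos < (loff_t)ARRAY_SIZE(demo_items) ?
                &demo_items[*pos] : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        ++*pos;
        return *pos < (loff_t)ARRAY_SIZE(demo_items) ?
                &demo_items[*pos] : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", *(const char **)v);
        return 0;
}

static struct seq_operations demo_seq_ops = {
        .start = demo_start,
        .next  = demo_next,
        .show  = demo_show,
        .stop  = demo_stop,
};
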
2244 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2245 {
2246         int rc = 0;
2247         struct proc_dir_entry *p;
2248
2249         if (!afinfo)
2250                 return -EINVAL;
2251         afinfo->seq_fops->owner         = afinfo->owner;
2252         afinfo->seq_fops->open          = tcp_seq_open;
2253         afinfo->seq_fops->read          = seq_read;
2254         afinfo->seq_fops->llseek        = seq_lseek;
2255         afinfo->seq_fops->release       = seq_release_private;
2256
2257         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2258         if (p)
2259                 p->data = afinfo;
2260         else
2261                 rc = -ENOMEM;
2262         return rc;
2263 }
2264
2265 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2266 {
2267         if (!afinfo)
2268                 return;
2269         proc_net_remove(afinfo->name);
2270         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2271 }
2272
2273 static void get_openreq4(struct sock *sk, struct request_sock *req,
2274                          char *tmpbuf, int i, int uid)
2275 {
2276         const struct inet_request_sock *ireq = inet_rsk(req);
2277         int ttd = req->expires - jiffies;
2278
2279         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2280                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2281                 i,
2282                 ireq->loc_addr,
2283                 ntohs(inet_sk(sk)->sport),
2284                 ireq->rmt_addr,
2285                 ntohs(ireq->rmt_port),
2286                 TCP_SYN_RECV,
2287                 0, 0, /* could print option size, but that is af dependent. */
2288                 1,    /* timers active (only the expire timer) */
2289                 jiffies_to_clock_t(ttd),
2290                 req->retrans,
2291                 uid,
2292                 0,  /* non standard timer */
2293                 0, /* open_requests have no inode */
2294                 atomic_read(&sk->sk_refcnt),
2295                 req);
2296 }
2297
2298 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2299 {
2300         int timer_active;
2301         unsigned long timer_expires;
2302         struct tcp_sock *tp = tcp_sk(sk);
2303         const struct inet_connection_sock *icsk = inet_csk(sk);
2304         struct inet_sock *inet = inet_sk(sk);
2305         __be32 dest = inet->daddr;
2306         __be32 src = inet->rcv_saddr;
2307         __u16 destp = ntohs(inet->dport);
2308         __u16 srcp = ntohs(inet->sport);
2309
2310         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2311                 timer_active    = 1;
2312                 timer_expires   = icsk->icsk_timeout;
2313         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2314                 timer_active    = 4;
2315                 timer_expires   = icsk->icsk_timeout;
2316         } else if (timer_pending(&sk->sk_timer)) {
2317                 timer_active    = 2;
2318                 timer_expires   = sk->sk_timer.expires;
2319         } else {
2320                 timer_active    = 0;
2321                 timer_expires = jiffies;
2322         }
2323
2324         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2325                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2326                 i, src, srcp, dest, destp, sk->sk_state,
2327                 tp->write_seq - tp->snd_una,
2328                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2329                                              (tp->rcv_nxt - tp->copied_seq),
2330                 timer_active,
2331                 jiffies_to_clock_t(timer_expires - jiffies),
2332                 icsk->icsk_retransmits,
2333                 sock_i_uid(sk),
2334                 icsk->icsk_probes_out,
2335                 sock_i_ino(sk),
2336                 atomic_read(&sk->sk_refcnt), sk,
2337                 icsk->icsk_rto,
2338                 icsk->icsk_ack.ato,
2339                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2340                 tp->snd_cwnd,
2341                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2342 }
2343
2344 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2345                                char *tmpbuf, int i)
2346 {
2347         __be32 dest, src;
2348         __u16 destp, srcp;
2349         int ttd = tw->tw_ttd - jiffies;
2350
2351         if (ttd < 0)
2352                 ttd = 0;
2353
2354         dest  = tw->tw_daddr;
2355         src   = tw->tw_rcv_saddr;
2356         destp = ntohs(tw->tw_dport);
2357         srcp  = ntohs(tw->tw_sport);
2358
2359         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2360                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2361                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2362                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2363                 atomic_read(&tw->tw_refcnt), tw);
2364 }
2365
2366 #define TMPSZ 150
2367
2368 static int tcp4_seq_show(struct seq_file *seq, void *v)
2369 {
2370         struct tcp_iter_state *st;
2371         char tmpbuf[TMPSZ + 1];
2372
2373         if (v == SEQ_START_TOKEN) {
2374                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2375                            "  sl  local_address rem_address   st tx_queue "
2376                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2377                            "inode");
2378                 goto out;
2379         }
2380         st = seq->private;
2381
2382         switch (st->state) {
2383         case TCP_SEQ_STATE_LISTENING:
2384         case TCP_SEQ_STATE_ESTABLISHED:
2385                 get_tcp4_sock(v, tmpbuf, st->num);
2386                 break;
2387         case TCP_SEQ_STATE_OPENREQ:
2388                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2389                 break;
2390         case TCP_SEQ_STATE_TIME_WAIT:
2391                 get_timewait4_sock(v, tmpbuf, st->num);
2392                 break;
2393         }
2394         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2395 out:
2396         return 0;
2397 }
2398
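/*
 * Illustration only, not part of the original file: with the header row and
 * per-socket format above, a single /proc/net/tcp entry looks roughly like
 * the following (all field values are invented for the example; state 0A is
 * TCP_LISTEN, addresses and ports are printed in hex):
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 1234 1 c0000000 300 0 0 2 -1
 */
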
2399 static struct file_operations tcp4_seq_fops;
2400 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2401         .owner          = THIS_MODULE,
2402         .name           = "tcp",
2403         .family         = AF_INET,
2404         .seq_show       = tcp4_seq_show,
2405         .seq_fops       = &tcp4_seq_fops,
2406 };
2407
2408 int __init tcp4_proc_init(void)
2409 {
2410         return tcp_proc_register(&tcp4_seq_afinfo);
2411 }
2412
2413 void tcp4_proc_exit(void)
2414 {
2415         tcp_proc_unregister(&tcp4_seq_afinfo);
2416 }
2417 #endif /* CONFIG_PROC_FS */
2418
2419 struct proto tcp_prot = {
2420         .name                   = "TCP",
2421         .owner                  = THIS_MODULE,
2422         .close                  = tcp_close,
2423         .connect                = tcp_v4_connect,
2424         .disconnect             = tcp_disconnect,
2425         .accept                 = inet_csk_accept,
2426         .ioctl                  = tcp_ioctl,
2427         .init                   = tcp_v4_init_sock,
2428         .destroy                = tcp_v4_destroy_sock,
2429         .shutdown               = tcp_shutdown,
2430         .setsockopt             = tcp_setsockopt,
2431         .getsockopt             = tcp_getsockopt,
2432         .sendmsg                = tcp_sendmsg,
2433         .recvmsg                = tcp_recvmsg,
2434         .backlog_rcv            = tcp_v4_do_rcv,
2435         .hash                   = tcp_v4_hash,
2436         .unhash                 = tcp_unhash,
2437         .get_port               = tcp_v4_get_port,
2438         .enter_memory_pressure  = tcp_enter_memory_pressure,
2439         .sockets_allocated      = &tcp_sockets_allocated,
2440         .orphan_count           = &tcp_orphan_count,
2441         .memory_allocated       = &tcp_memory_allocated,
2442         .memory_pressure        = &tcp_memory_pressure,
2443         .sysctl_mem             = sysctl_tcp_mem,
2444         .sysctl_wmem            = sysctl_tcp_wmem,
2445         .sysctl_rmem            = sysctl_tcp_rmem,
2446         .max_header             = MAX_TCP_HEADER,
2447         .obj_size               = sizeof(struct tcp_sock),
2448         .twsk_prot              = &tcp_timewait_sock_ops,
2449         .rsk_prot               = &tcp_request_sock_ops,
2450 #ifdef CONFIG_COMPAT
2451         .compat_setsockopt      = compat_tcp_setsockopt,
2452         .compat_getsockopt      = compat_tcp_getsockopt,
2453 #endif
2454 };
2455
2456 void __init tcp_v4_init(struct net_proto_family *ops)
2457 {
2458         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2459                                      IPPROTO_TCP) < 0)
2460                 panic("Failed to create the TCP control socket.\n");
2461 }
2462
2463 EXPORT_SYMBOL(ipv4_specific);
2464 EXPORT_SYMBOL(tcp_hashinfo);
2465 EXPORT_SYMBOL(tcp_prot);
2466 EXPORT_SYMBOL(tcp_unhash);
2467 EXPORT_SYMBOL(tcp_v4_conn_request);
2468 EXPORT_SYMBOL(tcp_v4_connect);
2469 EXPORT_SYMBOL(tcp_v4_do_rcv);
2470 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2471 EXPORT_SYMBOL(tcp_v4_send_check);
2472 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2473
2474 #ifdef CONFIG_PROC_FS
2475 EXPORT_SYMBOL(tcp_proc_register);
2476 EXPORT_SYMBOL(tcp_proc_unregister);
2477 #endif
2478 EXPORT_SYMBOL(sysctl_local_port_range);
2479 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2480