/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
+/* People can turn this on to work with those rare, broken TCPs that
+ * interpret the window field as a signed quantity.
+ */
+int sysctl_tcp_workaround_signed_windows = 0;
+
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
int sysctl_tcp_tso_win_divisor = 3;
-static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_base_mss = 512;
+
+/* By default, RFC2861 behavior. */
+int sysctl_tcp_slow_start_after_idle = 1;
+
+static void update_send_head(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
{
sk->sk_send_head = skb->next;
if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
tp->snd_cwnd_used = 0;
}
-static inline void tcp_event_data_sent(struct tcp_sock *tp,
- struct sk_buff *skb, struct sock *sk)
+static void tcp_event_data_sent(struct tcp_sock *tp,
+ struct sk_buff *skb, struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
- if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+ if (sysctl_tcp_slow_start_after_idle &&
+ (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
tcp_cwnd_restart(sk, __sk_dst_get(sk));
tp->lsndtime = now;
icsk->icsk_ack.pingpong = 1;
}
-static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
space = (space / mss) * mss;
/* NOTE: offering an initial window larger than 32767
- * will break some buggy TCP stacks. We try to be nice.
- * If we are not window scaling, then this truncates
- * our initial window offering to 32k. There should also
- * be a sysctl option to stop being nice.
+ * will break some buggy TCP stacks. If the admin tells us
+ * it is likely we could be speaking with such a buggy stack
+ * we will truncate our initial window offering to 32K-1
+ * unless the remote has sent us a window scaling option,
+ * which we interpret as a sign the remote TCP is not
+ * misinterpreting the window field as a signed quantity.
*/
- (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+ if (sysctl_tcp_workaround_signed_windows)
+ (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+ else
+ (*rcv_wnd) = space;
+
(*rcv_wscale) = 0;
if (wscale_ok) {
/* Set window scaling on max possible window
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
-static __inline__ u16 tcp_select_window(struct sock *sk)
+static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 cur_win = tcp_receive_window(tp);
/* Make sure we do not exceed the maximum possible
* scaled window.
*/
- if (!tp->rx_opt.rcv_wscale)
+ if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
return new_win;
}
+static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
+ __u32 tstamp)
+{ /* Write the timestamp and SACK options for tp as 32-bit words in
+ * network byte order, starting at ptr.
+ * - If tp->rx_opt.tstamp_ok: NOP, NOP, TIMESTAMP kind/length, then
+ * tstamp followed by tp->rx_opt.ts_recent.
+ * - If tp->rx_opt.eff_sacks is non-zero: NOP, NOP, SACK kind/length,
+ * then eff_sacks (start_seq, end_seq) pairs.
+ * "Update" part: once a pending D-SACK has been written, dsack is
+ * cleared and eff_sacks is decremented.
+ * The caller must provide enough room at ptr; no bound is checked here.
+ */
+ if (tp->rx_opt.tstamp_ok) {
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ *ptr++ = htonl(tstamp);
+ *ptr++ = htonl(tp->rx_opt.ts_recent);
+ }
+ if (tp->rx_opt.eff_sacks) {
+ struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
+ int this_sack;
+ /* NOTE(review): with dsack set, the loop below reads eff_sacks
+ * entries starting at tp->duplicate_sack; presumably this relies on
+ * selective_acks being laid out directly after duplicate_sack in
+ * struct tcp_sock - confirm against the struct definition.
+ */
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK << 8) |
+ (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
+ TCPOLEN_SACK_PERBLOCK)));
+ for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+ if (tp->rx_opt.dsack) { /* D-SACK block has been written: drop it */
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.eff_sacks--;
+ }
+ }
+}
+
+/* Construct a tcp options header for a SYN or SYN_ACK packet.
+ * If this is ever changed make sure to change the definition of
+ * MAX_SYN_SIZE to match the new maximum number of options that you
+ * can generate.
+ *
+ * Options are written as 32-bit words in network byte order starting
+ * at ptr, in this order:
+ * - MSS, always, carrying mss;
+ * - if ts is set: a word holding SACK_PERM kind/length (when sack is
+ * also set) or two NOPs (otherwise) followed by TIMESTAMP
+ * kind/length, then tstamp and ts_recent;
+ * - else if sack is set: NOP, NOP, SACK_PERM kind/length;
+ * - if offer_wscale is set: NOP, WINDOW kind/length, wscale.
+ * The caller must provide enough room at ptr; no bound is checked here.
+ */
+static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
+ int offer_wscale, int wscale, __u32 tstamp,
+ __u32 ts_recent)
+{
+ /* We always get an MSS option.
+ * The option bytes which will be seen in normal data
+ * packets should timestamps be used, must be in the MSS
+ * advertised. But we subtract them from tp->mss_cache so
+ * that calculations in tcp_sendmsg are simpler etc.
+ * So account for this fact here if necessary. If we
+ * don't do this correctly, as a receiver we won't
+ * recognize data packets as being full sized when we
+ * should, and thus we won't abide by the delayed ACK
+ * rules correctly.
+ * SACKs don't matter, we never delay an ACK when we
+ * have any of those going out.
+ */
+ *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
+ if (ts) {
+ if(sack) /* SACK_PERM takes the place of the two padding NOPs */
+ *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ else
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ *ptr++ = htonl(tstamp); /* TSVAL */
+ *ptr++ = htonl(ts_recent); /* TSECR */
+ } else if(sack)
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
+ if (offer_wscale) /* one NOP pads the 3-byte WINDOW option to a full word */
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
+}
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
TCP_ECN_send(sk, tp, skb, tcp_header_size);
}
- tp->af_specific->send_check(sk, th, skb->len, skb);
+ icsk->icsk_af_ops->send_check(sk, skb->len, skb);
if (likely(tcb->flags & TCPCB_FLAG_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
TCP_INC_STATS(TCP_MIB_OUTSEGS);
- err = tp->af_specific->queue_xmit(skb, 0);
- if (unlikely(err <= 0))
+ err = icsk->icsk_af_ops->queue_xmit(skb, 0);
+ if (likely(err <= 0))
return err;
tcp_enter_cwr(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int nsize, old_factor;
+ int nlen;
u16 flags;
BUG_ON(len > skb->len);
buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
+
sk_charge_skb(sk, buff);
+ nlen = skb->len - len - nsize;
+ buff->truesize += nlen;
+ skb->truesize -= nlen;
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
* eventually). The difference is that pulled data not copied, but
* immediately discarded.
*/
-static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
+static void __pskb_trim_head(struct sk_buff *skb, int len)
{
int i, k, eat;
skb->tail = skb->data;
skb->data_len -= len;
skb->len = skb->data_len;
- return skb->tail;
}
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
return -ENOMEM;
- if (len <= skb_headlen(skb)) {
+ /* If len == headlen, we avoid __skb_pull to preserve alignment. */
+ if (unlikely(len < skb_headlen(skb)))
__skb_pull(skb, len);
- } else {
- if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
- return -ENOMEM;
- }
+ else
+ __pskb_trim_head(skb, len - skb_headlen(skb));
TCP_SKB_CB(skb)->seq += len;
skb->ip_summed = CHECKSUM_HW;
return 0;
}
+/* Convert a path MTU (pmtu) into the matching MSS for sk.
+ * Not accounting for SACKs here.
+ *
+ * Steps, in order: strip the network header and the base TCP header,
+ * clamp to tp->rx_opt.mss_clamp, strip icsk_ext_hdr_len, enforce a
+ * floor of 48, then strip the TCP option bytes counted in
+ * tp->tcp_header_len. Because the floor is applied before the option
+ * bytes are removed, the returned value can be smaller than 48.
+ * Reads socket state only; nothing is stored.
+ */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int mss_now;
+
+ /* Calculate base mss without TCP options:
+ It is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->rx_opt.mss_clamp)
+ mss_now = tp->rx_opt.mss_clamp;
+
+ /* Now subtract optional transport overhead */
+ mss_now -= icsk->icsk_ext_hdr_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+ mss_now = 48;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ return mss_now;
+}
+
+/* Inverse of tcp_mtu_to_mss(): convert an MSS into an MTU for sk by
+ * adding tp->tcp_header_len (TCP header including options),
+ * icsk_ext_hdr_len and the network header length.
+ * No mss_clamp or 48-byte floor is applied here, so the two functions
+ * only round-trip when those limits did not take effect.
+ */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int mtu;
+
+ mtu = mss +
+ tp->tcp_header_len +
+ icsk->icsk_ext_hdr_len +
+ icsk->icsk_af_ops->net_header_len;
+
+ return mtu;
+}
+
+void tcp_mtup_init(struct sock *sk)
+{ /* Initialise the MTU probing state kept in icsk->icsk_mtup for sk:
+ * - enabled: set only when sysctl_tcp_mtu_probing is greater than 1;
+ * - search_high: tp->rx_opt.mss_clamp plus sizeof(struct tcphdr) and
+ * the network header length;
+ * - search_low: the MTU that corresponds to sysctl_tcp_base_mss;
+ * - probe_size: 0 (tcp_mtu_probe() declines to probe while non-zero).
+ * Reads tp->rx_opt.mss_clamp and, via tcp_mss_to_mtu(), the header
+ * lengths, so those must already hold their intended values.
+ */
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+ icsk->icsk_mtup.probe_size = 0;
+}
+
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
- tp->pmtu_cookie is last pmtu, seen by this function.
+ inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
tp->mss_cache is current effective sending mss, including
all tcp options except for SACKs. It is evaluated,
NOTE1. rfc1122 clearly states that advertised MSS
DOES NOT include either tcp or ip options.
- NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
- this function. --ANK (980731)
+ NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+ are READ ONLY outside this function. --ANK (980731)
*/
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int mss_now;
- /* Calculate base mss without TCP options:
- It is MMS_S - sizeof(tcphdr) of rfc1122
- */
- mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
-
- /* Clamp it (mss_clamp does not include tcp options) */
- if (mss_now > tp->rx_opt.mss_clamp)
- mss_now = tp->rx_opt.mss_clamp;
-
- /* Now subtract optional transport overhead */
- mss_now -= tp->ext_header_len;
-
- /* Then reserve room for full set of TCP options and 8 bytes of data */
- if (mss_now < 48)
- mss_now = 48;
+ if (icsk->icsk_mtup.search_high > pmtu)
+ icsk->icsk_mtup.search_high = pmtu;
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+ mss_now = tcp_mtu_to_mss(sk, pmtu);
/* Bound mss with half of window */
if (tp->max_window && mss_now > (tp->max_window>>1))
mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
/* And store cached results */
- tp->pmtu_cookie = pmtu;
+ icsk->icsk_pmtu_cookie = pmtu;
+ if (icsk->icsk_mtup.enabled)
+ mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
tp->mss_cache = mss_now;
return mss_now;
if (dst) {
u32 mtu = dst_mtu(dst);
- if (mtu != tp->pmtu_cookie)
+ if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
xmit_size_goal = mss_now;
if (doing_tso) {
- xmit_size_goal = 65535 -
- tp->af_specific->net_header_len -
- tp->ext_header_len - tp->tcp_header_len;
+ xmit_size_goal = (65535 -
+ inet_csk(sk)->icsk_af_ops->net_header_len -
+ inet_csk(sk)->icsk_ext_hdr_len -
+ tp->tcp_header_len);
if (tp->max_window &&
(xmit_size_goal > (tp->max_window >> 1)))
/* Congestion window validation. (RFC2861) */
-static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
{
__u32 packets_out = tp->packets_out;
/* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
if (unlikely(buff == NULL))
return -ENOMEM;
- buff->truesize = nlen;
+ sk_charge_skb(sk, buff);
+ buff->truesize += nlen;
skb->truesize -= nlen;
/* Correct the sequence numbers. */
limit = min(send_win, cong_win);
+ /* If a full-sized TSO skb can be sent, do it. */
+ if (limit >= 65536)
+ return 0;
+
if (sysctl_tcp_tso_win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
return 1;
}
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd available),
+ * 1 if a probe was sent,
+ * -1 otherwise
+ *
+ * The probe is one skb of 2 * tp->mss_cache bytes, filled by copying
+ * data from the skbs at the head of the send queue. Source skbs that
+ * are used up completely are unlinked and freed; a partly used one is
+ * trimmed at the front and has its sequence number advanced.
+ * When the transmit succeeds, snd_cwnd is decremented, the send head
+ * is advanced past the probe, and the probe's MTU and sequence range
+ * are recorded in icsk->icsk_mtup.probe_size and tp->mtu_probe.
+ * When the transmit fails, the probe skb stays queued as the send head.
+ */
+static int tcp_mtu_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb, *nskb, *next;
+ int len;
+ int probe_size;
+ unsigned int pif;
+ int copy;
+ int mss_now;
+
+ /* Not currently probing/verifying,
+ * not in recovery,
+ * have enough cwnd, and
+ * not SACKing (the variable headers throw things off) */
+ if (!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.eff_sacks)
+ return -1;
+
+ /* Very simple search strategy: just double the MSS. */
+ mss_now = tcp_current_mss(sk, 0);
+ probe_size = 2*tp->mss_cache;
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+ /* TODO: set timer for probe_converge_event */
+ return -1;
+ }
+
+ /* Have enough data in the send queue to probe? */
+ len = 0;
+ if ((skb = sk->sk_send_head) == NULL)
+ return -1;
+ while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+ skb = skb->next;
+ if (len < probe_size)
+ return -1;
+
+ /* Receive window check.
+ * NOTE(review): skb here is the last skb visited by the loop above,
+ * not sk->sk_send_head where the probe will actually start, so the
+ * check looks stricter than necessary - confirm whether intended.
+ */
+ if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+ if (tp->snd_wnd < probe_size)
+ return -1; /* send window is smaller than the probe itself */
+ else
+ return 0; /* window is big enough: wait and retry later */
+ }
+
+ /* Do we need to wait to drain cwnd? */
+ pif = tcp_packets_in_flight(tp);
+ if (pif + 2 > tp->snd_cwnd) {
+ /* With no packets in flight, don't stall. */
+ if (pif == 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ /* We're allowed to probe. Build it now. */
+ if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+ return -1;
+ sk_charge_skb(sk, nskb);
+ /* Queue the probe just before the current send head and make it
+ * the new send head. */
+ skb = sk->sk_send_head;
+ __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+ sk->sk_send_head = nskb;
+
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+ TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(nskb)->sacked = 0;
+ nskb->csum = 0;
+ if (skb->ip_summed == CHECKSUM_HW)
+ nskb->ip_summed = CHECKSUM_HW;
+ /* Fill the probe from successive queue entries, starting with the
+ * old send head, until probe_size bytes have been copied. */
+ len = 0;
+ while (len < probe_size) {
+ next = skb->next;
+
+ copy = min_t(int, skb->len, probe_size - len);
+ if (nskb->ip_summed)
+ skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+ else
+ nskb->csum = skb_copy_and_csum_bits(skb, 0,
+ skb_put(nskb, copy), copy, nskb->csum);
+
+ if (skb->len <= copy) {
+ /* We've eaten all the data from this skb.
+ * Throw it away. */
+ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+ __skb_unlink(skb, &sk->sk_write_queue);
+ sk_stream_free_skb(sk, skb);
+ } else { /* partly consumed: FIN/PSH are not copied to the probe */
+ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+ ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ if (!skb_shinfo(skb)->nr_frags) {
+ skb_pull(skb, copy);
+ if (skb->ip_summed != CHECKSUM_HW)
+ skb->csum = csum_partial(skb->data, skb->len, 0);
+ } else {
+ __pskb_trim_head(skb, copy);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ }
+ TCP_SKB_CB(skb)->seq += copy;
+ }
+
+ len += copy;
+ skb = next;
+ }
+ tcp_init_tso_segs(sk, nskb, nskb->len);
+
+ /* We're ready to send. If this fails, the probe will
+ * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+ TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+ /* Decrement cwnd here because we are sending
+ * effectively two packets. */
+ tp->snd_cwnd--;
+ update_send_head(sk, tp, nskb);
+
+ icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+ tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+ tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+ return 1;
+ }
+
+ return -1;
+}
+
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
+ int result;
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and all
return 0;
sent_pkts = 0;
+
+ /* Do MTU probing. */
+ if ((result = tcp_mtu_probe(sk)) == 0) {
+ return 0;
+ } else if (result > 0) {
+ sent_pkts = 1;
+ }
+
while ((skb = sk->sk_send_head)) {
unsigned int limit;
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int cur_mss = tcp_current_mss(sk, 0);
int err;
+ /* Inconclusive MTU probe */
+ if (icsk->icsk_mtup.probe_size) {
+ icsk->icsk_mtup.probe_size = 0;
+ }
+
/* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: fragmentation, tunneling, mangling etc.
*/
(sysctl_tcp_retrans_collapse != 0))
tcp_retrans_try_collapse(sk, skb, cur_mss);
- if(tp->af_specific->rebuild_header(sk))
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
/* Some Solaris stacks overoptimize and ignore the FIN on a
/*
* Do all connect socket setups that can be done AF independent.
*/
-static inline void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
if (tp->rx_opt.user_mss)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
tp->max_window = 0;
+ tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
if (!tp->window_clamp)
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);