mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-10 15:58:47 +00:00
tcp: early retransmit
This patch implements RFC 5827 early retransmit (ER) for TCP. It reduces DUPACK threshold (dupthresh) if outstanding packets are less than 4 to recover losses by fast recovery instead of timeout. While the algorithm is simple, small but frequent network reordering makes this feature dangerous: the connection repeatedly enter false recovery and degrade performance. Therefore we implement a mitigation suggested in the appendix of the RFC that delays entering fast recovery by a small interval, i.e., RTT/4. Currently ER is conservative and is disabled for the rest of the connection after the first reordering event. A large scale web server experiment on the performance impact of ER is summarized in section 6 of the paper "Proportional Rate Reduction for TCP”, IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf Note that Linux has a similar feature called THIN_DUPACK. The differences are THIN_DUPACK do not mitigate reorderings and is only used after slow start. Currently ER is disabled if THIN_DUPACK is enabled. I would be happy to merge THIN_DUPACK feature with ER if people think it's a good idea. ER is enabled by sysctl_tcp_early_retrans: 0: Disables ER 1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4. 2: (Default) reduce dupthresh like mode 1. In addition, delay entering fast recovery by RTT/4. Note: mode 2 is implemented in the third part of this patch series. Signed-off-by: Yuchung Cheng <ycheng@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
1fbc340514
commit
eed530b6c6
@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER
|
|||||||
tcp_dsack - BOOLEAN
|
tcp_dsack - BOOLEAN
|
||||||
Allows TCP to send "duplicate" SACKs.
|
Allows TCP to send "duplicate" SACKs.
|
||||||
|
|
||||||
|
tcp_early_retrans - INTEGER
|
||||||
|
Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
|
||||||
|
for triggering fast retransmit when the amount of outstanding data is
|
||||||
|
small and when no previously unsent data can be transmitted (such
|
||||||
|
that limited transmit could be used).
|
||||||
|
Possible values:
|
||||||
|
0 disables ER
|
||||||
|
1 enables ER
|
||||||
|
2 enables ER but delays fast recovery and fast retransmit
|
||||||
|
by a fourth of RTT. This mitigates connection falsely
|
||||||
|
recovers when network has a small degree of reordering
|
||||||
|
(less than 3 packets).
|
||||||
|
Default: 2
|
||||||
|
|
||||||
tcp_ecn - INTEGER
|
tcp_ecn - INTEGER
|
||||||
Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
|
Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
|
||||||
used when both ends of the TCP flow support it. It is useful to
|
used when both ends of the TCP flow support it. It is useful to
|
||||||
|
@ -372,6 +372,7 @@ struct tcp_sock {
|
|||||||
repair : 1,
|
repair : 1,
|
||||||
unused : 1;
|
unused : 1;
|
||||||
u8 repair_queue;
|
u8 repair_queue;
|
||||||
|
u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */
|
||||||
|
|
||||||
/* RTT measurement */
|
/* RTT measurement */
|
||||||
u32 srtt; /* smoothed round trip time << 3 */
|
u32 srtt; /* smoothed round trip time << 3 */
|
||||||
|
@ -252,6 +252,7 @@ extern int sysctl_tcp_max_ssthresh;
|
|||||||
extern int sysctl_tcp_cookie_size;
|
extern int sysctl_tcp_cookie_size;
|
||||||
extern int sysctl_tcp_thin_linear_timeouts;
|
extern int sysctl_tcp_thin_linear_timeouts;
|
||||||
extern int sysctl_tcp_thin_dupack;
|
extern int sysctl_tcp_thin_dupack;
|
||||||
|
extern int sysctl_tcp_early_retrans;
|
||||||
|
|
||||||
extern atomic_long_t tcp_memory_allocated;
|
extern atomic_long_t tcp_memory_allocated;
|
||||||
extern struct percpu_counter tcp_sockets_allocated;
|
extern struct percpu_counter tcp_sockets_allocated;
|
||||||
@ -797,6 +798,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
|
|||||||
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
|
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* TCP early-retransmit (ER) is similar to but more conservative than
|
||||||
|
* the thin-dupack feature. Enable ER only if thin-dupack is disabled.
|
||||||
|
*/
|
||||||
|
static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
|
||||||
|
{
|
||||||
|
tp->do_early_retrans = sysctl_tcp_early_retrans &&
|
||||||
|
!sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
|
||||||
|
{
|
||||||
|
tp->do_early_retrans = 0;
|
||||||
|
}
|
||||||
|
|
||||||
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
|
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
|
||||||
{
|
{
|
||||||
return tp->sacked_out + tp->lost_out;
|
return tp->sacked_out + tp->lost_out;
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
#include <net/tcp_memcontrol.h>
|
#include <net/tcp_memcontrol.h>
|
||||||
|
|
||||||
static int zero;
|
static int zero;
|
||||||
|
static int two = 2;
|
||||||
static int tcp_retr1_max = 255;
|
static int tcp_retr1_max = 255;
|
||||||
static int ip_local_port_range_min[] = { 1, 1 };
|
static int ip_local_port_range_min[] = { 1, 1 };
|
||||||
static int ip_local_port_range_max[] = { 65535, 65535 };
|
static int ip_local_port_range_max[] = { 65535, 65535 };
|
||||||
@ -676,6 +677,15 @@ static struct ctl_table ipv4_table[] = {
|
|||||||
.mode = 0644,
|
.mode = 0644,
|
||||||
.proc_handler = proc_dointvec
|
.proc_handler = proc_dointvec
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_early_retrans",
|
||||||
|
.data = &sysctl_tcp_early_retrans,
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dointvec_minmax,
|
||||||
|
.extra1 = &zero,
|
||||||
|
.extra2 = &two,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
.procname = "udp_mem",
|
.procname = "udp_mem",
|
||||||
.data = &sysctl_udp_mem,
|
.data = &sysctl_udp_mem,
|
||||||
|
@ -395,6 +395,7 @@ void tcp_init_sock(struct sock *sk)
|
|||||||
tp->mss_cache = TCP_MSS_DEFAULT;
|
tp->mss_cache = TCP_MSS_DEFAULT;
|
||||||
|
|
||||||
tp->reordering = sysctl_tcp_reordering;
|
tp->reordering = sysctl_tcp_reordering;
|
||||||
|
tcp_enable_early_retrans(tp);
|
||||||
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
|
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
|
||||||
|
|
||||||
sk->sk_state = TCP_CLOSE;
|
sk->sk_state = TCP_CLOSE;
|
||||||
@ -2495,6 +2496,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
|
|||||||
err = -EINVAL;
|
err = -EINVAL;
|
||||||
else
|
else
|
||||||
tp->thin_dupack = val;
|
tp->thin_dupack = val;
|
||||||
|
if (tp->thin_dupack)
|
||||||
|
tcp_disable_early_retrans(tp);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case TCP_REPAIR:
|
case TCP_REPAIR:
|
||||||
|
@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
|
|||||||
|
|
||||||
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
|
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
|
||||||
int sysctl_tcp_abc __read_mostly;
|
int sysctl_tcp_abc __read_mostly;
|
||||||
|
int sysctl_tcp_early_retrans __read_mostly = 2;
|
||||||
|
|
||||||
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
|
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
|
||||||
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
|
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
|
||||||
@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk)
|
|||||||
if (dst_metric(dst, RTAX_REORDERING) &&
|
if (dst_metric(dst, RTAX_REORDERING) &&
|
||||||
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
|
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
|
||||||
tcp_disable_fack(tp);
|
tcp_disable_fack(tp);
|
||||||
|
tcp_disable_early_retrans(tp);
|
||||||
tp->reordering = dst_metric(dst, RTAX_REORDERING);
|
tp->reordering = dst_metric(dst, RTAX_REORDERING);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
|
|||||||
#endif
|
#endif
|
||||||
tcp_disable_fack(tp);
|
tcp_disable_fack(tp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (metric > 0)
|
||||||
|
tcp_disable_early_retrans(tp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This must be called before lost_out is incremented */
|
/* This must be called before lost_out is incremented */
|
||||||
@ -2492,6 +2497,16 @@ static int tcp_time_to_recover(struct sock *sk)
|
|||||||
tcp_is_sack(tp) && !tcp_send_head(sk))
|
tcp_is_sack(tp) && !tcp_send_head(sk))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
/* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
|
||||||
|
* retransmissions due to small network reorderings, we implement
|
||||||
|
* Mitigation A.3 in the RFC and delay the retransmission for a short
|
||||||
|
* interval if appropriate.
|
||||||
|
*/
|
||||||
|
if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
|
||||||
|
(tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
|
||||||
|
!tcp_may_send_now(sk))
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
|
|||||||
newtp->sacked_out = 0;
|
newtp->sacked_out = 0;
|
||||||
newtp->fackets_out = 0;
|
newtp->fackets_out = 0;
|
||||||
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
|
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
|
||||||
|
tcp_enable_early_retrans(newtp);
|
||||||
|
|
||||||
/* So many TCP implementations out there (incorrectly) count the
|
/* So many TCP implementations out there (incorrectly) count the
|
||||||
* initial SYN frame in their delayed-ACK and congestion control
|
* initial SYN frame in their delayed-ACK and congestion control
|
||||||
|
Loading…
x
Reference in New Issue
Block a user