mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-16 13:34:30 +00:00
7c48266593
When an rxrpc call is in its transmission phase and is sending a lot of packets, stalls occasionally occur that cause severe performance degradation (eg. increasing the transmission time for a 256MiB payload from 0.7s to 2.5s over a 10G link). rxrpc already implements TCP-style congestion control [RFC5681] and this helps mitigate the effects, but occasionally we're missing a time event that deals with a missing ACK, leading to a stall until the RTO expires. Fix this by implementing RACK/TLP in rxrpc. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
419 lines
12 KiB
C
419 lines
12 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/* RACK-TLP [RFC8958] Implementation
|
|
*
|
|
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
*/
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
#include "ar-internal.h"
|
|
|
|
static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
|
|
ktime_t t2, rxrpc_seq_t seq2)
|
|
{
|
|
if (ktime_after(t1, t2))
|
|
return true;
|
|
return t1 == t2 && after(seq1, seq2);
|
|
}
|
|
|
|
/*
|
|
* Mark a packet lost.
|
|
*/
|
|
static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
|
|
struct rxrpc_txqueue *tq, unsigned int ix)
|
|
{
|
|
if (__test_and_set_bit(ix, &tq->segment_lost)) {
|
|
if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
|
|
call->tx_nr_resent--;
|
|
} else {
|
|
call->tx_nr_lost++;
|
|
}
|
|
tq->segment_xmit_ts[ix] = UINT_MAX;
|
|
}
|
|
|
|
/*
|
|
* Get the transmission time of a packet in the Tx queue.
|
|
*/
|
|
static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
|
|
{
|
|
if (tq->segment_xmit_ts[ix] == UINT_MAX)
|
|
return KTIME_MAX;
|
|
return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
|
|
}
|
|
|
|
/*
|
|
* Get a bitmask of nack bits for a queue segment and mask off any that aren't
|
|
* yet reported.
|
|
*/
|
|
static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
|
|
{
|
|
unsigned long nacks = ~tq->segment_acked;
|
|
|
|
if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
|
|
nacks &= (1UL << tq->nr_reported_acks) - 1;
|
|
return nacks;
|
|
}
|
|
|
|
/*
|
|
* Update the RACK state for the most recently sent packet that has been
|
|
* delivered [RFC8958 6.2 Step 2].
|
|
*/
|
|
static void rxrpc_rack_update(struct rxrpc_call *call,
|
|
struct rxrpc_ack_summary *summary,
|
|
struct rxrpc_txqueue *tq,
|
|
unsigned int ix)
|
|
{
|
|
rxrpc_seq_t seq = tq->qbase + ix;
|
|
ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
|
|
ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);
|
|
|
|
if (__test_and_clear_bit(ix, &tq->segment_lost))
|
|
call->tx_nr_lost--;
|
|
|
|
if (test_bit(ix, &tq->segment_retransmitted)) {
|
|
/* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
|
|
if (before(call->acks_highest_serial, tq->segment_serial[ix]))
|
|
return;
|
|
if (rtt < minmax_get(&call->min_rtt))
|
|
return;
|
|
}
|
|
|
|
/* The RACK algorithm requires the segment ACKs to be traversed in
|
|
* order of segment transmission - but the only thing this seems to
|
|
* matter for is that RACK.rtt is set to the rtt of the most recently
|
|
* transmitted segment. We should be able to achieve the same by only
|
|
* setting RACK.rtt if the xmit time is greater.
|
|
*/
|
|
if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
|
|
call->rack_rtt = rtt;
|
|
call->rack_rtt_ts = xmit_ts;
|
|
}
|
|
|
|
if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
|
|
call->rack_rtt = rtt;
|
|
call->rack_xmit_ts = xmit_ts;
|
|
call->rack_end_seq = seq;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Detect data segment reordering [RFC8958 6.2 Step 3].
|
|
*/
|
|
static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
|
|
struct rxrpc_ack_summary *summary,
|
|
struct rxrpc_txqueue *tq,
|
|
unsigned int ix)
|
|
{
|
|
rxrpc_seq_t seq = tq->qbase + ix;
|
|
|
|
/* Track the highest sequence number so far ACK'd. This is not
|
|
* necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
|
|
* could put a NACK in the last SACK slot.
|
|
*/
|
|
if (after(seq, call->rack_fack))
|
|
call->rack_fack = seq;
|
|
else if (before(seq, call->rack_fack) &&
|
|
test_bit(ix, &tq->segment_retransmitted))
|
|
call->rack_reordering_seen = true;
|
|
}
|
|
|
|
void rxrpc_input_rack_one(struct rxrpc_call *call,
|
|
struct rxrpc_ack_summary *summary,
|
|
struct rxrpc_txqueue *tq,
|
|
unsigned int ix)
|
|
{
|
|
rxrpc_rack_update(call, summary, tq, ix);
|
|
rxrpc_rack_detect_reordering(call, summary, tq, ix);
|
|
}
|
|
|
|
void rxrpc_input_rack(struct rxrpc_call *call,
|
|
struct rxrpc_ack_summary *summary,
|
|
struct rxrpc_txqueue *tq,
|
|
unsigned long new_acks)
|
|
{
|
|
while (new_acks) {
|
|
unsigned int ix = __ffs(new_acks);
|
|
|
|
__clear_bit(ix, &new_acks);
|
|
rxrpc_input_rack_one(call, summary, tq, ix);
|
|
}
|
|
|
|
trace_rxrpc_rack_update(call, summary);
|
|
}
|
|
|
|
/*
|
|
* Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated
|
|
* duration of the reordering window.
|
|
*
|
|
* Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can
|
|
* be given a 'DUPLICATE' reason with the serial number referring to the
|
|
* duplicated DATA packet. Rx does not inform as to whether this was a
|
|
* reception of the same packet twice or of a retransmission of a packet we
|
|
* already received (though this could be determined by the transmitter based
|
|
* on the serial number).
|
|
*/
|
|
static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
|
|
struct rxrpc_ack_summary *summary)
|
|
{
|
|
rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
|
|
rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
|
|
bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
|
|
int dup_thresh = 3;
|
|
|
|
/* DSACK-based reordering window adaptation */
|
|
if (!call->rack_dsack_round_none &&
|
|
after_eq(snd_una, call->rack_dsack_round))
|
|
call->rack_dsack_round_none = true;
|
|
|
|
/* Grow the reordering window per round that sees DSACK. Reset the
|
|
* window after 16 DSACK-free recoveries.
|
|
*/
|
|
if (call->rack_dsack_round_none && have_dsack_option) {
|
|
call->rack_dsack_round_none = false;
|
|
call->rack_dsack_round = snd_nxt;
|
|
call->rack_reo_wnd_mult++;
|
|
call->rack_reo_wnd_persist = 16;
|
|
} else if (summary->exiting_fast_or_rto_recovery) {
|
|
call->rack_reo_wnd_persist--;
|
|
if (call->rack_reo_wnd_persist <= 0)
|
|
call->rack_reo_wnd_mult = 1;
|
|
}
|
|
|
|
if (!call->rack_reordering_seen) {
|
|
if (summary->in_fast_or_rto_recovery)
|
|
return 0;
|
|
if (call->acks_nr_sacks >= dup_thresh)
|
|
return 0;
|
|
}
|
|
|
|
return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
|
|
call->srtt_us >> 3));
|
|
}
|
|
|
|
/*
|
|
* Detect losses [RFC8958 6.2 Step 5].
|
|
*/
|
|
static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
|
|
struct rxrpc_ack_summary *summary)
|
|
{
|
|
struct rxrpc_txqueue *tq;
|
|
ktime_t timeout = 0, lost_after, now = ktime_get_real();
|
|
|
|
call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
|
|
lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
|
|
trace_rxrpc_rack_scan_loss(call);
|
|
|
|
for (tq = call->tx_queue; tq; tq = tq->next) {
|
|
unsigned long nacks = rxrpc_tq_nacks(tq);
|
|
|
|
if (after(tq->qbase, call->tx_transmitted))
|
|
break;
|
|
trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);
|
|
|
|
/* Skip ones marked lost but not yet retransmitted */
|
|
nacks &= ~tq->segment_lost | tq->segment_retransmitted;
|
|
|
|
while (nacks) {
|
|
unsigned int ix = __ffs(nacks);
|
|
rxrpc_seq_t seq = tq->qbase + ix;
|
|
ktime_t remaining;
|
|
ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
|
|
|
|
__clear_bit(ix, &nacks);
|
|
|
|
if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
|
|
xmit_ts, seq)) {
|
|
remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
|
|
if (remaining <= 0) {
|
|
rxrpc_rack_mark_lost(call, tq, ix);
|
|
trace_rxrpc_rack_detect_loss(call, summary, seq);
|
|
} else {
|
|
timeout = max(remaining, timeout);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return timeout;
|
|
}
|
|
|
|
/*
|
|
* Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5].
|
|
*/
|
|
void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
|
|
struct rxrpc_ack_summary *summary)
|
|
{
|
|
ktime_t timeout = rxrpc_rack_detect_loss(call, summary);
|
|
|
|
if (timeout) {
|
|
call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
|
|
call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
|
|
trace_rxrpc_rack_timer(call, timeout, false);
|
|
trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Handle RACK-TLP RTO expiration [RFC8958 6.3].
|
|
*/
|
|
static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
|
|
{
|
|
struct rxrpc_txqueue *tq;
|
|
rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
|
|
ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
|
|
ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);
|
|
|
|
for (tq = call->tx_queue; tq; tq = tq->next) {
|
|
unsigned long unacked = ~tq->segment_acked;
|
|
|
|
trace_rxrpc_rack_mark_loss_tq(call, tq);
|
|
while (unacked) {
|
|
unsigned int ix = __ffs(unacked);
|
|
rxrpc_seq_t seq = tq->qbase + ix;
|
|
ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
|
|
|
|
if (after(seq, call->tx_transmitted))
|
|
return;
|
|
__clear_bit(ix, &unacked);
|
|
|
|
if (seq == snd_una ||
|
|
ktime_before(xmit_ts, deadline))
|
|
rxrpc_rack_mark_lost(call, tq, ix);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2].
|
|
*/
|
|
ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
|
|
{
|
|
unsigned int flight_size = rxrpc_tx_in_flight(call);
|
|
ktime_t rto_at = ktime_add(call->tx_last_sent,
|
|
rxrpc_get_rto_backoff(call, false));
|
|
ktime_t pto;
|
|
|
|
if (call->rtt_count > 0) {
|
|
/* Use 2*SRTT as the timeout. */
|
|
pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
|
|
if (flight_size)
|
|
pto = ktime_add(pto, call->tlp_max_ack_delay);
|
|
} else {
|
|
pto = NSEC_PER_SEC;
|
|
}
|
|
|
|
if (ktime_after(ktime_add(now, pto), rto_at))
|
|
pto = ktime_sub(rto_at, now);
|
|
return pto;
|
|
}
|
|
|
|
/*
|
|
* Send a TLP loss probe on PTO expiration [RFC8958 7.3].
|
|
*/
|
|
void rxrpc_tlp_send_probe(struct rxrpc_call *call)
|
|
{
|
|
unsigned int in_flight = rxrpc_tx_in_flight(call);
|
|
|
|
if (after_eq(call->acks_hard_ack, call->tx_transmitted))
|
|
return; /* Everything we transmitted has been acked. */
|
|
|
|
/* There must be no other loss probe still in flight and we need to
|
|
* have taken a new RTT sample since last probe or the start of
|
|
* connection.
|
|
*/
|
|
if (!call->tlp_serial &&
|
|
call->tlp_rtt_taken != call->rtt_taken) {
|
|
call->tlp_is_retrans = false;
|
|
if (after(call->send_top, call->tx_transmitted) &&
|
|
rxrpc_tx_window_space(call) > 0) {
|
|
/* Transmit the lowest-sequence unsent DATA */
|
|
call->tx_last_serial = 0;
|
|
rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
|
|
call->tlp_serial = call->tx_last_serial;
|
|
call->tlp_seq = call->tx_transmitted;
|
|
trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
|
|
in_flight = rxrpc_tx_in_flight(call);
|
|
} else {
|
|
/* Retransmit the highest-sequence DATA sent */
|
|
call->tx_last_serial = 0;
|
|
rxrpc_resend_tlp(call);
|
|
call->tlp_is_retrans = true;
|
|
trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
|
|
}
|
|
} else {
|
|
trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
|
|
}
|
|
|
|
if (in_flight != 0) {
|
|
ktime_t rto = rxrpc_get_rto_backoff(call, false);
|
|
|
|
call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
|
|
call->rack_timo_at = ktime_add(ktime_get_real(), rto);
|
|
trace_rxrpc_rack_timer(call, rto, false);
|
|
trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Detect losses using the ACK of a TLP loss probe [RFC8958 7.4].
|
|
*/
|
|
void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
|
|
{
|
|
if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
|
|
return;
|
|
|
|
if (!call->tlp_is_retrans) {
|
|
/* TLP of new data delivered */
|
|
trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
|
|
call->tlp_serial = 0;
|
|
} else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
|
|
summary->acked_serial == call->tlp_serial) {
|
|
/* General Case: Detected packet losses using RACK [7.4.1] */
|
|
trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
|
|
call->tlp_serial = 0;
|
|
} else if (after(call->acks_hard_ack, call->tlp_seq)) {
|
|
/* Repaired the single loss */
|
|
trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
|
|
call->tlp_serial = 0;
|
|
// TODO: Invoke congestion control to react to the loss
|
|
// event the probe has repaired
|
|
} else if (summary->tlp_probe_acked) {
|
|
trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
|
|
/* Special Case: Detected a single loss repaired by the loss
|
|
* probe [7.4.2]
|
|
*/
|
|
call->tlp_serial = 0;
|
|
} else {
|
|
trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Handle RACK timer expiration; returns true to request a resend.
|
|
*/
|
|
void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
|
|
{
|
|
struct rxrpc_ack_summary summary = {};
|
|
enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;
|
|
|
|
trace_rxrpc_rack_timer(call, overran_by, true);
|
|
call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
|
|
|
|
switch (mode) {
|
|
case RXRPC_CALL_RACKTIMER_RACK_REORDER:
|
|
rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
|
|
break;
|
|
case RXRPC_CALL_RACKTIMER_TLP_PTO:
|
|
rxrpc_tlp_send_probe(call);
|
|
break;
|
|
case RXRPC_CALL_RACKTIMER_RTO:
|
|
// Might need to poke the congestion algo in some way
|
|
rxrpc_rack_mark_losses_on_rto(call);
|
|
break;
|
|
//case RXRPC_CALL_RACKTIMER_ZEROWIN:
|
|
default:
|
|
pr_warn("Unexpected rack timer %u", call->rack_timer_mode);
|
|
}
|
|
}
|