mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-04 04:02:26 +00:00
mptcp: allow picking different xmit subflows
Update the scheduler to less trivial heuristic: cache the last used subflow, and try to send on it a reasonably long burst of data. When the burst or the subflow send space is exhausted, pick the subflow with the lower ratio between write space and send buffer - that is, the subflow with the greater relative amount of free space. v1 -> v2: - fix 32 bit build breakage due to 64bits div - fix checkpath issues (uint64_t -> u64) Signed-off-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
4596a2c1b7
commit
d5f49190de
@ -1031,41 +1031,105 @@ static void mptcp_nospace(struct mptcp_sock *msk)
|
||||
}
|
||||
}
|
||||
|
||||
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
|
||||
{
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
|
||||
/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
|
||||
if (subflow->request_join && !subflow->fully_established)
|
||||
return false;
|
||||
|
||||
/* only send if our side has not closed yet */
|
||||
return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
|
||||
}
|
||||
|
||||
#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
|
||||
sizeof(struct tcphdr) - \
|
||||
MAX_TCP_OPTION_SPACE - \
|
||||
sizeof(struct ipv6hdr) - \
|
||||
sizeof(struct frag_hdr))
|
||||
|
||||
struct subflow_send_info {
|
||||
struct sock *ssk;
|
||||
u64 ratio;
|
||||
};
|
||||
|
||||
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
|
||||
u32 *sndbuf)
|
||||
{
|
||||
struct subflow_send_info send_info[2];
|
||||
struct mptcp_subflow_context *subflow;
|
||||
struct sock *sk = (struct sock *)msk;
|
||||
struct sock *backup = NULL;
|
||||
bool free;
|
||||
int i, nr_active = 0;
|
||||
struct sock *ssk;
|
||||
u64 ratio;
|
||||
u32 pace;
|
||||
|
||||
sock_owned_by_me(sk);
|
||||
sock_owned_by_me((struct sock *)msk);
|
||||
|
||||
*sndbuf = 0;
|
||||
if (!mptcp_ext_cache_refill(msk))
|
||||
return NULL;
|
||||
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
|
||||
free = sk_stream_is_writeable(subflow->tcp_sock);
|
||||
if (!free) {
|
||||
mptcp_nospace(msk);
|
||||
if (__mptcp_check_fallback(msk)) {
|
||||
if (!msk->first)
|
||||
return NULL;
|
||||
*sndbuf = msk->first->sk_sndbuf;
|
||||
return sk_stream_memory_free(msk->first) ? msk->first : NULL;
|
||||
}
|
||||
|
||||
/* re-use last subflow, if the burst allow that */
|
||||
if (msk->last_snd && msk->snd_burst > 0 &&
|
||||
sk_stream_memory_free(msk->last_snd) &&
|
||||
mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
|
||||
if (subflow->backup) {
|
||||
if (!backup)
|
||||
backup = ssk;
|
||||
}
|
||||
return msk->last_snd;
|
||||
}
|
||||
|
||||
/* pick the subflow with the lower wmem/wspace ratio */
|
||||
for (i = 0; i < 2; ++i) {
|
||||
send_info[i].ssk = NULL;
|
||||
send_info[i].ratio = -1;
|
||||
}
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
if (!mptcp_subflow_active(subflow))
|
||||
continue;
|
||||
|
||||
nr_active += !subflow->backup;
|
||||
*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
|
||||
if (!sk_stream_memory_free(subflow->tcp_sock))
|
||||
continue;
|
||||
|
||||
pace = READ_ONCE(ssk->sk_pacing_rate);
|
||||
if (!pace)
|
||||
continue;
|
||||
|
||||
ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
|
||||
pace);
|
||||
if (ratio < send_info[subflow->backup].ratio) {
|
||||
send_info[subflow->backup].ssk = ssk;
|
||||
send_info[subflow->backup].ratio = ratio;
|
||||
}
|
||||
}
|
||||
|
||||
return ssk;
|
||||
}
|
||||
pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
|
||||
msk, nr_active, send_info[0].ssk, send_info[0].ratio,
|
||||
send_info[1].ssk, send_info[1].ratio);
|
||||
|
||||
return backup;
|
||||
/* pick the best backup if no other subflow is active */
|
||||
if (!nr_active)
|
||||
send_info[0].ssk = send_info[1].ssk;
|
||||
|
||||
if (send_info[0].ssk) {
|
||||
msk->last_snd = send_info[0].ssk;
|
||||
msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
|
||||
sk_stream_wspace(msk->last_snd));
|
||||
return msk->last_snd;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void ssk_check_wmem(struct mptcp_sock *msk)
|
||||
@ -1160,6 +1224,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
||||
break;
|
||||
}
|
||||
|
||||
/* burst can be negative, we will try move to the next subflow
|
||||
* at selection time, if possible.
|
||||
*/
|
||||
msk->snd_burst -= ret;
|
||||
copied += ret;
|
||||
|
||||
tx_ok = msg_data_left(msg);
|
||||
@ -1375,6 +1443,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
|
||||
unsigned int moved = 0;
|
||||
bool done;
|
||||
|
||||
/* avoid looping forever below on racing close */
|
||||
if (((struct sock *)msk)->sk_state == TCP_CLOSE)
|
||||
return false;
|
||||
|
||||
__mptcp_flush_join_list(msk);
|
||||
do {
|
||||
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
|
||||
|
||||
@ -1539,9 +1612,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
|
||||
|
||||
sock_owned_by_me((const struct sock *)msk);
|
||||
|
||||
if (__mptcp_check_fallback(msk))
|
||||
return msk->first;
|
||||
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
|
||||
if (!mptcp_subflow_active(subflow))
|
||||
continue;
|
||||
|
||||
/* still data outstanding at TCP level? Don't retransmit. */
|
||||
if (!tcp_write_queue_empty(ssk))
|
||||
return NULL;
|
||||
|
@ -196,6 +196,8 @@ struct mptcp_sock {
|
||||
u64 write_seq;
|
||||
u64 ack_seq;
|
||||
u64 rcv_data_fin_seq;
|
||||
struct sock *last_snd;
|
||||
int snd_burst;
|
||||
atomic64_t snd_una;
|
||||
unsigned long timer_ival;
|
||||
u32 token;
|
||||
@ -473,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)
|
||||
|
||||
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
|
||||
|
||||
static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
|
||||
static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
|
||||
{
|
||||
return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
|
||||
}
|
||||
|
||||
static inline bool mptcp_check_fallback(struct sock *sk)
|
||||
static inline bool mptcp_check_fallback(const struct sock *sk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
||||
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
|
||||
|
Loading…
Reference in New Issue
Block a user