Merge branch 'mptcp-features-and-fixes-for-v6-7'

Mat Martineau says:

====================
mptcp: Features and fixes for v6.7

Patch 1 adds a configurable timeout for the MPTCP connection when all
subflows are closed, to support break-before-make use cases.

Patch 2 is a fix for a 1-byte error in rx data counters with MPTCP
fastopen connections.

Patch 3 is a minor code cleanup.

Patches 4 & 5 add handling of rcvlowat for MPTCP sockets, with a
prerequisite patch to use a common scaling ratio between TCP and MPTCP.

Patch 6 improves efficiency of memory copying in MPTCP transmit code.

Patch 7 refactors syncing of socket options from the MPTCP socket to
its subflows.

Patches 8 & 9 help the MPTCP packet scheduler perform well by changing
the handling of notsent_lowat in subflows and how available buffer space
is calculated for MPTCP-level sends.
====================

Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2023-10-25 12:23:36 -07:00
commit 8846f9a04b
8 changed files with 229 additions and 82 deletions

View File

@ -25,6 +25,17 @@ add_addr_timeout - INTEGER (seconds)
Default: 120 Default: 120
close_timeout - INTEGER (seconds)
Set the make-after-break timeout: in absence of any close or
shutdown syscall, MPTCP sockets will maintain the status
unchanged for such time, after the last subflow removal, before
moving to TCP_CLOSE.
The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace
sysctl.
Default: 60
checksum_enabled - BOOLEAN checksum_enabled - BOOLEAN
Control whether DSS checksum can be enabled. Control whether DSS checksum can be enabled.

View File

@ -1489,13 +1489,15 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
} }
/* Assume a conservative default of 1200 bytes of payload per 4K page.
* This may be adjusted later in tcp_measure_rcv_mss().
*/
#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
SKB_TRUESIZE(4096))
static inline void tcp_scaling_ratio_init(struct sock *sk) static inline void tcp_scaling_ratio_init(struct sock *sk)
{ {
/* Assume a conservative default of 1200 bytes of payload per 4K page. tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
* This may be adjusted later in tcp_measure_rcv_mss().
*/
tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
SKB_TRUESIZE(4096);
} }
/* Note: caller must be prepared to deal with negative returns */ /* Note: caller must be prepared to deal with negative returns */

View File

@ -27,6 +27,7 @@ struct mptcp_pernet {
#endif #endif
unsigned int add_addr_timeout; unsigned int add_addr_timeout;
unsigned int close_timeout;
unsigned int stale_loss_cnt; unsigned int stale_loss_cnt;
u8 mptcp_enabled; u8 mptcp_enabled;
u8 checksum_enabled; u8 checksum_enabled;
@ -65,6 +66,13 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
return mptcp_get_pernet(net)->stale_loss_cnt; return mptcp_get_pernet(net)->stale_loss_cnt;
} }
unsigned int mptcp_close_timeout(const struct sock *sk)
{
if (sock_flag(sk, SOCK_DEAD))
return TCP_TIMEWAIT_LEN;
return mptcp_get_pernet(sock_net(sk))->close_timeout;
}
int mptcp_get_pm_type(const struct net *net) int mptcp_get_pm_type(const struct net *net)
{ {
return mptcp_get_pernet(net)->pm_type; return mptcp_get_pernet(net)->pm_type;
@ -79,6 +87,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{ {
pernet->mptcp_enabled = 1; pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX; pernet->add_addr_timeout = TCP_RTO_MAX;
pernet->close_timeout = TCP_TIMEWAIT_LEN;
pernet->checksum_enabled = 0; pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1; pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4; pernet->stale_loss_cnt = 4;
@ -141,6 +150,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dostring, .proc_handler = proc_dostring,
}, },
{
.procname = "close_timeout",
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{} {}
}; };
@ -163,6 +178,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[4].data = &pernet->stale_loss_cnt; table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type; table[5].data = &pernet->pm_type;
table[6].data = &pernet->scheduler; table[6].data = &pernet->scheduler;
table[7].data = &pernet->close_timeout;
hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
ARRAY_SIZE(mptcp_sysctl_table)); ARRAY_SIZE(mptcp_sysctl_table));

View File

@ -52,6 +52,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
mptcp_set_owner_r(skb, sk); mptcp_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb); __skb_queue_tail(&sk->sk_receive_queue, skb);
mptcp_sk(sk)->bytes_received += skb->len;
sk->sk_data_ready(sk); sk->sk_data_ready(sk);

View File

@ -121,8 +121,6 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
ret = __mptcp_socket_create(msk); ret = __mptcp_socket_create(msk);
if (ret) if (ret)
return ERR_PTR(ret); return ERR_PTR(ret);
mptcp_sockopt_sync(msk, msk->first);
} }
return msk->first; return msk->first;
@ -863,9 +861,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
/* Wake-up the reader only for in-sequence data */ /* Wake-up the reader only for in-sequence data */
mptcp_data_lock(sk); mptcp_data_lock(sk);
if (move_skbs_to_msk(msk, ssk)) if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
sk->sk_data_ready(sk); sk->sk_data_ready(sk);
mptcp_data_unlock(sk); mptcp_data_unlock(sk);
} }
@ -893,6 +890,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
mptcp_sockopt_sync_locked(msk, ssk); mptcp_sockopt_sync_locked(msk, ssk);
mptcp_subflow_joined(msk, ssk); mptcp_subflow_joined(msk, ssk);
mptcp_stop_tout_timer(sk); mptcp_stop_tout_timer(sk);
__mptcp_propagate_sndbuf(sk, ssk);
return true; return true;
} }
@ -1079,15 +1077,16 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
bool first = true; bool first = true;
sk_stream_moderate_sndbuf(sk);
mptcp_for_each_subflow(msk, subflow) { mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (first) if (first)
tcp_enter_memory_pressure(ssk); tcp_enter_memory_pressure(ssk);
sk_stream_moderate_sndbuf(ssk); sk_stream_moderate_sndbuf(ssk);
first = false; first = false;
} }
__mptcp_sync_sndbuf(sk);
} }
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
@ -1761,6 +1760,18 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return ret; return ret;
} }
static int do_copy_data_nocache(struct sock *sk, int copy,
struct iov_iter *from, char *to)
{
if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
if (!copy_from_iter_full_nocache(to, copy, from))
return -EFAULT;
} else if (!copy_from_iter_full(to, copy, from)) {
return -EFAULT;
}
return 0;
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
@ -1834,11 +1845,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!sk_wmem_schedule(sk, total_ts)) if (!sk_wmem_schedule(sk, total_ts))
goto wait_for_memory; goto wait_for_memory;
if (copy_page_from_iter(dfrag->page, offset, psize, ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
&msg->msg_iter) != psize) { page_address(dfrag->page) + offset);
ret = -EFAULT; if (ret)
goto do_error; goto do_error;
}
/* data successfully copied into the write queue */ /* data successfully copied into the write queue */
sk_forward_alloc_add(sk, -total_ts); sk_forward_alloc_add(sk, -total_ts);
@ -1922,6 +1932,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
if (!(flags & MSG_PEEK)) { if (!(flags & MSG_PEEK)) {
MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->offset += count;
MPTCP_SKB_CB(skb)->map_seq += count; MPTCP_SKB_CB(skb)->map_seq += count;
msk->bytes_consumed += count;
} }
break; break;
} }
@ -1932,6 +1943,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
__skb_unlink(skb, &msk->receive_queue); __skb_unlink(skb, &msk->receive_queue);
__kfree_skb(skb); __kfree_skb(skb);
msk->bytes_consumed += count;
} }
if (copied >= len) if (copied >= len)
@ -2391,8 +2403,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (msk->in_accept_queue && msk->first == ssk && if (msk->in_accept_queue && msk->first == ssk &&
(sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
/* ensure later check in mptcp_worker() will dispose the msk */ /* ensure later check in mptcp_worker() will dispose the msk */
mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DEAD);
mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
mptcp_subflow_drop_ctx(ssk); mptcp_subflow_drop_ctx(ssk);
goto out_release; goto out_release;
@ -2448,6 +2460,7 @@ out_release:
WRITE_ONCE(msk->first, NULL); WRITE_ONCE(msk->first, NULL);
out: out:
__mptcp_sync_sndbuf(sk);
if (need_push) if (need_push)
__mptcp_push_pending(sk, 0); __mptcp_push_pending(sk, 0);
@ -2516,7 +2529,7 @@ static bool mptcp_close_tout_expired(const struct sock *sk)
return false; return false;
return time_after32(tcp_jiffies32, return time_after32(tcp_jiffies32,
inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN); inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk));
} }
static void mptcp_check_fastclose(struct mptcp_sock *msk) static void mptcp_check_fastclose(struct mptcp_sock *msk)
@ -2659,7 +2672,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
return; return;
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
TCP_TIMEWAIT_LEN; mptcp_close_timeout(sk);
/* the close timeout takes precedence on the fail one, and here at least one of /* the close timeout takes precedence on the fail one, and here at least one of
* them is active * them is active
@ -2755,6 +2768,7 @@ static void __mptcp_init_sock(struct sock *sk)
msk->rmem_fwd_alloc = 0; msk->rmem_fwd_alloc = 0;
WRITE_ONCE(msk->rmem_released, 0); WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN; msk->timer_ival = TCP_RTO_MIN;
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
WRITE_ONCE(msk->first, NULL); WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@ -2964,16 +2978,9 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
__mptcp_destroy_sock(sk); __mptcp_destroy_sock(sk);
} }
static __poll_t mptcp_check_readable(struct mptcp_sock *msk) static __poll_t mptcp_check_readable(struct sock *sk)
{ {
/* Concurrent splices from sk_receive_queue into receive_queue will return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
* always show at least one non-empty queue when checked in this order.
*/
if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
skb_queue_empty_lockless(&msk->receive_queue))
return 0;
return EPOLLIN | EPOLLRDNORM;
} }
static void mptcp_check_listen_stop(struct sock *sk) static void mptcp_check_listen_stop(struct sock *sk)
@ -3011,7 +3018,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
goto cleanup; goto cleanup;
} }
if (mptcp_check_readable(msk) || timeout < 0) { if (mptcp_data_avail(msk) || timeout < 0) {
/* If the msk has read data, or the caller explicitly ask it, /* If the msk has read data, or the caller explicitly ask it,
* do the MPTCP equivalent of TCP reset, aka MPTCP fastclose * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
*/ */
@ -3138,6 +3145,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
msk->snd_data_fin_enable = false; msk->snd_data_fin_enable = false;
msk->rcv_fastclose = false; msk->rcv_fastclose = false;
msk->use_64bit_ack = false; msk->use_64bit_ack = false;
msk->bytes_consumed = 0;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
mptcp_pm_data_reset(msk); mptcp_pm_data_reset(msk);
mptcp_ca_reset(sk); mptcp_ca_reset(sk);
@ -3219,7 +3227,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
* uses the correct data * uses the correct data
*/ */
mptcp_copy_inaddrs(nsk, ssk); mptcp_copy_inaddrs(nsk, ssk);
mptcp_propagate_sndbuf(nsk, ssk); __mptcp_propagate_sndbuf(nsk, ssk);
mptcp_rcv_space_init(msk, ssk); mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(nsk); bh_unlock_sock(nsk);
@ -3397,6 +3405,8 @@ static void mptcp_release_cb(struct sock *sk)
__mptcp_set_connected(sk); __mptcp_set_connected(sk);
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk); __mptcp_error_report(sk);
if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
__mptcp_sync_sndbuf(sk);
} }
__mptcp_update_rmem(sk); __mptcp_update_rmem(sk);
@ -3441,6 +3451,14 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status)
__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk); mptcp_data_unlock(sk);
} }
if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
mptcp_data_lock(sk);
if (!sock_owned_by_user(sk))
__mptcp_sync_sndbuf(sk);
else
__set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
}
if (status & BIT(MPTCP_DELEGATE_ACK)) if (status & BIT(MPTCP_DELEGATE_ACK))
schedule_3rdack_retransmission(ssk); schedule_3rdack_retransmission(ssk);
} }
@ -3525,6 +3543,7 @@ bool mptcp_finish_join(struct sock *ssk)
/* active subflow, already present inside the conn_list */ /* active subflow, already present inside the conn_list */
if (!list_empty(&subflow->node)) { if (!list_empty(&subflow->node)) {
mptcp_subflow_joined(msk, ssk); mptcp_subflow_joined(msk, ssk);
mptcp_propagate_sndbuf(parent, ssk);
return true; return true;
} }
@ -3909,7 +3928,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk); mask |= mptcp_check_readable(sk);
if (shutdown & SEND_SHUTDOWN) if (shutdown & SEND_SHUTDOWN)
mask |= EPOLLOUT | EPOLLWRNORM; mask |= EPOLLOUT | EPOLLWRNORM;
else else
@ -3947,6 +3966,7 @@ static const struct proto_ops mptcp_stream_ops = {
.sendmsg = inet_sendmsg, .sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg, .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap, .mmap = sock_no_mmap,
.set_rcvlowat = mptcp_set_rcvlowat,
}; };
static struct inet_protosw mptcp_protosw = { static struct inet_protosw mptcp_protosw = {
@ -4048,6 +4068,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_ioctl = inet6_compat_ioctl, .compat_ioctl = inet6_compat_ioctl,
#endif #endif
.set_rcvlowat = mptcp_set_rcvlowat,
}; };
static struct proto mptcp_v6_prot; static struct proto mptcp_v6_prot;

View File

@ -125,6 +125,7 @@
#define MPTCP_RETRANSMIT 4 #define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_FLUSH_JOIN_LIST 5
#define MPTCP_CONNECTED 6 #define MPTCP_CONNECTED 6
#define MPTCP_SYNC_SNDBUF 7
struct mptcp_skb_cb { struct mptcp_skb_cb {
u64 map_seq; u64 map_seq;
@ -269,6 +270,7 @@ struct mptcp_sock {
atomic64_t rcv_wnd_sent; atomic64_t rcv_wnd_sent;
u64 rcv_data_fin_seq; u64 rcv_data_fin_seq;
u64 bytes_retrans; u64 bytes_retrans;
u64 bytes_consumed;
int rmem_fwd_alloc; int rmem_fwd_alloc;
int snd_burst; int snd_burst;
int old_wspace; int old_wspace;
@ -434,11 +436,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
return (struct mptcp_subflow_request_sock *)rsk; return (struct mptcp_subflow_request_sock *)rsk;
} }
enum mptcp_data_avail {
MPTCP_SUBFLOW_NODATA,
MPTCP_SUBFLOW_DATA_AVAIL,
};
struct mptcp_delegated_action { struct mptcp_delegated_action {
struct napi_struct napi; struct napi_struct napi;
struct list_head head; struct list_head head;
@ -449,6 +446,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
#define MPTCP_DELEGATE_SCHEDULED 0 #define MPTCP_DELEGATE_SCHEDULED 0
#define MPTCP_DELEGATE_SEND 1 #define MPTCP_DELEGATE_SEND 1
#define MPTCP_DELEGATE_ACK 2 #define MPTCP_DELEGATE_ACK 2
#define MPTCP_DELEGATE_SNDBUF 3
#define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) #define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED))
/* MPTCP subflow context */ /* MPTCP subflow context */
@ -494,7 +492,7 @@ struct mptcp_subflow_context {
valid_csum_seen : 1, /* at least one csum validated */ valid_csum_seen : 1, /* at least one csum validated */
is_mptfo : 1, /* subflow is doing TFO */ is_mptfo : 1, /* subflow is doing TFO */
__unused : 9; __unused : 9;
enum mptcp_data_avail data_avail; bool data_avail;
bool scheduled; bool scheduled;
u32 remote_nonce; u32 remote_nonce;
u64 thmac; u64 thmac;
@ -522,6 +520,9 @@ struct mptcp_subflow_context {
u32 setsockopt_seq; u32 setsockopt_seq;
u32 stale_rcv_tstamp; u32 stale_rcv_tstamp;
int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf,
* protected by the msk socket lock
*/
struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */ struct sock *conn; /* parent mptcp_sock */
@ -615,6 +616,7 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net); int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net);
unsigned int mptcp_close_timeout(const struct sock *sk);
int mptcp_get_pm_type(const struct net *net); int mptcp_get_pm_type(const struct net *net);
const char *mptcp_get_scheduler(const struct net *net); const char *mptcp_get_scheduler(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
@ -663,6 +665,24 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
int mptcp_sched_get_send(struct mptcp_sock *msk); int mptcp_sched_get_send(struct mptcp_sock *msk);
int mptcp_sched_get_retrans(struct mptcp_sock *msk); int mptcp_sched_get_retrans(struct mptcp_sock *msk);
/* Difference between the msk bytes_received and bytes_consumed counters,
 * each sampled once with READ_ONCE().
 */
static inline u64 mptcp_data_avail(const struct mptcp_sock *msk)
{
	u64 received = READ_ONCE(msk->bytes_received);
	u64 consumed = READ_ONCE(msk->bytes_consumed);

	return received - consumed;
}
static inline bool mptcp_epollin_ready(const struct sock *sk)
{
/* mptcp doesn't have to deal with small skbs in the receive queue,
* at it can always coalesce them
*/
return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) ||
(mem_cgroup_sockets_enabled && sk->sk_memcg &&
mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
READ_ONCE(tcp_memory_pressure);
}
int mptcp_set_rcvlowat(struct sock *sk, int val);
static inline bool __tcp_can_send(const struct sock *ssk) static inline bool __tcp_can_send(const struct sock *ssk)
{ {
/* only send if our side has not closed yet */ /* only send if our side has not closed yet */
@ -737,6 +757,7 @@ static inline bool mptcp_is_fully_established(struct sock *sk)
return inet_sk_state_load(sk) == TCP_ESTABLISHED && return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
READ_ONCE(mptcp_sk(sk)->fully_established); READ_ONCE(mptcp_sk(sk)->fully_established);
} }
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk); bool mptcp_finish_join(struct sock *sk);
@ -764,13 +785,52 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
} }
static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) static inline void __mptcp_sync_sndbuf(struct sock *sk)
{ {
if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf)) struct mptcp_subflow_context *subflow;
return false; int ssk_sndbuf, new_sndbuf;
WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
return true; return;
new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0];
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);
subflow->cached_sndbuf = ssk_sndbuf;
new_sndbuf += ssk_sndbuf;
}
/* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
WRITE_ONCE(sk->sk_sndbuf, new_sndbuf);
}
/* The caller holds both the msk socket and the subflow socket locks,
* possibly under BH
*/
/* Trigger __mptcp_sync_sndbuf() on the msk when the subflow's current
 * sk_sndbuf differs from the value cached in its subflow context.
 */
static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	int cur_sndbuf = READ_ONCE(ssk->sk_sndbuf);

	/* nothing changed since the last sync */
	if (cur_sndbuf == ctx->cached_sndbuf)
		return;

	__mptcp_sync_sndbuf(sk);
}
/* The caller holds only the subflow socket lock, either in process or
* BH context. Additionally this can be called under the msk data lock,
* so we can't acquire such lock here: let the delegated action acquire
* the needed locks in a suitable order.
*/
static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf))
return;
local_bh_disable();
mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
local_bh_enable();
} }
static inline void mptcp_write_space(struct sock *sk) static inline void mptcp_write_space(struct sock *sk)

View File

@ -95,6 +95,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
case SO_SNDBUFFORCE: case SO_SNDBUFFORCE:
ssk->sk_userlocks |= SOCK_SNDBUF_LOCK; ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
break; break;
case SO_RCVBUF: case SO_RCVBUF:
case SO_RCVBUFFORCE: case SO_RCVBUFFORCE:
@ -1415,8 +1416,10 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
if (sk->sk_userlocks & tx_rx_locks) { if (sk->sk_userlocks & tx_rx_locks) {
ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
}
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
} }
@ -1444,37 +1447,63 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
} }
static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
{
bool slow = lock_sock_fast(ssk);
sync_socket_options(msk, ssk);
unlock_sock_fast(ssk, slow);
}
void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
msk_owned_by_me(msk);
if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
__mptcp_sockopt_sync(msk, ssk);
subflow->setsockopt_seq = msk->setsockopt_seq;
}
}
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
msk_owned_by_me(msk); msk_owned_by_me(msk);
ssk->sk_rcvlowat = 0;
/* subflows must ignore any latency-related settings: will not affect
* the user-space - only the msk is relevant - but will foul the
* mptcp scheduler
*/
tcp_sk(ssk)->notsent_lowat = UINT_MAX;
if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
sync_socket_options(msk, ssk); sync_socket_options(msk, ssk);
subflow->setsockopt_seq = msk->setsockopt_seq; subflow->setsockopt_seq = msk->setsockopt_seq;
} }
} }
/* unfortunately this is different enough from the tcp version so
* that we can't factor it out
*/
/* Set the receive low-water mark of an MPTCP socket.
 *
 * @sk: the MPTCP socket
 * @val: requested threshold
 *
 * The requested value is capped to half of sk_rcvbuf when the receive
 * buffer is user-locked (SOCK_RCVBUF_LOCK), or to half of the namespace
 * tcp_rmem[2] otherwise; a zero result is stored as 1.
 * Unless the receive buffer is user-locked, and only when the space
 * computed by __tcp_space_from_win() exceeds the current sk_rcvbuf, the
 * msk and every subflow receive buffer are raised to that space and each
 * subflow window_clamp is set to the capped value.
 *
 * Always returns 0.
 */
int mptcp_set_rcvlowat(struct sock *sk, int val)
{
struct mptcp_subflow_context *subflow;
int space, cap;
/* pick the upper bound for the threshold */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
cap = sk->sk_rcvbuf >> 1;
else
cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
val = min(val, cap);
/* a zero threshold is stored as 1 */
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
/* Check if we need to signal EPOLLIN right now */
if (mptcp_epollin_ready(sk))
sk->sk_data_ready(sk);
/* a user-locked receive buffer is never resized */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
return 0;
/* buffer space computed from the threshold and the msk scaling ratio */
space = __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val);
/* the current receive buffer is already large enough */
if (space <= sk->sk_rcvbuf)
return 0;
/* propagate the rcvbuf changes to all the subflows */
WRITE_ONCE(sk->sk_rcvbuf, space);
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow;
/* update each subflow under its own socket lock */
slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, space);
tcp_sk(ssk)->window_clamp = val;
unlock_sock_fast(ssk, slow);
}
return 0;
}

View File

@ -421,6 +421,7 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc
void __mptcp_set_connected(struct sock *sk) void __mptcp_set_connected(struct sock *sk)
{ {
__mptcp_propagate_sndbuf(sk, mptcp_sk(sk)->first);
if (sk->sk_state == TCP_SYN_SENT) { if (sk->sk_state == TCP_SYN_SENT) {
inet_sk_state_store(sk, TCP_ESTABLISHED); inet_sk_state_store(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk); sk->sk_state_change(sk);
@ -472,7 +473,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
return; return;
msk = mptcp_sk(parent); msk = mptcp_sk(parent);
mptcp_propagate_sndbuf(parent, sk);
subflow->rel_write_seq = 1; subflow->rel_write_seq = 1;
subflow->conn_finished = 1; subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq; subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
@ -1237,7 +1237,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
struct sk_buff *skb; struct sk_buff *skb;
if (!skb_peek(&ssk->sk_receive_queue)) if (!skb_peek(&ssk->sk_receive_queue))
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); WRITE_ONCE(subflow->data_avail, false);
if (subflow->data_avail) if (subflow->data_avail)
return true; return true;
@ -1271,7 +1271,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
continue; continue;
} }
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); WRITE_ONCE(subflow->data_avail, true);
break; break;
} }
return true; return true;
@ -1293,7 +1293,7 @@ fallback:
goto reset; goto reset;
} }
mptcp_subflow_fail(msk, ssk); mptcp_subflow_fail(msk, ssk);
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); WRITE_ONCE(subflow->data_avail, true);
return true; return true;
} }
@ -1310,7 +1310,7 @@ reset:
while ((skb = skb_peek(&ssk->sk_receive_queue))) while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb); sk_eat_skb(ssk, skb);
tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_send_active_reset(ssk, GFP_ATOMIC);
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); WRITE_ONCE(subflow->data_avail, false);
return false; return false;
} }
@ -1322,7 +1322,7 @@ reset:
subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_seq = READ_ONCE(msk->ack_seq);
subflow->map_data_len = skb->len; subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); WRITE_ONCE(subflow->data_avail, true);
return true; return true;
} }
@ -1334,7 +1334,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
if (subflow->map_valid && if (subflow->map_valid &&
mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
subflow->map_valid = 0; subflow->map_valid = 0;
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); WRITE_ONCE(subflow->data_avail, false);
pr_debug("Done with mapping: seq=%u data_len=%u", pr_debug("Done with mapping: seq=%u data_len=%u",
subflow->map_subflow_seq, subflow->map_subflow_seq,
@ -1405,10 +1405,18 @@ static void subflow_data_ready(struct sock *sk)
WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
!subflow->mp_join && !(state & TCPF_CLOSE)); !subflow->mp_join && !(state & TCPF_CLOSE));
if (mptcp_subflow_data_available(sk)) if (mptcp_subflow_data_available(sk)) {
mptcp_data_ready(parent, sk); mptcp_data_ready(parent, sk);
else if (unlikely(sk->sk_err))
/* subflow-level lowat tests are not relevant.
* respect the msk-level threshold, eventually mandating an immediate ack
*/
if (mptcp_data_avail(msk) < parent->sk_rcvlowat &&
(tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss)
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
} else if (unlikely(sk->sk_err)) {
subflow_error_report(sk); subflow_error_report(sk);
}
} }
static void subflow_write_space(struct sock *ssk) static void subflow_write_space(struct sock *ssk)
@ -1525,8 +1533,6 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
if (addr.ss_family == AF_INET6) if (addr.ss_family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6); addrlen = sizeof(struct sockaddr_in6);
#endif #endif
mptcp_sockopt_sync(msk, ssk);
ssk->sk_bound_dev_if = ifindex; ssk->sk_bound_dev_if = ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err) if (err)
@ -1637,7 +1643,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
err = security_mptcp_add_subflow(sk, sf->sk); err = security_mptcp_add_subflow(sk, sf->sk);
if (err) if (err)
goto release_ssk; goto err_free;
/* the newly created socket has to be in the same cgroup as its parent */ /* the newly created socket has to be in the same cgroup as its parent */
mptcp_attach_cgroup(sk, sf->sk); mptcp_attach_cgroup(sk, sf->sk);
@ -1651,15 +1657,12 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
sock_inuse_add(net, 1); sock_inuse_add(net, 1);
err = tcp_set_ulp(sf->sk, "mptcp"); err = tcp_set_ulp(sf->sk, "mptcp");
if (err)
goto err_free;
release_ssk: mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk);
release_sock(sf->sk); release_sock(sf->sk);
if (err) {
sock_release(sf);
return err;
}
/* the newly created socket really belongs to the owning MPTCP master /* the newly created socket really belongs to the owning MPTCP master
* socket, even if for additional subflows the allocation is performed * socket, even if for additional subflows the allocation is performed
* by a kernel workqueue. Adjust inode references, so that the * by a kernel workqueue. Adjust inode references, so that the
@ -1679,6 +1682,11 @@ release_ssk:
mptcp_subflow_ops_override(sf->sk); mptcp_subflow_ops_override(sf->sk);
return 0; return 0;
err_free:
release_sock(sf->sk);
sock_release(sf);
return err;
} }
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
@ -1728,7 +1736,6 @@ static void subflow_state_change(struct sock *sk)
msk = mptcp_sk(parent); msk = mptcp_sk(parent);
if (subflow_simultaneous_connect(sk)) { if (subflow_simultaneous_connect(sk)) {
mptcp_propagate_sndbuf(parent, sk);
mptcp_do_fallback(sk); mptcp_do_fallback(sk);
mptcp_rcv_space_init(msk, sk); mptcp_rcv_space_init(msk, sk);
pr_fallback(msk); pr_fallback(msk);