mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2024-12-29 17:23:36 +00:00
Merge branch 'mptcp-features-and-fixes-for-v6-7'
Mat Martineau says: ==================== mptcp: Features and fixes for v6.7 Patch 1 adds a configurable timeout for the MPTCP connection when all subflows are closed, to support break-before-make use cases. Patch 2 is a fix for a 1-byte error in rx data counters with MPTCP fastopen connections. Patch 3 is a minor code cleanup. Patches 4 & 5 add handling of rcvlowat for MPTCP sockets, with a prerequisite patch to use a common scaling ratio between TCP and MPTCP. Patch 6 improves efficiency of memory copying in MPTCP transmit code. Patch 7 refactors syncing of socket options from the MPTCP socket to its subflows. Patches 8 & 9 help the MPTCP packet scheduler perform well by changing the handling of notsent_lowat in subflows and how available buffer space is calculated for MPTCP-level sends. ==================== Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
8846f9a04b
@ -25,6 +25,17 @@ add_addr_timeout - INTEGER (seconds)
|
||||
|
||||
Default: 120
|
||||
|
||||
close_timeout - INTEGER (seconds)
|
||||
Set the make-after-break timeout: in absence of any close or
|
||||
shutdown syscall, MPTCP sockets will maintain the status
|
||||
unchanged for such time, after the last subflow removal, before
|
||||
moving to TCP_CLOSE.
|
||||
|
||||
The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace
|
||||
sysctl.
|
||||
|
||||
Default: 60
|
||||
|
||||
checksum_enabled - BOOLEAN
|
||||
Control whether DSS checksum can be enabled.
|
||||
|
||||
|
@ -1489,13 +1489,15 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
|
||||
return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
|
||||
}
|
||||
|
||||
/* Assume a conservative default of 1200 bytes of payload per 4K page.
|
||||
* This may be adjusted later in tcp_measure_rcv_mss().
|
||||
*/
|
||||
#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
|
||||
SKB_TRUESIZE(4096))
|
||||
|
||||
static inline void tcp_scaling_ratio_init(struct sock *sk)
|
||||
{
|
||||
/* Assume a conservative default of 1200 bytes of payload per 4K page.
|
||||
* This may be adjusted later in tcp_measure_rcv_mss().
|
||||
*/
|
||||
tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
|
||||
SKB_TRUESIZE(4096);
|
||||
tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
|
||||
}
|
||||
|
||||
/* Note: caller must be prepared to deal with negative returns */
|
||||
|
@ -27,6 +27,7 @@ struct mptcp_pernet {
|
||||
#endif
|
||||
|
||||
unsigned int add_addr_timeout;
|
||||
unsigned int close_timeout;
|
||||
unsigned int stale_loss_cnt;
|
||||
u8 mptcp_enabled;
|
||||
u8 checksum_enabled;
|
||||
@ -65,6 +66,13 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
|
||||
return mptcp_get_pernet(net)->stale_loss_cnt;
|
||||
}
|
||||
|
||||
unsigned int mptcp_close_timeout(const struct sock *sk)
|
||||
{
|
||||
if (sock_flag(sk, SOCK_DEAD))
|
||||
return TCP_TIMEWAIT_LEN;
|
||||
return mptcp_get_pernet(sock_net(sk))->close_timeout;
|
||||
}
|
||||
|
||||
int mptcp_get_pm_type(const struct net *net)
|
||||
{
|
||||
return mptcp_get_pernet(net)->pm_type;
|
||||
@ -79,6 +87,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
|
||||
{
|
||||
pernet->mptcp_enabled = 1;
|
||||
pernet->add_addr_timeout = TCP_RTO_MAX;
|
||||
pernet->close_timeout = TCP_TIMEWAIT_LEN;
|
||||
pernet->checksum_enabled = 0;
|
||||
pernet->allow_join_initial_addr_port = 1;
|
||||
pernet->stale_loss_cnt = 4;
|
||||
@ -141,6 +150,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dostring,
|
||||
},
|
||||
{
|
||||
.procname = "close_timeout",
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_jiffies,
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
@ -163,6 +178,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
|
||||
table[4].data = &pernet->stale_loss_cnt;
|
||||
table[5].data = &pernet->pm_type;
|
||||
table[6].data = &pernet->scheduler;
|
||||
table[7].data = &pernet->close_timeout;
|
||||
|
||||
hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
|
||||
ARRAY_SIZE(mptcp_sysctl_table));
|
||||
|
@ -52,6 +52,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
|
||||
|
||||
mptcp_set_owner_r(skb, sk);
|
||||
__skb_queue_tail(&sk->sk_receive_queue, skb);
|
||||
mptcp_sk(sk)->bytes_received += skb->len;
|
||||
|
||||
sk->sk_data_ready(sk);
|
||||
|
||||
|
@ -121,8 +121,6 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
|
||||
ret = __mptcp_socket_create(msk);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
mptcp_sockopt_sync(msk, msk->first);
|
||||
}
|
||||
|
||||
return msk->first;
|
||||
@ -863,9 +861,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
|
||||
|
||||
/* Wake-up the reader only for in-sequence data */
|
||||
mptcp_data_lock(sk);
|
||||
if (move_skbs_to_msk(msk, ssk))
|
||||
if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
|
||||
sk->sk_data_ready(sk);
|
||||
|
||||
mptcp_data_unlock(sk);
|
||||
}
|
||||
|
||||
@ -893,6 +890,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
|
||||
mptcp_sockopt_sync_locked(msk, ssk);
|
||||
mptcp_subflow_joined(msk, ssk);
|
||||
mptcp_stop_tout_timer(sk);
|
||||
__mptcp_propagate_sndbuf(sk, ssk);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1079,15 +1077,16 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
bool first = true;
|
||||
|
||||
sk_stream_moderate_sndbuf(sk);
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
|
||||
if (first)
|
||||
tcp_enter_memory_pressure(ssk);
|
||||
sk_stream_moderate_sndbuf(ssk);
|
||||
|
||||
first = false;
|
||||
}
|
||||
__mptcp_sync_sndbuf(sk);
|
||||
}
|
||||
|
||||
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
|
||||
@ -1761,6 +1760,18 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int do_copy_data_nocache(struct sock *sk, int copy,
|
||||
struct iov_iter *from, char *to)
|
||||
{
|
||||
if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
|
||||
if (!copy_from_iter_full_nocache(to, copy, from))
|
||||
return -EFAULT;
|
||||
} else if (!copy_from_iter_full(to, copy, from)) {
|
||||
return -EFAULT;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
||||
{
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
@ -1834,11 +1845,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
||||
if (!sk_wmem_schedule(sk, total_ts))
|
||||
goto wait_for_memory;
|
||||
|
||||
if (copy_page_from_iter(dfrag->page, offset, psize,
|
||||
&msg->msg_iter) != psize) {
|
||||
ret = -EFAULT;
|
||||
ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
|
||||
page_address(dfrag->page) + offset);
|
||||
if (ret)
|
||||
goto do_error;
|
||||
}
|
||||
|
||||
/* data successfully copied into the write queue */
|
||||
sk_forward_alloc_add(sk, -total_ts);
|
||||
@ -1922,6 +1932,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
|
||||
if (!(flags & MSG_PEEK)) {
|
||||
MPTCP_SKB_CB(skb)->offset += count;
|
||||
MPTCP_SKB_CB(skb)->map_seq += count;
|
||||
msk->bytes_consumed += count;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -1932,6 +1943,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
|
||||
WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
|
||||
__skb_unlink(skb, &msk->receive_queue);
|
||||
__kfree_skb(skb);
|
||||
msk->bytes_consumed += count;
|
||||
}
|
||||
|
||||
if (copied >= len)
|
||||
@ -2391,8 +2403,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
|
||||
if (msk->in_accept_queue && msk->first == ssk &&
|
||||
(sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
|
||||
/* ensure later check in mptcp_worker() will dispose the msk */
|
||||
mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
|
||||
sock_set_flag(sk, SOCK_DEAD);
|
||||
mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
|
||||
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
|
||||
mptcp_subflow_drop_ctx(ssk);
|
||||
goto out_release;
|
||||
@ -2448,6 +2460,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
|
||||
WRITE_ONCE(msk->first, NULL);
|
||||
|
||||
out:
|
||||
__mptcp_sync_sndbuf(sk);
|
||||
if (need_push)
|
||||
__mptcp_push_pending(sk, 0);
|
||||
|
||||
@ -2516,7 +2529,7 @@ static bool mptcp_close_tout_expired(const struct sock *sk)
|
||||
return false;
|
||||
|
||||
return time_after32(tcp_jiffies32,
|
||||
inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
|
||||
inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk));
|
||||
}
|
||||
|
||||
static void mptcp_check_fastclose(struct mptcp_sock *msk)
|
||||
@ -2659,7 +2672,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
|
||||
return;
|
||||
|
||||
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
|
||||
TCP_TIMEWAIT_LEN;
|
||||
mptcp_close_timeout(sk);
|
||||
|
||||
/* the close timeout takes precedence on the fail one, and here at least one of
|
||||
* them is active
|
||||
@ -2755,6 +2768,7 @@ static void __mptcp_init_sock(struct sock *sk)
|
||||
msk->rmem_fwd_alloc = 0;
|
||||
WRITE_ONCE(msk->rmem_released, 0);
|
||||
msk->timer_ival = TCP_RTO_MIN;
|
||||
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
|
||||
|
||||
WRITE_ONCE(msk->first, NULL);
|
||||
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
|
||||
@ -2964,16 +2978,9 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
|
||||
__mptcp_destroy_sock(sk);
|
||||
}
|
||||
|
||||
static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
|
||||
static __poll_t mptcp_check_readable(struct sock *sk)
|
||||
{
|
||||
/* Concurrent splices from sk_receive_queue into receive_queue will
|
||||
* always show at least one non-empty queue when checked in this order.
|
||||
*/
|
||||
if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
|
||||
skb_queue_empty_lockless(&msk->receive_queue))
|
||||
return 0;
|
||||
|
||||
return EPOLLIN | EPOLLRDNORM;
|
||||
return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
|
||||
}
|
||||
|
||||
static void mptcp_check_listen_stop(struct sock *sk)
|
||||
@ -3011,7 +3018,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (mptcp_check_readable(msk) || timeout < 0) {
|
||||
if (mptcp_data_avail(msk) || timeout < 0) {
|
||||
/* If the msk has unread data, or the caller explicitly asks for it,
|
||||
* do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
|
||||
*/
|
||||
@ -3138,6 +3145,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
|
||||
msk->snd_data_fin_enable = false;
|
||||
msk->rcv_fastclose = false;
|
||||
msk->use_64bit_ack = false;
|
||||
msk->bytes_consumed = 0;
|
||||
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
|
||||
mptcp_pm_data_reset(msk);
|
||||
mptcp_ca_reset(sk);
|
||||
@ -3219,7 +3227,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
|
||||
* uses the correct data
|
||||
*/
|
||||
mptcp_copy_inaddrs(nsk, ssk);
|
||||
mptcp_propagate_sndbuf(nsk, ssk);
|
||||
__mptcp_propagate_sndbuf(nsk, ssk);
|
||||
|
||||
mptcp_rcv_space_init(msk, ssk);
|
||||
bh_unlock_sock(nsk);
|
||||
@ -3397,6 +3405,8 @@ static void mptcp_release_cb(struct sock *sk)
|
||||
__mptcp_set_connected(sk);
|
||||
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
|
||||
__mptcp_error_report(sk);
|
||||
if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
|
||||
__mptcp_sync_sndbuf(sk);
|
||||
}
|
||||
|
||||
__mptcp_update_rmem(sk);
|
||||
@ -3441,6 +3451,14 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status)
|
||||
__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
|
||||
mptcp_data_unlock(sk);
|
||||
}
|
||||
if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
|
||||
mptcp_data_lock(sk);
|
||||
if (!sock_owned_by_user(sk))
|
||||
__mptcp_sync_sndbuf(sk);
|
||||
else
|
||||
__set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
|
||||
mptcp_data_unlock(sk);
|
||||
}
|
||||
if (status & BIT(MPTCP_DELEGATE_ACK))
|
||||
schedule_3rdack_retransmission(ssk);
|
||||
}
|
||||
@ -3525,6 +3543,7 @@ bool mptcp_finish_join(struct sock *ssk)
|
||||
/* active subflow, already present inside the conn_list */
|
||||
if (!list_empty(&subflow->node)) {
|
||||
mptcp_subflow_joined(msk, ssk);
|
||||
mptcp_propagate_sndbuf(parent, ssk);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -3909,7 +3928,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
|
||||
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
|
||||
|
||||
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
|
||||
mask |= mptcp_check_readable(msk);
|
||||
mask |= mptcp_check_readable(sk);
|
||||
if (shutdown & SEND_SHUTDOWN)
|
||||
mask |= EPOLLOUT | EPOLLWRNORM;
|
||||
else
|
||||
@ -3947,6 +3966,7 @@ static const struct proto_ops mptcp_stream_ops = {
|
||||
.sendmsg = inet_sendmsg,
|
||||
.recvmsg = inet_recvmsg,
|
||||
.mmap = sock_no_mmap,
|
||||
.set_rcvlowat = mptcp_set_rcvlowat,
|
||||
};
|
||||
|
||||
static struct inet_protosw mptcp_protosw = {
|
||||
@ -4048,6 +4068,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
|
||||
#ifdef CONFIG_COMPAT
|
||||
.compat_ioctl = inet6_compat_ioctl,
|
||||
#endif
|
||||
.set_rcvlowat = mptcp_set_rcvlowat,
|
||||
};
|
||||
|
||||
static struct proto mptcp_v6_prot;
|
||||
|
@ -125,6 +125,7 @@
|
||||
#define MPTCP_RETRANSMIT 4
|
||||
#define MPTCP_FLUSH_JOIN_LIST 5
|
||||
#define MPTCP_CONNECTED 6
|
||||
#define MPTCP_SYNC_SNDBUF 7
|
||||
|
||||
struct mptcp_skb_cb {
|
||||
u64 map_seq;
|
||||
@ -269,6 +270,7 @@ struct mptcp_sock {
|
||||
atomic64_t rcv_wnd_sent;
|
||||
u64 rcv_data_fin_seq;
|
||||
u64 bytes_retrans;
|
||||
u64 bytes_consumed;
|
||||
int rmem_fwd_alloc;
|
||||
int snd_burst;
|
||||
int old_wspace;
|
||||
@ -434,11 +436,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
|
||||
return (struct mptcp_subflow_request_sock *)rsk;
|
||||
}
|
||||
|
||||
enum mptcp_data_avail {
|
||||
MPTCP_SUBFLOW_NODATA,
|
||||
MPTCP_SUBFLOW_DATA_AVAIL,
|
||||
};
|
||||
|
||||
struct mptcp_delegated_action {
|
||||
struct napi_struct napi;
|
||||
struct list_head head;
|
||||
@ -449,6 +446,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
|
||||
#define MPTCP_DELEGATE_SCHEDULED 0
|
||||
#define MPTCP_DELEGATE_SEND 1
|
||||
#define MPTCP_DELEGATE_ACK 2
|
||||
#define MPTCP_DELEGATE_SNDBUF 3
|
||||
|
||||
#define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED))
|
||||
/* MPTCP subflow context */
|
||||
@ -494,7 +492,7 @@ struct mptcp_subflow_context {
|
||||
valid_csum_seen : 1, /* at least one csum validated */
|
||||
is_mptfo : 1, /* subflow is doing TFO */
|
||||
__unused : 9;
|
||||
enum mptcp_data_avail data_avail;
|
||||
bool data_avail;
|
||||
bool scheduled;
|
||||
u32 remote_nonce;
|
||||
u64 thmac;
|
||||
@ -522,6 +520,9 @@ struct mptcp_subflow_context {
|
||||
|
||||
u32 setsockopt_seq;
|
||||
u32 stale_rcv_tstamp;
|
||||
int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf,
|
||||
* protected by the msk socket lock
|
||||
*/
|
||||
|
||||
struct sock *tcp_sock; /* tcp sk backpointer */
|
||||
struct sock *conn; /* parent mptcp_sock */
|
||||
@ -615,6 +616,7 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net);
|
||||
int mptcp_is_checksum_enabled(const struct net *net);
|
||||
int mptcp_allow_join_id0(const struct net *net);
|
||||
unsigned int mptcp_stale_loss_cnt(const struct net *net);
|
||||
unsigned int mptcp_close_timeout(const struct sock *sk);
|
||||
int mptcp_get_pm_type(const struct net *net);
|
||||
const char *mptcp_get_scheduler(const struct net *net);
|
||||
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
|
||||
@ -663,6 +665,24 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
|
||||
int mptcp_sched_get_send(struct mptcp_sock *msk);
|
||||
int mptcp_sched_get_retrans(struct mptcp_sock *msk);
|
||||
|
||||
static inline u64 mptcp_data_avail(const struct mptcp_sock *msk)
|
||||
{
|
||||
return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed);
|
||||
}
|
||||
|
||||
static inline bool mptcp_epollin_ready(const struct sock *sk)
|
||||
{
|
||||
/* mptcp doesn't have to deal with small skbs in the receive queue,
|
||||
* as it can always coalesce them
|
||||
*/
|
||||
return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) ||
|
||||
(mem_cgroup_sockets_enabled && sk->sk_memcg &&
|
||||
mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
|
||||
READ_ONCE(tcp_memory_pressure);
|
||||
}
|
||||
|
||||
int mptcp_set_rcvlowat(struct sock *sk, int val);
|
||||
|
||||
static inline bool __tcp_can_send(const struct sock *ssk)
|
||||
{
|
||||
/* only send if our side has not closed yet */
|
||||
@ -737,6 +757,7 @@ static inline bool mptcp_is_fully_established(struct sock *sk)
|
||||
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
|
||||
READ_ONCE(mptcp_sk(sk)->fully_established);
|
||||
}
|
||||
|
||||
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
|
||||
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
|
||||
bool mptcp_finish_join(struct sock *sk);
|
||||
@ -764,13 +785,52 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
|
||||
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
|
||||
}
|
||||
|
||||
static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
|
||||
static inline void __mptcp_sync_sndbuf(struct sock *sk)
|
||||
{
|
||||
if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf))
|
||||
return false;
|
||||
struct mptcp_subflow_context *subflow;
|
||||
int ssk_sndbuf, new_sndbuf;
|
||||
|
||||
WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
|
||||
return true;
|
||||
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
|
||||
return;
|
||||
|
||||
new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0];
|
||||
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
|
||||
ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);
|
||||
|
||||
subflow->cached_sndbuf = ssk_sndbuf;
|
||||
new_sndbuf += ssk_sndbuf;
|
||||
}
|
||||
|
||||
/* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
|
||||
WRITE_ONCE(sk->sk_sndbuf, new_sndbuf);
|
||||
}
|
||||
|
||||
/* The caller holds both the msk socket and the subflow socket locks,
|
||||
* possibly under BH
|
||||
*/
|
||||
static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
|
||||
|
||||
if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf)
|
||||
__mptcp_sync_sndbuf(sk);
|
||||
}
|
||||
|
||||
/* the caller holds only the subflow socket lock, either in process or
|
||||
* BH context. Additionally this can be called under the msk data lock,
|
||||
* so we can't acquire such lock here: let the delegate action acquire
|
||||
* the needed locks in suitable order.
|
||||
*/
|
||||
static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
|
||||
|
||||
if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf))
|
||||
return;
|
||||
|
||||
local_bh_disable();
|
||||
mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
|
||||
local_bh_enable();
|
||||
}
|
||||
|
||||
static inline void mptcp_write_space(struct sock *sk)
|
||||
|
@ -95,6 +95,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
|
||||
case SO_SNDBUFFORCE:
|
||||
ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
|
||||
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
|
||||
mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
|
||||
break;
|
||||
case SO_RCVBUF:
|
||||
case SO_RCVBUFFORCE:
|
||||
@ -1415,8 +1416,10 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
|
||||
|
||||
if (sk->sk_userlocks & tx_rx_locks) {
|
||||
ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
|
||||
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
|
||||
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
|
||||
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
|
||||
mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
|
||||
}
|
||||
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
|
||||
WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
|
||||
}
|
||||
@ -1444,37 +1447,63 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
|
||||
inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
|
||||
}
|
||||
|
||||
static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
|
||||
{
|
||||
bool slow = lock_sock_fast(ssk);
|
||||
|
||||
sync_socket_options(msk, ssk);
|
||||
|
||||
unlock_sock_fast(ssk, slow);
|
||||
}
|
||||
|
||||
void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
|
||||
|
||||
msk_owned_by_me(msk);
|
||||
|
||||
if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
|
||||
__mptcp_sockopt_sync(msk, ssk);
|
||||
|
||||
subflow->setsockopt_seq = msk->setsockopt_seq;
|
||||
}
|
||||
}
|
||||
|
||||
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
|
||||
|
||||
msk_owned_by_me(msk);
|
||||
|
||||
ssk->sk_rcvlowat = 0;
|
||||
|
||||
/* subflows must ignore any latency-related settings: will not affect
|
||||
* the user-space - only the msk is relevant - but will foul the
|
||||
* mptcp scheduler
|
||||
*/
|
||||
tcp_sk(ssk)->notsent_lowat = UINT_MAX;
|
||||
|
||||
if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
|
||||
sync_socket_options(msk, ssk);
|
||||
|
||||
subflow->setsockopt_seq = msk->setsockopt_seq;
|
||||
}
|
||||
}
|
||||
|
||||
/* unfortunately this is different enough from the tcp version so
|
||||
* that we can't factor it out
|
||||
*/
|
||||
int mptcp_set_rcvlowat(struct sock *sk, int val)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow;
|
||||
int space, cap;
|
||||
|
||||
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
|
||||
cap = sk->sk_rcvbuf >> 1;
|
||||
else
|
||||
cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
|
||||
val = min(val, cap);
|
||||
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
|
||||
|
||||
/* Check if we need to signal EPOLLIN right now */
|
||||
if (mptcp_epollin_ready(sk))
|
||||
sk->sk_data_ready(sk);
|
||||
|
||||
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
|
||||
return 0;
|
||||
|
||||
space = __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val);
|
||||
if (space <= sk->sk_rcvbuf)
|
||||
return 0;
|
||||
|
||||
/* propagate the rcvbuf changes to all the subflows */
|
||||
WRITE_ONCE(sk->sk_rcvbuf, space);
|
||||
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
bool slow;
|
||||
|
||||
slow = lock_sock_fast(ssk);
|
||||
WRITE_ONCE(ssk->sk_rcvbuf, space);
|
||||
tcp_sk(ssk)->window_clamp = val;
|
||||
unlock_sock_fast(ssk, slow);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -421,6 +421,7 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc
|
||||
|
||||
void __mptcp_set_connected(struct sock *sk)
|
||||
{
|
||||
__mptcp_propagate_sndbuf(sk, mptcp_sk(sk)->first);
|
||||
if (sk->sk_state == TCP_SYN_SENT) {
|
||||
inet_sk_state_store(sk, TCP_ESTABLISHED);
|
||||
sk->sk_state_change(sk);
|
||||
@ -472,7 +473,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
|
||||
return;
|
||||
|
||||
msk = mptcp_sk(parent);
|
||||
mptcp_propagate_sndbuf(parent, sk);
|
||||
subflow->rel_write_seq = 1;
|
||||
subflow->conn_finished = 1;
|
||||
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
|
||||
@ -1237,7 +1237,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
|
||||
struct sk_buff *skb;
|
||||
|
||||
if (!skb_peek(&ssk->sk_receive_queue))
|
||||
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
|
||||
WRITE_ONCE(subflow->data_avail, false);
|
||||
if (subflow->data_avail)
|
||||
return true;
|
||||
|
||||
@ -1271,7 +1271,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
|
||||
continue;
|
||||
}
|
||||
|
||||
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
|
||||
WRITE_ONCE(subflow->data_avail, true);
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
@ -1293,7 +1293,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
|
||||
goto reset;
|
||||
}
|
||||
mptcp_subflow_fail(msk, ssk);
|
||||
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
|
||||
WRITE_ONCE(subflow->data_avail, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1310,7 +1310,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
|
||||
while ((skb = skb_peek(&ssk->sk_receive_queue)))
|
||||
sk_eat_skb(ssk, skb);
|
||||
tcp_send_active_reset(ssk, GFP_ATOMIC);
|
||||
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
|
||||
WRITE_ONCE(subflow->data_avail, false);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1322,7 +1322,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
|
||||
subflow->map_seq = READ_ONCE(msk->ack_seq);
|
||||
subflow->map_data_len = skb->len;
|
||||
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
|
||||
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
|
||||
WRITE_ONCE(subflow->data_avail, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1334,7 +1334,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
|
||||
if (subflow->map_valid &&
|
||||
mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
|
||||
subflow->map_valid = 0;
|
||||
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
|
||||
WRITE_ONCE(subflow->data_avail, false);
|
||||
|
||||
pr_debug("Done with mapping: seq=%u data_len=%u",
|
||||
subflow->map_subflow_seq,
|
||||
@ -1405,10 +1405,18 @@ static void subflow_data_ready(struct sock *sk)
|
||||
WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
|
||||
!subflow->mp_join && !(state & TCPF_CLOSE));
|
||||
|
||||
if (mptcp_subflow_data_available(sk))
|
||||
if (mptcp_subflow_data_available(sk)) {
|
||||
mptcp_data_ready(parent, sk);
|
||||
else if (unlikely(sk->sk_err))
|
||||
|
||||
/* subflow-level lowat tests are not relevant.
|
||||
* respect the msk-level threshold eventually mandating an immediate ack
|
||||
*/
|
||||
if (mptcp_data_avail(msk) < parent->sk_rcvlowat &&
|
||||
(tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss)
|
||||
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
|
||||
} else if (unlikely(sk->sk_err)) {
|
||||
subflow_error_report(sk);
|
||||
}
|
||||
}
|
||||
|
||||
static void subflow_write_space(struct sock *ssk)
|
||||
@ -1525,8 +1533,6 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
|
||||
if (addr.ss_family == AF_INET6)
|
||||
addrlen = sizeof(struct sockaddr_in6);
|
||||
#endif
|
||||
mptcp_sockopt_sync(msk, ssk);
|
||||
|
||||
ssk->sk_bound_dev_if = ifindex;
|
||||
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
|
||||
if (err)
|
||||
@ -1637,7 +1643,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
|
||||
|
||||
err = security_mptcp_add_subflow(sk, sf->sk);
|
||||
if (err)
|
||||
goto release_ssk;
|
||||
goto err_free;
|
||||
|
||||
/* the newly created socket has to be in the same cgroup as its parent */
|
||||
mptcp_attach_cgroup(sk, sf->sk);
|
||||
@ -1651,15 +1657,12 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
|
||||
get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
|
||||
sock_inuse_add(net, 1);
|
||||
err = tcp_set_ulp(sf->sk, "mptcp");
|
||||
if (err)
|
||||
goto err_free;
|
||||
|
||||
release_ssk:
|
||||
mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk);
|
||||
release_sock(sf->sk);
|
||||
|
||||
if (err) {
|
||||
sock_release(sf);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* the newly created socket really belongs to the owning MPTCP master
|
||||
* socket, even if for additional subflows the allocation is performed
|
||||
* by a kernel workqueue. Adjust inode references, so that the
|
||||
@ -1679,6 +1682,11 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
|
||||
mptcp_subflow_ops_override(sf->sk);
|
||||
|
||||
return 0;
|
||||
|
||||
err_free:
|
||||
release_sock(sf->sk);
|
||||
sock_release(sf);
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
|
||||
@ -1728,7 +1736,6 @@ static void subflow_state_change(struct sock *sk)
|
||||
|
||||
msk = mptcp_sk(parent);
|
||||
if (subflow_simultaneous_connect(sk)) {
|
||||
mptcp_propagate_sndbuf(parent, sk);
|
||||
mptcp_do_fallback(sk);
|
||||
mptcp_rcv_space_init(msk, sk);
|
||||
pr_fallback(msk);
|
||||
|
Loading…
Reference in New Issue
Block a user