mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-16 13:34:30 +00:00
Merge branch 'mptcp-stalled-connections-fix'
Matthieu Baerts says: ==================== mptcp: fix stalled connections Daire reported a few issues with MPTCP where some connections were stalled in different states. Paolo did a great job fixing them. Patch 1 fixes bogus receive window shrinkage with multiple subflows. Due to a race condition and unlucky circumstances, that may lead to TCP-level window shrinkage, and the connection being stalled on the sender end. Patch 2 is a preparation for patch 3 which processes pending subflow errors on close. Without that and under specific circumstances, the MPTCP-level socket might not switch to the CLOSE state and stall. Patch 4 is also a preparation patch for the next one. Patch 5 fixes MPTCP connections not switching to the CLOSE state when all subflows have been closed but no DATA_FIN have been exchanged to explicitly close the MPTCP connection. Now connections in such state will switch to the CLOSE state after a timeout, still allowing the "make-after-break" feature but making sure connections don't stall forever. It will be possible to modify this timeout -- currently matching TCP TIMEWAIT value (60 seconds) -- in a future version. ==================== Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
This commit is contained in:
commit
972983fc32
@ -1269,12 +1269,13 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
|
||||
|
||||
if (rcv_wnd == rcv_wnd_old)
|
||||
break;
|
||||
if (before64(rcv_wnd_new, rcv_wnd)) {
|
||||
|
||||
rcv_wnd_old = rcv_wnd;
|
||||
if (before64(rcv_wnd_new, rcv_wnd_old)) {
|
||||
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
|
||||
goto raise_win;
|
||||
}
|
||||
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
|
||||
rcv_wnd_old = rcv_wnd;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -405,7 +405,7 @@ drop:
|
||||
return false;
|
||||
}
|
||||
|
||||
static void mptcp_stop_timer(struct sock *sk)
|
||||
static void mptcp_stop_rtx_timer(struct sock *sk)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
|
||||
@ -770,6 +770,46 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
|
||||
return moved;
|
||||
}
|
||||
|
||||
static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
|
||||
{
|
||||
int err = sock_error(ssk);
|
||||
int ssk_state;
|
||||
|
||||
if (!err)
|
||||
return false;
|
||||
|
||||
/* only propagate errors on fallen-back sockets or
|
||||
* on MPC connect
|
||||
*/
|
||||
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
|
||||
return false;
|
||||
|
||||
/* We need to propagate only transition to CLOSE state.
|
||||
* Orphaned socket will see such state change via
|
||||
* subflow_sched_work_if_closed() and that path will properly
|
||||
* destroy the msk as needed.
|
||||
*/
|
||||
ssk_state = inet_sk_state_load(ssk);
|
||||
if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
|
||||
inet_sk_state_store(sk, ssk_state);
|
||||
WRITE_ONCE(sk->sk_err, -err);
|
||||
|
||||
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
|
||||
smp_wmb();
|
||||
sk_error_report(sk);
|
||||
return true;
|
||||
}
|
||||
|
||||
void __mptcp_error_report(struct sock *sk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow;
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
|
||||
mptcp_for_each_subflow(msk, subflow)
|
||||
if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow)))
|
||||
break;
|
||||
}
|
||||
|
||||
/* In most cases we will be able to lock the mptcp socket. If its already
|
||||
* owned, we need to defer to the work queue to avoid ABBA deadlock.
|
||||
*/
|
||||
@ -852,6 +892,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
|
||||
mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
|
||||
mptcp_sockopt_sync_locked(msk, ssk);
|
||||
mptcp_subflow_joined(msk, ssk);
|
||||
mptcp_stop_tout_timer(sk);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -871,12 +912,12 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list
|
||||
}
|
||||
}
|
||||
|
||||
static bool mptcp_timer_pending(struct sock *sk)
|
||||
static bool mptcp_rtx_timer_pending(struct sock *sk)
|
||||
{
|
||||
return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
|
||||
}
|
||||
|
||||
static void mptcp_reset_timer(struct sock *sk)
|
||||
static void mptcp_reset_rtx_timer(struct sock *sk)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
unsigned long tout;
|
||||
@ -1010,10 +1051,10 @@ static void __mptcp_clean_una(struct sock *sk)
|
||||
out:
|
||||
if (snd_una == READ_ONCE(msk->snd_nxt) &&
|
||||
snd_una == READ_ONCE(msk->write_seq)) {
|
||||
if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
|
||||
mptcp_stop_timer(sk);
|
||||
if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
|
||||
mptcp_stop_rtx_timer(sk);
|
||||
} else {
|
||||
mptcp_reset_timer(sk);
|
||||
mptcp_reset_rtx_timer(sk);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1586,8 +1627,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
|
||||
mptcp_push_release(ssk, &info);
|
||||
|
||||
/* ensure the rtx timer is running */
|
||||
if (!mptcp_timer_pending(sk))
|
||||
mptcp_reset_timer(sk);
|
||||
if (!mptcp_rtx_timer_pending(sk))
|
||||
mptcp_reset_rtx_timer(sk);
|
||||
if (do_check_data_fin)
|
||||
mptcp_check_send_data_fin(sk);
|
||||
}
|
||||
@ -1650,8 +1691,8 @@ out:
|
||||
if (copied) {
|
||||
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
|
||||
info.size_goal);
|
||||
if (!mptcp_timer_pending(sk))
|
||||
mptcp_reset_timer(sk);
|
||||
if (!mptcp_rtx_timer_pending(sk))
|
||||
mptcp_reset_rtx_timer(sk);
|
||||
|
||||
if (msk->snd_data_fin_enable &&
|
||||
msk->snd_nxt + 1 == msk->write_seq)
|
||||
@ -2220,7 +2261,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
|
||||
sock_put(sk);
|
||||
}
|
||||
|
||||
static void mptcp_timeout_timer(struct timer_list *t)
|
||||
static void mptcp_tout_timer(struct timer_list *t)
|
||||
{
|
||||
struct sock *sk = from_timer(sk, t, sk_timer);
|
||||
|
||||
@ -2329,18 +2370,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
|
||||
bool dispose_it, need_push = false;
|
||||
|
||||
/* If the first subflow moved to a close state before accept, e.g. due
|
||||
* to an incoming reset, mptcp either:
|
||||
* - if either the subflow or the msk are dead, destroy the context
|
||||
* (the subflow socket is deleted by inet_child_forget) and the msk
|
||||
* - otherwise do nothing at the moment and take action at accept and/or
|
||||
* listener shutdown - user-space must be able to accept() the closed
|
||||
* socket.
|
||||
* to an incoming reset or listener shutdown, the subflow socket is
|
||||
* already deleted by inet_child_forget() and the mptcp socket can't
|
||||
* survive too.
|
||||
*/
|
||||
if (msk->in_accept_queue && msk->first == ssk) {
|
||||
if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD))
|
||||
return;
|
||||
|
||||
if (msk->in_accept_queue && msk->first == ssk &&
|
||||
(sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
|
||||
/* ensure later check in mptcp_worker() will dispose the msk */
|
||||
mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
|
||||
sock_set_flag(sk, SOCK_DEAD);
|
||||
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
|
||||
mptcp_subflow_drop_ctx(ssk);
|
||||
@ -2392,6 +2429,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
|
||||
}
|
||||
|
||||
out_release:
|
||||
__mptcp_subflow_error_report(sk, ssk);
|
||||
release_sock(ssk);
|
||||
|
||||
sock_put(ssk);
|
||||
@ -2402,6 +2440,22 @@ out_release:
|
||||
out:
|
||||
if (need_push)
|
||||
__mptcp_push_pending(sk, 0);
|
||||
|
||||
/* Catch every 'all subflows closed' scenario, including peers silently
|
||||
* closing them, e.g. due to timeout.
|
||||
* For established sockets, allow an additional timeout before closing,
|
||||
* as the protocol can still create more subflows.
|
||||
*/
|
||||
if (list_is_singular(&msk->conn_list) && msk->first &&
|
||||
inet_sk_state_load(msk->first) == TCP_CLOSE) {
|
||||
if (sk->sk_state != TCP_ESTABLISHED ||
|
||||
msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
|
||||
inet_sk_state_store(sk, TCP_CLOSE);
|
||||
mptcp_close_wake_up(sk);
|
||||
} else {
|
||||
mptcp_start_tout_timer(sk);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
|
||||
@ -2445,23 +2499,14 @@ static void __mptcp_close_subflow(struct sock *sk)
|
||||
|
||||
}
|
||||
|
||||
static bool mptcp_should_close(const struct sock *sk)
|
||||
static bool mptcp_close_tout_expired(const struct sock *sk)
|
||||
{
|
||||
s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
|
||||
struct mptcp_subflow_context *subflow;
|
||||
|
||||
if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue)
|
||||
return true;
|
||||
|
||||
/* if all subflows are in closed status don't bother with additional
|
||||
* timeout
|
||||
*/
|
||||
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
|
||||
if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
|
||||
TCP_CLOSE)
|
||||
if (!inet_csk(sk)->icsk_mtup.probe_timestamp ||
|
||||
sk->sk_state == TCP_CLOSE)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
||||
return time_after32(tcp_jiffies32,
|
||||
inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
|
||||
}
|
||||
|
||||
static void mptcp_check_fastclose(struct mptcp_sock *msk)
|
||||
@ -2588,27 +2633,28 @@ static void __mptcp_retrans(struct sock *sk)
|
||||
reset_timer:
|
||||
mptcp_check_and_set_pending(sk);
|
||||
|
||||
if (!mptcp_timer_pending(sk))
|
||||
mptcp_reset_timer(sk);
|
||||
if (!mptcp_rtx_timer_pending(sk))
|
||||
mptcp_reset_rtx_timer(sk);
|
||||
}
|
||||
|
||||
/* schedule the timeout timer for the relevant event: either close timeout
|
||||
* or mp_fail timeout. The close timeout takes precedence on the mp_fail one
|
||||
*/
|
||||
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
|
||||
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
|
||||
{
|
||||
struct sock *sk = (struct sock *)msk;
|
||||
unsigned long timeout, close_timeout;
|
||||
|
||||
if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
|
||||
if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
|
||||
return;
|
||||
|
||||
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
|
||||
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
|
||||
TCP_TIMEWAIT_LEN;
|
||||
|
||||
/* the close timeout takes precedence on the fail one, and here at least one of
|
||||
* them is active
|
||||
*/
|
||||
timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
|
||||
timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;
|
||||
|
||||
sk_reset_timer(sk, &sk->sk_timer, timeout);
|
||||
}
|
||||
@ -2627,8 +2673,6 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
|
||||
mptcp_subflow_reset(ssk);
|
||||
WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
|
||||
unlock_sock_fast(ssk, slow);
|
||||
|
||||
mptcp_reset_timeout(msk, 0);
|
||||
}
|
||||
|
||||
static void mptcp_do_fastclose(struct sock *sk)
|
||||
@ -2665,19 +2709,15 @@ static void mptcp_worker(struct work_struct *work)
|
||||
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
|
||||
__mptcp_close_subflow(sk);
|
||||
|
||||
/* There is no point in keeping around an orphaned sk timedout or
|
||||
* closed, but we need the msk around to reply to incoming DATA_FIN,
|
||||
* even if it is orphaned and in FIN_WAIT2 state
|
||||
*/
|
||||
if (sock_flag(sk, SOCK_DEAD)) {
|
||||
if (mptcp_should_close(sk))
|
||||
if (mptcp_close_tout_expired(sk)) {
|
||||
mptcp_do_fastclose(sk);
|
||||
mptcp_close_wake_up(sk);
|
||||
}
|
||||
|
||||
if (sk->sk_state == TCP_CLOSE) {
|
||||
if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) {
|
||||
__mptcp_destroy_sock(sk);
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
|
||||
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
|
||||
__mptcp_retrans(sk);
|
||||
@ -2717,7 +2757,7 @@ static void __mptcp_init_sock(struct sock *sk)
|
||||
|
||||
/* re-use the csk retrans timer for MPTCP-level retrans */
|
||||
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
|
||||
timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
|
||||
timer_setup(&sk->sk_timer, mptcp_tout_timer, 0);
|
||||
}
|
||||
|
||||
static void mptcp_ca_reset(struct sock *sk)
|
||||
@ -2808,8 +2848,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
|
||||
} else {
|
||||
pr_debug("Sending DATA_FIN on subflow %p", ssk);
|
||||
tcp_send_ack(ssk);
|
||||
if (!mptcp_timer_pending(sk))
|
||||
mptcp_reset_timer(sk);
|
||||
if (!mptcp_rtx_timer_pending(sk))
|
||||
mptcp_reset_rtx_timer(sk);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -2892,7 +2932,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
|
||||
|
||||
might_sleep();
|
||||
|
||||
mptcp_stop_timer(sk);
|
||||
mptcp_stop_rtx_timer(sk);
|
||||
sk_stop_timer(sk, &sk->sk_timer);
|
||||
msk->pm.status = 0;
|
||||
mptcp_release_sched(msk);
|
||||
@ -2975,7 +3015,6 @@ bool __mptcp_close(struct sock *sk, long timeout)
|
||||
|
||||
cleanup:
|
||||
/* orphan all the subflows */
|
||||
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
bool slow = lock_sock_fast_nested(ssk);
|
||||
@ -3012,7 +3051,7 @@ cleanup:
|
||||
__mptcp_destroy_sock(sk);
|
||||
do_cancel_work = true;
|
||||
} else {
|
||||
mptcp_reset_timeout(msk, 0);
|
||||
mptcp_start_tout_timer(sk);
|
||||
}
|
||||
|
||||
return do_cancel_work;
|
||||
@ -3075,8 +3114,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
|
||||
mptcp_check_listen_stop(sk);
|
||||
inet_sk_state_store(sk, TCP_CLOSE);
|
||||
|
||||
mptcp_stop_timer(sk);
|
||||
sk_stop_timer(sk, &sk->sk_timer);
|
||||
mptcp_stop_rtx_timer(sk);
|
||||
mptcp_stop_tout_timer(sk);
|
||||
|
||||
if (msk->token)
|
||||
mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
|
||||
|
@ -718,7 +718,29 @@ void mptcp_get_options(const struct sk_buff *skb,
|
||||
|
||||
void mptcp_finish_connect(struct sock *sk);
|
||||
void __mptcp_set_connected(struct sock *sk);
|
||||
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
|
||||
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);
|
||||
|
||||
static inline void mptcp_stop_tout_timer(struct sock *sk)
|
||||
{
|
||||
if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
|
||||
return;
|
||||
|
||||
sk_stop_timer(sk, &sk->sk_timer);
|
||||
inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
|
||||
}
|
||||
|
||||
static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
|
||||
{
|
||||
/* avoid 0 timestamp, as that means no close timeout */
|
||||
inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1;
|
||||
}
|
||||
|
||||
static inline void mptcp_start_tout_timer(struct sock *sk)
|
||||
{
|
||||
mptcp_set_close_tout(sk, tcp_jiffies32);
|
||||
mptcp_reset_tout_timer(mptcp_sk(sk), 0);
|
||||
}
|
||||
|
||||
static inline bool mptcp_is_fully_established(struct sock *sk)
|
||||
{
|
||||
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
|
||||
|
@ -1226,7 +1226,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
|
||||
WRITE_ONCE(subflow->fail_tout, fail_tout);
|
||||
tcp_send_ack(ssk);
|
||||
|
||||
mptcp_reset_timeout(msk, subflow->fail_tout);
|
||||
mptcp_reset_tout_timer(msk, subflow->fail_tout);
|
||||
}
|
||||
|
||||
static bool subflow_check_data_avail(struct sock *ssk)
|
||||
@ -1362,42 +1362,6 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
|
||||
*full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
|
||||
}
|
||||
|
||||
void __mptcp_error_report(struct sock *sk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow;
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
int err = sock_error(ssk);
|
||||
int ssk_state;
|
||||
|
||||
if (!err)
|
||||
continue;
|
||||
|
||||
/* only propagate errors on fallen-back sockets or
|
||||
* on MPC connect
|
||||
*/
|
||||
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
|
||||
continue;
|
||||
|
||||
/* We need to propagate only transition to CLOSE state.
|
||||
* Orphaned socket will see such state change via
|
||||
* subflow_sched_work_if_closed() and that path will properly
|
||||
* destroy the msk as needed.
|
||||
*/
|
||||
ssk_state = inet_sk_state_load(ssk);
|
||||
if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
|
||||
inet_sk_state_store(sk, ssk_state);
|
||||
WRITE_ONCE(sk->sk_err, -err);
|
||||
|
||||
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
|
||||
smp_wmb();
|
||||
sk_error_report(sk);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void subflow_error_report(struct sock *ssk)
|
||||
{
|
||||
struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
|
||||
@ -1588,6 +1552,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
|
||||
mptcp_sock_graft(ssk, sk->sk_socket);
|
||||
iput(SOCK_INODE(sf));
|
||||
WRITE_ONCE(msk->allow_infinite_fallback, false);
|
||||
mptcp_stop_tout_timer(sk);
|
||||
return 0;
|
||||
|
||||
failed_unlink:
|
||||
|
Loading…
x
Reference in New Issue
Block a user