2020-01-21 16:56:15 -08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/* Multipath TCP
|
|
|
|
*
|
|
|
|
* Copyright (c) 2017 - 2019, Intel Corporation.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define pr_fmt(fmt) "MPTCP: " fmt
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/netdevice.h>
|
2020-01-21 16:56:26 -08:00
|
|
|
#include <linux/sched/signal.h>
|
|
|
|
#include <linux/atomic.h>
|
2020-01-21 16:56:15 -08:00
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/inet_common.h>
|
|
|
|
#include <net/inet_hashtables.h>
|
|
|
|
#include <net/protocol.h>
|
|
|
|
#include <net/tcp.h>
|
2020-01-21 16:56:19 -08:00
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
|
|
|
|
#include <net/transp_v6.h>
|
|
|
|
#endif
|
2020-01-21 16:56:15 -08:00
|
|
|
#include <net/mptcp.h>
|
|
|
|
#include "protocol.h"
|
|
|
|
|
2020-01-21 16:56:17 -08:00
|
|
|
#define MPTCP_SAME_STATE TCP_MAX_STATES
|
|
|
|
|
2020-02-06 00:39:37 +01:00
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
|
|
|
|
struct mptcp6_sock {
|
|
|
|
struct mptcp_sock msk;
|
|
|
|
struct ipv6_pinfo np;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
2020-01-21 16:56:17 -08:00
|
|
|
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
|
|
|
|
* completed yet or has failed, return the subflow socket.
|
|
|
|
* Otherwise return NULL.
|
|
|
|
*/
|
|
|
|
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
|
|
|
|
{
|
2020-01-21 16:56:32 -08:00
|
|
|
if (!msk->subflow || READ_ONCE(msk->can_ack))
|
2020-01-21 16:56:17 -08:00
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return msk->subflow;
|
|
|
|
}
|
|
|
|
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
|
|
|
|
{
|
|
|
|
return msk->first && !sk_is_mptcp(msk->first);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
|
2020-01-21 16:56:18 -08:00
|
|
|
{
|
|
|
|
sock_owned_by_me((const struct sock *)msk);
|
|
|
|
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
if (likely(!__mptcp_needs_tcp_fallback(msk)))
|
2020-01-21 16:56:18 -08:00
|
|
|
return NULL;
|
|
|
|
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
if (msk->subflow) {
|
2020-02-04 18:12:30 +01:00
|
|
|
release_sock((struct sock *)msk);
|
|
|
|
return msk->subflow;
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
}
|
|
|
|
|
2020-02-04 18:12:30 +01:00
|
|
|
return NULL;
|
2020-01-21 16:56:18 -08:00
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:17 -08:00
|
|
|
static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
|
|
|
|
{
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
return !msk->first;
|
2020-01-21 16:56:17 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
|
|
|
|
{
|
|
|
|
struct mptcp_subflow_context *subflow;
|
|
|
|
struct sock *sk = (struct sock *)msk;
|
|
|
|
struct socket *ssock;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
ssock = __mptcp_nmpc_socket(msk);
|
|
|
|
if (ssock)
|
|
|
|
goto set_state;
|
|
|
|
|
|
|
|
if (!__mptcp_can_create_subflow(msk))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
err = mptcp_subflow_create_socket(sk, &ssock);
|
|
|
|
if (err)
|
|
|
|
return ERR_PTR(err);
|
|
|
|
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
msk->first = ssock->sk;
|
2020-01-21 16:56:17 -08:00
|
|
|
msk->subflow = ssock;
|
|
|
|
subflow = mptcp_subflow_ctx(ssock->sk);
|
2020-01-21 16:56:18 -08:00
|
|
|
list_add(&subflow->node, &msk->conn_list);
|
2020-01-21 16:56:17 -08:00
|
|
|
subflow->request_mptcp = 1;
|
|
|
|
|
|
|
|
set_state:
|
|
|
|
if (state != MPTCP_SAME_STATE)
|
|
|
|
inet_sk_state_store(sk, state);
|
|
|
|
return ssock;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
|
|
|
|
{
|
|
|
|
struct mptcp_subflow_context *subflow;
|
|
|
|
|
|
|
|
sock_owned_by_me((const struct sock *)msk);
|
|
|
|
|
|
|
|
mptcp_for_each_subflow(msk, subflow) {
|
|
|
|
return mptcp_subflow_tcp_sock(subflow);
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-02-26 10:14:46 +01:00
|
|
|
void mptcp_data_ready(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
|
|
|
|
set_bit(MPTCP_DATA_READY, &msk->flags);
|
|
|
|
sk->sk_data_ready(sk);
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:23 -08:00
|
|
|
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
|
|
|
|
{
|
|
|
|
if (!msk->cached_ext)
|
|
|
|
msk->cached_ext = __skb_ext_alloc();
|
|
|
|
|
|
|
|
return !!msk->cached_ext;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:26 -08:00
|
|
|
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
|
|
|
|
{
|
|
|
|
struct mptcp_subflow_context *subflow;
|
|
|
|
struct sock *sk = (struct sock *)msk;
|
|
|
|
|
|
|
|
sock_owned_by_me(sk);
|
|
|
|
|
|
|
|
mptcp_for_each_subflow(msk, subflow) {
|
|
|
|
if (subflow->data_avail)
|
|
|
|
return mptcp_subflow_tcp_sock(subflow);
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:27 -08:00
|
|
|
static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
|
|
|
|
const struct sk_buff *skb,
|
|
|
|
const struct mptcp_ext *mpext)
|
|
|
|
{
|
|
|
|
if (!tcp_skb_can_collapse_to(skb))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* can collapse only if MPTCP level sequence is in order */
|
|
|
|
return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:23 -08:00
|
|
|
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
|
2020-01-21 16:56:27 -08:00
|
|
|
struct msghdr *msg, long *timeo, int *pmss_now,
|
|
|
|
int *ps_goal)
|
2020-01-21 16:56:23 -08:00
|
|
|
{
|
2020-01-21 16:56:27 -08:00
|
|
|
int mss_now, avail_size, size_goal, ret;
|
2020-01-21 16:56:23 -08:00
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
struct mptcp_ext *mpext = NULL;
|
2020-01-21 16:56:27 -08:00
|
|
|
struct sk_buff *skb, *tail;
|
|
|
|
bool can_collapse = false;
|
2020-01-21 16:56:23 -08:00
|
|
|
struct page_frag *pfrag;
|
|
|
|
size_t psize;
|
|
|
|
|
|
|
|
/* use the mptcp page cache so that we can easily move the data
|
|
|
|
* from one substream to another, but do per subflow memory accounting
|
|
|
|
*/
|
|
|
|
pfrag = sk_page_frag(sk);
|
|
|
|
while (!sk_page_frag_refill(ssk, pfrag) ||
|
|
|
|
!mptcp_ext_cache_refill(msk)) {
|
|
|
|
ret = sk_stream_wait_memory(ssk, timeo);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
if (unlikely(__mptcp_needs_tcp_fallback(msk)))
|
|
|
|
return 0;
|
2020-01-21 16:56:23 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* compute copy limit */
|
|
|
|
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
|
2020-01-21 16:56:27 -08:00
|
|
|
*pmss_now = mss_now;
|
|
|
|
*ps_goal = size_goal;
|
|
|
|
avail_size = size_goal;
|
|
|
|
skb = tcp_write_queue_tail(ssk);
|
|
|
|
if (skb) {
|
|
|
|
mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
|
|
|
|
|
|
|
|
/* Limit the write to the size available in the
|
|
|
|
* current skb, if any, so that we create at most a new skb.
|
|
|
|
* Explicitly tells TCP internals to avoid collapsing on later
|
|
|
|
* queue management operation, to avoid breaking the ext <->
|
|
|
|
* SSN association set here
|
|
|
|
*/
|
|
|
|
can_collapse = (size_goal - skb->len > 0) &&
|
|
|
|
mptcp_skb_can_collapse_to(msk, skb, mpext);
|
|
|
|
if (!can_collapse)
|
|
|
|
TCP_SKB_CB(skb)->eor = 1;
|
|
|
|
else
|
|
|
|
avail_size = size_goal - skb->len;
|
|
|
|
}
|
|
|
|
psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
|
2020-01-21 16:56:23 -08:00
|
|
|
|
2020-01-21 16:56:27 -08:00
|
|
|
/* Copy to page */
|
2020-01-21 16:56:23 -08:00
|
|
|
pr_debug("left=%zu", msg_data_left(msg));
|
|
|
|
psize = copy_page_from_iter(pfrag->page, pfrag->offset,
|
|
|
|
min_t(size_t, msg_data_left(msg), psize),
|
|
|
|
&msg->msg_iter);
|
|
|
|
pr_debug("left=%zu", msg_data_left(msg));
|
|
|
|
if (!psize)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2020-01-21 16:56:27 -08:00
|
|
|
/* tell the TCP stack to delay the push so that we can safely
|
|
|
|
* access the skb after the sendpages call
|
2020-01-21 16:56:23 -08:00
|
|
|
*/
|
|
|
|
ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
|
|
|
|
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
if (unlikely(ret < psize))
|
|
|
|
iov_iter_revert(&msg->msg_iter, psize - ret);
|
|
|
|
|
2020-01-21 16:56:27 -08:00
|
|
|
/* if the tail skb extension is still the cached one, collapsing
|
|
|
|
* really happened. Note: we can't check for 'same skb' as the sk_buff
|
|
|
|
* hdr on tail can be transmitted, freed and re-allocated by the
|
|
|
|
* do_tcp_sendpages() call
|
|
|
|
*/
|
|
|
|
tail = tcp_write_queue_tail(ssk);
|
|
|
|
if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
|
|
|
|
WARN_ON_ONCE(!can_collapse);
|
|
|
|
mpext->data_len += ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:23 -08:00
|
|
|
skb = tcp_write_queue_tail(ssk);
|
|
|
|
mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
|
|
|
|
msk->cached_ext = NULL;
|
|
|
|
|
|
|
|
memset(mpext, 0, sizeof(*mpext));
|
|
|
|
mpext->data_seq = msk->write_seq;
|
|
|
|
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
|
|
|
|
mpext->data_len = ret;
|
|
|
|
mpext->use_map = 1;
|
|
|
|
mpext->dsn64 = 1;
|
|
|
|
|
|
|
|
pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
|
|
|
|
mpext->data_seq, mpext->subflow_seq, mpext->data_len,
|
|
|
|
mpext->dsn64);
|
|
|
|
|
2020-01-21 16:56:27 -08:00
|
|
|
out:
|
2020-01-21 16:56:23 -08:00
|
|
|
pfrag->offset += ret;
|
|
|
|
msk->write_seq += ret;
|
|
|
|
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:25 -08:00
|
|
|
static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
|
|
|
|
{
|
|
|
|
struct socket *sock;
|
|
|
|
|
|
|
|
if (likely(sk_stream_is_writeable(ssk)))
|
|
|
|
return;
|
|
|
|
|
|
|
|
sock = READ_ONCE(ssk->sk_socket);
|
|
|
|
|
|
|
|
if (sock) {
|
|
|
|
clear_bit(MPTCP_SEND_SPACE, &msk->flags);
|
|
|
|
smp_mb__after_atomic();
|
|
|
|
/* set NOSPACE only after clearing SEND_SPACE flag */
|
|
|
|
set_bit(SOCK_NOSPACE, &sock->flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
/* sendmsg() implementation for MPTCP sockets.
 *
 * @sk:  the MPTCP socket
 * @msg: user message; only MSG_MORE, MSG_DONTWAIT and MSG_NOSIGNAL are
 *       accepted in msg_flags
 * @len: unused here, the amount left is taken from @msg
 *
 * When a TCP fallback socket is available the message is passed through
 * to it with sock_sendmsg(); otherwise data is queued on the subflow
 * returned by mptcp_subflow_get().
 *
 * Returns the number of bytes queued/sent, or a negative error:
 * -EOPNOTSUPP for unsupported flags, -ENOTCONN when no subflow exists, or
 * the error reported by the lower layers when nothing was copied.
 */
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int mss_now = 0, size_goal = 0, ret = 0;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;
	size_t copied = 0;
	struct sock *ssk;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	lock_sock(sk);
	ssock = __mptcp_tcp_fallback(msk);
	if (unlikely(ssock)) {
fallback:
		/* the msk lock was released by __mptcp_tcp_fallback() */
		pr_debug("fallback passthrough");
		ret = sock_sendmsg(ssock, msg);
		/* report what was already queued before the fallback, too */
		return ret >= 0 ? ret + copied : (copied ? copied : ret);
	}

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	ssk = mptcp_subflow_get(msk);
	if (!ssk) {
		release_sock(sk);
		return -ENOTCONN;
	}

	pr_debug("conn_list->subflow=%p", ssk);

	lock_sock(ssk);
	while (msg_data_left(msg)) {
		ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
					 &size_goal);
		if (ret < 0)
			break;

		/* A fallback was detected while waiting for memory. Switch
		 * to passthrough only when a fallback socket really exists:
		 * __mptcp_tcp_fallback() returns NULL (keeping the msk lock
		 * held) when msk->subflow is not set, and jumping to the
		 * passthrough path with a NULL socket would dereference it.
		 * In that case keep sending on the subflow.
		 */
		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk)) &&
		    msk->subflow) {
			release_sock(ssk);
			ssock = __mptcp_tcp_fallback(msk);
			goto fallback;
		}

		copied += ret;
	}

	if (copied) {
		ret = copied;
		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);
	}

	ssk_check_wmem(msk, ssk);
	release_sock(ssk);
	release_sock(sk);
	return ret;
}
|
|
|
|
|
2020-01-21 16:56:24 -08:00
|
|
|
/* Read actor: consume up to desc->count bytes of @skb starting at @offset.
 *
 * desc->arg.data points to a struct mptcp_read_arg. When its msg is set
 * the payload is copied there; otherwise the bytes are only accounted as
 * consumed (payload flush).
 *
 * Returns the number of bytes consumed, or the negative error reported by
 * skb_copy_datagram_msg(), which is also stored in desc->error.
 */
int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
		     unsigned int offset, size_t len)
{
	struct mptcp_read_arg *arg = desc->arg.data;
	size_t chunk = min(desc->count, len);

	if (unlikely(!arg->msg)) {
		pr_debug("Flushing skb payload");
	} else {
		int err = skb_copy_datagram_msg(skb, offset, arg->msg, chunk);

		if (err) {
			pr_debug("error path");
			desc->error = err;
			return err;
		}
	}

	desc->count -= chunk;

	pr_debug("consumed %zu bytes, %zu left", chunk, desc->count);
	return chunk;
}
|
|
|
|
|
2020-01-21 16:56:26 -08:00
|
|
|
static void mptcp_wait_data(struct sock *sk, long *timeo)
|
|
|
|
{
|
|
|
|
DEFINE_WAIT_FUNC(wait, woken_wake_function);
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
|
|
|
|
add_wait_queue(sk_sleep(sk), &wait);
|
|
|
|
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
|
|
|
|
|
|
|
|
sk_wait_event(sk, timeo,
|
|
|
|
test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
|
|
|
|
|
|
|
|
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
|
|
|
|
remove_wait_queue(sk_sleep(sk), &wait);
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
/* recvmsg() implementation for MPTCP sockets.
 *
 * @sk:       the MPTCP socket
 * @msg:      destination message; only MSG_WAITALL and MSG_DONTWAIT are
 *            accepted in msg_flags
 * @len:      maximum number of bytes to read (clamped to INT_MAX)
 * @nonblock: non-zero for a non blocking read
 * @flags:    receive flags
 * @addr_len: unused here
 *
 * When a TCP fallback socket is available the read is passed through to it
 * with sock_recvmsg(). Otherwise data is pulled from the subflows having
 * data available, one mapping at a time, via tcp_read_sock().
 *
 * Returns the number of bytes copied or a negative error.
 */
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	bool more_data_avail = false;
	struct mptcp_read_arg arg;
	read_descriptor_t desc;
	bool wait_data = false;
	struct socket *ssock;
	struct tcp_sock *tp;
	bool done = false;
	struct sock *ssk;
	int copied = 0;
	int target;
	long timeo;

	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	lock_sock(sk);
	ssock = __mptcp_tcp_fallback(msk);
	if (unlikely(ssock)) {
fallback:
		/* the msk lock was released by __mptcp_tcp_fallback() */
		pr_debug("fallback-read subflow=%p",
			 mptcp_subflow_ctx(ssock->sk));
		copied = sock_recvmsg(ssock, msg, flags);
		return copied;
	}

	arg.msg = msg;
	desc.arg.data = &arg;
	desc.error = 0;

	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	while (!done) {
		u32 map_remaining;
		int bytes_read;

		ssk = mptcp_subflow_recv_lookup(msk);
		pr_debug("msk=%p ssk=%p", msk, ssk);
		if (!ssk)
			goto wait_for_data;

		subflow = mptcp_subflow_ctx(ssk);
		tp = tcp_sk(ssk);

		lock_sock(ssk);
		do {
			/* try to read as much data as available */
			map_remaining = subflow->map_data_len -
					mptcp_subflow_get_map_offset(subflow);
			desc.count = min_t(size_t, len - copied, map_remaining);
			pr_debug("reading %zu bytes, copied %d", desc.count,
				 copied);
			bytes_read = tcp_read_sock(ssk, &desc,
						   mptcp_read_actor);
			if (bytes_read < 0) {
				/* report the error only if nothing was read */
				if (!copied)
					copied = bytes_read;
				done = true;
				goto next;
			}

			pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
				 msk->ack_seq + bytes_read);
			msk->ack_seq += bytes_read;
			copied += bytes_read;
			if (copied >= len) {
				done = true;
				goto next;
			}
			if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
				pr_err("Urgent data present, cannot proceed");
				done = true;
				goto next;
			}
next:
			more_data_avail = mptcp_subflow_data_available(ssk);
		} while (more_data_avail && !done);
		release_sock(ssk);
		continue;

wait_for_data:
		more_data_avail = false;

		/* only the master socket status is relevant here. The exit
		 * conditions mirror closely tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		wait_data = true;
		mptcp_wait_data(sk, &timeo);

		/* The fallback may have been detected while sleeping. Keep
		 * the returned socket: the passthrough path dereferences
		 * ssock, which is still NULL from the check done on entry.
		 */
		ssock = __mptcp_tcp_fallback(msk);
		if (unlikely(ssock))
			goto fallback;
	}

	if (more_data_avail) {
		if (!test_bit(MPTCP_DATA_READY, &msk->flags))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (!wait_data) {
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might get new data after last
		 * data_available() returns false.
		 */
		ssk = mptcp_subflow_recv_lookup(msk);
		if (unlikely(ssk))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	}

	release_sock(sk);
	return copied;
}
|
|
|
|
|
|
|
|
/* subflow sockets can be either outgoing (connect) or incoming
|
|
|
|
* (accept).
|
|
|
|
*
|
|
|
|
* Outgoing subflows use in-kernel sockets.
|
|
|
|
* Incoming subflows do not have their own 'struct socket' allocated,
|
|
|
|
* so we need to use tcp_close() after detaching them from the mptcp
|
|
|
|
* parent socket.
|
|
|
|
*/
|
|
|
|
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
|
|
|
|
struct mptcp_subflow_context *subflow,
|
|
|
|
long timeout)
|
|
|
|
{
|
|
|
|
struct socket *sock = READ_ONCE(ssk->sk_socket);
|
|
|
|
|
|
|
|
list_del(&subflow->node);
|
|
|
|
|
|
|
|
if (sock && sock != sk->sk_socket) {
|
|
|
|
/* outgoing subflow */
|
|
|
|
sock_release(sock);
|
|
|
|
} else {
|
|
|
|
/* incoming subflow */
|
|
|
|
tcp_close(ssk, timeout);
|
|
|
|
}
|
2020-01-21 16:56:15 -08:00
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:28 -08:00
|
|
|
static int __mptcp_init_sock(struct sock *sk)
|
2020-01-21 16:56:15 -08:00
|
|
|
{
|
2020-01-21 16:56:18 -08:00
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&msk->conn_list);
|
2020-01-21 16:56:25 -08:00
|
|
|
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
|
2020-01-21 16:56:18 -08:00
|
|
|
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
msk->first = NULL;
|
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:28 -08:00
|
|
|
static int mptcp_init_sock(struct sock *sk)
|
|
|
|
{
|
|
|
|
if (!mptcp_is_enabled(sock_net(sk)))
|
|
|
|
return -ENOPROTOOPT;
|
|
|
|
|
|
|
|
return __mptcp_init_sock(sk);
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:21 -08:00
|
|
|
static void mptcp_subflow_shutdown(struct sock *ssk, int how)
|
|
|
|
{
|
|
|
|
lock_sock(ssk);
|
|
|
|
|
|
|
|
switch (ssk->sk_state) {
|
|
|
|
case TCP_LISTEN:
|
|
|
|
if (!(how & RCV_SHUTDOWN))
|
|
|
|
break;
|
|
|
|
/* fall through */
|
|
|
|
case TCP_SYN_SENT:
|
|
|
|
tcp_disconnect(ssk, O_NONBLOCK);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
ssk->sk_shutdown |= how;
|
|
|
|
tcp_shutdown(ssk, how);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Wake up anyone sleeping in poll. */
|
|
|
|
ssk->sk_state_change(ssk);
|
|
|
|
release_sock(ssk);
|
|
|
|
}
|
|
|
|
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
/* Called with msk lock held, releases such lock before returning */
|
2020-02-04 18:12:30 +01:00
|
|
|
static void mptcp_close(struct sock *sk, long timeout)
|
2020-01-21 16:56:15 -08:00
|
|
|
{
|
2020-01-21 16:56:18 -08:00
|
|
|
struct mptcp_subflow_context *subflow, *tmp;
|
2020-01-21 16:56:15 -08:00
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
mptcp: avoid a lockdep splat when mcast group was joined
syzbot triggered following lockdep splat:
ffffffff82d2cd40 (rtnl_mutex){+.+.}, at: ip_mc_drop_socket+0x52/0x180
but task is already holding lock:
ffff8881187a2310 (sk_lock-AF_INET){+.+.}, at: mptcp_close+0x18/0x30
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (sk_lock-AF_INET){+.+.}:
lock_acquire+0xee/0x230
lock_sock_nested+0x89/0xc0
do_ip_setsockopt.isra.0+0x335/0x22f0
ip_setsockopt+0x35/0x60
tcp_setsockopt+0x5d/0x90
__sys_setsockopt+0xf3/0x190
__x64_sys_setsockopt+0x61/0x70
do_syscall_64+0x72/0x300
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (rtnl_mutex){+.+.}:
check_prevs_add+0x2b7/0x1210
__lock_acquire+0x10b6/0x1400
lock_acquire+0xee/0x230
__mutex_lock+0x120/0xc70
ip_mc_drop_socket+0x52/0x180
inet_release+0x36/0xe0
__sock_release+0xfd/0x130
__mptcp_close+0xa8/0x1f0
inet_release+0x7f/0xe0
__sock_release+0x69/0x130
sock_close+0x18/0x20
__fput+0x179/0x400
task_work_run+0xd5/0x110
do_exit+0x685/0x1510
do_group_exit+0x7e/0x170
__x64_sys_exit_group+0x28/0x30
do_syscall_64+0x72/0x300
entry_SYSCALL_64_after_hwframe+0x49/0xbe
The trigger is:
socket(AF_INET, SOCK_STREAM, 0x106 /* IPPROTO_MPTCP */) = 4
setsockopt(4, SOL_IP, MCAST_JOIN_GROUP, {gr_interface=7, gr_group={sa_family=AF_INET, sin_port=htons(20003), sin_addr=inet_addr("224.0.0.2")}}, 136) = 0
exit(0)
Which results in a call to rtnl_lock while we are holding
the parent mptcp socket lock via
mptcp_close -> lock_sock(msk) -> inet_release -> ip_mc_drop_socket -> rtnl_lock().
>From lockdep point of view we thus have both
'rtnl_lock; lock_sock' and 'lock_sock; rtnl_lock'.
Fix this by stealing the msk conn_list and doing the subflow close
without holding the msk lock.
Fixes: cec37a6e41aae7bf ("mptcp: Handle MP_CAPABLE options for outgoing connections")
Reported-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-29 15:54:45 +01:00
|
|
|
LIST_HEAD(conn_list);
|
2020-01-21 16:56:15 -08:00
|
|
|
|
2020-02-04 18:12:30 +01:00
|
|
|
lock_sock(sk);
|
|
|
|
|
2020-01-21 16:56:20 -08:00
|
|
|
mptcp_token_destroy(msk->token);
|
2020-01-21 16:56:15 -08:00
|
|
|
inet_sk_state_store(sk, TCP_CLOSE);
|
|
|
|
|
mptcp: avoid a lockdep splat when mcast group was joined
syzbot triggered following lockdep splat:
ffffffff82d2cd40 (rtnl_mutex){+.+.}, at: ip_mc_drop_socket+0x52/0x180
but task is already holding lock:
ffff8881187a2310 (sk_lock-AF_INET){+.+.}, at: mptcp_close+0x18/0x30
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (sk_lock-AF_INET){+.+.}:
lock_acquire+0xee/0x230
lock_sock_nested+0x89/0xc0
do_ip_setsockopt.isra.0+0x335/0x22f0
ip_setsockopt+0x35/0x60
tcp_setsockopt+0x5d/0x90
__sys_setsockopt+0xf3/0x190
__x64_sys_setsockopt+0x61/0x70
do_syscall_64+0x72/0x300
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (rtnl_mutex){+.+.}:
check_prevs_add+0x2b7/0x1210
__lock_acquire+0x10b6/0x1400
lock_acquire+0xee/0x230
__mutex_lock+0x120/0xc70
ip_mc_drop_socket+0x52/0x180
inet_release+0x36/0xe0
__sock_release+0xfd/0x130
__mptcp_close+0xa8/0x1f0
inet_release+0x7f/0xe0
__sock_release+0x69/0x130
sock_close+0x18/0x20
__fput+0x179/0x400
task_work_run+0xd5/0x110
do_exit+0x685/0x1510
do_group_exit+0x7e/0x170
__x64_sys_exit_group+0x28/0x30
do_syscall_64+0x72/0x300
entry_SYSCALL_64_after_hwframe+0x49/0xbe
The trigger is:
socket(AF_INET, SOCK_STREAM, 0x106 /* IPPROTO_MPTCP */) = 4
setsockopt(4, SOL_IP, MCAST_JOIN_GROUP, {gr_interface=7, gr_group={sa_family=AF_INET, sin_port=htons(20003), sin_addr=inet_addr("224.0.0.2")}}, 136) = 0
exit(0)
Which results in a call to rtnl_lock while we are holding
the parent mptcp socket lock via
mptcp_close -> lock_sock(msk) -> inet_release -> ip_mc_drop_socket -> rtnl_lock().
>From lockdep point of view we thus have both
'rtnl_lock; lock_sock' and 'lock_sock; rtnl_lock'.
Fix this by stealing the msk conn_list and doing the subflow close
without holding the msk lock.
Fixes: cec37a6e41aae7bf ("mptcp: Handle MP_CAPABLE options for outgoing connections")
Reported-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-29 15:54:45 +01:00
|
|
|
list_splice_init(&msk->conn_list, &conn_list);
|
|
|
|
|
|
|
|
release_sock(sk);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
|
2020-01-21 16:56:18 -08:00
|
|
|
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
|
|
|
|
|
|
|
__mptcp_close_ssk(sk, ssk, subflow, timeout);
|
2020-01-21 16:56:15 -08:00
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
sk_common_release(sk);
|
2020-01-21 16:56:15 -08:00
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:19 -08:00
|
|
|
static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
|
|
|
|
{
|
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
|
|
|
|
const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
|
|
|
|
struct ipv6_pinfo *msk6 = inet6_sk(msk);
|
|
|
|
|
|
|
|
msk->sk_v6_daddr = ssk->sk_v6_daddr;
|
|
|
|
msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;
|
|
|
|
|
|
|
|
if (msk6 && ssk6) {
|
|
|
|
msk6->saddr = ssk6->saddr;
|
|
|
|
msk6->flow_label = ssk6->flow_label;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
|
|
|
|
inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
|
|
|
|
inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
|
|
|
|
inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
|
|
|
|
inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
|
|
|
|
inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
|
|
|
|
}
|
|
|
|
|
2020-02-06 00:39:37 +01:00
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* Return the address of the ipv6_pinfo area of @sk, computed as the byte
 * offset obtained by subtracting sizeof(struct ipv6_pinfo) from
 * sizeof(struct mptcp6_sock), i.e. the trailing 'np' member.
 *
 * @sk is treated as the start of a struct mptcp6_sock; the caller must
 * only pass sockets allocated with that layout.
 */
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int np_offset;

	np_offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)((u8 *)sk + np_offset);
}
#endif
|
|
|
|
|
2020-02-10 16:27:59 +08:00
|
|
|
static struct sock *mptcp_sk_clone_lock(const struct sock *sk)
|
2020-02-06 00:39:37 +01:00
|
|
|
{
|
|
|
|
struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
|
|
|
|
|
|
|
|
if (!nsk)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
|
|
|
|
if (nsk->sk_family == AF_INET6)
|
|
|
|
inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return nsk;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:19 -08:00
|
|
|
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
|
|
|
|
bool kern)
|
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
struct socket *listener;
|
|
|
|
struct sock *newsk;
|
|
|
|
|
|
|
|
listener = __mptcp_nmpc_socket(msk);
|
|
|
|
if (WARN_ON_ONCE(!listener)) {
|
|
|
|
*err = -EINVAL;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
|
|
|
|
newsk = inet_csk_accept(listener->sk, flags, err, kern);
|
|
|
|
if (!newsk)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
|
|
|
|
|
|
|
|
if (sk_is_mptcp(newsk)) {
|
|
|
|
struct mptcp_subflow_context *subflow;
|
|
|
|
struct sock *new_mptcp_sock;
|
|
|
|
struct sock *ssk = newsk;
|
2020-01-21 16:56:23 -08:00
|
|
|
u64 ack_seq;
|
2020-01-21 16:56:19 -08:00
|
|
|
|
|
|
|
subflow = mptcp_subflow_ctx(newsk);
|
|
|
|
lock_sock(sk);
|
|
|
|
|
|
|
|
local_bh_disable();
|
2020-02-06 00:39:37 +01:00
|
|
|
new_mptcp_sock = mptcp_sk_clone_lock(sk);
|
2020-01-21 16:56:19 -08:00
|
|
|
if (!new_mptcp_sock) {
|
|
|
|
*err = -ENOBUFS;
|
|
|
|
local_bh_enable();
|
|
|
|
release_sock(sk);
|
2020-01-21 16:56:21 -08:00
|
|
|
mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1);
|
2020-01-21 16:56:19 -08:00
|
|
|
tcp_close(newsk, 0);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:28 -08:00
|
|
|
__mptcp_init_sock(new_mptcp_sock);
|
2020-01-21 16:56:19 -08:00
|
|
|
|
|
|
|
msk = mptcp_sk(new_mptcp_sock);
|
|
|
|
msk->local_key = subflow->local_key;
|
2020-01-21 16:56:20 -08:00
|
|
|
msk->token = subflow->token;
|
2020-01-21 16:56:19 -08:00
|
|
|
msk->subflow = NULL;
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
msk->first = newsk;
|
2020-01-21 16:56:19 -08:00
|
|
|
|
2020-01-21 16:56:20 -08:00
|
|
|
mptcp_token_update_accept(newsk, new_mptcp_sock);
|
2020-01-21 16:56:23 -08:00
|
|
|
|
|
|
|
msk->write_seq = subflow->idsn + 1;
|
2020-01-21 16:56:32 -08:00
|
|
|
if (subflow->can_ack) {
|
|
|
|
msk->can_ack = true;
|
|
|
|
msk->remote_key = subflow->remote_key;
|
|
|
|
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
|
|
|
|
ack_seq++;
|
|
|
|
msk->ack_seq = ack_seq;
|
|
|
|
}
|
2020-01-21 16:56:19 -08:00
|
|
|
newsk = new_mptcp_sock;
|
|
|
|
mptcp_copy_inaddrs(newsk, ssk);
|
|
|
|
list_add(&subflow->node, &msk->conn_list);
|
|
|
|
|
|
|
|
/* will be fully established at mptcp_stream_accept()
|
|
|
|
* completion.
|
|
|
|
*/
|
|
|
|
inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV);
|
|
|
|
bh_unlock_sock(new_mptcp_sock);
|
|
|
|
local_bh_enable();
|
|
|
|
release_sock(sk);
|
2020-01-21 16:56:26 -08:00
|
|
|
|
|
|
|
/* the subflow can already receive packet, avoid racing with
|
|
|
|
* the receive path and process the pending ones
|
|
|
|
*/
|
|
|
|
lock_sock(ssk);
|
|
|
|
subflow->rel_write_seq = 1;
|
|
|
|
subflow->tcp_sock = ssk;
|
|
|
|
subflow->conn = new_mptcp_sock;
|
|
|
|
if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue)))
|
|
|
|
mptcp_subflow_data_available(ssk);
|
|
|
|
release_sock(ssk);
|
2020-01-21 16:56:19 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
return newsk;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:20 -08:00
|
|
|
static void mptcp_destroy(struct sock *sk)
|
|
|
|
{
|
2020-01-29 15:54:43 +01:00
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
|
|
|
|
if (msk->cached_ext)
|
|
|
|
__skb_ext_put(msk->cached_ext);
|
2020-01-21 16:56:20 -08:00
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:22 -08:00
|
|
|
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
|
mptcp: fix panic on user pointer access
Its not possible to call the kernel_(s|g)etsockopt functions here,
the address points to user memory:
General protection fault in user access. Non-canonical address?
WARNING: CPU: 1 PID: 5352 at arch/x86/mm/extable.c:77 ex_handler_uaccess+0xba/0xe0 arch/x86/mm/extable.c:77
Kernel panic - not syncing: panic_on_warn set ...
[..]
Call Trace:
fixup_exception+0x9d/0xcd arch/x86/mm/extable.c:178
general_protection+0x2d/0x40 arch/x86/entry/entry_64.S:1202
do_ip_getsockopt+0x1f6/0x1860 net/ipv4/ip_sockglue.c:1323
ip_getsockopt+0x87/0x1c0 net/ipv4/ip_sockglue.c:1561
tcp_getsockopt net/ipv4/tcp.c:3691 [inline]
tcp_getsockopt+0x8c/0xd0 net/ipv4/tcp.c:3685
kernel_getsockopt+0x121/0x1f0 net/socket.c:3736
mptcp_getsockopt+0x69/0x90 net/mptcp/protocol.c:830
__sys_getsockopt+0x13a/0x220 net/socket.c:2175
We can call tcp_get/setsockopt functions instead. Doing so fixes
crashing, but still leaves rtnl related lockdep splat:
WARNING: possible circular locking dependency detected
5.5.0-rc6 #2 Not tainted
------------------------------------------------------
syz-executor.0/16334 is trying to acquire lock:
ffffffff84f7a080 (rtnl_mutex){+.+.}, at: do_ip_setsockopt.isra.0+0x277/0x3820 net/ipv4/ip_sockglue.c:644
but task is already holding lock:
ffff888116503b90 (sk_lock-AF_INET){+.+.}, at: lock_sock include/net/sock.h:1516 [inline]
ffff888116503b90 (sk_lock-AF_INET){+.+.}, at: mptcp_setsockopt+0x28/0x90 net/mptcp/protocol.c:1284
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (sk_lock-AF_INET){+.+.}:
lock_sock_nested+0xca/0x120 net/core/sock.c:2944
lock_sock include/net/sock.h:1516 [inline]
do_ip_setsockopt.isra.0+0x281/0x3820 net/ipv4/ip_sockglue.c:645
ip_setsockopt+0x44/0xf0 net/ipv4/ip_sockglue.c:1248
udp_setsockopt+0x5d/0xa0 net/ipv4/udp.c:2639
__sys_setsockopt+0x152/0x240 net/socket.c:2130
__do_sys_setsockopt net/socket.c:2146 [inline]
__se_sys_setsockopt net/socket.c:2143 [inline]
__x64_sys_setsockopt+0xba/0x150 net/socket.c:2143
do_syscall_64+0xbd/0x5b0 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (rtnl_mutex){+.+.}:
check_prev_add kernel/locking/lockdep.c:2475 [inline]
check_prevs_add kernel/locking/lockdep.c:2580 [inline]
validate_chain kernel/locking/lockdep.c:2970 [inline]
__lock_acquire+0x1fb2/0x4680 kernel/locking/lockdep.c:3954
lock_acquire+0x127/0x330 kernel/locking/lockdep.c:4484
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x158/0x1340 kernel/locking/mutex.c:1103
do_ip_setsockopt.isra.0+0x277/0x3820 net/ipv4/ip_sockglue.c:644
ip_setsockopt+0x44/0xf0 net/ipv4/ip_sockglue.c:1248
tcp_setsockopt net/ipv4/tcp.c:3159 [inline]
tcp_setsockopt+0x8c/0xd0 net/ipv4/tcp.c:3153
kernel_setsockopt+0x121/0x1f0 net/socket.c:3767
mptcp_setsockopt+0x69/0x90 net/mptcp/protocol.c:1288
__sys_setsockopt+0x152/0x240 net/socket.c:2130
__do_sys_setsockopt net/socket.c:2146 [inline]
__se_sys_setsockopt net/socket.c:2143 [inline]
__x64_sys_setsockopt+0xba/0x150 net/socket.c:2143
do_syscall_64+0xbd/0x5b0 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(sk_lock-AF_INET);
lock(rtnl_mutex);
lock(sk_lock-AF_INET);
lock(rtnl_mutex);
The lockdep complaint is because we hold mptcp socket lock when calling
the sk_prot get/setsockopt handler, and those might need to acquire the
rtnl mutex. Normally, order is:
rtnl_lock(sk) -> lock_sock
Whereas for mptcp the order is
lock_sock(mptcp_sk) rtnl_lock -> lock_sock(subflow_sk)
We can avoid this by releasing the mptcp socket lock early, but, as Paolo
points out, we need to get/put the subflow socket refcount before doing so
to avoid race with concurrent close().
Fixes: 717e79c867ca5 ("mptcp: Add setsockopt()/getsockopt() socket operations")
Reported-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-29 15:54:44 +01:00
|
|
|
char __user *optval, unsigned int optlen)
|
2020-01-21 16:56:22 -08:00
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
struct socket *ssock;
|
|
|
|
|
|
|
|
pr_debug("msk=%p", msk);
|
|
|
|
|
|
|
|
/* @@ the meaning of setsockopt() when the socket is connected and
|
2020-02-14 14:14:29 -08:00
|
|
|
* there are multiple subflows is not yet defined. It is up to the
|
|
|
|
* MPTCP-level socket to configure the subflows until the subflow
|
|
|
|
* is in TCP fallback, when TCP socket options are passed through
|
|
|
|
* to the one remaining subflow.
|
2020-01-21 16:56:22 -08:00
|
|
|
*/
|
|
|
|
lock_sock(sk);
|
2020-02-14 14:14:29 -08:00
|
|
|
ssock = __mptcp_tcp_fallback(msk);
|
|
|
|
if (ssock)
|
|
|
|
return tcp_setsockopt(ssock->sk, level, optname, optval,
|
|
|
|
optlen);
|
mptcp: fix panic on user pointer access
Its not possible to call the kernel_(s|g)etsockopt functions here,
the address points to user memory:
General protection fault in user access. Non-canonical address?
WARNING: CPU: 1 PID: 5352 at arch/x86/mm/extable.c:77 ex_handler_uaccess+0xba/0xe0 arch/x86/mm/extable.c:77
Kernel panic - not syncing: panic_on_warn set ...
[..]
Call Trace:
fixup_exception+0x9d/0xcd arch/x86/mm/extable.c:178
general_protection+0x2d/0x40 arch/x86/entry/entry_64.S:1202
do_ip_getsockopt+0x1f6/0x1860 net/ipv4/ip_sockglue.c:1323
ip_getsockopt+0x87/0x1c0 net/ipv4/ip_sockglue.c:1561
tcp_getsockopt net/ipv4/tcp.c:3691 [inline]
tcp_getsockopt+0x8c/0xd0 net/ipv4/tcp.c:3685
kernel_getsockopt+0x121/0x1f0 net/socket.c:3736
mptcp_getsockopt+0x69/0x90 net/mptcp/protocol.c:830
__sys_getsockopt+0x13a/0x220 net/socket.c:2175
We can call tcp_get/setsockopt functions instead. Doing so fixes
crashing, but still leaves rtnl related lockdep splat:
WARNING: possible circular locking dependency detected
5.5.0-rc6 #2 Not tainted
------------------------------------------------------
syz-executor.0/16334 is trying to acquire lock:
ffffffff84f7a080 (rtnl_mutex){+.+.}, at: do_ip_setsockopt.isra.0+0x277/0x3820 net/ipv4/ip_sockglue.c:644
but task is already holding lock:
ffff888116503b90 (sk_lock-AF_INET){+.+.}, at: lock_sock include/net/sock.h:1516 [inline]
ffff888116503b90 (sk_lock-AF_INET){+.+.}, at: mptcp_setsockopt+0x28/0x90 net/mptcp/protocol.c:1284
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (sk_lock-AF_INET){+.+.}:
lock_sock_nested+0xca/0x120 net/core/sock.c:2944
lock_sock include/net/sock.h:1516 [inline]
do_ip_setsockopt.isra.0+0x281/0x3820 net/ipv4/ip_sockglue.c:645
ip_setsockopt+0x44/0xf0 net/ipv4/ip_sockglue.c:1248
udp_setsockopt+0x5d/0xa0 net/ipv4/udp.c:2639
__sys_setsockopt+0x152/0x240 net/socket.c:2130
__do_sys_setsockopt net/socket.c:2146 [inline]
__se_sys_setsockopt net/socket.c:2143 [inline]
__x64_sys_setsockopt+0xba/0x150 net/socket.c:2143
do_syscall_64+0xbd/0x5b0 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (rtnl_mutex){+.+.}:
check_prev_add kernel/locking/lockdep.c:2475 [inline]
check_prevs_add kernel/locking/lockdep.c:2580 [inline]
validate_chain kernel/locking/lockdep.c:2970 [inline]
__lock_acquire+0x1fb2/0x4680 kernel/locking/lockdep.c:3954
lock_acquire+0x127/0x330 kernel/locking/lockdep.c:4484
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x158/0x1340 kernel/locking/mutex.c:1103
do_ip_setsockopt.isra.0+0x277/0x3820 net/ipv4/ip_sockglue.c:644
ip_setsockopt+0x44/0xf0 net/ipv4/ip_sockglue.c:1248
tcp_setsockopt net/ipv4/tcp.c:3159 [inline]
tcp_setsockopt+0x8c/0xd0 net/ipv4/tcp.c:3153
kernel_setsockopt+0x121/0x1f0 net/socket.c:3767
mptcp_setsockopt+0x69/0x90 net/mptcp/protocol.c:1288
__sys_setsockopt+0x152/0x240 net/socket.c:2130
__do_sys_setsockopt net/socket.c:2146 [inline]
__se_sys_setsockopt net/socket.c:2143 [inline]
__x64_sys_setsockopt+0xba/0x150 net/socket.c:2143
do_syscall_64+0xbd/0x5b0 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(sk_lock-AF_INET);
lock(rtnl_mutex);
lock(sk_lock-AF_INET);
lock(rtnl_mutex);
The lockdep complaint is because we hold mptcp socket lock when calling
the sk_prot get/setsockopt handler, and those might need to acquire the
rtnl mutex. Normally, order is:
rtnl_lock(sk) -> lock_sock
Whereas for mptcp the order is
lock_sock(mptcp_sk) rtnl_lock -> lock_sock(subflow_sk)
We can avoid this by releasing the mptcp socket lock early, but, as Paolo
points out, we need to get/put the subflow socket refcount before doing so
to avoid race with concurrent close().
Fixes: 717e79c867ca5 ("mptcp: Add setsockopt()/getsockopt() socket operations")
Reported-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-29 15:54:44 +01:00
|
|
|
|
2020-01-21 16:56:22 -08:00
|
|
|
release_sock(sk);
|
|
|
|
|
2020-02-14 14:14:29 -08:00
|
|
|
return -EOPNOTSUPP;
|
2020-01-21 16:56:22 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* getsockopt() handler for MPTCP sockets.
 *
 * The query is handed to tcp_getsockopt() on the fallback subflow when
 * __mptcp_tcp_fallback() returns one; otherwise -EOPNOTSUPP is returned.
 *
 * @optval and @option are user-space pointers, so the tcp_getsockopt()
 * entry point is used here; the kernel_getsockopt() helper is not suitable
 * for them.
 */
static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of getsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssock = __mptcp_tcp_fallback(msk);
	if (!ssock) {
		release_sock(sk);
		return -EOPNOTSUPP;
	}

	/* NOTE(review): no release_sock(sk) on this path; presumably
	 * __mptcp_tcp_fallback() drops the msk lock when it returns a
	 * socket - confirm against its definition.
	 */
	return tcp_getsockopt(ssock->sk, level, optname, optval, option);
}
|
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
static int mptcp_get_port(struct sock *sk, unsigned short snum)
|
2020-01-21 16:56:15 -08:00
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
2020-01-21 16:56:18 -08:00
|
|
|
struct socket *ssock;
|
2020-01-21 16:56:15 -08:00
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
ssock = __mptcp_nmpc_socket(msk);
|
|
|
|
pr_debug("msk=%p, subflow=%p", msk, ssock);
|
|
|
|
if (WARN_ON_ONCE(!ssock))
|
|
|
|
return -EINVAL;
|
2020-01-21 16:56:15 -08:00
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
return inet_csk_get_port(ssock->sk, snum);
|
|
|
|
}
|
2020-01-21 16:56:15 -08:00
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
void mptcp_finish_connect(struct sock *ssk)
|
|
|
|
{
|
|
|
|
struct mptcp_subflow_context *subflow;
|
|
|
|
struct mptcp_sock *msk;
|
|
|
|
struct sock *sk;
|
2020-01-21 16:56:23 -08:00
|
|
|
u64 ack_seq;
|
2020-01-21 16:56:15 -08:00
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
subflow = mptcp_subflow_ctx(ssk);
|
2020-01-21 16:56:15 -08:00
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
if (!subflow->mp_capable)
|
|
|
|
return;
|
|
|
|
|
|
|
|
sk = subflow->conn;
|
|
|
|
msk = mptcp_sk(sk);
|
|
|
|
|
2020-01-21 16:56:24 -08:00
|
|
|
pr_debug("msk=%p, token=%u", sk, subflow->token);
|
|
|
|
|
2020-01-21 16:56:23 -08:00
|
|
|
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
|
|
|
|
ack_seq++;
|
2020-01-21 16:56:24 -08:00
|
|
|
subflow->map_seq = ack_seq;
|
|
|
|
subflow->map_subflow_seq = 1;
|
2020-01-21 16:56:23 -08:00
|
|
|
subflow->rel_write_seq = 1;
|
|
|
|
|
2020-01-21 16:56:18 -08:00
|
|
|
/* the socket is not connected yet, no msk/subflow ops can access/race
|
|
|
|
* accessing the field below
|
|
|
|
*/
|
|
|
|
WRITE_ONCE(msk->remote_key, subflow->remote_key);
|
|
|
|
WRITE_ONCE(msk->local_key, subflow->local_key);
|
2020-01-21 16:56:20 -08:00
|
|
|
WRITE_ONCE(msk->token, subflow->token);
|
2020-01-21 16:56:23 -08:00
|
|
|
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
|
|
|
|
WRITE_ONCE(msk->ack_seq, ack_seq);
|
2020-01-21 16:56:32 -08:00
|
|
|
WRITE_ONCE(msk->can_ack, 1);
|
2020-01-21 16:56:15 -08:00
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:19 -08:00
|
|
|
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
|
|
|
|
{
|
|
|
|
write_lock_bh(&sk->sk_callback_lock);
|
|
|
|
rcu_assign_pointer(sk->sk_wq, &parent->wq);
|
|
|
|
sk_set_socket(sk, parent);
|
|
|
|
sk->sk_uid = SOCK_INODE(parent)->i_uid;
|
|
|
|
write_unlock_bh(&sk->sk_callback_lock);
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:25 -08:00
|
|
|
static bool mptcp_memory_free(const struct sock *sk, int wake)
|
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
|
|
|
|
|
|
return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
static struct proto mptcp_prot = {
|
|
|
|
.name = "MPTCP",
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.init = mptcp_init_sock,
|
|
|
|
.close = mptcp_close,
|
2020-01-21 16:56:19 -08:00
|
|
|
.accept = mptcp_accept,
|
2020-01-21 16:56:22 -08:00
|
|
|
.setsockopt = mptcp_setsockopt,
|
|
|
|
.getsockopt = mptcp_getsockopt,
|
2020-01-21 16:56:15 -08:00
|
|
|
.shutdown = tcp_shutdown,
|
2020-01-21 16:56:20 -08:00
|
|
|
.destroy = mptcp_destroy,
|
2020-01-21 16:56:15 -08:00
|
|
|
.sendmsg = mptcp_sendmsg,
|
|
|
|
.recvmsg = mptcp_recvmsg,
|
|
|
|
.hash = inet_hash,
|
|
|
|
.unhash = inet_unhash,
|
2020-01-21 16:56:18 -08:00
|
|
|
.get_port = mptcp_get_port,
|
2020-01-21 16:56:25 -08:00
|
|
|
.stream_memory_free = mptcp_memory_free,
|
2020-01-21 16:56:15 -08:00
|
|
|
.obj_size = sizeof(struct mptcp_sock),
|
|
|
|
.no_autobind = true,
|
|
|
|
};
|
|
|
|
|
2020-01-21 16:56:17 -08:00
|
|
|
static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
|
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sock->sk);
|
|
|
|
struct socket *ssock;
|
2020-01-21 16:56:19 -08:00
|
|
|
int err;
|
2020-01-21 16:56:17 -08:00
|
|
|
|
|
|
|
lock_sock(sock->sk);
|
|
|
|
ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
|
|
|
|
if (IS_ERR(ssock)) {
|
|
|
|
err = PTR_ERR(ssock);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = ssock->ops->bind(ssock, uaddr, addr_len);
|
2020-01-21 16:56:19 -08:00
|
|
|
if (!err)
|
|
|
|
mptcp_copy_inaddrs(sock->sk, ssock->sk);
|
2020-01-21 16:56:17 -08:00
|
|
|
|
|
|
|
unlock:
|
|
|
|
release_sock(sock->sk);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
|
|
|
|
int addr_len, int flags)
|
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sock->sk);
|
|
|
|
struct socket *ssock;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
lock_sock(sock->sk);
|
|
|
|
ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
|
|
|
|
if (IS_ERR(ssock)) {
|
|
|
|
err = PTR_ERR(ssock);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:19 -08:00
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
|
|
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
|
|
|
|
* TCP option space.
|
|
|
|
*/
|
|
|
|
if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
|
|
|
|
mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
|
|
|
|
#endif
|
|
|
|
|
2020-01-21 16:56:17 -08:00
|
|
|
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
|
|
|
|
inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
|
2020-01-21 16:56:19 -08:00
|
|
|
mptcp_copy_inaddrs(sock->sk, ssock->sk);
|
2020-01-21 16:56:17 -08:00
|
|
|
|
|
|
|
unlock:
|
|
|
|
release_sock(sock->sk);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:19 -08:00
|
|
|
static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr,
|
|
|
|
int peer)
|
|
|
|
{
|
|
|
|
if (sock->sk->sk_prot == &tcp_prot) {
|
|
|
|
/* we are being invoked from __sys_accept4, after
|
|
|
|
* mptcp_accept() has just accepted a non-mp-capable
|
|
|
|
* flow: sk is a tcp_sk, not an mptcp one.
|
|
|
|
*
|
|
|
|
* Hand the socket over to tcp so all further socket ops
|
|
|
|
* bypass mptcp.
|
|
|
|
*/
|
|
|
|
sock->ops = &inet_stream_ops;
|
|
|
|
}
|
|
|
|
|
|
|
|
return inet_getname(sock, uaddr, peer);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
|
|
|
|
static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr,
|
|
|
|
int peer)
|
|
|
|
{
|
|
|
|
if (sock->sk->sk_prot == &tcpv6_prot) {
|
|
|
|
/* we are being invoked from __sys_accept4 after
|
|
|
|
* mptcp_accept() has accepted a non-mp-capable
|
|
|
|
* subflow: sk is a tcp_sk, not mptcp.
|
|
|
|
*
|
|
|
|
* Hand the socket over to tcp so all further
|
|
|
|
* socket ops bypass mptcp.
|
|
|
|
*/
|
|
|
|
sock->ops = &inet6_stream_ops;
|
|
|
|
}
|
|
|
|
|
|
|
|
return inet6_getname(sock, uaddr, peer);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static int mptcp_listen(struct socket *sock, int backlog)
|
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sock->sk);
|
|
|
|
struct socket *ssock;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
pr_debug("msk=%p", msk);
|
|
|
|
|
|
|
|
lock_sock(sock->sk);
|
|
|
|
ssock = __mptcp_socket_create(msk, TCP_LISTEN);
|
|
|
|
if (IS_ERR(ssock)) {
|
|
|
|
err = PTR_ERR(ssock);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = ssock->ops->listen(ssock, backlog);
|
|
|
|
inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
|
|
|
|
if (!err)
|
|
|
|
mptcp_copy_inaddrs(sock->sk, ssock->sk);
|
|
|
|
|
|
|
|
unlock:
|
|
|
|
release_sock(sock->sk);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool is_tcp_proto(const struct proto *p)
|
|
|
|
{
|
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
|
|
|
|
return p == &tcp_prot || p == &tcpv6_prot;
|
|
|
|
#else
|
|
|
|
return p == &tcp_prot;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/* accept() handler for MPTCP sockets.
 *
 * Fails with -EINVAL unless the MPTCP socket is in TCP_LISTEN state and
 * __mptcp_nmpc_socket() yields a subflow socket. A reference on the subflow
 * sock is held while the msk lock is dropped around the accept call, and
 * released before returning.
 *
 * When the accept succeeds and the new sock is not a plain TCP one (see
 * is_tcp_proto()), every subflow of the new msk that has no sk_socket yet
 * is grafted onto @newsock and the new socket is moved to TCP_ESTABLISHED.
 *
 * Returns -EINVAL as described above, otherwise the value returned by the
 * subflow's accept().
 */
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *new_msk;
	struct socket *ssock = NULL;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	if (sock->sk->sk_state == TCP_LISTEN)
		ssock = __mptcp_nmpc_socket(msk);

	if (!ssock) {
		release_sock(sock->sk);
		return -EINVAL;
	}

	sock_hold(ssock->sk);
	release_sock(sock->sk);

	err = ssock->ops->accept(sock, newsock, flags, kern);
	if (err != 0 || is_tcp_proto(newsock->sk->sk_prot))
		goto put_ssock;

	new_msk = mptcp_sk(newsock->sk);

	/* Set ssk->sk_socket of accept()ed flows to the mptcp socket.
	 * This is needed so the NOSPACE flag can be set from the tcp stack.
	 */
	list_for_each_entry(subflow, &new_msk->conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!ssk->sk_socket)
			mptcp_sock_graft(ssk, newsock);
	}

	inet_sk_state_store(newsock->sk, TCP_ESTABLISHED);

put_ssock:
	sock_put(ssock->sk);
	return err;
}
|
|
|
|
|
2020-01-21 16:56:17 -08:00
|
|
|
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
|
|
|
|
struct poll_table_struct *wait)
|
|
|
|
{
|
2020-01-21 16:56:25 -08:00
|
|
|
struct sock *sk = sock->sk;
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
struct mptcp_sock *msk;
|
2020-01-21 16:56:25 -08:00
|
|
|
struct socket *ssock;
|
2020-01-21 16:56:17 -08:00
|
|
|
__poll_t mask = 0;
|
|
|
|
|
2020-01-21 16:56:25 -08:00
|
|
|
msk = mptcp_sk(sk);
|
|
|
|
lock_sock(sk);
|
|
|
|
ssock = __mptcp_nmpc_socket(msk);
|
|
|
|
if (ssock) {
|
|
|
|
mask = ssock->ops->poll(file, ssock, wait);
|
|
|
|
release_sock(sk);
|
|
|
|
return mask;
|
|
|
|
}
|
|
|
|
|
|
|
|
release_sock(sk);
|
|
|
|
sock_poll_wait(file, sock, wait);
|
|
|
|
lock_sock(sk);
|
mptcp: cope with later TCP fallback
With MPTCP v1, passive connections can fallback to TCP after the
subflow becomes established:
syn + MP_CAPABLE ->
<- syn, ack + MP_CAPABLE
ack, seq = 3 ->
// OoO packet is accepted because in-sequence
// passive socket is created, is in ESTABLISHED
// status and tentatively as MP_CAPABLE
ack, seq = 2 ->
// no MP_CAPABLE opt, subflow should fallback to TCP
We can't use the 'subflow' socket fallback, as we don't have
it available for passive connection.
Instead, when the fallback is detected, replace the mptcp
socket with the underlying TCP subflow. Beyond covering
the above scenario, it makes a TCP fallback socket as efficient
as plain TCP ones.
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-21 16:56:33 -08:00
|
|
|
ssock = __mptcp_tcp_fallback(msk);
|
|
|
|
if (unlikely(ssock))
|
|
|
|
return ssock->ops->poll(file, ssock, NULL);
|
2020-01-21 16:56:25 -08:00
|
|
|
|
|
|
|
if (test_bit(MPTCP_DATA_READY, &msk->flags))
|
|
|
|
mask = EPOLLIN | EPOLLRDNORM;
|
|
|
|
if (sk_stream_is_writeable(sk) &&
|
|
|
|
test_bit(MPTCP_SEND_SPACE, &msk->flags))
|
|
|
|
mask |= EPOLLOUT | EPOLLWRNORM;
|
|
|
|
if (sk->sk_shutdown & RCV_SHUTDOWN)
|
|
|
|
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
|
|
|
|
|
|
|
|
release_sock(sk);
|
|
|
|
|
2020-01-21 16:56:17 -08:00
|
|
|
return mask;
|
|
|
|
}
|
|
|
|
|
2020-01-21 16:56:21 -08:00
|
|
|
static int mptcp_shutdown(struct socket *sock, int how)
|
|
|
|
{
|
|
|
|
struct mptcp_sock *msk = mptcp_sk(sock->sk);
|
|
|
|
struct mptcp_subflow_context *subflow;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
pr_debug("sk=%p, how=%d", msk, how);
|
|
|
|
|
|
|
|
lock_sock(sock->sk);
|
|
|
|
|
|
|
|
if (how == SHUT_WR || how == SHUT_RDWR)
|
|
|
|
inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
|
|
|
|
|
|
|
|
how++;
|
|
|
|
|
|
|
|
if ((how & ~SHUTDOWN_MASK) || !how) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sock->state == SS_CONNECTING) {
|
|
|
|
if ((1 << sock->sk->sk_state) &
|
|
|
|
(TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
|
|
|
|
sock->state = SS_DISCONNECTING;
|
|
|
|
else
|
|
|
|
sock->state = SS_CONNECTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
mptcp_for_each_subflow(msk, subflow) {
|
|
|
|
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
|
|
|
|
|
|
|
|
mptcp_subflow_shutdown(tcp_sk, how);
|
|
|
|
}
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
release_sock(sock->sk);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-01-24 16:04:02 -08:00
|
|
|
static const struct proto_ops mptcp_stream_ops = {
|
|
|
|
.family = PF_INET,
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.release = inet_release,
|
|
|
|
.bind = mptcp_bind,
|
|
|
|
.connect = mptcp_stream_connect,
|
|
|
|
.socketpair = sock_no_socketpair,
|
|
|
|
.accept = mptcp_stream_accept,
|
|
|
|
.getname = mptcp_v4_getname,
|
|
|
|
.poll = mptcp_poll,
|
|
|
|
.ioctl = inet_ioctl,
|
|
|
|
.gettstamp = sock_gettstamp,
|
|
|
|
.listen = mptcp_listen,
|
|
|
|
.shutdown = mptcp_shutdown,
|
|
|
|
.setsockopt = sock_common_setsockopt,
|
|
|
|
.getsockopt = sock_common_getsockopt,
|
|
|
|
.sendmsg = inet_sendmsg,
|
|
|
|
.recvmsg = inet_recvmsg,
|
|
|
|
.mmap = sock_no_mmap,
|
|
|
|
.sendpage = inet_sendpage,
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
.compat_setsockopt = compat_sock_common_setsockopt,
|
|
|
|
.compat_getsockopt = compat_sock_common_getsockopt,
|
|
|
|
#endif
|
|
|
|
};
|
2020-01-21 16:56:17 -08:00
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
static struct inet_protosw mptcp_protosw = {
|
|
|
|
.type = SOCK_STREAM,
|
|
|
|
.protocol = IPPROTO_MPTCP,
|
|
|
|
.prot = &mptcp_prot,
|
2020-01-21 16:56:17 -08:00
|
|
|
.ops = &mptcp_stream_ops,
|
|
|
|
.flags = INET_PROTOSW_ICSK,
|
2020-01-21 16:56:15 -08:00
|
|
|
};
|
|
|
|
|
2020-01-21 16:56:28 -08:00
|
|
|
void mptcp_proto_init(void)
|
2020-01-21 16:56:15 -08:00
|
|
|
{
|
2020-01-21 16:56:17 -08:00
|
|
|
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
|
|
|
|
|
|
|
|
mptcp_subflow_init();
|
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
if (proto_register(&mptcp_prot, 1) != 0)
|
|
|
|
panic("Failed to register MPTCP proto.\n");
|
|
|
|
|
|
|
|
inet_register_protosw(&mptcp_protosw);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
|
2020-01-24 16:04:02 -08:00
|
|
|
static const struct proto_ops mptcp_v6_stream_ops = {
|
|
|
|
.family = PF_INET6,
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.release = inet6_release,
|
|
|
|
.bind = mptcp_bind,
|
|
|
|
.connect = mptcp_stream_connect,
|
|
|
|
.socketpair = sock_no_socketpair,
|
|
|
|
.accept = mptcp_stream_accept,
|
|
|
|
.getname = mptcp_v6_getname,
|
|
|
|
.poll = mptcp_poll,
|
|
|
|
.ioctl = inet6_ioctl,
|
|
|
|
.gettstamp = sock_gettstamp,
|
|
|
|
.listen = mptcp_listen,
|
|
|
|
.shutdown = mptcp_shutdown,
|
|
|
|
.setsockopt = sock_common_setsockopt,
|
|
|
|
.getsockopt = sock_common_getsockopt,
|
|
|
|
.sendmsg = inet6_sendmsg,
|
|
|
|
.recvmsg = inet6_recvmsg,
|
|
|
|
.mmap = sock_no_mmap,
|
|
|
|
.sendpage = inet_sendpage,
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
.compat_setsockopt = compat_sock_common_setsockopt,
|
|
|
|
.compat_getsockopt = compat_sock_common_getsockopt,
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
/* IPv6 protocol descriptor; filled in at run time by mptcp_proto_v6_init()
 * starting from a copy of mptcp_prot.
 */
static struct proto mptcp_v6_prot;
|
|
|
|
|
2020-01-21 16:56:20 -08:00
|
|
|
/* destroy callback for IPv6 MPTCP sockets: run the common MPTCP teardown
 * first, then the inet6 sock teardown.
 */
static void mptcp_v6_destroy(struct sock *sk)
{
	mptcp_destroy(sk);

	inet6_destroy_sock(sk);
}
|
|
|
|
|
2020-01-21 16:56:15 -08:00
|
|
|
static struct inet_protosw mptcp_v6_protosw = {
|
|
|
|
.type = SOCK_STREAM,
|
|
|
|
.protocol = IPPROTO_MPTCP,
|
|
|
|
.prot = &mptcp_v6_prot,
|
2020-01-21 16:56:17 -08:00
|
|
|
.ops = &mptcp_v6_stream_ops,
|
2020-01-21 16:56:15 -08:00
|
|
|
.flags = INET_PROTOSW_ICSK,
|
|
|
|
};
|
|
|
|
|
2020-01-21 16:56:28 -08:00
|
|
|
int mptcp_proto_v6_init(void)
|
2020-01-21 16:56:15 -08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
mptcp_v6_prot = mptcp_prot;
|
|
|
|
strcpy(mptcp_v6_prot.name, "MPTCPv6");
|
|
|
|
mptcp_v6_prot.slab = NULL;
|
2020-01-21 16:56:20 -08:00
|
|
|
mptcp_v6_prot.destroy = mptcp_v6_destroy;
|
2020-02-06 00:39:37 +01:00
|
|
|
mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
|
2020-01-21 16:56:15 -08:00
|
|
|
|
|
|
|
err = proto_register(&mptcp_v6_prot, 1);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
err = inet6_register_protosw(&mptcp_v6_protosw);
|
|
|
|
if (err)
|
|
|
|
proto_unregister(&mptcp_v6_prot);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
#endif
|