From ec24b988eb26e21f37707d090ec3ab53c51fd386 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Jul 2024 12:20:36 +0200 Subject: [PATCH 01/15] um: remove variable stack array in os_rcv_fd_msg() When generalizing this, I was in the mindset of this being "userspace" code, but even there we should not use variable arrays as the kernel is moving away from allowing that. Simply reserve (but not use) enough space for the maximum two descriptors we might need now, and return an error if attempting to receive more than that. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407041459.3SYg4TEi-lkp@intel.com/ Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/os-Linux/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 5adf8f630049..f1d03cf3957f 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -528,7 +528,8 @@ int os_shutdown_socket(int fd, int r, int w) ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, void *data, size_t data_len) { - char buf[CMSG_SPACE(sizeof(*fds) * n_fds)]; +#define MAX_RCV_FDS 2 + char buf[CMSG_SPACE(sizeof(*fds) * MAX_RCV_FDS)]; struct cmsghdr *cmsg; struct iovec iov = { .iov_base = data, @@ -538,10 +539,13 @@ ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, .msg_iov = &iov, .msg_iovlen = 1, .msg_control = buf, - .msg_controllen = sizeof(buf), + .msg_controllen = CMSG_SPACE(sizeof(*fds) * n_fds), }; int n; + if (n_fds > MAX_RCV_FDS) + return -EINVAL; + n = recvmsg(fd, &msg, 0); if (n < 0) return -errno; From 612a8c8e0b43ba7e3d0e51f6f76a5fec4912d439 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Fri, 5 Jul 2024 11:53:31 +0100 Subject: [PATCH 02/15] um: vector: Replace locks guarding queue depth with atomics UML vector drivers use ring buffer structures which map preallocated skbs onto mmsg vectors for use with sendmmsg and recvmmsg. They are designed around a single consumer, single producer pattern allowing simultaneous enqueue and dequeue. Lock debugging with preemption showed possible races when locking the queue depth. This patch addresses this by removing extra locks, adding barriers and making queue depth inc/dec and access atomic. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_kern.c | 208 +++++++++++++++++----------------- arch/um/drivers/vector_kern.h | 4 +- 2 files changed, 110 insertions(+), 102 deletions(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 2d473282ab51..0d7e7a25b674 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -102,18 +103,33 @@ static const struct { static void vector_reset_stats(struct vector_private *vp) { + /* We reuse the existing queue locks for stats */ + + /* RX stats are modified with RX head_lock held + * in vector_poll. + */ + + spin_lock(&vp->rx_queue->head_lock); vp->estats.rx_queue_max = 0; vp->estats.rx_queue_running_average = 0; - vp->estats.tx_queue_max = 0; - vp->estats.tx_queue_running_average = 0; vp->estats.rx_encaps_errors = 0; + vp->estats.sg_ok = 0; + vp->estats.sg_linearized = 0; + spin_unlock(&vp->rx_queue->head_lock); + + /* TX stats are modified with TX head_lock held + * in vector_send. + */ + + spin_lock(&vp->tx_queue->head_lock); vp->estats.tx_timeout_count = 0; vp->estats.tx_restart_queue = 0; vp->estats.tx_kicks = 0; vp->estats.tx_flow_control_xon = 0; vp->estats.tx_flow_control_xoff = 0; - vp->estats.sg_ok = 0; - vp->estats.sg_linearized = 0; + vp->estats.tx_queue_max = 0; + vp->estats.tx_queue_running_average = 0; + spin_unlock(&vp->tx_queue->head_lock); } static int get_mtu(struct arglist *def) @@ -232,12 +248,6 @@ static int get_transport_options(struct arglist *def) static char *drop_buffer; -/* Array backed queues optimized for bulk enqueue/dequeue and - * 1:N (small values of N) or 1:1 enqueuer/dequeuer ratios. - * For more details and full design rationale see - * http://foswiki.cambridgegreys.com/Main/EatYourTailAndEnjoyIt - */ - /* * Advance the mmsg queue head by n = advance. Resets the queue to @@ -247,27 +257,13 @@ static char *drop_buffer; static int vector_advancehead(struct vector_queue *qi, int advance) { - int queue_depth; - qi->head = (qi->head + advance) % qi->max_depth; - spin_lock(&qi->tail_lock); - qi->queue_depth -= advance; - - /* we are at 0, use this to - * reset head and tail so we can use max size vectors - */ - - if (qi->queue_depth == 0) { - qi->head = 0; - qi->tail = 0; - } - queue_depth = qi->queue_depth; - spin_unlock(&qi->tail_lock); - return queue_depth; + atomic_sub(advance, &qi->queue_depth); + return atomic_read(&qi->queue_depth); } /* Advance the queue tail by n = advance. @@ -277,16 +273,11 @@ static int vector_advancehead(struct vector_queue *qi, int advance) static int vector_advancetail(struct vector_queue *qi, int advance) { - int queue_depth; - qi->tail = (qi->tail + advance) % qi->max_depth; - spin_lock(&qi->head_lock); - qi->queue_depth += advance; - queue_depth = qi->queue_depth; - spin_unlock(&qi->head_lock); - return queue_depth; + atomic_add(advance, &qi->queue_depth); + return atomic_read(&qi->queue_depth); } static int prep_msg(struct vector_private *vp, @@ -339,9 +330,7 @@ static int vector_enqueue(struct vector_queue *qi, struct sk_buff *skb) int iov_count; spin_lock(&qi->tail_lock); - spin_lock(&qi->head_lock); - queue_depth = qi->queue_depth; - spin_unlock(&qi->head_lock); + queue_depth = atomic_read(&qi->queue_depth); if (skb) packet_len = skb->len; @@ -360,6 +349,7 @@ static int vector_enqueue(struct vector_queue *qi, struct sk_buff *skb) mmsg_vector->msg_hdr.msg_iovlen = iov_count; mmsg_vector->msg_hdr.msg_name = vp->fds->remote_addr; mmsg_vector->msg_hdr.msg_namelen = vp->fds->remote_addr_size; + wmb(); /* Make the packet visible to the NAPI poll thread */ queue_depth = vector_advancetail(qi, 1); } else goto drop; @@ -398,7 +388,7 @@ static int consume_vector_skbs(struct vector_queue *qi, int count) } /* - * Generic vector deque via sendmmsg with support for forming headers + * Generic vector dequeue via sendmmsg with support for forming headers * using transport specific callback. Allows GRE, L2TPv3, RAW and * other transports to use a common dequeue procedure in vector mode */ @@ -408,69 +398,64 @@ static int vector_send(struct vector_queue *qi) { struct vector_private *vp = netdev_priv(qi->dev); struct mmsghdr *send_from; - int result = 0, send_len, queue_depth = qi->max_depth; + int result = 0, send_len; if (spin_trylock(&qi->head_lock)) { - if (spin_trylock(&qi->tail_lock)) { - /* update queue_depth to current value */ - queue_depth = qi->queue_depth; - spin_unlock(&qi->tail_lock); - while (queue_depth > 0) { - /* Calculate the start of the vector */ - send_len = queue_depth; - send_from = qi->mmsg_vector; - send_from += qi->head; - /* Adjust vector size if wraparound */ - if (send_len + qi->head > qi->max_depth) - send_len = qi->max_depth - qi->head; - /* Try to TX as many packets as possible */ - if (send_len > 0) { - result = uml_vector_sendmmsg( - vp->fds->tx_fd, - send_from, - send_len, - 0 - ); - vp->in_write_poll = - (result != send_len); - } - /* For some of the sendmmsg error scenarios - * we may end being unsure in the TX success - * for all packets. It is safer to declare - * them all TX-ed and blame the network. + /* update queue_depth to current value */ + while (atomic_read(&qi->queue_depth) > 0) { + /* Calculate the start of the vector */ + send_len = atomic_read(&qi->queue_depth); + send_from = qi->mmsg_vector; + send_from += qi->head; + /* Adjust vector size if wraparound */ + if (send_len + qi->head > qi->max_depth) + send_len = qi->max_depth - qi->head; + /* Try to TX as many packets as possible */ + if (send_len > 0) { + result = uml_vector_sendmmsg( + vp->fds->tx_fd, + send_from, + send_len, + 0 + ); + vp->in_write_poll = + (result != send_len); + } + /* For some of the sendmmsg error scenarios + * we may end being unsure in the TX success + * for all packets. It is safer to declare + * them all TX-ed and blame the network. + */ + if (result < 0) { + if (net_ratelimit()) + netdev_err(vp->dev, "sendmmsg err=%i\n", + result); + vp->in_error = true; + result = send_len; + } + if (result > 0) { + consume_vector_skbs(qi, result); + /* This is equivalent to an TX IRQ. + * Restart the upper layers to feed us + * more packets. */ - if (result < 0) { - if (net_ratelimit()) - netdev_err(vp->dev, "sendmmsg err=%i\n", - result); - vp->in_error = true; - result = send_len; - } - if (result > 0) { - queue_depth = - consume_vector_skbs(qi, result); - /* This is equivalent to an TX IRQ. - * Restart the upper layers to feed us - * more packets. - */ - if (result > vp->estats.tx_queue_max) - vp->estats.tx_queue_max = result; - vp->estats.tx_queue_running_average = - (vp->estats.tx_queue_running_average + result) >> 1; - } - netif_wake_queue(qi->dev); - /* if TX is busy, break out of the send loop, - * poll write IRQ will reschedule xmit for us - */ - if (result != send_len) { - vp->estats.tx_restart_queue++; - break; - } + if (result > vp->estats.tx_queue_max) + vp->estats.tx_queue_max = result; + vp->estats.tx_queue_running_average = + (vp->estats.tx_queue_running_average + result) >> 1; + } + netif_wake_queue(qi->dev); + /* if TX is busy, break out of the send loop, + * poll write IRQ will reschedule xmit for us. + */ + if (result != send_len) { + vp->estats.tx_restart_queue++; + break; } } spin_unlock(&qi->head_lock); } - return queue_depth; + return atomic_read(&qi->queue_depth); } /* Queue destructor. Deliberately stateless so we can use @@ -589,7 +574,7 @@ static struct vector_queue *create_queue( } spin_lock_init(&result->head_lock); spin_lock_init(&result->tail_lock); - result->queue_depth = 0; + atomic_set(&result->queue_depth, 0); result->head = 0; result->tail = 0; return result; @@ -668,18 +653,27 @@ static struct sk_buff *prep_skb( } -/* Prepare queue for recvmmsg one-shot rx - fill with fresh sk_buffs*/ +/* Prepare queue for recvmmsg one-shot rx - fill with fresh sk_buffs */ static void prep_queue_for_rx(struct vector_queue *qi) { struct vector_private *vp = netdev_priv(qi->dev); struct mmsghdr *mmsg_vector = qi->mmsg_vector; void **skbuff_vector = qi->skbuff_vector; - int i; + int i, queue_depth; - if (qi->queue_depth == 0) + queue_depth = atomic_read(&qi->queue_depth); + + if (queue_depth == 0) return; - for (i = 0; i < qi->queue_depth; i++) { + + /* RX is always emptied 100% during each cycle, so we do not + * have to do the tail wraparound math for it. + */ + + qi->head = qi->tail = 0; + + for (i = 0; i < queue_depth; i++) { /* it is OK if allocation fails - recvmmsg with NULL data in * iov argument still performs an RX, just drops the packet * This allows us stop faffing around with a "drop buffer" @@ -689,7 +683,7 @@ static void prep_queue_for_rx(struct vector_queue *qi) skbuff_vector++; mmsg_vector++; } - qi->queue_depth = 0; + atomic_set(&qi->queue_depth, 0); } static struct vector_device *find_device(int n) @@ -985,7 +979,7 @@ static int vector_mmsg_rx(struct vector_private *vp, int budget) * many do we need to prep the next time prep_queue_for_rx() is called. */ - qi->queue_depth = packet_count; + atomic_add(packet_count, &qi->queue_depth); for (i = 0; i < packet_count; i++) { skb = (*skbuff_vector); @@ -1172,6 +1166,7 @@ static int vector_poll(struct napi_struct *napi, int budget) if ((vp->options & VECTOR_TX) != 0) tx_enqueued = (vector_send(vp->tx_queue) > 0); + spin_lock(&vp->rx_queue->head_lock); if ((vp->options & VECTOR_RX) > 0) err = vector_mmsg_rx(vp, budget); else { @@ -1179,6 +1174,7 @@ static int vector_poll(struct napi_struct *napi, int budget) if (err > 0) err = 1; } + spin_unlock(&vp->rx_queue->head_lock); if (err > 0) work_done += err; @@ -1225,7 +1221,7 @@ static int vector_net_open(struct net_device *dev) vp->rx_header_size, MAX_IOV_SIZE ); - vp->rx_queue->queue_depth = get_depth(vp->parsed); + atomic_set(&vp->rx_queue->queue_depth, get_depth(vp->parsed)); } else { vp->header_rxbuffer = kmalloc( vp->rx_header_size, @@ -1467,7 +1463,17 @@ static void vector_get_ethtool_stats(struct net_device *dev, { struct vector_private *vp = netdev_priv(dev); + /* Stats are modified in the dequeue portions of + * rx/tx which are protected by the head locks + * grabbing these locks here ensures they are up + * to date. + */ + + spin_lock(&vp->tx_queue->head_lock); + spin_lock(&vp->rx_queue->head_lock); memcpy(tmp_stats, &vp->estats, sizeof(struct vector_estats)); + spin_unlock(&vp->rx_queue->head_lock); + spin_unlock(&vp->tx_queue->head_lock); } static int vector_get_coalesce(struct net_device *netdev, diff --git a/arch/um/drivers/vector_kern.h b/arch/um/drivers/vector_kern.h index 806df551be0b..417834793658 100644 --- a/arch/um/drivers/vector_kern.h +++ b/arch/um/drivers/vector_kern.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "vector_user.h" @@ -44,7 +45,8 @@ struct vector_queue { struct net_device *dev; spinlock_t head_lock; spinlock_t tail_lock; - int queue_depth, head, tail, max_depth, max_iov_frags; + atomic_t queue_depth; + int head, tail, max_depth, max_iov_frags; short options; }; From 671cd5eed9db3415b42826747114a330bc303ae9 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Sat, 6 Jul 2024 10:12:00 +0100 Subject: [PATCH 03/15] um: vector: Fix NAPI budget handling Fix the handling of NAPI budget. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 0d7e7a25b674..c992da83268d 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -966,7 +966,7 @@ static int vector_mmsg_rx(struct vector_private *vp, int budget) budget = qi->max_depth; packet_count = uml_vector_recvmmsg( - vp->fds->rx_fd, qi->mmsg_vector, qi->max_depth, 0); + vp->fds->rx_fd, qi->mmsg_vector, budget, 0); if (packet_count < 0) vp->in_error = true; @@ -1180,7 +1180,7 @@ static int vector_poll(struct napi_struct *napi, int budget) if (tx_enqueued || err > 0) napi_schedule(napi); - if (work_done < budget) + if (work_done <= budget) napi_complete_done(napi, work_done); return work_done; } From 64dcf0b8779363ca07dfb5649a4cc71f9fdf390b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Jul 2024 14:24:56 +0200 Subject: [PATCH 04/15] um: remove ARCH_NO_PREEMPT_DYNAMIC There's no such symbol and we currently don't have any of the mechanisms to make boot-time selection cheap enough, so we can't have HAVE_PREEMPT_DYNAMIC_CALL or HAVE_PREEMPT_DYNAMIC_KEY. Remove the select statement. Reported-by: Lukas Bulwahn Fixes: cd01672d64a3 ("um: Enable preemption in UML") Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index dca84fd6d00a..c89575d05021 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -11,7 +11,6 @@ config UML select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER - select ARCH_NO_PREEMPT_DYNAMIC select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN From ab1d5895cf6cdd1df4b6ce0ac8763828e2bf7e62 Mon Sep 17 00:00:00 2001 From: Renzo Davoli Date: Tue, 30 Jul 2024 15:54:54 +0200 Subject: [PATCH 05/15] vector_user: add VDE support This is the actual implementation of VDE support as a vector transport. Signed-off-by: Renzo Davoli Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_user.c | 83 +++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index b16a5e5619d3..2ea67e6fd067 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -46,6 +46,9 @@ #define TRANS_FD "fd" #define TRANS_FD_LEN strlen(TRANS_FD) +#define TRANS_VDE "vde" +#define TRANS_VDE_LEN strlen(TRANS_VDE) + #define VNET_HDR_FAIL "could not enable vnet headers on fd %d" #define TUN_GET_F_FAIL "tapraw: TUNGETFEATURES failed: %s" #define L2TPV3_BIND_FAIL "l2tpv3_open : could not bind socket err=%i" @@ -434,6 +437,84 @@ static struct vector_fds *user_init_fd_fds(struct arglist *ifspec) return NULL; } +/* enough char to store an int type */ +#define ENOUGH(type) ((CHAR_BIT * sizeof(type) - 1) / 3 + 2) +#define ENOUGH_OCTAL(type) ((CHAR_BIT * sizeof(type) + 2) / 3) +/* vde_plug --descr xx --port2 xx --mod2 xx --group2 xx seqpacket://NN vnl (NULL) */ +#define VDE_MAX_ARGC 12 +#define VDE_SEQPACKET_HEAD "seqpacket://" +#define VDE_SEQPACKET_HEAD_LEN (sizeof(VDE_SEQPACKET_HEAD) - 1) +#define VDE_DEFAULT_DESCRIPTION "UML" + +static struct vector_fds *user_init_vde_fds(struct arglist *ifspec) +{ + char seqpacketvnl[VDE_SEQPACKET_HEAD_LEN + ENOUGH(int) + 1]; + char *argv[VDE_MAX_ARGC] = {"vde_plug"}; + int argc = 1; + int rv; + int sv[2]; + struct vector_fds *result = NULL; + + char *vnl = uml_vector_fetch_arg(ifspec,"vnl"); + char *descr = uml_vector_fetch_arg(ifspec,"descr"); + char *port = uml_vector_fetch_arg(ifspec,"port"); + char *mode = uml_vector_fetch_arg(ifspec,"mode"); + char *group = uml_vector_fetch_arg(ifspec,"group"); + if (descr == NULL) descr = VDE_DEFAULT_DESCRIPTION; + + argv[argc++] = "--descr"; + argv[argc++] = descr; + if (port != NULL) { + argv[argc++] = "--port2"; + argv[argc++] = port; + } + if (mode != NULL) { + argv[argc++] = "--mod2"; + argv[argc++] = mode; + } + if (group != NULL) { + argv[argc++] = "--group2"; + argv[argc++] = group; + } + argv[argc++] = seqpacketvnl; + argv[argc++] = vnl; + argv[argc++] = NULL; + + rv = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv); + if (rv < 0) { + printk(UM_KERN_ERR "vde: seqpacket socketpair err %d", -errno); + return NULL; + } + rv = os_set_exec_close(sv[0]); + if (rv < 0) { + printk(UM_KERN_ERR "vde: seqpacket socketpair cloexec err %d", -errno); + goto vde_cleanup_sv; + } + snprintf(seqpacketvnl, sizeof(seqpacketvnl), VDE_SEQPACKET_HEAD "%d", sv[1]); + + run_helper(NULL, NULL, argv); + + close(sv[1]); + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "fd open: allocation failed"); + goto vde_cleanup; + } + + result->rx_fd = sv[0]; + result->tx_fd = sv[0]; + result->remote_addr_size = 0; + result->remote_addr = NULL; + return result; + +vde_cleanup_sv: + close(sv[1]); +vde_cleanup: + close(sv[0]); + return NULL; +} + static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) { int rxfd = -1, txfd = -1; @@ -673,6 +754,8 @@ struct vector_fds *uml_vector_user_open( return user_init_unix_fds(parsed, ID_BESS); if (strncmp(transport, TRANS_FD, TRANS_FD_LEN) == 0) return user_init_fd_fds(parsed); + if (strncmp(transport, TRANS_VDE, TRANS_VDE_LEN) == 0) + return user_init_vde_fds(parsed); return NULL; } From cccf19f8b568a4edabb16a998a81bc5d4c4c7e01 Mon Sep 17 00:00:00 2001 From: Renzo Davoli Date: Tue, 30 Jul 2024 15:55:04 +0200 Subject: [PATCH 06/15] user_mode_linux_howto_v2: add VDE vector support in doc Add a description of the VDE vector transport in user_mode_linux_howto_v2.rst. Signed-off-by: Renzo Davoli Signed-off-by: Renzo Davoi Acked-by: Randy Dunlap Signed-off-by: Richard Weinberger --- .../virt/uml/user_mode_linux_howto_v2.rst | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Documentation/virt/uml/user_mode_linux_howto_v2.rst b/Documentation/virt/uml/user_mode_linux_howto_v2.rst index 27942446f406..584000b743f3 100644 --- a/Documentation/virt/uml/user_mode_linux_howto_v2.rst +++ b/Documentation/virt/uml/user_mode_linux_howto_v2.rst @@ -217,6 +217,8 @@ remote UML and other VM instances. +-----------+--------+------------------------------------+------------+ | fd | vector | dependent on fd type | varies | +-----------+--------+------------------------------------+------------+ +| vde | vector | dep. on VDE VPN: Virt.Net Locator | varies | ++-----------+--------+------------------------------------+------------+ | tuntap | legacy | none | ~ 500Mbit | +-----------+--------+------------------------------------+------------+ | daemon | legacy | none | ~ 450Mbit | @@ -573,6 +575,41 @@ https://github.com/NetSys/bess/wiki/Built-In-Modules-and-Ports BESS transport does not require any special privileges. +VDE vector transport +-------------------- + +Virtual Distributed Ethernet (VDE) is a project whose main goal is to provide a +highly flexible support for virtual networking. + +http://wiki.virtualsquare.org/#/tutorials/vdebasics + +Common usages of VDE include fast prototyping and teaching. + +Examples: + + ``vecX:transport=vde,vnl=tap://tap0`` + +use tap0 + + ``vecX:transport=vde,vnl=slirp://`` + +use slirp + + ``vec0:transport=vde,vnl=vde:///tmp/switch`` + +connect to a vde switch + + ``vecX:transport=\"vde,vnl=cmd://ssh remote.host //tmp/sshlirp\"`` + +connect to a remote slirp (instant VPN: convert ssh to VPN, it uses sshlirp) +https://github.com/virtualsquare/sshlirp + + ``vec0:transport=vde,vnl=vxvde://234.0.0.1`` + +connect to a local area cloud (all the UML nodes using the same +multicast address running on hosts in the same multicast domain (LAN) +will be automagically connected together to a virtual LAN. + Configuring Legacy transports ============================= From 2df8c8d118c750054fdf2c047d2eb3c0ed854dc7 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Sat, 24 Aug 2024 20:04:49 +0800 Subject: [PATCH 07/15] um: Remove obsoleted declaration for execute_syscall_skas The execute_syscall_skas() have been removed since commit e32dacb9f481 ("[PATCH] uml: system call path cleanup"), and now it is useless, so remove it. Signed-off-by: Gaosheng Cui Reviewed-by: Geert Uytterhoeven Signed-off-by: Richard Weinberger --- arch/um/include/shared/skas/skas.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index ebaa116de30b..854cff8d6319 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -13,7 +13,6 @@ extern int userspace_pid[]; extern int user_thread(unsigned long stack, int flags); extern void new_thread_handler(void); extern void handle_syscall(struct uml_pt_regs *regs); -extern long execute_syscall_skas(void *r); extern unsigned long current_stub_stack(void); extern struct mm_id *current_mm_id(void); extern void current_mm_sync(void); From 2fcd16fbab9f448c7174bf4c3eeda53ef84e28ee Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 26 Aug 2024 18:08:09 +0800 Subject: [PATCH 08/15] um: Remove unused kpte_clear_flush macro This macro has no users, and __flush_tlb_one doesn't exist either. Signed-off-by: Tiwei Bie Signed-off-by: Richard Weinberger --- arch/um/include/asm/pgtable.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 5bb397b65efb..83373c9963e7 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -359,11 +359,4 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } -/* Clear a kernel PTE and flush it from the TLB */ -#define kpte_clear_flush(ptep, vaddr) \ -do { \ - pte_clear(&init_mm, (vaddr), (ptep)); \ - __flush_tlb_one((vaddr)); \ -} while (0) - #endif From 669afa4e8715c5730fb353166f9aaaa14d4fed64 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 26 Aug 2024 18:08:10 +0800 Subject: [PATCH 09/15] um: Remove the redundant newpage check in update_pte_range The two checks have been identical since commit ef714f15027c ("um: remove force_flush_all from fork_handler"). And the inner one isn't necessary anymore. Signed-off-by: Tiwei Bie Signed-off-by: Richard Weinberger --- arch/um/kernel/tlb.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 44c6fc697f3a..548af31d4111 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -82,16 +82,12 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, (x ? UM_PROT_EXEC : 0)); if (pte_newpage(*pte)) { if (pte_present(*pte)) { - if (pte_newpage(*pte)) { - __u64 offset; - unsigned long phys = - pte_val(*pte) & PAGE_MASK; - int fd = phys_mapping(phys, &offset); + __u64 offset; + unsigned long phys = pte_val(*pte) & PAGE_MASK; + int fd = phys_mapping(phys, &offset); - ret = ops->mmap(ops->mm_idp, addr, - PAGE_SIZE, prot, fd, - offset); - } + ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, + prot, fd, offset); } else ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); } else if (pte_newprot(*pte)) From 94090f418fc80c50ca7ea3f8a6d7ff547260a801 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 26 Aug 2024 18:08:11 +0800 Subject: [PATCH 10/15] um: Remove unused fields from thread_struct These fields are no longer used since the removal of tt mode. Signed-off-by: Tiwei Bie Signed-off-by: Richard Weinberger --- arch/um/include/asm/processor-generic.h | 20 +++++--------------- arch/um/kernel/process.c | 8 ++++---- arch/um/kernel/skas/process.c | 4 ++-- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 5a7c05275aa7..bce4595798da 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -28,20 +28,10 @@ struct thread_struct { struct arch_thread arch; jmp_buf switch_buf; struct { - int op; - union { - struct { - int pid; - } fork, exec; - struct { - int (*proc)(void *); - void *arg; - } thread; - struct { - void (*proc)(void *); - void *arg; - } cb; - } u; + struct { + int (*proc)(void *); + void *arg; + } thread; } request; }; @@ -51,7 +41,7 @@ struct thread_struct { .fault_addr = NULL, \ .prev_sched = NULL, \ .arch = INIT_ARCH_THREAD, \ - .request = { 0 } \ + .request = { } \ } /* diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index f36b63f53bab..be2856af6d4c 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -109,8 +109,8 @@ void new_thread_handler(void) schedule_tail(current->thread.prev_sched); current->thread.prev_sched = NULL; - fn = current->thread.request.u.thread.proc; - arg = current->thread.request.u.thread.arg; + fn = current->thread.request.thread.proc; + arg = current->thread.request.thread.arg; /* * callback returns only if the kernel thread execs a process @@ -158,8 +158,8 @@ int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) arch_copy_thread(¤t->thread.arch, &p->thread.arch); } else { get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); - p->thread.request.u.thread.proc = args->fn; - p->thread.request.u.thread.arg = args->fn_arg; + p->thread.request.thread.proc = args->fn; + p->thread.request.thread.arg = args->fn_arg; handler = new_thread_handler; } diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 5f9c1c5f36e2..68657988c8d1 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -39,8 +39,8 @@ int __init start_uml(void) init_new_thread_signals(); - init_task.thread.request.u.thread.proc = start_kernel_proc; - init_task.thread.request.u.thread.arg = NULL; + init_task.thread.request.thread.proc = start_kernel_proc; + init_task.thread.request.thread.arg = NULL; return start_idle_thread(task_stack_page(&init_task), &init_task.thread.switch_buf); } From 59376fb2a71b81dfc018db6fe9b6fa9cd3f41ce7 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 26 Aug 2024 18:08:12 +0800 Subject: [PATCH 11/15] um: Remove unused mm_fd field from mm_id It's no longer used since the removal of the SKAS3/4 support. Signed-off-by: Tiwei Bie Signed-off-by: Richard Weinberger --- arch/um/include/shared/skas/mm_id.h | 5 +---- arch/um/kernel/reboot.c | 2 +- arch/um/kernel/skas/mmu.c | 12 ++++++------ arch/um/kernel/time.c | 2 +- arch/um/os-Linux/skas/mem.c | 2 +- arch/um/os-Linux/skas/process.c | 2 +- 6 files changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index 1e76ba40feba..140388c282f6 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -7,10 +7,7 @@ #define __MM_ID_H struct mm_id { - union { - int mm_fd; - int pid; - } u; + int pid; unsigned long stack; int syscall_data_len; }; diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 3736bca626ba..680bce4bd8fa 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -29,7 +29,7 @@ static void kill_off_processes(void) t = find_lock_task_mm(p); if (!t) continue; - pid = t->mm->context.id.u.pid; + pid = t->mm->context.id.pid; task_unlock(t); os_kill_ptraced_process(pid, 1); } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 47f98d87ea3c..886ed5e65674 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -32,11 +32,11 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) new_id->stack = stack; block_signals_trace(); - new_id->u.pid = start_userspace(stack); + new_id->pid = start_userspace(stack); unblock_signals_trace(); - if (new_id->u.pid < 0) { - ret = new_id->u.pid; + if (new_id->pid < 0) { + ret = new_id->pid; goto out_free; } @@ -83,12 +83,12 @@ void destroy_context(struct mm_struct *mm) * whole UML suddenly dying. Also, cover negative and * 1 cases, since they shouldn't happen either. */ - if (mmu->id.u.pid < 2) { + if (mmu->id.pid < 2) { printk(KERN_ERR "corrupt mm_context - pid = %d\n", - mmu->id.u.pid); + mmu->id.pid); return; } - os_kill_ptraced_process(mmu->id.u.pid, 1); + os_kill_ptraced_process(mmu->id.pid, 1); free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 47b9f5e63566..29b27b90581f 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -839,7 +839,7 @@ static irqreturn_t um_timer(int irq, void *dev) if (get_current()->mm != NULL) { /* userspace - relay signal, results in correct userspace timers */ - os_alarm_process(get_current()->mm->context.id.u.pid); + os_alarm_process(get_current()->mm->context.id.pid); } (*timer_clockevent.event_handler)(&timer_clockevent); diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index c55430775efd..9a13ac23c606 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -78,7 +78,7 @@ static inline long do_syscall_stub(struct mm_id *mm_idp) { struct stub_data *proc_data = (void *)mm_idp->stack; int n, i; - int err, pid = mm_idp->u.pid; + int err, pid = mm_idp->pid; n = ptrace_setregs(pid, syscall_regs); if (n < 0) { diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index f7088345b3fc..b6f656bcffb1 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -588,5 +588,5 @@ void reboot_skas(void) void __switch_mm(struct mm_id *mm_idp) { - userspace_pid[0] = mm_idp->u.pid; + userspace_pid[0] = mm_idp->pid; } From bf67dbf4f7c0fe61e4e94b30f5913ca1c539f433 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 26 Aug 2024 18:08:13 +0800 Subject: [PATCH 12/15] um: Remove the call to SUBARCH_EXECVE1 macro This macro has never been defined by any supported sub-architectures in tree since it was introduced by commit 1d3468a6643a ("[PATCH uml: move _kern.c files"). Signed-off-by: Tiwei Bie Signed-off-by: Richard Weinberger --- arch/um/kernel/exec.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 2c15bb2c104c..cb8b5cd9285c 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -35,8 +35,5 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; clear_thread_flag(TIF_SINGLESTEP); -#ifdef SUBARCH_EXECVE1 - SUBARCH_EXECVE1(regs->regs); -#endif } EXPORT_SYMBOL(start_thread); From fe6abeba24996f826473630b2054699828fd9f18 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 26 Aug 2024 18:08:14 +0800 Subject: [PATCH 13/15] um: Remove the declaration of user_thread function This function has never been defined since its declaration was introduced by commit 1da177e4c3f4 ("Linux-2.6.12-rc2"). Signed-off-by: Tiwei Bie Signed-off-by: Richard Weinberger --- arch/um/include/shared/skas/skas.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index 854cff8d6319..85c50122ab98 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -10,7 +10,6 @@ extern int userspace_pid[]; -extern int user_thread(unsigned long stack, int flags); extern void new_thread_handler(void); extern void handle_syscall(struct uml_pt_regs *regs); extern unsigned long current_stub_stack(void); From ae0dc67c2512e09fee26226e1b2d78b82ebebf66 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 26 Aug 2024 18:08:15 +0800 Subject: [PATCH 14/15] um: Remove outdated asm/sysrq.h header This header no longer serves a purpose after show_trace was removed by commit 9d1ee8ce92e1 ("um: Rewrite show_stack()"). Signed-off-by: Tiwei Bie Signed-off-by: Richard Weinberger --- arch/um/include/asm/sysrq.h | 8 -------- arch/um/kernel/sysrq.c | 1 - arch/x86/um/sysrq_32.c | 1 - arch/x86/um/sysrq_64.c | 1 - 4 files changed, 11 deletions(-) delete mode 100644 arch/um/include/asm/sysrq.h diff --git a/arch/um/include/asm/sysrq.h b/arch/um/include/asm/sysrq.h deleted file mode 100644 index 8fc8c65cd357..000000000000 --- a/arch/um/include/asm/sysrq.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SYSRQ_H -#define __UM_SYSRQ_H - -struct task_struct; -extern void show_trace(struct task_struct* task, unsigned long *stack); - -#endif diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 746715379f12..4bb8622dc512 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c index f2383484840d..a1ee415c008d 100644 --- a/arch/x86/um/sysrq_32.c +++ b/arch/x86/um/sysrq_32.c @@ -9,7 +9,6 @@ #include #include #include -#include /* This is declared by */ void show_regs(struct pt_regs *regs) diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index 0bf6de40abff..340d8a243c8a 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -12,7 +12,6 @@ #include #include #include -#include void show_regs(struct pt_regs *regs) { From 381d2f95c8aa575d5d42bf1fe0ea9a70c4bec0cf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 27 Aug 2024 16:05:01 +0200 Subject: [PATCH 15/15] um: fix time-travel syscall scheduling hack The schedule() call there really never did anything at least since the introduction of the EEVDF scheduler, but now I found a case where we permanently hang in a loop of -ERESTARTNOINTR (due to locking.) Work around it by making any syscalls with error return take time (and then schedule after) so we cannot hang in such a loop forever. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/skas/syscall.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 9ee19e566da3..b09e85279d2b 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -12,23 +12,13 @@ #include #include #include +#include void handle_syscall(struct uml_pt_regs *r) { struct pt_regs *regs = container_of(r, struct pt_regs, regs); int syscall; - /* - * If we have infinite CPU resources, then make every syscall also a - * preemption point, since we don't have any other preemption in this - * case, and kernel threads would basically never run until userspace - * went to sleep, even if said userspace interacts with the kernel in - * various ways. - */ - if (time_travel_mode == TT_MODE_INFCPU || - time_travel_mode == TT_MODE_EXTERNAL) - schedule(); - /* Initialize the syscall number and default return value. */ UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); @@ -41,9 +31,25 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); - if (syscall >= 0 && syscall < __NR_syscalls) - PT_REGS_SET_SYSCALL_RETURN(regs, - EXECUTE_SYSCALL(syscall, regs)); + if (syscall >= 0 && syscall < __NR_syscalls) { + unsigned long ret = EXECUTE_SYSCALL(syscall, regs); + + PT_REGS_SET_SYSCALL_RETURN(regs, ret); + + /* + * An error value here can be some form of -ERESTARTSYS + * and then we'd just loop. Make any error syscalls take + * some time, so that it won't just loop if something is + * not ready, and hopefully other things will make some + * progress. + */ + if (IS_ERR_VALUE(ret) && + (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL)) { + um_udelay(1); + schedule(); + } + } out: syscall_trace_leave(regs);