linux-stable/tools/sched_ext/scx_qmap.c
David Vernet 245254f708 sched_ext: Implement sched_ext_ops.cpu_acquire/release()
Scheduler classes are strictly ordered and when a higher priority class has
tasks to run, the lower priority ones lose access to the CPU. Being able to
monitor and act on these events is necessary for use cases including
strict core-scheduling and latency management.

This patch adds two operations, ops.cpu_acquire() and ops.cpu_release(). The
former is invoked when a CPU becomes available to the BPF scheduler, and the
latter when a CPU is taken away from it. This patch also implements
scx_bpf_reenqueue_local() which can be called from .cpu_release() to trigger
requeueing of all tasks in the local dsq of the CPU so that the tasks can be
reassigned to other available CPUs.

scx_pair is updated to use .cpu_acquire/release() along with
%SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU
is preempted by a higher priority scheduler class.

scx_qmap is updated to use .cpu_acquire/release() to empty the local
dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers
that want tight control over latency.

v4: Use the new SCX_KICK_IDLE to wake up a CPU after re-enqueueing.

v3: Drop the const qualifier from scx_cpu_release_args.task. BPF enforces
    access control through the verifier, so the qualifier isn't actually
    operative and only gets in the way when interacting with various
    helpers.

v2: Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local()
    from ops.cpu_release() nested inside ops.init() and other sleepable
    operations.

Signed-off-by: David Vernet <dvernet@meta.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
2024-06-18 10:09:20 -10:00

127 lines
3.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include "scx_qmap.bpf.skel.h"
/*
 * Usage text printed for -h and for option-parsing errors. The single %s is
 * filled with the program's basename. Every option accepted by the getopt()
 * string in main() is listed in the synopsis, including -S.
 */
const char help_fmt[] =
"A simple five-level FIFO queue sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
" [-d PID] [-D LEN] [-S] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch up to COUNT tasks together\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
" -S Suppress qmap-specific debug dump\n"
" -p Switch only tasks on SCHED_EXT policy instead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
/* Set by -v; when false, libbpf debug-level messages are dropped. */
static bool verbose;
/*
 * Written from the signal handler and polled by the main loop.
 * volatile sig_atomic_t is the type the C standard guarantees can be safely
 * stored to from an asynchronous signal handler.
 */
static volatile sig_atomic_t exit_req;
/*
 * Print callback handed to libbpf_set_print(). Messages are forwarded to
 * stderr, except LIBBPF_DEBUG-level ones, which are discarded unless the
 * verbose flag is set. Returns vfprintf()'s result, or 0 when the message
 * is discarded.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
/*
 * Signal handler: records that termination was requested by raising
 * exit_req. The signal number itself is not used.
 */
static void sigint_handler(int dummy)
{
	(void)dummy;

	exit_req = 1;
}
/*
 * Userspace side of the qmap scheduler.
 *
 * Opens the scx_qmap skeleton, applies command-line overrides to it, loads
 * and attaches qmap_ops, and then prints counters from the skeleton's bss
 * once per second until SIGINT/SIGTERM is received or UEI_EXITED() becomes
 * true.
 *
 * Returns 0 after a normal shutdown or after -h, and 1 when getopt() reports
 * any other unhandled option character.
 */
int main(int argc, char **argv)
{
	struct scx_qmap *skel;
	struct bpf_link *link;
	int opt;

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);

	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);

	/* All overrides are applied between open and load. */
	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:Spvh")) != -1) {
		switch (opt) {
		case 's':
			/* SLICE_US is scaled by 1000 before being stored in slice_ns. */
			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
			break;
		case 'e':
			skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
			break;
		case 't':
			skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
			break;
		case 'T':
			skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
			break;
		case 'l':
			skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
			break;
		case 'b':
			skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
			break;
		case 'd':
			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
			/* A negative PID selects this process itself. */
			if (skel->rodata->disallow_tgid < 0)
				skel->rodata->disallow_tgid = getpid();
			break;
		case 'D':
			skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
			break;
		case 'S':
			skel->rodata->suppress_dump = true;
			break;
		case 'p':
			skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
			break;
		case 'v':
			verbose = true;
			break;
		default:
			/* -h and parse errors both print usage; only -h exits with 0. */
			fprintf(stderr, help_fmt, basename(argv[0]));
			return opt != 'h';
		}
	}

	SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
	link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);

	while (!exit_req && !UEI_EXITED(skel, uei)) {
		long nr_enqueued = skel->bss->nr_enqueued;
		long nr_dispatched = skel->bss->nr_dispatched;

		/*
		 * The two counters are held in signed longs here, so they are
		 * printed with %ld to match the argument type.
		 */
		printf("stats : enq=%ld dsp=%ld delta=%ld reenq=%"PRIu64" deq=%"PRIu64"\n",
		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued);
		fflush(stdout);
		sleep(1);
	}

	bpf_link__destroy(link);
	UEI_REPORT(skel, uei);
	scx_qmap__destroy(skel);
	return 0;
}