Tejun Heo 7b0888b7cc sched_ext: Implement core-sched support
The core-sched support is composed of the following parts:

- task_struct->scx.core_sched_at is added. This is a timestamp which can be
  used to order tasks. Depending on whether the BPF scheduler implements
  custom ordering, it tracks either global FIFO ordering of all tasks or
  local-DSQ ordering within the dispatched tasks on a CPU.

- prio_less() is updated to call scx_prio_less() when comparing SCX tasks.
  scx_prio_less() calls ops.core_sched_before() if available or uses the
  core_sched_at timestamp. For global FIFO ordering, the BPF scheduler
  doesn't need to do anything. Otherwise, it should implement
  ops.core_sched_before() which reflects the ordering.

- When core-sched is enabled, balance_scx() balances all SMT siblings so
  that they all have tasks dispatched if necessary before pick_task_scx() is
  called. pick_task_scx() picks between the current task and the first
  dispatched task on the local DSQ based on availability and the
  core_sched_at timestamps. Note that FIFO ordering is expected among the
  already dispatched tasks whether running or on the local DSQ, so this path
  always compares core_sched_at instead of calling into
  ops.core_sched_before().

qmap_core_sched_before() is added to scx_qmap. It scales the
distances from the heads of the queues to compare the tasks across different
priority queues and seems to behave as expected.

v3: Fixed build error when !CONFIG_SCHED_SMT reported by Andrea Righi.

v2: Sched core added the const qualifiers to prio_less task arguments.
    Explicitly drop them for ops.core_sched_before() task arguments. BPF
    enforces access control through the verifier, so the qualifier isn't
    actually operative and only gets in the way when interacting with
    various helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Reviewed-by: Josh Don <joshdon@google.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
2024-06-18 10:09:20 -10:00

132 lines
3.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include "scx_qmap.bpf.skel.h"
const char help_fmt[] =
"A simple five-level FIFO queue sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
" [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
" -S Suppress qmap-specific debug dump\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int dummy)
{
exit_req = 1;
}
int main(int argc, char **argv)
{
struct scx_qmap *skel;
struct bpf_link *link;
int opt;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:Spvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'e':
skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
break;
case 't':
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
break;
case 'T':
skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
break;
case 'l':
skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
break;
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
case 'd':
skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
if (skel->rodata->disallow_tgid < 0)
skel->rodata->disallow_tgid = getpid();
break;
case 'D':
skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
break;
case 'S':
skel->rodata->suppress_dump = true;
break;
case 'p':
skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
break;
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
while (!exit_req && !UEI_EXITED(skel, uei)) {
long nr_enqueued = skel->bss->nr_enqueued;
long nr_dispatched = skel->bss->nr_dispatched;
printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
skel->bss->nr_core_sched_execed);
fflush(stdout);
sleep(1);
}
bpf_link__destroy(link);
UEI_REPORT(skel, uei);
scx_qmap__destroy(skel);
/*
* scx_qmap implements ops.cpu_on/offline() and doesn't need to restart
* on CPU hotplug events.
*/
return 0;
}