linux-stable/tools/sched_ext/scx_qmap.c
Tejun Heo 07814a9439 sched_ext: Print debug dump after an error exit
If a BPF scheduler triggers an error, the scheduler is aborted and the
system is reverted to the built-in scheduler. In the process, a lot of
information which may be useful for figuring out what happened can be lost.

This patch adds debug dump which captures information which may be useful
for debugging including runqueue and runnable thread states at the time of
failure. The following shows a debug dump after triggering the watchdog:

  root@test ~# os/work/tools/sched_ext/build/bin/scx_qmap -t 100
  stats  : enq=1 dsp=0 delta=1 deq=0
  stats  : enq=90 dsp=90 delta=0 deq=0
  stats  : enq=156 dsp=156 delta=0 deq=0
  stats  : enq=218 dsp=218 delta=0 deq=0
  stats  : enq=255 dsp=255 delta=0 deq=0
  stats  : enq=271 dsp=271 delta=0 deq=0
  stats  : enq=284 dsp=284 delta=0 deq=0
  stats  : enq=293 dsp=293 delta=0 deq=0

  DEBUG DUMP
  ================================================================================

  kworker/u32:12[320] triggered exit kind 1026:
    runnable task stall (stress[1530] failed to run for 6.841s)

  Backtrace:
    scx_watchdog_workfn+0x136/0x1c0
    process_scheduled_works+0x2b5/0x600
    worker_thread+0x269/0x360
    kthread+0xeb/0x110
    ret_from_fork+0x36/0x40
    ret_from_fork_asm+0x1a/0x30

  QMAP FIFO[0]:
  QMAP FIFO[1]:
  QMAP FIFO[2]: 1436
  QMAP FIFO[3]:
  QMAP FIFO[4]:

  CPU states
  ----------

  CPU 0   : nr_run=1 ops_qseq=244
	    curr=swapper/0[0] class=idle_sched_class

    QMAP: dsp_idx=1 dsp_cnt=0

    R stress[1530] -6841ms
	scx_state/flags=3/0x1 ops_state/qseq=2/20
	sticky/holding_cpu=-1/-1 dsq_id=(n/a)
	cpus=ff

      QMAP: force_local=0

      asm_sysvec_apic_timer_interrupt+0x16/0x20

  CPU 2   : nr_run=2 ops_qseq=142
	    curr=swapper/2[0] class=idle_sched_class

    QMAP: dsp_idx=1 dsp_cnt=0

    R sshd[1703] -5905ms
	scx_state/flags=3/0x9 ops_state/qseq=2/88
	sticky/holding_cpu=-1/-1 dsq_id=(n/a)
	cpus=ff

      QMAP: force_local=1

      __x64_sys_ppoll+0xf6/0x120
      do_syscall_64+0x7b/0x150
      entry_SYSCALL_64_after_hwframe+0x76/0x7e

    R fish[1539] -4141ms
	scx_state/flags=3/0x9 ops_state/qseq=2/124
	sticky/holding_cpu=-1/-1 dsq_id=(n/a)
	cpus=ff

      QMAP: force_local=1

      futex_wait+0x60/0xe0
      do_futex+0x109/0x180
      __x64_sys_futex+0x117/0x190
      do_syscall_64+0x7b/0x150
      entry_SYSCALL_64_after_hwframe+0x76/0x7e

  CPU 3   : nr_run=2 ops_qseq=162
	    curr=kworker/u32:12[320] class=ext_sched_class

    QMAP: dsp_idx=1 dsp_cnt=0

   *R kworker/u32:12[320] +0ms
	scx_state/flags=3/0xd ops_state/qseq=0/0
	sticky/holding_cpu=-1/-1 dsq_id=(n/a)
	cpus=ff

      QMAP: force_local=0

      scx_dump_state+0x613/0x6f0
      scx_ops_error_irq_workfn+0x1f/0x40
      irq_work_run_list+0x82/0xd0
      irq_work_run+0x14/0x30
      __sysvec_irq_work+0x40/0x140
      sysvec_irq_work+0x60/0x70
      asm_sysvec_irq_work+0x16/0x20
      scx_watchdog_workfn+0x15f/0x1c0
      process_scheduled_works+0x2b5/0x600
      worker_thread+0x269/0x360
      kthread+0xeb/0x110
      ret_from_fork+0x36/0x40
      ret_from_fork_asm+0x1a/0x30

    R kworker/3:2[1436] +0ms
	scx_state/flags=3/0x9 ops_state/qseq=2/160
	sticky/holding_cpu=-1/-1 dsq_id=(n/a)
	cpus=08

      QMAP: force_local=0

      kthread+0xeb/0x110
      ret_from_fork+0x36/0x40
      ret_from_fork_asm+0x1a/0x30

  CPU 7   : nr_run=0 ops_qseq=76
	    curr=swapper/7[0] class=idle_sched_class


  ================================================================================

  EXIT: runnable task stall (stress[1530] failed to run for 6.841s)

It shows that CPU 3 was running the watchdog when it triggered the error
condition and the scx_qmap thread has been queued on CPU 0 for over 5
seconds but failed to run. It also prints out scx_qmap specific information
- e.g. which tasks are queued on each FIFO and so on using the dump_*() ops.
This dump has proved pretty useful for developing and debugging BPF
schedulers.

Debug dump is generated automatically when the BPF scheduler exits due to an
error. The debug buffer used in such cases is determined by
sched_ext_ops.exit_dump_len and defaults to 32k. If the debug dump overruns
the available buffer, the output is truncated and marked accordingly.

Debug dump output can also be read through the sched_ext_dump tracepoint.
When read through the tracepoint, there is no length limit.

SysRq-D can be used to trigger debug dump at any time while a BPF scheduler
is loaded. This is non-destructive - the scheduler keeps running afterwards.
The output can be read through the sched_ext_dump tracepoint.

v2: - The size of exit debug dump buffer can now be customized using
      sched_ext_ops.exit_dump_len.

    - sched_ext_ops.dump*() added to enable dumping of BPF scheduler
      specific information.

    - Tracpoint output and SysRq-D triggering added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
2024-06-18 10:09:18 -10:00

123 lines
3.3 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include "scx_qmap.bpf.skel.h"
const char help_fmt[] =
"A simple five-level FIFO queue sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n"
" [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
" -S Suppress qmap-specific debug dump\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int dummy)
{
exit_req = 1;
}
int main(int argc, char **argv)
{
struct scx_qmap *skel;
struct bpf_link *link;
int opt;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'e':
skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
break;
case 't':
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
break;
case 'T':
skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
break;
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
case 'd':
skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
if (skel->rodata->disallow_tgid < 0)
skel->rodata->disallow_tgid = getpid();
break;
case 'D':
skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
break;
case 'S':
skel->rodata->suppress_dump = true;
break;
case 'p':
skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
break;
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
while (!exit_req && !UEI_EXITED(skel, uei)) {
long nr_enqueued = skel->bss->nr_enqueued;
long nr_dispatched = skel->bss->nr_dispatched;
printf("stats : enq=%lu dsp=%lu delta=%ld deq=%"PRIu64"\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
skel->bss->nr_dequeued);
fflush(stdout);
sleep(1);
}
bpf_link__destroy(link);
UEI_REPORT(skel, uei);
scx_qmap__destroy(skel);
return 0;
}