Yang Yang a3b2aeac9d delayacct: track delays from IRQ/SOFTIRQ
Delay accounting does not track the delay caused by IRQ/SOFTIRQ, even though
IRQ/SOFTIRQ can have an obvious impact on the productivity of some workloads,
for example when they run on a system that is busy handling network
IRQ/SOFTIRQ.

Getting the delay of IRQ/SOFTIRQ could help users reduce such delay, for
example by setting interrupt affinity or task affinity, or by using a kernel
thread for NAPI.  This is inspired by "sched/psi: Add PSI_IRQ to track
IRQ/SOFTIRQ pressure"[1].  Also fix some code indentation problems in older
code.

And update tools/accounting/getdelays.c:
    / # ./getdelays -p 156 -di
    print delayacct stats ON
    printing IO accounting
    PID     156

    CPU             count     real total  virtual total    delay total  delay average
                       15       15836008       16218149      275700790         18.380ms
    IO              count    delay total  delay average
                        0              0          0.000ms
    SWAP            count    delay total  delay average
                        0              0          0.000ms
    RECLAIM         count    delay total  delay average
                        0              0          0.000ms
    THRASHING       count    delay total  delay average
                        0              0          0.000ms
    COMPACT         count    delay total  delay average
                        0              0          0.000ms
    WPCOPY          count    delay total  delay average
                       36        7586118          0.211ms
    IRQ             count    delay total  delay average
                       42         929161          0.022ms

[1] commit 52b1364ba0b1("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure")

Link: https://lkml.kernel.org/r/202304081728353557233@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Cc: Jiang Xuexin <jiang.xuexin@zte.com.cn>
Cc: wangyong <wang.yong12@zte.com.cn>
Cc: junhua huang <huang.junhua@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-18 16:39:34 -07:00

253 lines
8.1 KiB
C

/* SPDX-License-Identifier: LGPL-2.1 WITH Linux-syscall-note */
/* taskstats.h - exporting per-task statistics
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
* (C) Balbir Singh, IBM Corp. 2006
* (C) Jay Lan, SGI, 2006
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/
#ifndef _LINUX_TASKSTATS_H
#define _LINUX_TASKSTATS_H
#include <linux/types.h>
/* Format for per-task data returned to userland when
* - a task exits
* - listener requests stats for a task
*
* The struct is versioned. Newer versions should only add fields to
* the bottom of the struct to maintain backward compatibility.
*
*
* To add new fields
* a) bump up TASKSTATS_VERSION
* b) add comment indicating new version number at end of struct
* c) add new fields after version comment; maintain 64-bit alignment
*/
/* Current version of struct taskstats; bump it whenever the struct changes. */
#define TASKSTATS_VERSION 14
/* Size in bytes of the ac_comm command-name buffer in struct taskstats. */
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
 * in linux/sched.h */
/*
 * struct taskstats - per-task statistics record.
 *
 * Fields are grouped by the struct version that introduced them; the
 * "vN" comments below mark where each version's additions begin. New
 * fields are only appended at the end to maintain backward compatibility.
 *
 * NOTE(review): several members carry an explicit aligned(8) attribute,
 * presumably so the layout is identical for 32-bit and 64-bit userspace -
 * confirm before touching any of them.
 */
struct taskstats {
/* The version number of this struct. This field is always set to
 * TASKSTATS_VERSION, which is defined in <linux/taskstats.h>.
 * Each time the struct is changed, the value should be incremented.
 */
__u16 version;
__u32 ac_exitcode; /* Exit status */
/* The accounting flags of a task as defined in <linux/acct.h>
 * Defined values are AFORK, ASU, ACOMPAT, ACORE, AXSIG, and AGROUP.
 * (AGROUP since version 12).
 */
__u8 ac_flag; /* Record flags */
__u8 ac_nice; /* task_nice */
/* Delay accounting fields start
 *
 * All values, until comment "Delay accounting fields end" are
 * available only if delay accounting is enabled, even though the last
 * few fields are not delays
 *
 * xxx_count is the number of delay values recorded
 * xxx_delay_total is the corresponding cumulative delay in nanoseconds
 *
 * xxx_delay_total wraps around to zero on overflow
 * xxx_count incremented regardless of overflow
 */
/* Delay waiting for cpu, while runnable
 * count, delay_total NOT updated atomically
 */
__u64 cpu_count __attribute__((aligned(8)));
__u64 cpu_delay_total;
/* Following four fields atomically updated using task->delays->lock */
/* Delay waiting for synchronous block I/O to complete
 * does not account for delays in I/O submission
 */
__u64 blkio_count;
__u64 blkio_delay_total;
/* Delay waiting for page fault I/O (swap in only) */
__u64 swapin_count;
__u64 swapin_delay_total;
/* cpu "wall-clock" running time
 * On some architectures, value will adjust for cpu time stolen
 * from the kernel in involuntary waits due to virtualization.
 * Value is cumulative, in nanoseconds, without a corresponding count
 * and wraps around to zero silently on overflow
 */
__u64 cpu_run_real_total;
/* cpu "virtual" running time
 * Uses time intervals seen by the kernel i.e. no adjustment
 * for kernel's involuntary waits due to virtualization.
 * Value is cumulative, in nanoseconds, without a corresponding count
 * and wraps around to zero silently on overflow
 */
__u64 cpu_run_virtual_total;
/* Delay accounting fields end */
/* version 1 ends here */
/* Basic Accounting Fields start */
char ac_comm[TS_COMM_LEN]; /* Command name */
__u8 ac_sched __attribute__((aligned(8)));
/* Scheduling discipline */
__u8 ac_pad[3];
__u32 ac_uid __attribute__((aligned(8)));
/* User ID */
__u32 ac_gid; /* Group ID */
__u32 ac_pid; /* Process ID */
__u32 ac_ppid; /* Parent process ID */
/* __u32 range means times from 1970 to 2106; see ac_btime64 below for
 * the 64-bit begin time added in v10.
 */
__u32 ac_btime; /* Begin time [sec since 1970] */
__u64 ac_etime __attribute__((aligned(8)));
/* Elapsed time [usec] */
__u64 ac_utime; /* User CPU time [usec] */
__u64 ac_stime; /* System CPU time [usec] */
__u64 ac_minflt; /* Minor Page Fault Count */
__u64 ac_majflt; /* Major Page Fault Count */
/* Basic Accounting Fields end */
/* Extended accounting fields start */
/* Accumulated RSS usage in duration of a task, in MBytes-usecs.
 * The current rss usage is added to this counter every time
 * a tick is charged to a task's system time. So, at the end we
 * will have memory usage multiplied by system time. Thus an
 * average usage per system time unit can be calculated.
 */
__u64 coremem; /* accumulated RSS usage in MB-usec */
/* Accumulated virtual memory usage in duration of a task.
 * Same as coremem above (older comments call it acct_rss_mem1) except
 * that we keep track of VM usage.
 */
__u64 virtmem; /* accumulated VM usage in MB-usec */
/* High watermark of RSS and virtual memory usage in duration of
 * a task, in KBytes.
 */
__u64 hiwater_rss; /* High-watermark of RSS usage, in KB */
__u64 hiwater_vm; /* High-water VM usage, in KB */
/* The following four fields are I/O statistics of a task. */
__u64 read_char; /* bytes read */
__u64 write_char; /* bytes written */
__u64 read_syscalls; /* read syscalls */
__u64 write_syscalls; /* write syscalls */
/* Extended accounting fields end */
/* NOTE(review): presumably a feature-test macro so userspace can detect at
 * compile time that the storage I/O fields below exist - confirm.
 */
#define TASKSTATS_HAS_IO_ACCOUNTING
/* Per-task storage I/O accounting starts */
__u64 read_bytes; /* bytes of read I/O */
__u64 write_bytes; /* bytes of write I/O */
__u64 cancelled_write_bytes; /* bytes of cancelled write I/O */
/* Context switch counters */
__u64 nvcsw; /* voluntary_ctxt_switches */
__u64 nivcsw; /* nonvoluntary_ctxt_switches */
/* time accounting for SMT machines */
__u64 ac_utimescaled; /* utime scaled on frequency etc */
__u64 ac_stimescaled; /* stime scaled on frequency etc */
__u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
/* Delay waiting for memory reclaim */
__u64 freepages_count;
__u64 freepages_delay_total;
/* Delay waiting for thrashing page */
__u64 thrashing_count;
__u64 thrashing_delay_total;
/* v10: 64-bit btime to avoid overflow */
__u64 ac_btime64; /* 64-bit begin time */
/* v11: Delay waiting for memory compact */
__u64 compact_count;
__u64 compact_delay_total;
/* v12 begin */
__u32 ac_tgid; /* thread group ID */
/* Thread group walltime up to now. This is total process walltime if
 * AGROUP flag is set.
 */
__u64 ac_tgetime __attribute__((aligned(8)));
/* Lightweight information to identify process binary files.
 * This leaves userspace to match this to a file system path, using
 * MAJOR() and MINOR() macros to identify a device and mount point,
 * the inode to identify the executable file. This is /proc/self/exe
 * at the end, so matching the most recent exec(). Values are zero
 * for kernel threads.
 */
__u64 ac_exe_dev; /* program binary device ID */
__u64 ac_exe_inode; /* program binary inode number */
/* v12 end */
/* v13: Delay waiting for write-protect copy */
__u64 wpcopy_count;
__u64 wpcopy_delay_total;
/* v14: Delay waiting for IRQ/SOFTIRQ */
__u64 irq_count;
__u64 irq_delay_total;
};
/*
 * Taskstats command identifiers (commands sent from userspace).
 *
 * The list is not versioned. A new command goes at the end of the enum,
 * immediately before __TASKSTATS_CMD_MAX. Values are spelled out here;
 * they equal the implicit enumeration order.
 */
enum {
	TASKSTATS_CMD_UNSPEC = 0,	/* Reserved */
	TASKSTATS_CMD_GET    = 1,	/* user->kernel request/get-response */
	TASKSTATS_CMD_NEW    = 2,	/* kernel->user event */
	__TASKSTATS_CMD_MAX,		/* sentinel: one past the last command */
};

/* Largest command value currently defined. */
#define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1)
/*
 * Taskstats attribute type identifiers.
 *
 * Values are spelled out here; they equal the implicit enumeration
 * order. __TASKSTATS_TYPE_MAX is a sentinel and stays last.
 */
enum {
	TASKSTATS_TYPE_UNSPEC    = 0,	/* Reserved */
	TASKSTATS_TYPE_PID       = 1,	/* Process id */
	TASKSTATS_TYPE_TGID      = 2,	/* Thread group id */
	TASKSTATS_TYPE_STATS     = 3,	/* taskstats structure */
	TASKSTATS_TYPE_AGGR_PID  = 4,	/* contains pid + stats */
	TASKSTATS_TYPE_AGGR_TGID = 5,	/* contains tgid + stats */
	TASKSTATS_TYPE_NULL      = 6,	/* contains nothing */
	__TASKSTATS_TYPE_MAX,		/* sentinel: one past the last type */
};

/* Largest type value currently defined. */
#define TASKSTATS_TYPE_MAX (__TASKSTATS_TYPE_MAX - 1)
/*
 * Taskstats command attribute identifiers.
 *
 * Values are spelled out here; they equal the implicit enumeration
 * order. __TASKSTATS_CMD_ATTR_MAX is a sentinel and stays last.
 */
enum {
	TASKSTATS_CMD_ATTR_UNSPEC             = 0,
	TASKSTATS_CMD_ATTR_PID                = 1,
	TASKSTATS_CMD_ATTR_TGID               = 2,
	TASKSTATS_CMD_ATTR_REGISTER_CPUMASK   = 3,
	TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK = 4,
	__TASKSTATS_CMD_ATTR_MAX,	/* sentinel: one past the last attribute */
};

/* Largest command attribute value currently defined. */
#define TASKSTATS_CMD_ATTR_MAX (__TASKSTATS_CMD_ATTR_MAX - 1)
/* NETLINK_GENERIC related info */
/* NOTE(review): presumably the generic netlink family name that userspace
 * resolves to reach taskstats - confirm against the family registration.
 */
#define TASKSTATS_GENL_NAME "TASKSTATS"
/* NOTE(review): presumably the generic netlink family version; distinct
 * from TASKSTATS_VERSION, which versions struct taskstats - confirm.
 */
#define TASKSTATS_GENL_VERSION 0x1
#endif /* _LINUX_TASKSTATS_H */