2019-05-19 13:08:55 +01:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2017-09-25 08:12:05 -07:00
|
|
|
#include "cgroup-internal.h"
|
|
|
|
|
|
|
|
#include <linux/sched/cputime.h>
|
|
|
|
|
2022-08-24 16:31:15 -07:00
|
|
|
#include <linux/bpf.h>
|
|
|
|
#include <linux/btf.h>
|
|
|
|
#include <linux/btf_ids.h>
|
|
|
|
|
2024-04-16 19:51:26 +02:00
|
|
|
#include <trace/events/cgroup.h>
|
|
|
|
|
2018-04-26 14:29:05 -07:00
|
|
|
/*
 * cgroup_rstat_lock is the global lock held across a flush; see
 * __cgroup_rstat_lock() and cgroup_rstat_flush_locked().
 *
 * cgroup_rstat_cpu_lock is a per-cpu raw spinlock.  It is held while the
 * per-cpu updated tree (->updated_next / ->updated_children) is modified
 * in cgroup_rstat_updated() and cgroup_rstat_updated_list().
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

/* Defined later in this file; called from cgroup_rstat_flush_locked(). */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
|
|
|
|
|
2018-04-26 14:29:04 -07:00
|
|
|
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
2018-04-26 14:29:04 -07:00
|
|
|
return per_cpu_ptr(cgrp->rstat_cpu, cpu);
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
2024-05-01 16:04:11 +02:00
|
|
|
/*
|
|
|
|
* Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
|
|
|
|
*
|
|
|
|
* This makes it easier to diagnose locking issues and contention in
|
|
|
|
* production environments. The parameter @fast_path determine the
|
|
|
|
* tracepoints being added, allowing us to diagnose "flush" related
|
|
|
|
* operations without handling high-frequency fast-path "update" events.
|
|
|
|
*/
|
|
|
|
static __always_inline
|
|
|
|
unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
|
|
|
|
struct cgroup *cgrp, const bool fast_path)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
bool contended;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The _irqsave() is needed because cgroup_rstat_lock is
|
|
|
|
* spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
|
|
|
|
* this lock with the _irq() suffix only disables interrupts on
|
|
|
|
* a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
|
|
|
|
* interrupts on both configurations. The _irqsave() ensures
|
|
|
|
* that interrupts are always disabled and later restored.
|
|
|
|
*/
|
|
|
|
contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
|
|
|
|
if (contended) {
|
|
|
|
if (fast_path)
|
|
|
|
trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
|
|
|
|
else
|
|
|
|
trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
|
|
|
|
|
|
|
|
raw_spin_lock_irqsave(cpu_lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fast_path)
|
|
|
|
trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
|
|
|
|
else
|
|
|
|
trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
|
|
|
|
|
|
|
|
return flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline
|
|
|
|
void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
|
|
|
|
struct cgroup *cgrp, unsigned long flags,
|
|
|
|
const bool fast_path)
|
|
|
|
{
|
|
|
|
if (fast_path)
|
|
|
|
trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
|
|
|
|
else
|
|
|
|
trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
|
|
|
|
|
|
|
|
raw_spin_unlock_irqrestore(cpu_lock, flags);
|
|
|
|
}
|
|
|
|
|
2017-09-25 08:12:05 -07:00
|
|
|
/**
|
2018-04-26 14:29:05 -07:00
|
|
|
* cgroup_rstat_updated - keep track of updated rstat_cpu
|
2017-09-25 08:12:05 -07:00
|
|
|
* @cgrp: target cgroup
|
2018-04-26 14:29:04 -07:00
|
|
|
* @cpu: cpu on which rstat_cpu was updated
|
2017-09-25 08:12:05 -07:00
|
|
|
*
|
2018-04-26 14:29:04 -07:00
|
|
|
* @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
|
|
|
|
* rstat_cpu->updated_children list. See the comment on top of
|
|
|
|
* cgroup_rstat_cpu definition for details.
|
2017-09-25 08:12:05 -07:00
|
|
|
*/
|
2023-02-01 11:30:15 -06:00
|
|
|
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
2018-04-26 14:29:04 -07:00
|
|
|
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
|
2017-09-25 08:12:05 -07:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
/*
|
2020-04-09 14:55:35 -04:00
|
|
|
* Speculative already-on-list test. This may race leading to
|
|
|
|
* temporary inaccuracies, which is fine.
|
|
|
|
*
|
2017-09-25 08:12:05 -07:00
|
|
|
* Because @parent's updated_children is terminated with @parent
|
|
|
|
* instead of NULL, we can tell whether @cgrp is on the list by
|
|
|
|
* testing the next pointer for NULL.
|
|
|
|
*/
|
2021-11-03 17:58:45 +01:00
|
|
|
if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
|
2017-09-25 08:12:05 -07:00
|
|
|
return;
|
|
|
|
|
2024-05-01 16:04:11 +02:00
|
|
|
flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
|
2017-09-25 08:12:05 -07:00
|
|
|
|
|
|
|
/* put @cgrp and all ancestors on the corresponding updated lists */
|
cgroup: rstat: punt root-level optimization to individual controllers
Current users of the rstat code can source root-level statistics from
the native counters of their respective subsystem, allowing them to
forego aggregation at the root level. This optimization is currently
implemented inside the generic rstat code, which doesn't track the root
cgroup and doesn't invoke the subsystem flush callbacks on it.
However, the memory controller cannot do this optimization, because
cgroup1 breaks out memory specifically for the local level, including at
the root level. In preparation for the memory controller switching to
rstat, move the optimization from rstat core to the controllers.
Afterwards, rstat will always track the root cgroup for changes and
invoke the subsystem callbacks on it; and it's up to the subsystem to
special-case and skip aggregation of the root cgroup if it can source
this information through other, cheaper means.
This is the case for the io controller and the cgroup base stats. In
their respective flush callbacks, check whether the parent is the root
cgroup, and if so, skip the unnecessary upward propagation.
The extra cost of tracking the root cgroup is negligible: on stat
changes, we actually remove a branch that checks for the root. The
queueing for a flush touches only per-cpu data, and only the first stat
change since a flush requires a (per-cpu) lock.
Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-29 22:56:23 -07:00
|
|
|
while (true) {
|
2018-04-26 14:29:04 -07:00
|
|
|
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
|
cgroup: rstat: punt root-level optimization to individual controllers
Current users of the rstat code can source root-level statistics from
the native counters of their respective subsystem, allowing them to
forego aggregation at the root level. This optimization is currently
implemented inside the generic rstat code, which doesn't track the root
cgroup and doesn't invoke the subsystem flush callbacks on it.
However, the memory controller cannot do this optimization, because
cgroup1 breaks out memory specifically for the local level, including at
the root level. In preparation for the memory controller switching to
rstat, move the optimization from rstat core to the controllers.
Afterwards, rstat will always track the root cgroup for changes and
invoke the subsystem callbacks on it; and it's up to the subsystem to
special-case and skip aggregation of the root cgroup if it can source
this information through other, cheaper means.
This is the case for the io controller and the cgroup base stats. In
their respective flush callbacks, check whether the parent is the root
cgroup, and if so, skip the unnecessary upward propagation.
The extra cost of tracking the root cgroup is negligible: on stat
changes, we actually remove a branch that checks for the root. The
queueing for a flush touches only per-cpu data, and only the first stat
change since a flush requires a (per-cpu) lock.
Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-29 22:56:23 -07:00
|
|
|
struct cgroup *parent = cgroup_parent(cgrp);
|
|
|
|
struct cgroup_rstat_cpu *prstatc;
|
2017-09-25 08:12:05 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Both additions and removals are bottom-up. If a cgroup
|
|
|
|
* is already in the tree, all ancestors are.
|
|
|
|
*/
|
2018-04-26 14:29:04 -07:00
|
|
|
if (rstatc->updated_next)
|
2017-09-25 08:12:05 -07:00
|
|
|
break;
|
|
|
|
|
cgroup: rstat: punt root-level optimization to individual controllers
Current users of the rstat code can source root-level statistics from
the native counters of their respective subsystem, allowing them to
forego aggregation at the root level. This optimization is currently
implemented inside the generic rstat code, which doesn't track the root
cgroup and doesn't invoke the subsystem flush callbacks on it.
However, the memory controller cannot do this optimization, because
cgroup1 breaks out memory specifically for the local level, including at
the root level. In preparation for the memory controller switching to
rstat, move the optimization from rstat core to the controllers.
Afterwards, rstat will always track the root cgroup for changes and
invoke the subsystem callbacks on it; and it's up to the subsystem to
special-case and skip aggregation of the root cgroup if it can source
this information through other, cheaper means.
This is the case for the io controller and the cgroup base stats. In
their respective flush callbacks, check whether the parent is the root
cgroup, and if so, skip the unnecessary upward propagation.
The extra cost of tracking the root cgroup is negligible: on stat
changes, we actually remove a branch that checks for the root. The
queueing for a flush touches only per-cpu data, and only the first stat
change since a flush requires a (per-cpu) lock.
Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-29 22:56:23 -07:00
|
|
|
/* Root has no parent to link it to, but mark it busy */
|
|
|
|
if (!parent) {
|
|
|
|
rstatc->updated_next = cgrp;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
prstatc = cgroup_rstat_cpu(parent, cpu);
|
2018-04-26 14:29:04 -07:00
|
|
|
rstatc->updated_next = prstatc->updated_children;
|
|
|
|
prstatc->updated_children = cgrp;
|
cgroup: rstat: punt root-level optimization to individual controllers
Current users of the rstat code can source root-level statistics from
the native counters of their respective subsystem, allowing them to
forego aggregation at the root level. This optimization is currently
implemented inside the generic rstat code, which doesn't track the root
cgroup and doesn't invoke the subsystem flush callbacks on it.
However, the memory controller cannot do this optimization, because
cgroup1 breaks out memory specifically for the local level, including at
the root level. In preparation for the memory controller switching to
rstat, move the optimization from rstat core to the controllers.
Afterwards, rstat will always track the root cgroup for changes and
invoke the subsystem callbacks on it; and it's up to the subsystem to
special-case and skip aggregation of the root cgroup if it can source
this information through other, cheaper means.
This is the case for the io controller and the cgroup base stats. In
their respective flush callbacks, check whether the parent is the root
cgroup, and if so, skip the unnecessary upward propagation.
The extra cost of tracking the root cgroup is negligible: on stat
changes, we actually remove a branch that checks for the root. The
queueing for a flush touches only per-cpu data, and only the first stat
change since a flush requires a (per-cpu) lock.
Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-29 22:56:23 -07:00
|
|
|
|
|
|
|
cgrp = parent;
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
2024-05-01 16:04:11 +02:00
|
|
|
_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2023-11-30 15:43:26 -05:00
|
|
|
* cgroup_rstat_push_children - push children cgroups into the given list
|
|
|
|
* @head: current head of the list (= subtree root)
|
|
|
|
* @child: first child of the root
|
2017-09-25 08:12:05 -07:00
|
|
|
* @cpu: target cpu
|
2023-11-30 15:43:26 -05:00
|
|
|
* Return: A new singly linked list of cgroups to be flush
|
2017-09-25 08:12:05 -07:00
|
|
|
*
|
2023-11-30 15:43:26 -05:00
|
|
|
* Iteratively traverse down the cgroup_rstat_cpu updated tree level by
|
|
|
|
* level and push all the parents first before their next level children
|
|
|
|
* into a singly linked list built from the tail backward like "pushing"
|
|
|
|
* cgroups into a stack. The root is pushed by the caller.
|
|
|
|
*/
|
|
|
|
static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
|
|
|
|
struct cgroup *child, int cpu)
|
|
|
|
{
|
|
|
|
struct cgroup *chead = child; /* Head of child cgroup level */
|
|
|
|
struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
|
|
|
|
struct cgroup *parent, *grandchild;
|
|
|
|
struct cgroup_rstat_cpu *crstatc;
|
|
|
|
|
|
|
|
child->rstat_flush_next = NULL;
|
|
|
|
|
|
|
|
next_level:
|
|
|
|
while (chead) {
|
|
|
|
child = chead;
|
|
|
|
chead = child->rstat_flush_next;
|
|
|
|
parent = cgroup_parent(child);
|
|
|
|
|
|
|
|
/* updated_next is parent cgroup terminated */
|
|
|
|
while (child != parent) {
|
|
|
|
child->rstat_flush_next = head;
|
|
|
|
head = child;
|
|
|
|
crstatc = cgroup_rstat_cpu(child, cpu);
|
|
|
|
grandchild = crstatc->updated_children;
|
|
|
|
if (grandchild != child) {
|
|
|
|
/* Push the grand child to the next level */
|
|
|
|
crstatc->updated_children = child;
|
|
|
|
grandchild->rstat_flush_next = ghead;
|
|
|
|
ghead = grandchild;
|
|
|
|
}
|
|
|
|
child = crstatc->updated_next;
|
|
|
|
crstatc->updated_next = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ghead) {
|
|
|
|
chead = ghead;
|
|
|
|
ghead = NULL;
|
|
|
|
goto next_level;
|
|
|
|
}
|
|
|
|
return head;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
|
|
|
|
* @root: root of the cgroup subtree to traverse
|
|
|
|
* @cpu: target cpu
|
|
|
|
* Return: A singly linked list of cgroups to be flushed
|
|
|
|
*
|
|
|
|
* Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
|
|
|
|
* each returned cgroup is unlinked from the updated tree.
|
2017-09-25 08:12:05 -07:00
|
|
|
*
|
|
|
|
* The only ordering guarantee is that, for a parent and a child pair
|
2023-11-30 15:43:26 -05:00
|
|
|
* covered by a given traversal, the child is before its parent in
|
|
|
|
* the list.
|
|
|
|
*
|
|
|
|
* Note that updated_children is self terminated and points to a list of
|
|
|
|
* child cgroups if not empty. Whereas updated_next is like a sibling link
|
|
|
|
* within the children list and terminated by the parent cgroup. An exception
|
|
|
|
* here is the cgroup root whose updated_next can be self terminated.
|
2017-09-25 08:12:05 -07:00
|
|
|
*/
|
2023-11-30 15:43:26 -05:00
|
|
|
static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
2023-11-30 15:43:26 -05:00
|
|
|
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
|
|
|
|
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
|
|
|
|
struct cgroup *head = NULL, *parent, *child;
|
|
|
|
unsigned long flags;
|
2017-09-25 08:12:05 -07:00
|
|
|
|
2024-05-01 16:04:11 +02:00
|
|
|
flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
|
2017-09-25 08:12:05 -07:00
|
|
|
|
2023-11-30 15:43:26 -05:00
|
|
|
/* Return NULL if this subtree is not on-list */
|
|
|
|
if (!rstatc->updated_next)
|
|
|
|
goto unlock_ret;
|
2017-09-25 08:12:05 -07:00
|
|
|
|
|
|
|
/*
|
2023-11-30 15:43:26 -05:00
|
|
|
* Unlink @root from its parent. As the updated_children list is
|
2017-09-25 08:12:05 -07:00
|
|
|
* singly linked, we have to walk it to find the removal point.
|
|
|
|
*/
|
2023-11-30 15:43:26 -05:00
|
|
|
parent = cgroup_parent(root);
|
2021-12-25 00:09:32 +00:00
|
|
|
if (parent) {
|
|
|
|
struct cgroup_rstat_cpu *prstatc;
|
|
|
|
struct cgroup **nextp;
|
2017-09-25 08:12:05 -07:00
|
|
|
|
2021-12-25 00:09:32 +00:00
|
|
|
prstatc = cgroup_rstat_cpu(parent, cpu);
|
|
|
|
nextp = &prstatc->updated_children;
|
2023-11-30 15:43:26 -05:00
|
|
|
while (*nextp != root) {
|
2021-12-25 00:09:32 +00:00
|
|
|
struct cgroup_rstat_cpu *nrstatc;
|
|
|
|
|
|
|
|
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
|
|
|
|
WARN_ON_ONCE(*nextp == parent);
|
|
|
|
nextp = &nrstatc->updated_next;
|
|
|
|
}
|
|
|
|
*nextp = rstatc->updated_next;
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
2021-12-25 00:09:32 +00:00
|
|
|
rstatc->updated_next = NULL;
|
cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked()
When cgroup_rstat_updated() isn't being called concurrently with
cgroup_rstat_flush_locked(), its run time is pretty short. When
both are called concurrently, the cgroup_rstat_updated() run time
can spike to a pretty high value due to high cpu_lock hold time in
cgroup_rstat_flush_locked(). This can be problematic if the task calling
cgroup_rstat_updated() is a realtime task running on an isolated CPU
with a strict latency requirement. The cgroup_rstat_updated() call can
happen when there is a page fault even though the task is running in
user space most of the time.
The percpu cpu_lock is used to protect the update tree -
updated_next and updated_children. This protection is only needed when
cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing
operation which can take a much longer time does not need that protection
as it is already protected by cgroup_rstat_lock.
To reduce the cpu_lock hold time, we need to perform all the
cgroup_rstat_cpu_pop_updated() calls up front with the lock
released afterward before doing any flushing. This patch adds a new
cgroup_rstat_updated_list() function to return a singly linked list of
cgroups to be flushed.
Some instrumentation code are added to measure the cpu_lock hold time
right after lock acquisition to after releasing the lock. Parallel
kernel build on a 2-socket x86-64 server is used as the benchmarking
tool for measuring the lock hold time.
The maximum cpu_lock hold time before and after the patch are 100us and
29us respectively. So the worst case time is reduced to about 30% of
the original. However, there may be some OS or hardware noises like NMI
or SMI in the test system that can worsen the worst case value. Those
noises are usually tuned out in a real production environment to get
a better result.
OTOH, the lock hold time frequency distribution should give a better
idea of the performance benefit of the patch. Below were the frequency
distribution before and after the patch:
Hold time Before patch After patch
--------- ------------ -----------
0-01 us 804,139 13,738,708
01-05 us 9,772,767 1,177,194
05-10 us 4,595,028 4,984
10-15 us 303,481 3,562
15-20 us 78,971 1,314
20-25 us 24,583 18
25-30 us 6,908 12
30-40 us 8,015
40-50 us 2,192
50-60 us 316
60-70 us 43
70-80 us 7
80-90 us 2
>90 us 3
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2023-11-03 23:13:01 -04:00
|
|
|
|
2023-11-30 15:43:26 -05:00
|
|
|
/* Push @root to the list first before pushing the children */
|
|
|
|
head = root;
|
|
|
|
root->rstat_flush_next = NULL;
|
|
|
|
child = rstatc->updated_children;
|
|
|
|
rstatc->updated_children = root;
|
|
|
|
if (child != root)
|
|
|
|
head = cgroup_rstat_push_children(head, child, cpu);
|
|
|
|
unlock_ret:
|
2024-05-01 16:04:11 +02:00
|
|
|
_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
|
cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked()
When cgroup_rstat_updated() isn't being called concurrently with
cgroup_rstat_flush_locked(), its run time is pretty short. When
both are called concurrently, the cgroup_rstat_updated() run time
can spike to a pretty high value due to high cpu_lock hold time in
cgroup_rstat_flush_locked(). This can be problematic if the task calling
cgroup_rstat_updated() is a realtime task running on an isolated CPU
with a strict latency requirement. The cgroup_rstat_updated() call can
happen when there is a page fault even though the task is running in
user space most of the time.
The percpu cpu_lock is used to protect the update tree -
updated_next and updated_children. This protection is only needed when
cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing
operation which can take a much longer time does not need that protection
as it is already protected by cgroup_rstat_lock.
To reduce the cpu_lock hold time, we need to perform all the
cgroup_rstat_cpu_pop_updated() calls up front with the lock
released afterward before doing any flushing. This patch adds a new
cgroup_rstat_updated_list() function to return a singly linked list of
cgroups to be flushed.
Some instrumentation code are added to measure the cpu_lock hold time
right after lock acquisition to after releasing the lock. Parallel
kernel build on a 2-socket x86-64 server is used as the benchmarking
tool for measuring the lock hold time.
The maximum cpu_lock hold time before and after the patch are 100us and
29us respectively. So the worst case time is reduced to about 30% of
the original. However, there may be some OS or hardware noises like NMI
or SMI in the test system that can worsen the worst case value. Those
noises are usually tuned out in a real production environment to get
a better result.
OTOH, the lock hold time frequency distribution should give a better
idea of the performance benefit of the patch. Below were the frequency
distribution before and after the patch:
Hold time Before patch After patch
--------- ------------ -----------
0-01 us 804,139 13,738,708
01-05 us 9,772,767 1,177,194
05-10 us 4,595,028 4,984
10-15 us 303,481 3,562
15-20 us 78,971 1,314
20-25 us 24,583 18
25-30 us 6,908 12
30-40 us 8,015
40-50 us 2,192
50-60 us 316
60-70 us 43
70-80 us 7
80-90 us 2
>90 us 3
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2023-11-03 23:13:01 -04:00
|
|
|
return head;
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
2022-08-24 16:31:15 -07:00
|
|
|
/*
|
|
|
|
* A hook for bpf stat collectors to attach to and flush their stats.
|
|
|
|
* Together with providing bpf kfuncs for cgroup_rstat_updated() and
|
|
|
|
* cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
|
|
|
|
* collect cgroup stats can integrate with rstat for efficient flushing.
|
|
|
|
*
|
|
|
|
* A static noinline declaration here could cause the compiler to optimize away
|
|
|
|
* the function. A global noinline declaration will keep the definition, but may
|
|
|
|
* optimize away the callsite. Therefore, __weak is needed to ensure that the
|
|
|
|
* call is still emitted, by telling the compiler that we don't know what the
|
|
|
|
* function might eventually be.
|
|
|
|
*/
|
2023-10-31 14:56:25 -07:00
|
|
|
|
|
|
|
__bpf_hook_start();
|
2022-08-24 16:31:15 -07:00
|
|
|
|
|
|
|
__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
|
|
|
|
struct cgroup *parent, int cpu)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2023-10-31 14:56:25 -07:00
|
|
|
__bpf_hook_end();
|
2022-08-24 16:31:15 -07:00
|
|
|
|
2024-04-16 19:51:26 +02:00
|
|
|
/*
|
|
|
|
* Helper functions for locking cgroup_rstat_lock.
|
|
|
|
*
|
|
|
|
* This makes it easier to diagnose locking issues and contention in
|
|
|
|
* production environments. The parameter @cpu_in_loop indicate lock
|
|
|
|
* was released and re-taken when collection data from the CPUs. The
|
|
|
|
* value -1 is used when obtaining the main lock else this is the CPU
|
|
|
|
* number processed last.
|
|
|
|
*/
|
|
|
|
static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
|
|
|
|
__acquires(&cgroup_rstat_lock)
|
|
|
|
{
|
|
|
|
bool contended;
|
|
|
|
|
|
|
|
contended = !spin_trylock_irq(&cgroup_rstat_lock);
|
|
|
|
if (contended) {
|
|
|
|
trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
|
|
|
|
spin_lock_irq(&cgroup_rstat_lock);
|
|
|
|
}
|
|
|
|
trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
|
|
|
|
__releases(&cgroup_rstat_lock)
|
|
|
|
{
|
|
|
|
trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
|
|
|
|
spin_unlock_irq(&cgroup_rstat_lock);
|
|
|
|
}
|
|
|
|
|
2018-04-26 14:29:05 -07:00
|
|
|
/* see cgroup_rstat_flush() */
|
2023-04-21 17:40:20 +00:00
|
|
|
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
|
2018-04-26 14:29:05 -07:00
|
|
|
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
|
2018-04-26 14:29:05 -07:00
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
2018-04-26 14:29:05 -07:00
|
|
|
lockdep_assert_held(&cgroup_rstat_lock);
|
2018-04-26 14:29:05 -07:00
|
|
|
|
|
|
|
for_each_possible_cpu(cpu) {
|
cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked()
When cgroup_rstat_updated() isn't being called concurrently with
cgroup_rstat_flush_locked(), its run time is pretty short. When
both are called concurrently, the cgroup_rstat_updated() run time
can spike to a pretty high value due to high cpu_lock hold time in
cgroup_rstat_flush_locked(). This can be problematic if the task calling
cgroup_rstat_updated() is a realtime task running on an isolated CPU
with a strict latency requirement. The cgroup_rstat_updated() call can
happen when there is a page fault even though the task is running in
user space most of the time.
The percpu cpu_lock is used to protect the update tree -
updated_next and updated_children. This protection is only needed when
cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing
operation which can take a much longer time does not need that protection
as it is already protected by cgroup_rstat_lock.
To reduce the cpu_lock hold time, we need to perform all the
cgroup_rstat_cpu_pop_updated() calls up front with the lock
released afterward before doing any flushing. This patch adds a new
cgroup_rstat_updated_list() function to return a singly linked list of
cgroups to be flushed.
Some instrumentation code are added to measure the cpu_lock hold time
right after lock acquisition to after releasing the lock. Parallel
kernel build on a 2-socket x86-64 server is used as the benchmarking
tool for measuring the lock hold time.
The maximum cpu_lock hold time before and after the patch are 100us and
29us respectively. So the worst case time is reduced to about 30% of
the original. However, there may be some OS or hardware noises like NMI
or SMI in the test system that can worsen the worst case value. Those
noises are usually tuned out in a real production environment to get
a better result.
OTOH, the lock hold time frequency distribution should give a better
idea of the performance benefit of the patch. Below were the frequency
distribution before and after the patch:
Hold time Before patch After patch
--------- ------------ -----------
0-01 us 804,139 13,738,708
01-05 us 9,772,767 1,177,194
05-10 us 4,595,028 4,984
10-15 us 303,481 3,562
15-20 us 78,971 1,314
20-25 us 24,583 18
25-30 us 6,908 12
30-40 us 8,015
40-50 us 2,192
50-60 us 316
60-70 us 43
70-80 us 7
80-90 us 2
>90 us 3
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2023-11-03 23:13:01 -04:00
|
|
|
struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
|
2018-04-26 14:29:05 -07:00
|
|
|
|
cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked()
When cgroup_rstat_updated() isn't being called concurrently with
cgroup_rstat_flush_locked(), its run time is pretty short. When
both are called concurrently, the cgroup_rstat_updated() run time
can spike to a pretty high value due to high cpu_lock hold time in
cgroup_rstat_flush_locked(). This can be problematic if the task calling
cgroup_rstat_updated() is a realtime task running on an isolated CPU
with a strict latency requirement. The cgroup_rstat_updated() call can
happen when there is a page fault even though the task is running in
user space most of the time.
The percpu cpu_lock is used to protect the update tree -
updated_next and updated_children. This protection is only needed when
cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing
operation which can take a much longer time does not need that protection
as it is already protected by cgroup_rstat_lock.
To reduce the cpu_lock hold time, we need to perform all the
cgroup_rstat_cpu_pop_updated() calls up front with the lock
released afterward before doing any flushing. This patch adds a new
cgroup_rstat_updated_list() function to return a singly linked list of
cgroups to be flushed.
Some instrumentation code are added to measure the cpu_lock hold time
right after lock acquisition to after releasing the lock. Parallel
kernel build on a 2-socket x86-64 server is used as the benchmarking
tool for measuring the lock hold time.
The maximum cpu_lock hold time before and after the patch are 100us and
29us respectively. So the worst case time is reduced to about 30% of
the original. However, there may be some OS or hardware noises like NMI
or SMI in the test system that can worsen the worst case value. Those
noises are usually tuned out in a real production environment to get
a better result.
OTOH, the lock hold time frequency distribution should give a better
idea of the performance benefit of the patch. Below were the frequency
distribution before and after the patch:
Hold time Before patch After patch
--------- ------------ -----------
0-01 us 804,139 13,738,708
01-05 us 9,772,767 1,177,194
05-10 us 4,595,028 4,984
10-15 us 303,481 3,562
15-20 us 78,971 1,314
20-25 us 24,583 18
25-30 us 6,908 12
30-40 us 8,015
40-50 us 2,192
50-60 us 316
60-70 us 43
70-80 us 7
80-90 us 2
>90 us 3
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2023-11-03 23:13:01 -04:00
|
|
|
for (; pos; pos = pos->rstat_flush_next) {
|
2018-04-26 14:29:05 -07:00
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
|
2018-04-26 14:29:05 -07:00
|
|
|
cgroup_base_stat_flush(pos, cpu);
|
2022-08-24 16:31:15 -07:00
|
|
|
bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
|
2018-04-26 14:29:05 -07:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
list_for_each_entry_rcu(css, &pos->rstat_css_list,
|
|
|
|
rstat_css_node)
|
|
|
|
css->ss->css_rstat_flush(css, cpu);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2018-04-26 14:29:05 -07:00
|
|
|
|
2023-04-21 17:40:20 +00:00
|
|
|
/* play nice and yield if necessary */
|
|
|
|
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
|
2024-04-16 19:51:26 +02:00
|
|
|
__cgroup_rstat_unlock(cgrp, cpu);
|
2018-04-26 14:29:05 -07:00
|
|
|
if (!cond_resched())
|
|
|
|
cpu_relax();
|
2024-04-16 19:51:26 +02:00
|
|
|
__cgroup_rstat_lock(cgrp, cpu);
|
2018-04-26 14:29:05 -07:00
|
|
|
}
|
2018-04-26 14:29:05 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* cgroup_rstat_flush - flush stats in @cgrp's subtree
|
|
|
|
* @cgrp: target cgroup
|
|
|
|
*
|
|
|
|
* Collect all per-cpu stats in @cgrp's subtree into the global counters
|
|
|
|
* and propagate them upwards. After this function returns, all cgroups in
|
|
|
|
* the subtree have up-to-date ->stat.
|
|
|
|
*
|
|
|
|
* This also gets all cgroups in the subtree including @cgrp off the
|
|
|
|
* ->updated_children lists.
|
2018-04-26 14:29:05 -07:00
|
|
|
*
|
|
|
|
* This function may block.
|
2018-04-26 14:29:05 -07:00
|
|
|
*/
|
2023-02-01 11:30:15 -06:00
|
|
|
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
|
2018-04-26 14:29:05 -07:00
|
|
|
{
|
2018-04-26 14:29:05 -07:00
|
|
|
might_sleep();
|
|
|
|
|
2024-04-16 19:51:26 +02:00
|
|
|
__cgroup_rstat_lock(cgrp, -1);
|
2023-04-21 17:40:20 +00:00
|
|
|
cgroup_rstat_flush_locked(cgrp);
|
2024-04-16 19:51:26 +02:00
|
|
|
__cgroup_rstat_unlock(cgrp, -1);
|
2018-04-26 14:29:05 -07:00
|
|
|
}
|
|
|
|
|
2018-04-26 14:29:05 -07:00
|
|
|
/**
|
2021-05-26 10:49:09 +08:00
|
|
|
* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
|
2018-04-26 14:29:05 -07:00
|
|
|
* @cgrp: target cgroup
|
|
|
|
*
|
|
|
|
* Flush stats in @cgrp's subtree and prevent further flushes. Must be
|
|
|
|
* paired with cgroup_rstat_flush_release().
|
2018-04-26 14:29:05 -07:00
|
|
|
*
|
|
|
|
* This function may block.
|
2018-04-26 14:29:05 -07:00
|
|
|
*/
|
|
|
|
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
|
2018-04-26 14:29:05 -07:00
|
|
|
__acquires(&cgroup_rstat_lock)
|
2018-04-26 14:29:05 -07:00
|
|
|
{
|
2018-04-26 14:29:05 -07:00
|
|
|
might_sleep();
|
2024-04-16 19:51:26 +02:00
|
|
|
__cgroup_rstat_lock(cgrp, -1);
|
2023-04-21 17:40:20 +00:00
|
|
|
cgroup_rstat_flush_locked(cgrp);
|
2018-04-26 14:29:05 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
|
2024-04-17 12:59:46 +02:00
|
|
|
* @cgrp: cgroup used by tracepoint
|
2018-04-26 14:29:05 -07:00
|
|
|
*/
|
2024-04-16 19:51:26 +02:00
|
|
|
void cgroup_rstat_flush_release(struct cgroup *cgrp)
|
2018-04-26 14:29:05 -07:00
|
|
|
__releases(&cgroup_rstat_lock)
|
2018-04-26 14:29:05 -07:00
|
|
|
{
|
2024-04-16 19:51:26 +02:00
|
|
|
__cgroup_rstat_unlock(cgrp, -1);
|
2018-04-26 14:29:05 -07:00
|
|
|
}
|
|
|
|
|
2018-04-26 14:29:05 -07:00
|
|
|
int cgroup_rstat_init(struct cgroup *cgrp)
|
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
/* the root cgrp has rstat_cpu preallocated */
|
|
|
|
if (!cgrp->rstat_cpu) {
|
|
|
|
cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
|
|
|
|
if (!cgrp->rstat_cpu)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ->updated_children list is self terminated */
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
|
|
|
|
|
|
|
|
rstatc->updated_children = cgrp;
|
|
|
|
u64_stats_init(&rstatc->bsync);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void cgroup_rstat_exit(struct cgroup *cgrp)
|
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
cgroup_rstat_flush(cgrp);
|
|
|
|
|
|
|
|
/* sanity check */
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
|
|
|
|
WARN_ON_ONCE(rstatc->updated_next))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_percpu(cgrp->rstat_cpu);
|
|
|
|
cgrp->rstat_cpu = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Boot-time setup: initialize the per-cpu cgroup_rstat_cpu_lock of every
 * possible CPU.
 */
void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
	}
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Functions for cgroup basic resource statistics implemented on top of
|
|
|
|
* rstat.
|
|
|
|
*/
|
2019-11-06 12:49:57 -08:00
|
|
|
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
|
|
|
|
struct cgroup_base_stat *src_bstat)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
2018-04-26 14:29:04 -07:00
|
|
|
dst_bstat->cputime.utime += src_bstat->cputime.utime;
|
|
|
|
dst_bstat->cputime.stime += src_bstat->cputime.stime;
|
|
|
|
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
|
2022-06-29 14:14:26 -07:00
|
|
|
#ifdef CONFIG_SCHED_CORE
|
|
|
|
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
|
|
|
|
#endif
|
2024-10-02 11:47:16 -07:00
|
|
|
dst_bstat->ntime += src_bstat->ntime;
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
2019-11-06 12:49:57 -08:00
|
|
|
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
|
|
|
|
struct cgroup_base_stat *src_bstat)
|
|
|
|
{
|
|
|
|
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
|
|
|
|
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
|
|
|
|
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
|
2022-06-29 14:14:26 -07:00
|
|
|
#ifdef CONFIG_SCHED_CORE
|
|
|
|
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
|
|
|
|
#endif
|
2024-10-02 11:47:16 -07:00
|
|
|
dst_bstat->ntime -= src_bstat->ntime;
|
2019-11-06 12:49:57 -08:00
|
|
|
}
|
|
|
|
|
2018-04-26 14:29:04 -07:00
|
|
|
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
2018-04-26 14:29:04 -07:00
|
|
|
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
|
cgroup: rstat: punt root-level optimization to individual controllers
Current users of the rstat code can source root-level statistics from
the native counters of their respective subsystem, allowing them to
forego aggregation at the root level. This optimization is currently
implemented inside the generic rstat code, which doesn't track the root
cgroup and doesn't invoke the subsystem flush callbacks on it.
However, the memory controller cannot do this optimization, because
cgroup1 breaks out memory specifically for the local level, including at
the root level. In preparation for the memory controller switching to
rstat, move the optimization from rstat core to the controllers.
Afterwards, rstat will always track the root cgroup for changes and
invoke the subsystem callbacks on it; and it's up to the subsystem to
special-case and skip aggregation of the root cgroup if it can source
this information through other, cheaper means.
This is the case for the io controller and the cgroup base stats. In
their respective flush callbacks, check whether the parent is the root
cgroup, and if so, skip the unnecessary upward propagation.
The extra cost of tracking the root cgroup is negligible: on stat
changes, we actually remove a branch that checks for the root. The
queueing for a flush touches only per-cpu data, and only the first stat
change since a flush requires a (per-cpu) lock.
Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-29 22:56:23 -07:00
|
|
|
struct cgroup *parent = cgroup_parent(cgrp);
|
2023-08-07 11:29:30 +08:00
|
|
|
struct cgroup_rstat_cpu *prstatc;
|
2022-01-08 00:38:17 +00:00
|
|
|
struct cgroup_base_stat delta;
|
2017-09-25 08:12:05 -07:00
|
|
|
unsigned seq;
|
|
|
|
|
cgroup: rstat: punt root-level optimization to individual controllers
Current users of the rstat code can source root-level statistics from
the native counters of their respective subsystem, allowing them to
forego aggregation at the root level. This optimization is currently
implemented inside the generic rstat code, which doesn't track the root
cgroup and doesn't invoke the subsystem flush callbacks on it.
However, the memory controller cannot do this optimization, because
cgroup1 breaks out memory specifically for the local level, including at
the root level. In preparation for the memory controller switching to
rstat, move the optimization from rstat core to the controllers.
Afterwards, rstat will always track the root cgroup for changes and
invoke the subsystem callbacks on it; and it's up to the subsystem to
special-case and skip aggregation of the root cgroup if it can source
this information through other, cheaper means.
This is the case for the io controller and the cgroup base stats. In
their respective flush callbacks, check whether the parent is the root
cgroup, and if so, skip the unnecessary upward propagation.
The extra cost of tracking the root cgroup is negligible: on stat
changes, we actually remove a branch that checks for the root. The
queueing for a flush touches only per-cpu data, and only the first stat
change since a flush requires a (per-cpu) lock.
Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-29 22:56:23 -07:00
|
|
|
/* Root-level stats are sourced from system-wide CPU stats */
|
|
|
|
if (!parent)
|
|
|
|
return;
|
|
|
|
|
2017-09-25 08:12:05 -07:00
|
|
|
/* fetch the current per-cpu values */
|
|
|
|
do {
|
2018-04-26 14:29:04 -07:00
|
|
|
seq = __u64_stats_fetch_begin(&rstatc->bsync);
|
2022-01-08 00:38:17 +00:00
|
|
|
delta = rstatc->bstat;
|
2018-04-26 14:29:04 -07:00
|
|
|
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
|
2017-09-25 08:12:05 -07:00
|
|
|
|
2023-08-07 11:29:30 +08:00
|
|
|
/* propagate per-cpu delta to cgroup and per-cpu global statistics */
|
2019-11-06 12:49:57 -08:00
|
|
|
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
|
|
|
|
cgroup_base_stat_add(&cgrp->bstat, &delta);
|
|
|
|
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
|
2023-08-07 11:29:30 +08:00
|
|
|
cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
|
2019-11-06 12:49:57 -08:00
|
|
|
|
2023-08-07 11:29:30 +08:00
|
|
|
/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
|
cgroup: rstat: punt root-level optimization to individual controllers
Current users of the rstat code can source root-level statistics from
the native counters of their respective subsystem, allowing them to
forego aggregation at the root level. This optimization is currently
implemented inside the generic rstat code, which doesn't track the root
cgroup and doesn't invoke the subsystem flush callbacks on it.
However, the memory controller cannot do this optimization, because
cgroup1 breaks out memory specifically for the local level, including at
the root level. In preparation for the memory controller switching to
rstat, move the optimization from rstat core to the controllers.
Afterwards, rstat will always track the root cgroup for changes and
invoke the subsystem callbacks on it; and it's up to the subsystem to
special-case and skip aggregation of the root cgroup if it can source
this information through other, cheaper means.
This is the case for the io controller and the cgroup base stats. In
their respective flush callbacks, check whether the parent is the root
cgroup, and if so, skip the unnecessary upward propagation.
The extra cost of tracking the root cgroup is negligible: on stat
changes, we actually remove a branch that checks for the root. The
queueing for a flush touches only per-cpu data, and only the first stat
change since a flush requires a (per-cpu) lock.
Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-29 22:56:23 -07:00
|
|
|
if (cgroup_parent(parent)) {
|
2019-11-06 12:49:57 -08:00
|
|
|
delta = cgrp->bstat;
|
|
|
|
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
|
|
|
|
cgroup_base_stat_add(&parent->bstat, &delta);
|
|
|
|
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
|
2023-08-07 11:29:30 +08:00
|
|
|
|
|
|
|
delta = rstatc->subtree_bstat;
|
|
|
|
prstatc = cgroup_rstat_cpu(parent, cpu);
|
|
|
|
cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
|
|
|
|
cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
|
|
|
|
cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
|
2019-11-06 12:49:57 -08:00
|
|
|
}
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
2018-04-26 14:29:04 -07:00
|
|
|
static struct cgroup_rstat_cpu *
|
2021-07-27 13:12:20 -10:00
|
|
|
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
2018-04-26 14:29:04 -07:00
|
|
|
struct cgroup_rstat_cpu *rstatc;
|
2017-09-25 08:12:05 -07:00
|
|
|
|
2018-04-26 14:29:04 -07:00
|
|
|
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
|
2021-07-27 13:12:20 -10:00
|
|
|
*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
|
2018-04-26 14:29:04 -07:00
|
|
|
return rstatc;
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
2018-04-26 14:29:04 -07:00
|
|
|
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
|
2021-07-27 13:12:20 -10:00
|
|
|
struct cgroup_rstat_cpu *rstatc,
|
|
|
|
unsigned long flags)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
2021-07-27 13:12:20 -10:00
|
|
|
u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
|
2018-04-26 14:29:05 -07:00
|
|
|
cgroup_rstat_updated(cgrp, smp_processor_id());
|
2018-04-26 14:29:04 -07:00
|
|
|
put_cpu_ptr(rstatc);
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Charge @delta_exec to the sum_exec_runtime counter of @cgrp on the
 * current CPU, inside a begin/end base-stat update section.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	unsigned long flags;
	struct cgroup_rstat_cpu *pcpu =
		cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	pcpu->bstat.cputime.sum_exec_runtime += delta_exec;

	cgroup_base_stat_cputime_account_end(cgrp, pcpu, flags);
}
|
|
|
|
|
|
|
|
/*
 * Charge @delta_exec to the per-cpu base-stat field of @cgrp selected by
 * @index:
 *
 *   CPUTIME_NICE                  -> ntime and utime
 *   CPUTIME_USER                  -> utime
 *   CPUTIME_SYSTEM, _IRQ, _SOFTIRQ -> stime
 *   CPUTIME_FORCEIDLE             -> forceidle_sum (CONFIG_SCHED_CORE only)
 *
 * Any other index changes no counter, but the update section is still
 * opened and closed.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	unsigned long flags;
	struct cgroup_rstat_cpu *pcpu =
		cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_NICE:
		/* nice time is tracked separately and also counts as user time */
		pcpu->bstat.ntime += delta_exec;
		pcpu->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_USER:
		pcpu->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		pcpu->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		pcpu->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, pcpu, flags);
}
|
|
|
|
|
2020-05-27 14:43:19 -07:00
|
|
|
/*
|
|
|
|
* compute the cputime for the root cgroup by getting the per cpu data
|
|
|
|
* at a global level, then categorizing the fields in a manner consistent
|
|
|
|
* with how it is done by __cgroup_account_cputime_field for each bit of
|
|
|
|
* cpu time attributed to a cgroup.
|
|
|
|
*/
|
2022-06-29 14:14:26 -07:00
|
|
|
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
|
2020-05-27 14:43:19 -07:00
|
|
|
{
|
2022-06-29 14:14:26 -07:00
|
|
|
struct task_cputime *cputime = &bstat->cputime;
|
2020-05-27 14:43:19 -07:00
|
|
|
int i;
|
|
|
|
|
2023-03-15 14:40:29 -07:00
|
|
|
memset(bstat, 0, sizeof(*bstat));
|
2020-05-27 14:43:19 -07:00
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
struct kernel_cpustat kcpustat;
|
|
|
|
u64 *cpustat = kcpustat.cpustat;
|
|
|
|
u64 user = 0;
|
|
|
|
u64 sys = 0;
|
|
|
|
|
|
|
|
kcpustat_cpu_fetch(&kcpustat, i);
|
|
|
|
|
|
|
|
user += cpustat[CPUTIME_USER];
|
|
|
|
user += cpustat[CPUTIME_NICE];
|
|
|
|
cputime->utime += user;
|
|
|
|
|
|
|
|
sys += cpustat[CPUTIME_SYSTEM];
|
|
|
|
sys += cpustat[CPUTIME_IRQ];
|
|
|
|
sys += cpustat[CPUTIME_SOFTIRQ];
|
|
|
|
cputime->stime += sys;
|
|
|
|
|
|
|
|
cputime->sum_exec_runtime += user;
|
|
|
|
cputime->sum_exec_runtime += sys;
|
|
|
|
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
|
2022-06-29 14:14:26 -07:00
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_CORE
|
|
|
|
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
|
|
|
|
#endif
|
2024-10-02 11:47:16 -07:00
|
|
|
bstat->ntime += cpustat[CPUTIME_NICE];
|
2020-05-27 14:43:19 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-07-04 14:01:19 +00:00
|
|
|
|
|
|
|
/*
 * Print @bstat's force-idle time to @seq as
 * "core_sched.force_idle_usec <usec>".  The stored value is divided by
 * NSEC_PER_USEC before printing.  Compiles to a no-op without
 * CONFIG_SCHED_CORE.
 */
static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
{
#ifdef CONFIG_SCHED_CORE
	u64 idle_usec = bstat->forceidle_sum;

	/* do_div() divides in place */
	do_div(idle_usec, NSEC_PER_USEC);
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", idle_usec);
#endif
}
|
|
|
|
|
2018-04-26 14:29:04 -07:00
|
|
|
void cgroup_base_stat_cputime_show(struct seq_file *seq)
|
2017-09-25 08:12:05 -07:00
|
|
|
{
|
|
|
|
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
2024-10-02 11:47:16 -07:00
|
|
|
u64 usage, utime, stime, ntime;
|
2020-05-27 14:43:19 -07:00
|
|
|
|
|
|
|
if (cgroup_parent(cgrp)) {
|
|
|
|
cgroup_rstat_flush_hold(cgrp);
|
|
|
|
usage = cgrp->bstat.cputime.sum_exec_runtime;
|
|
|
|
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
|
|
|
|
&utime, &stime);
|
2024-10-02 11:47:16 -07:00
|
|
|
ntime = cgrp->bstat.ntime;
|
2024-04-16 19:51:26 +02:00
|
|
|
cgroup_rstat_flush_release(cgrp);
|
2020-05-27 14:43:19 -07:00
|
|
|
} else {
|
2024-07-04 14:01:19 +00:00
|
|
|
/* cgrp->bstat of root is not actually used, reuse it */
|
|
|
|
root_cgroup_cputime(&cgrp->bstat);
|
|
|
|
usage = cgrp->bstat.cputime.sum_exec_runtime;
|
|
|
|
utime = cgrp->bstat.cputime.utime;
|
|
|
|
stime = cgrp->bstat.cputime.stime;
|
2024-10-02 11:47:16 -07:00
|
|
|
ntime = cgrp->bstat.ntime;
|
2020-05-27 14:43:19 -07:00
|
|
|
}
|
2017-09-25 08:12:05 -07:00
|
|
|
|
|
|
|
do_div(usage, NSEC_PER_USEC);
|
|
|
|
do_div(utime, NSEC_PER_USEC);
|
|
|
|
do_div(stime, NSEC_PER_USEC);
|
2024-10-02 11:47:16 -07:00
|
|
|
do_div(ntime, NSEC_PER_USEC);
|
2017-09-25 08:12:05 -07:00
|
|
|
|
2017-10-23 16:18:27 -07:00
|
|
|
seq_printf(seq, "usage_usec %llu\n"
|
2024-10-02 11:47:16 -07:00
|
|
|
"user_usec %llu\n"
|
|
|
|
"system_usec %llu\n"
|
|
|
|
"nice_usec %llu\n",
|
|
|
|
usage, utime, stime, ntime);
|
2022-06-29 14:14:26 -07:00
|
|
|
|
2024-07-04 14:01:19 +00:00
|
|
|
cgroup_force_idle_show(seq, &cgrp->bstat);
|
2017-09-25 08:12:05 -07:00
|
|
|
}
|
2022-08-24 16:31:15 -07:00
|
|
|
|
|
|
|
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
|
2024-01-28 18:24:08 -07:00
|
|
|
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
|
2022-08-24 16:31:15 -07:00
|
|
|
BTF_ID_FLAGS(func, cgroup_rstat_updated)
|
|
|
|
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
|
2024-01-28 18:24:08 -07:00
|
|
|
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
|
2022-08-24 16:31:15 -07:00
|
|
|
|
|
|
|
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.set = &bpf_rstat_kfunc_ids,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register the rstat kfunc id set for BPF_PROG_TYPE_TRACING programs.
 * Runs as a late initcall and returns the registration result.
 */
static int __init bpf_rstat_kfunc_init(void)
{
	int ret;

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					&bpf_rstat_kfunc_set);
	return ret;
}
late_initcall(bpf_rstat_kfunc_init);
|