// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst)
 * Copyright 2024 Red Hat (Maxime Ripard)
 * Partially based on the rdma and misc controllers, which bear the following
 * copyrights:
 *
 * Copyright 2020 Google LLC
 * Copyright (C) 2016 Parav Pandit
 */

#include <linux/cgroup.h>
#include <linux/cgroup_dmem.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/page_counter.h>
#include <linux/parser.h>
#include <linux/slab.h>

struct dmem_cgroup_region {
	/**
	 * @ref: References keeping the region alive.
	 * Keeps the region reference alive after a successful RCU lookup.
	 */
	struct kref ref;

	/** @rcu: RCU head for freeing */
	struct rcu_head rcu;

	/**
	 * @region_node: Linked into &dmem_cgroup_regions list.
	 * Protected by RCU and global spinlock.
	 */
	struct list_head region_node;

	/**
	 * @pools: List of pools linked to this region.
	 * Protected by global spinlock only
	 */
	struct list_head pools;

	/** @size: Size of region, in bytes */
	u64 size;

	/** @name: Name describing the node, set by dmem_cgroup_register_region */
	char *name;

	/**
	 * @unregistered: Whether the region is unregistered by its caller.
	 * No new pools should be added to the region afterwards.
	 */
	bool unregistered;
};

struct dmemcg_state {
	struct cgroup_subsys_state css;

	struct list_head pools;
};

struct dmem_cgroup_pool_state {
	struct dmem_cgroup_region *region;
	struct dmemcg_state *cs;

	/* css node, RCU protected against region teardown */
	struct list_head css_node;

	/* dev node, no RCU protection required */
	struct list_head region_node;

	struct rcu_head rcu;

	struct page_counter cnt;

	bool inited;
};

/*
 * 3 operations require locking protection:
 * - Registering and unregistering region to/from list, requires global lock.
 * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
 * - Adding a dmem_cgroup_pool_state to a region list.
 *
 * Since for the most common operations RCU provides enough protection, I
 * do not think more granular locking makes sense. Most protection is offered
 * by RCU and the lockless operation of page_counter.
 */
static DEFINE_SPINLOCK(dmemcg_lock);
static LIST_HEAD(dmem_cgroup_regions);

static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
	return container_of(css, struct dmemcg_state, css);
}

static inline struct dmemcg_state *get_current_dmemcs(void)
{
	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
}

static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
{
	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
}

static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
	list_del(&pool->region_node);
	kfree(pool);
}

static void
set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_min(&pool->cnt, val);
}

static void
set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_low(&pool->cnt, val);
}

static void
set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_max(&pool->cnt, val);
}

static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.low) : 0;
}

static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.min) : 0;
}

static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
}

static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
{
	return pool ? page_counter_read(&pool->cnt) : 0;
}

static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
{
	set_resource_min(rpool, 0);
	set_resource_low(rpool, 0);
	set_resource_max(rpool, PAGE_COUNTER_MAX);
}
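/*
 * Illustrative note (not part of the interface): each (cgroup, region) pair
 * owns one dmem_cgroup_pool_state. Its page_counter is parented to the pool
 * of the parent cgroup for the same region, so a charge made in a leaf
 * cgroup propagates up to the root. For a hierarchy root -> A -> B and a
 * hypothetical region "vram0":
 *
 *	pool(B, vram0)->cnt.parent == &pool(A, vram0)->cnt
 *	pool(A, vram0)->cnt.parent == &pool(root, vram0)->cnt
 *
 * The min/low/max helpers above are simply the page_counter watermarks of
 * that per-(cgroup, region) counter, and "current" is page_counter_read()
 * on the same counter.
 */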
static void dmemcs_offline(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool;

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
		reset_all_resource_limits(pool);
	rcu_read_unlock();
}

static void dmemcs_free(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool, *next;

	spin_lock(&dmemcg_lock);
	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
		/*
		 * The pool is dead and all references are 0,
		 * no need for RCU protection with list_del_rcu or freeing.
		 */
		list_del(&pool->css_node);
		free_cg_pool(pool);
	}
	spin_unlock(&dmemcg_lock);

	kfree(dmemcs);
}

static struct cgroup_subsys_state *
dmemcs_alloc(struct cgroup_subsys_state *parent_css)
{
	struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
	if (!dmemcs)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dmemcs->pools);
	return &dmemcs->css;
}

static struct dmem_cgroup_pool_state *
find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool;

	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
		if (pool->region == region)
			return pool;

	return NULL;
}

static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
{
	if (!pool->cnt.parent)
		return NULL;

	return container_of(pool->cnt.parent, typeof(*pool), cnt);
}

static void
dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
				 struct dmem_cgroup_pool_state *test_pool)
{
	struct page_counter *climit;
	struct cgroup_subsys_state *css, *next_css;
	struct dmemcg_state *dmemcg_iter;
	struct dmem_cgroup_pool_state *pool, *parent_pool;
	bool found_descendant;

	climit = &limit_pool->cnt;

	rcu_read_lock();
	parent_pool = pool = limit_pool;
	css = &limit_pool->cs->css;

	/*
	 * This logic is roughly equivalent to css_foreach_descendant_pre,
	 * except we also track the parent pool to find out which pool we need
	 * to calculate protection values for.
	 *
	 * We can stop the traversal once we find test_pool among the
	 * descendants since we don't really care about any others.
	 */
	while (pool != test_pool) {
		next_css = css_next_child(NULL, css);
		if (next_css) {
			parent_pool = pool;
		} else {
			while (css != &limit_pool->cs->css) {
				next_css = css_next_child(css, css->parent);
				if (next_css)
					break;
				css = css->parent;
				parent_pool = pool_parent(parent_pool);
			}
			/*
			 * We can only hit this when test_pool is not a
			 * descendant of limit_pool.
			 */
			if (WARN_ON_ONCE(css == &limit_pool->cs->css))
				break;
		}
		css = next_css;

		found_descendant = false;
		dmemcg_iter = container_of(css, struct dmemcg_state, css);

		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
			if (pool_parent(pool) == parent_pool) {
				found_descendant = true;
				break;
			}
		}
		if (!found_descendant)
			continue;

		page_counter_calculate_protection(
			climit, &pool->cnt, true);
	}
	rcu_read_unlock();
}

/**
 * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
 * @limit_pool: The pool for which we hit limits
 * @test_pool: The pool for which to test
 * @ignore_low: Whether we have to respect low watermarks.
 * @ret_hit_low: Pointer to whether it makes sense to consider low watermark.
 *
 * This function returns true if we can evict from @test_pool, false if not.
 * When returning false and @ignore_low is false, @ret_hit_low may
 * be set to true to indicate this function can be retried with @ignore_low
 * set to true.
 *
 * Return: true if eviction from @test_pool is allowed, false otherwise.
 */
bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
				      struct dmem_cgroup_pool_state *test_pool,
				      bool ignore_low, bool *ret_hit_low)
{
	struct dmem_cgroup_pool_state *pool = test_pool;
	struct page_counter *climit, *ctest;
	u64 used, min, low;

	/* Can always evict from current pool, despite limits */
	if (limit_pool == test_pool)
		return true;

	if (limit_pool) {
		if (!parent_dmemcs(limit_pool->cs))
			return true;

		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
			{}

		if (!pool)
			return false;
	} else {
		/*
		 * If there is no cgroup limiting memory usage, use the root
		 * cgroup instead for limit calculations.
		 */
		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
			{}
	}

	climit = &limit_pool->cnt;
	ctest = &test_pool->cnt;

	dmem_cgroup_calculate_protection(limit_pool, test_pool);

	used = page_counter_read(ctest);
	min = READ_ONCE(ctest->emin);

	if (used <= min)
		return false;

	if (!ignore_low) {
		low = READ_ONCE(ctest->elow);
		if (used > low)
			return true;

		*ret_hit_low = true;
		return false;
	}
	return true;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
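/*
 * Illustrative sketch (hypothetical caller, not part of this file) of how the
 * limiting pool returned by a failed dmem_cgroup_try_charge() can drive an
 * eviction pass. my_obj, my_lru and my_evict() are made-up names:
 *
 *	bool ignore_low = false, hit_low = false;
 *	struct my_obj *obj;
 * retry:
 *	list_for_each_entry(obj, &my_lru, lru_node) {
 *		if (!dmem_cgroup_state_evict_valuable(limit_pool, obj->pool,
 *						      ignore_low, &hit_low))
 *			continue;
 *		my_evict(obj);
 *	}
 *	if (!ignore_low && hit_low) {
 *		ignore_low = true;	// retry, now also shrinking "low"-protected pools
 *		goto retry;
 *	}
 *	dmem_cgroup_pool_state_put(limit_pool);
 */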
static struct dmem_cgroup_pool_state *
alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		  struct dmem_cgroup_pool_state **allocpool)
{
	struct dmemcg_state *parent = parent_dmemcs(dmemcs);
	struct dmem_cgroup_pool_state *pool, *ppool = NULL;

	if (!*allocpool) {
		pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
		if (!pool)
			return ERR_PTR(-ENOMEM);
	} else {
		pool = *allocpool;
		*allocpool = NULL;
	}

	pool->region = region;
	pool->cs = dmemcs;

	if (parent)
		ppool = find_cg_pool_locked(parent, region);

	page_counter_init(&pool->cnt,
			  ppool ? &ppool->cnt : NULL, true);
	reset_all_resource_limits(pool);

	list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
	list_add_tail(&pool->region_node, &region->pools);

	if (!parent)
		pool->inited = true;
	else
		pool->inited = ppool ? ppool->inited : false;
	return pool;
}

static struct dmem_cgroup_pool_state *
get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		   struct dmem_cgroup_pool_state **allocpool)
{
	struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
	struct dmemcg_state *p, *pp;

	/*
	 * Recursively create pool, we may not initialize yet on
	 * recursion, this is done as a separate step.
	 */
	for (p = dmemcs; p; p = parent_dmemcs(p)) {
		pool = find_cg_pool_locked(p, region);
		if (!pool)
			pool = alloc_pool_single(p, region, allocpool);

		if (IS_ERR(pool))
			return pool;

		if (p == dmemcs && pool->inited)
			return pool;

		if (pool->inited)
			break;
	}

	retpool = pool = find_cg_pool_locked(dmemcs, region);
	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
		if (pool->inited)
			break;

		/* ppool was created if it didn't exist by above loop. */
		ppool = find_cg_pool_locked(pp, region);

		/* Fix up parent links, mark as inited. */
		pool->cnt.parent = &ppool->cnt;
		pool->inited = true;

		pool = ppool;
	}

	return retpool;
}

static void dmemcg_free_rcu(struct rcu_head *rcu)
{
	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
	struct dmem_cgroup_pool_state *pool, *next;

	list_for_each_entry_safe(pool, next, &region->pools, region_node)
		free_cg_pool(pool);
	kfree(region->name);
	kfree(region);
}

static void dmemcg_free_region(struct kref *ref)
{
	struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);

	call_rcu(&cgregion->rcu, dmemcg_free_rcu);
}

/**
 * dmem_cgroup_unregister_region() - Unregister a previously registered region.
 * @region: The region to unregister.
 *
 * This function undoes dmem_cgroup_register_region().
 */
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
	struct list_head *entry;

	if (!region)
		return;

	spin_lock(&dmemcg_lock);

	/* Remove from global region list */
	list_del_rcu(&region->region_node);

	list_for_each_rcu(entry, &region->pools) {
		struct dmem_cgroup_pool_state *pool =
			container_of(entry, typeof(*pool), region_node);

		list_del_rcu(&pool->css_node);
	}

	/*
	 * Ensure any RCU based lookups fail. Additionally,
	 * no new pools should be added to the dead region
	 * by get_cg_pool_unlocked.
	 */
	region->unregistered = true;
	spin_unlock(&dmemcg_lock);

	kref_put(&region->ref, dmemcg_free_region);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);

/**
 * dmem_cgroup_register_region() - Register a region for the dmem cgroup.
 * @size: Size of the region to register, in bytes.
 * @fmt: Printf-style format string for the region name.
 *
 * This function registers a node in the dmem cgroup with the
 * name given. After calling this function, the region can be
 * used for allocations.
 *
 * Return: %NULL if @size is zero, a valid pointer on success,
 * or an ERR_PTR() on failure.
 */
struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
{
	struct dmem_cgroup_region *ret;
	char *region_name;
	va_list ap;

	if (!size)
		return NULL;

	va_start(ap, fmt);
	region_name = kvasprintf(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!region_name)
		return ERR_PTR(-ENOMEM);

	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret) {
		kfree(region_name);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&ret->pools);
	ret->name = region_name;
	ret->size = size;
	kref_init(&ret->ref);

	spin_lock(&dmemcg_lock);
	list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
	spin_unlock(&dmemcg_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
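/*
 * Illustrative sketch (hypothetical driver, not part of this file): a device
 * driver would typically register one region per memory resource at probe
 * time and unregister it on removal. The "drm/%s/vram0" name and my_dev are
 * made-up examples; any unique, stable string works:
 *
 *	my_dev->region = dmem_cgroup_register_region(my_dev->vram_size,
 *						     "drm/%s/vram0",
 *						     dev_name(my_dev->dev));
 *	if (IS_ERR(my_dev->region))
 *		return PTR_ERR(my_dev->region);
 *	...
 *	dmem_cgroup_unregister_region(my_dev->region);	// on device removal
 */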
static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
{
	struct dmem_cgroup_region *region;

	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
		if (!strcmp(name, region->name) &&
		    kref_get_unless_zero(&region->ref))
			return region;

	return NULL;
}

/**
 * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
 * @pool: &dmem_cgroup_pool_state
 *
 * Called to drop a reference to the limiting pool returned by
 * dmem_cgroup_try_charge().
 */
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
	if (pool)
		css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);

static struct dmem_cgroup_pool_state *
get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool, *allocpool = NULL;

	/* fastpath lookup? */
	rcu_read_lock();
	pool = find_cg_pool_locked(cg, region);
	if (pool && !READ_ONCE(pool->inited))
		pool = NULL;
	rcu_read_unlock();

	while (!pool) {
		spin_lock(&dmemcg_lock);
		if (!region->unregistered)
			pool = get_cg_pool_locked(cg, region, &allocpool);
		else
			pool = ERR_PTR(-ENODEV);
		spin_unlock(&dmemcg_lock);

		if (pool == ERR_PTR(-ENOMEM)) {
			pool = NULL;
			if (WARN_ON(allocpool))
				continue;

			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
			if (allocpool) {
				pool = NULL;
				continue;
			}
		}
	}

	kfree(allocpool);
	return pool;
}

/**
 * dmem_cgroup_uncharge() - Uncharge a pool.
 * @pool: Pool to uncharge.
 * @size: Size to uncharge.
 *
 * Undoes the effects of dmem_cgroup_try_charge().
 * Must be called with the returned pool as argument,
 * and the same @size.
 */
void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
	if (!pool)
		return;

	page_counter_uncharge(&pool->cnt, size);
	css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
/**
 * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
 * @region: Region to charge.
 * @size: Size (in bytes) to charge.
 * @ret_pool: On successful allocation, the pool that is charged.
 * @ret_limit_pool: On a failed allocation, the limiting pool.
 *
 * This function charges the pool of the current cgroup for @region with a
 * size of @size bytes.
 *
 * If the function succeeds, @ret_pool is set, which must be passed to
 * dmem_cgroup_uncharge() when undoing the allocation.
 *
 * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
 * will be set to the pool for which the limit is hit. This can be used for
 * eviction as argument to dmem_cgroup_state_evict_valuable(). This reference
 * must be freed with dmem_cgroup_pool_state_put().
 *
 * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
 */
int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
			   struct dmem_cgroup_pool_state **ret_pool,
			   struct dmem_cgroup_pool_state **ret_limit_pool)
{
	struct dmemcg_state *cg;
	struct dmem_cgroup_pool_state *pool;
	struct page_counter *fail;
	int ret;

	*ret_pool = NULL;
	if (ret_limit_pool)
		*ret_limit_pool = NULL;

	/*
	 * hold on to css, as cgroup can be removed but resource
	 * accounting happens on css.
	 */
	cg = get_current_dmemcs();

	pool = get_cg_pool_unlocked(cg, region);
	if (IS_ERR(pool)) {
		ret = PTR_ERR(pool);
		goto err;
	}

	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
		if (ret_limit_pool) {
			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
			css_get(&(*ret_limit_pool)->cs->css);
		}
		ret = -EAGAIN;
		goto err;
	}

	/* On success, reference from get_current_dmemcs is transferred to *ret_pool */
	*ret_pool = pool;
	return 0;

err:
	css_put(&cg->css);
	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
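/*
 * Illustrative sketch (hypothetical allocation path, not part of this file):
 * dmem_cgroup_try_charge() and dmem_cgroup_uncharge() bracket the lifetime of
 * a driver allocation; the pool returned on success is stored with the
 * allocation so the exact same pool and size can be uncharged on free.
 * The obj fields and limit_pool handling are made-up names:
 *
 *	ret = dmem_cgroup_try_charge(region, obj->size, &obj->pool, &limit_pool);
 *	if (ret == -EAGAIN && limit_pool) {
 *		// try to evict, guided by dmem_cgroup_state_evict_valuable()
 *		dmem_cgroup_pool_state_put(limit_pool);
 *	}
 *	if (ret)
 *		return ret;
 *	...
 *	dmem_cgroup_uncharge(obj->pool, obj->size);	// on free
 */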
static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
{
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		seq_puts(sf, region->name);
		seq_printf(sf, " %llu\n", region->size);
	}
	rcu_read_unlock();
	return 0;
}

static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
			      u64 *new_limit)
{
	char *end;

	if (!strcmp(options, "max")) {
		*new_limit = PAGE_COUNTER_MAX;
		return 0;
	}

	*new_limit = memparse(options, &end);
	if (*end != '\0')
		return -EINVAL;

	return 0;
}

static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off,
				  void (*apply)(struct dmem_cgroup_pool_state *, u64))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
	int err = 0;

	while (buf && !err) {
		struct dmem_cgroup_pool_state *pool = NULL;
		char *options, *region_name;
		struct dmem_cgroup_region *region;
		u64 new_limit;

		options = buf;
		buf = strchr(buf, '\n');
		if (buf)
			*buf++ = '\0';

		options = strstrip(options);

		/* eat empty lines */
		if (!options[0])
			continue;

		region_name = strsep(&options, " \t");
		if (!region_name[0])
			continue;

		rcu_read_lock();
		region = dmemcg_get_region_by_name(region_name);
		rcu_read_unlock();

		if (!region)
			return -EINVAL;

		err = dmemcg_parse_limit(options, region, &new_limit);
		if (err < 0)
			goto out_put;

		pool = get_cg_pool_unlocked(dmemcs, region);
		if (IS_ERR(pool)) {
			err = PTR_ERR(pool);
			goto out_put;
		}

		/* And commit */
		apply(pool, new_limit);

out_put:
		kref_put(&region->ref, dmemcg_free_region);
	}

	return err ?: nbytes;
}

static int dmemcg_limit_show(struct seq_file *sf, void *v,
			     u64 (*fn)(struct dmem_cgroup_pool_state *))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
		u64 val;

		seq_puts(sf, region->name);

		val = fn(pool);
		if (val < PAGE_COUNTER_MAX)
			seq_printf(sf, " %lld\n", val);
		else
			seq_puts(sf, " max\n");
	}
	rcu_read_unlock();

	return 0;
}

static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_current);
}

static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_min);
}

static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
}

static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_low);
}

static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
}

static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_max);
}

static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
}
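/*
 * Illustrative note: dmemcg_limit_write() above accepts one
 * "<region name> <value>" pair per line, where <value> is parsed by
 * memparse() (so suffixes like K/M/G work) or is the literal "max".
 * With a hypothetical region named "drm/0000:03:00.0/vram0", a
 * configuration write could look like:
 *
 *	echo "drm/0000:03:00.0/vram0 512M" > dmem.max
 *	echo "drm/0000:03:00.0/vram0 max" > dmem.low
 */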
static struct cftype files[] = {
	{
		.name = "capacity",
		.seq_show = dmem_cgroup_region_capacity_show,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = dmem_cgroup_region_current_show,
	},
	{
		.name = "min",
		.write = dmem_cgroup_region_min_write,
		.seq_show = dmem_cgroup_region_min_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "low",
		.write = dmem_cgroup_region_low_write,
		.seq_show = dmem_cgroup_region_low_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "max",
		.write = dmem_cgroup_region_max_write,
		.seq_show = dmem_cgroup_region_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ } /* Zero entry terminates. */
};

struct cgroup_subsys dmem_cgrp_subsys = {
	.css_alloc	= dmemcs_alloc,
	.css_free	= dmemcs_free,
	.css_offline	= dmemcs_offline,
	.legacy_cftypes	= files,
	.dfl_cftypes	= files,
};
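/*
 * Illustrative note: once this controller is built in, the files declared
 * above appear as dmem.capacity, dmem.current, dmem.min, dmem.low and
 * dmem.max in each cgroup directory where the controller is enabled, e.g.
 * (hypothetical paths):
 *
 *	echo "+dmem" > /sys/fs/cgroup/cgroup.subtree_control
 *	cat /sys/fs/cgroup/mygroup/dmem.current
 */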