mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-06 05:06:29 +00:00
cf264e1329
There is currently no good way to query the page cache state of large file sets and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really does not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing a database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for diagnosing performance issues. * Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases can be found in the following thread: https://lore.kernel.org/lkml/20230315170934.GA97793@cmpxchg.org/ This patch implements a new syscall that queries the cache state of a file and summarizes the number of cached pages, number of dirty pages, number of pages marked for writeback, number of (recently) evicted pages, etc. in a given range. Currently, the syscall is only wired in for the x86 architecture. NAME cachestat - query the page cache statistics of a file.
SYNOPSIS #include <sys/mman.h> struct cachestat_range { __u64 off; __u64 len; }; struct cachestat { __u64 nr_cache; __u64 nr_dirty; __u64 nr_writeback; __u64 nr_evicted; __u64 nr_recently_evicted; }; int cachestat(unsigned int fd, struct cachestat_range *cstat_range, struct cachestat *cstat, unsigned int flags); DESCRIPTION cachestat() queries the number of cached pages, number of dirty pages, number of pages marked for writeback, number of evicted pages, number of recently evicted pages, in the byte range given by `off` and `len`. An evicted page is a page that was previously in the page cache but has been evicted since. A page is recently evicted if its last eviction was recent enough that its reentry to the cache would indicate that it is actively being used by the system, and that there is memory pressure on the system. These values are returned in a cachestat struct, whose address is given by the `cstat` argument. The `off` and `len` arguments must be non-negative integers. If `len` > 0, the queried range is [`off`, `off` + `len`]. If `len` == 0, we will query in the range from `off` to the end of the file. The `flags` argument is unused for now, but is included for future extensibility. User should pass 0 (i.e. no flag specified). Currently, hugetlbfs is not supported. Because the status of a page can change after cachestat() checks it but before it returns to the application, the returned values may contain stale information. RETURN VALUE On success, cachestat() returns 0. On error, -1 is returned, and errno is set to indicate the error. ERRORS EFAULT cstat or cstat_range points to an invalid address. EINVAL invalid flags. EBADF invalid file descriptor.
EOPNOTSUPP file descriptor is of a hugetlbfs file [nphamcs@gmail.com: replace rounddown logic with the existing helper] Link: https://lkml.kernel.org/r/20230504022044.3675469-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20230503013608.2431726-3-nphamcs@gmail.com Signed-off-by: Nhat Pham <nphamcs@gmail.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Brian Foster <bfoster@redhat.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
59 lines
1.8 KiB
C
59 lines
1.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
#ifndef _UAPI_LINUX_MMAN_H
|
|
#define _UAPI_LINUX_MMAN_H
|
|
|
|
#include <asm/mman.h>
|
|
#include <asm-generic/hugetlb_encode.h>
|
|
#include <linux/types.h>
|
|
|
|
/*
 * MREMAP_* constants. Each value is a distinct single bit (1, 2, 4), so the
 * values do not collide when OR-ed together.
 * NOTE(review): presumably the flag bits accepted by mremap(2) -- confirm
 * against the syscall implementation.
 */
#define MREMAP_MAYMOVE		1
#define MREMAP_FIXED		2
#define MREMAP_DONTUNMAP	4
|
|
|
|
/*
 * OVERCOMMIT_* constants: three consecutive values starting at 0.
 * NOTE(review): presumably selectors for the memory overcommit policy, used
 * as an enumeration rather than a bitmask -- confirm against the users of
 * these constants.
 */
#define OVERCOMMIT_GUESS		0
#define OVERCOMMIT_ALWAYS		1
#define OVERCOMMIT_NEVER		2
|
|
|
|
/*
 * Mapping sharing types. Note that MAP_SHARED_VALIDATE (0x03) is numerically
 * equal to MAP_SHARED | MAP_PRIVATE.
 */
#define MAP_SHARED	0x01		/* Share changes */
#define MAP_PRIVATE	0x02		/* Changes are private */
#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
|
|
|
|
/*
 * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
 * size other than the default is desired.  See hugetlb_encode.h.
 * All known huge page size encodings are provided here.  It is the
 * responsibility of the application to know which sizes are supported on
 * the running system.  See mmap(2) man page for details.
 *
 * Every MAP_HUGE_* name below is a plain alias for the HUGETLB_FLAG_ENCODE_*
 * macro with the same suffix; no value is computed in this header.
 */
#define MAP_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
#define MAP_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK

#define MAP_HUGE_16KB	HUGETLB_FLAG_ENCODE_16KB
#define MAP_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
#define MAP_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
#define MAP_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
#define MAP_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
#define MAP_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
#define MAP_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
#define MAP_HUGE_32MB	HUGETLB_FLAG_ENCODE_32MB
#define MAP_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
#define MAP_HUGE_512MB	HUGETLB_FLAG_ENCODE_512MB
#define MAP_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
#define MAP_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
#define MAP_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
|
|
|
|
/*
 * Byte range of a file to be queried by cachestat().
 *
 * @off: start of the range.
 * @len: length of the range.  If @len > 0 the queried range is
 *       [@off, @off + @len]; if @len == 0 the range runs from @off to the
 *       end of the file.
 */
struct cachestat_range {
	__u64 off;
	__u64 len;
};
|
|
|
|
/*
 * Page cache statistics filled in by cachestat() for the queried range.
 * The values may already be stale when the call returns, because a page's
 * status can change after it has been checked.
 *
 * @nr_cache:            number of cached pages.
 * @nr_dirty:            number of dirty pages.
 * @nr_writeback:        number of pages marked for writeback.
 * @nr_evicted:          number of pages that were previously in the page
 *                       cache but have been evicted since.
 * @nr_recently_evicted: number of evicted pages whose last eviction was
 *                       recent enough that their reentry to the cache would
 *                       indicate active use under memory pressure.
 */
struct cachestat {
	__u64 nr_cache;
	__u64 nr_dirty;
	__u64 nr_writeback;
	__u64 nr_evicted;
	__u64 nr_recently_evicted;
};
|
|
|
|
#endif /* _UAPI_LINUX_MMAN_H */
|