virtio_balloon: introduce memory scan/reclaim info

Expose memory scan/reclaim information to the host side via virtio
balloon device.

Now we have a metric to analyze the memory performance:

y: counter increases
n: counter does not changes
h: the rate of counter change is high
l: the rate of counter change is low

OOM: VIRTIO_BALLOON_S_OOM_KILL
STALL: VIRTIO_BALLOON_S_ALLOC_STALL
ASCAN: VIRTIO_BALLOON_S_SCAN_ASYNC
DSCAN: VIRTIO_BALLOON_S_SCAN_DIRECT
ARCLM: VIRTIO_BALLOON_S_RECLAIM_ASYNC
DRCLM: VIRTIO_BALLOON_S_RECLAIM_DIRECT

- OOM[y], STALL[*], ASCAN[*], DSCAN[*], ARCLM[*], DRCLM[*]:
  the guest runs under really critial memory pressure

- OOM[n], STALL[h], ASCAN[*], DSCAN[l], ARCLM[*], DRCLM[l]:
  the memory allocation stalls due to cgroup, not the global memory
  pressure.

- OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[h]:
  the memory allocation stalls due to global memory pressure. The
  performance gets hurt a lot. A high ratio between DRCLM/DSCAN shows
  quite effective memory reclaiming.

- OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[l]:
  the memory allocation stalls due to global memory pressure.
  the ratio between DRCLM/DSCAN gets low, the guest OS is thrashing
  heavily, the serious case leads poor performance and difficult
  trouble shooting. Ex, sshd may block on memory allocation when
  accepting new connections, a user can't login a VM by ssh command.

- OOM[n], STALL[n], ASCAN[h], DSCAN[n], ARCLM[l], DRCLM[n]:
  the low ratio between ARCLM/ASCAN shows that the guest tries to
  reclaim more memory, but it can't. Once more memory is required in
  future, it will struggle to reclaim memory.

Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
Message-Id: <20240423034109.1552866-5-pizhenwei@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This commit is contained in:
zhenwei pi 2024-04-23 11:41:09 +08:00 committed by Michael S. Tsirkin
parent c5b70a26aa
commit 74c025c5d7
2 changed files with 19 additions and 2 deletions

View File

@ -373,6 +373,15 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb)
update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall);
update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_SCAN,
pages_to_bytes(events[PGSCAN_KSWAPD]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_SCAN,
pages_to_bytes(events[PGSCAN_DIRECT]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_RECLAIM,
pages_to_bytes(events[PGSTEAL_KSWAPD]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_RECLAIM,
pages_to_bytes(events[PGSTEAL_DIRECT]));
#ifdef CONFIG_HUGETLB_PAGE
update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
events[HTLB_BUDDY_PGALLOC]);

View File

@ -73,7 +73,11 @@ struct virtio_balloon_config {
#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */
#define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */
#define VIRTIO_BALLOON_S_ALLOC_STALL 11 /* Stall count of memory allocatoin */
#define VIRTIO_BALLOON_S_NR 12
#define VIRTIO_BALLOON_S_ASYNC_SCAN 12 /* Amount of memory scanned asynchronously */
#define VIRTIO_BALLOON_S_DIRECT_SCAN 13 /* Amount of memory scanned directly */
#define VIRTIO_BALLOON_S_ASYNC_RECLAIM 14 /* Amount of memory reclaimed asynchronously */
#define VIRTIO_BALLOON_S_DIRECT_RECLAIM 15 /* Amount of memory reclaimed directly */
#define VIRTIO_BALLOON_S_NR 16
#define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
@ -87,7 +91,11 @@ struct virtio_balloon_config {
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \
VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \
VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \
VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls", \
VIRTIO_BALLOON_S_NAMES_prefix "async-scans", \
VIRTIO_BALLOON_S_NAMES_prefix "direct-scans", \
VIRTIO_BALLOON_S_NAMES_prefix "async-reclaims", \
VIRTIO_BALLOON_S_NAMES_prefix "direct-reclaims" \
}
#define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")