mm, swap: use rbtree for swap_extent

swap_extent is used to map a swap page offset to the backing device's
block offset.  One swap_extent describes one contiguous block range, and
all of these swap_extents are managed in a linked list.

These swap_extents are used by map_swap_entry() during swap's read and
write path.  To find out the backing device's block offset for a page
offset, the swap_extent list will be traversed linearly, with
curr_swap_extent being used as a cache to speed up the search.

This works well as long as the number of swap_extents is not huge or
only a few processes access the swap device, but when the swap device
has many extents and a number of processes access it concurrently, it
can be a problem.  On one of our servers, the disk's remaining size is
tight:

  $df -h
  Filesystem      Size  Used Avail Use% Mounted on
  ... ...
  /dev/nvme0n1p1  1.8T  1.3T  504G  72% /home/t4

When creating an 80G swapfile there, there are as many as 84656 swap
extents.  The end result is that the kernel spends about 30% of its time
in map_swap_entry() and swap throughput is only 70MB/s.

As a comparison, when I used a smaller swapfile, like 4G, whose number
of swap_extents dropped to 2000, swap throughput is back to 400-500MB/s
and map_swap_entry() is about 3%.

One downside of using rbtree for swap_extent is, 'struct rb_node' takes
24 bytes while 'struct list_head' takes 16 bytes, that's 8 bytes more
for each swap_extent.  For a swapfile that has 80k swap_extents, that
means 625KiB more memory consumed.

Test:

Since it's not possible to reboot that server, I cannot test this patch
directly there.  Instead, I tested it on another server with an NVMe disk.

I created a 20G swapfile on an NVMe backed XFS fs.  By default, the
filesystem is quite clean and the created swapfile has only 2 extents.
Testing vanilla and this patch shows no obvious performance difference
when swapfile is not fragmented.

To see the patch's effects, I used some tweaks to manually fragment the
swapfile by breaking the extent at 1M boundary.  This made the swapfile
have 20K extents.

  nr_task=4
  kernel   swapout(KB/s) map_swap_entry(perf)  swapin(KB/s) map_swap_entry(perf)
  vanilla  165191           90.77%             171798          90.21%
  patched  858993 +420%      2.16%             715827 +317%     0.77%

  nr_task=8
  kernel   swapout(KB/s) map_swap_entry(perf)  swapin(KB/s) map_swap_entry(perf)
  vanilla  306783           92.19%             318145          87.76%
  patched  954437 +211%      2.35%            1073741 +237%     1.57%

swapout: the throughput of swap out, in KB/s, higher is better
1st map_swap_entry: cpu cycles percent sampled by perf
swapin: the throughput of swap in, in KB/s, higher is better
2nd map_swap_entry: cpu cycles percent sampled by perf

nr_task=1 doesn't show any difference; this is because curr_swap_extent
can effectively cache the correct swap extent for a single-task
workload.

[akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/]
Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu
Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Aaron Lu 2019-07-11 20:55:41 -07:00 committed by Linus Torvalds
parent 054f1d1faa
commit 4efaceb1c5
3 changed files with 75 additions and 63 deletions

View File

@ -148,7 +148,7 @@ struct zone;
* We always assume that blocks are of size PAGE_SIZE. * We always assume that blocks are of size PAGE_SIZE.
*/ */
struct swap_extent { struct swap_extent {
struct list_head list; struct rb_node rb_node;
pgoff_t start_page; pgoff_t start_page;
pgoff_t nr_pages; pgoff_t nr_pages;
sector_t start_block; sector_t start_block;
@ -248,8 +248,7 @@ struct swap_info_struct {
unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_next; /* likely index for next allocation */
unsigned int cluster_nr; /* countdown to next cluster search */ unsigned int cluster_nr; /* countdown to next cluster search */
struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
struct swap_extent *curr_swap_extent; struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct swap_extent first_swap_extent;
struct block_device *bdev; /* swap device or bdev of swap file */ struct block_device *bdev; /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */ struct file *swap_file; /* seldom referenced */
unsigned int old_block_size; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */

View File

@ -163,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
blocks_per_page = PAGE_SIZE >> blkbits; blocks_per_page = PAGE_SIZE >> blkbits;
/* /*
* Map all the blocks into the extent list. This code doesn't try * Map all the blocks into the extent tree. This code doesn't try
* to be very smart. * to be very smart.
*/ */
probe_block = 0; probe_block = 0;

View File

@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
return ret; return ret;
} }
/*
 * Return the swap_extent that embeds the node rb_first() reports for
 * sis->swap_extent_root.
 *
 * NOTE(review): the rb_first() result is handed to rb_entry() without a
 * NULL check; presumably callers only use this once the tree holds at
 * least one extent -- confirm against the callers.
 */
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
struct rb_node *rb = rb_first(&sis->swap_extent_root);
return rb_entry(rb, struct swap_extent, rb_node);
}
/*
 * Return the swap_extent that embeds the node rb_next() reports for
 * se->rb_node, or NULL when rb_next() returns no node.
 */
static inline struct swap_extent *next_se(struct swap_extent *se)
{
struct rb_node *rb = rb_next(&se->rb_node);
return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}
/* /*
* swapon tell device that all the old swap contents can be discarded, * swapon tell device that all the old swap contents can be discarded,
* to allow the swap device to optimize its wear-levelling. * to allow the swap device to optimize its wear-levelling.
@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
int err = 0; int err = 0;
/* Do not discard the swap header page! */ /* Do not discard the swap header page! */
se = &si->first_swap_extent; se = first_se(si);
start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) { if (nr_blocks) {
@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
cond_resched(); cond_resched();
} }
list_for_each_entry(se, &si->first_swap_extent.list, list) { for (se = next_se(se); se; se = next_se(se)) {
start_block = se->start_block << (PAGE_SHIFT - 9); start_block = se->start_block << (PAGE_SHIFT - 9);
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
return err; /* That will often be -EOPNOTSUPP */ return err; /* That will often be -EOPNOTSUPP */
} }
/*
 * Find the swap_extent whose page range
 * [start_page, start_page + nr_pages) contains @offset.
 *
 * Walks down from the root of sis->swap_extent_root: goes left when
 * @offset is below the extent's start_page, right when it is at or past
 * the extent's end, and returns the extent otherwise.
 *
 * Does not return NULL: if the walk runs out of nodes without a match,
 * BUG() is hit.
 */
static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
struct swap_extent *se;
struct rb_node *rb;
rb = sis->swap_extent_root.rb_node;
while (rb) {
se = rb_entry(rb, struct swap_extent, rb_node);
if (offset < se->start_page)
rb = rb->rb_left;
else if (offset >= se->start_page + se->nr_pages)
rb = rb->rb_right;
else
return se;
}
/* It *must* be present */
BUG();
}
/* /*
* swap allocation tell device that a cluster of swap can now be discarded, * swap allocation tell device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling. * to allow the swap device to optimize its wear-levelling.
@ -196,12 +228,9 @@ static int discard_swap(struct swap_info_struct *si)
static void discard_swap_cluster(struct swap_info_struct *si, static void discard_swap_cluster(struct swap_info_struct *si,
pgoff_t start_page, pgoff_t nr_pages) pgoff_t start_page, pgoff_t nr_pages)
{ {
struct swap_extent *se = si->curr_swap_extent; struct swap_extent *se = offset_to_swap_extent(si, start_page);
int found_extent = 0;
while (nr_pages) { while (nr_pages) {
if (se->start_page <= start_page &&
start_page < se->start_page + se->nr_pages) {
pgoff_t offset = start_page - se->start_page; pgoff_t offset = start_page - se->start_page;
sector_t start_block = se->start_block + offset; sector_t start_block = se->start_block + offset;
sector_t nr_blocks = se->nr_pages - offset; sector_t nr_blocks = se->nr_pages - offset;
@ -211,17 +240,13 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_page += nr_blocks; start_page += nr_blocks;
nr_pages -= nr_blocks; nr_pages -= nr_blocks;
if (!found_extent++)
si->curr_swap_extent = se;
start_block <<= PAGE_SHIFT - 9; start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block, if (blkdev_issue_discard(si->bdev, start_block,
nr_blocks, GFP_NOIO, 0)) nr_blocks, GFP_NOIO, 0))
break; break;
}
se = list_next_entry(se, list); se = next_se(se);
} }
} }
@ -1755,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
return type; return type;
} }
if (bdev == sis->bdev) { if (bdev == sis->bdev) {
struct swap_extent *se = &sis->first_swap_extent; struct swap_extent *se = first_se(sis);
if (se->start_block == offset) { if (se->start_block == offset) {
if (bdev_p) if (bdev_p)
@ -2232,7 +2257,6 @@ static void drain_mmlist(void)
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{ {
struct swap_info_struct *sis; struct swap_info_struct *sis;
struct swap_extent *start_se;
struct swap_extent *se; struct swap_extent *se;
pgoff_t offset; pgoff_t offset;
@ -2240,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
*bdev = sis->bdev; *bdev = sis->bdev;
offset = swp_offset(entry); offset = swp_offset(entry);
start_se = sis->curr_swap_extent; se = offset_to_swap_extent(sis, offset);
se = start_se;
for ( ; ; ) {
if (se->start_page <= offset &&
offset < (se->start_page + se->nr_pages)) {
return se->start_block + (offset - se->start_page); return se->start_block + (offset - se->start_page);
}
se = list_next_entry(se, list);
sis->curr_swap_extent = se;
BUG_ON(se == start_se); /* It *must* be present */
}
} }
/* /*
@ -2269,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
*/ */
static void destroy_swap_extents(struct swap_info_struct *sis) static void destroy_swap_extents(struct swap_info_struct *sis)
{ {
while (!list_empty(&sis->first_swap_extent.list)) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
struct swap_extent *se; struct rb_node *rb = sis->swap_extent_root.rb_node;
struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
se = list_first_entry(&sis->first_swap_extent.list, rb_erase(rb, &sis->swap_extent_root);
struct swap_extent, list);
list_del(&se->list);
kfree(se); kfree(se);
} }
@ -2290,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
/* /*
* Add a block range (and the corresponding page range) into this swapdev's * Add a block range (and the corresponding page range) into this swapdev's
* extent list. The extent list is kept sorted in page order. * extent tree.
* *
* This function rather assumes that it is called in ascending page order. * This function rather assumes that it is called in ascending page order.
*/ */
@ -2298,20 +2311,21 @@ int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block) unsigned long nr_pages, sector_t start_block)
{ {
struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
struct swap_extent *se; struct swap_extent *se;
struct swap_extent *new_se; struct swap_extent *new_se;
struct list_head *lh;
if (start_page == 0) { /*
se = &sis->first_swap_extent; * place the new node at the right most since the
sis->curr_swap_extent = se; * function is called in ascending page order.
se->start_page = 0; */
se->nr_pages = nr_pages; while (*link) {
se->start_block = start_block; parent = *link;
return 1; link = &parent->rb_right;
} else { }
lh = sis->first_swap_extent.list.prev; /* Highest extent */
se = list_entry(lh, struct swap_extent, list); if (parent) {
se = rb_entry(parent, struct swap_extent, rb_node);
BUG_ON(se->start_page + se->nr_pages != start_page); BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) { if (se->start_block + se->nr_pages == start_block) {
/* Merge it */ /* Merge it */
@ -2320,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
} }
} }
/* /* No merge, insert a new extent. */
* No merge. Insert a new extent, preserving ordering.
*/
new_se = kmalloc(sizeof(*se), GFP_KERNEL); new_se = kmalloc(sizeof(*se), GFP_KERNEL);
if (new_se == NULL) if (new_se == NULL)
return -ENOMEM; return -ENOMEM;
@ -2330,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
new_se->nr_pages = nr_pages; new_se->nr_pages = nr_pages;
new_se->start_block = start_block; new_se->start_block = start_block;
list_add_tail(&new_se->list, &sis->first_swap_extent.list); rb_link_node(&new_se->rb_node, parent, link);
rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
return 1; return 1;
} }
EXPORT_SYMBOL_GPL(add_swap_extent); EXPORT_SYMBOL_GPL(add_swap_extent);
@ -2846,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
* would be relying on p->type to remain valid. * would be relying on p->type to remain valid.
*/ */
} }
INIT_LIST_HEAD(&p->first_swap_extent.list); p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0); plist_node_init(&p->list, 0);
for_each_node(i) for_each_node(i)
plist_node_init(&p->avail_lists[i], 0); plist_node_init(&p->avail_lists[i], 0);