mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-08 15:04:45 +00:00
b795854b1f
Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. If treated identically there is a risk that shared pages bounce between nodes depending on the order they are referenced by tasks. Ultimately what is desirable is that task private pages remain local to the task while shared pages are interleaved between sharing tasks running on different nodes to give good average performance. This is further complicated by THP as even applications that partition their data may not be partitioning on a huge page boundary. To start with, this patch assumes that multi-threaded or multi-process applications partition their data and that, in general, the private accesses are more important for cpu->memory locality. Also, no new infrastructure is required to treat private pages properly but interleaving for shared pages requires additional infrastructure. To detect private accesses the pid of the last accessing task is required but the storage requirements are high. This patch borrows heavily from Ingo Molnar's patch "numa, mm, sched: Implement last-CPU+PID hash tracking" to encode some bits from the last accessing task in the page flags as well as the node information. Collisions will occur but it is better than just depending on the node information. Node information is then used to determine if a page needs to migrate. The PID information is used to detect private/shared accesses. The preferred NUMA node is selected based on where the maximum number of approximately private faults were measured. Shared faults are not taken into consideration for a few reasons. First, if there are many tasks sharing the page then they'll all move towards the same node. The node will be compute overloaded and then scheduled away later only to bounce back again. Alternatively the shared tasks would just bounce around nodes because the fault information is effectively noise. 
Either way accounting for shared faults the same as private faults can result in lower performance overall. The second reason is based on a hypothetical workload that has a small number of very important, heavily accessed private pages but a large shared array. The shared array would dominate the number of faults and be selected as a preferred node even though it's the wrong decision. The third reason is that multiple threads in a process will race each other to fault the shared page making the fault information unreliable. Signed-off-by: Mel Gorman <mgorman@suse.de> [ Fix compilation error when !NUMA_BALANCING. ] Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
207 lines
5.0 KiB
C
207 lines
5.0 KiB
C
/*
|
|
* mm_init.c - Memory initialisation verification and debugging
|
|
*
|
|
* Copyright 2008 IBM Corporation, 2008
|
|
* Author Mel Gorman <mel@csn.ul.ie>
|
|
*
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/init.h>
|
|
#include <linux/kobject.h>
|
|
#include <linux/export.h>
|
|
#include <linux/memory.h>
|
|
#include <linux/notifier.h>
|
|
#include "internal.h"
|
|
|
|
#ifdef CONFIG_DEBUG_MEMORY_INIT
|
|
/*
 * Verbosity threshold for the memory-init debug output in this file.
 * It is filled in from the "mminit_loglevel" early parameter by
 * set_mminit_loglevel() and compared against MMINIT_VERIFY before the
 * zonelists are printed.
 */
int mminit_loglevel;
|
|
|
|
/* Fall back to a zero SECTIONS_SHIFT when nothing has defined one yet. */
#if !defined(SECTIONS_SHIFT)
#define SECTIONS_SHIFT 0
#endif
|
|
|
|
/* The zonelists are simply reported, validation is manual. */
|
|
void mminit_verify_zonelist(void)
|
|
{
|
|
int nid;
|
|
|
|
if (mminit_loglevel < MMINIT_VERIFY)
|
|
return;
|
|
|
|
for_each_online_node(nid) {
|
|
pg_data_t *pgdat = NODE_DATA(nid);
|
|
struct zone *zone;
|
|
struct zoneref *z;
|
|
struct zonelist *zonelist;
|
|
int i, listid, zoneid;
|
|
|
|
BUG_ON(MAX_ZONELISTS > 2);
|
|
for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
|
|
|
|
/* Identify the zone and nodelist */
|
|
zoneid = i % MAX_NR_ZONES;
|
|
listid = i / MAX_NR_ZONES;
|
|
zonelist = &pgdat->node_zonelists[listid];
|
|
zone = &pgdat->node_zones[zoneid];
|
|
if (!populated_zone(zone))
|
|
continue;
|
|
|
|
/* Print information about the zonelist */
|
|
printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
|
|
listid > 0 ? "thisnode" : "general", nid,
|
|
zone->name);
|
|
|
|
/* Iterate the zonelist */
|
|
for_each_zone_zonelist(zone, z, zonelist, zoneid) {
|
|
#ifdef CONFIG_NUMA
|
|
printk(KERN_CONT "%d:%s ",
|
|
zone->node, zone->name);
|
|
#else
|
|
printk(KERN_CONT "0:%s ", zone->name);
|
|
#endif /* CONFIG_NUMA */
|
|
}
|
|
printk(KERN_CONT "\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Report how the section, node, zone and last-nidpid fields are laid out in
 * the page flags word and BUG if the layout is inconsistent.
 *
 * The function:
 *  - prints the width, shift and page-flag shift of every field;
 *  - prints which bit ranges are used by the fields, by the page flags
 *    themselves, and which are left unused;
 *  - checks that the fields are stacked downwards from the top bit without
 *    gaps (each *_PGSHIFT must equal the running bit position);
 *  - checks that the section, node and zone masks do not overlap.
 */
void __init mminit_verify_pageflags_layout(void)
{
	int shift, width;
	unsigned long or_mask, add_mask;

	/* Number of bits in an unsigned long */
	shift = 8 * sizeof(unsigned long);

	/*
	 * Lowest bit still occupied by a field.  Every term is the field's
	 * *_WIDTH, matching what the "widths" line below reports.  The
	 * LAST_NIDPID_NOT_IN_PAGE_FLAGS case further down shows the
	 * last-nidpid field may not be in the flags at all, so the raw
	 * LAST_NIDPID_SHIFT must not be used here.
	 * NOTE(review): presumably LAST_NIDPID_WIDTH is 0 in that case -
	 * confirm against the page-flags layout header.
	 */
	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_WIDTH;
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
		"Section %d Node %d Zone %d Lastnidpid %d Flags %d\n",
		SECTIONS_WIDTH,
		NODES_WIDTH,
		ZONES_WIDTH,
		LAST_NIDPID_WIDTH,
		NR_PAGEFLAGS);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
		"Section %d Node %d Zone %d Lastnidpid %d\n",
		SECTIONS_SHIFT,
		NODES_SHIFT,
		ZONES_SHIFT,
		LAST_NIDPID_SHIFT);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
		"Section %lu Node %lu Zone %lu Lastnidpid %lu\n",
		(unsigned long)SECTIONS_PGSHIFT,
		(unsigned long)NODES_PGSHIFT,
		(unsigned long)ZONES_PGSHIFT,
		(unsigned long)LAST_NIDPID_PGSHIFT);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
		"Node/Zone ID: %lu -> %lu\n",
		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
		(unsigned long)ZONEID_PGOFF);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
		"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
		shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
	/* Terminate these messages with a newline like every other one here. */
#ifdef NODE_NOT_IN_PAGE_FLAGS
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
		"Node not in page flags\n");
#endif
#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
		"Last nidpid not in page flags\n");
#endif

	/*
	 * Walk down from the top bit: after subtracting each present field's
	 * width, the running position must equal that field's *_PGSHIFT.
	 */
	if (SECTIONS_WIDTH) {
		shift -= SECTIONS_WIDTH;
		BUG_ON(shift != SECTIONS_PGSHIFT);
	}
	if (NODES_WIDTH) {
		shift -= NODES_WIDTH;
		BUG_ON(shift != NODES_PGSHIFT);
	}
	if (ZONES_WIDTH) {
		shift -= ZONES_WIDTH;
		BUG_ON(shift != ZONES_PGSHIFT);
	}

	/*
	 * Check for bitmask overlaps: OR-ing and adding the shifted masks
	 * give the same value only when no two masks share a bit.
	 */
	or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
			(NODES_MASK << NODES_PGSHIFT) |
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
			(NODES_MASK << NODES_PGSHIFT) +
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	BUG_ON(or_mask != add_mask);
}
|
|
|
|
/*
 * Check that a struct page resolves back to the node, zone and pfn the
 * caller expects.  Any mismatch triggers BUG.
 *
 * @page: page to check
 * @zone: zone index the page is expected to report via page_zonenum()
 * @nid:  node id the page is expected to report via page_to_nid()
 * @pfn:  page frame number the page is expected to map to via page_to_pfn()
 */
void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
		unsigned long nid, unsigned long pfn)
{
	unsigned long seen_nid, seen_pfn;
	enum zone_type seen_zone;

	/* Checked one at a time, in node, zone, pfn order. */
	seen_nid = page_to_nid(page);
	BUG_ON(seen_nid != nid);

	seen_zone = page_zonenum(page);
	BUG_ON(seen_zone != zone);

	seen_pfn = page_to_pfn(page);
	BUG_ON(seen_pfn != pfn);
}
|
|
|
|
static __init int set_mminit_loglevel(char *str)
|
|
{
|
|
get_option(&str, &mminit_loglevel);
|
|
return 0;
|
|
}
|
|
early_param("mminit_loglevel", set_mminit_loglevel);
|
|
#endif /* CONFIG_DEBUG_MEMORY_INIT */
|
|
|
|
/*
 * Kobject named "mm"; created by mm_sysfs_init() and exported to GPL
 * modules via EXPORT_SYMBOL_GPL.
 */
struct kobject *mm_kobj;
|
|
EXPORT_SYMBOL_GPL(mm_kobj);
|
|
|
|
#ifdef CONFIG_SMP
|
|
/*
 * Batch value recomputed by mm_compute_batch(); holds 32 until that has
 * run for the first time.
 */
s32 vm_committed_as_batch = 32;
|
|
|
|
/*
 * Recompute vm_committed_as_batch.
 *
 * The new value is the larger of:
 *  - twice the number of present CPUs, but at least 32; and
 *  - (totalram_pages / present CPUs) / 256, capped at 0x7fffffff.
 */
static void __meminit mm_compute_batch(void)
{
	s32 cpus = num_present_cpus();
	s32 cpu_batch = max_t(s32, cpus * 2, 32);
	u64 mem_batch;

	/* 1/256 (about 0.4%) of each CPU's share of memory, or max int32 */
	mem_batch = min_t(u64, (totalram_pages / cpus) / 256, 0x7fffffff);

	vm_committed_as_batch = max_t(s32, mem_batch, cpu_batch);
}
|
|
|
|
/*
 * Notifier callback: recompute the batch when memory goes online or offline.
 *
 * @self:   notifier block this callback is attached to (unused)
 * @action: notifier event; only MEM_ONLINE and MEM_OFFLINE are acted on
 * @arg:    event payload (unused)
 *
 * Always returns NOTIFY_OK.
 */
static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
					unsigned long action, void *arg)
{
	if (action == MEM_ONLINE || action == MEM_OFFLINE)
		mm_compute_batch();

	return NOTIFY_OK;
}
|
|
|
|
/* Notifier block that routes events to mm_compute_batch_notifier(). */
static struct notifier_block compute_batch_nb __meminitdata = {
	/* use lowest priority */
	.priority	= IPC_CALLBACK_PRI,
	.notifier_call	= mm_compute_batch_notifier,
};
|
|
|
|
/*
 * Initcall: compute the initial batch value, then register
 * compute_batch_nb so the value is recomputed on MEM_ONLINE and
 * MEM_OFFLINE events.
 *
 * Always returns 0.
 */
static int __init mm_compute_batch_init(void)
{
	/* Initial value from the current CPU count and memory size */
	mm_compute_batch();

	/* Later recomputation is driven by the hot-memory notifier */
	register_hotmemory_notifier(&compute_batch_nb);

	return 0;
}

__initcall(mm_compute_batch_init);
|
|
|
|
#endif
|
|
|
|
/*
 * Initcall: create the "mm" kobject, passing kernel_kobj to
 * kobject_create_and_add(), and publish it through mm_kobj.
 *
 * Returns 0 on success, -ENOMEM if the kobject could not be created.
 */
static int __init mm_sysfs_init(void)
{
	mm_kobj = kobject_create_and_add("mm", kernel_kobj);

	return mm_kobj ? 0 : -ENOMEM;
}

__initcall(mm_sysfs_init);
|