Ravikiran G Thirumalai e073ae1b34 [PATCH] x86-64: Set HASHDIST_DEFAULT to 1 for x86_64 NUMA
Enable system hashtable memory to be distributed among nodes on x86_64 NUMA

Forcing the kernel to use node interleaved vmalloc instead of bootmem for
the system hashtable memory (alloc_large_system_hash) reduces the memory
imbalance on node 0 by around 40MB on a 8 node x86_64 NUMA box:

Before the following patch, on bootup of a 8 node box:

Node 0 MemTotal:      3407488 kB
Node 0 MemFree:       3206296 kB
Node 0 MemUsed:        201192 kB
Node 0 Active:           7012 kB
Node 0 Inactive:          512 kB
Node 0 Dirty:               0 kB
Node 0 Writeback:           0 kB
Node 0 FilePages:        1912 kB
Node 0 Mapped:            420 kB
Node 0 AnonPages:        5612 kB
Node 0 PageTables:        468 kB
Node 0 NFS_Unstable:        0 kB
Node 0 Bounce:              0 kB
Node 0 Slab:             5408 kB
Node 0 SReclaimable:      644 kB
Node 0 SUnreclaim:       4764 kB

After the patch (or using hashdist=1 on the kernel command line):

Node 0 MemTotal:      3407488 kB
Node 0 MemFree:       3247608 kB
Node 0 MemUsed:        159880 kB
Node 0 Active:           3012 kB
Node 0 Inactive:          616 kB
Node 0 Dirty:               0 kB
Node 0 Writeback:           0 kB
Node 0 FilePages:        2424 kB
Node 0 Mapped:            380 kB
Node 0 AnonPages:        1200 kB
Node 0 PageTables:        396 kB
Node 0 NFS_Unstable:        0 kB
Node 0 Bounce:              0 kB
Node 0 Slab:             6304 kB
Node 0 SReclaimable:     1596 kB
Node 0 SUnreclaim:       4708 kB

I guess it is a good idea to keep HASHDIST_DEFAULT "on" for x86_64 NUMA
since x86_64 has no dearth of vmalloc space?  Or maybe enable hash
distribution for all 64bit NUMA arches?  The following patch does it only
for x86_64.

I ran a HPC MPI benchmark -- 'Ansys wingsolid', which takes up quite a bit of
memory and uses up tlb entries.  This was on a 4 way, 2 socket
Tyan AMD box (non vsmp), with 8G total memory (4G pernode).

The results with and without hash distribution are:

1. Vanilla - runtime of 1188.000s
2. With hashdist=1 runtime of 1154.000s

Oprofile output for the duration of run is:

1. Vanilla:
PU: AMD64 processors, speed 2411.16 MHz (estimated)
Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit
mask of 0x00 (No unit mask) count 500
samples  %        app name                 symbol name
163054    6.5513  libansys1.so             MultiFront::decompose(int, int,
Elemset *, int *, int, int, int)
162061    6.5114  libansys3.so             blockSaxpy6L_fd
162042    6.5107  libansys3.so             blockInnerProduct6L_fd
156286    6.2794  libansys3.so             maxb33_
87879     3.5309  libansys1.so             elmatrixmultpcg_
84857     3.4095  libansys4.so             saxpy_pcg
58637     2.3560  libansys4.so             .st4560
46612     1.8728  libansys4.so             .st4282
43043     1.7294  vmlinux-t                copy_user_generic_string
41326     1.6604  libansys3.so             blockSaxpyBackSolve6L_fd
41288     1.6589  libansys3.so             blockInnerProductBackSolve6L_fd

2. With hashdist=1
CPU: AMD64 processors, speed 2411.13 MHz (estimated)
Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit
mask of 0x00 (No unit mask) count 500
samples  %        app name                 symbol name
162993    6.9814  libansys1.so             MultiFront::decompose(int, int,
Elemset *, int *, int, int, int)
160799    6.8874  libansys3.so             blockInnerProduct6L_fd
160459    6.8729  libansys3.so             blockSaxpy6L_fd
156018    6.6826  libansys3.so             maxb33_
84700     3.6279  libansys4.so             saxpy_pcg
83434     3.5737  libansys1.so             elmatrixmultpcg_
58074     2.4875  libansys4.so             .st4560
46000     1.9703  libansys4.so             .st4282
41166     1.7632  libansys3.so             blockSaxpyBackSolve6L_fd
41033     1.7575  libansys3.so             blockInnerProductBackSolve6L_fd
35762     1.5318  libansys1.so             inner_product_sub
35591     1.5245  libansys1.so             inner_product_sub2
28259     1.2104  libansys4.so             addVectors

Signed-off-by: Pravin B. Shelar <pravin.shelar@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Christoph Lameter <clameter@engr.sgi.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 19:27:08 +02:00

136 lines
4.1 KiB
C

/*
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
*/
#ifndef _LINUX_BOOTMEM_H
#define _LINUX_BOOTMEM_H
#include <linux/mmzone.h>
#include <asm/dma.h>
/*
* simple boot-time physical memory area allocator.
*/
extern unsigned long max_low_pfn;
extern unsigned long min_low_pfn;
/*
* highest page
*/
extern unsigned long max_pfn;
#ifdef CONFIG_CRASH_DUMP
extern unsigned long saved_max_pfn;
#endif
/*
* node_bootmem_map is a map pointer - the bits represent all physical
* memory pages (including holes) on the node.
*/
typedef struct bootmem_data {
unsigned long node_boot_start;
unsigned long node_low_pfn;
void *node_bootmem_map;
unsigned long last_offset;
unsigned long last_pos;
unsigned long last_success; /* Previous allocation point. To speed
* up searching */
struct list_head list;
} bootmem_data_t;
extern unsigned long bootmem_bootmap_pages(unsigned long);
extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
extern void free_bootmem(unsigned long addr, unsigned long size);
extern void *__alloc_bootmem(unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit);
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
extern void reserve_bootmem(unsigned long addr, unsigned long size);
#define alloc_bootmem(x) \
__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low(x) \
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
#define alloc_bootmem_pages(x) \
__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem_low(x, PAGE_SIZE, 0)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
extern unsigned long free_all_bootmem(void);
extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
extern unsigned long init_bootmem_node(pg_data_t *pgdat,
unsigned long freepfn,
unsigned long startpfn,
unsigned long endpfn);
extern void reserve_bootmem_node(pg_data_t *pgdat,
unsigned long physaddr,
unsigned long size);
extern void free_bootmem_node(pg_data_t *pgdat,
unsigned long addr,
unsigned long size);
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
#define alloc_bootmem_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages_node(pgdat, x) \
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
extern void *alloc_remap(int nid, unsigned long size);
#else
static inline void *alloc_remap(int nid, unsigned long size)
{
return NULL;
}
#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
extern unsigned long __meminitdata nr_kernel_pages;
extern unsigned long __meminitdata nr_all_pages;
extern void *alloc_large_system_hash(const char *tablename,
unsigned long bucketsize,
unsigned long numentries,
int scale,
int flags,
unsigned int *_hash_shift,
unsigned int *_hash_mask,
unsigned long limit);
#define HASH_EARLY 0x00000001 /* Allocating during early boot? */
/* Only NUMA needs hash distribution.
* IA64 and x86_64 have sufficient vmalloc space.
*/
#if defined(CONFIG_NUMA) && (defined(CONFIG_IA64) || defined(CONFIG_X86_64))
#define HASHDIST_DEFAULT 1
#else
#define HASHDIST_DEFAULT 0
#endif
extern int hashdist; /* Distribute hashes across NUMA nodes? */
#endif /* _LINUX_BOOTMEM_H */