linux-next/include/linux/mempolicy.h
Donet Tom f8fd525ba3 mm/mempolicy: use numa_node_id() instead of cpu_to_node()
Patch series "Allow migrate on protnone reference with MPOL_PREFERRED_MANY
policy", v4.

This patchset is to optimize the cross-socket memory access with
MPOL_PREFERRED_MANY policy.

To test this patch we ran the following test on a 3 node system.
 Node 0 - 2GB   - Tier 1
 Node 1 - 11GB  - Tier 1
 Node 6 - 10GB  - Tier 2

The changes below were made to memcached to set the memory policy;
they select Node 0 and Node 1 as the preferred nodes.

   #include <numaif.h>
   #include <numa.h>

    unsigned long nodemask;
    int ret;

    nodemask = 0x03;
    ret = set_mempolicy(MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
                                               &nodemask, 10);
    /* If MPOL_F_NUMA_BALANCING isn't supported,
     * fall back to MPOL_PREFERRED_MANY */
    if (ret < 0 && errno == EINVAL){
       printf("set mem policy normal\n");
        ret = set_mempolicy(MPOL_PREFERRED_MANY, &nodemask, 10);
    }
    if (ret < 0) {
       perror("Failed to call set_mempolicy");
       exit(-1);
    }

Test Procedure:
===============
1. Make sure memory tiering and demotion are enabled.
2. Start memcached.

   # ./memcached -b 100000 -m 204800 -u root -c 1000000 -t 7
       -d -s "/tmp/memcached.sock"

3. Run memtier_benchmark to store 3200000 keys.

  #./memtier_benchmark -S "/tmp/memcached.sock" --protocol=memcache_binary
    --threads=1 --pipeline=1 --ratio=1:0 --key-pattern=S:S --key-minimum=1
    --key-maximum=3200000 -n allkeys -c 1 -R -x 1 -d 1024

4. Start a memory eater on node 0 and 1. This will demote all memcached
   pages to node 6.
5. Make sure all the memcached pages got demoted to the lower tier by reading
   /proc/<memcached PID>/numa_maps.

    # cat /proc/2771/numa_maps
     ---
    default anon=1009 dirty=1009 active=0 N6=1009 kernelpagesize_kB=64
    default anon=1009 dirty=1009 active=0 N6=1009 kernelpagesize_kB=64
     ---

6. Kill memory eater.
7. Read the pgpromote_success counter.
8. Start reading the keys by running memtier_benchmark.

  #./memtier_benchmark -S "/tmp/memcached.sock" --protocol=memcache_binary
   --pipeline=1 --distinct-client-seed --ratio=0:3 --key-pattern=R:R
   --key-minimum=1 --key-maximum=3200000 -n allkeys
   --threads=64 -c 1 -R -x 6

9. Read the pgpromote_success counter.

Test Results:
=============
Without Patch
------------------
1. pgpromote_success  before test
Node 0:  pgpromote_success 11
Node 1:  pgpromote_success 140974

pgpromote_success  after test
Node 0:  pgpromote_success 11
Node 1:  pgpromote_success 140974

2. Memtier-benchmark result.
AGGREGATED AVERAGE RESULTS (6 runs)
==================================================================
Type    Ops/sec   Hits/sec   Misses/sec  Avg. Latency  p50 Latency
------------------------------------------------------------------
Sets     0.00       ---         ---        ---          ---
Gets    305792.03  305791.93   0.10       0.18949       0.16700
Waits    0.00       ---         ---        ---          ---
Totals  305792.03  305791.93   0.10       0.18949       0.16700

======================================
p99 Latency  p99.9 Latency  KB/sec
-------------------------------------
---          ---            0.00
0.44700     1.71100        11542.69
---           ---            ---
0.44700     1.71100        11542.69

With Patch
---------------
1. pgpromote_success  before test
Node 0:  pgpromote_success 5
Node 1:  pgpromote_success 89386

pgpromote_success  after test
Node 0:  pgpromote_success 57895
Node 1:  pgpromote_success 141463

2. Memtier-benchmark result.
AGGREGATED AVERAGE RESULTS (6 runs)
====================================================================
Type    Ops/sec    Hits/sec  Misses/sec  Avg. Latency  p50 Latency
--------------------------------------------------------------------
Sets     0.00        ---       ---        ---           ---
Gets    521942.24  521942.07  0.17       0.11459        0.10300
Waits    0.00        ---       ---         ---          ---
Totals  521942.24  521942.07  0.17       0.11459        0.10300

=======================================
p99 Latency  p99.9 Latency  KB/sec
---------------------------------------
 ---          ---            0.00
0.23100      0.31900        19701.68
---          ---             ---
0.23100      0.31900        19701.68


Test Result Analysis:
=====================
1. With the patch, we can observe that pages are getting promoted.
2. Memtier-benchmark results show that, with the patch,
   performance has increased by more than 50%.

 Ops/sec without fix -  305792.03
 Ops/sec with fix    -  521942.24


This patch (of 2):

Instead of using 'cpu_to_node()', we use 'numa_node_id()', which is
quicker.  smp_processor_id is guaranteed to be stable in the
'mpol_misplaced()' function because it is called with ptl held. 
lockdep_assert_held was added to ensure that.

No functional change in this patch.

[donettom@linux.ibm.com: add "* @vmf: structure describing the fault" comment]
  Link: https://lkml.kernel.org/r/d8b993ea9dccfac0bc3ed61d3a81f4ac5f376e46.1711002865.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/cover.1711373653.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/6059f034f436734b472d066db69676fb3a459864.1711373653.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/cover.1709909210.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/744646531af02cc687cde8ae788fb1779e99d02c.1709909210.git.donettom@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-25 20:55:48 -07:00

303 lines
7.2 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* NUMA memory policies for Linux.
* Copyright 2003,2004 Andi Kleen SuSE Labs
*/
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>
struct mm_struct;
#define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */
#ifdef CONFIG_NUMA
/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted. A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage. The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
atomic_t refcnt; /* reference count; mpol_get() increments it */
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
nodemask_t nodes; /* interleave/bind/prefer */
int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
/* Only one of the two masks below is stored at a time (they share storage). */
union {
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */
} w;
};
/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */
extern void __mpol_put(struct mempolicy *pol);

/*
 * mpol_put - release a mempolicy through __mpol_put().
 * @pol: policy to release; NULL is accepted and ignored.
 *
 * The NULL test is done inline so that a NULL policy never costs an
 * out-of-line call.
 */
static inline void mpol_put(struct mempolicy *pol)
{
	if (!pol)
		return;

	__mpol_put(pol);
}
/*
 * mpol_needs_cond_ref - does @pol need an explicit unref after use?
 * @pol: policy to test; may be NULL.
 *
 * Currently only shared policies (MPOL_F_SHARED) need one.
 *
 * Return: 1 if @pol is non-NULL and has MPOL_F_SHARED set, 0 otherwise.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
	if (!pol)
		return 0;

	return (pol->flags & MPOL_F_SHARED) != 0;
}
/*
 * mpol_cond_put - release @pol only when mpol_needs_cond_ref() says so.
 * @pol: policy to conditionally release; may be NULL.
 *
 * Policies that do not need a conditional reference are left untouched.
 */
static inline void mpol_cond_put(struct mempolicy *pol)
{
	if (!mpol_needs_cond_ref(pol))
		return;

	__mpol_put(pol);
}
extern struct mempolicy *__mpol_dup(struct mempolicy *pol);

/*
 * mpol_dup - duplicate a mempolicy.
 * @pol: policy to duplicate; may be NULL.
 *
 * Return: whatever __mpol_dup() returns for a non-NULL @pol; a NULL @pol
 * is handed back unchanged without calling __mpol_dup().
 */
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
	return pol ? __mpol_dup(pol) : pol;
}
/*
 * mpol_get - take an additional reference on a mempolicy.
 * @pol: policy whose refcnt is atomically incremented; NULL is ignored.
 */
static inline void mpol_get(struct mempolicy *pol)
{
	if (!pol)
		return;

	atomic_inc(&pol->refcnt);
}
extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);

/*
 * mpol_equal - compare two mempolicies.
 * @a: first policy
 * @b: second policy
 *
 * Identical pointers (including two NULLs) compare equal without calling
 * out of line; anything else is decided by __mpol_equal().
 */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	return a == b || __mpol_equal(a, b);
}
/*
 * Tree of shared policies for a shared memory region.
 *
 * NOTE(review): the tree presumably holds struct sp_node entries linked
 * through their 'nd' member, with 'lock' guarding the tree - confirm against
 * the implementation.
 */
struct shared_policy {
struct rb_root root; /* root of the red-black tree */
rwlock_t lock; /* reader/writer lock kept alongside the tree */
};
/*
 * One red-black tree node pairing a page-offset range with a mempolicy.
 *
 * NOTE(review): presumably an element of struct shared_policy's tree;
 * whether 'end' is inclusive or exclusive is not visible here - confirm.
 */
struct sp_node {
struct rb_node nd; /* rb-tree linkage */
pgoff_t start, end; /* page-offset range bounds */
struct mempolicy *policy; /* policy associated with the range */
};
/*
 * Out-of-line mempolicy interfaces, available when CONFIG_NUMA is set.
 * Only the declarations live in this header; the notes below are limited
 * to what the signatures show.
 */

/* VMA policy duplication and shared-policy tree management. */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *sp,
struct vm_area_struct *vma, struct mempolicy *mpol);
void mpol_free_shared_policy(struct shared_policy *sp);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
pgoff_t idx);

/* Policy lookup for a task or for an address within a VMA. */
struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *ilx);
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr, int order, pgoff_t *ilx);
bool vma_policy_mof(struct vm_area_struct *vma);

/* Global policy setup and rebinding of a task's / mm's policies to @new. */
extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);

/* Returns an int and additionally reports through @mpol and @nodemask. */
extern int huge_node(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask);
extern unsigned int mempolicy_slab_node(void);

/* Zone tracked by check_highest_zone(); defined outside this header. */
extern enum zone_type policy_zone;
/*
 * check_highest_zone - raise policy_zone to @k when @k is higher.
 * @k: zone type being considered.
 *
 * ZONE_MOVABLE is never recorded, and policy_zone is never lowered.
 */
static inline void check_highest_zone(enum zone_type k)
{
	if (k == ZONE_MOVABLE || k <= policy_zone)
		return;

	policy_zone = k;
}
/* Page migration between the @from and @to node sets of @mm. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags);
/*
 * String parsing of a policy is only declared when tmpfs is configured.
 * NOTE(review): presumably returns 0 on success and non-zero on error,
 * mirroring the !CONFIG_NUMA stub - confirm against the implementation.
 */
#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif
/* Formats @pol into @buffer, which holds at most @maxlen bytes. */
extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);
/*
 * @vmf is the structure describing the fault.
 * NOTE(review): expected to be called with the page table lock held -
 * confirm against the implementation.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
unsigned long addr);
extern void mpol_put_task_policy(struct task_struct *);
/*
 * mpol_is_preferred_many - test whether a policy is MPOL_PREFERRED_MANY.
 * @pol: policy to inspect; dereferenced unconditionally, so must not be NULL.
 *
 * Return: true when @pol->mode is MPOL_PREFERRED_MANY, false otherwise.
 */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
	if (pol->mode != MPOL_PREFERRED_MANY)
		return false;

	return true;
}
/*
 * NOTE(review): presumably reports whether @policy applies to allocations
 * from @zone - confirm against the implementation.
 */
extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
#else
/* !CONFIG_NUMA: empty placeholder so code can still name struct mempolicy. */
struct mempolicy {};
/* !CONFIG_NUMA stub: ignores @p and always returns NULL. */
static inline struct mempolicy *get_task_policy(struct task_struct *p)
{
return NULL;
}
/* !CONFIG_NUMA stub: any two policies are reported as equal. */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
return true;
}
/* !CONFIG_NUMA stub: no-op. */
static inline void mpol_put(struct mempolicy *pol)
{
}
/* !CONFIG_NUMA stub: no-op. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
}
/* !CONFIG_NUMA stub: no-op. */
static inline void mpol_get(struct mempolicy *pol)
{
}
/* !CONFIG_NUMA: empty placeholder so code can still name struct shared_policy. */
struct shared_policy {};
/* !CONFIG_NUMA stub: no-op; @sp and @mpol are ignored. */
static inline void mpol_shared_policy_init(struct shared_policy *sp,
struct mempolicy *mpol)
{
}
/* !CONFIG_NUMA stub: no-op. */
static inline void mpol_free_shared_policy(struct shared_policy *sp)
{
}
/* !CONFIG_NUMA stub: ignores @sp and @idx and always returns NULL. */
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
{
return NULL;
}
/*
 * !CONFIG_NUMA stub: stores 0 through @ilx and returns NULL.
 * @ilx is written unconditionally, so it must point to valid storage.
 */
static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr, int order, pgoff_t *ilx)
{
*ilx = 0;
return NULL;
}
/* !CONFIG_NUMA stub: does nothing and always returns 0. */
static inline int
vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
return 0;
}
/* !CONFIG_NUMA stub: no-op. */
static inline void numa_policy_init(void)
{
}
/* !CONFIG_NUMA stub: no-op. */
static inline void numa_default_policy(void)
{
}
/* !CONFIG_NUMA stub: no-op; @tsk and @new are ignored. */
static inline void mpol_rebind_task(struct task_struct *tsk,
const nodemask_t *new)
{
}
/* !CONFIG_NUMA stub: no-op; @mm and @new are ignored. */
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}
/*
 * !CONFIG_NUMA stub: clears *@mpol and *@nodemask to NULL and returns 0.
 * Both output pointers are written unconditionally, so they must be valid.
 */
static inline int huge_node(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
*mpol = NULL;
*nodemask = NULL;
return 0;
}
/* !CONFIG_NUMA stub: leaves @m untouched and always returns false. */
static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
return false;
}
/* !CONFIG_NUMA stub: performs no migration and always returns 0. */
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
return 0;
}
/*
 * !CONFIG_NUMA stub: no-op.
 * NOTE(review): the CONFIG_NUMA variant takes 'enum zone_type k' while this
 * stub takes 'int k' - consider aligning the two signatures.
 */
static inline void check_highest_zone(int k)
{
}
#ifdef CONFIG_TMPFS
/*
 * !CONFIG_NUMA stub (tmpfs builds only): nothing is parsed and *@mpol is
 * left untouched; the non-zero return reports an error to the caller.
 */
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
return 1; /* error */
}
#endif
/* !CONFIG_NUMA stub: all arguments are ignored; always returns -1. */
static inline int mpol_misplaced(struct folio *folio,
struct vm_fault *vmf,
unsigned long address)
{
return -1; /* no node preference */
}
/* !CONFIG_NUMA stub: no-op. */
static inline void mpol_put_task_policy(struct task_struct *task)
{
}
/* !CONFIG_NUMA stub: @pol is not inspected; always returns false. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
return false;
}
#endif /* CONFIG_NUMA */
#endif