mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-15 09:34:17 +00:00
aa45484031
Ordinarily watermark checks are based on the vmstat NR_FREE_PAGES as it is cheaper than scanning a number of lists. To avoid synchronization overhead, counter deltas are maintained on a per-cpu basis and drained both periodically and when the delta is above a threshold. On large CPU systems, the difference between the estimated and real value of NR_FREE_PAGES can be very high. If NR_FREE_PAGES is much higher than the number of real free pages in the buddy allocator, the VM can allocate pages below the min watermark, at worst reducing the real number of free pages to zero. Even if the OOM killer kills some victim to free memory, it may not free memory if the exit path requires a new page, resulting in livelock. This patch introduces a zone_page_state_snapshot() function (courtesy of Christoph) that takes a slightly more accurate view of an arbitrary vmstat counter. It is used to read NR_FREE_PAGES while kswapd is awake to avoid the watermark being accidentally broken. The estimate is not perfect and may result in cache line bounces but is expected to be lighter than the IPI calls necessary to continually drain the per-cpu counters while kswapd is awake. Signed-off-by: Christoph Lameter <cl@linux.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
111 lines
2.5 KiB
C
111 lines
2.5 KiB
C
/*
|
|
* linux/mm/mmzone.c
|
|
*
|
|
* management codes for pgdats and zones.
|
|
*/
|
|
|
|
|
|
#include <linux/stddef.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mmzone.h>
|
|
#include <linux/module.h>
|
|
|
|
struct pglist_data *first_online_pgdat(void)
|
|
{
|
|
return NODE_DATA(first_online_node);
|
|
}
|
|
|
|
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
|
|
{
|
|
int nid = next_online_node(pgdat->node_id);
|
|
|
|
if (nid == MAX_NUMNODES)
|
|
return NULL;
|
|
return NODE_DATA(nid);
|
|
}
|
|
|
|
/*
|
|
* next_zone - helper magic for for_each_zone()
|
|
*/
|
|
struct zone *next_zone(struct zone *zone)
|
|
{
|
|
pg_data_t *pgdat = zone->zone_pgdat;
|
|
|
|
if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
|
|
zone++;
|
|
else {
|
|
pgdat = next_online_pgdat(pgdat);
|
|
if (pgdat)
|
|
zone = pgdat->node_zones;
|
|
else
|
|
zone = NULL;
|
|
}
|
|
return zone;
|
|
}
|
|
|
|
/*
 * zref_in_nodemask - test whether @zref's node index is set in @nodes.
 *
 * With CONFIG_NUMA the result is node_isset() applied to
 * zonelist_node_idx(zref); without it the function always returns 1.
 */
#ifdef CONFIG_NUMA
static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
{
	int nid = zonelist_node_idx(zref);

	return node_isset(nid, *nodes);
}
#else
static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
{
	return 1;
}
#endif /* CONFIG_NUMA */
|
|
|
|
/* Returns the next zone at or below highest_zoneidx in a zonelist */
|
|
struct zoneref *next_zones_zonelist(struct zoneref *z,
|
|
enum zone_type highest_zoneidx,
|
|
nodemask_t *nodes,
|
|
struct zone **zone)
|
|
{
|
|
/*
|
|
* Find the next suitable zone to use for the allocation.
|
|
* Only filter based on nodemask if it's set
|
|
*/
|
|
if (likely(nodes == NULL))
|
|
while (zonelist_zone_idx(z) > highest_zoneidx)
|
|
z++;
|
|
else
|
|
while (zonelist_zone_idx(z) > highest_zoneidx ||
|
|
(z->zone && !zref_in_nodemask(z, nodes)))
|
|
z++;
|
|
|
|
*zone = zonelist_zone(z);
|
|
return z;
|
|
}
|
|
|
|
#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
|
|
/*
 * memmap_valid_within - cross-check a struct page against a pfn and zone.
 *
 * Returns 1 only if page_to_pfn(@page) equals @pfn and page_zone(@page)
 * equals @zone; returns 0 otherwise.  The zone is not looked up when the
 * pfn already disagrees.
 */
int memmap_valid_within(unsigned long pfn,
					struct page *page, struct zone *zone)
{
	return page_to_pfn(page) == pfn && page_zone(page) == zone;
}
|
|
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
|
|
|
|
#ifdef CONFIG_SMP
|
|
/* Called when a more accurate view of NR_FREE_PAGES is needed */
|
|
unsigned long zone_nr_free_pages(struct zone *zone)
|
|
{
|
|
unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
|
|
|
|
/*
|
|
* While kswapd is awake, it is considered the zone is under some
|
|
* memory pressure. Under pressure, there is a risk that
|
|
* per-cpu-counter-drift will allow the min watermark to be breached
|
|
* potentially causing a live-lock. While kswapd is awake and
|
|
* free pages are low, get a better estimate for free pages
|
|
*/
|
|
if (nr_free_pages < zone->percpu_drift_mark &&
|
|
!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
|
|
return zone_page_state_snapshot(zone, NR_FREE_PAGES);
|
|
|
|
return nr_free_pages;
|
|
}
|
|
#endif /* CONFIG_SMP */
|