diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 8821e1f75b44..826b15e914e2 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -30,10 +30,19 @@ void cpuset_update_task_memory_state(void); nodes_subset((nodes), current->mems_allowed) int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); -extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask); -static int inline cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); +extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); + +static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask); + return number_of_cpusets <= 1 || + __cpuset_zone_allowed_softwall(z, gfp_mask); +} + +static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +{ + return number_of_cpusets <= 1 || + __cpuset_zone_allowed_hardwall(z, gfp_mask); } extern int cpuset_excl_nodes_overlap(const struct task_struct *p); @@ -94,7 +103,12 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 1; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +{ + return 1; +} + +static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) { return 1; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2c3b4431472b..232aed2b10f9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2342,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) } /** - * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? * @z: is this zone on an allowed node? 
- * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) + * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If zone + * If we're in interrupt, yes, we can always allocate. If + * __GFP_THISNODE is set, yes, we can always allocate. If zone * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest * mem_exclusive cpuset ancestor to this tasks cpuset, yes. * Otherwise, no. * + * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() + * reduces to cpuset_zone_allowed_hardwall(). Otherwise, + * cpuset_zone_allowed_softwall() might sleep, and might allow a zone + * from an enclosing cpuset. + * + * cpuset_zone_allowed_hardwall() only handles the simpler case of + * hardwall cpusets, and never sleeps. + * + * The __GFP_THISNODE placement logic is really handled elsewhere, + * by forcibly using a zonelist starting at a specified node, and by + * (in get_page_from_freelist()) refusing to consider the zones for + * any node on the zonelist except the first. By the time any such + * calls get to this routine, we should just shut up and say 'yes'. + * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset. * GFP_KERNEL allocations are not so marked, so can escape to the - * nearest mem_exclusive ancestor cpuset. + * nearest enclosing mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() - * routine only calls here with __GFP_HARDWALL bit _not_ set if - * it's a GFP_KERNEL allocation, and all nodes in the current tasks - * mems_allowed came up empty on the first pass over the zonelist. - * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the callback_mutex mutex. + * Scanning up parent cpusets requires callback_mutex. 
The + * __alloc_pages() routine only calls here with __GFP_HARDWALL bit + * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the + * current tasks mems_allowed came up empty on the first pass over + * the zonelist. So only GFP_KERNEL allocations, if all nodes in the + * cpuset are short of memory, might require taking the callback_mutex + * mutex. * * The first call here from mm/page_alloc:get_page_from_freelist() - * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so - * no allocation on a node outside the cpuset is allowed (unless in - * interrupt, of course). + * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, + * so no allocation on a node outside the cpuset is allowed (unless + * in interrupt, of course). * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() @@ -2380,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: - * Don't call cpuset_zone_allowed() if you can't sleep, unless you + * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables * the code that might scan up ancestor cpusets and sleep. - **/ + */ -int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ @@ -2415,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return allowed; } +/* + * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? + * @z: is this zone on an allowed node? + * @gfp_mask: memory allocation flags + * + * If we're in interrupt, yes, we can always allocate. + * If __GFP_THISNODE is set, yes, we can always allocate. 
If zone + * z's node is in our task's mems_allowed, yes. Otherwise, no. + * + * The __GFP_THISNODE placement logic is really handled elsewhere, + * by forcibly using a zonelist starting at a specified node, and by + * (in get_page_from_freelist()) refusing to consider the zones for + * any node on the zonelist except the first. By the time any such + * calls get to this routine, we should just shut up and say 'yes'. + * + * Unlike the cpuset_zone_allowed_softwall() variant, above, this + * variant requires that the zone be in the current task's mems_allowed, + * unless we're in interrupt or __GFP_THISNODE is set. It does not scan + * up the cpuset hierarchy for the nearest enclosing mem_exclusive + * cpuset. It never sleeps. + */ + +int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +{ + int node; /* node that zone z is on */ + + if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) + return 1; + node = zone_to_nid(z); + if (node_isset(node, current->mems_allowed)) + return 1; + return 0; +} + /** * cpuset_lock - lock out any changes to cpuset structures * diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0ccc7f230252..089092d152ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, for (z = zonelist->zones; *z; z++) { nid = zone_to_nid(*z); - if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && + if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) && !list_empty(&hugepage_freelists[nid])) break; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 223d9ccb7d64..64cf3c214634 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -177,7 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) nodemask_t nodes = node_online_map; for (z = zonelist->zones; *z; z++) - if (cpuset_zone_allowed(*z, gfp_mask)) + if (cpuset_zone_allowed_softwall(*z, gfp_mask)) node_clear(zone_to_nid(*z), nodes); else return CONSTRAINT_CPUSET; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index
e6b17b2989e0..8c1a116875bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1162,7 +1162,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) break; if ((alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { diff --git a/mm/slab.c b/mm/slab.c index 9d3550086c93..b856786a3a30 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3262,7 +3262,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) for (z = zonelist->zones; *z && !obj; z++) { nid = zone_to_nid(*z); - if (cpuset_zone_allowed(*z, flags | __GFP_HARDWALL) && + if (cpuset_zone_allowed_hardwall(*z, flags) && cache->nodelists[nid] && cache->nodelists[nid]->free_objects) obj = ____cache_alloc_node(cache, diff --git a/mm/vmscan.c b/mm/vmscan.c index 093f5fe6dd77..e9813b06c7a3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, if (!populated_zone(zone)) continue; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; note_zone_scanning_priority(zone, priority); @@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; lru_pages += zone->nr_active + zone->nr_inactive; @@ -1089,7 +1089,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; zone->prev_priority = priority; @@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order 
< order) pgdat->kswapd_max_order = order; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; if (!waitqueue_active(&pgdat->kswapd_wait)) return;