mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-09 23:39:18 +00:00
6e543d5780
This patch is based on KOSAKI's work and I add a little more description, please refer https://lkml.org/lkml/2012/6/14/74. Currently, I found system can enter a state that there are lots of free pages in a zone but only order-0 and order-1 pages which means the zone is heavily fragmented, then high order allocation could make direct reclaim path's long stall(ex, 60 seconds) especially in no swap and no compaciton enviroment. This problem happened on v3.4, but it seems issue still lives in current tree, the reason is do_try_to_free_pages enter live lock: kswapd will go to sleep if the zones have been fully scanned and are still not balanced. As kswapd thinks there's little point trying all over again to avoid infinite loop. Instead it changes order from high-order to 0-order because kswapd think order-0 is the most important. Look at 73ce02e9 in detail. If watermarks are ok, kswapd will go back to sleep and may leave zone->all_unreclaimable =3D 0. It assume high-order users can still perform direct reclaim if they wish. Direct reclaim continue to reclaim for a high order which is not a COSTLY_ORDER without oom-killer until kswapd turn on zone->all_unreclaimble= . This is because to avoid too early oom-kill. So it means direct_reclaim depends on kswapd to break this loop. In worst case, direct-reclaim may continue to page reclaim forever when kswapd sleeps forever until someone like watchdog detect and finally kill the process. As described in: http://thread.gmane.org/gmane.linux.kernel.mm/103737 We can't turn on zone->all_unreclaimable from direct reclaim path because direct reclaim path don't take any lock and this way is racy. Thus this patch removes zone->all_unreclaimable field completely and recalculates zone reclaimable state every time. Note: we can't take the idea that direct-reclaim see zone->pages_scanned directly and kswapd continue to use zone->all_unreclaimable. Because, it is racy. commit 929bea7c71 (vmscan: all_unreclaimable() use zone->all_unreclaimable as a name) describes the detail. [akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()] Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com> Cc: Ying Han <yinghan@google.com> Cc: Nick Piggin <npiggin@gmail.com> Acked-by: Rik van Riel <riel@redhat.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux.com> Cc: Bob Liu <lliubbo@gmail.com> Cc: Neil Zhang <zhangwm@marvell.com> Cc: Russell King - ARM Linux <linux@arm.linux.org.uk> Reviewed-by: Michal Hocko <mhocko@suse.cz> Acked-by: Minchan Kim <minchan@kernel.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Lisa Du <cldu@marvell.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
104 lines
2.7 KiB
C
104 lines
2.7 KiB
C
#ifndef LINUX_MM_INLINE_H
|
|
#define LINUX_MM_INLINE_H
|
|
|
|
#include <linux/huge_mm.h>
|
|
#include <linux/swap.h>
|
|
|
|
/**
|
|
* page_is_file_cache - should the page be on a file LRU or anon LRU?
|
|
* @page: the page to test
|
|
*
|
|
* Returns 1 if @page is page cache page backed by a regular filesystem,
|
|
* or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
|
|
* Used by functions that manipulate the LRU lists, to sort a page
|
|
* onto the right LRU list.
|
|
*
|
|
* We would like to get this info without a page flag, but the state
|
|
* needs to survive until the page is last deleted from the LRU, which
|
|
* could be as far down as __page_cache_release.
|
|
*/
|
|
static inline int page_is_file_cache(struct page *page)
|
|
{
|
|
return !PageSwapBacked(page);
|
|
}
|
|
|
|
static __always_inline void add_page_to_lru_list(struct page *page,
|
|
struct lruvec *lruvec, enum lru_list lru)
|
|
{
|
|
int nr_pages = hpage_nr_pages(page);
|
|
mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
|
|
list_add(&page->lru, &lruvec->lists[lru]);
|
|
__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
|
|
}
|
|
|
|
static __always_inline void del_page_from_lru_list(struct page *page,
|
|
struct lruvec *lruvec, enum lru_list lru)
|
|
{
|
|
int nr_pages = hpage_nr_pages(page);
|
|
mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
|
|
list_del(&page->lru);
|
|
__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
|
|
}
|
|
|
|
/**
|
|
* page_lru_base_type - which LRU list type should a page be on?
|
|
* @page: the page to test
|
|
*
|
|
* Used for LRU list index arithmetic.
|
|
*
|
|
* Returns the base LRU type - file or anon - @page should be on.
|
|
*/
|
|
static inline enum lru_list page_lru_base_type(struct page *page)
|
|
{
|
|
if (page_is_file_cache(page))
|
|
return LRU_INACTIVE_FILE;
|
|
return LRU_INACTIVE_ANON;
|
|
}
|
|
|
|
/**
|
|
* page_off_lru - which LRU list was page on? clearing its lru flags.
|
|
* @page: the page to test
|
|
*
|
|
* Returns the LRU list a page was on, as an index into the array of LRU
|
|
* lists; and clears its Unevictable or Active flags, ready for freeing.
|
|
*/
|
|
static __always_inline enum lru_list page_off_lru(struct page *page)
|
|
{
|
|
enum lru_list lru;
|
|
|
|
if (PageUnevictable(page)) {
|
|
__ClearPageUnevictable(page);
|
|
lru = LRU_UNEVICTABLE;
|
|
} else {
|
|
lru = page_lru_base_type(page);
|
|
if (PageActive(page)) {
|
|
__ClearPageActive(page);
|
|
lru += LRU_ACTIVE;
|
|
}
|
|
}
|
|
return lru;
|
|
}
|
|
|
|
/**
|
|
* page_lru - which LRU list should a page be on?
|
|
* @page: the page to test
|
|
*
|
|
* Returns the LRU list a page should be on, as an index
|
|
* into the array of LRU lists.
|
|
*/
|
|
static __always_inline enum lru_list page_lru(struct page *page)
|
|
{
|
|
enum lru_list lru;
|
|
|
|
if (PageUnevictable(page))
|
|
lru = LRU_UNEVICTABLE;
|
|
else {
|
|
lru = page_lru_base_type(page);
|
|
if (PageActive(page))
|
|
lru += LRU_ACTIVE;
|
|
}
|
|
return lru;
|
|
}
|
|
|
|
#endif
|