From: Larry Woodman <lwoodman@redhat.com> Date: Mon, 11 May 2009 19:37:12 -0400 Subject: [mm] don't oomkill when hugepage alloc fails on node Message-id: <1242070632.10978.185.camel@dhcp-100-19-198.bos.redhat.com> Patchwork-id: 19981 O-Subject: [RHEL5 patch] Prevent a task from being OOM killed when a hugepage allocation fails on a specific node Bugzilla: 498510 RH-Acked-by: Josef Bacik <josef@redhat.com> RH-Acked-by: Rik van Riel <riel@redhat.com> Currently, RHEL5 will oom kill a task attempting to increase the number of huge pages in the huge page free pool when it can't allocate memory from a given node. This is not really an out of memory condition, and it is not under the user/administrator's control, as the kernel will unconditionally attempt to distribute allocations across all on-line nodes, whether or not the node has sufficient memory. This can be avoided by having the huge page free pool allocator--alloc_fresh_huge_page_node()--pass a new flag--__GFP_NO_OOM--to the call to alloc_pages_thisnode(), used only for huge page allocations. With this flag, __alloc_pages() will not call out_of_memory() before restarting the allocation loop but will instead set a local would_oom_kill flag. Since we don't call out_of_memory(), the task will not have the TIF_MEMDIE flag set, therefore we need to indicate that it WOULD have oom-killed the task so that we don't loop forever, attempting to allocate huge pages. 
Fixes BZ 498510 diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 58a8607..765ba80 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -48,6 +48,7 @@ extern void kfree(const void *); #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ +#define __GFP_NO_OOM ((__force gfp_t)0x40000u) /* Don't OOM on alloc fail - hugepages */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -56,7 +57,7 @@ extern void kfree(const void *); #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ - __GFP_NOMEMALLOC|__GFP_HARDWALL) + __GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_NO_OOM) /* This equals 0, but use constants in case they ever change */ #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f737968..fa2fd01 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -105,8 +105,9 @@ static struct page *alloc_fresh_huge_page_node(int nid) { struct page *page; - page = alloc_pages_thisnode(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + page = alloc_pages_thisnode(nid, + GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN|__GFP_NOMEMALLOC|__GFP_NO_OOM, + HUGETLB_PAGE_ORDER); if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, HUGETLB_PAGE_ORDER); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f0f756..cddb8ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -939,6 +939,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, int do_retry; int alloc_flags; int did_some_progress; + int would_oom_kill = 0; might_sleep_if(wait); @@ -991,7 +992,8 @@ restart: /* This allocation should allow future memory freeing. 
*/ - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + if (((p->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE) || would_oom_kill)) && !in_interrupt()) { if (!(gfp_mask & __GFP_NOMEMALLOC)) { nofail_alloc: @@ -1013,7 +1015,7 @@ nofail_alloc: goto nopage; rebalance: - if (test_thread_flag(TIF_MEMDIE)) + if (test_thread_flag(TIF_MEMDIE) || would_oom_kill) goto nopage; cond_resched(); @@ -1047,7 +1049,10 @@ rebalance: if (page) goto got_pg; - out_of_memory(zonelist, gfp_mask, order, 0); + if (!(gfp_mask & __GFP_NO_OOM)) + out_of_memory(zonelist, gfp_mask, order, 0); + else + would_oom_kill = 1; goto restart; }