From: Larry Woodman <lwoodman@redhat.com>
Date: Thu, 25 Jun 2009 14:48:06 -0400
Subject: [mm] prevent panic in copy_hugetlb_page_range
Message-id: 1245955686.31419.6.camel@dhcp-100-19-198.bos.redhat.com
O-Subject: [RHEL5-U4 patch] prevent panic in copy_hugetlb_page_range()
Bugzilla: 507860
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Dean Nelson <dnelson@redhat.com>
RH-Acked-by: Andrea Arcangeli <aarcange@redhat.com>

After linux-2.6-mm-fork-o_direct-race-v3.patch was applied to RHEL5-U4 we
saw a BUG_ON() in copy_hugetlb_page_range().  The BUG_ON() was hit by a
customer running a multi-threaded application that does Direct I/O to a
HUGEPAGE-mapped file region and then fork()s (it's not exactly happening
everywhere!!!).  After debugging this problem I found and fixed three
problems:

1.) The BUG_ON():

        BUG_ON(!pte_same(huge_ptep_get(src_pte), entry));

    The BUG_ON() uses pte_same().  On some architectures (ia64 and x86_64)
    the pte accessed and dirty bits can be updated by the hardware or by
    low-level assembler code without kernel intervention, and therefore
    without taking the page_table_lock.  This means that one of those bits
    could differ in the pte by the time the BUG_ON() is evaluated.  It's
    not safe to use pte_same() in any BUG_ON().

2.) The source pte is retrieved using huge_pte_offset(), which returns the
    address of the pmd of the huge page mapping.  The code only checked
    whether src_pte was NULL; it also needs to check that the pmd entry
    itself is not empty.  This was accomplished by a bit of code
    restructuring.

3.) copy_hugetlb_page_range() drops the source mm->page_table_lock before
    calling hugetlb_cow().  The assumption was that this is safe because
    every place that could update the page table also takes the mmap_sem
    for write:

        /*
         * We hold mmap_sem in write mode and
         * the VM doesn't know about hugepages
         * so the src_pte/dst_pte can't change
         * from under us even without both
         * page_table_lock hold the whole time.
         */
This is not 100% true: there are several paths through which
__unmap_hugepage_range() can be called by hugetlb_vmtruncate_list()
without the mmap_sem held.  This can cause the page tables to be zeroed
out while copy_hugetlb_page_range() has the page_table_lock unlocked.  To
fix this I created hugetlb_cow_locked() and copy_huge_page_locked() so
they can be called by copy_hugetlb_page_range() without unlocking the
page_table_lock; the other callers of hugetlb_cow() and copy_huge_page()
are not affected.

Fixes BZ507860

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e132875..4ac0821 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -57,6 +57,16 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
+static void copy_huge_page_locked(struct page *dst, struct page *src,
+				  unsigned long addr)
+{
+	int i;
+
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+	}
+}
+
 static void enqueue_huge_page(struct page *page)
 {
 	int nid = page_to_nid(page);
@@ -360,6 +370,9 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte);
 
+static int hugetlb_cow_locked(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pte_t pte);
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *dst_vma,
 			struct vm_area_struct *src_vma)
@@ -399,30 +412,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			}
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
 		if (forcecow) {
 			int cow_ret;
 			/* force atomic copy from parent to child */
 			flush_tlb_range(src_vma, addr, addr+HPAGE_SIZE);
-			cow_ret = hugetlb_cow(dst, dst_vma, addr,
+			cow_ret = hugetlb_cow_locked(dst, dst_vma, addr,
 					dst_pte, entry);
 			/*
-			 * We hold mmap_sem in write mode and
-			 * the VM doesn't know about hugepages
-			 * so the src_pte/dst_pte can't change
-			 * from under us even without both
-			 * page_table_lock hold the whole time.
+			 * shouldnt happen!!!
 			 */
-			spin_lock(&src->page_table_lock);
-			BUG_ON(!pte_same(huge_ptep_get(src_pte),
-					 entry));
+			BUG_ON(pte_pfn(huge_ptep_get(src_pte)) != pte_pfn(entry));
 			set_huge_pte_at(src, addr, src_pte, orig_entry);
-			spin_unlock(&src->page_table_lock);
 			if (cow_ret != VM_FAULT_MINOR)
 				oom = 1;
 		}
+		spin_unlock(&src->page_table_lock);
 		spin_unlock(&dst->page_table_lock);
 		if (oom)
 			goto nomem;
@@ -528,6 +534,46 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	return VM_FAULT_MINOR;
 }
 
+static int hugetlb_cow_locked(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pte_t pte)
+{
+	struct page *old_page, *new_page;
+	int avoidcopy;
+
+	old_page = pte_page(pte);
+
+	/* If no-one else is actually using this page, avoid the copy
+	 * and just make the page writable */
+	avoidcopy = (page_count(old_page) == 1);
+	if (avoidcopy) {
+		set_huge_ptep_writable(vma, address, ptep);
+		return VM_FAULT_MINOR;
+	}
+
+	page_cache_get(old_page);
+	new_page = alloc_huge_page(vma, address);
+
+	if (!new_page) {
+		page_cache_release(old_page);
+		return VM_FAULT_OOM;
+	}
+
+	copy_huge_page_locked(new_page, old_page, address);
+
+	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+		/* Break COW */
+		huge_ptep_clear_flush(vma, address, ptep);
+		set_huge_pte_at(mm, address, ptep,
+				make_huge_pte(vma, new_page, 1));
+		/* Make the old page be freed below */
+		new_page = old_page;
+	}
+	page_cache_release(new_page);
+	page_cache_release(old_page);
+	return VM_FAULT_MINOR;
+}
+
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, int write_access)
 {