From: Ed Pollard <epollard@redhat.com> Date: Thu, 14 Aug 2008 11:19:26 -0400 Subject: [mm] add support for fast get user pages Message-id: 48A44CFE.6060904@redhat.com O-Subject: [RHEL 5.3 PATCH] RHBZ 447649 fast_gup: Improve thread scalability for TPC-C Bugzilla: 447649 RH-Acked-by: Rik van Riel <riel@redhat.com> Bugzilla: 447649 https://bugzilla.redhat.com/show_bug.cgi?id=447649 Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1428917 Description: Improve thread scalability for TPC-C benchmarking, in particular via reduction of DIO induced mmap_sem contention and lock contention in follow_hugetlb_page() This has been discussed upstream as "fast_gup" and has been accepted into the 2.6.27 tree. backported to RHEL from upstream commit #s: b379d790197cdf8a95fb67507d75a24ac0a1678d 7e675137a8e1a4d45822746456dd389b65745bf6 a0a8f5364a5ad248aec6cb705e0092ff563edc2f 21cc199baa815d7b3f1ace4be20b9558cbddc00f 8174c430e445a93016ef18f717fe570214fa38bf 9b79022ca909b66e2cd0cfd9248f832fc165f77f f5dd33c494a427b1d1a3b574de5c9e511c888864 bc40d73c950146725e9e768e856a416ec8949065 diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index fe3be72..19fb9ef 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -12,6 +12,7 @@ mainmenu "Linux Kernel Configuration" config X86_64 bool default y + select HAVE_GET_USER_PAGES_FAST help Port to the x86-64 architecture. x86-64 is a 64-bit extension to the classical 32-bit x86 architecture. For details see diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile index 21ff5d9..88d0a8a 100644 --- a/arch/x86_64/mm/Makefile +++ b/arch/x86_64/mm/Makefile @@ -3,6 +3,7 @@ # obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o +obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o diff --git a/arch/x86_64/mm/gup.c b/arch/x86_64/mm/gup.c new file mode 100644 index 0000000..acb5d93 --- /dev/null +++ b/arch/x86_64/mm/gup.c @@ -0,0 +1,262 @@ +/* + * Lockless get_user_pages_fast for x86 + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmstat.h> +#include <linux/highmem.h> +#include <linux/uaccess.h> + +#include <asm/pgtable.h> + +static inline pte_t gup_get_pte(pte_t *ptep) +{ +#ifndef CONFIG_X86_PAE + return *ptep; +#else + /* + * With get_user_pages_fast, we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atoimcally, + * but that is not possible (without expensive cmpxchg8b) on PAE. What + * we do have is the guarantee that a pte will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees l iff pte_high + * sees h. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have got rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. And get_user_pages_fast only operates on present ptes, so + * we're safe. + * + * gup_get_pte should not be used or copied outside gup.c without being + * very careful -- it does not atomically load the pte or anything that + * is likely to be useful for you. + */ + pte_t pte; + +retry: + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + if (unlikely(pte.pte_low != ptep->pte_low)) + goto retry; + + return pte; +#endif +} + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t *ptep; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + + ptep = pte_offset_map(&pmd, addr); + do { + pte_t pte = gup_get_pte(ptep); + struct page *page; + + if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { + pte_unmap(ptep); + return 0; + } + BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + get_page(page); + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + pte_unmap(ptep - 1); + + return 1; +} + +static inline void get_head_page_multiple(struct page *page, int nr) +{ + BUG_ON(page != compound_head(page)); + BUG_ON(page_count(page) == 0); + atomic_add(nr, &page->_count); +} + +static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pmd; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_val(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT); + do { + BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { + if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + goto slow_irqon; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_disable(); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); + + BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/fs/bio.c b/fs/bio.c index 6a0b9ad..00a0b7f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -648,12 +648,8 @@ static struct bio *__bio_map_user_iov(request_queue_t *q, const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, uaddr, - local_nr_pages, - write_to_vm, 0, &pages[cur_page], NULL); - up_read(¤t->mm->mmap_sem); - + ret = get_user_pages_fast(uaddr, local_nr_pages, + write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { ret = -EFAULT; goto out_unmap; diff --git a/fs/direct-io.c b/fs/direct-io.c index da7a1d6..6046e0e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -149,17 +149,11 @@ static int dio_refill_pages(struct dio *dio) int nr_pages; nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); - down_read(¤t->mm->mmap_sem); - ret = get_user_pages( - current, /* Task for fault acounting */ - current->mm, /* whose pages? */ + ret = get_user_pages_fast( dio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ - 0, /* force (?) */ - &dio->pages[0], - NULL); /* vmas */ - up_read(¤t->mm->mmap_sem); + &dio->pages[0]); /* Put results here */ if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(dio->curr_user_address); diff --git a/fs/splice.c b/fs/splice.c index cee12d8..1f8c911 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1124,12 +1124,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, { int buffers = 0, error = 0; - /* - * It's ok to take the mmap_sem for reading, even - * across a "get_user()". - */ - down_read(¤t->mm->mmap_sem); - while (nr_vecs) { unsigned long off, npages; void __user *base; @@ -1176,9 +1170,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, if (npages > PIPE_BUFFERS - buffers) npages = PIPE_BUFFERS - buffers; - error = get_user_pages(current, current->mm, - (unsigned long) base, npages, 0, 0, - &pages[buffers], NULL); + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); if (unlikely(error <= 0)) break; @@ -1217,8 +1210,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, iov++; } - up_read(¤t->mm->mmap_sem); - if (buffers) return buffers; diff --git a/include/asm-alpha/pgtable.h b/include/asm-alpha/pgtable.h index 93eaa58..3f69582 100644 --- a/include/asm-alpha/pgtable.h +++ b/include/asm-alpha/pgtable.h @@ -269,6 +269,7 @@ extern inline int pte_exec(pte_t pte) { return !(pte_val(pte) & _PAGE_FOE); } extern inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } extern inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +extern inline int pte_special(pte_t pte) { return 0; } extern inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_FOW; return pte; } extern inline pte_t pte_rdprotect(pte_t pte) { pte_val(pte) |= _PAGE_FOR; return pte; } @@ -280,6 +281,7 @@ extern inline pte_t pte_mkread(pte_t pte) { pte_val(pte) &= ~_PAGE_FOR; return p extern inline pte_t pte_mkexec(pte_t pte) { pte_val(pte) &= ~_PAGE_FOE; return pte; } extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; } +extern inline pte_t pte_mkspecial(pte_t pte) { return pte; } #define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address)) diff --git a/include/asm-arm/pgtable.h b/include/asm-arm/pgtable.h index 8d3919c..cfb60f0 100644 --- a/include/asm-arm/pgtable.h +++ b/include/asm-arm/pgtable.h @@ -243,6 +243,7 @@ extern struct page *empty_zero_page; #define pte_exec(pte) (pte_val(pte) & L_PTE_EXEC) #define pte_dirty(pte) (pte_val(pte) & L_PTE_DIRTY) #define pte_young(pte) (pte_val(pte) & L_PTE_YOUNG) +#define pte_special(pte) (0) /* * The following only works if pte_present() is not true. @@ -267,6 +268,8 @@ PTE_BIT_FUNC(mkdirty, |= L_PTE_DIRTY); PTE_BIT_FUNC(mkold, &= ~L_PTE_YOUNG); PTE_BIT_FUNC(mkyoung, |= L_PTE_YOUNG); +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } + /* * Mark the prot value as uncacheable and unbufferable. */ diff --git a/include/asm-cris/pgtable.h b/include/asm-cris/pgtable.h index 5d76c1c..75833eb 100644 --- a/include/asm-cris/pgtable.h +++ b/include/asm-cris/pgtable.h @@ -117,6 +117,7 @@ static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_RE static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_MODIFIED; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) { @@ -192,6 +193,7 @@ static inline pte_t pte_mkyoung(pte_t pte) } return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } /* * Conversion functions: convert a page and protection to a page entry, diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 7af7485..72164f8 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -380,6 +380,7 @@ static inline int pte_exec(pte_t pte) { return !((pte).pte & _PAGE_SUPER); } static inline int pte_dirty(pte_t pte) { return (pte).pte & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return !((pte).pte & _PAGE_WP); } +static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte |= _PAGE_SUPER; return pte; } static inline pte_t pte_exprotect(pte_t pte) { (pte).pte |= _PAGE_SUPER; return pte; } @@ -391,6 +392,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte &= ~_PAGE_SUPER; return pt static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte &= ~_PAGE_WP; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/include/asm-i386/mach-xen/asm/pgtable.h b/include/asm-i386/mach-xen/asm/pgtable.h index 05b6e71..7c3b34f 100644 --- a/include/asm-i386/mach-xen/asm/pgtable.h +++ b/include/asm-i386/mach-xen/asm/pgtable.h @@ -228,6 +228,7 @@ static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } +static inline int pte_special(pte_t pte) { return 0; } /* * The following only works if pte_present() is not true. @@ -245,6 +246,7 @@ static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; retur static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #ifdef CONFIG_X86_PAE # include <asm/pgtable-3level.h> diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 09697fe..c55caa4 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -222,6 +222,7 @@ static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } +static inline int pte_special(pte_t pte) { return 0; } /* * The following only works if pte_present() is not true. @@ -239,6 +240,7 @@ static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; retur static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #ifdef CONFIG_X86_PAE # include <asm/pgtable-3level.h> diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index 228981c..50d7665 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -302,6 +302,8 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_dirty(pte) ((pte_val(pte) & _PAGE_D) != 0) #define pte_young(pte) ((pte_val(pte) & _PAGE_A) != 0) #define pte_file(pte) ((pte_val(pte) & _PAGE_FILE) != 0) +#define pte_special(pte) 0 + /* * Note: we convert AR_RWX to AR_RX and AR_RW to AR_R by clearing the 2nd bit in the * access rights: @@ -314,6 +316,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) #define pte_mkhuge(pte) (__pte(pte_val(pte))) +#define pte_mkspecial(pte) (pte) /* * Make page protection values cacheable, uncacheable, or write- diff --git a/include/asm-m32r/pgtable.h b/include/asm-m32r/pgtable.h index 1983b7f..8b141e5 100644 --- a/include/asm-m32r/pgtable.h +++ b/include/asm-m32r/pgtable.h @@ -236,6 +236,11 @@ static inline pte_t pte_exprotect(pte_t pte) return pte; } +static inline int pte_special(pte_t pte) +{ + return 0; +} + static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; @@ -289,6 +294,11 @@ static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigne return test_and_clear_bit(_PAGE_BIT_DIRTY, ptep); } +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte; +} + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep); diff --git a/include/asm-m68k/motorola_pgtable.h b/include/asm-m68k/motorola_pgtable.h index 1ccc733..ea9fd67 100644 --- a/include/asm-m68k/motorola_pgtable.h +++ b/include/asm-m68k/motorola_pgtable.h @@ -169,6 +169,7 @@ static inline int pte_exec(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_RONLY; return pte; } static inline pte_t pte_rdprotect(pte_t pte) { return pte; } @@ -190,6 +191,7 @@ static inline pte_t pte_mkcache(pte_t pte) pte_val(pte) = (pte_val(pte) & _CACHEMASK040) | m68k_supervisor_cachemode; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address)) diff --git a/include/asm-m68k/sun3_pgtable.h b/include/asm-m68k/sun3_pgtable.h index 5156a28..4761900 100644 --- a/include/asm-m68k/sun3_pgtable.h +++ b/include/asm-m68k/sun3_pgtable.h @@ -171,6 +171,7 @@ static inline int pte_exec(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & SUN3_PAGE_MODIFIED; } static inline int pte_young(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESSED; } +static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_WRITEABLE; return pte; } static inline pte_t pte_rdprotect(pte_t pte) { return pte; } @@ -187,6 +188,7 @@ static inline pte_t pte_mknocache(pte_t pte) { pte_val(pte) |= SUN3_PAGE_NOCACHE //static inline pte_t pte_mkcache(pte_t pte) { pte_val(pte) &= SUN3_PAGE_NOCACHE; return pte; } // until then, use: static inline pte_t pte_mkcache(pte_t pte) { return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; diff --git a/include/asm-mips/pgtable.h b/include/asm-mips/pgtable.h index a36ca1b..365a24d 100644 --- a/include/asm-mips/pgtable.h +++ b/include/asm-mips/pgtable.h @@ -325,6 +325,8 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } #endif +static inline int pte_special(pte_t pte) { return 0; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } /* * Macro to make mark a page protection value as "uncacheable". Note diff --git a/include/asm-parisc/pgtable.h b/include/asm-parisc/pgtable.h index 5066c54..b0424c3 100644 --- a/include/asm-parisc/pgtable.h +++ b/include/asm-parisc/pgtable.h @@ -341,6 +341,7 @@ extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; extern inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } extern inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } extern inline int pte_user(pte_t pte) { return pte_val(pte) & _PAGE_USER; } +static inline int pte_special(pte_t pte) { return 0; } extern inline pte_t pte_rdprotect(pte_t pte) { pte_val(pte) &= ~_PAGE_READ; return pte; } extern inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; } @@ -350,6 +351,7 @@ extern inline pte_t pte_mkread(pte_t pte) { pte_val(pte) |= _PAGE_READ; return p extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; } extern inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= _PAGE_WRITE; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } /* * Conversion functions: convert a page and protection to a page entry, diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h index 8dbf5ad..b81c9e3 100644 --- a/include/asm-powerpc/pgtable.h +++ b/include/asm-powerpc/pgtable.h @@ -243,6 +243,7 @@ static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC;} static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY;} static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;} static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE;} +static inline int pte_special(pte_t pte) { return 0; } static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; } static inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; } @@ -269,6 +270,8 @@ static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkhuge(pte_t pte) { return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { + return pte; } /* Atomic PTE updates */ static inline unsigned long pte_update(pte_t *p, unsigned long clr) diff --git a/include/asm-ppc/pgtable.h b/include/asm-ppc/pgtable.h index 51fa7c6..5d27843 100644 --- a/include/asm-ppc/pgtable.h +++ b/include/asm-ppc/pgtable.h @@ -539,6 +539,7 @@ static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +static inline int pte_special(pte_t pte) { return 0; } static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; } static inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; } @@ -564,6 +565,8 @@ static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { + return pte; } static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h index 3515ed8..f10f399 100644 --- a/include/asm-s390/pgtable.h +++ b/include/asm-s390/pgtable.h @@ -423,6 +423,11 @@ static inline int pte_file(pte_t pte) return (pte_val(pte) & mask) == _PAGE_TYPE_FILE; } +static inline int pte_special(pte_t pte) +{ + return 0; +} + #define pte_same(a,b) (pte_val(a) == pte_val(b)) /* @@ -555,6 +560,11 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte; +} + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h index dcd23a0..34865c8 100644 --- a/include/asm-sh/pgtable.h +++ b/include/asm-sh/pgtable.h @@ -184,6 +184,7 @@ static inline int pte_young(pte_t pte){ return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_write(pte_t pte){ return pte_val(pte) & _PAGE_RW; } static inline int pte_not_present(pte_t pte){ return !(pte_val(pte) & _PAGE_PRESENT); } +static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } @@ -285,6 +286,8 @@ typedef pte_t *pte_addr_t; struct mm_struct; +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } + /* * No page table caches to initialise */ diff --git a/include/asm-sh64/pgtable.h b/include/asm-sh64/pgtable.h index 54c7821..641df64 100644 --- a/include/asm-sh64/pgtable.h +++ b/include/asm-sh64/pgtable.h @@ -413,12 +413,13 @@ extern void __handle_bad_pmd_kernel(pmd_t * pmd); /* * The following have defined behavior only work if pte_present() is true. */ -static inline int pte_read(pte_t pte) { return pte_val(pte) & _PAGE_READ; } -static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXECUTE; } -static inline int pte_dirty(pte_t pte){ return pte_val(pte) & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte){ return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } -static inline int pte_write(pte_t pte){ return pte_val(pte) & _PAGE_WRITE; } +static inline int pte_read(pte_t pte) { return pte_val(pte) & _PAGE_READ; } +static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXECUTE; } +static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } +static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } +static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; } +static inline int pte_special(pte_t pte){ return 0; } static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_READ)); return pte; } static inline pte_t pte_wrprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_WRITE)); return pte; } @@ -432,6 +433,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } /* diff --git a/include/asm-sparc/pgtable.h b/include/asm-sparc/pgtable.h index 226c647..599a2cf 100644 --- a/include/asm-sparc/pgtable.h +++ b/include/asm-sparc/pgtable.h @@ -222,6 +222,11 @@ static inline int pte_file(pte_t pte) return pte_val(pte) & BTFIXUP_HALF(pte_filei); } +static inline int pte_special(pte_t pte) +{ + return 0; +} + /* */ BTFIXUPDEF_HALF(pte_wrprotecti) @@ -254,6 +259,8 @@ BTFIXUPDEF_CALL_CONST(pte_t, pte_mkyoung, pte_t) #define pte_mkdirty(pte) BTFIXUP_CALL(pte_mkdirty)(pte) #define pte_mkyoung(pte) BTFIXUP_CALL(pte_mkyoung)(pte) +#define pte_mkspecial(pte) (pte) + #define pfn_pte(pfn, prot) mk_pte(pfn_to_page(pfn), prot) BTFIXUPDEF_CALL(unsigned long, pte_pfn, pte_t) diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h index ebfe395..97c0a9e 100644 --- a/include/asm-sparc64/pgtable.h +++ b/include/asm-sparc64/pgtable.h @@ -503,6 +503,11 @@ static inline pte_t pte_mkyoung(pte_t pte) return __pte(pte_val(pte) | mask); } +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte; +} + static inline unsigned long pte_young(pte_t pte) { unsigned long mask; @@ -623,6 +628,11 @@ static inline unsigned long pte_present(pte_t pte) return val; } +static inline int pte_special(pte_t pte) +{ + return 0; +} + #define pmd_set(pmdp, ptep) \ (pmd_val(*(pmdp)) = (__pa((unsigned long) (ptep)) >> 11UL)) #define pud_set(pudp, pmdp) \ diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h index ac64eb9..a4e33ed 100644 --- a/include/asm-um/pgtable.h +++ b/include/asm-um/pgtable.h @@ -226,6 +226,11 @@ static inline int pte_newprot(pte_t pte) return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT))); } +static inline int pte_special(pte_t pte) +{ + return 0; +} + /* * ================================= * Flags setting section. @@ -312,6 +317,11 @@ static inline pte_t pte_mknewpage(pte_t pte) return(pte); } +static inline pte_t pte_mkspecial(pte_t pte) +{ + return(pte); +} + static inline void set_pte(pte_t *pteptr, pte_t pteval) { pte_copy(*pteptr, pteval); diff --git a/include/asm-x86_64/mach-xen/asm/pgtable.h b/include/asm-x86_64/mach-xen/asm/pgtable.h index f5cfe8e..d0b91cc 100644 --- a/include/asm-x86_64/mach-xen/asm/pgtable.h +++ b/include/asm-x86_64/mach-xen/asm/pgtable.h @@ -212,6 +212,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_SOFTDIRTY 9 /* save dirty state when hdw dirty bit cleared */ +#define _PAGE_BIT_SPECIAL 10 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_PRESENT 0x001 @@ -225,6 +226,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_GLOBAL 0x100 /* Global TLB entry */ #define _PAGE_SOFTDIRTY 0x200 +#define _PAGE_SPECIAL 0x400 +#define __HAVE_ARCH_PTE_SPECIAL #define _PAGE_PROTNONE 0x080 /* If not present */ #define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX) @@ -358,6 +361,7 @@ static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } +static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } @@ -375,6 +379,7 @@ static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; retu static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SPECIAL)); return pte; } struct vm_area_struct; diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index bd3d0c1..0f8ff9e 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -168,6 +168,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_SOFTDIRTY 9 /* save dirty state when hdw dirty bit cleared */ +#define _PAGE_BIT_SPECIAL 10 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_PRESENT 0x001 @@ -181,6 +182,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_GLOBAL 0x100 /* Global TLB entry */ #define _PAGE_SOFTDIRTY 0x200 +#define _PAGE_SPECIAL 0x400 +#define __HAVE_ARCH_PTE_SPECIAL #define _PAGE_PROTNONE 0x080 /* If not present */ #define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX) @@ -294,6 +297,7 @@ static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; } +static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } @@ -306,6 +310,7 @@ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SPECIAL)); return pte; } struct vm_area_struct; diff --git a/include/asm-xtensa/pgtable.h b/include/asm-xtensa/pgtable.h index 7b15afb..4429070 100644 --- a/include/asm-xtensa/pgtable.h +++ b/include/asm-xtensa/pgtable.h @@ -241,6 +241,8 @@ static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +static inline int pte_special(pte_t pte) { return 0; } + static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~(_PAGE_RW | _PAGE_WRENABLE); return pte; } static inline pte_t pte_rdprotect(pte_t pte) { pte_val(pte) &= ~_PAGE_USER; return pte; } static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; } @@ -249,6 +251,7 @@ static inline pte_t pte_mkread(pte_t pte) { pte_val(pte) |= _PAGE_USER; return p static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= _PAGE_RW; return pte; } +static inline pte_t pte_mkspecial(pte_t pte) { return pte; } /* * Conversion functions: convert a page and protection to a page entry, diff --git a/include/linux/mm.h b/include/linux/mm.h index 8fe1de1..7a40b34 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -173,6 +173,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_FOREIGN 0x04000000 /* Has pages belonging to another VM */ #endif #define VM_ALWAYSDUMP 0x08000000 /* Always include in core dumps */ +#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -762,7 +763,9 @@ struct zap_details { unsigned long truncate_count; /* Compare vm_truncate_count */ }; -struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t); +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); + unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); unsigned long unmap_vmas(struct mmu_gather **tlb, @@ -833,6 +836,39 @@ extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); +#ifdef CONFIG_HAVE_GET_USER_PAGES_FAST +/* + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm (force=0 and doesn't return any vmas). + * + * get_user_pages_fast may take mmap_sem and page tables, so no assumptions + * can be made about locking. get_user_pages_fast is to be implemented in a + * way that is advantageous (vs get_user_pages()) when the user memory area is + * already faulted in and present in ptes. However if the pages have to be + * faulted in, it may turn out to be slightly slower). + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); + +#else +/* + * Should probably be moved to asm-generic, and architectures can include it if + * they don't implement their own get_user_pages_fast. + */ +#define get_user_pages_fast(start, nr_pages, write, pages) \ +({ \ + struct mm_struct *mm = current->mm; \ + int ret; \ + \ + down_read(&mm->mmap_sem); \ + ret = get_user_pages(current, mm, start, nr_pages, \ + write, 0, pages, NULL); \ + up_read(&mm->mmap_sem); \ + \ + ret; \ +}) +#endif + /* * Prototype to add a shrinker callback for ageable caches. * diff --git a/mm/Kconfig b/mm/Kconfig index 59d3216..fdbc3dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM +config HAVE_GET_USER_PAGES_FAST + bool + # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows diff --git a/mm/memory.c b/mm/memory.c index 9ea444b..9de1c7d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -367,56 +367,93 @@ static inline int is_cow_mapping(unsigned int flags) } /* - * This function gets the "struct page" associated with a pte. + * vm_normal_page -- This function gets the "struct page" associated with a pte. * - * NOTE! Some mappings do not have "struct pages". A raw PFN mapping - * will have each page table entry just pointing to a raw page frame - * number, and as far as the VM layer is concerned, those do not have - * pages associated with them - even if the PFN might point to memory - * that otherwise is perfectly fine and has a "struct page". + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. * - * The way we recognize those mappings is through the rules set up - * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, - * and the vm_pgoff will point to the first PFN mapped: thus every - * page that is a raw mapping will always honor the rule + * There are 2 broad cases. Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. + * + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. + * + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * - * and if that isn't true, the page has been COW'ed (in which case it - * _does_ have a "struct page" associated with it even if it is in a - * VM_PFNMAP range). + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). + * + * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. + * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. + * */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +#ifdef __HAVE_ARCH_PTE_SPECIAL +# define HAVE_PTE_SPECIAL 1 +#else +# define HAVE_PTE_SPECIAL 0 +#endif +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { - unsigned long pfn = pte_pfn(pte); + unsigned long pfn; - if (unlikely(vma->vm_flags & VM_PFNMAP)) { - unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; + if (HAVE_PTE_SPECIAL) { + if (likely(!pte_special(pte))) { + BUG_ON(!pfn_valid(pte_pfn(pte))); + return pte_page(pte); + } + BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + return NULL; } - /* - * Add some anal sanity checks for now. Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - */ - if (unlikely(!pfn_valid(pfn))) { - if (!(vma->vm_flags & VM_RESERVED)) - print_bad_pte(vma, pte, addr); - return NULL; + /* !HAVE_PTE_SPECIAL case follows: */ + + pfn = pte_pfn(pte); + + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } } + BUG_ON(!pfn_valid(pfn)); + /* - * NOTE! We still have PageReserved() pages in the page - * tables. + * NOTE! We still have PageReserved() pages in the page tables. * - * The PAGE_ZERO() pages and various VDSO mappings can - * cause them to exist. + * eg. VDSO mappings can cause them to exist. */ +out: return pfn_to_page(pfn); } @@ -1309,8 +1346,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, entry; spinlock_t *ptl; - BUG_ON(!(vma->vm_flags & VM_PFNMAP)); - BUG_ON(is_cow_mapping(vma->vm_flags)); + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); @@ -1321,7 +1367,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, goto out_unlock; /* Ok, finally just insert the thing.. */ - entry = pfn_pte(pfn, vma->vm_page_prot); + entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot)); set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, entry); @@ -1351,7 +1397,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; do { BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); @@ -2457,10 +2503,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, int ret = VM_FAULT_MINOR; pte_unmap(page_table); - BUG_ON(!(vma->vm_flags & VM_PFNMAP)); - BUG_ON(is_cow_mapping(vma->vm_flags)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + if (unlikely(pfn == NOPFN_OOM)) return VM_FAULT_OOM; else if (unlikely(pfn == NOPFN_SIGBUS))