From: Masami Hiramatsu <mhiramat@redhat.com> Date: Wed, 13 Aug 2008 16:48:30 -0400 Subject: [fs] relayfs: support larger on-memory buffer Message-id: 48A3489E.2070406@redhat.com O-Subject: [RHEL5.3 PATCH] BZ439269: relayfs support larger on-memory buffer Bugzilla: 439269 RH-Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com> RH-Acked-by: Anton Arapov <aarapov@redhat.com> relayfs: support larger relay buffer Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=439269 Here is a backported kernel patch for 2.6.18-101.el5. Description: Use vmalloc() and memset() instead of kcalloc() to allocate the struct page * array when the array is larger than one page. This lets relayfs support relay buffers larger than 64MB on 4k-page systems and 512MB on 16k-page systems. This patch also includes the is_vmalloc_addr() helper patches and the patch resolving the dependency of pgtable.h on highmem.h, because the new relayfs code uses is_vmalloc_addr(), and the VMALLOC_END macro (which is used in is_vmalloc_addr()) depends on PKMAP_BASE. I also ported the pgtable.h/highmem.h dependency fix to the Red Hat Xen kernel, because it had the same dependency problem. Upstream status: 9e2779fa281cfda13ac060753d674bbcaa23367e //is_vmalloc_addr macro patch 8ca3ed87db062201e1fa15b64a9214e193fc3a8a //is_vmalloc_addr fix1 0738c4bb8f2a8bf15178f852494643b0981f578b //is_vmalloc_addr fix2 0b7a96114bd5991d355a1f1c1d3d9c0c9d9c1cfc //resolve dependency of pgtable.h on highmem.h 68ab3d883a2df13f4b93a923bae3a287cbee29d3 //relayfs larger buffer support Brew build: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1420045 Testing: Tested with the latest systemtap. I ran stap with a 1024MB relay buffer on x86-64 and checked VmallocUsed in /proc/meminfo. 
Masami diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c index a1f6584..fcc53b6 100644 --- a/drivers/net/cxgb3/cxgb3_offload.c +++ b/drivers/net/cxgb3/cxgb3_offload.c @@ -1067,9 +1067,7 @@ void *cxgb_alloc_mem(unsigned long size) */ void cxgb_free_mem(void *addr) { - unsigned long p = (unsigned long)addr; - - if (p >= VMALLOC_START && p < VMALLOC_END) + if (is_vmalloc_addr(addr)) vfree(addr); else kfree(addr); diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index e38e402..cd0be3f 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -85,8 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) static inline void ntfs_free(void *addr) { - if (likely(((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END ))) { + if (!is_vmalloc_addr(addr)) { kfree(addr); /* free_page((unsigned long)addr); */ return; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 0ca9a2f..0331300 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -328,7 +328,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (m == NULL) { if (clear_user(buffer, tsz)) return -EFAULT; - } else if ((start >= VMALLOC_START) && (start < VMALLOC_END)) { + } else if (is_vmalloc_addr((void *)start)) { char * elf_buf; struct vm_struct *m; unsigned long curstart = start; diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index aba7fcf..6ba9684 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -63,8 +63,7 @@ kmem_zalloc(size_t size, unsigned int __nocast flags) void kmem_free(void *ptr, size_t size) { - if (((unsigned long)ptr < VMALLOC_START) || - ((unsigned long)ptr >= VMALLOC_END)) { + if (!is_vmalloc_addr(ptr)) { kfree(ptr); } else { vfree(ptr); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 2af528d..38d0dfd 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -693,8 +693,7 @@ static inline struct page * mem_to_page( void 
*addr) { - if (((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END)) { + if ((!is_vmalloc_addr(addr))) { return virt_to_page(addr); } else { return vmalloc_to_page(addr); diff --git a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h index e9a34eb..d027b71 100644 --- a/include/asm-i386/highmem.h +++ b/include/asm-i386/highmem.h @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table; * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM. */ -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif /* * Ordering is: * @@ -57,7 +52,6 @@ extern pte_t *pkmap_page_table; * VMALLOC_START * high_memory */ -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) #define LAST_PKMAP_MASK (LAST_PKMAP-1) #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) diff --git a/include/asm-i386/mach-xen/asm/highmem.h b/include/asm-i386/mach-xen/asm/highmem.h index d379186..00ea2a4 100644 --- a/include/asm-i386/mach-xen/asm/highmem.h +++ b/include/asm-i386/mach-xen/asm/highmem.h @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table; * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM. 
*/ -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif /* * Ordering is: * @@ -57,7 +52,6 @@ extern pte_t *pkmap_page_table; * VMALLOC_START * high_memory */ -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) #define LAST_PKMAP_MASK (LAST_PKMAP-1) #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) diff --git a/include/asm-i386/mach-xen/asm/pgtable.h b/include/asm-i386/mach-xen/asm/pgtable.h index 05b6e71..bb44c27 100644 --- a/include/asm-i386/mach-xen/asm/pgtable.h +++ b/include/asm-i386/mach-xen/asm/pgtable.h @@ -82,6 +82,14 @@ void paging_init(void); #define VMALLOC_OFFSET (8*1024*1024) #define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif + +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK) + #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) #else diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 09697fe..b3704b4 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -81,6 +81,14 @@ void paging_init(void); #define VMALLOC_OFFSET (8*1024*1024) #define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif + +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK) + #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 8fe1de1..0afb317 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -322,6 +322,23 @@ static inline int get_page_unless_zero(struct page *page) extern void FASTCALL(__page_cache_release(struct page *)); +/* + * 
Determine if an address is within the vmalloc range + * + * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there + * is no special casing required. + */ +static inline int is_vmalloc_addr(const void *x) +{ +#ifdef CONFIG_MMU + unsigned long addr = (unsigned long)x; + + return addr >= VMALLOC_START && addr < VMALLOC_END; +#else + return 0; +#endif +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) diff --git a/kernel/relay.c b/kernel/relay.c index d82d6da..c780a8d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -70,6 +70,35 @@ static struct vm_operations_struct relay_file_mmap_ops = { .close = relay_file_mmap_close, }; +/* + * allocate an array of pointers of struct page + */ +static struct page **relay_alloc_page_array(unsigned int n_pages) +{ + struct page **array; + size_t pa_size = n_pages * sizeof(struct page *); + + if (pa_size > PAGE_SIZE) { + array = vmalloc(pa_size); + if (array) + memset(array, 0, pa_size); + } else { + array = kzalloc(pa_size, GFP_KERNEL); + } + return array; +} + +/* + * free an array of pointers of struct page + */ +static void relay_free_page_array(struct page **array) +{ + if (is_vmalloc_addr(array)) + vfree(array); + else + kfree(array); +} + /** * relay_mmap_buf: - mmap channel buffer to process address space * @buf: relay channel buffer @@ -114,7 +143,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) *size = PAGE_ALIGN(*size); n_pages = *size >> PAGE_SHIFT; - buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); + buf->page_array = relay_alloc_page_array(n_pages); if (!buf->page_array) return NULL; @@ -134,7 +163,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) depopulate: for (j = 0; j < i; j++) __free_page(buf->page_array[j]); - kfree(buf->page_array); + relay_free_page_array(buf->page_array); return NULL; } @@ -193,7 +222,7 @@ void relay_destroy_buf(struct rchan_buf *buf) vunmap(buf->start); for (i = 0; 
i < buf->page_count; i++) __free_page(buf->page_array[i]); - kfree(buf->page_array); + relay_free_page_array(buf->page_array); } chan->buf[buf->cpu] = NULL; kfree(buf->padding); diff --git a/mm/sparse.c b/mm/sparse.c index a80415c..9c4bb5f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -239,17 +239,9 @@ got_map_ptr: return ret; } -static int vaddr_in_vmalloc_area(void *addr) -{ - if (addr >= (void *)VMALLOC_START && - addr < (void *)VMALLOC_END) - return 1; - return 0; -} - static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - if (vaddr_in_vmalloc_area(memmap)) + if (is_vmalloc_addr(memmap)) vfree(memmap); else free_pages((unsigned long)memmap,