On Mon, 10 Jan 2005, Linus Torvalds wrote: > Currently the BK tree > - doesn't use __GFP_ZERO with anonymous user-mapped pages (which is what > you wrote this whole thing for ;)
> Potential fix: declare a per-architecture "alloc_user_highpage(vaddr)" > that does the proper magic on virtually indexed machines, and on others > it just does a "alloc_page(GFP_HIGHUSER | __GFP_ZERO)".
The following patch adds an alloc_zeroed_user_highpage(vma, vaddr). It also uses zeroed pages on COW. clear_user_highpage is now only used by that function. Fold it into alloc_zeroed_user_highpage?
This is against last hour's BitKeeper tree. mm/memory.o compiles fine, but I was not able to build an ia64 kernel due to some pieces that seem to be missing in last hour's tree.
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + /* * These are used to make use of C type-checking.. */ Index: linus/mm/memory.c =================================================================== --- linus.orig/mm/memory.c 2005-01-10 11:44:39.000000000 -0800 +++ linus/mm/memory.c 2005-01-10 12:05:21.000000000 -0800 @@ -84,20 +84,6 @@ EXPORT_SYMBOL(vmalloc_earlyreserve);
/* - * We special-case the C-O-W ZERO_PAGE, because it's such - * a common occurrence (no need to read the page to know - * that it's zero - better for the cache and memory subsystem). - */ -static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) -{ - if (from == ZERO_PAGE(address)) { - clear_user_highpage(to, address); - return; - } - copy_user_highpage(to, from, address); -} - -/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ @@ -1329,11 +1315,16 @@
if (unlikely(anon_vma_prepare(vma))) goto no_new_page; - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!new_page) - goto no_new_page; - copy_cow_page(old_page,new_page,address); - + if (old_page == ZERO_PAGE(address)) { + new_page = alloc_zeroed_user_highpage(vma, address); + if (!new_page) + goto no_new_page; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) + goto no_new_page; + copy_user_highpage(new_page, old_page, address); + } /* * Re-check the pte - we dropped the lock */ @@ -1795,10 +1786,9 @@
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + /* * These are used to make use of C type-checking.. */ Index: linus/include/asm-x86_64/page.h =================================================================== --- linus.orig/include/asm-x86_64/page.h 2005-01-06 12:58:48.000000000 -0800 +++ linus/include/asm-x86_64/page.h 2005-01-10 11:56:04.000000000 -0800 @@ -38,6 +38,8 @@ #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /* * These are used to make use of C type-checking.. */ Index: linus/include/asm-s390/page.h =================================================================== --- linus.orig/include/asm-s390/page.h 2004-10-20 12:04:59.000000000 -0700 +++ linus/include/asm-s390/page.h 2005-01-10 11:56:33.000000000 -0800 @@ -106,6 +106,9 @@ #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + /* Pure 2^n version of get_order */ extern __inline__ int get_order(unsigned long size) { - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
This patch fixes the __GFP_ZERO related code by adding a new function alloc_zeroed_user_highpage that is then used in the anonymous page fault handler and in the COW code to allocate pages. The function can be defined per arch to set up special processing for user pages by defining __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE.
Signed-off-by: Christoph Lameter <clame...@sgi.com>
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + /* * These are used to make use of C type-checking.. */ Index: linux-2.6.10/mm/memory.c =================================================================== --- linux-2.6.10.orig/mm/memory.c 2005-01-10 13:48:11.000000000 -0800 +++ linux-2.6.10/mm/memory.c 2005-01-10 13:54:30.000000000 -0800 @@ -84,20 +84,6 @@ EXPORT_SYMBOL(vmalloc_earlyreserve);
/* - * We special-case the C-O-W ZERO_PAGE, because it's such - * a common occurrence (no need to read the page to know - * that it's zero - better for the cache and memory subsystem). - */ -static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) -{ - if (from == ZERO_PAGE(address)) { - clear_user_highpage(to, address); - return; - } - copy_user_highpage(to, from, address); -} - -/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ @@ -1329,11 +1315,16 @@
if (unlikely(anon_vma_prepare(vma))) goto no_new_page; - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!new_page) - goto no_new_page; - copy_cow_page(old_page,new_page,address); - + if (old_page == ZERO_PAGE(address)) { + new_page = alloc_zeroed_user_highpage(vma, address); + if (!new_page) + goto no_new_page; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) + goto no_new_page; + copy_user_highpage(new_page, old_page, address); + } /* * Re-check the pte - we dropped the lock */ @@ -1795,7 +1786,7 @@
if (unlikely(anon_vma_prepare(vma))) goto no_mem; - page = alloc_page_vma(GFP_HIGHZERO, vma, addr); + page = alloc_zeroed_user_highpage(vma, addr); if (!page) goto no_mem;
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + /* * These are used to make use of C type-checking.. */ Index: linux-2.6.10/include/asm-x86_64/page.h =================================================================== --- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-10 13:48:11.000000000 -0800 +++ linux-2.6.10/include/asm-x86_64/page.h 2005-01-10 13:53:59.000000000 -0800 @@ -38,6 +38,8 @@ #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /* * These are used to make use of C type-checking.. */ Index: linux-2.6.10/include/asm-s390/page.h =================================================================== --- linux-2.6.10.orig/include/asm-s390/page.h 2004-12-24 13:34:01.000000000 -0800 +++ linux-2.6.10/include/asm-s390/page.h 2005-01-10 13:53:59.000000000 -0800 @@ -106,6 +106,9 @@ #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + /* Pure 2^n version of get_order */ extern __inline__ int get_order(unsigned long size) {
Changes from V3 to V4: o Drop __GFP_ZERO patch since it's in Linus' tree. Include new patch that allows archs that need special measures around zeroing of user pages during a page fault to maintain their special adaptations. o Use zeroed pages during COW. o Updates for clear_page for various platforms. Make clear_page an optional patch and fall back to a series of clear_page calls without order if the patch to expand clear_page has not been applied. o x86_64 asm code fixed up o Port patches to 2.6.10-bk13 and make it fit the bitmapless buddy allocator
The patches increasing the page fault rate (introduction of atomic pte operations and anticipatory prefaulting) do so by reducing the locking overhead and are therefore mainly of interest for applications running in SMP systems with a high number of cpus. The single thread performance does just show minor increases. Only the performance of multi-threaded applications increases significantly.
The most expensive operation in the page fault handler is (apart from SMP locking overhead) the zeroing of the page that is also done in the page fault handler. This zeroing means that all cachelines of the faulted page (on Altix that means all 128 cachelines of 128 bytes each) must be loaded and later written back. This patch makes it possible to avoid loading all cachelines if only a part of the cachelines of that page is needed immediately after the fault. Doing so will only be effective for sparsely accessed memory which is typical for anonymous memory and pte maps. Prezeroed pages will only be used for those purposes. Unzeroed pages will be used as usual for file mapping, page caching etc etc.
The patch makes prezeroing very effective by:
1. Aggregating zeroing operations to only apply to pages of higher order, which results in many pages that will later become order 0 to be zeroed in one step. For that purpose the existing clear_page function is extended and made to take an additional argument specifying the order of the page to be cleared.
2. Hardware support for offloading zeroing from the cpu. This avoids the invalidation of the cpu caches by extensive zeroing operations.
The scrub daemon is invoked when an unzeroed page of a certain order has been generated so that it's worth running it. If no higher order pages are present then the logic will favor hot zeroing rather than simply shifting processing around. kscrubd typically runs only for a fraction of a second and sleeps for long periods of time even under memory benchmarking. kscrubd performs short bursts of zeroing when needed and tries to stay off the processor as much as possible.
The benefits of prezeroing are reduced to minimal quantities if all cachelines of a page are touched. Prezeroing can only be effective if the whole page is not immediately used after the page fault.
The patch is composed of 4 parts:
[1/4] GFP_ZERO fixups Adds alloc_zeroed_user_highpage(vma, vaddr) that may be customized for each arch by defining __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE. Includes proper definitions for a large selection of arches, others fall back to the default function in include/linux/highmem.h (and falls back to not using prezeroed pages).
[2/4] Page Zeroing Adds management of ZEROED and NOT_ZEROED pages and a background daemon called scrubd. scrubd is disabled by default but can be enabled by writing an order number to /proc/sys/vm/scrub_start. If a page is coalesced of that order or higher then the scrub daemon will start zeroing until all pages of order /proc/sys/vm/scrub_stop and higher are zeroed and then go back to sleep.
In an SMP environment the scrub daemon is typically running on the most idle cpu. Thus a single threaded application running on one cpu may have the other cpu zeroing pages for it etc. The scrub daemon is hardly noticeable and usually finishes zeroing quickly since most processors are optimized for linear memory filling.
The following patches increase performance but may be omitted:
[3/4] SGI Altix Block Transfer Engine Support Implements a driver to shift the zeroing off the cpu into hardware. With hardware support the impact of zeroing on the system is reduced to a minimum.
[4/4] Architecture specific clear_page updates Adds a second argument, the order, to clear_page and updates all arches. This allows the zeroing of large areas of memory without repeatedly invoking clear_page() for the page allocator, scrubd and the huge page allocator.
@@ -30,7 +34,7 @@ /* two interfaces on two btes */ #define MAX_INTERFACES_TO_TRY 4
-static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface) +static inline struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface) { nodepda_t *tmp_nodepda;
@@ -132,7 +136,6 @@ if (bte == NULL) { continue; } - if (spin_trylock(&bte->spinlock)) { if (!(*bte->most_rcnt_na & BTE_WORD_AVAILABLE) || (BTE_LNSTAT_LOAD(bte) & BTE_ACTIVE)) { @@ -157,7 +160,7 @@ } } while (1);
- if (notification == NULL) { + if (notification == NULL || (mode & BTE_NOTIFY_AND_GET_POINTER)) { /* User does not want to be notified. */ bte->most_rcnt_na = &bte->notify; } else { @@ -192,6 +195,8 @@
if (notification != NULL) { @@ -449,5 +454,47 @@ mynodepda->bte_if[i].cleanup_active = 0; mynodepda->bte_if[i].bh_error = 0; } +} + +u64 *bte_zero_notify[MAX_COMPACT_NODES]; + +#define ZERO_RATE_PER_SEC 500000000 + +static int bte_start_bzero(void *p, unsigned long len) +{ + int rc; + int ticks; + int node = get_nasid(); + + /* Check limitations. + 1. System must be running (weird things happen during bootup) + 2. Size >64KB. Smaller requests cause too much bte traffic + */ + if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING) + return EINVAL; + + rc = bte_zero(ia64_tpa(p), len, BTE_NOTIFY_AND_GET_POINTER, bte_zero_notify+node); + if (rc) + return rc; + + ticks = (len*HZ)/ZERO_RATE_PER_SEC; + if (ticks) { + /* Wait the minimum time of the transfer */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(ticks); + } + while (*(bte_zero_notify[node]) != BTE_WORD_BUSY) { + /* Then keep on checking until transfer is complete */ + cpu_relax(); + schedule(); + } + return 0; +} + +static struct zero_driver bte_bzero = { + .start = bte_start_bzero, +};
+void sn_bte_bzero_init(void) { + register_zero_driver(&bte_bzero); } Index: linux-2.6.10/arch/ia64/sn/kernel/setup.c =================================================================== --- linux-2.6.10.orig/arch/ia64/sn/kernel/setup.c 2005-01-10 13:48:08.000000000 -0800 +++ linux-2.6.10/arch/ia64/sn/kernel/setup.c 2005-01-10 13:54:52.000000000 -0800 @@ -244,6 +244,7 @@ int pxm; int major = sn_sal_rev_major(), minor = sn_sal_rev_minor(); extern void sn_cpu_init(void); + extern void sn_bte_bzero_init(void);
/* * If the generic code has enabled vga console support - lets @@ -334,6 +335,7 @@ screen_info = sn_screen_info;
sn_timer_init(); + sn_bte_bzero_init(); }
/** Index: linux-2.6.10/include/asm-ia64/sn/bte.h =================================================================== --- linux-2.6.10.orig/include/asm-ia64/sn/bte.h 2004-12-24 13:34:45.000000000 -0800 +++ linux-2.6.10/include/asm-ia64/sn/bte.h 2005-01-10 13:54:52.000000000 -0800 @@ -48,6 +48,8 @@ #define BTE_ZERO_FILL (BTE_NOTIFY | IBCT_ZFIL_MODE) /* Use a reserved bit to let the caller specify a wait for any BTE */ #define BTE_WACQUIRE (0x4000) +/* Return the pointer to the notification cacheline to the user */ +#define BTE_NOTIFY_AND_GET_POINTER (0x8000) /* Use the BTE on the node with the destination memory */ #define BTE_USE_DEST (BTE_WACQUIRE << 1) /* Use any available BTE interface on any node for the transfer */
#endif Index: linux-2.6.10/arch/ia64/lib/clear_page.S =================================================================== --- linux-2.6.10.orig/arch/ia64/lib/clear_page.S 2004-12-24 13:33:50.000000000 -0800 +++ linux-2.6.10/arch/ia64/lib/clear_page.S 2005-01-10 14:23:22.000000000 -0800 @@ -7,6 +7,7 @@ * 1/06/01 davidm Tuned for Itanium. * 2/12/02 kchen Tuned for both Itanium and McKinley * 3/08/02 davidm Some more tweaking + * 12/10/04 clameter Make it work on pages of order size */ #include <linux/config.h>