Google Groups Home
Help | Sign in
Prezeroing V3 [1/4]: Allow request for zeroed memory
There are currently too many topics in this group that display first. To make this topic appear first, remove this option from another topic.
There was an error processing your request. Please try again.
flag
  Messages 76 - 100 of 140 - Collapse all < Older  Newer >
The group you are posting to is a Usenet group. Messages posted to this group will make your email address visible to anyone on the Internet.
Your reply message has not been sent.
Your post was successful
Christoph Lameter  
View profile
 More options Jan 10 2005, 3:30 pm
Newsgroups: linux.kernel
From: Christoph Lameter <clame...@sgi.com>
Date: Mon, 10 Jan 2005 21:30:45 +0100
Local: Mon, Jan 10 2005 3:30 pm
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Mon, 10 Jan 2005, Linus Torvalds wrote:
> Currently the BK tree
>  - doesn't use __GFP_ZERO with anonymous user-mapped pages (which is what
>    you wrote this whole thing for ;)

>    Potential fix: declare a per-architecture "alloc_user_highpage(vaddr)"
>    that does the proper magic on virtually indexed machines, and on others
>    it just does a "alloc_page(GFP_HIGHUSER | __GFP_ZERO)".

The following patch adds an alloc_zeroed_user_highpage(vma, vaddr). It
also uses zeroed pages on COW. clear_user_highpage is now only used by
that function. Fold it into alloc_zeroed_user_highpage?

This is against last hour's BitKeeper tree. mm/memory.o compiles fine but
I was not able to build an ia64 kernel due to some pieces that seem to be
missing in last hour's tree.

Index: linus/include/asm-ia64/page.h
===================================================================
--- linus.orig/include/asm-ia64/page.h  2004-10-20 12:04:58.000000000 -0700
+++ linus/include/asm-ia64/page.h       2005-01-10 12:05:55.000000000 -0800
@@ -75,6 +75,22 @@
        flush_dcache_page(page);                \
 } while (0)

+
+/*
+ * Allocate a page for a user mapping at vaddr in vma, requesting zeroed
+ * memory via __GFP_ZERO, and call flush_dcache_page() on it.
+ * Evaluates to the new page, or to NULL when the allocation failed; the
+ * flush is skipped in that case so a NULL page is never dereferenced.
+ */
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+({                                             \
+       struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
+       if (page)                               \
+               flush_dcache_page(page);        \
+       page;                                   \
+})
+
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)

 #ifdef CONFIG_VIRTUAL_MEM_MAP
Index: linus/include/asm-h8300/page.h
===================================================================
--- linus.orig/include/asm-h8300/page.h 2004-10-20 12:04:58.000000000 -0700
+++ linus/include/asm-h8300/page.h      2005-01-10 11:53:17.000000000 -0800
@@ -30,6 +30,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linus/mm/memory.c
===================================================================
--- linus.orig/mm/memory.c      2005-01-10 11:44:39.000000000 -0800
+++ linus/mm/memory.c   2005-01-10 12:05:21.000000000 -0800
@@ -84,20 +84,6 @@
 EXPORT_SYMBOL(vmalloc_earlyreserve);

 /*
- * We special-case the C-O-W ZERO_PAGE, because it's such
- * a common occurrence (no need to read the page to know
- * that it's zero - better for the cache and memory subsystem).
- */
-static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
-{
-       if (from == ZERO_PAGE(address)) {
-               clear_user_highpage(to, address);
-               return;
-       }
-       copy_user_highpage(to, from, address);
-}
-
-/*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
@@ -1329,11 +1315,16 @@

        if (unlikely(anon_vma_prepare(vma)))
                goto no_new_page;
-       new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-       if (!new_page)
-               goto no_new_page;
-       copy_cow_page(old_page,new_page,address);
-
+       if (old_page == ZERO_PAGE(address)) {
+               new_page = alloc_zeroed_user_highpage(vma, address);
+               if (!new_page)
+                       goto no_new_page;
+       } else {
+               new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+               if (!new_page)
+                       goto no_new_page;
+               copy_user_highpage(new_page, old_page, address);
+       }
        /*
         * Re-check the pte - we dropped the lock
         */
@@ -1795,10 +1786,9 @@

                if (unlikely(anon_vma_prepare(vma)))
                        goto no_mem;
-               page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+               page = alloc_zeroed_user_highpage(vma, addr);
                if (!page)
                        goto no_mem;
-               clear_user_highpage(page, addr);

                spin_lock(&mm->page_table_lock);
                page_table = pte_offset_map(pmd, addr);
Index: linus/include/asm-m32r/page.h
===================================================================
--- linus.orig/include/asm-m32r/page.h  2004-10-20 12:04:58.000000000 -0700
+++ linus/include/asm-m32r/page.h       2005-01-10 12:08:03.000000000 -0800
@@ -17,6 +17,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linus/include/asm-alpha/page.h
===================================================================
--- linus.orig/include/asm-alpha/page.h 2004-10-20 12:04:57.000000000 -0700
+++ linus/include/asm-alpha/page.h      2005-01-10 11:54:37.000000000 -0800
@@ -18,6 +18,9 @@
 extern void clear_page(void *page);
 #define clear_user_page(page, vaddr, pg)       clear_page(page)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 extern void copy_page(void * _to, void * _from);
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

Index: linus/include/asm-m68knommu/page.h
===================================================================
--- linus.orig/include/asm-m68knommu/page.h     2005-01-10 09:53:05.000000000 -0800
+++ linus/include/asm-m68knommu/page.h  2005-01-10 11:54:27.000000000 -0800
@@ -30,6 +30,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linus/include/asm-cris/page.h
===================================================================
--- linus.orig/include/asm-cris/page.h  2004-10-20 12:04:57.000000000 -0700
+++ linus/include/asm-cris/page.h       2005-01-10 11:55:06.000000000 -0800
@@ -21,6 +21,9 @@
 #define clear_user_page(page, vaddr, pg)    clear_page(page)
 #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linus/include/linux/highmem.h
===================================================================
--- linus.orig/include/linux/highmem.h  2005-01-06 12:58:48.000000000 -0800
+++ linus/include/linux/highmem.h       2005-01-10 12:08:56.000000000 -0800
@@ -42,6 +42,24 @@
        smp_wmb();
 }

+#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+/*
+ * Generic fallback used when the architecture does not define
+ * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE: allocate a GFP_HIGHUSER page for
+ * vaddr in vma and clear it with clear_user_highpage().
+ * Returns the page, or NULL if the allocation failed.
+ */
+static inline struct page *
+alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
+{
+       struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+
+       if (page)
+               clear_user_highpage(page, vaddr);
+       return page;
+}
+#endif
+
 static inline void clear_highpage(struct page *page)
 {
        void *kaddr = kmap_atomic(page, KM_USER0);
Index: linus/include/asm-i386/page.h
===================================================================
--- linus.orig/include/asm-i386/page.h  2005-01-06 12:58:47.000000000 -0800
+++ linus/include/asm-i386/page.h       2005-01-10 12:09:43.000000000 -0800
@@ -36,6 +36,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linus/include/asm-x86_64/page.h
===================================================================
--- linus.orig/include/asm-x86_64/page.h        2005-01-06 12:58:48.000000000 -0800
+++ linus/include/asm-x86_64/page.h     2005-01-10 11:56:04.000000000 -0800
@@ -38,6 +38,8 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 /*
  * These are used to make use of C type-checking..
  */
Index: linus/include/asm-s390/page.h
===================================================================
--- linus.orig/include/asm-s390/page.h  2004-10-20 12:04:59.000000000 -0700
+++ linus/include/asm-s390/page.h       2005-01-10 11:56:33.000000000 -0800
@@ -106,6 +106,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /* Pure 2^n version of get_order */
 extern __inline__ int get_order(unsigned long size)
 {
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


    Reply to author    Forward  
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Discussion subject changed to "Prezeroing V4 [1/4]: Arch specific page zeroing during page fault" by Christoph Lameter
Christoph Lameter  
View profile
 More options Jan 10 2005, 7:10 pm
Newsgroups: linux.kernel
From: Christoph Lameter <clame...@sgi.com>
Date: Tue, 11 Jan 2005 01:10:05 +0100
Local: Mon, Jan 10 2005 7:10 pm
Subject: Prezeroing V4 [1/4]: Arch specific page zeroing during page fault
This patch fixes the __GFP_ZERO related code by adding a new function
alloc_zeroed_user_highpage that is then used in the anonymous page fault
handler and in the COW code to allocate pages. The function can be defined
per arch to set up special processing for user pages by defining
__HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE.

Signed-off-by: Christoph Lameter <clame...@sgi.com>

Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h   2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h        2005-01-10 13:53:59.000000000 -0800
@@ -75,6 +75,22 @@
        flush_dcache_page(page);                \
 } while (0)

+
+/*
+ * Allocate a page for a user mapping at vaddr in vma, requesting zeroed
+ * memory via __GFP_ZERO, and call flush_dcache_page() on it.
+ * Evaluates to the new page, or to NULL when the allocation failed; the
+ * flush is skipped in that case so a NULL page is never dereferenced.
+ */
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+({                                             \
+       struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
+       if (page)                               \
+               flush_dcache_page(page);        \
+       page;                                   \
+})
+
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)

 #ifdef CONFIG_VIRTUAL_MEM_MAP
Index: linux-2.6.10/include/asm-h8300/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-h8300/page.h  2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-h8300/page.h       2005-01-10 13:53:59.000000000 -0800
@@ -30,6 +30,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linux-2.6.10/mm/memory.c
===================================================================
--- linux-2.6.10.orig/mm/memory.c       2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/mm/memory.c    2005-01-10 13:54:30.000000000 -0800
@@ -84,20 +84,6 @@
 EXPORT_SYMBOL(vmalloc_earlyreserve);

 /*
- * We special-case the C-O-W ZERO_PAGE, because it's such
- * a common occurrence (no need to read the page to know
- * that it's zero - better for the cache and memory subsystem).
- */
-static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
-{
-       if (from == ZERO_PAGE(address)) {
-               clear_user_highpage(to, address);
-               return;
-       }
-       copy_user_highpage(to, from, address);
-}
-
-/*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
@@ -1329,11 +1315,16 @@

        if (unlikely(anon_vma_prepare(vma)))
                goto no_new_page;
-       new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-       if (!new_page)
-               goto no_new_page;
-       copy_cow_page(old_page,new_page,address);
-
+       if (old_page == ZERO_PAGE(address)) {
+               new_page = alloc_zeroed_user_highpage(vma, address);
+               if (!new_page)
+                       goto no_new_page;
+       } else {
+               new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+               if (!new_page)
+                       goto no_new_page;
+               copy_user_highpage(new_page, old_page, address);
+       }
        /*
         * Re-check the pte - we dropped the lock
         */
@@ -1795,7 +1786,7 @@

                if (unlikely(anon_vma_prepare(vma)))
                        goto no_mem;
-               page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
+               page = alloc_zeroed_user_highpage(vma, addr);
                if (!page)
                        goto no_mem;

Index: linux-2.6.10/include/asm-m32r/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/page.h   2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/page.h        2005-01-10 13:53:59.000000000 -0800
@@ -17,6 +17,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linux-2.6.10/include/asm-alpha/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/page.h  2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/page.h       2005-01-10 13:53:59.000000000 -0800
@@ -18,6 +18,9 @@
 extern void clear_page(void *page);
 #define clear_user_page(page, vaddr, pg)       clear_page(page)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 extern void copy_page(void * _to, void * _from);
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

Index: linux-2.6.10/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68knommu/page.h      2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/asm-m68knommu/page.h   2005-01-10 13:53:59.000000000 -0800
@@ -30,6 +30,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linux-2.6.10/include/asm-cris/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/page.h   2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/include/asm-cris/page.h        2005-01-10 13:53:59.000000000 -0800
@@ -21,6 +21,9 @@
 #define clear_user_page(page, vaddr, pg)    clear_page(page)
 #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linux-2.6.10/include/linux/highmem.h
===================================================================
--- linux-2.6.10.orig/include/linux/highmem.h   2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/linux/highmem.h        2005-01-10 13:53:59.000000000 -0800
@@ -42,6 +42,24 @@
        smp_wmb();
 }

+#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+/*
+ * Generic fallback used when the architecture does not define
+ * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE: allocate a GFP_HIGHUSER page for
+ * vaddr in vma and clear it with clear_user_highpage().
+ * Returns the page, or NULL if the allocation failed.
+ */
+static inline struct page *
+alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
+{
+       struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+
+       if (page)
+               clear_user_highpage(page, vaddr);
+       return page;
+}
+#endif
+
 static inline void clear_highpage(struct page *page)
 {
        void *kaddr = kmap_atomic(page, KM_USER0);
Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h   2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h        2005-01-10 13:53:59.000000000 -0800
@@ -36,6 +36,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /*
  * These are used to make use of C type-checking..
  */
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h      2005-01-10 13:53:59.000000000 -0800
@@ -38,6 +38,8 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 /*
  * These are used to make use of C type-checking..
  */
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h   2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h        2005-01-10 13:53:59.000000000 -0800
@@ -106,6 +106,9 @@
 #define clear_user_page(page, vaddr, pg)       clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
 /* Pure 2^n version of get_order */
 extern __inline__ int get_order(unsigned long size)
 {

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


    Reply to author    Forward  
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Discussion subject changed to "Prezeroing V4 [0/4]: Overview" by Christoph Lameter
Christoph Lameter  
View profile
 More options Jan 10 2005, 7:10 pm
Newsgroups: linux.kernel
From: Christoph Lameter <clame...@sgi.com>
Date: Tue, 11 Jan 2005 01:10:10 +0100
Local: Mon, Jan 10 2005 7:10 pm
Subject: Prezeroing V4 [0/4]: Overview
Changes from V3 to V4:
o Drop __GFP_ZERO patch since it's in Linus' tree. Include new patch that allows
  archs that need special measures around zeroing of user pages during a page
  fault to maintain their special adaptations.
o Use zeroed pages during COW.
o Updates for clear_page for various platforms. Make clear_page an optional
  patch and fall back to a series of clear_page calls without order if the
  patch to expand clear_page has not been applied.
o x86_64 asm code fixed up
o Port patches to 2.6.10-bk13 and make it fit the bitmapless buddy allocator

The patches increasing the page fault rate (introduction of atomic pte
operations and anticipatory prefaulting) do so by reducing the locking
overhead and are therefore mainly of interest for applications running in
SMP systems with a high number of cpus. Single-thread performance shows
only minor increases. Only the performance of multi-threaded
applications increases significantly.

The most expensive operation in the page fault handler is (apart from SMP
locking overhead) the zeroing of the page that is also done in the page fault
handler. This zeroing means that all cachelines of the faulted page (on Altix
that means all 128 cachelines of 128 bytes each) must be loaded and later
written back. This patch makes it possible to avoid loading all cachelines
if only a part of the cachelines of that page is needed immediately after
the fault. Doing so will only be effective for sparsely accessed memory
which is typical for anonymous memory and pte maps. Prezeroed pages will
only be used for those purposes. Unzeroed pages will be used as usual for
file mapping, page caching etc etc.

The patch makes prezeroing very effective by:

1. Aggregating zeroing operations to only apply to pages of higher order,
which results in many pages that will later become order 0 to be zeroed in one
step.
For that purpose the existing clear_page function is extended and made to
take an additional argument specifying the order of the page to be cleared.

2. Hardware support for offloading zeroing from the cpu. This avoids
the invalidation of the cpu caches by extensive zeroing operations.

The scrub daemon is invoked when an unzeroed page of a certain order has
been generated so that it's worth running it. If no higher order pages are
present then the logic will favor hot zeroing rather than simply shifting
processing around. kscrubd typically runs only for a fraction of a second
and sleeps for long periods of time even under memory benchmarking. kscrubd
performs short bursts of zeroing when needed and tries to stay off the
processor as much as possible.

The benefits of prezeroing are reduced to minimal quantities if all
cachelines of a page are touched. Prezeroing can only be effective
if the whole page is not immediately used after the page fault.

The patch is composed of 4 parts:

[1/4] GFP_ZERO fixups
        Adds alloc_zeroed_user_highpage(vma, vaddr) that may be customized for
        each arch by defining __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE. Includes
        proper definitions for a large selection of arches; others fall back to
        the default function in include/linux/highmem.h (and thus do not use
        prezeroed pages).

[2/4] Page Zeroing
        Adds management of ZEROED and NOT_ZEROED pages and a background daemon
        called scrubd. scrubd is disabled by default but can be enabled
        by writing an order number to /proc/sys/vm/scrub_start. If a page
        is coalesced of that order or higher then the scrub daemon will
        start zeroing until all pages of order /proc/sys/vm/scrub_stop and
        higher are zeroed and then go back to sleep.

        In an SMP environment the scrub daemon is typically
        running on the most idle cpu. Thus a single threaded application running
        on one cpu may have the other cpu zeroing pages for it etc. The scrub
        daemon is hardly noticeable and usually finishes zeroing quickly since
        most processors are optimized for linear memory filling.

The following patches increase performance but may be omitted:

[3/4] SGI Altix Block Transfer Engine Support
        Implements a driver to shift the zeroing off the cpu into hardware.
        With hardware support the impact of zeroing on the system is reduced
        to a minimum.

[4/4] Architecture specific clear_page updates
        Adds a second argument, the order, to clear_page and updates all arches.
        This allows the zeroing of large areas of memory without repeatedly
        invoking clear_page() for the page allocator, scrubd and the huge
        page allocator.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


    Reply to author    Forward  
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Discussion subject changed to "Prezeroing V4 [3/4]: Altix SN2 BTE zero driver" by Christoph Lameter
Christoph Lameter  
View profile
 More options Jan 10 2005, 7:20 pm
Newsgroups: linux.kernel
From: Christoph Lameter <clame...@sgi.com>
Date: Tue, 11 Jan 2005 01:20:11 +0100
Local: Mon, Jan 10 2005 7:20 pm
Subject: Prezeroing V4 [3/4]: Altix SN2 BTE zero driver
o Zeroing driver implemented with the Block Transfer Engine in the Altix
  SN2 SHub.

Signed-off-by: Christoph Lameter <clame...@sgi.com>

Index: linux-2.6.10/arch/ia64/sn/kernel/bte.c
===================================================================
--- linux-2.6.10.orig/arch/ia64/sn/kernel/bte.c 2004-12-24 13:34:58.000000000 -0800
+++ linux-2.6.10/arch/ia64/sn/kernel/bte.c      2005-01-10 13:54:52.000000000 -0800
@@ -4,6 +4,8 @@
  * for more details.
  *
  * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * Support for zeroing pages, Christoph Lameter, SGI, December 2004.
  */

 #include <linux/config.h>
@@ -20,6 +22,8 @@
 #include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/scrub.h>

 #include <asm/sn/bte.h>

@@ -30,7 +34,7 @@
 /* two interfaces on two btes */
 #define MAX_INTERFACES_TO_TRY          4

-static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
+static inline struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
 {
        nodepda_t *tmp_nodepda;

@@ -132,7 +136,6 @@
                        if (bte == NULL) {
                                continue;
                        }
-
                        if (spin_trylock(&bte->spinlock)) {
                                if (!(*bte->most_rcnt_na & BTE_WORD_AVAILABLE) ||
                                    (BTE_LNSTAT_LOAD(bte) & BTE_ACTIVE)) {
@@ -157,7 +160,7 @@
                }
        } while (1);

-       if (notification == NULL) {
+       if (notification == NULL || (mode & BTE_NOTIFY_AND_GET_POINTER)) {
                /* User does not want to be notified. */
                bte->most_rcnt_na = &bte->notify;
        } else {
@@ -192,6 +195,8 @@

        itc_end = ia64_get_itc() + (40000000 * local_cpu_data->cyc_per_usec);

+       if (mode & BTE_NOTIFY_AND_GET_POINTER)
+                *(u64 volatile **)(notification) = &bte->notify;
        spin_unlock_irqrestore(&bte->spinlock, irq_flags);

        if (notification != NULL) {
@@ -449,5 +454,47 @@
                mynodepda->bte_if[i].cleanup_active = 0;
                mynodepda->bte_if[i].bh_error = 0;
        }
+}
+
+u64 *bte_zero_notify[MAX_COMPACT_NODES];
+
+#define ZERO_RATE_PER_SEC 500000000
+
+static int bte_start_bzero(void *p, unsigned long len)
+{
+       int rc;
+       int ticks;
+       int node = get_nasid(); /* NOTE(review): nasid indexes bte_zero_notify[] -- confirm it is < MAX_COMPACT_NODES */
+
+       /* Zero len bytes at p via bte_zero() and wait for it; returns 0 on
+        * success. Limits: the system must be running (weird things happen
+        * during bootup) and 60000 <= len < BTE_MAX_XFER, as smaller requests
+        * cause too much bte traffic. NOTE(review): EINVAL is positive here. */
+       if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING)
+               return EINVAL;
+
+       rc = bte_zero(ia64_tpa(p), len, BTE_NOTIFY_AND_GET_POINTER, bte_zero_notify+node);
+       if (rc)
+               return rc;
+
+       ticks = (len*HZ)/ZERO_RATE_PER_SEC;
+       if (ticks) {
+               /* Sleep for the minimum transfer time, assuming ZERO_RATE_PER_SEC */
+               current->state = TASK_INTERRUPTIBLE;
+               schedule_timeout(ticks);
+       }
+       while (*(bte_zero_notify[node]) != BTE_WORD_BUSY) {
+               /* Then poll until the transfer completes. NOTE(review): confirm "!=" polarity */
+               cpu_relax();
+               schedule();
+       }
+       return 0;
+}
+
+static struct zero_driver bte_bzero = {
+       .start = bte_start_bzero,
+};

+void sn_bte_bzero_init(void) {
+       register_zero_driver(&bte_bzero);
 }
Index: linux-2.6.10/arch/ia64/sn/kernel/setup.c
===================================================================
--- linux-2.6.10.orig/arch/ia64/sn/kernel/setup.c       2005-01-10 13:48:08.000000000 -0800
+++ linux-2.6.10/arch/ia64/sn/kernel/setup.c    2005-01-10 13:54:52.000000000 -0800
@@ -244,6 +244,7 @@
        int pxm;
        int major = sn_sal_rev_major(), minor = sn_sal_rev_minor();
        extern void sn_cpu_init(void);
+       extern void sn_bte_bzero_init(void);

        /*
         * If the generic code has enabled vga console support - lets
@@ -334,6 +335,7 @@
        screen_info = sn_screen_info;

        sn_timer_init();
+       sn_bte_bzero_init();
 }

 /**
Index: linux-2.6.10/include/asm-ia64/sn/bte.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/sn/bte.h 2004-12-24 13:34:45.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/sn/bte.h      2005-01-10 13:54:52.000000000 -0800
@@ -48,6 +48,8 @@
 #define BTE_ZERO_FILL (BTE_NOTIFY | IBCT_ZFIL_MODE)
 /* Use a reserved bit to let the caller specify a wait for any BTE */
 #define BTE_WACQUIRE (0x4000)
+/* Return the pointer to the notification cacheline to the user */
+#define BTE_NOTIFY_AND_GET_POINTER (0x8000)
 /* Use the BTE on the node with the destination memory */
 #define BTE_USE_DEST (BTE_WACQUIRE << 1)
 /* Use any available BTE interface on any node for the transfer */

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


    Reply to author    Forward  
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Discussion subject changed to "Prezeroing V4 [4/4]: Extend clear_page to take an order parameter" by Christoph Lameter
Christoph Lameter  
View profile
 More options Jan 10 2005, 7:40 pm
Newsgroups: linux.kernel
From: Christoph Lameter <clame...@sgi.com>
Date: Tue, 11 Jan 2005 01:40:11 +0100
Local: Mon, Jan 10 2005 7:40 pm
Subject: Prezeroing V4 [4/4]: Extend clear_page to take an order parameter

- Extend clear_page to take an order parameter.

Architecture support:
---------------------

Known to work:

ia64
i386
x86_64
sparc64
m68k

Trivial modifications, expected to simply work:

arm
cris
h8300
m68knommu
ppc
ppc64
sh64
v850
parisc
sparc
um

Modifications made, but feedback from the arch maintainers would be appreciated:

s390
alpha
sh
mips
m32r

Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h   2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h        2005-01-10 14:23:21.000000000 -0800
@@ -56,7 +56,7 @@
 # ifdef __KERNEL__
 #  define STRICT_MM_TYPECHECKS

-extern void clear_page (void *page);
+extern void clear_page (void *page, int order);
 extern void copy_page (void *to, void *from);

 /*
@@ -65,7 +65,7 @@
  */
 #define clear_user_page(addr, vaddr, page)     \
 do {                                           \
-       clear_page(addr);                       \
+       clear_page(addr, 0);                    \
        flush_dcache_page(page);                \
 } while (0)

Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h   2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h        2005-01-10 14:23:22.000000000 -0800
@@ -18,7 +18,7 @@

 #include <asm/mmx.h>

-#define clear_page(page)       mmx_clear_page((void *)(page))
+#define clear_page(page, order)        mmx_clear_page((void *)(page),order)
 #define copy_page(to,from)     mmx_copy_page(to,from)

 #else
@@ -28,12 +28,12 @@
  *     Maybe the K6-III ?
  */

-#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order)        memset((void *)(page), 0, PAGE_SIZE << (order))
 #define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)

 #endif

-#define clear_user_page(page, vaddr, pg)       clear_page(page)
+#define clear_user_page(page, vaddr, pg)       clear_page(page, 0)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

 #define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h      2005-01-10 14:23:22.000000000 -0800
@@ -32,10 +32,10 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__

-void clear_page(void *);
+void clear_page(void *, int);
 void copy_page(void *, void *);

-#define clear_user_page(page, vaddr, pg)       clear_page(page)
+#define clear_user_page(page, vaddr, pg)       clear_page(page, 0)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

 #define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-sparc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc/page.h  2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-sparc/page.h       2005-01-10 14:23:22.000000000 -0800
@@ -28,10 +28,10 @@

 #ifndef __ASSEMBLY__

-#define clear_page(page)        memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order)         memset((void *)(page), 0, PAGE_SIZE << (order))
 #define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
 #define clear_user_page(addr, vaddr, page)     \
-       do {    clear_page(addr);               \
+       do {    clear_page(addr, 0);            \
                sparc_flush_page_to_ram(page);  \
        } while (0)
 #define copy_user_page(to, from, vaddr, page)  \
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h   2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h        2005-01-10 14:23:22.000000000 -0800
@@ -22,12 +22,12 @@

 #ifndef __s390x__

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
 {
        register_pair rp;

        rp.subreg.even = (unsigned long) page;
-       rp.subreg.odd = (unsigned long) 4096;
+       rp.subreg.odd = (unsigned long) 4096 << order;
         asm volatile ("   slr  1,1\n"
                      "   mvcl %0,0"
                      : "+&a" (rp) : : "memory", "cc", "1" );
@@ -63,14 +63,19 @@

 #else /* __s390x__ */

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
 {
-        asm volatile ("   lgr  2,%0\n"
+       int nr = 1 << order;
+
+       while (nr-- >0) {
+               asm volatile ("   lgr  2,%0\n"
                       "   lghi 3,4096\n"
                       "   slgr 1,1\n"
                       "   mvcl 2,0"
                       : : "a" ((void *) (page))
                      : "memory", "cc", "1", "2", "3" );
+               page += PAGE_SIZE;
+       }
 }

 static inline void copy_page(void *to, void *from)
@@ -103,7 +108,7 @@

 #endif /* __s390x__ */

-#define clear_user_page(page, vaddr, pg)       clear_page(page)
+#define clear_user_page(page, vaddr, pg)       clear_page(page, 0)
 #define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)

 #define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.10.orig/arch/i386/lib/mmx.c       2004-12-24 13:34:48.000000000 -0800
+++ linux-2.6.10/arch/i386/lib/mmx.c    2005-01-10 14:23:22.000000000 -0800
@@ -128,7 +128,7 @@
  *     other MMX using processors do not.
  */

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
 {
        int i;

@@ -138,7 +138,7 @@
                "  pxor %%mm0, %%mm0\n" : :
        );

-       for(i=0;i<4096/64;i++)
+       for(i=0;i<((4096/64) << order);i++)
        {
                __asm__ __volatile__ (
                "  movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@
  *     Generic MMX implementation without K7 specific streaming
  */

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
 {
        int i;

@@ -267,7 +267,7 @@
                "  pxor %%mm0, %%mm0\n" : :
        );

-       for(i=0;i<4096/128;i++)
+       for(i=0;i<((4096/128) << order);i++)
        {
                __asm__ __volatile__ (
                "  movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@
  *     Favour MMX for page clear and copy.
  */

-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
 {
        int d0, d1;
        __asm__ __volatile__( \
                "cld\n\t" \
                "rep ; stosl" \
                : "=&c" (d0), "=&D" (d1)
-               :"a" (0),"1" (page),"0" (1024)
+               :"a" (0),"1" (page),"0" (1024 << order)
                :"memory");
 }
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
 {
        if(unlikely(in_interrupt()))
-               slow_zero_page(page);
+               slow_clear_page(page, order);
        else
-               fast_clear_page(page);
+               fast_clear_page(page, order);
 }

 static void slow_copy_page(void *to, void *from)
Index: linux-2.6.10/include/asm-x86_64/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/mmx.h  2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/mmx.h       2005-01-10 14:23:22.000000000 -0800
@@ -8,7 +8,7 @@
 #include <linux/types.h>

 extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
 extern void mmx_copy_page(void *to, void *from);

 #endif
Index: linux-2.6.10/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/ia64/lib/clear_page.S        2004-12-24 13:33:50.000000000 -0800
+++ linux-2.6.10/arch/ia64/lib/clear_page.S     2005-01-10 14:23:22.000000000 -0800
@@ -7,6 +7,7 @@
  * 1/06/01 davidm      Tuned for Itanium.
  * 2/12/02 kchen       Tuned for both Itanium and McKinley
  * 3/08/02 davidm      Some more tweaking
+ * 12/10/04 clameter   Make it work on pages of order size
  */
 #include <linux/config.h>

@@ -29,27 +30,33 @@
 #define dst4           r11

 #define dst_last       r31
+#define totsize                r14

 GLOBAL_ENTRY(clear_page)
        .prologue
-       .regstk 1,0,0,0
-       mov r16 = PAGE_SIZE/L3_LINE_SIZE-1      // main loop count, -1=repeat/until
+       .regstk 2,0,0,0
+       mov r16 = PAGE_SIZE/L3_LINE_SIZE        // main loop count
+       mov totsize = PAGE_SIZE
        .save ar.lc, saved_lc
        mov saved_lc = ar.lc
-
+       ;;
        .body
+       adds dst1 = 16, in0
        mov ar.lc = (PREFETCH_LINES - 1)
        mov dst_fetch = in0
-       adds dst1 = 16, in0
        adds dst2 = 32, in0
+       shl r16 = r16, in1
+       shl totsize = totsize, in1
        ;;
 .fetch:        stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
        adds dst3 = 48, in0             // executing this multiple times is harmless
        br.cloop.sptk.few .fetch
+       add r16 = -1,r16
+       add dst_last = totsize, dst_fetch
+       adds dst4 = 64, in0
        ;;
-       addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
        mov ar.lc = r16                 // one L3 line per iteration
-       adds dst4 = 64, in0
+       adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
        ;;
 #ifdef CONFIG_ITANIUM
        // Optimized for Itanium
Index: linux-2.6.10/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/x86_64/lib/clear_page.S      2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/arch/x86_64/lib/clear_page.S   2005-01-10 14:23:22.000000000 -0800
@@ -1,12 +1,16 @@
 /*
  * Zero a page.
  * rdi page
+ * rsi order
  */
        .globl clear_page
        .p2align 4
 clear_page:
+       movl   $4096/64,%eax
+       movl    %esi, %ecx
+       shll    %cl, %eax
+       movl    %eax, %ecx
        xorl   %eax,%eax
-       movl   $4096/64,%ecx
        .p2align 4
 .Lloop:
        decl    %ecx
@@ -41,7 +45,10 @@

        .section .altinstr_replacement,"ax"
 clear_page_c:
-       movl $4096/8,%ecx
+       movl $4096/8,%eax
+       movl %esi, %ecx
+       shll %cl, %eax
+       movl %eax, %ecx
        xorl %eax,%eax
        rep
        stosq
Index: linux-2.6.10/include/asm-sh/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh/page.h     2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/include/asm-sh/page.h  2005-01-10 14:23:22.000000000 -0800
@@ -36,12 +36,22 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__

-extern void (*clear_page)(void *to);
+extern void ...

read more »


    Reply to author    Forward  
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Discussion subject changed to "Prezeroing V4 [2/4]: Zeroing implementation" by Christoph Lameter