add copy user patches from Marvell
svn path=/dists/trunk/linux-2.6/; revision=13721
This commit is contained in:
parent
52a7c27516
commit
063ca30d03
|
@ -45,6 +45,9 @@ linux-2.6 (2.6.30~rc8-1~experimental.1) UNRELEASED; urgency=low
|
|||
these days, so disable IDE and build in ATA, SCSI and BLK_DEV_SD.
|
||||
* [mips/sb1-bcm91250a, mips/sb1a-bcm91480b] Compile in SB1250_MAC and
|
||||
BROADCOM_PHY.
|
||||
* Add patches from git.marvell.com:
|
||||
- alternative copy_to_user: more precise fallback threshold
|
||||
- lower overhead with alternative copy_to_user for small copies
|
||||
|
||||
[ Aurelien Jarno ]
|
||||
* [mips(el)/sb1-bcm91250a] Set CONFIG_SCSI_AIC7XXX=y, it is needed
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
From: Nicolas Pitre <nico@cam.org>
|
||||
Date: Sat, 30 May 2009 01:55:50 +0000 (-0400)
|
||||
Subject: [ARM] alternative copy_to_user: more precise fallback threshold
|
||||
X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=c626e3f5ca1d95ad2204d3128c26e7678714eb55
|
||||
|
||||
[ARM] alternative copy_to_user: more precise fallback threshold
|
||||
|
||||
Previous size thresholds were guessed from various user space benchmarks
|
||||
using a kernel with and without the alternative uaccess option. This
|
||||
is however not as precise as a kernel based test to measure the real
|
||||
speed of each method.
|
||||
|
||||
This adds a simple test bench to show the time needed for each method.
|
||||
With this, the optimal size threshold for the alternative implementation
|
||||
can be determined with more confidence. It appears that the optimal
|
||||
threshold for both copy_to_user and clear_user is around 64 bytes. This
|
||||
is not a surprise knowing that the memcpy and memset implementations
|
||||
need at least 64 bytes to achieve maximum throughput.
|
||||
|
||||
One might suggest that such a test be used to determine the optimal
|
||||
threshold at run time instead, but results are near enough to 64 on
|
||||
tested targets concerned by this alternative copy_to_user implementation,
|
||||
so adding some overhead associated with a variable threshold is probably
|
||||
not worth it for now.
|
||||
|
||||
Signed-off-by: Nicolas Pitre <nico@marvell.com>
|
||||
---
|
||||
|
||||
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
|
||||
index 92838e7..6b967ff 100644
|
||||
--- a/arch/arm/lib/uaccess_with_memcpy.c
|
||||
+++ b/arch/arm/lib/uaccess_with_memcpy.c
|
||||
@@ -106,7 +106,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
|
||||
* With frame pointer disabled, tail call optimization kicks in
|
||||
* as well making this test almost invisible.
|
||||
*/
|
||||
- if (n < 1024)
|
||||
+ if (n < 64)
|
||||
return __copy_to_user_std(to, from, n);
|
||||
return __copy_to_user_memcpy(to, from, n);
|
||||
}
|
||||
@@ -151,7 +151,78 @@ out:
|
||||
unsigned long __clear_user(void __user *addr, unsigned long n)
|
||||
{
|
||||
/* See rational for this in __copy_to_user() above. */
|
||||
- if (n < 256)
|
||||
+ if (n < 64)
|
||||
return __clear_user_std(addr, n);
|
||||
return __clear_user_memset(addr, n);
|
||||
}
|
||||
+
|
||||
+#if 0
|
||||
+
|
||||
+/*
|
||||
+ * This code is disabled by default, but kept around in case the chosen
|
||||
+ * thresholds need to be revalidated. Some overhead (small but still)
|
||||
+ * would be implied by a runtime determined variable threshold, and
|
||||
+ * so far the measurement on concerned targets didn't show a worthwhile
|
||||
+ * variation.
|
||||
+ *
|
||||
+ * Note that a fairly precise sched_clock() implementation is needed
|
||||
+ * for results to make some sense.
|
||||
+ */
|
||||
+
|
||||
+#include <linux/vmalloc.h>
|
||||
+
|
||||
+static int __init test_size_treshold(void)
|
||||
+{
|
||||
+ struct page *src_page, *dst_page;
|
||||
+ void *user_ptr, *kernel_ptr;
|
||||
+ unsigned long long t0, t1, t2;
|
||||
+ int size, ret;
|
||||
+
|
||||
+ ret = -ENOMEM;
|
||||
+ src_page = alloc_page(GFP_KERNEL);
|
||||
+ if (!src_page)
|
||||
+ goto no_src;
|
||||
+ dst_page = alloc_page(GFP_KERNEL);
|
||||
+ if (!dst_page)
|
||||
+ goto no_dst;
|
||||
+ kernel_ptr = page_address(src_page);
|
||||
+ user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010));
|
||||
+ if (!user_ptr)
|
||||
+ goto no_vmap;
|
||||
+
|
||||
+ /* warm up the src page dcache */
|
||||
+ ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE);
|
||||
+
|
||||
+ for (size = PAGE_SIZE; size >= 4; size /= 2) {
|
||||
+ t0 = sched_clock();
|
||||
+ ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size);
|
||||
+ t1 = sched_clock();
|
||||
+ ret |= __copy_to_user_std(user_ptr, kernel_ptr, size);
|
||||
+ t2 = sched_clock();
|
||||
+ printk("copy_to_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
|
||||
+ }
|
||||
+
|
||||
+ for (size = PAGE_SIZE; size >= 4; size /= 2) {
|
||||
+ t0 = sched_clock();
|
||||
+ ret |= __clear_user_memset(user_ptr, size);
|
||||
+ t1 = sched_clock();
|
||||
+ ret |= __clear_user_std(user_ptr, size);
|
||||
+ t2 = sched_clock();
|
||||
+ printk("clear_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
|
||||
+ }
|
||||
+
|
||||
+ if (ret)
|
||||
+ ret = -EFAULT;
|
||||
+
|
||||
+ vunmap(user_ptr);
|
||||
+no_vmap:
|
||||
+ put_page(dst_page);
|
||||
+no_dst:
|
||||
+ put_page(src_page);
|
||||
+no_src:
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+subsys_initcall(test_size_treshold);
|
||||
+
|
||||
+#endif
|
|
@ -0,0 +1,88 @@
|
|||
From: Nicolas Pitre <nico@cam.org>
|
||||
Date: Fri, 22 May 2009 02:17:17 +0000 (-0400)
|
||||
Subject: [ARM] lower overhead with alternative copy_to_user for small copies
|
||||
X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=cb9dc92c0a1b76165c8c334402e27191084b2047
|
||||
|
||||
[ARM] lower overhead with alternative copy_to_user for small copies
|
||||
|
||||
Because the alternate copy_to_user implementation has a higher setup cost
|
||||
than the standard implementation, the size of the memory area to copy
|
||||
is tested and the standard implementation invoked instead when that size
|
||||
is too small. Still, that test is made after the processor has preserved
|
||||
a bunch of registers on the stack which have to be reloaded right away
|
||||
needlessly in that case, causing a measurable performance regression
|
||||
compared to plain usage of the standard implementation only.
|
||||
|
||||
To make the size test overhead negligible, let's factorize it out of
|
||||
the alternate copy_to_user function where it is clear to the compiler
|
||||
that no stack frame is needed. Thanks to CONFIG_ARM_UNWIND allowing
|
||||
for frame pointers to be disabled and tail call optimization to kick in,
|
||||
the overhead in the small copy case becomes only 3 assembly instructions.
|
||||
|
||||
A similar trick is applied to clear_user as well.
|
||||
|
||||
Signed-off-by: Nicolas Pitre <nico@marvell.com>
|
||||
---
|
||||
|
||||
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
|
||||
index bf987b4..92838e7 100644
|
||||
--- a/arch/arm/lib/uaccess_with_memcpy.c
|
||||
+++ b/arch/arm/lib/uaccess_with_memcpy.c
|
||||
@@ -49,14 +49,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
|
||||
return 1;
|
||||
}
|
||||
|
||||
-unsigned long
|
||||
-__copy_to_user(void __user *to, const void *from, unsigned long n)
|
||||
+static unsigned long noinline
|
||||
+__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
|
||||
{
|
||||
int atomic;
|
||||
|
||||
- if (n < 1024)
|
||||
- return __copy_to_user_std(to, from, n);
|
||||
-
|
||||
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
|
||||
memcpy((void *)to, from, n);
|
||||
return 0;
|
||||
@@ -99,11 +96,24 @@ out:
|
||||
return n;
|
||||
}
|
||||
|
||||
-unsigned long __clear_user(void __user *addr, unsigned long n)
|
||||
+unsigned long
|
||||
+__copy_to_user(void __user *to, const void *from, unsigned long n)
|
||||
+{
|
||||
+ /*
|
||||
+ * This test is stubbed out of the main function above to keep
|
||||
+ * the overhead for small copies low by avoiding a large
|
||||
+ * register dump on the stack just to reload them right away.
|
||||
+ * With frame pointer disabled, tail call optimization kicks in
|
||||
+ * as well making this test almost invisible.
|
||||
+ */
|
||||
+ if (n < 1024)
|
||||
+ return __copy_to_user_std(to, from, n);
|
||||
+ return __copy_to_user_memcpy(to, from, n);
|
||||
+}
|
||||
+
|
||||
+static unsigned long noinline
|
||||
+__clear_user_memset(void __user *addr, unsigned long n)
|
||||
{
|
||||
- if (n < 256)
|
||||
- return __clear_user_std(addr, n);
|
||||
-
|
||||
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
|
||||
memset((void *)addr, 0, n);
|
||||
return 0;
|
||||
@@ -137,3 +147,11 @@ unsigned long __clear_user(void __user *addr, unsigned long n)
|
||||
out:
|
||||
return n;
|
||||
}
|
||||
+
|
||||
+unsigned long __clear_user(void __user *addr, unsigned long n)
|
||||
+{
|
||||
+ /* See rational for this in __copy_to_user() above. */
|
||||
+ if (n < 256)
|
||||
+ return __clear_user_std(addr, n);
|
||||
+ return __clear_user_memset(addr, n);
|
||||
+}
|
|
@ -27,6 +27,8 @@
|
|||
#+ features/sparc/video-sunxvr500-intergraph.patch
|
||||
+ features/arm/allow-alternative-copy-user.patch
|
||||
+ features/arm/alternative-copy-user.patch
|
||||
+ features/arm/lower_overhead_with_alternative.patch
|
||||
+ features/arm/copy_to_user-better_threshold.patch
|
||||
+ bugfix/all/mvsdio-platform.patch
|
||||
+ bugfix/all/mvsdio-ignore-high-speed.patch
|
||||
+ bugfix/all/mvsdio-config-failure.patch
|
||||
|
|
Loading…
Reference in New Issue