|
|
|
|
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
|
|
|
Date: Thu, 30 Nov 2017 13:40:10 +0100
|
|
|
|
Subject: [PATCH] crypto: limit more FPU-enabled sections
|
|
|
|
MIME-Version: 1.0
|
|
|
|
Content-Type: text/plain; charset=UTF-8
|
|
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
|
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.18/older/patches-4.18.7-rt5.tar.xz
|
|
|
|
|
|
|
|
|
Those crypto drivers use SSE/AVX/… for their crypto work and in order to
|
|
|
|
do so in kernel they need to enable the "FPU" in kernel mode which
|
|
|
|
disables preemption.
|
|
|
|
There are two problems with the way they are used:
|
|
|
|
- the while loop which processes X bytes may create latency spikes and
|
|
|
|
should be avoided or limited.
|
|
|
|
- the cipher-walk-next part may allocate/free memory and may use
|
|
|
|
kmap_atomic().
|
|
|
|
|
|
|
|
The whole kernel_fpu_begin()/end() processing isn't probably that cheap.
|
|
|
|
It most likely makes sense to process as much of those as possible in one
|
|
|
|
go. The new *_fpu_sched_rt() schedules only if a RT task is pending.
|
|
|
|
|
|
|
|
Probably we should measure the performance of those ciphers in pure SW
|
|
|
|
mode and with these optimisations to see if it makes sense to keep them
|
|
|
|
for RT.
|
|
|
|
|
|
|
|
This kernel_fpu_resched() makes the code more preemptible which might hurt
|
|
|
|
performance.
|
|
|
|
|
|
|
|
Cc: stable-rt@vger.kernel.org
|
|
|
|
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
|
|
|
---
|
|
|
|
arch/x86/crypto/chacha20_glue.c | 9 +++++----
|
|
|
|
arch/x86/include/asm/fpu/api.h | 1 +
|
|
|
|
arch/x86/kernel/fpu/core.c | 12 ++++++++++++
|
|
|
|
3 files changed, 18 insertions(+), 4 deletions(-)
|
|
|
|
|
|
|
|
--- a/arch/x86/crypto/chacha20_glue.c
|
|
|
|
+++ b/arch/x86/crypto/chacha20_glue.c
|
|
|
|
@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher
|
|
|
|
|
|
|
|
crypto_chacha20_init(state, ctx, walk.iv);
|
|
|
|
|
|
|
|
- kernel_fpu_begin();
|
|
|
|
-
|
|
|
|
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
|
|
|
|
+ kernel_fpu_begin();
|
|
|
|
+
|
|
|
|
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
|
|
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
|
|
|
|
+ kernel_fpu_end();
|
|
|
|
err = skcipher_walk_done(&walk,
|
|
|
|
walk.nbytes % CHACHA20_BLOCK_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (walk.nbytes) {
|
|
|
|
+ kernel_fpu_begin();
|
|
|
|
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
|
|
walk.nbytes);
|
|
|
|
+ kernel_fpu_end();
|
|
|
|
err = skcipher_walk_done(&walk, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
- kernel_fpu_end();
|
|
|
|
-
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
--- a/arch/x86/include/asm/fpu/api.h
|
|
|
|
+++ b/arch/x86/include/asm/fpu/api.h
|
|
|
|
@@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void);
|
|
|
|
extern void __kernel_fpu_end(void);
|
|
|
|
extern void kernel_fpu_begin(void);
|
|
|
|
extern void kernel_fpu_end(void);
|
|
|
|
+extern void kernel_fpu_resched(void);
|
|
|
|
extern bool irq_fpu_usable(void);
|
|
|
|
|
|
|
|
/*
|
|
|
|
--- a/arch/x86/kernel/fpu/core.c
|
|
|
|
+++ b/arch/x86/kernel/fpu/core.c
|
|
|
|
|
@@ -138,6 +138,18 @@ void kernel_fpu_end(void)
|
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(kernel_fpu_end);
|
|
|
|
|
|
|
|
+void kernel_fpu_resched(void)
|
|
|
|
+{
|
|
|
|
+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
|
|
|
|
+
|
|
|
|
+ if (should_resched(PREEMPT_OFFSET)) {
|
|
|
|
+ kernel_fpu_end();
|
|
|
|
+ cond_resched();
|
|
|
|
+ kernel_fpu_begin();
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
|
|
|
|
+
|
|
|
|
/*
|
|
|
|
* Save the FPU state (mark it for reload if necessary):
|
|
|
|
*
|