From: Sebastian Andrzej Siewior Date: Fri, 21 Feb 2014 17:24:04 +0100 Subject: crypto: Reduce preempt disabled regions, more algos Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.16/older/patches-4.16.8-rt3.tar.xz Don Estabrook reported | kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100() | kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2462 migrate_enable+0x17b/0x200() | kernel: WARNING: CPU: 3 PID: 865 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100() and his backtrace showed some crypto functions which looked fine. The problem is the following sequence: glue_xts_crypt_128bit() { blkcipher_walk_virt(); /* normal migrate_disable() */ glue_fpu_begin(); /* get atomic */ while (nbytes) { __glue_xts_crypt_128bit(); blkcipher_walk_done(); /* with nbytes = 0, migrate_enable() * while we are atomic */ }; glue_fpu_end() /* no longer atomic */ } and this is why the counter get out of sync and the warning is printed. The other problem is that we are non-preemptible between glue_fpu_begin() and glue_fpu_end() and the latency grows. To fix this, I shorten the FPU off region and ensure blkcipher_walk_done() is called with preemption enabled. This might hurt the performance because we now enable/disable the FPU state more often but we gain lower latency and the bug is gone. Reported-by: Don Estabrook Signed-off-by: Sebastian Andrzej Siewior --- arch/x86/crypto/cast5_avx_glue.c | 21 +++++++++------------ arch/x86/crypto/glue_helper.c | 31 +++++++++++++++---------------- 2 files changed, 24 insertions(+), 28 deletions(-) --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fp static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, bool enc) { - bool fpu_enabled = false; + bool fpu_enabled; struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = CAST5_BLOCK_SIZE; unsigned int nbytes; @@ -73,7 +73,7 @@ static int ecb_crypt(struct blkcipher_de u8 *wsrc = walk->src.virt.addr; u8 *wdst = walk->dst.virt.addr; - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = cast5_fpu_begin(false, nbytes); /* Process multi-block batch */ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { @@ -102,10 +102,9 @@ static int ecb_crypt(struct blkcipher_de } while (nbytes >= bsize); done: + cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, walk, nbytes); } - - cast5_fpu_end(fpu_enabled); return err; } @@ -226,7 +225,7 @@ static unsigned int __cbc_decrypt(struct static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -235,12 +234,11 @@ static int cbc_decrypt(struct blkcipher_ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = cast5_fpu_begin(false, nbytes); nbytes = __cbc_decrypt(desc, &walk); + cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - - cast5_fpu_end(fpu_enabled); return err; } @@ -309,7 +307,7 @@ static unsigned int __ctr_crypt(struct b static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -318,13 +316,12 @@ static int ctr_crypt(struct blkcipher_de desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = cast5_fpu_begin(false, nbytes); nbytes = __ctr_crypt(desc, &walk); + cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - cast5_fpu_end(fpu_enabled); - if (walk.nbytes) { ctr_crypt_final(desc, &walk); err = blkcipher_walk_done(desc, &walk, 0); --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -40,7 +40,7 @@ static int __glue_ecb_crypt_128bit(const void *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = 128 / 8; unsigned int nbytes, i, func_bytes; - bool fpu_enabled = false; + bool fpu_enabled; int err; err = blkcipher_walk_virt(desc, walk); @@ -50,7 +50,7 @@ static int __glue_ecb_crypt_128bit(const u8 *wdst = walk->dst.virt.addr; fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); + desc, false, nbytes); for (i = 0; i < gctx->num_funcs; i++) { func_bytes = bsize * gctx->funcs[i].num_blocks; @@ -72,10 +72,10 @@ static int __glue_ecb_crypt_128bit(const } done: + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, walk, nbytes); } - glue_fpu_end(fpu_enabled); return err; } @@ -192,7 +192,7 @@ int glue_cbc_decrypt_128bit(const struct struct scatterlist *src, unsigned int nbytes) { const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -201,12 +201,12 @@ int glue_cbc_decrypt_128bit(const struct while ((nbytes = walk.nbytes)) { fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); + desc, false, nbytes); nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - glue_fpu_end(fpu_enabled); return err; } EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); @@ -275,7 +275,7 @@ int glue_ctr_crypt_128bit(const struct c struct scatterlist *src, unsigned int nbytes) { const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -284,13 +284,12 @@ int glue_ctr_crypt_128bit(const struct c while ((nbytes = walk.nbytes) >= bsize) { fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); + desc, false, nbytes); nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - glue_fpu_end(fpu_enabled); - if (walk.nbytes) { glue_ctr_crypt_final_128bit( gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); @@ -380,7 +379,7 @@ int glue_xts_crypt_128bit(const struct c void *tweak_ctx, void *crypt_ctx) { const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -393,21 +392,21 @@ int glue_xts_crypt_128bit(const struct c /* set minimum length to bsize, for tweak_fn */ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, + desc, false, nbytes < bsize ? bsize : nbytes); - /* calculate first value of T */ tweak_fn(tweak_ctx, walk.iv, walk.iv); + glue_fpu_end(fpu_enabled); while (nbytes) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, false, nbytes); nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); nbytes = walk.nbytes; } - - glue_fpu_end(fpu_enabled); - return err; } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);