diff -up openssl-1.0.1e/crypto/bn/asm/x86_64-mont5.pl.rsa-const openssl-1.0.1e/crypto/bn/asm/x86_64-mont5.pl --- openssl-1.0.1e/crypto/bn/asm/x86_64-mont5.pl.rsa-const 2016-01-14 17:38:50.127212987 +0100 +++ openssl-1.0.1e/crypto/bn/asm/x86_64-mont5.pl 2016-02-24 12:03:28.180178677 +0100 @@ -66,60 +66,113 @@ bn_mul_mont_gather5: .align 16 .Lmul_enter: mov ${num}d,${num}d - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument + lea .Linc(%rip),%r10 push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) + .Lmul_alloca: -___ -$code.=<<___; mov %rsp,%rax lea 2($num),%r11 neg %r11 - lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) + lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) and \$-1024,%rsp # minimize TLB usage mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul_body: - mov $bp,%r12 # reassign $bp + lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 + lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) + and \$-16,%r10 + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + .byte 0x67 + movdqa %xmm4,%xmm3 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($k+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($k+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($k+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($k+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($k+0)+112`(%r10) + + paddd %xmm2,%xmm3 + .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($k+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($k+2)+112`(%r10) + pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($k+1)-128`($bp),%xmm1 + pand `16*($k+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($k+3)+112`(%r10) + pand `16*($k+3)-128`($bp),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm4 + movdqa `16*($k+1)-128`($bp),%xmm5 + movdqa `16*($k+2)-128`($bp),%xmm2 + pand `16*($k+0)+112`(%r10),%xmm4 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($k+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($k+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - movq %xmm0,$m0 # m0=bp[0] mov ($n0),$n0 # pull n0[0] value @@ -128,29 +181,14 @@ $code.=<<___; xor $i,$i # i=0 xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -183,8 +221,6 @@ $code.=<<___; cmp $num,$j jne .L1st - movq %xmm0,$m0 # bp[1] - add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx @@ -204,33 +240,46 @@ $code.=<<___; jmp .Louter .align 16 .Louter: + lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) + and \$-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($k=0;$k<$STRIDE/16;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm0 + movdqa `16*($k+1)-128`($bp),%xmm1 + movdqa `16*($k+2)-128`($bp),%xmm2 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+0)-128`(%rdx),%xmm0 + pand `16*($k+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($k+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($k+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -266,8 +315,6 @@ $code.=<<___; cmp $num,$j jne .Linner - movq %xmm0,$m0 # bp[i+1] - add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx @@ -321,13 +368,7 @@ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps (%rsi),%xmm6 - movaps 0x10(%rsi),%xmm7 - lea 0x28(%rsi),%rsi -___ -$code.=<<___; + mov (%rsi),%r15 mov 8(%rsi),%r14 mov 16(%rsi),%r13 @@ -348,91 +389,130 @@ $code.=<<___; bn_mul4x_mont_gather5: .Lmul4x_enter: mov ${num}d,${num}d - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument + lea .Linc(%rip),%r10 push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) + .Lmul4x_alloca: -___ -$code.=<<___; mov %rsp,%rax lea 4($num),%r11 neg %r11 - lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) + lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256) and \$-1024,%rsp # minimize TLB usage mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp - mov %rdx,%r12 # reassign $bp + lea 128(%rdx),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 + lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization) + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67,0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + .byte 0x67 + movdqa %xmm4,%xmm3 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($k+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($k+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($k+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($k+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($k+0)+112`(%r10) + + paddd %xmm2,%xmm3 + .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($k+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($k+2)+112`(%r10) + pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($k+1)-128`($bp),%xmm1 + pand `16*($k+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($k+3)+112`(%r10) + pand `16*($k+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm4 + movdqa `16*($k+1)-128`($bp),%xmm5 + movdqa `16*($k+2)-128`($bp),%xmm2 + pand `16*($k+0)+112`(%r10),%xmm4 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($k+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($k+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - movq %xmm0,$m0 # m0=bp[0] + mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $A[0],$m1 # "tp[0]"*n0 mov %rdx,$A[1] - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap),%rax @@ -550,8 +630,6 @@ $code.=<<___; mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[1] - xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] @@ -561,12 +639,34 @@ $code.=<<___; lea 1($i),$i # i++ .align 4 .Louter4x: + lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($k=0;$k<$STRIDE/16;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm0 + movdqa `16*($k+1)-128`($bp),%xmm1 + movdqa `16*($k+2)-128`($bp),%xmm2 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+0)-128`(%rdx),%xmm0 + pand `16*($k+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($k+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($k+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 mov (%rsp),$A[0] mov $n0,$m1 @@ -575,18 +675,9 @@ $code.=<<___; mov ($np),%rax adc \$0,%rdx - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $A[0],$m1 # tp[0]*n0 mov %rdx,$A[1] - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded mov 8($ap),%rax @@ -718,7 +809,6 @@ $code.=<<___; mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[i+1] mov $N[1],-16(%rsp,$j,8) # tp[j-1] xor $N[1],$N[1] @@ -809,13 +899,7 @@ ___ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps (%rsi),%xmm6 - movaps 0x10(%rsi),%xmm7 - lea 0x28(%rsi),%rsi -___ -$code.=<<___; + mov (%rsi),%r15 mov 8(%rsi),%r14 mov 16(%rsi),%r13 @@ -830,8 +914,8 @@ ___ }}} { -my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order +my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order + ("%rdi","%rsi","%rdx","%ecx"); # Unix order my $out=$inp; my $STRIDE=2**5*8; my $N=$STRIDE/4; @@ -859,53 +943,89 @@ bn_scatter5: .type bn_gather5,\@abi-omnipotent .align 16 bn_gather5: -___ -$code.=<<___ if ($win64); -.LSEH_begin_bn_gather5: +.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) - .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) -___ -$code.=<<___; - mov $idx,%r11 - shr \$`log($N/8)/log(2)`,$idx - and \$`$N/8-1`,%r11 - not $idx - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,$idx # 5 is "window size" - lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line - movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which - movq 8(%rax,$idx,8),%xmm5 # cache line contains element - movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument - movq 24(%rax,$idx,8),%xmm7 + .byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp + lea .Linc(%rip),%rax + and \$-16,%rsp # shouldn't be formally required + + movd $idx,%xmm5 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 128($tbl),%r11 # size optimization + lea 128(%rsp),%rax # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast $idx + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to $idx and save result to stack +# +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 +___ +$code.=<<___ if ($i); + movdqa %xmm3,`16*($i-1)-128`(%rax) +___ +$code.=<<___; + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)-128`(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)-128`(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)-128`(%rax) + movdqa %xmm4,%xmm2 +___ +} +$code.=<<___; + movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather -.align 16 -.Lgather: - movq `0*$STRIDE/4-96`($tbl),%xmm0 - movq `1*$STRIDE/4-96`($tbl),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($tbl),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($tbl),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 - lea $STRIDE($tbl),$tbl - por %xmm3,%xmm0 +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`(%r11),%xmm0 + movdqa `16*($i+1)-128`(%r11),%xmm1 + movdqa `16*($i+2)-128`(%r11),%xmm2 + pand `16*($i+0)-128`(%rax),%xmm0 + movdqa `16*($i+3)-128`(%r11),%xmm3 + pand `16*($i+1)-128`(%rax),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rax),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + lea $STRIDE(%r11),%r11 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather -___ -$code.=<<___ if ($win64); - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - lea 0x28(%rsp),%rsp -___ -$code.=<<___; + + lea (%r10),%rsp ret .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 @@ -913,9 +1033,9 @@ ___ } $code.=<<___; .align 64 -.Lmagic_masks: - .long 0,0, 0,0, 0,0, -1,-1 - .long 0,0, 0,0, 0,0, 0,0 +.Linc: + .long 0,0, 1,1 + .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " ___ @@ -954,7 +1074,7 @@ mul_handler: cmp %r10,%rbx # context->RipR13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 - movups %xmm0,512($context) # restore context->Xmm6 - movups %xmm1,528($context) # restore context->Xmm7 .Lcommon_seh_tail: mov 8(%rax),%rdi @@ -1057,10 +1173,9 @@ mul_handler: .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_gather5: - .byte 0x01,0x0d,0x05,0x00 - .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 - .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 + .byte 0x01,0x0b,0x03,0x0a + .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 + .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10 .align 8 ___ } diff -up openssl-1.0.1e/crypto/bn/bn_exp.c.rsa-const openssl-1.0.1e/crypto/bn/bn_exp.c --- openssl-1.0.1e/crypto/bn/bn_exp.c.rsa-const 2013-02-11 16:26:04.000000000 +0100 +++ openssl-1.0.1e/crypto/bn/bn_exp.c 2016-02-24 12:00:20.807856735 +0100 @@ -111,6 +111,7 @@ #include "cryptlib.h" +#include "constant_time_locl.h" #include "bn_lcl.h" #include @@ -534,30 +535,74 @@ err: * as cache lines are concerned. The following functions are used to transfer a BIGNUM * from/to that table. */ -static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width) +static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int window) { - size_t i, j; + int i, j; + int width = 1 << window; + BN_ULONG *table = (BN_ULONG *)buf; if (top > b->top) top = b->top; /* this works because 'buf' is explicitly zeroed */ - for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width) + for (i = 0, j=idx; i < top; i++, j+=width) { - buf[j] = ((unsigned char*)b->d)[i]; + table[j] = b->d[i]; } return 1; } -static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width) +static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int window) { - size_t i, j; + int i, j; + int width = 1 << window; + volatile BN_ULONG *table = (volatile BN_ULONG *)buf; if (bn_wexpand(b, top) == NULL) return 0; - for (i=0, j=idx; i < top * sizeof b->d[0]; i++, j+=width) + if (window <= 3) { - ((unsigned char*)b->d)[i] = buf[j]; + for (i = 0; i < top; i++, table += width) + { + BN_ULONG acc = 0; + + for (j = 0; j < width; j++) + { + acc |= table[j] & + ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); + } + + b->d[i] = acc; + } + } + else + { + int xstride = 1 << (window - 2); + BN_ULONG y0, y1, y2, y3; + + i = idx >> (window - 2); /* equivalent of idx / xstride */ + idx &= xstride - 1; /* equivalent of idx % xstride */ + + y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1); + y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1); + y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1); + y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1); + + for (i = 0; i < top; i++, table += width) + { + BN_ULONG acc = 0; + + for (j = 0; j < xstride; j++) + { + acc |= ( (table[j + 0 * xstride] & y0) | + (table[j + 1 * xstride] & y1) | + (table[j + 2 * xstride] & y2) | + (table[j + 3 * xstride] & y3) ) + & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); + } + + b->d[i] = acc; + } } b->top = top; @@ -592,17 +637,27 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr bn_check_top(p); bn_check_top(m); - top = m->top; - - if (!(m->d[0] & 1)) + if (!BN_is_odd(m)) { BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,BN_R_CALLED_WITH_EVEN_MODULUS); return(0); } + + top = m->top; + bits=BN_num_bits(p); if (bits == 0) { - ret = BN_one(rr); + /* x**0 mod 1 is still zero. */ + if (BN_is_one(m)) + { + ret = 1; + BN_zero(rr); + } + else + { + ret = BN_one(rr); + } return ret; } @@ -680,7 +735,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as * 512-bit RSA is hardly relevant, we omit it to spare size... */ - if (window==5) + if (window==5 && top>1) { void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap, const void *table,const BN_ULONG *np, @@ -767,8 +822,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr else #endif { - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err; + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window)) goto err; + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window)) goto err; /* If the window size is greater than 1, then calculate * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) @@ -778,20 +833,20 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr if (window > 1) { if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err; + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, window)) goto err; for (i=3; i=0; i--,bits--) wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); - if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err; + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,window)) goto err; /* Scan the exponent one window at a time starting from the most * significant bits. @@ -808,7 +863,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr } /* Fetch the appropriate pre-computed value from the pre-buf */ - if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err; + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, window)) goto err; /* Multiply the result into the intermediate result */ if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err; @@ -874,7 +929,16 @@ int BN_mod_exp_mont_word(BIGNUM *rr, BN_ bits = BN_num_bits(p); if (bits == 0) { - ret = BN_one(rr); + /* x**0 mod 1 is still zero. */ + if (BN_is_one(m)) + { + ret = 1; + BN_zero(rr); + } + else + { + ret = BN_one(rr); + } return ret; } if (a == 0) @@ -997,10 +1061,18 @@ int BN_mod_exp_simple(BIGNUM *r, const B } bits=BN_num_bits(p); - if (bits == 0) { - ret = BN_one(r); + /* x**0 mod 1 is still zero. */ + if (BN_is_one(m)) + { + ret = 1; + BN_zero(r); + } + else + { + ret = BN_one(r); + } return ret; } diff -up openssl-1.0.1e/crypto/constant_time_locl.h.rsa-const openssl-1.0.1e/crypto/constant_time_locl.h --- openssl-1.0.1e/crypto/constant_time_locl.h.rsa-const 2016-02-24 11:29:44.635303119 +0100 +++ openssl-1.0.1e/crypto/constant_time_locl.h 2016-02-24 11:27:44.000000000 +0100 @@ -0,0 +1,211 @@ +/* crypto/constant_time_locl.h */ +/*- + * Utilities for constant-time cryptography. + * + * Author: Emilia Kasper (emilia@openssl.org) + * Based on previous work by Bodo Moeller, Emilia Kasper, Adam Langley + * (Google). + * ==================================================================== + * Copyright (c) 2014 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#ifndef HEADER_CONSTANT_TIME_LOCL_H +# define HEADER_CONSTANT_TIME_LOCL_H + +# include "e_os.h" /* For 'inline' */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*- + * The boolean methods return a bitmask of all ones (0xff...f) for true + * and 0 for false. This is useful for choosing a value based on the result + * of a conditional in constant time. For example, + * + * if (a < b) { + * c = a; + * } else { + * c = b; + * } + * + * can be written as + * + * unsigned int lt = constant_time_lt(a, b); + * c = constant_time_select(lt, a, b); + */ + +/* + * Returns the given value with the MSB copied to all the other + * bits. Uses the fact that arithmetic shift shifts-in the sign bit. + * However, this is not ensured by the C standard so you may need to + * replace this with something else on odd CPUs. + */ +static inline unsigned int constant_time_msb(unsigned int a); + +/* + * Returns 0xff..f if a < b and 0 otherwise. + */ +static inline unsigned int constant_time_lt(unsigned int a, unsigned int b); +/* Convenience method for getting an 8-bit mask. */ +static inline unsigned char constant_time_lt_8(unsigned int a, + unsigned int b); + +/* + * Returns 0xff..f if a >= b and 0 otherwise. + */ +static inline unsigned int constant_time_ge(unsigned int a, unsigned int b); +/* Convenience method for getting an 8-bit mask. */ +static inline unsigned char constant_time_ge_8(unsigned int a, + unsigned int b); + +/* + * Returns 0xff..f if a == 0 and 0 otherwise. + */ +static inline unsigned int constant_time_is_zero(unsigned int a); +/* Convenience method for getting an 8-bit mask. */ +static inline unsigned char constant_time_is_zero_8(unsigned int a); + +/* + * Returns 0xff..f if a == b and 0 otherwise. + */ +static inline unsigned int constant_time_eq(unsigned int a, unsigned int b); +/* Convenience method for getting an 8-bit mask. */ +static inline unsigned char constant_time_eq_8(unsigned int a, + unsigned int b); +/* Signed integers. */ +static inline unsigned int constant_time_eq_int(int a, int b); +/* Convenience method for getting an 8-bit mask. */ +static inline unsigned char constant_time_eq_int_8(int a, int b); + +/*- + * Returns (mask & a) | (~mask & b). + * + * When |mask| is all 1s or all 0s (as returned by the methods above), + * the select methods return either |a| (if |mask| is nonzero) or |b| + * (if |mask| is zero). + */ +static inline unsigned int constant_time_select(unsigned int mask, + unsigned int a, + unsigned int b); +/* Convenience method for unsigned chars. */ +static inline unsigned char constant_time_select_8(unsigned char mask, + unsigned char a, + unsigned char b); +/* Convenience method for signed integers. */ +static inline int constant_time_select_int(unsigned int mask, int a, int b); + +static inline unsigned int constant_time_msb(unsigned int a) +{ + return 0 - (a >> (sizeof(a) * 8 - 1)); +} + +static inline unsigned int constant_time_lt(unsigned int a, unsigned int b) +{ + return constant_time_msb(a ^ ((a ^ b) | ((a - b) ^ b))); +} + +static inline unsigned char constant_time_lt_8(unsigned int a, unsigned int b) +{ + return (unsigned char)(constant_time_lt(a, b)); +} + +static inline unsigned int constant_time_ge(unsigned int a, unsigned int b) +{ + return ~constant_time_lt(a, b); +} + +static inline unsigned char constant_time_ge_8(unsigned int a, unsigned int b) +{ + return (unsigned char)(constant_time_ge(a, b)); +} + +static inline unsigned int constant_time_is_zero(unsigned int a) +{ + return constant_time_msb(~a & (a - 1)); +} + +static inline unsigned char constant_time_is_zero_8(unsigned int a) +{ + return (unsigned char)(constant_time_is_zero(a)); +} + +static inline unsigned int constant_time_eq(unsigned int a, unsigned int b) +{ + return constant_time_is_zero(a ^ b); +} + +static inline unsigned char constant_time_eq_8(unsigned int a, unsigned int b) +{ + return (unsigned char)(constant_time_eq(a, b)); +} + +static inline unsigned int constant_time_eq_int(int a, int b) +{ + return constant_time_eq((unsigned)(a), (unsigned)(b)); +} + +static inline unsigned char constant_time_eq_int_8(int a, int b) +{ + return constant_time_eq_8((unsigned)(a), (unsigned)(b)); +} + +static inline unsigned int constant_time_select(unsigned int mask, + unsigned int a, + unsigned int b) +{ + return (mask & a) | (~mask & b); +} + +static inline unsigned char constant_time_select_8(unsigned char mask, + unsigned char a, + unsigned char b) +{ + return (unsigned char)(constant_time_select(mask, a, b)); +} + +static inline int constant_time_select_int(unsigned int mask, int a, int b) +{ + return (int)(constant_time_select(mask, (unsigned)(a), (unsigned)(b))); +} + +#ifdef __cplusplus +} +#endif + +#endif /* HEADER_CONSTANT_TIME_LOCL_H */ diff -up openssl-1.0.1e/crypto/perlasm/x86_64-xlate.pl.rsa-const openssl-1.0.1e/crypto/perlasm/x86_64-xlate.pl --- openssl-1.0.1e/crypto/perlasm/x86_64-xlate.pl.rsa-const 2016-01-14 17:38:50.121212850 +0100 +++ openssl-1.0.1e/crypto/perlasm/x86_64-xlate.pl 2016-02-24 11:26:40.316912890 +0100 @@ -121,7 +121,7 @@ my %globals; $self->{sz} = ""; } elsif ($self->{op} =~ /^v/) { # VEX $self->{sz} = ""; - } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) { + } elsif ($self->{op} =~ /mov[dq]/ && $line =~ /%xmm/) { $self->{sz} = ""; } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { $self->{op} = $1;