ext-libressl-portable/crypto/bn/arch/amd64/bignum_sqr.S

// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Square z := x^2
// Input x[n]; output z[k]
//
//    extern void bignum_sqr
//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
//
// Does the "z := x^2" operation where x is n digits and result z is k.
// Truncates the result in general unless k >= 2 * n
//
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x
// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = n, R9 = x
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)
        .text

// First three are where arguments come in, but n is moved.

#define p rdi
#define z rsi
#define x rcx
#define n r8

// These are always local scratch since multiplier result is in these

#define a rax
#define d rdx

// Other variables

#define i rbx
#define ll rbp
#define hh r9
#define k r10
#define y r11
#define htop r12
#define l r13
#define h r14
#define c r15

// Short versions

#define llshort ebp

S2N_BN_SYMBOL(bignum_sqr):
	endbr64

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
#endif

// We use too many registers, and also we need rax:rdx for multiplications

        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        mov     n, rdx

// If p = 0 the result is trivial and nothing needs doing

        test    p, p
        jz      end

// initialize (hh,ll) = 0

        xor     llshort, llshort
        xor     hh, hh

// Iterate outer loop from k = 0 ... k = p - 1 producing result digits

        xor     k, k

outerloop:

// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
// We want to accumulate all x[i] * x[k - i] for bot <= i < top
// For the optimization of squaring we avoid duplication and do
// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n
// Initialize i = bot; in fact just compute bot as i directly.

        xor     c, c
        lea     i, [k+1]
        mov     htop, i
        shr     htop, 1
        sub     i, n
        cmovc   i, c
        cmp     htop, n
        cmovnc  htop, n

// Initialize the three-part local sum (c,h,l); c was already done above

        xor     l, l
        xor     h, h

// If htop <= bot then main doubled part of the sum is empty

        cmp     i, htop
        jnc     nosumming

// Use a moving pointer for [y] = x[k-i] for the cofactor

        mov     a, k
        sub     a, i
        lea     y, [x+8*a]

// Do the main part of the sum x[i] * x[k - i] for 2 * i < k

innerloop:
        mov     a, [x+8*i]
        mul     QWORD PTR [y]
        add     l, a
        adc     h, d
        adc     c, 0
        sub     y, 8
        inc     i
        cmp     i, htop
        jc      innerloop

// Now double it

        add     l, l
        adc     h, h
        adc     c, c

// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term

nosumming:
        test    k, 1
        jnz     innerend
        cmp     i, n
        jnc     innerend

        mov     a, [x+8*i]
        mul     a
        add     l, a
        adc     h, d
        adc     c, 0

// Now add the local sum into the global sum, store and shift

innerend:
        add     l, ll
        mov     [z+8*k], l
        adc     h, hh
        mov     ll, h
        adc     c, 0
        mov     hh, c

        inc     k
        cmp     k, p
        jc      outerloop

// Restore registers and return

end:
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
libressl: Update to 3.8.0 2023-06-04 16:48:43 -07:00			`// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.`
			`//`
			`// Permission to use, copy, modify, and/or distribute this software for any`
			`// purpose with or without fee is hereby granted, provided that the above`
			`// copyright notice and this permission notice appear in all copies.`
			`//`
			`// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES`
			`// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF`
			`// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR`
			`// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES`
			`// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN`
			`// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF`
			`// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.`

			`// ----------------------------------------------------------------------------`
			`// Square z := x^2`
			`// Input x[n]; output z[k]`
			`//`
			`// extern void bignum_sqr`
			`// (uint64_t k, uint64_t z, uint64_t n, uint64_t x);`
			`//`
			`// Does the "z := x^2" operation where x is n digits and result z is k.`
			`// Truncates the result in general unless k >= 2 * n`
			`//`
			`// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x`
			`// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x`
			`// ----------------------------------------------------------------------------`

			`#include "s2n_bignum_internal.h"`

			`.intel_syntax noprefix`
			`S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)`
			`S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)`
			`.text`

			`// First three are where arguments come in, but n is moved.`

			`#define p rdi`
			`#define z rsi`
			`#define x rcx`
			`#define n r8`

			`// These are always local scratch since multiplier result is in these`

			`#define a rax`
			`#define d rdx`

			`// Other variables`

			`#define i rbx`
			`#define ll rbp`
			`#define hh r9`
			`#define k r10`
			`#define y r11`
			`#define htop r12`
			`#define l r13`
			`#define h r14`
			`#define c r15`

			`// Short versions`

			`#define llshort ebp`

			`S2N_BN_SYMBOL(bignum_sqr):`
			`endbr64`

			`#if WINDOWS_ABI`
			`push rdi`
			`push rsi`
			`mov rdi, rcx`
			`mov rsi, rdx`
			`mov rdx, r8`
			`mov rcx, r9`
			`#endif`

			`// We use too many registers, and also we need rax:rdx for multiplications`

			`push rbx`
			`push rbp`
			`push r12`
			`push r13`
			`push r14`
			`push r15`
			`mov n, rdx`

			`// If p = 0 the result is trivial and nothing needs doing`

			`test p, p`
			`jz end`

			`// initialize (hh,ll) = 0`

			`xor llshort, llshort`
			`xor hh, hh`

			`// Iterate outer loop from k = 0 ... k = p - 1 producing result digits`

			`xor k, k`

			`outerloop:`

			`// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n`
			`// We want to accumulate all x[i] * x[k - i] for bot <= i < top`
			`// For the optimization of squaring we avoid duplication and do`
			`// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n`
			`// Initialize i = bot; in fact just compute bot as i directly.`

			`xor c, c`
			`lea i, [k+1]`
			`mov htop, i`
			`shr htop, 1`
			`sub i, n`
			`cmovc i, c`
			`cmp htop, n`
			`cmovnc htop, n`

			`// Initialize the three-part local sum (c,h,l); c was already done above`

			`xor l, l`
			`xor h, h`

			`// If htop <= bot then main doubled part of the sum is empty`

			`cmp i, htop`
			`jnc nosumming`

			`// Use a moving pointer for [y] = x[k-i] for the cofactor`

			`mov a, k`
			`sub a, i`
			`lea y, [x+8*a]`

			`// Do the main part of the sum x[i] * x[k - i] for 2 * i < k`

			`innerloop:`
			`mov a, [x+8*i]`
			`mul QWORD PTR [y]`
			`add l, a`
			`adc h, d`
			`adc c, 0`
			`sub y, 8`
			`inc i`
			`cmp i, htop`
			`jc innerloop`

			`// Now double it`

			`add l, l`
			`adc h, h`
			`adc c, c`

			`// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term`

			`nosumming:`
			`test k, 1`
			`jnz innerend`
			`cmp i, n`
			`jnc innerend`

			`mov a, [x+8*i]`
			`mul a`
			`add l, a`
			`adc h, d`
			`adc c, 0`

			`// Now add the local sum into the global sum, store and shift`

			`innerend:`
			`add l, ll`
			`mov [z+8*k], l`
			`adc h, hh`
			`mov ll, h`
			`adc c, 0`
			`mov hh, c`

			`inc k`
			`cmp k, p`
			`jc outerloop`

			`// Restore registers and return`

			`end:`
			`pop r15`
			`pop r14`
			`pop r13`
			`pop r12`
			`pop rbp`
			`pop rbx`
			`#if WINDOWS_ABI`
			`pop rsi`
			`pop rdi`
			`#endif`
			`ret`

			`#if defined(__linux__) && defined(__ELF__)`
			`.section .note.GNU-stack,"",%progbits`
			`#endif`