/*
 * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, Intel Corporation. All rights reserved.
 * Intel Math Library (LIBM) Source Code
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.amd64.AMD64.r11;
import static jdk.vm.ci.amd64.AMD64.rax;
import static jdk.vm.ci.amd64.AMD64.rcx;
import static jdk.vm.ci.amd64.AMD64.rdx;
import static jdk.vm.ci.amd64.AMD64.rsp;
import static jdk.vm.ci.amd64.AMD64.xmm0;
import static jdk.vm.ci.amd64.AMD64.xmm1;
import static jdk.vm.ci.amd64.AMD64.xmm2;
import static jdk.vm.ci.amd64.AMD64.xmm3;
import static jdk.vm.ci.amd64.AMD64.xmm4;
import static jdk.vm.ci.amd64.AMD64.xmm5;
import static jdk.vm.ci.amd64.AMD64.xmm6;
import static jdk.vm.ci.amd64.AMD64.xmm7;
import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.pointerConstant;
import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.recordExternalAddress;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;

                    ALGORITHM DESCRIPTION - EXP()
                    ---------------------
Description:
 Let K = 64 (table size).
       x    x/log(2)     n
      e  = 2          = 2 * T[j] * (1 + P(y))
 where
      x = m*log(2)/K + y,    y in [-log(2)/K..log(2)/K]
      m = n*K + j,           m,n,j - signed integer, j in [-K/2..K/2]
                 j/K
      values of 2   are tabulated as T[j] = T_hi[j] ( 1 + T_lo[j]).
      P(y) is a minimax polynomial approximation of exp(x)-1
      on small interval [-log(2)/K..log(2)/K] (were calculated by Maple V).
 To avoid problems with arithmetic overflow and underflow,
           n                        n1  n2
 value of 2  is safely computed as 2 * 2 where n1 in [-BIAS/2..BIAS/2]
 where BIAS is a value of exponent bias.
Special cases:
 exp(NaN) = NaN
 exp(+INF) = +INF
 exp(-INF) = 0
 exp(x) = 1 for subnormals
 for finite argument, only exp(0)=1 is exact
 For IEEE double
   if x >  709.782712893383973096 then exp(x) overflow
   if x < -745.133219101941108420 then exp(x) underflow
/** * <pre> * ALGORITHM DESCRIPTION - EXP() * --------------------- * * Description: * Let K = 64 (table size). * x x/log(2) n * e = 2 = 2 * T[j] * (1 + P(y)) * where * x = m*log(2)/K + y, y in [-log(2)/K..log(2)/K] * m = n*K + j, m,n,j - signed integer, j in [-K/2..K/2] * j/K * values of 2 are tabulated as T[j] = T_hi[j] ( 1 + T_lo[j]). * * P(y) is a minimax polynomial approximation of exp(x)-1 * on small interval [-log(2)/K..log(2)/K] (were calculated by Maple V). * * To avoid problems with arithmetic overflow and underflow, * n n1 n2 * value of 2 is safely computed as 2 * 2 where n1 in [-BIAS/2..BIAS/2] * where BIAS is a value of exponent bias. * * Special cases: * exp(NaN) = NaN * exp(+INF) = +INF * exp(-INF) = 0 * exp(x) = 1 for subnormals * for finite argument, only exp(0)=1 is exact * For IEEE double * if x > 709.782712893383973096 then exp(x) overflow * if x < -745.133219101941108420 then exp(x) underflow * </pre> */
public final class AMD64MathExpOp extends AMD64MathIntrinsicUnaryOp { public static final LIRInstructionClass<AMD64MathExpOp> TYPE = LIRInstructionClass.create(AMD64MathExpOp.class); public AMD64MathExpOp() { super(TYPE, /* GPR */ rax, rcx, rdx, r11, /* XMM */ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); } private ArrayDataPointerConstant cv = pointerConstant(16, new int[]{ // @formatter:off 0x652b82fe, 0x40571547, 0x652b82fe, 0x40571547, }); private ArrayDataPointerConstant cv16 = pointerConstant(16, new int[]{ 0xfefa0000, 0x3f862e42, 0xfefa0000, 0x3f862e42, }); private ArrayDataPointerConstant cv32 = pointerConstant(16, new int[]{ 0xbc9e3b3a, 0x3d1cf79a, 0xbc9e3b3a, 0x3d1cf79a, }); private ArrayDataPointerConstant cv48 = pointerConstant(16, new int[]{ 0xfffffffe, 0x3fdfffff, 0xfffffffe, 0x3fdfffff, }); private ArrayDataPointerConstant cv64 = pointerConstant(16, new int[]{ 0xe3289860, 0x3f56c15c, 0x555b9e25, 0x3fa55555, }); private ArrayDataPointerConstant cv80 = pointerConstant(16, new int[]{ 0xc090cf0f, 0x3f811115, 0x55548ba1, 0x3fc55555 // @formatter:on }); private ArrayDataPointerConstant shifter = pointerConstant(16, new int[]{ // @formatter:off 0x00000000, 0x43380000, 0x00000000, 0x43380000 // @formatter:on }); private ArrayDataPointerConstant mmask = pointerConstant(16, new int[]{ // @formatter:off 0xffffffc0, 0x00000000, 0xffffffc0, 0x00000000 // @formatter:on }); private ArrayDataPointerConstant bias = pointerConstant(16, new int[]{ // @formatter:off 0x0000ffc0, 0x00000000, 0x0000ffc0, 0x00000000 // @formatter:on }); private ArrayDataPointerConstant tblAddr = pointerConstant(16, new int[]{ // @formatter:off 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0e03754d, 0x3cad7bbf, 0x3e778060, 0x00002c9a, 0x3567f613, 0x3c8cd252, 0xd3158574, 0x000059b0, 0x61e6c861, 0x3c60f74e, 0x18759bc8, 0x00008745, 0x5d837b6c, 0x3c979aa6, 0x6cf9890f, 0x0000b558, 0x702f9cd1, 0x3c3ebe3d, 0x32d3d1a2, 0x0000e3ec, 0x1e63bcd8, 0x3ca3516e, 0xd0125b50, 0x00011301, 0x26f0387b, 0x3ca4c554, 0xaea92ddf, 0x0001429a, 0x62523fb6, 0x3ca95153, 0x3c7d517a, 0x000172b8, 0x3f1353bf, 0x3c8b898c, 0xeb6fcb75, 0x0001a35b, 0x3e3a2f5f, 0x3c9aecf7, 0x3168b9aa, 0x0001d487, 0x44a6c38d, 0x3c8a6f41, 0x88628cd6, 0x0002063b, 0xe3a8a894, 0x3c968efd, 0x6e756238, 0x0002387a, 0x981fe7f2, 0x3c80472b, 0x65e27cdd, 0x00026b45, 0x6d09ab31, 0x3c82f7e1, 0xf51fdee1, 0x00029e9d, 0x720c0ab3, 0x3c8b3782, 0xa6e4030b, 0x0002d285, 0x4db0abb6, 0x3c834d75, 0x0a31b715, 0x000306fe, 0x5dd3f84a, 0x3c8fdd39, 0xb26416ff, 0x00033c08, 0xcc187d29, 0x3ca12f8c, 0x373aa9ca, 0x000371a7, 0x738b5e8b, 0x3ca7d229, 0x34e59ff6, 0x0003a7db, 0xa72a4c6d, 0x3c859f48, 0x4c123422, 0x0003dea6, 0x259d9205, 0x3ca8b846, 0x21f72e29, 0x0004160a, 0x60c2ac12, 0x3c4363ed, 0x6061892d, 0x00044e08, 0xdaa10379, 0x3c6ecce1, 0xb5c13cd0, 0x000486a2, 0xbb7aafb0, 0x3c7690ce, 0xd5362a27, 0x0004bfda, 0x9b282a09, 0x3ca083cc, 0x769d2ca6, 0x0004f9b2, 0xc1aae707, 0x3ca509b0, 0x569d4f81, 0x0005342b, 0x18fdd78e, 0x3c933505, 0x36b527da, 0x00056f47, 0xe21c5409, 0x3c9063e1, 0xdd485429, 0x0005ab07, 0x2b64c035, 0x3c9432e6, 0x15ad2148, 0x0005e76f, 0x99f08c0a, 0x3ca01284, 0xb03a5584, 0x0006247e, 0x0073dc06, 0x3c99f087, 0x82552224, 0x00066238, 0x0da05571, 0x3c998d4d, 0x667f3bcc, 0x0006a09e, 0x86ce4786, 0x3ca52bb9, 0x3c651a2e, 0x0006dfb2, 0x206f0dab, 0x3ca32092, 0xe8ec5f73, 0x00071f75, 0x8e17a7a6, 0x3ca06122, 0x564267c8, 0x00075feb, 0x461e9f86, 0x3ca244ac, 0x73eb0186, 0x0007a114, 0xabd66c55, 0x3c65ebe1, 0x36cf4e62, 0x0007e2f3, 0xbbff67d0, 0x3c96fe9f, 0x994cce12, 0x00082589, 0x14c801df, 0x3c951f14, 0x9b4492ec, 0x000868d9, 0xc1f0eab4, 0x3c8db72f, 0x422aa0db, 0x0008ace5, 0x59f35f44, 0x3c7bf683, 0x99157736, 0x0008f1ae, 0x9c06283c, 0x3ca360ba, 0xb0cdc5e4, 0x00093737, 0x20f962aa, 0x3c95e8d1, 0x9fde4e4f, 0x00097d82, 0x2b91ce27, 0x3c71affc, 0x82a3f090, 0x0009c491, 0x589a2ebd, 0x3c9b6d34, 0x7b5de564, 0x000a0c66, 0x9ab89880, 0x3c95277c, 0xb23e255c, 0x000a5503, 0x6e735ab3, 0x3c846984, 0x5579fdbf, 0x000a9e6b, 0x92cb3387, 0x3c8c1a77, 0x995ad3ad, 0x000ae89f, 0xdc2d1d96, 0x3ca22466, 0xb84f15fa, 0x000b33a2, 0xb19505ae, 0x3ca1112e, 0xf2fb5e46, 0x000b7f76, 0x0a5fddcd, 0x3c74ffd7, 0x904bc1d2, 0x000bcc1e, 0x30af0cb3, 0x3c736eae, 0xdd85529c, 0x000c199b, 0xd10959ac, 0x3c84e08f, 0x2e57d14b, 0x000c67f1, 0x6c921968, 0x3c676b2c, 0xdcef9069, 0x000cb720, 0x36df99b3, 0x3c937009, 0x4a07897b, 0x000d072d, 0xa63d07a7, 0x3c74a385, 0xdcfba487, 0x000d5818, 0xd5c192ac, 0x3c8e5a50, 0x03db3285, 0x000da9e6, 0x1c4a9792, 0x3c98bb73, 0x337b9b5e, 0x000dfc97, 0x603a88d3, 0x3c74b604, 0xe78b3ff6, 0x000e502e, 0x92094926, 0x3c916f27, 0xa2a490d9, 0x000ea4af, 0x41aa2008, 0x3c8ec3bc, 0xee615a27, 0x000efa1b, 0x31d185ee, 0x3c8a64a9, 0x5b6e4540, 0x000f5076, 0x4d91cd9d, 0x3c77893b, 0x819e90d8, 0x000fa7c1 // @formatter:on }); private ArrayDataPointerConstant allones = pointerConstant(16, new int[]{ // @formatter:off 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff // @formatter:on }); private ArrayDataPointerConstant ebias = pointerConstant(16, new int[]{ // @formatter:off 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000 // @formatter:on }); private ArrayDataPointerConstant xmax = pointerConstant(4, new int[]{ // @formatter:off 0xffffffff, 0x7fefffff // @formatter:on }); private ArrayDataPointerConstant xmin = pointerConstant(4, new int[]{ // @formatter:off 0x00000000, 0x00100000 // @formatter:on }); private ArrayDataPointerConstant inf = pointerConstant(4, new int[]{ // @formatter:off 0x00000000, 0x7ff00000 // @formatter:on }); private ArrayDataPointerConstant zero = pointerConstant(4, new int[]{ // @formatter:off 0x00000000, 0x00000000 // @formatter:on }); private ArrayDataPointerConstant oneVal = pointerConstant(4, new int[]{ // @formatter:off 0x00000000, 0x3ff00000 // @formatter:on }); @Override public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) { // Registers: // input: xmm0 // scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 // rax, rdx, rcx, tmp - r11 // Code generated by Intel C compiler for LIBM library Label block0 = new Label(); Label block1 = new Label(); Label block2 = new Label(); Label block3 = new Label(); Label block4 = new Label(); Label block5 = new Label(); Label block6 = new Label(); Label block7 = new Label(); Label block8 = new Label(); Label block9 = new Label(); Label block10 = new Label(); Label block11 = new Label(); Label block12 = new Label(); Label block13 = new Label(); masm.subq(rsp, 24); masm.movsd(new AMD64Address(rsp, 8), xmm0); masm.unpcklpd(xmm0, xmm0); masm.movdqu(xmm1, recordExternalAddress(crb, cv)); // 0x652b82fe, 0x40571547, // 0x652b82fe, 0x40571547 masm.movdqu(xmm6, recordExternalAddress(crb, shifter)); // 0x00000000, 0x43380000, // 0x00000000, 0x43380000 masm.movdqu(xmm2, recordExternalAddress(crb, cv16)); // 0xfefa0000, 0x3f862e42, // 0xfefa0000, 0x3f862e42 masm.movdqu(xmm3, recordExternalAddress(crb, cv32)); // 0xbc9e3b3a, 0x3d1cf79a, // 0xbc9e3b3a, 0x3d1cf79a masm.pextrw(rax, xmm0, 3); masm.andl(rax, 32767); masm.movl(rdx, 16527); masm.subl(rdx, rax); masm.subl(rax, 15504); masm.orl(rdx, rax); masm.cmplAndJcc(rdx, Integer.MIN_VALUE, ConditionFlag.AboveEqual, block0, false); masm.mulpd(xmm1, xmm0); masm.addpd(xmm1, xmm6); masm.movapd(xmm7, xmm1); masm.subpd(xmm1, xmm6); masm.mulpd(xmm2, xmm1); masm.movdqu(xmm4, recordExternalAddress(crb, cv64)); // 0xe3289860, 0x3f56c15c, // 0x555b9e25, 0x3fa55555 masm.mulpd(xmm3, xmm1); masm.movdqu(xmm5, recordExternalAddress(crb, cv80)); // 0xc090cf0f, 0x3f811115, // 0x55548ba1, 0x3fc55555 masm.subpd(xmm0, xmm2); masm.movdl(rax, xmm7); masm.movl(rcx, rax); masm.andl(rcx, 63); masm.shll(rcx, 4); masm.sarl(rax, 6); masm.movl(rdx, rax); masm.movdqu(xmm6, recordExternalAddress(crb, mmask)); // 0xffffffc0, 0x00000000, // 0xffffffc0, 0x00000000 masm.pand(xmm7, xmm6); masm.movdqu(xmm6, recordExternalAddress(crb, bias)); // 0x0000ffc0, 0x00000000, // 0x0000ffc0, 0x00000000 masm.paddq(xmm7, xmm6); masm.psllq(xmm7, 46); masm.subpd(xmm0, xmm3); masm.leaq(r11, recordExternalAddress(crb, tblAddr)); masm.movdqu(xmm2, new AMD64Address(rcx, r11, AMD64Address.Scale.Times1)); masm.mulpd(xmm4, xmm0); masm.movapd(xmm6, xmm0); masm.movapd(xmm1, xmm0); masm.mulpd(xmm6, xmm6); masm.mulpd(xmm0, xmm6); masm.addpd(xmm5, xmm4); masm.mulsd(xmm0, xmm6); masm.mulpd(xmm6, recordExternalAddress(crb, cv48)); // 0xfffffffe, 0x3fdfffff, // 0xfffffffe, 0x3fdfffff masm.addsd(xmm1, xmm2); masm.unpckhpd(xmm2, xmm2); masm.mulpd(xmm0, xmm5); masm.addsd(xmm1, xmm0); masm.por(xmm2, xmm7); masm.unpckhpd(xmm0, xmm0); masm.addsd(xmm0, xmm1); masm.addsd(xmm0, xmm6); masm.addl(rdx, 894); masm.cmplAndJcc(rdx, 1916, ConditionFlag.Above, block1, false); masm.mulsd(xmm0, xmm2); masm.addsd(xmm0, xmm2); masm.jmp(block13); masm.bind(block1); masm.xorpd(xmm3, xmm3); masm.movdqu(xmm4, recordExternalAddress(crb, allones)); // 0xffffffff, 0xffffffff, // 0xffffffff, 0xffffffff masm.movl(rdx, -1022); masm.subl(rdx, rax); masm.movdl(xmm5, rdx); masm.psllq(xmm4, xmm5); masm.movl(rcx, rax); masm.sarl(rax, 1); masm.pinsrw(xmm3, rax, 3); masm.movdqu(xmm6, recordExternalAddress(crb, ebias)); // 0x00000000, 0x3ff00000, // 0x00000000, 0x3ff00000 masm.psllq(xmm3, 4); masm.psubd(xmm2, xmm3); masm.mulsd(xmm0, xmm2); masm.cmplAndJcc(rdx, 52, ConditionFlag.Greater, block2, false); masm.pand(xmm4, xmm2); masm.paddd(xmm3, xmm6); masm.subsd(xmm2, xmm4); masm.addsd(xmm0, xmm2); masm.cmplAndJcc(rcx, 1023, ConditionFlag.GreaterEqual, block3, false); masm.pextrw(rcx, xmm0, 3); masm.andl(rcx, 32768); masm.orl(rdx, rcx); masm.cmplAndJcc(rdx, 0, ConditionFlag.Equal, block4, false); masm.movapd(xmm6, xmm0); masm.addsd(xmm0, xmm4); masm.mulsd(xmm0, xmm3); masm.pextrw(rcx, xmm0, 3); masm.andl(rcx, 32752); masm.cmplAndJcc(rcx, 0, ConditionFlag.Equal, block5, false); masm.jmp(block13); masm.bind(block5); masm.mulsd(xmm6, xmm3); masm.mulsd(xmm4, xmm3); masm.movdqu(xmm0, xmm6); masm.pxor(xmm6, xmm4); masm.psrad(xmm6, 31); masm.pshufd(xmm6, xmm6, 85); masm.psllq(xmm0, 1); masm.psrlq(xmm0, 1); masm.pxor(xmm0, xmm6); masm.psrlq(xmm6, 63); masm.paddq(xmm0, xmm6); masm.paddq(xmm0, xmm4); masm.movl(new AMD64Address(rsp, 0), 15); masm.jmp(block6); masm.bind(block4); masm.addsd(xmm0, xmm4); masm.mulsd(xmm0, xmm3); masm.jmp(block13); masm.bind(block3); masm.addsd(xmm0, xmm4); masm.mulsd(xmm0, xmm3); masm.pextrw(rcx, xmm0, 3); masm.andl(rcx, 32752); masm.cmplAndJcc(rcx, 32752, ConditionFlag.AboveEqual, block7, false); masm.jmp(block13); masm.bind(block2); masm.paddd(xmm3, xmm6); masm.addpd(xmm0, xmm2); masm.mulsd(xmm0, xmm3); masm.movl(new AMD64Address(rsp, 0), 15); masm.jmp(block6); masm.bind(block8); masm.cmplAndJcc(rax, 2146435072, ConditionFlag.AboveEqual, block9, false); masm.movl(rax, new AMD64Address(rsp, 12)); masm.cmplAndJcc(rax, Integer.MIN_VALUE, ConditionFlag.AboveEqual, block10, false); masm.movsd(xmm0, recordExternalAddress(crb, xmax)); // 0xffffffff, 0x7fefffff masm.mulsd(xmm0, xmm0); masm.bind(block7); masm.movl(new AMD64Address(rsp, 0), 14); masm.jmp(block6); masm.bind(block10); masm.movsd(xmm0, recordExternalAddress(crb, xmin)); // 0x00000000, 0x00100000 masm.mulsd(xmm0, xmm0); masm.movl(new AMD64Address(rsp, 0), 15); masm.jmp(block6); masm.bind(block9); masm.movl(rdx, new AMD64Address(rsp, 8)); masm.cmplAndJcc(rax, 2146435072, ConditionFlag.Above, block11, false); masm.cmplAndJcc(rdx, 0, ConditionFlag.NotEqual, block11, false); masm.movl(rax, new AMD64Address(rsp, 12)); masm.cmplAndJcc(rax, 2146435072, ConditionFlag.NotEqual, block12, false); masm.movsd(xmm0, recordExternalAddress(crb, inf)); // 0x00000000, 0x7ff00000 masm.jmp(block13); masm.bind(block12); masm.movsd(xmm0, recordExternalAddress(crb, zero)); // 0x00000000, 0x00000000 masm.jmp(block13); masm.bind(block11); masm.movsd(xmm0, new AMD64Address(rsp, 8)); masm.addsd(xmm0, xmm0); masm.jmp(block13); masm.bind(block0); masm.movl(rax, new AMD64Address(rsp, 12)); masm.andl(rax, 2147483647); masm.cmplAndJcc(rax, 1083179008, ConditionFlag.AboveEqual, block8, false); masm.movsd(new AMD64Address(rsp, 8), xmm0); masm.addsd(xmm0, recordExternalAddress(crb, oneVal)); // 0x00000000, 0x3ff00000 masm.jmp(block13); masm.bind(block6); masm.movq(new AMD64Address(rsp, 16), xmm0); masm.movq(xmm0, new AMD64Address(rsp, 16)); masm.bind(block13); masm.addq(rsp, 24); } }