From c619bed9753316c55a67ade41b9ca07ac1a07d62 Mon Sep 17 00:00:00 2001 From: Martin Diehl Date: Sun, 24 Sep 2023 06:31:36 +0200 Subject: [PATCH 1/2] let the compiler do the work compilers are aware of the FMA instruction and use it --- src/math.f90 | 10 ---------- src/phase_mechanical.f90 | 30 ------------------------------ src/polynomials.f90 | 4 ---- 3 files changed, 44 deletions(-) diff --git a/src/math.f90 b/src/math.f90 index 24141f4e9..0fd3a9ba7 100644 --- a/src/math.f90 +++ b/src/math.f90 @@ -1047,26 +1047,16 @@ pure subroutine math_eigh33(w,v,m) U = max(T, T**2) threshold = sqrt(5.68e-14_pREAL * U**2) -#ifndef __INTEL_LLVM_COMPILER v(1:3,1) = [m(1,3)*w(1) + v(1,2), & m(2,3)*w(1) + v(2,2), & -#else - v(1:3,1) = [IEEE_FMA(m(1,3),w(1),v(1,2)), & - IEEE_FMA(m(2,3),w(1),v(2,2)), & -#endif (m(1,1) - w(1)) * (m(2,2) - w(1)) - v(3,2)] norm = norm2(v(1:3, 1)) fallback1: if (norm < threshold) then call math_eigh(w,v,error,m) else fallback1 v(1:3,1) = v(1:3, 1) / norm -#ifndef __INTEL_LLVM_COMPILER v(1:3,2) = [m(1,3)*w(2) + v(1,2), & m(2,3)*w(2) + v(2,2), & -#else - v(1:3,2) = [IEEE_FMA(m(1,3),w(2),v(1,2)), & - IEEE_FMA(m(2,3),w(2),v(2,2)), & -#endif (m(1,1) - w(2)) * (m(2,2) - w(2)) - v(3,2)] norm = norm2(v(1:3, 2)) fallback2: if (norm < threshold) then diff --git a/src/phase_mechanical.f90 b/src/phase_mechanical.f90 index e7103bedc..beb968875 100644 --- a/src/phase_mechanical.f90 +++ b/src/phase_mechanical.f90 @@ -693,11 +693,7 @@ function integrateStateEuler(F_0,F,Fp0,Fi0,state0,Delta_t,ph,en) result(broken) if (any(IEEE_is_NaN(dotState))) return sizeDotState = plasticState(ph)%sizeDotState -#ifndef __INTEL_LLVM_COMPILER plasticState(ph)%state(1:sizeDotState,en) = state0 + dotState*Delta_t -#else - plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,state0) -#endif broken = plastic_deltaState(ph,en) if (broken) return @@ -736,11 +732,7 @@ function integrateStateAdaptiveEuler(F_0,F,Fp0,Fi0,state0,Delta_t,ph,en) result( sizeDotState = plasticState(ph)%sizeDotState r = - dotState * 0.5_pREAL * Delta_t -#ifndef __INTEL_LLVM_COMPILER plasticState(ph)%state(1:sizeDotState,en) = state0 + dotState*Delta_t -#else - plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,state0) -#endif broken = plastic_deltaState(ph,en) if (broken) return @@ -861,18 +853,10 @@ function integrateStateRK(F_0,F,Fp0,Fi0,state0,Delta_t,ph,en,A,B,C,DB) result(br dotState = A(1,stage) * plastic_RKdotState(1:sizeDotState,1) do n = 2, stage -#ifndef __INTEL_LLVM_COMPILER dotState = dotState + A(n,stage)*plastic_RKdotState(1:sizeDotState,n) -#else - dotState = IEEE_FMA(A(n,stage),plastic_RKdotState(1:sizeDotState,n),dotState) -#endif end do -#ifndef __INTEL_LLVM_COMPILER plasticState(ph)%state(1:sizeDotState,en) = state0 + dotState*Delta_t -#else - plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,state0) -#endif broken = integrateStress(F_0+(F-F_0)*Delta_t*C(stage),Fp0,Fi0,Delta_t*C(stage), ph,en) if (broken) exit @@ -886,11 +870,7 @@ function integrateStateRK(F_0,F,Fp0,Fi0,state0,Delta_t,ph,en,A,B,C,DB) result(br plastic_RKdotState(1:sizeDotState,size(B)) = dotState dotState = matmul(plastic_RKdotState,B) -#ifndef __INTEL_LLVM_COMPILER plasticState(ph)%state(1:sizeDotState,en) = state0 + dotState*Delta_t -#else - plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,state0) -#endif if (present(DB)) & broken = .not. converged(matmul(plastic_RKdotState(1:sizeDotState,1:size(DB)),DB) * Delta_t, & @@ -1174,18 +1154,12 @@ module function phase_mechanical_dPdF(Delta_t,co,ce) result(dPdF) else lhs_3333 = 0.0_pREAL; rhs_3333 = 0.0_pREAL do o=1,3; do p=1,3 -#ifndef __INTEL_LLVM_COMPILER lhs_3333(1:3,1:3,o,p) = lhs_3333(1:3,1:3,o,p) & + matmul(invSubFi0,dLidFi(1:3,1:3,o,p)) * Delta_t lhs_3333(1:3,o,1:3,p) = lhs_3333(1:3,o,1:3,p) & + invFi*invFi(p,o) rhs_3333(1:3,1:3,o,p) = rhs_3333(1:3,1:3,o,p) & - matmul(invSubFi0,dLidS(1:3,1:3,o,p)) * Delta_t -#else - lhs_3333(1:3,1:3,o,p) = IEEE_FMA(matmul(invSubFi0,dLidFi(1:3,1:3,o,p)),Delta_t,lhs_3333(1:3,1:3,o,p)) - lhs_3333(1:3,o,1:3,p) = IEEE_FMA(invFi,invFi(p,o),lhs_3333(1:3,o,1:3,p)) - rhs_3333(1:3,1:3,o,p) = IEEE_FMA(matmul(invSubFi0,dLidS(1:3,1:3,o,p)),-Delta_t,rhs_3333(1:3,1:3,o,p)) -#endif end do; end do call math_invert(temp_99,error,math_3333to99(lhs_3333)) if (error) then @@ -1214,12 +1188,8 @@ module function phase_mechanical_dPdF(Delta_t,co,ce) result(dPdF) temp_3333(1:3,1:3,p,o) = matmul(matmul(temp_33_2,dLpdS(1:3,1:3,p,o)), invFi) & + matmul(temp_33_3,dLidS(1:3,1:3,p,o)) end do; end do -#ifndef __INTEL_LLVM_COMPILER lhs_3333 = math_mul3333xx3333(dSdFe,temp_3333) * Delta_t & + math_mul3333xx3333(dSdFi,dFidS) -#else - lhs_3333 = IEEE_FMA(math_mul3333xx3333(dSdFe,temp_3333),Delta_t,math_mul3333xx3333(dSdFi,dFidS)) -#endif call math_invert(temp_99,error,math_eye(9)+math_3333to99(lhs_3333)) if (error) then diff --git a/src/polynomials.f90 b/src/polynomials.f90 index 062f99911..162b40cea 100644 --- a/src/polynomials.f90 +++ b/src/polynomials.f90 @@ -107,11 +107,7 @@ pure function eval(self,x) result(y) y = self%coef(ubound(self%coef,1)) do o = ubound(self%coef,1)-1, 0, -1 -#ifndef __INTEL_LLVM_COMPILER y = y*(x-self%x_ref) +self%coef(o) -#else - y = IEEE_FMA(y,x-self%x_ref,self%coef(o)) -#endif end do end function eval From 836ddb4ba570a176c6648c5b416bfe04371fba68 Mon Sep 17 00:00:00 2001 From: Philip Eisenlohr Date: Tue, 26 Sep 2023 22:27:28 +0000 Subject: [PATCH 2/2] easier to understand (and maybe failsafe against empty coef) --- src/polynomials.f90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/polynomials.f90 b/src/polynomials.f90 index 162b40cea..910441d0d 100644 --- a/src/polynomials.f90 +++ b/src/polynomials.f90 @@ -105,9 +105,9 @@ pure function eval(self,x) result(y) integer :: o - y = self%coef(ubound(self%coef,1)) - do o = ubound(self%coef,1)-1, 0, -1 - y = y*(x-self%x_ref) +self%coef(o) + y = 0.0_pREAL + do o = ubound(self%coef,1), 0, -1 + y = y*(x-self%x_ref) + self%coef(o) end do end function eval