use fused multiply-add where possible
only possible for Intel compiler
This commit is contained in:
parent
5b8e5591ed
commit
8f9fbb30e5
28
src/math.f90
28
src/math.f90
|
@ -82,7 +82,7 @@ contains
|
||||||
!--------------------------------------------------------------------------------------------------
|
!--------------------------------------------------------------------------------------------------
|
||||||
!> @brief initialization of random seed generator and internal checks
|
!> @brief initialization of random seed generator and internal checks
|
||||||
!--------------------------------------------------------------------------------------------------
|
!--------------------------------------------------------------------------------------------------
|
||||||
subroutine math_init
|
subroutine math_init()
|
||||||
|
|
||||||
real(pReal), dimension(4) :: randTest
|
real(pReal), dimension(4) :: randTest
|
||||||
integer :: randSize
|
integer :: randSize
|
||||||
|
@ -1045,24 +1045,34 @@ pure subroutine math_eigh33(w,v,m)
|
||||||
|
|
||||||
w = math_eigvalsh33(m)
|
w = math_eigvalsh33(m)
|
||||||
|
|
||||||
v(1:3,2) = [ m(1, 2) * m(2, 3) - m(1, 3) * m(2, 2), &
|
v(1:3,2) = [ m(1,2) * m(2,3) - m(1,3) * m(2,2), &
|
||||||
m(1, 3) * m(1, 2) - m(2, 3) * m(1, 1), &
|
m(1,3) * m(1,2) - m(2,3) * m(1,1), &
|
||||||
m(1, 2)**2]
|
m(1,2)**2]
|
||||||
|
|
||||||
T = maxval(abs(w))
|
T = maxval(abs(w))
|
||||||
U = max(T, T**2)
|
U = max(T, T**2)
|
||||||
threshold = sqrt(5.68e-14_pReal * U**2)
|
threshold = sqrt(5.68e-14_pReal * U**2)
|
||||||
|
|
||||||
v(1:3,1) = [ v(1,2) + m(1, 3) * w(1), &
|
#ifndef __INTEL_COMPILER
|
||||||
v(2,2) + m(2, 3) * w(1), &
|
v(1:3,1) = [m(1,3)*w(1) + v(1,2), &
|
||||||
|
m(2,3)*w(1) + v(2,2), &
|
||||||
|
#else
|
||||||
|
v(1:3,1) = [IEEE_FMA(m(1,3),w(1),v(1,2)), &
|
||||||
|
IEEE_FMA(m(2,3),w(1),v(2,2)), &
|
||||||
|
#endif
|
||||||
(m(1,1) - w(1)) * (m(2,2) - w(1)) - v(3,2)]
|
(m(1,1) - w(1)) * (m(2,2) - w(1)) - v(3,2)]
|
||||||
norm = norm2(v(1:3, 1))
|
norm = norm2(v(1:3, 1))
|
||||||
fallback1: if (norm < threshold) then
|
fallback1: if (norm < threshold) then
|
||||||
call math_eigh(w,v,error,m)
|
call math_eigh(w,v,error,m)
|
||||||
else fallback1
|
else fallback1
|
||||||
v(1:3,1) = v(1:3, 1) / norm
|
v(1:3,1) = v(1:3, 1) / norm
|
||||||
v(1:3,2) = [ v(1,2) + m(1, 3) * w(2), &
|
#ifndef __INTEL_COMPILER
|
||||||
v(2,2) + m(2, 3) * w(2), &
|
v(1:3,2) = [m(1,3)*w(2) + v(1,2), &
|
||||||
|
m(2,3)*w(2) + v(2,2), &
|
||||||
|
#else
|
||||||
|
v(1:3,2) = [IEEE_FMA(m(1,3),w(2),v(1,2)), &
|
||||||
|
IEEE_FMA(m(2,3),w(2),v(2,2)), &
|
||||||
|
#endif
|
||||||
(m(1,1) - w(2)) * (m(2,2) - w(2)) - v(3,2)]
|
(m(1,1) - w(2)) * (m(2,2) - w(2)) - v(3,2)]
|
||||||
norm = norm2(v(1:3, 2))
|
norm = norm2(v(1:3, 2))
|
||||||
fallback2: if (norm < threshold) then
|
fallback2: if (norm < threshold) then
|
||||||
|
@ -1300,7 +1310,7 @@ end function math_clip
|
||||||
!--------------------------------------------------------------------------------------------------
|
!--------------------------------------------------------------------------------------------------
|
||||||
!> @brief Check correctness of some math functions.
|
!> @brief Check correctness of some math functions.
|
||||||
!--------------------------------------------------------------------------------------------------
|
!--------------------------------------------------------------------------------------------------
|
||||||
subroutine selfTest
|
subroutine selfTest()
|
||||||
|
|
||||||
integer, dimension(2,4) :: &
|
integer, dimension(2,4) :: &
|
||||||
sort_in_ = reshape([+1,+5, +5,+6, -1,-1, +3,-2],[2,4])
|
sort_in_ = reshape([+1,+5, +5,+6, -1,-1, +3,-2],[2,4])
|
||||||
|
|
|
@ -680,8 +680,11 @@ function integrateStateEuler(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en) result
|
||||||
if (any(IEEE_is_NaN(dotState))) return
|
if (any(IEEE_is_NaN(dotState))) return
|
||||||
|
|
||||||
sizeDotState = plasticState(ph)%sizeDotState
|
sizeDotState = plasticState(ph)%sizeDotState
|
||||||
plasticState(ph)%state(1:sizeDotState,en) = subState0 &
|
#ifndef __INTEL_COMPILER
|
||||||
+ dotState * Delta_t
|
plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
|
||||||
|
#else
|
||||||
|
plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
|
||||||
|
#endif
|
||||||
|
|
||||||
broken = plastic_deltaState(ph,en)
|
broken = plastic_deltaState(ph,en)
|
||||||
if(broken) return
|
if(broken) return
|
||||||
|
@ -720,8 +723,11 @@ function integrateStateAdaptiveEuler(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en
|
||||||
sizeDotState = plasticState(ph)%sizeDotState
|
sizeDotState = plasticState(ph)%sizeDotState
|
||||||
|
|
||||||
r = - dotState * 0.5_pReal * Delta_t
|
r = - dotState * 0.5_pReal * Delta_t
|
||||||
plasticState(ph)%state(1:sizeDotState,en) = subState0 &
|
#ifndef __INTEL_COMPILER
|
||||||
+ dotState * Delta_t
|
plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
|
||||||
|
#else
|
||||||
|
plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
|
||||||
|
#endif
|
||||||
|
|
||||||
broken = plastic_deltaState(ph,en)
|
broken = plastic_deltaState(ph,en)
|
||||||
if(broken) return
|
if(broken) return
|
||||||
|
@ -842,12 +848,18 @@ function integrateStateRK(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en,A,B,C,DB)
|
||||||
dotState = A(1,stage) * plastic_RKdotState(1:sizeDotState,1)
|
dotState = A(1,stage) * plastic_RKdotState(1:sizeDotState,1)
|
||||||
|
|
||||||
do n = 2, stage
|
do n = 2, stage
|
||||||
dotState = dotState &
|
#ifndef __INTEL_COMPILER
|
||||||
+ A(n,stage) * plastic_RKdotState(1:sizeDotState,n)
|
dotState = dotState + A(n,stage)*plastic_RKdotState(1:sizeDotState,n)
|
||||||
|
#else
|
||||||
|
dotState = IEEE_FMA(A(n,stage),plastic_RKdotState(1:sizeDotState,n),dotState)
|
||||||
|
#endif
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
plasticState(ph)%state(1:sizeDotState,en) = subState0 &
|
#ifndef __INTEL_COMPILER
|
||||||
+ dotState * Delta_t
|
plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
|
||||||
|
#else
|
||||||
|
plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
|
||||||
|
#endif
|
||||||
|
|
||||||
broken = integrateStress(F_0+(F-F_0)*Delta_t*C(stage),subFp0,subFi0,Delta_t*C(stage), ph,en)
|
broken = integrateStress(F_0+(F-F_0)*Delta_t*C(stage),subFp0,subFi0,Delta_t*C(stage), ph,en)
|
||||||
if(broken) exit
|
if(broken) exit
|
||||||
|
@ -861,8 +873,11 @@ function integrateStateRK(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en,A,B,C,DB)
|
||||||
|
|
||||||
plastic_RKdotState(1:sizeDotState,size(B)) = dotState
|
plastic_RKdotState(1:sizeDotState,size(B)) = dotState
|
||||||
dotState = matmul(plastic_RKdotState,B)
|
dotState = matmul(plastic_RKdotState,B)
|
||||||
plasticState(ph)%state(1:sizeDotState,en) = subState0 &
|
#ifndef __INTEL_COMPILER
|
||||||
+ dotState * Delta_t
|
plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
|
||||||
|
#else
|
||||||
|
plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
|
||||||
|
#endif
|
||||||
|
|
||||||
if(present(DB)) &
|
if(present(DB)) &
|
||||||
broken = .not. converged(matmul(plastic_RKdotState(1:sizeDotState,1:size(DB)),DB) * Delta_t, &
|
broken = .not. converged(matmul(plastic_RKdotState(1:sizeDotState,1:size(DB)),DB) * Delta_t, &
|
||||||
|
@ -1146,12 +1161,18 @@ module function phase_mechanical_dPdF(Delta_t,co,ce) result(dPdF)
|
||||||
else
|
else
|
||||||
lhs_3333 = 0.0_pReal; rhs_3333 = 0.0_pReal
|
lhs_3333 = 0.0_pReal; rhs_3333 = 0.0_pReal
|
||||||
do o=1,3; do p=1,3
|
do o=1,3; do p=1,3
|
||||||
|
#ifndef __INTEL_COMPILER
|
||||||
lhs_3333(1:3,1:3,o,p) = lhs_3333(1:3,1:3,o,p) &
|
lhs_3333(1:3,1:3,o,p) = lhs_3333(1:3,1:3,o,p) &
|
||||||
+ matmul(invSubFi0,dLidFi(1:3,1:3,o,p)) * Delta_t
|
+ matmul(invSubFi0,dLidFi(1:3,1:3,o,p)) * Delta_t
|
||||||
lhs_3333(1:3,o,1:3,p) = lhs_3333(1:3,o,1:3,p) &
|
lhs_3333(1:3,o,1:3,p) = lhs_3333(1:3,o,1:3,p) &
|
||||||
+ invFi*invFi(p,o)
|
+ invFi*invFi(p,o)
|
||||||
rhs_3333(1:3,1:3,o,p) = rhs_3333(1:3,1:3,o,p) &
|
rhs_3333(1:3,1:3,o,p) = rhs_3333(1:3,1:3,o,p) &
|
||||||
- matmul(invSubFi0,dLidS(1:3,1:3,o,p)) * Delta_t
|
- matmul(invSubFi0,dLidS(1:3,1:3,o,p)) * Delta_t
|
||||||
|
#else
|
||||||
|
lhs_3333(1:3,1:3,o,p) = IEEE_FMA(matmul(invSubFi0,dLidFi(1:3,1:3,o,p)),Delta_t,lhs_3333(1:3,1:3,o,p))
|
||||||
|
lhs_3333(1:3,o,1:3,p) = IEEE_FMA(invFi,invFi(p,o),lhs_3333(1:3,o,1:3,p))
|
||||||
|
rhs_3333(1:3,1:3,o,p) = IEEE_FMA(matmul(invSubFi0,dLidS(1:3,1:3,o,p)),-Delta_t,rhs_3333(1:3,1:3,o,p))
|
||||||
|
#endif
|
||||||
enddo; enddo
|
enddo; enddo
|
||||||
call math_invert(temp_99,error,math_3333to99(lhs_3333))
|
call math_invert(temp_99,error,math_3333to99(lhs_3333))
|
||||||
if (error) then
|
if (error) then
|
||||||
|
@ -1180,8 +1201,12 @@ module function phase_mechanical_dPdF(Delta_t,co,ce) result(dPdF)
|
||||||
temp_3333(1:3,1:3,p,o) = matmul(matmul(temp_33_2,dLpdS(1:3,1:3,p,o)), invFi) &
|
temp_3333(1:3,1:3,p,o) = matmul(matmul(temp_33_2,dLpdS(1:3,1:3,p,o)), invFi) &
|
||||||
+ matmul(temp_33_3,dLidS(1:3,1:3,p,o))
|
+ matmul(temp_33_3,dLidS(1:3,1:3,p,o))
|
||||||
enddo; enddo
|
enddo; enddo
|
||||||
|
#ifndef __INTEL_COMPILER
|
||||||
lhs_3333 = math_mul3333xx3333(dSdFe,temp_3333) * Delta_t &
|
lhs_3333 = math_mul3333xx3333(dSdFe,temp_3333) * Delta_t &
|
||||||
+ math_mul3333xx3333(dSdFi,dFidS)
|
+ math_mul3333xx3333(dSdFi,dFidS)
|
||||||
|
#else
|
||||||
|
lhs_3333 = IEEE_FMA(math_mul3333xx3333(dSdFe,temp_3333),Delta_t,math_mul3333xx3333(dSdFi,dFidS))
|
||||||
|
#endif
|
||||||
|
|
||||||
call math_invert(temp_99,error,math_eye(9)+math_3333to99(lhs_3333))
|
call math_invert(temp_99,error,math_eye(9)+math_3333to99(lhs_3333))
|
||||||
if (error) then
|
if (error) then
|
||||||
|
|
|
@ -106,8 +106,12 @@ pure function eval(self,x) result(y)
|
||||||
|
|
||||||
|
|
||||||
y = 0.0_pReal
|
y = 0.0_pReal
|
||||||
do i = ubound(self%coef,1), 0 , -1
|
do i = ubound(self%coef,1), 0, -1
|
||||||
y = y*(x-self%x_ref) + self%coef(i)
|
#ifndef __INTEL_COMPILER
|
||||||
|
y = y*(x-self%x_ref) +self%coef(i)
|
||||||
|
#else
|
||||||
|
y = IEEE_FMA(y,x-self%x_ref,self%coef(i))
|
||||||
|
#endif
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
end function eval
|
end function eval
|
||||||
|
|
Loading…
Reference in New Issue