use fused multiply-add where possible

only possible for Intel compiler
2022-05-08 15:02:57 +02:00 · 2022-05-08 15:02:57 +02:00 · 8f9fbb30e5
parent 5b8e5591ed
commit 8f9fbb30e5
3 changed files with 60 additions and 21 deletions
--- a/src/math.f90
+++ b/src/math.f90
@ -82,7 +82,7 @@ contains
 !--------------------------------------------------------------------------------------------------
 !> @brief initialization of random seed generator and internal checks
 !--------------------------------------------------------------------------------------------------
-subroutine math_init
+subroutine math_init()

  real(pReal), dimension(4) :: randTest
  integer :: randSize
@ -1053,16 +1053,26 @@ pure subroutine math_eigh33(w,v,m)
  U = max(T, T**2)
  threshold = sqrt(5.68e-14_pReal * U**2)

-  v(1:3,1) = [ v(1,2) + m(1, 3) * w(1), &
-               v(2,2) + m(2, 3) * w(1), &
+#ifndef __INTEL_COMPILER
+  v(1:3,1) = [m(1,3)*w(1) + v(1,2), &
+              m(2,3)*w(1) + v(2,2), &
+#else
+  v(1:3,1) = [IEEE_FMA(m(1,3),w(1),v(1,2)), &
+              IEEE_FMA(m(2,3),w(1),v(2,2)), &
+#endif
              (m(1,1) - w(1)) * (m(2,2) - w(1)) - v(3,2)]
  norm = norm2(v(1:3, 1))
  fallback1: if (norm < threshold) then
    call math_eigh(w,v,error,m)
  else fallback1
    v(1:3,1) = v(1:3, 1) / norm
-    v(1:3,2) = [ v(1,2) + m(1, 3) * w(2), &
-                 v(2,2) + m(2, 3) * w(2), &
+#ifndef __INTEL_COMPILER
+    v(1:3,2) = [m(1,3)*w(2) + v(1,2), &
+                m(2,3)*w(2) + v(2,2), &
+#else
+    v(1:3,2) = [IEEE_FMA(m(1,3),w(2),v(1,2)), &
+                IEEE_FMA(m(2,3),w(2),v(2,2)), &
+#endif
                (m(1,1) - w(2)) * (m(2,2) - w(2)) - v(3,2)]
    norm = norm2(v(1:3, 2))
    fallback2: if (norm < threshold) then
@ -1300,7 +1310,7 @@ end function math_clip
 !--------------------------------------------------------------------------------------------------
 !> @brief Check correctness of some math functions.
 !--------------------------------------------------------------------------------------------------
-subroutine selfTest
+subroutine selfTest()

  integer, dimension(2,4) :: &
    sort_in_   = reshape([+1,+5,  +5,+6,  -1,-1,  +3,-2],[2,4])
--- a/src/phase_mechanical.f90
+++ b/src/phase_mechanical.f90
@ -680,8 +680,11 @@ function integrateStateEuler(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en) result
  if (any(IEEE_is_NaN(dotState))) return

  sizeDotState = plasticState(ph)%sizeDotState
-  plasticState(ph)%state(1:sizeDotState,en) = subState0 &
-                                            + dotState * Delta_t
+#ifndef __INTEL_COMPILER
+  plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
+#else
+  plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
+#endif

  broken = plastic_deltaState(ph,en)
  if(broken) return
@ -720,8 +723,11 @@ function integrateStateAdaptiveEuler(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en
  sizeDotState = plasticState(ph)%sizeDotState

  r = - dotState * 0.5_pReal * Delta_t
-  plasticState(ph)%state(1:sizeDotState,en) = subState0 &
-                                            + dotState * Delta_t
+#ifndef __INTEL_COMPILER
+  plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
+#else
+  plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
+#endif

  broken = plastic_deltaState(ph,en)
  if(broken) return
@ -842,12 +848,18 @@ function integrateStateRK(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en,A,B,C,DB)
    dotState = A(1,stage) * plastic_RKdotState(1:sizeDotState,1)

    do n = 2, stage
-      dotState = dotState &
-               + A(n,stage) * plastic_RKdotState(1:sizeDotState,n)
+#ifndef __INTEL_COMPILER
+      dotState = dotState + A(n,stage)*plastic_RKdotState(1:sizeDotState,n)
+#else
+      dotState = IEEE_FMA(A(n,stage),plastic_RKdotState(1:sizeDotState,n),dotState)
+#endif
    enddo

-    plasticState(ph)%state(1:sizeDotState,en) = subState0 &
-                                              + dotState * Delta_t
+#ifndef __INTEL_COMPILER
+    plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
+#else
+    plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
+#endif

    broken = integrateStress(F_0+(F-F_0)*Delta_t*C(stage),subFp0,subFi0,Delta_t*C(stage), ph,en)
    if(broken) exit
@ -861,8 +873,11 @@ function integrateStateRK(F_0,F,subFp0,subFi0,subState0,Delta_t,ph,en,A,B,C,DB)

  plastic_RKdotState(1:sizeDotState,size(B)) = dotState
  dotState = matmul(plastic_RKdotState,B)
-  plasticState(ph)%state(1:sizeDotState,en) = subState0 &
-                                            + dotState * Delta_t
+#ifndef __INTEL_COMPILER
+  plasticState(ph)%state(1:sizeDotState,en) = subState0 + dotState*Delta_t
+#else
+  plasticState(ph)%state(1:sizeDotState,en) = IEEE_FMA(dotState,Delta_t,subState0)
+#endif

  if(present(DB)) &
    broken = .not. converged(matmul(plastic_RKdotState(1:sizeDotState,1:size(DB)),DB) * Delta_t, &
@ -1146,12 +1161,18 @@ module function phase_mechanical_dPdF(Delta_t,co,ce) result(dPdF)
  else
    lhs_3333 = 0.0_pReal; rhs_3333 = 0.0_pReal
    do o=1,3; do p=1,3
+#ifndef __INTEL_COMPILER
      lhs_3333(1:3,1:3,o,p) = lhs_3333(1:3,1:3,o,p) &
                            + matmul(invSubFi0,dLidFi(1:3,1:3,o,p)) * Delta_t
      lhs_3333(1:3,o,1:3,p) = lhs_3333(1:3,o,1:3,p) &
                            + invFi*invFi(p,o)
      rhs_3333(1:3,1:3,o,p) = rhs_3333(1:3,1:3,o,p) &
                            - matmul(invSubFi0,dLidS(1:3,1:3,o,p)) * Delta_t
+#else
+      lhs_3333(1:3,1:3,o,p) = IEEE_FMA(matmul(invSubFi0,dLidFi(1:3,1:3,o,p)),Delta_t,lhs_3333(1:3,1:3,o,p))
+      lhs_3333(1:3,o,1:3,p) = IEEE_FMA(invFi,invFi(p,o),lhs_3333(1:3,o,1:3,p))
+      rhs_3333(1:3,1:3,o,p) = IEEE_FMA(matmul(invSubFi0,dLidS(1:3,1:3,o,p)),-Delta_t,rhs_3333(1:3,1:3,o,p))
+#endif
    enddo; enddo
    call math_invert(temp_99,error,math_3333to99(lhs_3333))
    if (error) then
@ -1180,8 +1201,12 @@ module function phase_mechanical_dPdF(Delta_t,co,ce) result(dPdF)
    temp_3333(1:3,1:3,p,o) = matmul(matmul(temp_33_2,dLpdS(1:3,1:3,p,o)), invFi) &
                           + matmul(temp_33_3,dLidS(1:3,1:3,p,o))
  enddo; enddo
+#ifndef __INTEL_COMPILER
  lhs_3333 = math_mul3333xx3333(dSdFe,temp_3333) * Delta_t &
           + math_mul3333xx3333(dSdFi,dFidS)
+#else
+  lhs_3333 = IEEE_FMA(math_mul3333xx3333(dSdFe,temp_3333),Delta_t,math_mul3333xx3333(dSdFi,dFidS))
+#endif

  call math_invert(temp_99,error,math_eye(9)+math_3333to99(lhs_3333))
  if (error) then
--- a/src/polynomials.f90
+++ b/src/polynomials.f90
@ -107,7 +107,11 @@ pure function eval(self,x) result(y)

  y = 0.0_pReal
  do i = ubound(self%coef,1), 0, -1
+#ifndef __INTEL_COMPILER
    y = y*(x-self%x_ref) +self%coef(i)
+#else
+    y = IEEE_FMA(y,x-self%x_ref,self%coef(i))
+#endif
  enddo

 end function eval