From d44fce4a76532914ee769b9c5bd1b37e1f66909e Mon Sep 17 00:00:00 2001
From: Pratheek	Shanthraj <p.shanthraj@mpie.de>
Date: Wed, 25 Mar 2015 16:06:19 +0000
Subject: [PATCH] Spectral solver now fully parallel (parallel IO, domain
 decomposition, FFTs and restart). Working but not extensively tested so
 please report bugs to me

---
 code/CPFEM.f90                              |  49 +-
 code/DAMASK_spectral_driver.f90             | 327 ++++----
 code/DAMASK_spectral_interface.f90          |   4 +-
 code/DAMASK_spectral_solverAL.f90           | 273 +++----
 code/DAMASK_spectral_solverBasicPETSc.f90   | 210 ++---
 code/DAMASK_spectral_solverPolarisation.f90 | 275 +++----
 code/DAMASK_spectral_utilities.f90          | 822 +++++++++++---------
 code/Makefile                               |  17 +-
 code/mesh.f90                               | 299 +++----
 9 files changed, 1238 insertions(+), 1038 deletions(-)

diff --git a/code/CPFEM.f90 b/code/CPFEM.f90
index 89db4dab4..bbe70c1f5 100644
--- a/code/CPFEM.f90
+++ b/code/CPFEM.f90
@@ -159,6 +159,7 @@ subroutine CPFEM_init
 
  implicit none
  integer(pInt) :: k,l,m,ph,homog
+ character(len=1024) :: rankStr
 
  mainProcess: if (worldrank == 0) then 
    write(6,'(/,a)')   ' <<<+-  CPFEM init  -+>>>'
@@ -180,40 +181,42 @@ subroutine CPFEM_init
      write(6,'(a)') '<< CPFEM >> restored state variables of last converged step from binary files'
      flush(6)
    endif
+   
+   write(rankStr,'(a1,i0)')'_',worldrank
 
-   call IO_read_intFile(777,'recordedPhase',modelName,size(material_phase))
+   call IO_read_intFile(777,'recordedPhase'//trim(rankStr),modelName,size(material_phase))
    read (777,rec=1) material_phase
    close (777)
 
-   call IO_read_realFile(777,'convergedF',modelName,size(crystallite_F0))
+   call IO_read_realFile(777,'convergedF'//trim(rankStr),modelName,size(crystallite_F0))
    read (777,rec=1) crystallite_F0
    close (777)
 
-   call IO_read_realFile(777,'convergedFp',modelName,size(crystallite_Fp0))
+   call IO_read_realFile(777,'convergedFp'//trim(rankStr),modelName,size(crystallite_Fp0))
    read (777,rec=1) crystallite_Fp0
    close (777)
 
-   call IO_read_realFile(777,'convergedFi',modelName,size(crystallite_Fi0))
+   call IO_read_realFile(777,'convergedFi'//trim(rankStr),modelName,size(crystallite_Fi0))
    read (777,rec=1) crystallite_Fi0
    close (777)
 
-   call IO_read_realFile(777,'convergedLp',modelName,size(crystallite_Lp0))
+   call IO_read_realFile(777,'convergedLp'//trim(rankStr),modelName,size(crystallite_Lp0))
    read (777,rec=1) crystallite_Lp0
    close (777)
 
-   call IO_read_realFile(777,'convergedLi',modelName,size(crystallite_Li0))
+   call IO_read_realFile(777,'convergedLi'//trim(rankStr),modelName,size(crystallite_Li0))
    read (777,rec=1) crystallite_Li0
    close (777)
 
-   call IO_read_realFile(777,'convergeddPdF',modelName,size(crystallite_dPdF0))
+   call IO_read_realFile(777,'convergeddPdF'//trim(rankStr),modelName,size(crystallite_dPdF0))
    read (777,rec=1) crystallite_dPdF0
    close (777)
 
-   call IO_read_realFile(777,'convergedTstar',modelName,size(crystallite_Tstar0_v))
+   call IO_read_realFile(777,'convergedTstar'//trim(rankStr),modelName,size(crystallite_Tstar0_v))
    read (777,rec=1) crystallite_Tstar0_v
    close (777)
 
-   call IO_read_realFile(777,'convergedStateConst',modelName)
+   call IO_read_realFile(777,'convergedStateConst'//trim(rankStr),modelName)
    m = 0_pInt
    readPlasticityInstances: do ph = 1_pInt, size(phase_plasticity)
      do k = 1_pInt, plasticState(ph)%sizeState
@@ -224,7 +227,7 @@ subroutine CPFEM_init
    enddo readPlasticityInstances
    close (777)
 
-   call IO_read_realFile(777,'convergedStateHomog',modelName)
+   call IO_read_realFile(777,'convergedStateHomog'//trim(rankStr),modelName)
    m = 0_pInt
    readHomogInstances: do homog = 1_pInt, material_Nhomogenization
      do k = 1_pInt, homogState(homog)%sizeState
@@ -265,7 +268,8 @@ subroutine CPFEM_general(mode, ffn, ffn1, temperature, dt, elFE, ip)
 #endif
  use numerics, only: &
    defgradTolerance, &
-   iJacoStiffness
+   iJacoStiffness, &
+   worldrank
  use debug, only: &
    debug_level, &
    debug_CPFEM, &
@@ -376,6 +380,7 @@ subroutine CPFEM_general(mode, ffn, ffn1, temperature, dt, elFE, ip)
  integer(pInt)                                       elCP, &                                        ! crystal plasticity element number
                                                      i, j, k, l, m, n, ph, homog
  logical                                             updateJaco                                     ! flag indicating if JAcobian has to be updated
+ character(len=1024) :: rankStr
 
 #if defined(Marc4DAMASK) || defined(Abaqus)
  elCP = mesh_FEasCP('elem',elFE)
@@ -442,40 +447,42 @@ subroutine CPFEM_general(mode, ffn, ffn1, temperature, dt, elFE, ip)
    if (restartWrite) then
      if (iand(debug_level(debug_CPFEM), debug_levelBasic) /= 0_pInt) &
         write(6,'(a)') '<< CPFEM >> writing state variables of last converged step to binary files'
+   
+     write(rankStr,'(a1,i0)')'_',worldrank
 
-     call IO_write_jobRealFile(777,'recordedPhase',size(material_phase))
+     call IO_write_jobRealFile(777,'recordedPhase'//trim(rankStr),size(material_phase))
      write (777,rec=1) material_phase
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedF',size(crystallite_F0))
+     call IO_write_jobRealFile(777,'convergedF'//trim(rankStr),size(crystallite_F0))
      write (777,rec=1) crystallite_F0
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedFp',size(crystallite_Fp0))
+     call IO_write_jobRealFile(777,'convergedFp'//trim(rankStr),size(crystallite_Fp0))
      write (777,rec=1) crystallite_Fp0
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedFi',size(crystallite_Fi0))
+     call IO_write_jobRealFile(777,'convergedFi'//trim(rankStr),size(crystallite_Fi0))
      write (777,rec=1) crystallite_Fi0
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedLp',size(crystallite_Lp0))
+     call IO_write_jobRealFile(777,'convergedLp'//trim(rankStr),size(crystallite_Lp0))
      write (777,rec=1) crystallite_Lp0
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedLi',size(crystallite_Li0))
+     call IO_write_jobRealFile(777,'convergedLi'//trim(rankStr),size(crystallite_Li0))
      write (777,rec=1) crystallite_Li0
      close (777)
 
-     call IO_write_jobRealFile(777,'convergeddPdF',size(crystallite_dPdF0))
+     call IO_write_jobRealFile(777,'convergeddPdF'//trim(rankStr),size(crystallite_dPdF0))
      write (777,rec=1) crystallite_dPdF0
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedTstar',size(crystallite_Tstar0_v))
+     call IO_write_jobRealFile(777,'convergedTstar'//trim(rankStr),size(crystallite_Tstar0_v))
      write (777,rec=1) crystallite_Tstar0_v
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedStateConst')
+     call IO_write_jobRealFile(777,'convergedStateConst'//trim(rankStr))
      m = 0_pInt
      writePlasticityInstances: do ph = 1_pInt, size(phase_plasticity)
        do k = 1_pInt, plasticState(ph)%sizeState
@@ -486,7 +493,7 @@ subroutine CPFEM_general(mode, ffn, ffn1, temperature, dt, elFE, ip)
      enddo writePlasticityInstances
      close (777)
 
-     call IO_write_jobRealFile(777,'convergedStateHomog')
+     call IO_write_jobRealFile(777,'convergedStateHomog'//trim(rankStr))
      m = 0_pInt
      writeHomogInstances: do homog = 1_pInt, material_Nhomogenization
        do k = 1_pInt, homogState(homog)%sizeState
diff --git a/code/DAMASK_spectral_driver.f90 b/code/DAMASK_spectral_driver.f90
index b833a7342..986d71033 100644
--- a/code/DAMASK_spectral_driver.f90
+++ b/code/DAMASK_spectral_driver.f90
@@ -41,12 +41,17 @@ program DAMASK_spectral_Driver
    debug_spectral, &
    debug_levelBasic
  use math                                                                                           ! need to include the whole module for FFTW
+ use mesh, only: &
+   gridGlobal, &
+   geomSizeGlobal
  use CPFEM, only: &
    CPFEM_initAll
  use FEsolving, only: &
    restartWrite, &
    restartInc
  use numerics, only: &
+   worldrank, &
+   worldsize, &
    maxCutBack, &
    spectral_solver, &
    continueCalculation
@@ -54,17 +59,17 @@ program DAMASK_spectral_Driver
    materialpoint_sizeResults, &
    materialpoint_results
  use DAMASK_spectral_Utilities, only: &
-   grid, &
-   geomSize, &
    tBoundaryCondition, &
    tSolutionState, &
    cutBack
- use DAMASK_spectral_SolverBasic
  use DAMASK_spectral_SolverBasicPETSC
  use DAMASK_spectral_SolverAL
  use DAMASK_spectral_SolverPolarisation
 
  implicit none
+
+#include <petsc-finclude/petscsys.h>
+
  type tLoadCase
    real(pReal), dimension (3,3) :: rotation               = math_I3                                 !< rotation of BC
    type(tBoundaryCondition) ::     P, &                                                             !< stress BC
@@ -129,14 +134,19 @@ program DAMASK_spectral_Driver
  character(len=1024)  :: incInfo                                                                    !< string parsed to solution with information about current load case
  type(tLoadCase), allocatable, dimension(:) :: loadCases                                            !< array of all load cases
  type(tSolutionState) solres
+ integer(kind=MPI_OFFSET_KIND) :: my_offset
+ integer, dimension(:), allocatable :: outputSize
+ PetscErrorCode :: ierr
  external :: quit
 !--------------------------------------------------------------------------------------------------
 ! init DAMASK (all modules)
  call CPFEM_initAll(temperature = 300.0_pReal, el = 1_pInt, ip = 1_pInt)
- write(6,'(/,a)')   ' <<<+-  DAMASK_spectral_driver init  -+>>>'
- write(6,'(a)')     ' $Id$'
- write(6,'(a15,a)') ' Current time: ',IO_timeStamp()
+ mainProcess: if (worldrank == 0) then
+   write(6,'(/,a)')   ' <<<+-  DAMASK_spectral_driver init  -+>>>'
+   write(6,'(a)')     ' $Id$'
+   write(6,'(a15,a)') ' Current time: ',IO_timeStamp()
 #include "compilation_info.f90"
+ endif mainProcess
 
 !--------------------------------------------------------------------------------------------------
 ! reading basic information from load case file and allocate data structure containing load cases
@@ -256,72 +266,72 @@ program DAMASK_spectral_Driver
 ! consistency checks and output of load case
  loadCases(1)%followFormerTrajectory = .false.                                                      ! cannot guess along trajectory for first inc of first currentLoadCase
  errorID = 0_pInt
- checkLoadcases: do currentLoadCase = 1_pInt, size(loadCases)
-   write (loadcase_string, '(i6)' ) currentLoadCase
-   write(6,'(1x,a,i6)') 'load case: ', currentLoadCase
-   if (.not. loadCases(currentLoadCase)%followFormerTrajectory) &
-     write(6,'(2x,a)') 'drop guessing along trajectory'
-   if (loadCases(currentLoadCase)%deformation%myType=='l') then
-     do j = 1_pInt, 3_pInt
-       if (any(loadCases(currentLoadCase)%deformation%maskLogical(j,1:3) .eqv. .true.) .and. &
-           any(loadCases(currentLoadCase)%deformation%maskLogical(j,1:3) .eqv. .false.)) &
-                                                                  errorID = 832_pInt                ! each row should be either fully or not at all defined
-     enddo
-     write(6,'(2x,a)') 'velocity gradient:'
-   else if (loadCases(currentLoadCase)%deformation%myType=='f') then
-     write(6,'(2x,a)') 'deformation gradient at end of load case:'
-   else
-     write(6,'(2x,a)') 'deformation gradient rate:'
-   endif
-   write(6,'(3(3(3x,f12.7,1x)/))',advance='no') &
-              merge(math_transpose33(loadCases(currentLoadCase)%deformation%values), &
-              reshape(spread(huge(1.0_pReal),1,9),[ 3,3]), &                                        ! print *** (huge) for undefined
-              transpose(loadCases(currentLoadCase)%deformation%maskLogical))
-   if (any(loadCases(currentLoadCase)%P%maskLogical .eqv. &
-            loadCases(currentLoadCase)%deformation%maskLogical)) errorID = 831_pInt                 ! exclusive or masking only
-   if (any(loadCases(currentLoadCase)%P%maskLogical .and. &                                   
-           transpose(loadCases(currentLoadCase)%P%maskLogical) .and. &
-           reshape([ .false.,.true.,.true.,.true.,.false.,.true.,.true.,.true.,.false.],[ 3,3]))) &
-           errorID = 838_pInt                                                                       ! no rotation is allowed by stress BC
-   write(6,'(2x,a,/,3(3(3x,f12.7,1x)/))',advance='no') 'stress / GPa:',&
-              1e-9_pReal*merge(math_transpose33(loadCases(currentLoadCase)%P%values),&
-              reshape(spread(huge(1.0_pReal),1,9),[ 3,3]),&
-              transpose(loadCases(currentLoadCase)%P%maskLogical))
-  if (any(abs(math_mul33x33(loadCases(currentLoadCase)%rotation, &
-              math_transpose33(loadCases(currentLoadCase)%rotation))-math_I3) >&
-              reshape(spread(tol_math_check,1,9),[ 3,3]))&
-              .or. abs(math_det33(loadCases(currentLoadCase)%rotation)) > &
-              1.0_pReal + tol_math_check) errorID = 846_pInt                                          ! given rotation matrix contains strain
-   if (any(loadCases(currentLoadCase)%rotation /= math_I3)) &
-     write(6,'(2x,a,/,3(3(3x,f12.7,1x)/))',advance='no') 'rotation of loadframe:',&
-              math_transpose33(loadCases(currentLoadCase)%rotation)
-   write(6,'(2x,a,f12.6)') 'temperature:', loadCases(currentLoadCase)%temperature
-   write(6,'(2x,a,f12.6)') 'density:    ', loadCases(currentLoadCase)%density
-   if (loadCases(currentLoadCase)%time < 0.0_pReal)          errorID = 834_pInt                     ! negative time increment
-   write(6,'(2x,a,f12.6)') 'time:       ', loadCases(currentLoadCase)%time
-   if (loadCases(currentLoadCase)%incs < 1_pInt)             errorID = 835_pInt                     ! non-positive incs count
-   write(6,'(2x,a,i5)')    'increments: ', loadCases(currentLoadCase)%incs
-   if (loadCases(currentLoadCase)%outputfrequency < 1_pInt)  errorID = 836_pInt                     ! non-positive result frequency
-   write(6,'(2x,a,i5)')    'output  frequency:  ', &
-              loadCases(currentLoadCase)%outputfrequency
-   write(6,'(2x,a,i5,/)')    'restart frequency:  ', &
-              loadCases(currentLoadCase)%restartfrequency
-   if (errorID > 0_pInt) call IO_error(error_ID = errorID, ext_msg = loadcase_string)               ! exit with error message
- enddo checkLoadcases
+ if (worldrank == 0) then
+   checkLoadcases: do currentLoadCase = 1_pInt, size(loadCases)
+     write (loadcase_string, '(i6)' ) currentLoadCase
+     write(6,'(1x,a,i6)') 'load case: ', currentLoadCase
+     if (.not. loadCases(currentLoadCase)%followFormerTrajectory) &
+       write(6,'(2x,a)') 'drop guessing along trajectory'
+     if (loadCases(currentLoadCase)%deformation%myType=='l') then
+       do j = 1_pInt, 3_pInt
+         if (any(loadCases(currentLoadCase)%deformation%maskLogical(j,1:3) .eqv. .true.) .and. &
+             any(loadCases(currentLoadCase)%deformation%maskLogical(j,1:3) .eqv. .false.)) &
+                                                                    errorID = 832_pInt              ! each row should be either fully or not at all defined
+       enddo
+       write(6,'(2x,a)') 'velocity gradient:'
+     else if (loadCases(currentLoadCase)%deformation%myType=='f') then
+       write(6,'(2x,a)') 'deformation gradient at end of load case:'
+     else
+       write(6,'(2x,a)') 'deformation gradient rate:'
+     endif
+     write(6,'(3(3(3x,f12.7,1x)/))',advance='no') &
+                merge(math_transpose33(loadCases(currentLoadCase)%deformation%values), &
+                reshape(spread(huge(1.0_pReal),1,9),[ 3,3]), &                                      ! print *** (huge) for undefined
+                transpose(loadCases(currentLoadCase)%deformation%maskLogical))
+     if (any(loadCases(currentLoadCase)%P%maskLogical .eqv. &
+              loadCases(currentLoadCase)%deformation%maskLogical)) errorID = 831_pInt               ! exclusive or masking only
+     if (any(loadCases(currentLoadCase)%P%maskLogical .and. &                                   
+             transpose(loadCases(currentLoadCase)%P%maskLogical) .and. &
+             reshape([ .false.,.true.,.true.,.true.,.false.,.true.,.true.,.true.,.false.],[ 3,3]))) &
+             errorID = 838_pInt                                                                     ! no rotation is allowed by stress BC
+     write(6,'(2x,a,/,3(3(3x,f12.7,1x)/))',advance='no') 'stress / GPa:',&
+                1e-9_pReal*merge(math_transpose33(loadCases(currentLoadCase)%P%values),&
+                reshape(spread(huge(1.0_pReal),1,9),[ 3,3]),&
+                transpose(loadCases(currentLoadCase)%P%maskLogical))
+    if (any(abs(math_mul33x33(loadCases(currentLoadCase)%rotation, &
+                math_transpose33(loadCases(currentLoadCase)%rotation))-math_I3) >&
+                reshape(spread(tol_math_check,1,9),[ 3,3]))&
+                .or. abs(math_det33(loadCases(currentLoadCase)%rotation)) > &
+                1.0_pReal + tol_math_check) errorID = 846_pInt                                      ! given rotation matrix contains strain
+     if (any(loadCases(currentLoadCase)%rotation /= math_I3)) &
+       write(6,'(2x,a,/,3(3(3x,f12.7,1x)/))',advance='no') 'rotation of loadframe:',&
+                math_transpose33(loadCases(currentLoadCase)%rotation)
+     write(6,'(2x,a,f12.6)') 'temperature:', loadCases(currentLoadCase)%temperature
+     write(6,'(2x,a,f12.6)') 'density:    ', loadCases(currentLoadCase)%density
+     if (loadCases(currentLoadCase)%time < 0.0_pReal)          errorID = 834_pInt                   ! negative time increment
+     write(6,'(2x,a,f12.6)') 'time:       ', loadCases(currentLoadCase)%time
+     if (loadCases(currentLoadCase)%incs < 1_pInt)             errorID = 835_pInt                   ! non-positive incs count
+     write(6,'(2x,a,i5)')    'increments: ', loadCases(currentLoadCase)%incs
+     if (loadCases(currentLoadCase)%outputfrequency < 1_pInt)  errorID = 836_pInt                   ! non-positive result frequency
+     write(6,'(2x,a,i5)')    'output  frequency:  ', &
+                loadCases(currentLoadCase)%outputfrequency
+     write(6,'(2x,a,i5,/)')    'restart frequency:  ', &
+                loadCases(currentLoadCase)%restartfrequency
+     if (errorID > 0_pInt) call IO_error(error_ID = errorID, ext_msg = loadcase_string)             ! exit with error message
+   enddo checkLoadcases
+ endif
 
 !--------------------------------------------------------------------------------------------------
 ! doing initialization depending on selected solver 
  select case (spectral_solver)
-   case (DAMASK_spectral_SolverBasic_label)
-     call basic_init(loadCases(1)%temperature)
    case (DAMASK_spectral_SolverBasicPETSc_label)
      call basicPETSc_init(loadCases(1)%temperature)
    case (DAMASK_spectral_SolverAL_label)
-     if(iand(debug_level(debug_spectral),debug_levelBasic)/= 0) &
+     if(iand(debug_level(debug_spectral),debug_levelBasic)/= 0 .and. worldrank == 0_pInt) &
        call IO_warning(42_pInt, ext_msg='debug Divergence')
      call AL_init(loadCases(1)%temperature)
    case (DAMASK_spectral_SolverPolarisation_label)
-     if(iand(debug_level(debug_spectral),debug_levelBasic)/= 0) &
+     if(iand(debug_level(debug_spectral),debug_levelBasic)/= 0 .and. worldrank == 0_pInt) &
        call IO_warning(42_pInt, ext_msg='debug Divergence')
      call Polarisation_init(loadCases(1)%temperature)
    case default
@@ -330,34 +340,52 @@ program DAMASK_spectral_Driver
  
 !--------------------------------------------------------------------------------------------------
 ! write header of output file
- if (appendToOutFile) then                                                                          ! after restart, append to existing results file
-   open(newunit=resUnit,file=trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//&
-                               '.spectralOut',form='UNFORMATTED', position='APPEND', status='OLD')
-   open(newunit=statUnit,file=trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//&
-                               '.sta',form='FORMATTED', position='APPEND', status='OLD')
- else                                                                                               ! open new files ...
-   open(newunit=resUnit,file=trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//&
-                               '.spectralOut',form='UNFORMATTED',status='REPLACE')
-   write(resUnit) 'load:',       trim(loadCaseFile)                                                 ! ... and write header
-   write(resUnit) 'workingdir:', trim(getSolverWorkingDirectoryName())
-   write(resUnit) 'geometry:',   trim(geometryFile)
-   write(resUnit) 'grid:',       grid
-   write(resUnit) 'size:',       geomSize
-   write(resUnit) 'materialpoint_sizeResults:', materialpoint_sizeResults
-   write(resUnit) 'loadcases:',  size(loadCases)
-   write(resUnit) 'frequencies:', loadCases%outputfrequency                                         ! one entry per currentLoadCase
-   write(resUnit) 'times:',      loadCases%time                                                     ! one entry per currentLoadCase
-   write(resUnit) 'logscales:',  loadCases%logscale
-   write(resUnit) 'increments:', loadCases%incs                                                     ! one entry per currentLoadCase
-   write(resUnit) 'startingIncrement:', restartInc - 1_pInt                                         ! start with writing out the previous inc
-   write(resUnit) 'eoh'                                                                             ! end of header
-   write(resUnit) materialpoint_results                                                             ! initial (non-deformed or read-in) results
-   open(newunit=statUnit,file=trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//&
-                               '.sta',form='FORMATTED',status='REPLACE')
-   write(statUnit,'(a)') 'Increment Time CutbackLevel Converged IterationsNeeded'                   ! statistics file
-   if (iand(debug_level(debug_spectral),debug_levelBasic) /= 0) &
-     write(6,'(/,a)') ' header of result file written out'
-   flush(6)
+ if (.not. appendToOutFile) then                                                                      ! after restart, append to existing results file
+   if (worldrank == 0) then
+     open(newunit=resUnit,file=trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//&
+                                 '.spectralOut',form='UNFORMATTED',status='REPLACE')
+     write(resUnit) 'load:',       trim(loadCaseFile)                                                 ! ... and write header
+     write(resUnit) 'workingdir:', trim(getSolverWorkingDirectoryName())
+     write(resUnit) 'geometry:',   trim(geometryFile)
+     write(resUnit) 'grid:',       gridGlobal
+     write(resUnit) 'size:',       geomSizeGlobal
+     write(resUnit) 'materialpoint_sizeResults:', materialpoint_sizeResults
+     write(resUnit) 'loadcases:',  size(loadCases)
+     write(resUnit) 'frequencies:', loadCases%outputfrequency                                         ! one entry per currentLoadCase
+     write(resUnit) 'times:',      loadCases%time                                                     ! one entry per currentLoadCase
+     write(resUnit) 'logscales:',  loadCases%logscale
+     write(resUnit) 'increments:', loadCases%incs                                                     ! one entry per currentLoadCase
+     write(resUnit) 'startingIncrement:', restartInc - 1_pInt                                         ! start with writing out the previous inc
+     write(resUnit) 'eoh'    
+     close(resUnit)                                                                                   ! end of header
+   endif
+ endif  
+ allocate(outputSize(worldsize), source = 0_pInt); outputSize(worldrank+1) = sizeof(materialpoint_results)
+ call MPI_Allreduce(MPI_IN_PLACE,outputSize,worldsize,MPI_INT,MPI_SUM,PETSC_COMM_WORLD,ierr)
+ call MPI_File_open(PETSC_COMM_WORLD, &
+                    trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//'.spectralOut', &
+                    MPI_MODE_WRONLY + MPI_MODE_APPEND, &
+                    MPI_INFO_NULL, &
+                    resUnit, &
+                    ierr)
+ call MPI_File_get_position(resUnit,my_offset,ierr)
+ my_offset = my_offset + sum(outputSize(1:worldrank))
+ call MPI_File_seek (resUnit,my_offset,MPI_SEEK_SET,ierr)
+ call MPI_File_write(resUnit, materialpoint_results, size(materialpoint_results), &
+                     MPI_DOUBLE, MPI_STATUS_IGNORE, ierr)
+ if (iand(debug_level(debug_spectral),debug_levelBasic) /= 0 .and. worldrank == 0_pInt) &
+   write(6,'(/,a)') ' header of result file written out'
+ flush(6)
+
+ if (worldrank == 0) then
+   if (appendToOutFile) then                                                                        ! after restart, append to existing results file
+     open(newunit=statUnit,file=trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//&
+                                 '.sta',form='FORMATTED', position='APPEND', status='OLD')
+   else                                                                                             ! open new files ...
+     open(newunit=statUnit,file=trim(getSolverWorkingDirectoryName())//trim(getSolverJobName())//&
+                                 '.sta',form='FORMATTED',status='REPLACE')
+     write(statUnit,'(a)') 'Increment Time CutbackLevel Converged IterationsNeeded'                 ! statistics file
+   endif
  endif
 
 !--------------------------------------------------------------------------------------------------
@@ -405,7 +433,6 @@ program DAMASK_spectral_Driver
 !--------------------------------------------------------------------------------------------------
 ! forward solution 
          select case(spectral_solver)
-           case (DAMASK_spectral_SolverBasic_label)
            case (DAMASK_spectral_SolverBasicPETSC_label)
              call BasicPETSC_forward (&
                  guess,timeinc,timeIncOld,remainingLoadCaseTime, &
@@ -430,31 +457,26 @@ program DAMASK_spectral_Driver
            
 !--------------------------------------------------------------------------------------------------
 ! report begin of new increment
-         write(6,'(/,a)') ' ###########################################################################'
-         write(6,'(1x,a,es12.5'//&
-                 ',a,'//IO_intOut(inc)//',a,'//IO_intOut(loadCases(currentLoadCase)%incs)//&
-                 ',a,'//IO_intOut(stepFraction)//',a,'//IO_intOut(subStepFactor**cutBackLevel)//&
-                 ',a,'//IO_intOut(currentLoadCase)//',a,'//IO_intOut(size(loadCases))//')') &
-                 'Time', time, &
-                 's: Increment ', inc, '/', loadCases(currentLoadCase)%incs,&
-                 '-', stepFraction, '/', subStepFactor**cutBackLevel,&
-                 ' of load case ', currentLoadCase,'/',size(loadCases)
-         flush(6)
-         write(incInfo,'(a,'//IO_intOut(totalIncsCounter)//',a,'//IO_intOut(sum(loadCases%incs))//&
-               ',a,'//IO_intOut(stepFraction)//',a,'//IO_intOut(subStepFactor**cutBackLevel)//')') &
-               'Increment ',totalIncsCounter,'/',sum(loadCases%incs),&
-               '-',stepFraction, '/', subStepFactor**cutBackLevel
-         select case(spectral_solver)
+         if (worldrank == 0) then
+           write(6,'(/,a)') ' ###########################################################################'
+           write(6,'(1x,a,es12.5'//&
+                   ',a,'//IO_intOut(inc)//',a,'//IO_intOut(loadCases(currentLoadCase)%incs)//&
+                   ',a,'//IO_intOut(stepFraction)//',a,'//IO_intOut(subStepFactor**cutBackLevel)//&
+                   ',a,'//IO_intOut(currentLoadCase)//',a,'//IO_intOut(size(loadCases))//')') &
+                   'Time', time, &
+                   's: Increment ', inc, '/', loadCases(currentLoadCase)%incs,&
+                   '-', stepFraction, '/', subStepFactor**cutBackLevel,&
+                   ' of load case ', currentLoadCase,'/',size(loadCases)
+           flush(6)
+           write(incInfo,'(a,'//IO_intOut(totalIncsCounter)//',a,'//IO_intOut(sum(loadCases%incs))//&
+                 ',a,'//IO_intOut(stepFraction)//',a,'//IO_intOut(subStepFactor**cutBackLevel)//')') &
+                 'Increment ',totalIncsCounter,'/',sum(loadCases%incs),&
+                 '-',stepFraction, '/', subStepFactor**cutBackLevel
+         endif     
          
 !--------------------------------------------------------------------------------------------------
 ! calculate solution 
-           case (DAMASK_spectral_SolverBasic_label)
-             solres = basic_solution (&
-                 incInfo,guess,timeinc,timeIncOld,remainingLoadCaseTime, &
-                 P_BC               = loadCases(currentLoadCase)%P, &
-                 F_BC               = loadCases(currentLoadCase)%deformation, &
-                 temperature_bc     = loadCases(currentLoadCase)%temperature, &
-                 rotation_BC        = loadCases(currentLoadCase)%rotation)
+         select case(spectral_solver)
            case (DAMASK_spectral_SolverBasicPETSC_label)
              solres = BasicPETSC_solution (&
                  incInfo,guess,timeinc,timeIncOld,remainingLoadCaseTime, &
@@ -486,7 +508,7 @@ program DAMASK_spectral_Driver
          cutBack = .False.                                                                   
          if(solres%termIll .or. .not. solres%converged) then                                        ! no solution found
            if (cutBackLevel < maxCutBack) then                                                      ! do cut back
-             write(6,'(/,a)') ' cut back detected'
+             if (worldrank == 0) write(6,'(/,a)') ' cut back detected'
              cutBack = .True.
              stepFraction = (stepFraction - 1_pInt) * subStepFactor                                 ! adjust to new denominator
              cutBackLevel = cutBackLevel + 1_pInt
@@ -505,28 +527,36 @@ program DAMASK_spectral_Driver
          else
            guess = .true.                                                                           ! start guessing after first converged (sub)inc
          endif
-         if (.not. cutBack) &
-           write(statUnit,*) totalIncsCounter, time, cutBackLevel, &
-                             solres%converged, solres%iterationsNeeded                              ! write statistics about accepted solution
-           flush(statUnit)
+         if (.not. cutBack) then
+           if (worldrank == 0) then
+             write(statUnit,*) totalIncsCounter, time, cutBackLevel, &
+                               solres%converged, solres%iterationsNeeded                            ! write statistics about accepted solution
+             flush(statUnit)
+           endif 
+         endif  
        enddo subIncLooping
        cutBackLevel = max(0_pInt, cutBackLevel - 1_pInt)                                            ! try half number of subincs next inc
        if(solres%converged) then                                                                    ! report converged inc
          convergedCounter = convergedCounter + 1_pInt
-         write(6,'(/,a,'//IO_intOut(totalIncsCounter)//',a)') &
+         if (worldrank == 0) &
+           write(6,'(/,a,'//IO_intOut(totalIncsCounter)//',a)') &
                                      ' increment ', totalIncsCounter, ' converged'
        else
-         write(6,'(/,a,'//IO_intOut(totalIncsCounter)//',a)') &                                     ! report non-converged inc
+         if (worldrank == 0) &
+           write(6,'(/,a,'//IO_intOut(totalIncsCounter)//',a)') &                                   ! report non-converged inc
                                      ' increment ', totalIncsCounter, ' NOT converged'
          notConvergedCounter = notConvergedCounter + 1_pInt
        endif; flush(6)
        if (mod(inc,loadCases(currentLoadCase)%outputFrequency) == 0_pInt) then                      ! at output frequency
-         write(6,'(1/,a)') ' ... writing results to file ......................................'
-         write(resUnit)  materialpoint_results                                                      ! write result to file
-         flush(resUnit)
+         if (worldrank == 0) &
+           write(6,'(1/,a)') ' ... writing results to file ......................................'
+         my_offset = my_offset + sum(outputSize)
+         call MPI_File_seek (resUnit,my_offset,MPI_SEEK_SET,ierr)
+         call MPI_File_write(resUnit, materialpoint_results, size(materialpoint_results), &
+                             MPI_DOUBLE, MPI_STATUS_IGNORE, ierr)
        endif
        if( loadCases(currentLoadCase)%restartFrequency > 0_pInt .and. &                             ! at frequency of writing restart information set restart parameter for FEsolving 
-                      mod(inc,loadCases(currentLoadCase)%restartFrequency) == 0_pInt) then          ! ToDo first call to CPFEM_general will write? 
+                      mod(inc,loadCases(currentLoadCase)%restartFrequency) == 0_pInt) then          ! first call to CPFEM_general will write? 
          restartWrite = .true.
          lastRestartWritten = inc
        endif 
@@ -538,9 +568,20 @@ program DAMASK_spectral_Driver
     enddo incLooping
  enddo loadCaseLooping
  
+!--------------------------------------------------------------------------------------------------
+! report summary of whole calculation
+ if (worldrank == 0) then
+   write(6,'(/,a)') ' ###########################################################################'
+   write(6,'(1x,i6.6,a,i6.6,a,f5.1,a)') convergedCounter, ' out of ', &
+                                     notConvergedCounter + convergedCounter, ' (', &
+                                     real(convergedCounter, pReal)/&
+                                     real(notConvergedCounter + convergedCounter,pReal)*100.0_pReal, &
+                                     ' %) increments converged!'
+ endif
+ call MPI_file_close(resUnit,ierr)
+ close(statUnit)
+
  select case (spectral_solver)
-   case (DAMASK_spectral_SolverBasic_label)
-     call basic_destroy()
    case (DAMASK_spectral_SolverBasicPETSC_label)
      call BasicPETSC_destroy()
    case (DAMASK_spectral_SolverAL_label)
@@ -549,16 +590,6 @@ program DAMASK_spectral_Driver
      call Polarisation_destroy()
  end select
  
-!--------------------------------------------------------------------------------------------------
-! report summary of whole calculation
- write(6,'(/,a)') ' ###########################################################################'
- write(6,'(1x,i6.6,a,i6.6,a,f5.1,a)') convergedCounter, ' out of ', &
-                                   notConvergedCounter + convergedCounter, ' (', &
-                                   real(convergedCounter, pReal)/&
-                                   real(notConvergedCounter + convergedCounter,pReal)*100.0_pReal, &
-                                   ' %) increments converged!'
- close(resUnit)
- close(statUnit)
  if (notConvergedCounter > 0_pInt) call quit(3_pInt)                                                ! error if some are not converged
  call quit(0_pInt)                                                                                  ! no complains ;)
 
@@ -576,22 +607,28 @@ end program DAMASK_spectral_Driver
 subroutine quit(stop_id)
  use prec, only: &
    pInt
+ use numerics, only: &
+   worldrank  
    
  implicit none
  integer(pInt), intent(in) :: stop_id
  integer, dimension(8) :: dateAndTime                                                               ! type default integer
 
- call date_and_time(values = dateAndTime)
- write(6,'(/,a)') 'DAMASK terminated on:'
- write(6,'(a,2(i2.2,a),i4.4)') 'Date:               ',dateAndTime(3),'/',&
-                                                      dateAndTime(2),'/',&
-                                                      dateAndTime(1) 
- write(6,'(a,2(i2.2,a),i2.2)') 'Time:               ',dateAndTime(5),':',&
-                                                      dateAndTime(6),':',&
-                                                      dateAndTime(7)  
+ if (worldrank == 0_pInt) then
+   call date_and_time(values = dateAndTime)
+   write(6,'(/,a)') 'DAMASK terminated on:'
+   write(6,'(a,2(i2.2,a),i4.4)') 'Date:               ',dateAndTime(3),'/',&
+                                                        dateAndTime(2),'/',&
+                                                        dateAndTime(1) 
+   write(6,'(a,2(i2.2,a),i2.2)') 'Time:               ',dateAndTime(5),':',&
+                                                        dateAndTime(6),':',&
+                                                        dateAndTime(7)  
+ endif
+ 
  if (stop_id == 0_pInt) stop 0                                                                      ! normal termination
  if (stop_id <  0_pInt) then                                                                        ! trigger regridding
-   write(0,'(a,i6)') 'restart information available at ', stop_id*(-1_pInt)
+   if (worldrank == 0_pInt) &
+     write(0,'(a,i6)') 'restart information available at ', stop_id*(-1_pInt)
    stop 2
  endif
  if (stop_id == 3_pInt) stop 3                                                                      ! not all incs converged
diff --git a/code/DAMASK_spectral_interface.f90 b/code/DAMASK_spectral_interface.f90
index 8de94cbf3..3f9c795fa 100644
--- a/code/DAMASK_spectral_interface.f90
+++ b/code/DAMASK_spectral_interface.f90
@@ -84,7 +84,7 @@ subroutine DAMASK_interface_init(loadCaseParameterIn,geometryParameterIn)
 !--------------------------------------------------------------------------------------------------
 ! PETSc Init
 #ifdef PETSc
-  call PetscInitialize(PETSC_NULL_CHARACTER,ierr)                                                    ! according to PETSc manual, that should be the first line in the code
+  call PetscInitialize(PETSC_NULL_CHARACTER,ierr)                                                   ! according to PETSc manual, that should be the first line in the code
  CHKERRQ(ierr)                                                                                      ! this is a macro definition, it is case sensitive
 
  open(6, encoding='UTF-8')                                                                          ! modern fortran compilers (gfortran >4.4, ifort >11 support it)
@@ -161,7 +161,7 @@ subroutine DAMASK_interface_init(loadCaseParameterIn,geometryParameterIn)
            write(6,'(a)')  ' Help:'
            write(6,'(/,a)')'   --help'
            write(6,'(a,/)')'        Prints this message and exits'
-           call quit(0_pInt)                                                                              ! normal Termination
+           call quit(0_pInt)                                                                        ! normal Termination
          endif mainProcess2
        case ('-l', '--load', '--loadcase')
          loadcaseArg = IIO_stringValue(commandLine,positions,i+1_pInt)
diff --git a/code/DAMASK_spectral_solverAL.f90 b/code/DAMASK_spectral_solverAL.f90
index b81472dc9..1ccf47a8d 100644
--- a/code/DAMASK_spectral_solverAL.f90
+++ b/code/DAMASK_spectral_solverAL.f90
@@ -45,11 +45,9 @@ module DAMASK_spectral_solverAL
 ! common pointwise data
  real(pReal), private, dimension(:,:,:,:,:), allocatable :: &
    F_lastInc, &                                                                                     !< field of previous compatible deformation gradients
-   F_lastInc2, &                                                                                    !< field of 2nd previous compatible deformation gradients
    F_lambda_lastInc, &                                                                              !< field of previous incompatible deformation gradient 
    Fdot, &                                                                                          !< field of assumed rate of compatible deformation gradient
    F_lambdaDot                                                                                      !< field of assumed rate of incopatible deformation gradient
- complex(pReal),private, dimension(:,:,:,:,:), allocatable :: inertiaField_fourier
 
 !--------------------------------------------------------------------------------------------------
 ! stress, stiffness and compliance average etc.
@@ -70,7 +68,7 @@ module DAMASK_spectral_solverAL
    S_scale = 0.0_pReal
  
  real(pReal), private :: &
-   err_BC, &                                                                                    !< deviation from stress BC
+   err_BC, &                                                                                        !< deviation from stress BC
    err_curl, &                                                                                      !< RMS of curl of F
    err_div                                                                                          !< RMS of div of P
  logical, private :: ForwardData
@@ -118,19 +116,21 @@ subroutine AL_init(temperature)
   debug_spectralRestart
  use FEsolving, only: &
    restartInc
+ use numerics, only: &
+   worldrank, &
+   worldsize
  use DAMASK_interface, only: &
    getSolverJobName
  use DAMASK_spectral_Utilities, only: &
    Utilities_init, &
    Utilities_constitutiveResponse, &
    Utilities_updateGamma, &
-   grid, &
+   Utilities_updateIPcoords, &
    grid1Red, &
-   geomSize, &
    wgt
  use mesh, only: &
-   mesh_ipCoordinates, &
-   mesh_deformedCoordsFFT
+   gridLocal, &
+   gridGlobal
  use math, only: &
    math_invSym3333
    
@@ -144,30 +144,41 @@ subroutine AL_init(temperature)
  PetscErrorCode :: ierr
  PetscObject :: dummy
  PetscScalar, pointer, dimension(:,:,:,:) :: xx_psc, F, F_lambda
+ integer(pInt), dimension(:), allocatable :: localK  
+ integer(pInt) :: proc
+ character(len=1024) :: rankStr
  
  call Utilities_init()
- write(6,'(/,a)') ' <<<+-  DAMASK_spectral_solverAL init  -+>>>'
- write(6,'(a)') ' $Id$'
- write(6,'(a15,a)')   ' Current time: ',IO_timeStamp()
+ if (worldrank == 0_pInt) then
+   write(6,'(/,a)') ' <<<+-  DAMASK_spectral_solverAL init  -+>>>'
+   write(6,'(a)') ' $Id$'
+   write(6,'(a15,a)')   ' Current time: ',IO_timeStamp()
 #include "compilation_info.f90"
+ endif
 
- allocate (P            (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
+ allocate (P               (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
 !--------------------------------------------------------------------------------------------------
 ! allocate global fields
- allocate (F_lastInc    (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (F_lastInc2   (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (Fdot         (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (F_lambda_lastInc(3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (F_lambdaDot     (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (inertiaField_fourier (grid1Red,grid(2),grid(3),3,3),source = cmplx(0.0_pReal,0.0_pReal,pReal))
+ allocate (F_lastInc       (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
+ allocate (Fdot            (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
+ allocate (F_lambda_lastInc(3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
+ allocate (F_lambdaDot     (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
     
 !--------------------------------------------------------------------------------------------------
 ! PETSc Init
  call SNESCreate(PETSC_COMM_WORLD,snes,ierr); CHKERRQ(ierr)
- call DMDACreate3d(PETSC_COMM_WORLD,                               &
-           DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, &
-           DMDA_STENCIL_BOX,grid(1),grid(2),grid(3),PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE, &
-           18,1,PETSC_NULL_INTEGER,PETSC_NULL_INTEGER,PETSC_NULL_INTEGER,da,ierr)
+ allocate(localK(worldsize), source = 0); localK(worldrank+1) = gridLocal(3)
+ do proc = 1, worldsize
+   call MPI_Bcast(localK(proc),1,MPI_INTEGER,proc-1,PETSC_COMM_WORLD,ierr)
+ enddo  
+ call DMDACreate3d(PETSC_COMM_WORLD, &
+        DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, &                                     ! cut off stencil at boundary
+        DMDA_STENCIL_BOX, &                                                                         ! Moore (26) neighborhood around central point
+        gridGlobal(1),gridGlobal(2),gridGlobal(3), &                                                ! global grid
+        1 , 1, worldsize, &
+        18, 0, &                                                                                    ! #dof (F tensor), ghost boundary width (domain overlap)
+        gridLocal (1),gridLocal (2),localK, &                                                       ! local grid
+        da,ierr)                                                                                    ! handle, error
  CHKERRQ(ierr)
  call DMCreateGlobalVector(da,solution_vec,ierr); CHKERRQ(ierr)
  call DMDASNESSetFunctionLocal(da,INSERT_VALUES,AL_formResidual,dummy,ierr)
@@ -183,53 +194,51 @@ subroutine AL_init(temperature)
  F => xx_psc(0:8,:,:,:)
  F_lambda => xx_psc(9:17,:,:,:)
  if (restartInc == 1_pInt) then                                                                     ! no deformation (no restart)
-   F_lastInc     = spread(spread(spread(math_I3,3,grid(1)),4,grid(2)),5,grid(3))                    ! initialize to identity
-   F_lastInc2 = F_lastInc
-   F = reshape(F_lastInc,[9,grid(1),grid(2),grid(3)])
+   F_lastInc     = spread(spread(spread(math_I3,3,gridLocal(1)),4,gridLocal(2)),5,gridLocal(3))     ! initialize to identity
+   F = reshape(F_lastInc,[9,gridLocal(1),gridLocal(2),gridLocal(3)])
    F_lambda = F
    F_lambda_lastInc = F_lastInc
 
  elseif (restartInc > 1_pInt) then                                                                  ! read in F to calculate coordinates and initialize CPFEM general
-   if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0) &
+   if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0 .and. worldrank == 0_pInt) &
      write(6,'(/,a,'//IO_intOut(restartInc-1_pInt)//',a)') &
      'reading values of increment ', restartInc - 1_pInt, ' from file'
    flush(6)
-   call IO_read_realFile(777,'F', trim(getSolverJobName()),size(F))
+   write(rankStr,'(a1,i0)')'_',worldrank
+   call IO_read_realFile(777,'F'//trim(rankStr), trim(getSolverJobName()),size(F))
    read (777,rec=1) F
    close (777)
-   call IO_read_realFile(777,'F_lastInc', trim(getSolverJobName()),size(F_lastInc))
+   call IO_read_realFile(777,'F_lastInc'//trim(rankStr), trim(getSolverJobName()),size(F_lastInc))
    read (777,rec=1) F_lastInc
    close (777)
-   call IO_read_realFile(777,'F_lastInc2', trim(getSolverJobName()),size(F_lastInc2))
-   read (777,rec=1) F_lastInc2
-   close (777)
-   call IO_read_realFile(777,'F_lambda',trim(getSolverJobName()),size(F_lambda))
+   call IO_read_realFile(777,'F_lambda'//trim(rankStr),trim(getSolverJobName()),size(F_lambda))
    read (777,rec=1) F_lambda
    close (777)
-   call IO_read_realFile(777,'F_lambda_lastInc',&
+   call IO_read_realFile(777,'F_lambda_lastInc'//trim(rankStr),&
                                         trim(getSolverJobName()),size(F_lambda_lastInc))
    read (777,rec=1) F_lambda_lastInc
    close (777)
-   call IO_read_realFile(777,'F_aim', trim(getSolverJobName()),size(F_aim))
-   read (777,rec=1) F_aim
-   close (777)
-   call IO_read_realFile(777,'F_aim_lastInc', trim(getSolverJobName()),size(F_aim_lastInc))
-   read (777,rec=1) F_aim_lastInc
-   close (777)
-   call IO_read_realFile(777,'F_aimDot',trim(getSolverJobName()),size(f_aimDot))
-   read (777,rec=1) f_aimDot
-   close (777)
+   if (worldrank == 0_pInt) then
+     call IO_read_realFile(777,'F_aim', trim(getSolverJobName()),size(F_aim))
+     read (777,rec=1) F_aim
+     close (777)
+     call IO_read_realFile(777,'F_aim_lastInc', trim(getSolverJobName()),size(F_aim_lastInc))
+     read (777,rec=1) F_aim_lastInc
+     close (777)
+     call IO_read_realFile(777,'F_aimDot',trim(getSolverJobName()),size(f_aimDot))
+     read (777,rec=1) f_aimDot
+     close (777)
+   endif
  endif
  
- mesh_ipCoordinates = reshape(mesh_deformedCoordsFFT(geomSize,reshape(&
-                              F,[3,3,grid(1),grid(2),grid(3)])),[3,1,product(grid)])
- call Utilities_constitutiveResponse(F_lastInc,reshape(F,[3,3,grid(1),grid(2),grid(3)]),&
-   temperature,0.0_pReal,P,C_volAvg,C_minMaxAvg,temp33_Real,.false.,math_I3)
+ call utilities_updateIPcoords(F)
+ call Utilities_constitutiveResponse(F_lastInc,F,&
+                   temperature,0.0_pReal,P,C_volAvg,C_minMaxAvg,temp33_Real,.false.,math_I3)
  nullify(F)
  nullify(F_lambda)
  call DMDAVecRestoreArrayF90(da,solution_vec,xx_psc,ierr); CHKERRQ(ierr)                            ! write data back to PETSc
                              
- if (restartInc > 1_pInt) then                                                                      ! using old values from files
+ if (restartInc > 1_pInt .and. worldrank == 0_pInt) then                                            ! using old values from files
    if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0) &
      write(6,'(/,a,'//IO_intOut(restartInc-1_pInt)//',a)') &
      'reading more values of increment', restartInc - 1_pInt, 'from file'
@@ -256,7 +265,8 @@ end subroutine AL_init
 !> @brief solution for the AL scheme with internal iterations
 !--------------------------------------------------------------------------------------------------
 type(tSolutionState) function &
-  AL_solution(incInfoIn,guess,timeinc,timeinc_old,loadCaseTime,P_BC,F_BC,temperature_bc,rotation_BC,density)
+  AL_solution(incInfoIn,guess,timeinc,timeinc_old,loadCaseTime,P_BC,F_BC,temperature_bc, &
+                                                                     rotation_BC,density)
  use numerics, only: &
    update_gamma
  use math, only: &
@@ -342,7 +352,10 @@ subroutine AL_formResidual(in,x_scal,f_scal,dummy,ierr)
    itmax, &
    itmin, &
    polarAlpha, &
-   polarBeta
+   polarBeta, &
+   worldrank
+ use mesh, only: &
+   gridLocal
  use IO, only: &
    IO_intOut
  use math, only: &
@@ -353,11 +366,9 @@ subroutine AL_formResidual(in,x_scal,f_scal,dummy,ierr)
    math_mul33x33, &
    PI
  use DAMASK_spectral_Utilities, only: &
-   grid, &
-   geomSize, &
    wgt, &
-   field_real, &
-   field_fourier, &
+   field_realMPI, &
+   field_fourierMPI, &
    Utilities_FFTforward, &
    Utilities_fourierConvolution, &
    Utilities_inverseLaplace, &
@@ -371,6 +382,8 @@ subroutine AL_formResidual(in,x_scal,f_scal,dummy,ierr)
    debug_spectralRotation
  use homogenization, only: &
    materialpoint_dPdF
+ use FEsolving, only: &
+   terminallyIll
 
  implicit none
 !--------------------------------------------------------------------------------------------------
@@ -410,43 +423,30 @@ subroutine AL_formResidual(in,x_scal,f_scal,dummy,ierr)
  call SNESGetIterationNumber(snes,PETScIter,ierr); CHKERRQ(ierr)
 
  F_av = sum(sum(sum(F,dim=5),dim=4),dim=3) * wgt
+ call MPI_Allreduce(MPI_IN_PLACE,F_av,9,MPI_DOUBLE,MPI_SUM,PETSC_COMM_WORLD,ierr)
  
  if(nfuncs== 0 .and. PETScIter == 0) totalIter = -1_pInt                                            ! new increment
  if(totalIter <= PETScIter) then                                                                    ! new iteration
 !--------------------------------------------------------------------------------------------------
 ! report begin of new iteration
    totalIter = totalIter + 1_pInt
-   write(6,'(1x,a,3(a,'//IO_intOut(itmax)//'))') trim(incInfo), &
+   if (worldrank == 0_pInt) then
+     write(6,'(1x,a,3(a,'//IO_intOut(itmax)//'))') trim(incInfo), &
                     ' @ Iteration ', itmin, '≤',totalIter, '≤', itmax
-   if (iand(debug_level(debug_spectral),debug_spectralRotation) /= 0) &
-     write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim (lab) =', &
-                                   math_transpose33(math_rotate_backward33(F_aim,params%rotation_BC))
+     if (iand(debug_level(debug_spectral),debug_spectralRotation) /= 0) &
+       write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim (lab) =', &
+                             math_transpose33(math_rotate_backward33(F_aim,params%rotation_BC))
    write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim =', &
                                  math_transpose33(F_aim)
+   endif
    flush(6)
  endif
 
-!--------------------------------------------------------------------------------------------------
-! evaluate inertia
- dynamic: if (params%density > 0.0_pReal) then
-   residual_F = ((F - F_lastInc)/params%timeinc - (F_lastInc - F_lastInc2)/params%timeincOld)/&
-                                                  ((params%timeinc + params%timeincOld)/2.0_pReal)
-   residual_F = params%density*product(geomSize/grid)*residual_F
-   field_real = 0.0_pReal
-   field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3) = reshape(residual_F,[grid(1),grid(2),grid(3),3,3],&
-                                                               order=[4,5,1,2,3])                   ! field real has a different order
-   call Utilities_FFTforward()
-   call Utilities_inverseLaplace()
-   inertiaField_fourier = field_fourier
- else dynamic
-   inertiaField_fourier = cmplx(0.0_pReal,0.0_pReal,pReal)
- endif dynamic
-
 !--------------------------------------------------------------------------------------------------
 ! 
- field_real = 0.0_pReal
- do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid(1)
-   field_real(i,j,k,1:3,1:3) = &
+ field_realMPI = 0.0_pReal
+ do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, gridLocal(1)
+   field_realMPI(1:3,1:3,i,j,k) = &
      polarBeta*math_mul3333xx33(C_scale,F(1:3,1:3,i,j,k) - math_I3) -&
      polarAlpha*math_mul33x33(F(1:3,1:3,i,j,k), &
                               math_mul3333xx33(C_scale,F_lambda(1:3,1:3,i,j,k) - math_I3))
@@ -456,26 +456,25 @@ subroutine AL_formResidual(in,x_scal,f_scal,dummy,ierr)
 !--------------------------------------------------------------------------------------------------
 ! doing convolution in Fourier space 
  call Utilities_FFTforward()
- field_fourier = field_fourier + polarAlpha*inertiaField_fourier
  call Utilities_fourierConvolution(math_rotate_backward33(polarBeta*F_aim,params%rotation_BC)) 
  call Utilities_FFTbackward()
 
 !--------------------------------------------------------------------------------------------------
 ! constructing residual                         
- residual_F_lambda = polarBeta*F - reshape(field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3),&
-                                      [3,3,grid(1),grid(2),grid(3)],order=[3,4,5,1,2])
+ residual_F_lambda = polarBeta*F - field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3))
 
 !--------------------------------------------------------------------------------------------------
 ! evaluate constitutive response
  P_avLastEval = P_av
  call Utilities_constitutiveResponse(F_lastInc,F - residual_F_lambda/polarBeta,params%temperature,params%timeinc, &
                                      residual_F,C_volAvg,C_minMaxAvg,P_av,ForwardData,params%rotation_BC)
+ call MPI_Allreduce(MPI_IN_PLACE,terminallyIll,1,MPI_LOGICAL,MPI_LOR,PETSC_COMM_WORLD,ierr)
  ForwardData = .False.
 
 !--------------------------------------------------------------------------------------------------
 ! calculate divergence
- field_real = 0.0_pReal
- field_real = reshape(residual_F,[grid(1),grid(2),grid(3),3,3],order=[4,5,1,2,3]) 
+ field_realMPI = 0.0_pReal
+ field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = residual_F
  call Utilities_FFTforward()
  err_div = Utilities_divergenceRMS()
  call Utilities_FFTbackward()
@@ -483,19 +482,19 @@ subroutine AL_formResidual(in,x_scal,f_scal,dummy,ierr)
 !--------------------------------------------------------------------------------------------------
 ! constructing residual
  e = 0_pInt
- do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid(1)
+ do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, gridLocal(1)
    e = e + 1_pInt
    residual_F(1:3,1:3,i,j,k) = math_mul3333xx33(math_invSym3333(materialpoint_dPdF(1:3,1:3,1:3,1:3,1,e) + C_scale), &
                                                 residual_F(1:3,1:3,i,j,k) - &
                                                 math_mul33x33(F(1:3,1:3,i,j,k), &
-                                                           math_mul3333xx33(C_scale,F_lambda(1:3,1:3,i,j,k) - math_I3))) &
-                            + residual_F_lambda(1:3,1:3,i,j,k)
+                                                math_mul3333xx33(C_scale,F_lambda(1:3,1:3,i,j,k) - math_I3))) &
+                                                + residual_F_lambda(1:3,1:3,i,j,k)
  enddo; enddo; enddo
  
 !--------------------------------------------------------------------------------------------------
 ! calculating curl 
- field_real = 0.0_pReal
- field_real = reshape(F,[grid(1),grid(2),grid(3),3,3],order=[4,5,1,2,3]) 
+ field_realMPI = 0.0_pReal
+ field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = F
  call Utilities_FFTforward()
  err_curl = Utilities_curlRMS()
  call Utilities_FFTbackward()
@@ -515,7 +514,8 @@ subroutine AL_converged(snes_local,PETScIter,xnorm,snorm,fnorm,reason,dummy,ierr
    err_curl_tolRel, &
    err_curl_tolAbs, &
    err_stress_tolAbs, &
-   err_stress_tolRel
+   err_stress_tolRel, &
+   worldrank
  use math, only: &
    math_mul3333xx33
  use FEsolving, only: &
@@ -562,15 +562,17 @@ subroutine AL_converged(snes_local,PETScIter,xnorm,snorm,fnorm,reason,dummy,ierr
 
 !--------------------------------------------------------------------------------------------------
 ! report
- write(6,'(1/,a)') ' ... reporting .............................................................'
- write(6,'(/,a,f12.2,a,es8.2,a,es9.2,a)') ' error curl =       ', &
+ if (worldrank == 0_pInt) then
+   write(6,'(1/,a)') ' ... reporting .............................................................'
+   write(6,'(/,a,f12.2,a,es8.2,a,es9.2,a)') ' error curl =       ', &
             err_curl/curlTol,' (',err_curl,' -,   tol =',curlTol,')'
- write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error divergence = ', &
+   write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error divergence = ', &
             err_div/divTol,  ' (',err_div, ' / m, tol =',divTol,')'
- write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error BC =         ', &
+   write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error BC =         ', &
             err_BC/BC_tol, ' (',err_BC,    ' Pa,  tol =',BC_tol,')' 
- write(6,'(/,a)') ' ==========================================================================='
- flush(6) 
+   write(6,'(/,a)') ' ==========================================================================='
+   flush(6) 
+ endif
 
 end subroutine AL_converged
 
@@ -583,16 +585,16 @@ subroutine AL_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,rotation_
    math_mul3333xx33, &
    math_transpose33, &
    math_rotate_backward33
+ use numerics, only: &
+   worldrank 
+ use mesh, only: &
+   gridLocal
  use DAMASK_spectral_Utilities, only: &
-   grid, &
-   geomSize, &
    Utilities_calculateRate, &
    Utilities_forwardField, &
+   Utilities_updateIPcoords, &
    tBoundaryCondition, &
    cutBack
- use mesh, only: &
-   mesh_ipCoordinates,&
-   mesh_deformedCoordsFFT
  use IO, only: &
    IO_write_JobRealFile
  use FEsolving, only: &
@@ -613,6 +615,7 @@ subroutine AL_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,rotation_
  PetscScalar, dimension(:,:,:,:), pointer :: xx_psc, F, F_lambda
  integer(pInt) :: i, j, k
  real(pReal), dimension(3,3) :: F_lambda33
+ character(len=1024) :: rankStr
 
 !--------------------------------------------------------------------------------------------------
 ! update coordinates and rate and forward last inc
@@ -620,47 +623,48 @@ subroutine AL_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,rotation_
  F => xx_psc(0:8,:,:,:)
  F_lambda => xx_psc(9:17,:,:,:)
  if (restartWrite) then
-   write(6,'(/,a)') ' writing converged results for restart'
-   flush(6)
-   call IO_write_jobRealFile(777,'F',size(F))                                                     ! writing deformation gradient field to file
+   if (worldrank == 0_pInt) then
+     write(6,'(/,a)') ' writing converged results for restart'
+     flush(6)
+   endif
+   write(rankStr,'(a1,i0)')'_',worldrank
+   call IO_write_jobRealFile(777,'F'//trim(rankStr),size(F))                                                       ! writing deformation gradient field to file
    write (777,rec=1) F
    close (777)
-   call IO_write_jobRealFile(777,'F_lastInc',size(F_lastInc))                                     ! writing F_lastInc field to file
+   call IO_write_jobRealFile(777,'F_lastInc'//trim(rankStr),size(F_lastInc))                                       ! writing F_lastInc field to file
    write (777,rec=1) F_lastInc
    close (777)
-   call IO_write_jobRealFile(777,'F_lastInc2',size(F_lastInc2))                                   ! writing F_lastInc field to file
-   write (777,rec=1) F_lastInc2
-   close (777)
-   call IO_write_jobRealFile(777,'F_lambda',size(F_lambda))                                       ! writing deformation gradient field to file
+   call IO_write_jobRealFile(777,'F_lambda'//trim(rankStr),size(F_lambda))                                         ! writing deformation gradient field to file
    write (777,rec=1) F_lambda
    close (777)
-   call IO_write_jobRealFile(777,'F_lambda_lastInc',size(F_lambda_lastInc))                       ! writing F_lastInc field to file
+   call IO_write_jobRealFile(777,'F_lambda_lastInc'//trim(rankStr),size(F_lambda_lastInc))                         ! writing F_lastInc field to file
    write (777,rec=1) F_lambda_lastInc
    close (777)
-   call IO_write_jobRealFile(777,'F_aim',size(F_aim))
-   write (777,rec=1) F_aim
-   close(777)
-   call IO_write_jobRealFile(777,'F_aim_lastInc',size(F_aim_lastInc))
-   write (777,rec=1) F_aim_lastInc
-   close(777)
-   call IO_write_jobRealFile(777,'F_aimDot',size(F_aimDot))
-   write (777,rec=1) F_aimDot
-   close(777)
-   call IO_write_jobRealFile(777,'C_volAvg',size(C_volAvg))
-   write (777,rec=1) C_volAvg
-   close(777)
-   call IO_write_jobRealFile(777,'C_volAvgLastInc',size(C_volAvgLastInc))
-   write (777,rec=1) C_volAvgLastInc
-   close(777)
+   if (worldrank == 0_pInt) then
+     call IO_write_jobRealFile(777,'F_aim',size(F_aim))
+     write (777,rec=1) F_aim
+     close(777)
+     call IO_write_jobRealFile(777,'F_aim_lastInc',size(F_aim_lastInc))
+     write (777,rec=1) F_aim_lastInc
+     close(777)
+     call IO_write_jobRealFile(777,'F_aimDot',size(F_aimDot))
+     write (777,rec=1) F_aimDot
+     close(777)
+     call IO_write_jobRealFile(777,'C_volAvg',size(C_volAvg))
+     write (777,rec=1) C_volAvg
+     close(777)
+     call IO_write_jobRealFile(777,'C_volAvgLastInc',size(C_volAvgLastInc))
+     write (777,rec=1) C_volAvgLastInc
+     close(777)
+   endif
 
  endif 
- mesh_ipCoordinates = reshape(mesh_deformedCoordsFFT(geomSize,reshape(&
-                                             F,[3,3,grid(1),grid(2),grid(3)])),[3,1,product(grid)])
+ call utilities_updateIPcoords(F)
 
  if (cutBack) then 
    F_aim    = F_aim_lastInc
-   F_lambda = reshape(F_lambda_lastInc,[9,grid(1),grid(2),grid(3)]) 
-   F        = reshape(F_lastInc,       [9,grid(1),grid(2),grid(3)]) 
+   F_lambda = reshape(F_lambda_lastInc,[9,gridLocal(1),gridLocal(2),gridLocal(3)]) 
+   F        = reshape(F_lastInc,       [9,gridLocal(1),gridLocal(2),gridLocal(3)]) 
    C_volAvg = C_volAvgLastInc
  else
    ForwardData = .True.
@@ -679,25 +683,24 @@ subroutine AL_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,rotation_
 
 !--------------------------------------------------------------------------------------------------
 ! update coordinates and rate and forward last inc
-   mesh_ipCoordinates = reshape(mesh_deformedCoordsFFT(geomSize,reshape(&
-                                            F,[3,3,grid(1),grid(2),grid(3)])),[3,1,product(grid)])
+   call utilities_updateIPcoords(F)
    Fdot =  Utilities_calculateRate(math_rotate_backward33(f_aimDot,rotation_BC), &
-                  timeinc_old,guess,F_lastInc,reshape(F,[3,3,grid(1),grid(2),grid(3)]))
+                  timeinc_old,guess,F_lastInc,reshape(F,[3,3,gridLocal(1),gridLocal(2),gridLocal(3)]))
    F_lambdaDot =  Utilities_calculateRate(math_rotate_backward33(f_aimDot,rotation_BC), &
-                  timeinc_old,guess,F_lambda_lastInc,reshape(F_lambda,[3,3,grid(1),grid(2),grid(3)]))  
-   F_lastInc2       = F_lastInc                
-   F_lastInc        = reshape(F,       [3,3,grid(1),grid(2),grid(3)])
-   F_lambda_lastInc = reshape(F_lambda,[3,3,grid(1),grid(2),grid(3)])
+                  timeinc_old,guess,F_lambda_lastInc,reshape(F_lambda,[3,3,gridLocal(1), &
+                                                          gridLocal(2),gridLocal(3)]))  
+   F_lastInc        = reshape(F,       [3,3,gridLocal(1),gridLocal(2),gridLocal(3)])
+   F_lambda_lastInc = reshape(F_lambda,[3,3,gridLocal(1),gridLocal(2),gridLocal(3)])
  endif
 
  F_aim = F_aim + f_aimDot * timeinc
  F = reshape(Utilities_forwardField(timeinc,F_lastInc,Fdot, &                                       ! ensure that it matches rotated F_aim
                                     math_rotate_backward33(F_aim,rotation_BC)), &
-             [9,grid(1),grid(2),grid(3)])
+             [9,gridLocal(1),gridLocal(2),gridLocal(3)])
  F_lambda = reshape(Utilities_forwardField(timeinc,F_lambda_lastInc,F_lambdadot), &
-                    [9,grid(1),grid(2),grid(3)])                                                    ! does not have any average value as boundary condition
+                    [9,gridLocal(1),gridLocal(2),gridLocal(3)])                                     ! does not have any average value as boundary condition
  if (.not. guess) then                                                                              ! large strain forwarding
-   do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid(1)
+   do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, gridLocal(1)
       F_lambda33 = reshape(F_lambda(1:9,i,j,k),[3,3])
       F_lambda33 = math_mul3333xx33(S_scale,math_mul33x33(F_lambda33, &
                                   math_mul3333xx33(C_scale,&
diff --git a/code/DAMASK_spectral_solverBasicPETSc.f90 b/code/DAMASK_spectral_solverBasicPETSc.f90
index 3c56c8138..05e33a019 100644
--- a/code/DAMASK_spectral_solverBasicPETSc.f90
+++ b/code/DAMASK_spectral_solverBasicPETSc.f90
@@ -42,8 +42,7 @@ module DAMASK_spectral_SolverBasicPETSc
 
 !--------------------------------------------------------------------------------------------------
 ! common pointwise data
- real(pReal), private, dimension(:,:,:,:,:), allocatable ::  F_lastInc, Fdot, F_lastInc2
- complex(pReal), private, dimension(:,:,:,:,:), allocatable :: inertiaField_fourier
+ real(pReal), private, dimension(:,:,:,:,:), allocatable ::  F_lastInc, Fdot
 
 !--------------------------------------------------------------------------------------------------
 ! stress, stiffness and compliance average etc.
@@ -105,6 +104,9 @@ subroutine basicPETSc_init(temperature)
    debug_spectralRestart
  use FEsolving, only: &
    restartInc
+ use numerics, only: &
+   worldrank, &
+   worldsize
  use DAMASK_interface, only: &
    getSolverJobName
  use DAMASK_spectral_Utilities, only: &
@@ -112,10 +114,12 @@ subroutine basicPETSc_init(temperature)
    Utilities_constitutiveResponse, &
    Utilities_updateGamma, &
    utilities_updateIPcoords, &
-   grid, &
    grid1Red, &
-   wgt, &
-   geomSize
+   wgt
+ use mesh, only: &
+   gridLocal, &
+   gridGlobal, &
+   mesh_ipCoordinates
  use math, only: &
    math_invSym3333
    
@@ -128,33 +132,40 @@ subroutine basicPETSc_init(temperature)
  PetscObject    :: dummy
  real(pReal), dimension(3,3) :: &
    temp33_Real = 0.0_pReal
+ integer(pInt), dimension(:), allocatable :: localK  
+ integer(pInt) :: proc
+ character(len=1024) :: rankStr
 
  call Utilities_init()
- write(6,'(/,a)') ' <<<+-  DAMASK_spectral_solverBasicPETSc init  -+>>>'
- write(6,'(a)') ' $Id$'
- write(6,'(a15,a)')   ' Current time: ',IO_timeStamp()
+ mainProcess: if (worldrank == 0_pInt) then
+   write(6,'(/,a)') ' <<<+-  DAMASK_spectral_solverBasicPETSc init  -+>>>'
+   write(6,'(a)') ' $Id$'
+   write(6,'(a15,a)')   ' Current time: ',IO_timeStamp()
 #include "compilation_info.f90"
+ endif mainProcess
 
- allocate (P        (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
+ allocate (P         (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
 !--------------------------------------------------------------------------------------------------
 ! allocate global fields
- allocate (F_lastInc (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (F_lastInc2(3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (Fdot      (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (inertiaField_fourier (grid1Red,grid(2),grid(3),3,3),source = cmplx(0.0_pReal,0.0_pReal,pReal))
+ allocate (F_lastInc (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
+ allocate (Fdot      (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
     
 !--------------------------------------------------------------------------------------------------
 ! initialize solver specific parts of PETSc
  call SNESCreate(PETSC_COMM_WORLD,snes,ierr); CHKERRQ(ierr)
+ allocate(localK(worldsize), source = 0); localK(worldrank+1) = gridLocal(3)
+ do proc = 1, worldsize
+   call MPI_Bcast(localK(proc),1,MPI_INTEGER,proc-1,PETSC_COMM_WORLD,ierr)
+ enddo  
  call DMDACreate3d(PETSC_COMM_WORLD, &
         DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, &                                     ! cut off stencil at boundary
         DMDA_STENCIL_BOX, &                                                                         ! Moore (26) neighborhood around central point
-        grid(1),grid(2),grid(3), &                                                                  ! overall grid
-        PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE, &                                                   ! domain decomposition strategy (or local (per core) grid)
-        9,1, &                                                                                      ! #dof (F tensor), ghost boundary width (domain overlap)
-        PETSC_NULL_INTEGER,PETSC_NULL_INTEGER,PETSC_NULL_INTEGER, &                                 ! todo
+        gridGlobal(1),gridGlobal(2),gridGlobal(3), &                                                ! global grid
+        1, 1, worldsize, &
+        9, 0, &                                                                                     ! #dof (F tensor), ghost boundary width (domain overlap)
+        gridLocal (1),gridLocal (2),localK, &                                                       ! local grid
         da,ierr)                                                                                    ! handle, error
-   CHKERRQ(ierr)
+ CHKERRQ(ierr)
  call DMCreateGlobalVector(da,solution_vec,ierr); CHKERRQ(ierr)                                     ! global solution vector (grid x 9, i.e. every def grad tensor)
  call DMDASNESSetFunctionLocal(da,INSERT_VALUES,BasicPETSC_formResidual,dummy,ierr)                 ! residual vector of same shape as solution vector
  CHKERRQ(ierr) 
@@ -168,33 +179,31 @@ subroutine basicPETSc_init(temperature)
  call DMDAVecGetArrayF90(da,solution_vec,F,ierr); CHKERRQ(ierr)                                     ! get the data out of PETSc to work with
 
  if (restartInc == 1_pInt) then                                                                     ! no deformation (no restart)
-   F_lastInc = spread(spread(spread(math_I3,3,grid(1)),4,grid(2)),5,grid(3))                        ! initialize to identity
-   F = reshape(F_lastInc,[9,grid(1),grid(2),grid(3)])
-   F_lastInc2 = F_lastInc
+   F_lastInc = spread(spread(spread(math_I3,3,gridLocal(1)),4,gridLocal(2)),5,gridLocal(3))         ! initialize to identity
+   F = reshape(F_lastInc,[9,gridLocal(1),gridLocal(2),gridLocal(3)])
  elseif (restartInc > 1_pInt) then                                                                  ! using old values from file                                                      
-   if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0) &
+   if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0 .and. worldrank == 0_pInt) &
      write(6,'(/,a,'//IO_intOut(restartInc-1_pInt)//',a)') &
      'reading values of increment ', restartInc - 1_pInt, ' from file'
    flush(6)
-   call IO_read_realFile(777,'F',trim(getSolverJobName()),size(F))
+   write(rankStr,'(a1,i0)')'_',worldrank
+   call IO_read_realFile(777,'F'//trim(rankStr),trim(getSolverJobName()),size(F))
    read (777,rec=1) F
    close (777)
-   call IO_read_realFile(777,'F_lastInc',trim(getSolverJobName()),size(F_lastInc))
+   call IO_read_realFile(777,'F_lastInc'//trim(rankStr),trim(getSolverJobName()),size(F_lastInc))
    read (777,rec=1) F_lastInc
    close (777)
-   call IO_read_realFile(777,'F_lastInc2',trim(getSolverJobName()),size(F_lastInc2))
-   read (777,rec=1) F_lastInc2
-   close (777)
-   call IO_read_realFile(777,'F_aimDot',trim(getSolverJobName()),size(f_aimDot))
-   read (777,rec=1) f_aimDot
-   close (777)
+   if (worldrank == 0_pInt) then
+     call IO_read_realFile(777,'F_aimDot',trim(getSolverJobName()),size(f_aimDot))
+     read (777,rec=1) f_aimDot
+     close (777)
+   endif
    F_aim         = reshape(sum(sum(sum(F,dim=4),dim=3),dim=2) * wgt, [3,3])                         ! average of F
    F_aim_lastInc = sum(sum(sum(F_lastInc,dim=5),dim=4),dim=3) * wgt                                 ! average of F_lastInc 
  endif
 
- call utilities_updateIPcoords(F)
- call Utilities_constitutiveResponse(F_lastInc, &
-    reshape(F(0:8,0:grid(1)-1_pInt,0:grid(2)-1_pInt,0:grid(3)-1_pInt),[3,3,grid(1),grid(2),grid(3)]), &
+ call Utilities_updateIPcoords(F)
+ call Utilities_constitutiveResponse(F_lastInc, F, &
     temperature, &
     0.0_pReal, &
     P, &
@@ -204,7 +213,7 @@ subroutine basicPETSc_init(temperature)
     math_I3)
  call DMDAVecRestoreArrayF90(da,solution_vec,F,ierr); CHKERRQ(ierr)                                 ! write data back to PETSc
 
- if (restartInc > 1_pInt) then                                                                      ! using old values from files                                                    
+ if (restartInc > 1_pInt .and. worldrank == 0_pInt) then                                                                      ! using old values from files                                                    
    if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0) &
      write(6,'(/,a,'//IO_intOut(restartInc-1_pInt)//',a)') &
      'reading more values of increment', restartInc - 1_pInt, 'from file'
@@ -228,10 +237,11 @@ end subroutine basicPETSc_init
 !> @brief solution for the Basic PETSC scheme with internal iterations
 !--------------------------------------------------------------------------------------------------
 type(tSolutionState) function basicPETSc_solution( &
-             incInfoIn,guess,timeinc,timeinc_old,loadCaseTime,P_BC,F_BC,temperature_bc,rotation_BC,density)
+     incInfoIn,guess,timeinc,timeinc_old,loadCaseTime,P_BC,F_BC,temperature_bc,rotation_BC,density)
  use numerics, only: &
    update_gamma, &
-   itmax
+   itmax, &
+   worldrank
  use DAMASK_spectral_Utilities, only: &
    tBoundaryCondition, &
    Utilities_maskedCompliance, &
@@ -305,6 +315,10 @@ subroutine BasicPETSC_formResidual(in,x_scal,f_scal,dummy,ierr)
  use numerics, only: &
    itmax, &
    itmin
+ use numerics, only: &
+   worldrank
+ use mesh, only: &
+   gridLocal
  use math, only: &
    math_rotate_backward33, &
    math_transpose33, &
@@ -314,11 +328,9 @@ subroutine BasicPETSC_formResidual(in,x_scal,f_scal,dummy,ierr)
    debug_spectral, &
    debug_spectralRotation
  use DAMASK_spectral_Utilities, only: &
-   grid, &
-   geomSize, &
    wgt, &
-   field_real, &
-   field_fourier, &
+   field_realMPI, &
+   field_fourierMPI, &
    Utilities_FFTforward, &
    Utilities_FFTbackward, &
    Utilities_fourierConvolution, &
@@ -327,13 +339,18 @@ subroutine BasicPETSC_formResidual(in,x_scal,f_scal,dummy,ierr)
    Utilities_divergenceRMS
  use IO, only: &
    IO_intOut 
+ use FEsolving, only: &
+   terminallyIll
 
  implicit none
  DMDALocalInfo, dimension(DMDA_LOCAL_INFO_SIZE) :: &
    in
- PetscScalar,   dimension(3,3,grid(1),grid(2),grid(3)) :: &
-   x_scal, &
-   f_scal 
+ PetscScalar, dimension(3,3, &
+   XG_RANGE,YG_RANGE,ZG_RANGE) :: &
+   x_scal
+ PetscScalar, dimension(3,3, &
+   X_RANGE,Y_RANGE,Z_RANGE) :: &
+   f_scal
  PetscInt :: &
    PETScIter, &
    nfuncs
@@ -348,36 +365,23 @@ subroutine BasicPETSC_formResidual(in,x_scal,f_scal,dummy,ierr)
 !--------------------------------------------------------------------------------------------------
 ! report begin of new iteration
    totalIter = totalIter + 1_pInt
-   write(6,'(1x,a,3(a,'//IO_intOut(itmax)//'))') trim(incInfo), &
+   if (worldrank == 0_pInt) then
+     write(6,'(1x,a,3(a,'//IO_intOut(itmax)//'))') trim(incInfo), &
                     ' @ Iteration ', itmin, '≤',totalIter, '≤', itmax
-   if (iand(debug_level(debug_spectral),debug_spectralRotation) /= 0) &
-   write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim (lab) =', &
+     if (iand(debug_level(debug_spectral),debug_spectralRotation) /= 0) &
+     write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim (lab) =', &
                                  math_transpose33(math_rotate_backward33(F_aim,params%rotation_BC))
-   write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim =', &
+     write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim =', &
                                  math_transpose33(F_aim)
+   endif
    flush(6)
- endif
-
-!--------------------------------------------------------------------------------------------------
-! evaluate inertia
-if (params%density > 0.0_pReal) then
-   f_scal = ((x_scal - F_lastInc)/params%timeinc - (F_lastInc - F_lastInc2)/params%timeincOld)/&
-                                                   ((params%timeinc + params%timeincOld)/2.0_pReal)
-   f_scal = params%density*product(geomSize/grid)*f_scal
-   field_real = 0.0_pReal
-   field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3) = reshape(f_scal,[grid(1),grid(2),grid(3),3,3],&
-                                                               order=[4,5,1,2,3])                   ! field real has a different order
-   call Utilities_FFTforward()
-   call Utilities_inverseLaplace()
-   inertiaField_fourier = field_fourier
- else
-   inertiaField_fourier = cmplx(0.0_pReal,0.0_pReal,pReal)
- endif  
+ endif 
 
 !--------------------------------------------------------------------------------------------------
 ! evaluate constitutive response
  call Utilities_constitutiveResponse(F_lastInc,x_scal,params%temperature,params%timeinc, &
                                      f_scal,C_volAvg,C_minmaxAvg,P_av,ForwardData,params%rotation_BC)
+ call MPI_Allreduce(MPI_IN_PLACE,terminallyIll,1,MPI_LOGICAL,MPI_LOR,PETSC_COMM_WORLD,ierr)
  ForwardData = .false.
   
 !--------------------------------------------------------------------------------------------------
@@ -388,18 +392,16 @@ if (params%density > 0.0_pReal) then
  
 !--------------------------------------------------------------------------------------------------
 ! updated deformation gradient using fix point algorithm of basic scheme
- field_real = 0.0_pReal
- field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3) = reshape(f_scal,[grid(1),grid(2),grid(3),3,3],&
-                                                             order=[4,5,1,2,3]) ! field real has a different order
+ field_realMPI = 0.0_pReal
+ field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = f_scal
  call Utilities_FFTforward()
- field_fourier = field_fourier + inertiaField_fourier
- err_divDummy = Utilities_divergenceRMS()
+ err_div = Utilities_divergenceRMS()
  call Utilities_fourierConvolution(math_rotate_backward33(F_aim_lastIter-F_aim,params%rotation_BC))
  call Utilities_FFTbackward()
  
 !--------------------------------------------------------------------------------------------------
 ! constructing residual
- f_scal = reshape(field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3),shape(f_scal),order=[3,4,5,1,2]) 
+ f_scal = field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3))
 
 end subroutine BasicPETSc_formResidual
 
@@ -414,7 +416,8 @@ subroutine BasicPETSc_converged(snes_local,PETScIter,xnorm,snorm,fnorm,reason,du
    err_div_tolRel, &
    err_div_tolAbs, &
    err_stress_tolRel, &
-   err_stress_tolAbs
+   err_stress_tolAbs, &
+   worldrank
  use FEsolving, only: &
    terminallyIll
  
@@ -434,7 +437,6 @@ subroutine BasicPETSc_converged(snes_local,PETScIter,xnorm,snorm,fnorm,reason,du
  
  divTol    = max(maxval(abs(P_av))*err_div_tolRel,err_div_tolAbs)
  stressTol = max(maxval(abs(P_av))*err_stress_tolrel,err_stress_tolabs)
- err_divPrev = err_div; err_div = err_divDummy
  
  converged: if ((totalIter >= itmin .and. &
                            all([ err_div/divTol, &
@@ -449,13 +451,15 @@ subroutine BasicPETSc_converged(snes_local,PETScIter,xnorm,snorm,fnorm,reason,du
 
 !--------------------------------------------------------------------------------------------------
 ! report
- write(6,'(1/,a)') ' ... reporting .............................................................'
- write(6,'(1/,a,f12.2,a,es8.2,a,es9.2,a)') ' error divergence = ', &
+ if (worldrank == 0_pInt) then
+   write(6,'(1/,a)') ' ... reporting .............................................................'
+   write(6,'(1/,a,f12.2,a,es8.2,a,es9.2,a)') ' error divergence = ', &
             err_div/divTol,  ' (',err_div,' / m, tol =',divTol,')'
- write(6,'(a,f12.2,a,es8.2,a,es9.2,a)')   ' error stress BC =  ', &
+   write(6,'(a,f12.2,a,es8.2,a,es9.2,a)')   ' error stress BC =  ', &
             err_stress/stressTol, ' (',err_stress, ' Pa,  tol =',stressTol,')' 
- write(6,'(/,a)') ' ==========================================================================='
- flush(6) 
+   write(6,'(/,a)') ' ==========================================================================='
+   flush(6) 
+ endif
  
 end subroutine BasicPETSc_converged
 
@@ -466,9 +470,9 @@ subroutine BasicPETSc_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,r
  use math, only: &
    math_mul33x33 ,&
    math_rotate_backward33
+ use mesh, only: &
+   gridLocal
  use DAMASK_spectral_Utilities, only: &
-   grid, &
-   geomSize, &
    Utilities_calculateRate, &
    Utilities_forwardField, &
    utilities_updateIPcoords, &
@@ -478,6 +482,8 @@ subroutine BasicPETSc_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,r
    IO_write_JobRealFile
  use FEsolving, only: &
    restartWrite
+ use numerics, only: &
+   worldrank
 
  implicit none
  real(pReal), intent(in) :: &
@@ -492,37 +498,40 @@ subroutine BasicPETSc_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,r
    guess
  PetscScalar, pointer :: F(:,:,:,:)
  PetscErrorCode :: ierr
+ character(len=1024) :: rankStr
 
  call DMDAVecGetArrayF90(da,solution_vec,F,ierr)
 !--------------------------------------------------------------------------------------------------
 ! restart information for spectral solver
  if (restartWrite) then
-   write(6,'(/,a)') ' writing converged results for restart'
-   flush(6)
-   call IO_write_jobRealFile(777,'F',size(F))                                                     ! writing deformation gradient field to file
+   if (worldrank == 0_pInt) then
+     write(6,'(/,a)') ' writing converged results for restart'
+     flush(6)
+   endif
+   write(rankStr,'(a1,i0)')'_',worldrank
+   call IO_write_jobRealFile(777,'F'//trim(rankStr),size(F))                                                       ! writing deformation gradient field to file
    write (777,rec=1) F
    close (777)
-   call IO_write_jobRealFile(777,'F_lastInc',size(F_lastInc))                                     ! writing F_lastInc field to file
+   call IO_write_jobRealFile(777,'F_lastInc'//trim(rankStr),size(F_lastInc))                                       ! writing F_lastInc field to file
    write (777,rec=1) F_lastInc
    close (777)
-   call IO_write_jobRealFile(777,'F_lastInc2',size(F_lastInc2))                                   ! writing F_lastInc field to file
-   write (777,rec=1) F_lastInc2
-   close (777)
-   call IO_write_jobRealFile(777,'F_aimDot',size(F_aimDot))
-   write (777,rec=1) F_aimDot
-   close(777)
-   call IO_write_jobRealFile(777,'C_volAvg',size(C_volAvg))
-   write (777,rec=1) C_volAvg
-   close(777)
-   call IO_write_jobRealFile(777,'C_volAvgLastInc',size(C_volAvgLastInc))
-   write (777,rec=1) C_volAvgLastInc
-   close(777)
+   if (worldrank == 0_pInt) then
+     call IO_write_jobRealFile(777,'F_aimDot',size(F_aimDot))
+     write (777,rec=1) F_aimDot
+     close(777)
+     call IO_write_jobRealFile(777,'C_volAvg',size(C_volAvg))
+     write (777,rec=1) C_volAvg
+     close(777)
+     call IO_write_jobRealFile(777,'C_volAvgLastInc',size(C_volAvgLastInc))
+     write (777,rec=1) C_volAvgLastInc
+     close(777)
+   endif
  endif 
 
  call utilities_updateIPcoords(F)
  if (cutBack) then 
    F_aim = F_aim_lastInc
-   F    = reshape(F_lastInc,    [9,grid(1),grid(2),grid(3)]) 
+   F    = reshape(F_lastInc,    [9,gridLocal(1),gridLocal(2),gridLocal(3)]) 
    C_volAvg = C_volAvgLastInc
  else
    ForwardData = .True.
@@ -543,16 +552,15 @@ subroutine BasicPETSc_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC,r
 ! update coordinates and rate and forward last inc
    call utilities_updateIPcoords(F)
    Fdot =  Utilities_calculateRate(math_rotate_backward33(f_aimDot,rotation_BC), &
-                  timeinc_old,guess,F_lastInc,reshape(F,[3,3,grid(1),grid(2),grid(3)]))
-   F_lastInc2 = F_lastInc
-   F_lastInc     = reshape(F,       [3,3,grid(1),grid(2),grid(3)])
+                  timeinc_old,guess,F_lastInc,reshape(F,[3,3,gridLocal(1),gridLocal(2),gridLocal(3)]))
+   F_lastInc     = reshape(F,       [3,3,gridLocal(1),gridLocal(2),gridLocal(3)])
  endif
  F_aim = F_aim + f_aimDot * timeinc
 
 !--------------------------------------------------------------------------------------------------
 ! update local deformation gradient
- F     = reshape(Utilities_forwardField(timeinc,F_lastInc,Fdot, &                                      ! ensure that it matches rotated F_aim
-                               math_rotate_backward33(F_aim,rotation_BC)),[9,grid(1),grid(2),grid(3)])
+ F     = reshape(Utilities_forwardField(timeinc,F_lastInc,Fdot, &                                   ! ensure that it matches rotated F_aim
+           math_rotate_backward33(F_aim,rotation_BC)),[9,gridLocal(1),gridLocal(2),gridLocal(3)])
  call DMDAVecRestoreArrayF90(da,solution_vec,F,ierr); CHKERRQ(ierr)
 
 end subroutine BasicPETSc_forward
diff --git a/code/DAMASK_spectral_solverPolarisation.f90 b/code/DAMASK_spectral_solverPolarisation.f90
index 508332bf9..4a71ab7ca 100644
--- a/code/DAMASK_spectral_solverPolarisation.f90
+++ b/code/DAMASK_spectral_solverPolarisation.f90
@@ -45,11 +45,9 @@ module DAMASK_spectral_solverPolarisation
 ! common pointwise data
  real(pReal), private, dimension(:,:,:,:,:), allocatable :: &
    F_lastInc, &                                                                                     !< field of previous compatible deformation gradients
-   F_lastInc2, &                                                                                    !< field of 2nd previous compatible deformation gradients
    F_tau_lastInc, &                                                                                 !< field of previous incompatible deformation gradient 
    Fdot, &                                                                                          !< field of assumed rate of compatible deformation gradient
    F_tauDot                                                                                         !< field of assumed rate of incopatible deformation gradient
- complex(pReal),private, dimension(:,:,:,:,:), allocatable :: inertiaField_fourier
 
 !--------------------------------------------------------------------------------------------------
 ! stress, stiffness and compliance average etc.
@@ -70,7 +68,7 @@ module DAMASK_spectral_solverPolarisation
    S_scale = 0.0_pReal
  
  real(pReal), private :: &
-   err_BC, &                                                                                    !< deviation from stress BC
+   err_BC, &                                                                                        !< deviation from stress BC
    err_curl, &                                                                                      !< RMS of curl of F
    err_div                                                                                          !< RMS of div of P
  logical, private :: ForwardData
@@ -118,19 +116,21 @@ subroutine Polarisation_init(temperature)
   debug_spectralRestart
  use FEsolving, only: &
    restartInc
+ use numerics, only: &
+   worldrank, &
+   worldsize
  use DAMASK_interface, only: &
    getSolverJobName
  use DAMASK_spectral_Utilities, only: &
    Utilities_init, &
    Utilities_constitutiveResponse, &
    Utilities_updateGamma, &
-   grid, &
+   Utilities_updateIPcoords, &
    grid1Red, &
-   geomSize, &
    wgt
  use mesh, only: &
-   mesh_ipCoordinates, &
-   mesh_deformedCoordsFFT
+   gridLocal, &
+   gridGlobal
  use math, only: &
    math_invSym3333
    
@@ -144,30 +144,41 @@ subroutine Polarisation_init(temperature)
  PetscErrorCode :: ierr
  PetscObject :: dummy
  PetscScalar, pointer, dimension(:,:,:,:) :: xx_psc, F, F_tau
+ integer(pInt), dimension(:), allocatable :: localK  
+ integer(pInt) :: proc
+ character(len=1024) :: rankStr
  
  call Utilities_init()
- write(6,'(/,a)') ' <<<+-  DAMASK_spectral_solverPolarisation init  -+>>>'
- write(6,'(a)') ' $Id$'
- write(6,'(a15,a)')   ' Current time: ',IO_timeStamp()
+ if (worldrank == 0_pInt) then
+   write(6,'(/,a)') ' <<<+-  DAMASK_spectral_solverPolarisation init  -+>>>'
+   write(6,'(a)') ' $Id$'
+   write(6,'(a15,a)')   ' Current time: ',IO_timeStamp()
 #include "compilation_info.f90"
+ endif
 
- allocate (P            (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
+ allocate (P            (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
 !--------------------------------------------------------------------------------------------------
 ! allocate global fields
- allocate (F_lastInc    (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (F_lastInc2   (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (Fdot         (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (F_tau_lastInc(3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (F_tauDot     (3,3,grid(1),grid(2),grid(3)),source = 0.0_pReal)
- allocate (inertiaField_fourier (grid1Red,grid(2),grid(3),3,3),source = cmplx(0.0_pReal,0.0_pReal,pReal))
+ allocate (F_lastInc    (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
+ allocate (Fdot         (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
+ allocate (F_tau_lastInc(3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
+ allocate (F_tauDot     (3,3,gridLocal(1),gridLocal(2),gridLocal(3)),source = 0.0_pReal)
     
 !--------------------------------------------------------------------------------------------------
 ! PETSc Init
  call SNESCreate(PETSC_COMM_WORLD,snes,ierr); CHKERRQ(ierr)
- call DMDACreate3d(PETSC_COMM_WORLD,                             &
-           DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, &
-           DMDA_STENCIL_BOX,grid(1),grid(2),grid(3),PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE, &
-           18,1,PETSC_NULL_INTEGER,PETSC_NULL_INTEGER,PETSC_NULL_INTEGER,da,ierr)
+ allocate(localK(worldsize), source = 0); localK(worldrank+1) = gridLocal(3)
+ do proc = 1, worldsize
+   call MPI_Bcast(localK(proc),1,MPI_INTEGER,proc-1,PETSC_COMM_WORLD,ierr)
+ enddo  
+ call DMDACreate3d(PETSC_COMM_WORLD, &
+        DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, &                                     ! cut off stencil at boundary
+        DMDA_STENCIL_BOX, &                                                                         ! Moore (26) neighborhood around central point
+        gridGlobal(1),gridGlobal(2),gridGlobal(3), &                                                ! global grid
+        1 , 1, worldsize, &
+        18, 0, &                                                                                    ! #dof (F tensor), ghost boundary width (domain overlap)
+        gridLocal (1),gridLocal (2),localK, &                                                       ! local grid
+        da,ierr)                                                                                    ! handle, error
  CHKERRQ(ierr)
  call DMCreateGlobalVector(da,solution_vec,ierr); CHKERRQ(ierr)
  call DMDASNESSetFunctionLocal(da,INSERT_VALUES,Polarisation_formResidual,dummy,ierr)
@@ -183,52 +194,50 @@ subroutine Polarisation_init(temperature)
  F => xx_psc(0:8,:,:,:)
  F_tau => xx_psc(9:17,:,:,:)
  if (restartInc == 1_pInt) then                                                                     ! no deformation (no restart)
-   F_lastInc     = spread(spread(spread(math_I3,3,grid(1)),4,grid(2)),5,grid(3))                    ! initialize to identity
-   F_lastInc2 = F_lastInc
-   F = reshape(F_lastInc,[9,grid(1),grid(2),grid(3)])
+   F_lastInc     = spread(spread(spread(math_I3,3,gridLocal(1)),4,gridLocal(2)),5,gridLocal(3))     ! initialize to identity
+   F = reshape(F_lastInc,[9,gridLocal(1),gridLocal(2),gridLocal(3)])
    F_tau = 2.0_pReal* F
    F_tau_lastInc = 2.0_pReal*F_lastInc
  elseif (restartInc > 1_pInt) then 
-   if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0) &
+   if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0 .and. worldrank == 0_pInt) &
      write(6,'(/,a,'//IO_intOut(restartInc-1_pInt)//',a)') &
      'reading values of increment', restartInc - 1_pInt, 'from file'
    flush(6)
-   call IO_read_realFile(777,'F',trim(getSolverJobName()),size(F))
+   write(rankStr,'(a1,i0)')'_',worldrank
+   call IO_read_realFile(777,'F'//trim(rankStr),trim(getSolverJobName()),size(F))
    read (777,rec=1) F
    close (777)
-   call IO_read_realFile(777,'F_lastInc',trim(getSolverJobName()),size(F_lastInc))
+   call IO_read_realFile(777,'F_lastInc'//trim(rankStr),trim(getSolverJobName()),size(F_lastInc))
    read (777,rec=1) F_lastInc
    close (777)
-   call IO_read_realFile(777,'F_lastInc2',trim(getSolverJobName()),size(F_lastInc2))
-   read (777,rec=1) F_lastInc2
-   close (777)
-   call IO_read_realFile(777,'F_tau',trim(getSolverJobName()),size(F_tau))
+   call IO_read_realFile(777,'F_tau'//trim(rankStr),trim(getSolverJobName()),size(F_tau))
    read (777,rec=1) F_tau
    close (777)
-   call IO_read_realFile(777,'F_tau_lastInc',&
+   call IO_read_realFile(777,'F_tau_lastInc'//trim(rankStr),&
                                         trim(getSolverJobName()),size(F_tau_lastInc))
    read (777,rec=1) F_tau_lastInc
    close (777)
-   call IO_read_realFile(777,'F_aim', trim(getSolverJobName()),size(F_aim))
-   read (777,rec=1) F_aim
-   close (777)
-   call IO_read_realFile(777,'F_aim_lastInc', trim(getSolverJobName()),size(F_aim_lastInc))
-   read (777,rec=1) F_aim_lastInc
-   close (777)
-   call IO_read_realFile(777,'F_aimDot',trim(getSolverJobName()),size(f_aimDot))
-   read (777,rec=1) f_aimDot
-   close (777)
+   if (worldrank == 0_pInt) then
+     call IO_read_realFile(777,'F_aim', trim(getSolverJobName()),size(F_aim))
+     read (777,rec=1) F_aim
+     close (777)
+     call IO_read_realFile(777,'F_aim_lastInc', trim(getSolverJobName()),size(F_aim_lastInc))
+     read (777,rec=1) F_aim_lastInc
+     close (777)
+     call IO_read_realFile(777,'F_aimDot',trim(getSolverJobName()),size(f_aimDot))
+     read (777,rec=1) f_aimDot
+     close (777)
+   endif
  endif
 
- mesh_ipCoordinates = reshape(mesh_deformedCoordsFFT(geomSize,reshape(&
-                              F,[3,3,grid(1),grid(2),grid(3)])),[3,1,product(grid)])
- call Utilities_constitutiveResponse(F_lastInc,reshape(F,[3,3,grid(1),grid(2),grid(3)]),&
+ call Utilities_updateIPcoords(F)
+ call Utilities_constitutiveResponse(F_lastInc,F,&
    temperature,0.0_pReal,P,C_volAvg,C_minMaxAvg,temp33_Real,.false.,math_I3)
  nullify(F)
  nullify(F_tau)
  call DMDAVecRestoreArrayF90(da,solution_vec,xx_psc,ierr); CHKERRQ(ierr)                            ! write data back to PETSc
 
- if (restartInc > 1_pInt) then                                                                      ! using old values from files
+ if (restartInc > 1_pInt .and. worldrank == 0_pInt) then                                            ! using old values from files
    if (iand(debug_level(debug_spectral),debug_spectralRestart)/= 0) &
      write(6,'(/,a,'//IO_intOut(restartInc-1_pInt)//',a)') &
      'reading more values of increment', restartInc - 1_pInt, 'from file'
@@ -255,7 +264,8 @@ end subroutine Polarisation_init
 !> @brief solution for the Polarisation scheme with internal iterations
 !--------------------------------------------------------------------------------------------------
 type(tSolutionState) function &
-  Polarisation_solution(incInfoIn,guess,timeinc,timeinc_old,loadCaseTime,P_BC,F_BC,temperature_bc,rotation_BC,density)
+  Polarisation_solution(incInfoIn,guess,timeinc,timeinc_old,loadCaseTime,P_BC,F_BC,temperature_bc, &
+                                                                               rotation_BC,density)
  use numerics, only: &
    update_gamma
  use math, only: &
@@ -267,6 +277,8 @@ type(tSolutionState) function &
  use FEsolving, only: &
    restartWrite, &
    terminallyIll
+ use numerics, only: &
+   worldrank
  
  implicit none
 
@@ -340,9 +352,12 @@ subroutine Polarisation_formResidual(in,x_scal,f_scal,dummy,ierr)
    itmax, &
    itmin, &
    polarAlpha, &
-   polarBeta
+   polarBeta, &
+   worldrank
  use IO, only: &
    IO_intOut
+ use mesh, only: &
+   gridLocal
  use math, only: &
    math_rotate_backward33, &
    math_transpose33, &
@@ -351,11 +366,9 @@ subroutine Polarisation_formResidual(in,x_scal,f_scal,dummy,ierr)
    math_mul33x33, &
    PI
  use DAMASK_spectral_Utilities, only: &
-   grid, &
-   geomSize, &
    wgt, &
-   field_real, &
-   field_fourier, &
+   field_realMPI, &
+   field_fourierMPI, &
    Utilities_FFTforward, &
    Utilities_fourierConvolution, &
    Utilities_inverseLaplace, &
@@ -369,6 +382,8 @@ subroutine Polarisation_formResidual(in,x_scal,f_scal,dummy,ierr)
    debug_spectralRotation
  use homogenization, only: &
    materialpoint_dPdF
+ use FEsolving, only: &
+   terminallyIll
 
  implicit none
 !--------------------------------------------------------------------------------------------------
@@ -408,71 +423,57 @@ subroutine Polarisation_formResidual(in,x_scal,f_scal,dummy,ierr)
  call SNESGetIterationNumber(snes,PETScIter,ierr); CHKERRQ(ierr)
 
  F_av = sum(sum(sum(F,dim=5),dim=4),dim=3) * wgt
+ call MPI_Allreduce(MPI_IN_PLACE,F_av,9,MPI_DOUBLE,MPI_SUM,PETSC_COMM_WORLD,ierr)
  
  if(nfuncs== 0 .and. PETScIter == 0) totalIter = -1_pInt                                            ! new increment
  if (totalIter <= PETScIter) then                                                                   ! new iteration
 !--------------------------------------------------------------------------------------------------
 ! report begin of new iteration
    totalIter = totalIter + 1_pInt
-   write(6,'(1x,a,3(a,'//IO_intOut(itmax)//'))') trim(incInfo), &
+   if (worldrank == 0_pInt) then
+     write(6,'(1x,a,3(a,'//IO_intOut(itmax)//'))') trim(incInfo), &
                     ' @ Iteration ', itmin, '≤',totalIter, '≤', itmax
-   if (iand(debug_level(debug_spectral),debug_spectralRotation) /= 0) &
-   write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim (lab) =', &
+     if (iand(debug_level(debug_spectral),debug_spectralRotation) /= 0) &
+     write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim (lab) =', &
                                  math_transpose33(math_rotate_backward33(F_aim,params%rotation_BC))
-   write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim =', &
+     write(6,'(/,a,/,3(3(f12.7,1x)/))',advance='no') ' deformation gradient aim =', &
                                  math_transpose33(F_aim)
+     endif
    flush(6)
  endif
 
-!--------------------------------------------------------------------------------------------------
-! evaluate inertia
- dynamic: if (params%density > 0.0_pReal) then
-   residual_F = ((F - F_lastInc)/params%timeinc - (F_lastInc - F_lastInc2)/params%timeincOld)/&
-                                                  ((params%timeinc + params%timeincOld)/2.0_pReal)
-   residual_F = params%density*product(geomSize/grid)*residual_F
-   field_real = 0.0_pReal
-   field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3) = reshape(residual_F,[grid(1),grid(2),grid(3),3,3],&
-                                                               order=[4,5,1,2,3])                   ! field real has a different order
-   call Utilities_FFTforward()
-   call Utilities_inverseLaplace()
-   inertiaField_fourier = field_fourier
- else dynamic
-   inertiaField_fourier = cmplx(0.0_pReal,0.0_pReal,pReal)
- endif dynamic
-  
 !--------------------------------------------------------------------------------------------------
 ! 
- field_real = 0.0_pReal
- do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid(1)
-   field_real(i,j,k,1:3,1:3) = &
+ field_realMPI = 0.0_pReal
+ do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, gridLocal(1)
+   field_realMPI(1:3,1:3,i,j,k) = &
      polarBeta*math_mul3333xx33(C_scale,F(1:3,1:3,i,j,k) - math_I3) -&
      polarAlpha*math_mul33x33(F(1:3,1:3,i,j,k), &
-                              math_mul3333xx33(C_scale,F_tau(1:3,1:3,i,j,k) - F(1:3,1:3,i,j,k) - math_I3))
+                        math_mul3333xx33(C_scale,F_tau(1:3,1:3,i,j,k) - F(1:3,1:3,i,j,k) - math_I3))
  enddo; enddo; enddo
  
 !--------------------------------------------------------------------------------------------------
 ! doing convolution in Fourier space 
  call Utilities_FFTforward()
- field_fourier = field_fourier + polarAlpha*inertiaField_fourier
  call Utilities_fourierConvolution(math_rotate_backward33(polarBeta*F_aim,params%rotation_BC)) 
  call Utilities_FFTbackward()
 
 !--------------------------------------------------------------------------------------------------
 ! constructing residual                         
- residual_F_tau = polarBeta*F - reshape(field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3),&
-                                      [3,3,grid(1),grid(2),grid(3)],order=[3,4,5,1,2])
+ residual_F_tau = polarBeta*F - field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3))
 
 !--------------------------------------------------------------------------------------------------
 ! evaluate constitutive response
  P_avLastEval = P_av
  call Utilities_constitutiveResponse(F_lastInc,F - residual_F_tau/polarBeta,params%temperature,params%timeinc, &
                                      residual_F,C_volAvg,C_minMaxAvg,P_av,ForwardData,params%rotation_BC)
+ call MPI_Allreduce(MPI_IN_PLACE,terminallyIll,1,MPI_LOGICAL,MPI_LOR,PETSC_COMM_WORLD,ierr) 
  ForwardData = .False.
 
 !--------------------------------------------------------------------------------------------------
 ! calculate divergence
- field_real = 0.0_pReal
- field_real = reshape(residual_F,[grid(1),grid(2),grid(3),3,3],order=[4,5,1,2,3]) 
+ field_realMPI = 0.0_pReal
+ field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = residual_F
  call Utilities_FFTforward()
  err_div = Utilities_divergenceRMS()
  call Utilities_FFTbackward()
@@ -480,19 +481,19 @@ subroutine Polarisation_formResidual(in,x_scal,f_scal,dummy,ierr)
 !--------------------------------------------------------------------------------------------------
 ! constructing residual
  e = 0_pInt
- do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid(1)
+ do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, gridLocal(1)
    e = e + 1_pInt
    residual_F(1:3,1:3,i,j,k) = &
      math_mul3333xx33(math_invSym3333(materialpoint_dPdF(1:3,1:3,1:3,1:3,1,e) + C_scale), &
                       residual_F(1:3,1:3,i,j,k) - math_mul33x33(F(1:3,1:3,i,j,k), &
                       math_mul3333xx33(C_scale,F_tau(1:3,1:3,i,j,k) - F(1:3,1:3,i,j,k) - math_I3))) &
-   + residual_F_tau(1:3,1:3,i,j,k)
+                      + residual_F_tau(1:3,1:3,i,j,k)
  enddo; enddo; enddo
  
 !--------------------------------------------------------------------------------------------------
 ! calculating curl
- field_real = 0.0_pReal
- field_real = reshape(F,[grid(1),grid(2),grid(3),3,3],order=[4,5,1,2,3]) 
+ field_realMPI = 0.0_pReal
+ field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = F
  call Utilities_FFTforward()
  err_curl = Utilities_curlRMS()
  call Utilities_FFTbackward()
@@ -512,7 +513,8 @@ subroutine Polarisation_converged(snes_local,PETScIter,xnorm,snorm,fnorm,reason,
    err_curl_tolRel, &
    err_curl_tolAbs, &
    err_stress_tolabs, &
-   err_stress_tolrel
+   err_stress_tolrel, &
+   worldrank
  use math, only: &
    math_mul3333xx33
  use FEsolving, only: &
@@ -559,15 +561,17 @@ subroutine Polarisation_converged(snes_local,PETScIter,xnorm,snorm,fnorm,reason,
 
 !--------------------------------------------------------------------------------------------------
 ! report
- write(6,'(1/,a)') ' ... reporting .............................................................'
- write(6,'(/,a,f12.2,a,es8.2,a,es9.2,a)') ' error curl =       ', &
+if (worldrank == 0_pInt) then
+  write(6,'(1/,a)') ' ... reporting .............................................................'
+  write(6,'(/,a,f12.2,a,es8.2,a,es9.2,a)') ' error curl =       ', &
             err_curl/curlTol,' (',err_curl,' -,   tol =',curlTol,')'
- write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error divergence = ', &
+  write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error divergence = ', &
             err_div/divTol,  ' (',err_div, ' / m, tol =',divTol,')'
- write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error BC =         ', &
+  write(6,'  (a,f12.2,a,es8.2,a,es9.2,a)') ' error BC =         ', &
             err_BC/BC_tol, ' (',err_BC,    ' Pa,  tol =',BC_tol,')' 
- write(6,'(/,a)') ' ==========================================================================='
- flush(6) 
+  write(6,'(/,a)') ' ==========================================================================='
+  flush(6) 
+endif
 
 end subroutine Polarisation_converged
 
@@ -581,19 +585,19 @@ subroutine Polarisation_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC
    math_transpose33, &
    math_rotate_backward33
  use DAMASK_spectral_Utilities, only: &
-   grid, &
-   geomSize, &
    Utilities_calculateRate, &
    Utilities_forwardField, &
+   Utilities_updateIPcoords, &
    tBoundaryCondition, &
    cutBack
  use mesh, only: &
-   mesh_ipCoordinates,&
-   mesh_deformedCoordsFFT
+   gridLocal
  use IO, only: &
    IO_write_JobRealFile
  use FEsolving, only: &
    restartWrite
+ use numerics, only: &
+   worldrank
 
  implicit none
  real(pReal), intent(in) :: &
@@ -610,6 +614,7 @@ subroutine Polarisation_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC
  PetscScalar, dimension(:,:,:,:), pointer :: xx_psc, F, F_tau
  integer(pInt) :: i, j, k
  real(pReal), dimension(3,3) :: F_lambda33
+ character(len=1024) :: rankStr
 
 !--------------------------------------------------------------------------------------------------
 ! update coordinates and rate and forward last inc
@@ -617,46 +622,46 @@ subroutine Polarisation_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC
  F => xx_psc(0:8,:,:,:)
  F_tau => xx_psc(9:17,:,:,:)
  if (restartWrite) then
-   write(6,'(/,a)') ' writing converged results for restart'
+   if (worldrank == 0_pInt) write(6,'(/,a)') ' writing converged results for restart'
    flush(6)
-   call IO_write_jobRealFile(777,'F',size(F))                                                     ! writing deformation gradient field to file
+   write(rankStr,'(a1,i0)')'_',worldrank
+   call IO_write_jobRealFile(777,'F'//trim(rankStr),size(F))                                                     ! writing deformation gradient field to file
    write (777,rec=1) F
    close (777)
-   call IO_write_jobRealFile(777,'F_lastInc',size(F_lastInc))                                     ! writing F_lastInc field to file
+   call IO_write_jobRealFile(777,'F_lastInc'//trim(rankStr),size(F_lastInc))                                     ! writing F_lastInc field to file
    write (777,rec=1) F_lastInc
    close (777)
-   call IO_write_jobRealFile(777,'F_lastInc2',size(F_lastInc2))                                   ! writing F_lastInc field to file
-   write (777,rec=1) F_lastInc2
-   close (777)
-   call IO_write_jobRealFile(777,'F_tau',size(F_tau))                                             ! writing deformation gradient field to file
+   call IO_write_jobRealFile(777,'F_tau'//trim(rankStr),size(F_tau))                                             ! writing deformation gradient field to file
    write (777,rec=1) F_tau
    close (777)
-   call IO_write_jobRealFile(777,'F_tau_lastInc',size(F_tau_lastInc))                             ! writing F_lastInc field to file
+   call IO_write_jobRealFile(777,'F_tau_lastInc'//trim(rankStr),size(F_tau_lastInc))                             ! writing F_lastInc field to file
    write (777,rec=1) F_tau_lastInc
    close (777)
-   call IO_write_jobRealFile(777,'F_aim',size(F_aim))
-   write (777,rec=1) F_aim
-   close(777)
-   call IO_write_jobRealFile(777,'F_aim_lastInc',size(F_aim_lastInc))
-   write (777,rec=1) F_aim_lastInc
-   close (777)
-   call IO_write_jobRealFile(777,'F_aimDot',size(F_aimDot))
-   write (777,rec=1) F_aimDot
-   close(777)
-   call IO_write_jobRealFile(777,'C_volAvg',size(C_volAvg))
-   write (777,rec=1) C_volAvg
-   close(777)
-   call IO_write_jobRealFile(777,'C_volAvgLastInc',size(C_volAvgLastInc))
-   write (777,rec=1) C_volAvgLastInc
-   close(777)
+   if (worldrank == 0_pInt) then
+     call IO_write_jobRealFile(777,'F_aim',size(F_aim))
+     write (777,rec=1) F_aim
+     close(777)
+     call IO_write_jobRealFile(777,'F_aim_lastInc',size(F_aim_lastInc))
+     write (777,rec=1) F_aim_lastInc
+     close (777)
+     call IO_write_jobRealFile(777,'F_aimDot',size(F_aimDot))
+     write (777,rec=1) F_aimDot
+     close(777)
+     call IO_write_jobRealFile(777,'C_volAvg',size(C_volAvg))
+     write (777,rec=1) C_volAvg
+     close(777)
+     call IO_write_jobRealFile(777,'C_volAvgLastInc',size(C_volAvgLastInc))
+     write (777,rec=1) C_volAvgLastInc
+     close(777)
+   endif
  endif 
- mesh_ipCoordinates = reshape(mesh_deformedCoordsFFT(geomSize,reshape(&
-                                             F,[3,3,grid(1),grid(2),grid(3)])),[3,1,product(grid)])
+ call utilities_updateIPcoords(F)
+
 
  if (cutBack) then 
    F_aim = F_aim_lastInc
-   F_tau= reshape(F_tau_lastInc,[9,grid(1),grid(2),grid(3)]) 
-   F    = reshape(F_lastInc,    [9,grid(1),grid(2),grid(3)]) 
+   F_tau= reshape(F_tau_lastInc,[9,gridLocal(1),gridLocal(2),gridLocal(3)]) 
+   F    = reshape(F_lastInc,    [9,gridLocal(1),gridLocal(2),gridLocal(3)]) 
    C_volAvg = C_volAvgLastInc
  else
    ForwardData = .True.
@@ -675,25 +680,27 @@ subroutine Polarisation_forward(guess,timeinc,timeinc_old,loadCaseTime,F_BC,P_BC
 
 !--------------------------------------------------------------------------------------------------
 ! update coordinates and rate and forward last inc
-   mesh_ipCoordinates = reshape(mesh_deformedCoordsFFT(geomSize,reshape(&
-                                            F,[3,3,grid(1),grid(2),grid(3)])),[3,1,product(grid)])
+   call utilities_updateIPcoords(F)
    Fdot =  Utilities_calculateRate(math_rotate_backward33(f_aimDot,rotation_BC), &
-                  timeinc_old,guess,F_lastInc,reshape(F,[3,3,grid(1),grid(2),grid(3)]))
+                                                                  timeinc_old,guess,F_lastInc, &
+                            reshape(F,[3,3,gridLocal(1),gridLocal(2),gridLocal(3)]))
    F_tauDot =  Utilities_calculateRate(math_rotate_backward33(2.0_pReal*f_aimDot,rotation_BC), &
-                  timeinc_old,guess,F_tau_lastInc,reshape(F_tau,[3,3,grid(1),grid(2),grid(3)]))  
-   F_lastInc2 = F_lastInc
-   F_lastInc     = reshape(F,       [3,3,grid(1),grid(2),grid(3)])
-   F_tau_lastInc = reshape(F_tau,[3,3,grid(1),grid(2),grid(3)])
+                                                              timeinc_old,guess,F_tau_lastInc, &
+                        reshape(F_tau,[3,3,gridLocal(1),gridLocal(2),gridLocal(3)]))  
+   F_lastInc     = reshape(F,       [3,3,gridLocal(1),gridLocal(2),gridLocal(3)])
+   F_tau_lastInc = reshape(F_tau,[3,3,gridLocal(1),gridLocal(2),gridLocal(3)])
  endif
  F_aim = F_aim + f_aimDot * timeinc
 
 !--------------------------------------------------------------------------------------------------
 ! update local deformation gradient
- F     = reshape(Utilities_forwardField(timeinc,F_lastInc,Fdot, &                                      ! ensure that it matches rotated F_aim
-                               math_rotate_backward33(F_aim,rotation_BC)),[9,grid(1),grid(2),grid(3)])
- F_tau = reshape(Utilities_forwardField(timeinc,F_tau_lastInc,F_taudot),  [9,grid(1),grid(2),grid(3)]) ! does not have any average value as boundary condition
- if (.not. guess) then                                                                                 ! large strain forwarding
-   do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid(1)
+ F     = reshape(Utilities_forwardField(timeinc,F_lastInc,Fdot, &                                   ! ensure that it matches rotated F_aim
+                               math_rotate_backward33(F_aim,rotation_BC)), &
+   [9,gridLocal(1),gridLocal(2),gridLocal(3)])
+ F_tau = reshape(Utilities_forwardField(timeinc,F_tau_lastInc,F_taudot),  &                         ! does not have any average value as boundary condition
+                                                  [9,gridLocal(1),gridLocal(2),gridLocal(3)])          
+ if (.not. guess) then                                                                              ! large strain forwarding
+   do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, gridLocal(1)
       F_lambda33 = reshape(F_tau(1:9,i,j,k)-F(1:9,i,j,k),[3,3])
       F_lambda33 = math_mul3333xx33(S_scale,math_mul33x33(F_lambda33, &
                                   math_mul3333xx33(C_scale,&
diff --git a/code/DAMASK_spectral_utilities.f90 b/code/DAMASK_spectral_utilities.f90
index 80ed4f999..886f130c7 100644
--- a/code/DAMASK_spectral_utilities.f90
+++ b/code/DAMASK_spectral_utilities.f90
@@ -17,48 +17,48 @@ module DAMASK_spectral_utilities
 #ifdef PETSc
 #include <petsc-finclude/petscsys.h>
 #endif
- logical,       public                                         :: cutBack =.false.                  !< cut back of BVP solver in case convergence is not achieved or a material point is terminally ill
+ include 'fftw3-mpi.f03'
+ 
+ logical,       public             :: cutBack =.false.                                              !< cut back of BVP solver in case convergence is not achieved or a material point is terminally ill
  integer(pInt), public, parameter  :: maxPhaseFields = 2_pInt
 !--------------------------------------------------------------------------------------------------
 ! grid related information information
- integer(pInt), public,  dimension(3) :: grid                                                       !< grid points as specified in geometry file
  real(pReal),   public                :: wgt                                                        !< weighting factor 1/Nelems
- real(pReal),   public,  dimension(3) :: geomSize                                                   !< size of geometry as specified in geometry file
  
 !--------------------------------------------------------------------------------------------------
 ! variables storing information for spectral method and FFTW
- integer(pInt), public                                         :: grid1Red                          !< grid(1)/2
- real(pReal),   public,  dimension(:,:,:,:,:),     pointer     :: field_real                        !< real representation (some stress or deformation) of field_fourier
- complex(pReal),public,  dimension(:,:,:,:,:),     pointer     :: field_fourier                     !< field on which the Fourier transform operates
- real(pReal),   private, dimension(:,:,:,:,:,:,:), allocatable :: gamma_hat                         !< gamma operator (field) for spectral method
- real(pReal),   private, dimension(:,:,:,:),       allocatable :: xi                                !< wave vector field for divergence and for gamma operator
- real(pReal),   private, dimension(3,3,3,3)                    :: C_ref                             !< reference stiffness
- real(pReal),   private, dimension(3)                          :: scaledGeomSize                    !< scaled geometry size for calculation of divergence (Basic, Basic PETSc)
-
+ integer(pInt), public                                                    :: grid1Red               !< grid(1)/2
+ real   (C_DOUBLE),        public,  dimension(:,:,:,:,:),     pointer     :: field_realMPI          !< real representation (some stress or deformation) of field_fourier
+ complex(C_DOUBLE_COMPLEX),public,  dimension(:,:,:,:,:),     pointer     :: field_fourierMPI       !< field on which the Fourier transform operates
+ real(pReal),              private, dimension(:,:,:,:,:,:,:), allocatable :: gamma_hat              !< gamma operator (field) for spectral method
+ real(pReal),              private, dimension(:,:,:,:),       allocatable :: xi                     !< wave vector field for divergence and for gamma operator
+ real(pReal),              private, dimension(3,3,3,3)                    :: C_ref                  !< reference stiffness
+ real(pReal),              private, dimension(3)                          :: scaledGeomSize         !< scaled geometry size for calculation of divergence (Basic, Basic PETSc)
+ 
 !--------------------------------------------------------------------------------------------------
 ! debug fftw 
- complex(pReal),private, dimension(:,:,:),         pointer :: scalarField_real, &                   !< scalar field real representation for debug of FFTW
-                                                              scalarField_fourier                   !< scalar field complex representation for debug of FFTW
-
+ real(C_DOUBLE),           private, dimension(:,:,:), pointer     :: scalarField_realMPI            !< scalar field real representation for debugging fftw
+ complex(C_DOUBLE_COMPLEX),private, dimension(:,:,:), pointer     :: scalarField_fourierMPI         !< scalar field fourier representation for debugging fftw
+  
 !--------------------------------------------------------------------------------------------------
 ! geometry reconstruction
- !real(pReal),    private, dimension(:,:,:,:),   pointer :: coords_real
- !complex(pReal), private, dimension(:,:,:,:),   pointer :: coords_fourier
+ real(C_DOUBLE),           private, dimension(:,:,:,:), pointer     :: coords_realMPI               !< vector field real representation for geometry reconstruction
+ complex(C_DOUBLE_COMPLEX),private, dimension(:,:,:,:), pointer     :: coords_fourierMPI            !< vector field fourier representation for geometry reconstruction
  
 !--------------------------------------------------------------------------------------------------
 ! debug divergence
- real(pReal),   private, dimension(:,:,:,:), pointer     :: divReal                                 !< scalar field real representation for debugging divergence calculation
- complex(pReal),private, dimension(:,:,:,:), pointer     :: divFourier                              !< scalar field real representation for debugging divergence calculation
+ real(C_DOUBLE),           private, dimension(:,:,:,:), pointer     :: divRealMPI                   !< vector field real representation for debugging divergence calculation
+ complex(C_DOUBLE_COMPLEX),private, dimension(:,:,:,:), pointer     :: divFourierMPI                !< vector field fourier representation for debugging divergence calculation
 
 !--------------------------------------------------------------------------------------------------
 ! plans for FFTW
  type(C_PTR),   private :: &
-   planForth, &                                                                                     !< FFTW plan P(x) to P(k)
-   planBack, &                                                                                      !< FFTW plan F(k) to F(x)
-   planDebugForth, &                                                                                !< FFTW plan for scalar field (proof that order of usual transform is correct)
-   planDebugBack, &                                                                                 !< FFTW plan for scalar field inverse (proof that order of usual transform is correct)
-   planDiv!, &                                                                                       !< FFTW plan in case of debugging divergence calculation
-   !planCoords                                                                                       !< FFTW plan for geometry reconstruction
+   planForthMPI, &                                                                                  !< FFTW MPI plan P(x) to P(k)
+   planBackMPI, &                                                                                   !< FFTW MPI plan F(k) to F(x)
+   planDebugForthMPI, &                                                                             !< FFTW MPI plan for scalar field
+   planDebugBackMPI, &                                                                              !< FFTW MPI plan for scalar field inverse
+   planDivMPI, &                                                                                    !< FFTW MPI plan for FFTW in case of debugging divergence calculation
+   planCoordsMPI                                                                                    !< FFTW MPI plan for geometry reconstruction
 
 !--------------------------------------------------------------------------------------------------
 ! variables controlling debugging
@@ -85,10 +85,10 @@ module DAMASK_spectral_utilities
    character(len=64)           :: myType      = 'None'
  end type tBoundaryCondition
 
- type, public :: phaseFieldDataBin                                                                     !< set of parameters defining a phase field
-   real(pReal)       :: diffusion      = 0.0_pReal, &                            !< thermal conductivity
-                        mobility       = 0.0_pReal, &                            !< thermal mobility
-                        phaseField0    = 0.0_pReal                               !< homogeneous damage field starting condition 
+ type, public :: phaseFieldDataBin                                                                  !< set of parameters defining a phase field
+   real(pReal)       :: diffusion      = 0.0_pReal, &                                               !< thermal conductivity
+                        mobility       = 0.0_pReal, &                                               !< thermal mobility
+                        phaseField0    = 0.0_pReal                                                  !< homogeneous damage field starting condition 
    logical           :: active         = .false.
    character(len=64) :: label      = ''
  end type phaseFieldDataBin
@@ -136,7 +136,8 @@ subroutine utilities_init()
    fftw_timelimit, &
    memory_efficient, &
    petsc_options, &
-   divergence_correction
+   divergence_correction, &
+   worldrank
  use debug, only: &
    debug_level, &
    debug_SPECTRAL, &
@@ -149,10 +150,14 @@ subroutine utilities_init()
  use debug, only: &
    PETSCDEBUG
 #endif
- use math                                                                                           ! must use the whole module for use of FFTW
+ use math                                                                                    
  use mesh, only: &
-  mesh_spectral_getSize, &
-  mesh_spectral_getGrid
+   gridGlobal, &
+   gridLocal, &
+   gridOffset, &
+   geomSizeGlobal, &
+   geomSizeLocal, &
+   geomSizeOffset
 
  implicit none
 #ifdef PETSc
@@ -166,15 +171,19 @@ subroutine utilities_init()
  integer(pInt), parameter    :: fileUnit = 228_pInt
  integer(pInt), dimension(3) :: k_s
  type(C_PTR) :: &
-   tensorField, &                                                                                   !< field cotaining data for FFTW in real and fourier space (in place)
-   scalarField_realC, &                                                                             !< field cotaining data for FFTW in real space when debugging FFTW (no in place)
-   scalarField_fourierC, &                                                                          !< field cotaining data for FFTW in fourier space when debugging FFTW (no in place)
-   div!, &                                                                                           !< field cotaining data for FFTW in real and fourier space when debugging divergence (in place)
-   !coords_fftw
- write(6,'(/,a)')   ' <<<+-  DAMASK_spectral_utilities init  -+>>>'
- write(6,'(a)')     ' $Id$'
- write(6,'(a15,a)') ' Current time: ',IO_timeStamp()
+   tensorFieldMPI, &                                                                                !< field cotaining data for FFTW in real and fourier space (in place)
+   scalarFieldMPI, &                                                                                !< field cotaining data for FFTW in real space when debugging FFTW (no in place)
+   div, &                                                                                           !< field cotaining data for FFTW in real and fourier space when debugging divergence (in place)
+   coordsMPI
+ integer(C_INTPTR_T) :: gridFFTW(3), alloc_local, local_K, local_K_offset, &
+                        dimSize = 3, scalarSize = 1, vecSize = 3, tensorSize = 9
+ 
+ mainProcess: if (worldrank == 0) then
+   write(6,'(/,a)')   ' <<<+-  DAMASK_spectral_utilities init  -+>>>'
+   write(6,'(a)')     ' $Id$'
+   write(6,'(a15,a)') ' Current time: ',IO_timeStamp()
 #include "compilation_info.f90"
+ endif mainProcess
 
 !--------------------------------------------------------------------------------------------------
 ! set debugging parameters
@@ -184,7 +193,7 @@ subroutine utilities_init()
  debugRotation   = iand(debug_level(debug_SPECTRAL),debug_SPECTRALROTATION)   /= 0
  debugPETSc      = iand(debug_level(debug_SPECTRAL),debug_SPECTRALPETSC)      /= 0
 #ifdef PETSc
- if(debugPETSc) write(6,'(3(/,a),/)') &
+ if(debugPETSc .and. worldrank == 0_pInt) write(6,'(3(/,a),/)') &
                 ' Initializing PETSc with debug options: ', &
                 trim(PETScDebug), &
                 ' add more using the PETSc_Options keyword in numerics.config '
@@ -196,121 +205,116 @@ subroutine utilities_init()
  if(debugPETSc) call IO_warning(41_pInt, ext_msg='debug PETSc')
 #endif
 
- call IO_open_file(fileUnit,geometryFile)                                                           ! parse info from geometry file...
- grid = mesh_spectral_getGrid(fileUnit)
- grid1Red = grid(1)/2_pInt + 1_pInt
- wgt = 1.0/real(product(grid),pReal)
- geomSize = mesh_spectral_getSize(fileUnit)
- close(fileUnit)
-
- write(6,'(a,3(i12  ))')  ' grid     a b c: ', grid
- write(6,'(a,3(es12.5))') ' size     x y z: ', geomSize
+ grid1Red = gridLocal(1)/2_pInt + 1_pInt
+ wgt = 1.0/real(product(gridGlobal),pReal)
 
+ if (worldrank == 0) then
+   write(6,'(a,3(i12  ))')  ' grid     a b c: ', gridGlobal
+   write(6,'(a,3(es12.5))') ' size     x y z: ', geomSizeGlobal
+ endif
+ 
 !--------------------------------------------------------------------------------------------------
 ! scale dimension to calculate either uncorrected, dimension-independent, or dimension- and reso-
 ! lution-independent divergence
  if (divergence_correction == 1_pInt) then
    do j = 1_pInt, 3_pInt
-    if (j /= minloc(geomSize,1) .and. j /= maxloc(geomSize,1)) &
-      scaledGeomSize = geomSize/geomSize(j)
+    if (j /= minloc(geomSizeGlobal,1) .and. j /= maxloc(geomSizeGlobal,1)) &
+      scaledGeomSize = geomSizeGlobal/geomSizeGlobal(j)
    enddo
  elseif (divergence_correction == 2_pInt) then
    do j = 1_pInt, 3_pInt
-    if (j /= minloc(geomSize/grid,1) .and. j /= maxloc(geomSize/grid,1)) &
-      scaledGeomSize = geomSize/geomSize(j)*grid(j)
+    if (j /= minloc(geomSizeGlobal/gridGlobal,1) .and. j /= maxloc(geomSizeGlobal/gridGlobal,1)) &
+      scaledGeomSize = geomSizeGlobal/geomSizeGlobal(j)*gridGlobal(j)
    enddo
  else
-   scaledGeomSize = geomSize
+   scaledGeomSize = geomSizeGlobal
  endif
 
 
+!--------------------------------------------------------------------------------------------------
+! MPI allocation
+ gridFFTW = gridGlobal
+ alloc_local = fftw_mpi_local_size_3d(gridFFTW(3), gridFFTW(2), gridFFTW(1)/2 +1, &
+                                      MPI_COMM_WORLD, local_K, local_K_offset)
+ tensorFieldMPI = fftw_alloc_complex(9*alloc_local)
+ call c_f_pointer(tensorFieldMPI, field_realMPI,   [dimSize,dimSize,2*(gridFFTW(1)/2 + 1),gridFFTW(2),local_K]) 
+ call c_f_pointer(tensorFieldMPI, field_fourierMPI,[dimSize,dimSize,   gridFFTW(1)/2 + 1 ,gridFFTW(2),local_K]) 
+ allocate (xi(3,grid1Red,gridLocal(2),gridLocal(3)),source = 0.0_pReal)                             ! frequencies, only half the size for first dimension
+ 
+ coordsMPI = fftw_alloc_complex(3*alloc_local)
+ call c_f_pointer(coordsMPI, coords_realMPI,    [dimSize,2*(gridFFTW(1)/2 + 1),gridFFTW(2),local_K])  ! place a pointer for a real representation on coords_fftw
+ call c_f_pointer(coordsMPI, coords_fourierMPI, [dimSize,   gridFFTW(1)/2 + 1 ,gridFFTW(2),local_K])  ! place a pointer for a real representation on coords_fftw
 
 !--------------------------------------------------------------------------------------------------
-! allocation
- allocate (xi(3,grid1Red,grid(2),grid(3)),source = 0.0_pReal)                                       ! frequencies, only half the size for first dimension
- tensorField = fftw_alloc_complex(int(grid1Red*grid(2)*grid(3)*9_pInt,C_SIZE_T))                    ! allocate aligned data using a C function, C_SIZE_T is of type integer(8)
- call c_f_pointer(tensorField, field_real, [grid(1)+2_pInt-mod(grid(1),2_pInt),grid(2),grid(3),3,3])! place a pointer for a real representation on tensorField
- call c_f_pointer(tensorField, field_fourier,[grid1Red,                        grid(2),grid(3),3,3])! place a pointer for a complex representation on tensorField
- !coords_fftw = fftw_alloc_complex(int(grid1Red*grid(2)*grid(3)*3_pInt,C_SIZE_T))                    ! allocate aligned data using a C function, C_SIZE_T is of type integer(8)
- !call c_f_pointer(tensorField, coords_real,[grid(1)+2_pInt-mod(grid(1),2_pInt),grid(2),grid(3),3])  ! place a pointer for a real representation on coords_fftw
- !call c_f_pointer(tensorField, coords_fourier,[grid1Red,                       grid(2),grid(3),3])  ! place a pointer for a real representation on coords_fftw
+! MPI fftw plans
+ planForthMPI = fftw_mpi_plan_many_dft_r2c(3, [gridFFTW(3),gridFFTW(2),gridFFTW(1)], tensorSize, &     ! dimension, logical length in each dimension in reversed order, no. of transforms
+                                         FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, &          ! default iblock and oblock
+                                                        field_realMPI, field_fourierMPI, &          ! input data, output data
+                                                       MPI_COMM_WORLD, fftw_planner_flag)           ! use all processors, planer precision
+ planBackMPI  = fftw_mpi_plan_many_dft_c2r(3, [gridFFTW(3),gridFFTW(2),gridFFTW(1)], tensorSize, &     ! dimension, logical length in each dimension in reversed order, no. of transforms
+                                         FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, &          ! default iblock and oblock
+                                                         field_fourierMPI,field_realMPI, &          ! input data, output data
+                                                       MPI_COMM_WORLD, fftw_planner_flag)           ! all processors, planer precision
+   
+!--------------------------------------------------------------------------------------------------
+! Coordinates MPI fftw plans
+ planCoordsMPI  =  fftw_mpi_plan_many_dft_c2r(3, [gridFFTW(3),gridFFTW(2),gridFFTW(1)], vecSize, &     ! dimension, logical length in each dimension in reversed order, no. of transforms
+                                         FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, &          ! default iblock and oblock
+                                                       coords_fourierMPI,coords_realMPI, &          ! input data, output data
+                                                       MPI_COMM_WORLD, fftw_planner_flag)           ! use all processors, planer precision
 
 !--------------------------------------------------------------------------------------------------
+! depending on debug options, allocate more memory and create additional plans 
+ if (debugDivergence) then
+   div = fftw_alloc_complex(3*alloc_local)
+   call c_f_pointer(div,divRealMPI,   [dimSize,2*(gridFFTW(1)/2 + 1),gridFFTW(2),local_K])
+   call c_f_pointer(div,divFourierMPI,[dimSize,   gridFFTW(1)/2 + 1 ,gridFFTW(2),local_K])
+   planDivMPI  = fftw_mpi_plan_many_dft_c2r(3, [gridFFTW(3),gridFFTW(2) ,gridFFTW(1)],vecSize, &
+                                            FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, &
+                                                                 divFourierMPI, divRealMPI, &
+                                                         MPI_COMM_WORLD, fftw_planner_flag)
+ endif
+
+ if (debugFFTW) then
+   scalarFieldMPI = fftw_alloc_complex(alloc_local)                                                 ! allocate data for real representation (no in place transform)
+   call c_f_pointer(scalarFieldMPI,    scalarField_realMPI, &
+                                               [2*(gridFFTW(1)/2 + 1),gridFFTW(2),local_K])           ! place a pointer for a real representation
+   call c_f_pointer(scalarFieldMPI, scalarField_fourierMPI, &
+                                               [   gridFFTW(1)/2 + 1 ,gridFFTW(2),local_K])           ! place a pointer for a fourier representation
+   planDebugForthMPI = fftw_mpi_plan_many_dft_r2c(3, [gridFFTW(3),gridFFTW(2),gridFFTW(1)], &
+                             scalarSize, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, &
+             scalarField_realMPI, scalarField_fourierMPI, &
+               MPI_COMM_WORLD, fftw_planner_flag)
+   planDebugBackMPI  = fftw_mpi_plan_many_dft_c2r(3, [gridFFTW(3),gridFFTW(2),gridFFTW(1)], &
+                             scalarSize, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, &
+      scalarField_fourierMPI,scalarField_realMPI, &
+               MPI_COMM_WORLD, fftw_planner_flag)
+ endif 
+!--------------------------------------------------------------------------------------------------
 ! general initialization of FFTW (see manual on fftw.org for more details)
  if (pReal /= C_DOUBLE .or. pInt /= C_INT) call IO_error(0_pInt,ext_msg='Fortran to C')             ! check for correct precision in C
  call fftw_set_timelimit(fftw_timelimit)                                                            ! set timelimit for plan creation
 
-!--------------------------------------------------------------------------------------------------
-! creating plans for the convolution
- planForth =  fftw_plan_many_dft_r2c(3,[grid(3),grid(2) ,grid(1)], 9, &                             ! dimensions,  logical length in each dimension in reversed order,  no. of transforms
-                            field_real,[grid(3),grid(2) ,grid(1)+2_pInt-mod(grid(1),2_pInt)], &     ! input data,  physical length in each dimension in reversed order
-                                     1, grid(3)*grid(2)*(grid(1)+2_pInt-mod(grid(1),2_pInt)), &     ! striding,    product of physical length in the 3 dimensions
-                         field_fourier,[grid(3),grid(2) ,grid1Red], &                               ! output data, physical length in each dimension in reversed order
-                                     1, grid(3)*grid(2)* grid1Red,       fftw_planner_flag)         ! striding,    product of physical length in the 3 dimensions,      planner precision
-
- planBack  =  fftw_plan_many_dft_c2r(3,[grid(3),grid(2) ,grid(1)], 9, &                             ! dimensions,  logical length in each dimension in reversed order,  no. of transforms
-                         field_fourier,[grid(3),grid(2) ,grid1Red], &                               ! input data,  physical length in each dimension in reversed order
-                                     1, grid(3)*grid(2)* grid1Red, &                                ! striding,    product of physical length in the 3 dimensions
-                            field_real,[grid(3),grid(2) ,grid(1)+2_pInt-mod(grid(1),2_pInt)], &     ! output data, physical length in each dimension in reversed order
-                                     1, grid(3)*grid(2)*(grid(1)+2_pInt-mod(grid(1),2_pInt)), &     ! striding,    product of physical length in the 3 dimensions
-                                                                       fftw_planner_flag)           ! planner precision
-
-!--------------------------------------------------------------------------------------------------
-! allocation and FFTW initialization
-
-
-! planCoords = fftw_plan_many_dft_c2r(3_pInt,[grid(3),grid(2) ,grid(1)],3_pInt,&
-!                             coords_fourier,[grid(3),grid(2) ,grid1Red],&
-!                                    1_pInt,  grid(3)*grid(2)* grid1Red,&
-!                                coords_real,[grid(3),grid(2) ,grid(1)+2_pInt-mod(grid(1),2_pInt)],&
-!                                    1_pInt,  grid(3)*grid(2)*(grid(1)+2_pInt-mod(grid(1),2_pInt)),&
-!                                                                      fftw_planner_flag)
-
-!--------------------------------------------------------------------------------------------------
-! depending on debug options, allocate more memory and create additional plans 
- if (debugDivergence) then
-   div = fftw_alloc_complex(int(grid1Red*grid(2)*grid(3)*3_pInt,C_SIZE_T))
-   call c_f_pointer(div,divReal,   [grid(1)+2_pInt-mod(grid(1),2_pInt),grid(2),grid(3),3])
-   call c_f_pointer(div,divFourier,[grid1Red,                          grid(2),grid(3),3])
-   planDiv  = fftw_plan_many_dft_c2r(3,[grid(3),grid(2) ,grid(1)],3,&
-                            divFourier,[grid(3),grid(2) ,grid1Red],&
-                                     1, grid(3)*grid(2)* grid1Red,&
-                               divReal,[grid(3),grid(2) ,grid(1)+2_pInt-mod(grid(1),2_pInt)], &
-                                     1, grid(3)*grid(2)*(grid(1)+2_pInt-mod(grid(1),2_pInt)), &
-                                                                       fftw_planner_flag)
- endif
-
- if (debugFFTW) then
-   scalarField_realC    = fftw_alloc_complex(int(product(grid),C_SIZE_T))                           ! allocate data for real representation (no in place transform)
-   scalarField_fourierC = fftw_alloc_complex(int(product(grid),C_SIZE_T))                           ! allocate data for fourier representation (no in place transform)
-   call c_f_pointer(scalarField_realC,    scalarField_real,    grid)                                ! place a pointer for a real representation
-   call c_f_pointer(scalarField_fourierC, scalarField_fourier, grid)                                ! place a pointer for a fourier representation
-   planDebugForth = fftw_plan_dft_3d(grid(3),grid(2),grid(1),&                                      ! reversed order (C style)
-                                      scalarField_real,scalarField_fourier,-1,fftw_planner_flag)    ! input, output, forward FFT(-1), planner precision
-   planDebugBack  = fftw_plan_dft_3d(grid(3),grid(2),grid(1),&                                      ! reversed order (C style)
-                                      scalarField_fourier,scalarField_real,+1,fftw_planner_flag)    ! input, output, backward (1), planner precision
- endif 
-
- if (debugGeneral) write(6,'(/,a)') ' FFTW initialized'
- flush(6)
+ if (debugGeneral .and. worldrank == 0_pInt) write(6,'(/,a)') ' FFTW initialized'
+   flush(6)
  
 !--------------------------------------------------------------------------------------------------
 ! calculation of discrete angular frequencies, ordered as in FFTW (wrap around)
- do k = 1_pInt, grid(3)
+ do k = gridOffset+1_pInt, gridOffset+gridLocal(3)
    k_s(3) = k - 1_pInt
-   if(k > grid(3)/2_pInt + 1_pInt) k_s(3) = k_s(3) - grid(3)                                        ! running from 0,1,...,N/2,N/2+1,-N/2,-N/2+1,...,-1
-     do j = 1_pInt, grid(2)
+   if(k > gridGlobal(3)/2_pInt + 1_pInt) k_s(3) = k_s(3) - gridGlobal(3)                            ! running from 0,1,...,N/2,N/2+1,-N/2,-N/2+1,...,-1
+     do j = 1_pInt, gridLocal(2)
        k_s(2) = j - 1_pInt
-       if(j > grid(2)/2_pInt + 1_pInt) k_s(2) = k_s(2) - grid(2)                                    ! running from 0,1,...,N/2,N/2+1,-N/2,-N/2+1,...,-1
+       if(j > gridGlobal(2)/2_pInt + 1_pInt) k_s(2) = k_s(2) - gridGlobal(2)                        ! running from 0,1,...,N/2,N/2+1,-N/2,-N/2+1,...,-1
          do i = 1_pInt, grid1Red
            k_s(1) = i - 1_pInt                                                                      ! symmetry, junst running from 0,1,...,N/2,N/2+1
-           xi(1:3,i,j,k) = real(k_s, pReal)/scaledGeomSize                                          ! if divergence_correction is set, frequencies are calculated on unit length
+           xi(1:3,i,j,k-gridOffset) = real(k_s, pReal)/scaledGeomSize                               ! if divergence_correction is set, frequencies are calculated on unit length
  enddo; enddo; enddo
  
  if(memory_efficient) then                                                                          ! allocate just single fourth order tensor
    allocate (gamma_hat(3,3,3,3,1,1,1), source = 0.0_pReal)
  else                                                                                               ! precalculation of gamma_hat field
-   allocate (gamma_hat(3,3,3,3,grid1Red ,grid(2),grid(3)), source = 0.0_pReal)     
+   allocate (gamma_hat(3,3,3,3,grid1Red,gridLocal(2),gridLocal(3)), source = 0.0_pReal)     
  endif
 
 end subroutine utilities_init
@@ -328,43 +332,48 @@ subroutine utilities_updateGamma(C,saveReference)
  use IO, only: &
   IO_write_jobRealFile
  use numerics, only: &
-   memory_efficient
+   memory_efficient, &
+   worldrank
+ use mesh, only: &
+   gridOffset, &
+   gridLocal
  use math, only: &
    math_inv33
+ use mesh, only: &
+   gridLocal
 
  implicit none
  real(pReal), intent(in), dimension(3,3,3,3) :: C                                                   !< input stiffness to store as reference stiffness
  logical    , intent(in)                     :: saveReference                                       !< save reference stiffness to file for restart
  real(pReal),                 dimension(3,3) :: temp33_Real, xiDyad
- real(pReal)                                 :: filter                                              !< weighting of current component
  integer(pInt) :: &
    i, j, k, &
    l, m, n, o
-  
+ 
  C_ref = C
  if (saveReference) then
-   write(6,'(/,a)') ' writing reference stiffness to file'
-   flush(6)
-   call IO_write_jobRealFile(777,'C_ref',size(C_ref))
-   write (777,rec=1) C_ref
-   close(777)
+   if (worldrank == 0_pInt) then
+     write(6,'(/,a)') ' writing reference stiffness to file'
+     flush(6)
+     call IO_write_jobRealFile(777,'C_ref',size(C_ref))
+     write (777,rec=1) C_ref
+     close(777)
+   endif
  endif
  
  if(.not. memory_efficient) then                                                   
-   do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid1Red
-     if(any([i,j,k] /= 1_pInt)) then                                                                ! singular point at xi=(0.0,0.0,0.0) i.e. i=j=k=1
+   do k = gridOffset+1_pInt, gridOffset+gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, grid1Red
+     if (any([i,j,k] /= 1_pInt)) then                                                                ! singular point at xi=(0.0,0.0,0.0) i.e. i=j=k=1
        forall(l = 1_pInt:3_pInt, m = 1_pInt:3_pInt) &
-         xiDyad(l,m) = xi(l, i,j,k)*xi(m, i,j,k)
+         xiDyad(l,m) = xi(l, i,j,k-gridOffset)*xi(m, i,j,k-gridOffset)
        forall(l = 1_pInt:3_pInt, m = 1_pInt:3_pInt) &
          temp33_Real(l,m) = sum(C_ref(l,1:3,m,1:3)*xiDyad)
        temp33_Real = math_inv33(temp33_Real)
-       filter = utilities_getFilter(xi(1:3,i,j,k))                                                  ! weighting factor computed by getFilter function
        forall(l=1_pInt:3_pInt, m=1_pInt:3_pInt, n=1_pInt:3_pInt, o=1_pInt:3_pInt)&
-         gamma_hat(l,m,n,o, i,j,k) =  filter*temp33_Real(l,n)*xiDyad(m,o)
+         gamma_hat(l,m,n,o,i,j,k-gridOffset) =  temp33_Real(l,n)*xiDyad(m,o)
      endif  
    enddo; enddo; enddo
-   gamma_hat(1:3,1:3,1:3,1:3, 1,1,1) = 0.0_pReal                                                    ! singular point at xi=(0.0,0.0,0.0) i.e. i=j=k=1
- endif
+ endif  
 
 end subroutine utilities_updateGamma
 
@@ -377,58 +386,69 @@ end subroutine utilities_updateGamma
 !--------------------------------------------------------------------------------------------------
 subroutine utilities_FFTforward()
  use math
+ use numerics, only: &
+   worldrank
+ use mesh, only: &
+   gridOffset, &
+   gridLocal
 
  implicit none
  integer(pInt)  :: row, column                                                                      ! if debug FFTW, compare 3D array field of row and column
- integer(pInt), dimension(2:3,2) :: Nyquist                                                         ! highest frequencies to be removed (1 if even, 2 if odd)
- real(pReal), dimension(2) :: myRand                                                                ! random numbers
+ real(pReal), dimension(2) :: myRand, maxScalarField                                                ! random numbers
+ integer(pInt) ::  i, j, k
+ PetscErrorCode :: ierr
 
 !--------------------------------------------------------------------------------------------------
 ! copy one component of the stress field to to a single FT and check for mismatch
  if (debugFFTW) then
-   call random_number(myRand)                                                                       ! two numbers: 0 <= x < 1
-   row    = nint(myRand(1)*2_pReal + 1_pReal,pInt)
-   column = nint(myRand(2)*2_pReal + 1_pReal,pInt)
-   scalarField_real = cmplx(field_real(1:grid(1),1:grid(2),1:grid(3),row,column),0.0_pReal,pReal)   ! store the selected component 
+   if (worldrank == 0_pInt) then
+     call random_number(myRand)                                                                     ! two numbers: 0 <= x < 1
+     row    = nint(myRand(1)*2_pReal + 1_pReal,pInt)
+     column = nint(myRand(2)*2_pReal + 1_pReal,pInt)
+   endif
+   call MPI_Bcast(row   ,1,MPI_INTEGER,0,PETSC_COMM_WORLD,ierr)
+   call MPI_Bcast(column,1,MPI_INTEGER,0,PETSC_COMM_WORLD,ierr)
+   scalarField_realMPI = 0.0_pReal
+   scalarField_realMPI(1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = &
+     field_realMPI(row,column,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3))                         ! store the selected component 
  endif
 
 !--------------------------------------------------------------------------------------------------
 ! doing the FFT
- call fftw_execute_dft_r2c(planForth,field_real,field_fourier)
+ call fftw_mpi_execute_dft_r2c(planForthMPI,field_realMPI,field_fourierMPI)
   
 !--------------------------------------------------------------------------------------------------
 ! comparing 1 and 3x3 FT results
  if (debugFFTW) then
-   call fftw_execute_dft(planDebugForth,scalarField_real,scalarField_fourier)
-   where(abs(scalarField_fourier(1:grid1Red,1:grid(2),1:grid(3))) > tiny(1.0_pReal))               ! avoid division by zero
-     scalarField_fourier(1:grid1Red,1:grid(2),1:grid(3)) = &  
-                         (scalarField_fourier(1:grid1Red,1:grid(2),1:grid(3))-&
-                                field_fourier(1:grid1Red,1:grid(2),1:grid(3),row,column))/&
-                          scalarField_fourier(1:grid1Red,1:grid(2),1:grid(3))
+   call fftw_mpi_execute_dft_r2c(planDebugForthMPI,scalarField_realMPI,scalarField_fourierMPI)
+   where(abs(scalarField_fourierMPI(1:grid1Red,1:gridLocal(2),1:gridLocal(3))) > tiny(1.0_pReal))   ! avoid division by zero
+     scalarField_fourierMPI(1:grid1Red,1:gridLocal(2),1:gridLocal(3)) = &  
+                   (scalarField_fourierMPI(1:grid1Red,1:gridLocal(2),1:gridLocal(3))-&
+                          field_fourierMPI(row,column,1:grid1Red,1:gridLocal(2),1:gridLocal(3)))/&
+                    scalarField_fourierMPI(1:grid1Red,1:gridLocal(2),1:gridLocal(3))
    else where
-     scalarField_real = cmplx(0.0,0.0,pReal)
+     scalarField_realMPI = cmplx(0.0,0.0,pReal)
    end where
-   write(6,'(/,a,i1,1x,i1,a)') ' .. checking FT results of compontent ', row, column, ' ..'
-   write(6,'(/,a,2(es11.4,1x))')  ' max FT relative error = ',&                                     ! print real and imaginary part seperately
-     maxval(real (scalarField_fourier(1:grid1Red,1:grid(2),1:grid(3)))),&
-     maxval(aimag(scalarField_fourier(1:grid1Red,1:grid(2),1:grid(3))))
-   flush(6)
+   maxScalarField(1) = maxval(real (scalarField_fourierMPI(1:grid1Red,1:gridLocal(2), &
+                                                                     1:gridLocal(3))))
+   maxScalarField(2) = maxval(aimag(scalarField_fourierMPI(1:grid1Red,1:gridLocal(2), &
+                                                                     1:gridLocal(3))))
+   call MPI_reduce(MPI_IN_PLACE,maxScalarField,2,MPI_DOUBLE,MPI_MAX,0,PETSC_COMM_WORLD,ierr)
+   if (worldrank == 0_pInt) then
+     write(6,'(/,a,i1,1x,i1,a)') ' .. checking FT results of compontent ', row, column, ' ..'
+     write(6,'(/,a,2(es11.4,1x))')  ' max FT relative error = ',&                                   ! print real and imaginary part seperately
+       maxScalarField(1),maxScalarField(2)
+     flush(6)
+   endif
  endif
 
 !--------------------------------------------------------------------------------------------------
-! removing highest frequencies
- Nyquist(2,1:2) = [grid(2)/2_pInt + 1_pInt, grid(2)/2_pInt + 1_pInt + mod(grid(2),2_pInt)]
- Nyquist(3,1:2) = [grid(3)/2_pInt + 1_pInt, grid(3)/2_pInt + 1_pInt + mod(grid(3),2_pInt)]
-
- if(grid(1)/=1_pInt) &                                                                               ! do not delete the whole slice in case of 2D calculation
-   field_fourier (grid1Red,  1:grid(2),                 1:grid(3),                 1:3,1:3) &
-                                                     = cmplx(0.0_pReal,0.0_pReal,pReal)
- if(grid(2)/=1_pInt) &                                                                               ! do not delete the whole slice in case of 2D calculation
-   field_fourier (1:grid1Red,Nyquist(2,1):Nyquist(2,2),1:grid(3),                 1:3,1:3) & 
-                                                     = cmplx(0.0_pReal,0.0_pReal,pReal)
- if(grid(3)/=1_pInt) &                                                                               ! do not delete the whole slice in case of 2D calculation
-   field_fourier (1:grid1Red,1:grid(2),                 Nyquist(3,1):Nyquist(3,2),1:3,1:3) &
-                                                     = cmplx(0.0_pReal,0.0_pReal,pReal)
+! applying filter
+ do k = 1_pInt, gridLocal(3);  do j = 1_pInt, gridLocal(2);  do i = 1_pInt,grid1Red
+   field_fourierMPI(1:3,1:3,i,j,k) = utilities_getFilter(xi(1:3,i,j,k))* &
+                                     field_fourierMPI(1:3,1:3,i,j,k)
+ enddo; enddo; enddo
+ 
 end subroutine utilities_FFTforward
 
 
@@ -440,56 +460,58 @@ end subroutine utilities_FFTforward
 !> results is weighted by number of points stored in wgt
 !--------------------------------------------------------------------------------------------------
 subroutine utilities_FFTbackward()
- use math                                                                                           !< must use the whole module for use of FFTW
+ use math
+ use numerics, only: &
+   worldrank          
+ use mesh, only: &
+   gridLocal
 
  implicit none
  integer(pInt) :: row, column                                                                       !< if debug FFTW, compare 3D array field of row and column
- integer(pInt) ::  i, j, k, m, n
  real(pReal), dimension(2) :: myRand 
+ real(pReal) :: maxScalarField
+ PetscErrorCode :: ierr
 
 !--------------------------------------------------------------------------------------------------
 ! unpack FFT data for conj complex symmetric part. This data is not transformed when using c2r
  if (debugFFTW) then
-   call random_number(myRand)                                                                       ! two numbers: 0 <= x < 1
-   row    = nint(myRand(1)*2_pReal + 1_pReal,pInt)
-   column = nint(myRand(2)*2_pReal + 1_pReal,pInt)
-   scalarField_fourier(1:grid1Red,1:grid(2),1:grid(3)) &
-                                         = field_fourier(1:grid1Red,1:grid(2),1:grid(3),row,column)
-   do i = 0_pInt, grid(1)/2_pInt-2_pInt + mod(grid(1),2_pInt)
-    m = 1_pInt
-    do k = 1_pInt, grid(3)
-      n = 1_pInt
-      do j = 1_pInt, grid(2)
-        scalarField_fourier(grid(1)-i,j,k) = conjg(scalarField_fourier(2+i,n,m))
-        if(n == 1_pInt) n = grid(2) + 1_pInt
-        n = n-1_pInt
-     enddo
-     if(m == 1_pInt) m = grid(3) + 1_pInt
-     m = m -1_pInt
-   enddo; enddo
- endif
+  if (worldrank == 0_pInt) then
+     call random_number(myRand)                                                                     ! two numbers: 0 <= x < 1
+     row    = nint(myRand(1)*2_pReal + 1_pReal,pInt)
+     column = nint(myRand(2)*2_pReal + 1_pReal,pInt)
+   endif
+   call MPI_Bcast(row   ,1,MPI_INTEGER,0,PETSC_COMM_WORLD,ierr)
+   call MPI_Bcast(column,1,MPI_INTEGER,0,PETSC_COMM_WORLD,ierr)
+   scalarField_fourierMPI(1:grid1Red,1:gridLocal(2),1:gridLocal(3)) = &
+     field_fourierMPI(row,column,1:grid1Red,1:gridLocal(2),1:gridLocal(3))
+ endif 
  
 !--------------------------------------------------------------------------------------------------
 ! doing the iFFT
- call fftw_execute_dft_c2r(planBack,field_fourier,field_real)                                       ! back transform of fluct deformation gradient
+ call fftw_mpi_execute_dft_c2r(planBackMPI,field_fourierMPI,field_realMPI)                          ! back transform of fluct deformation gradient
 
 !--------------------------------------------------------------------------------------------------
 ! comparing 1 and 3x3 inverse FT results
  if (debugFFTW) then
-   call fftw_execute_dft(planDebugBack,scalarField_fourier,scalarField_real)
-   where(abs(real(scalarField_real,pReal)) > tiny(1.0_pReal))                                       ! avoid division by zero
-     scalarField_real = (scalarField_real &
-                         - cmplx(field_real(1:grid(1),1:grid(2),1:grid(3),row,column), 0.0, pReal))/ &
-                         scalarField_real
+   call fftw_mpi_execute_dft_c2r(planDebugBackMPI,scalarField_fourierMPI,scalarField_realMPI)
+   where(abs(real(scalarField_realMPI,pReal)) > tiny(1.0_pReal))                                    ! avoid division by zero
+     scalarField_realMPI(1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = &
+   (scalarField_realMPI(1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) &
+     -  field_realMPI      (row,column,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)))/ &
+       scalarField_realMPI(1:gridLocal(1),1:gridLocal(2),1:gridLocal(3))
    else where
-     scalarField_real = cmplx(0.0,0.0,pReal)
+     scalarField_realMPI = cmplx(0.0,0.0,pReal)
    end where
-   write(6,'(/,a,i1,1x,i1,a)') ' ... checking iFT results of compontent ', row, column, ' ..'
-   write(6,'(/,a,es11.4)')     ' max iFT relative error = ', maxval(real(scalarField_real,pReal))
-   flush(6)
+   maxScalarField = maxval(real (scalarField_realMPI(1:gridLocal(1),1:gridLocal(2),1:gridLocal(3))))
+   call MPI_reduce(MPI_IN_PLACE,maxScalarField,1,MPI_DOUBLE,MPI_MAX,0,PETSC_COMM_WORLD,ierr)
+   if (worldrank == 0_pInt) then
+     write(6,'(/,a,i1,1x,i1,a)') ' ... checking iFT results of compontent ', row, column, ' ..'
+     write(6,'(/,a,es11.4)')     ' max iFT relative error = ', maxScalarField
+     flush(6)
+   endif
  endif
  
- field_real = field_real * wgt                                                                      ! normalize the result by number of elements
+ field_realMPI = field_realMPI * wgt                                                                 ! normalize the result by number of elements
 
 end subroutine utilities_FFTbackward
 
@@ -501,28 +523,31 @@ subroutine utilities_inverseLaplace()
  use math, only: &
    math_inv33, &
    PI
+ use numerics, only: &
+   worldrank 
+ use mesh, only: &
+   gridLocal, &
+   gridOffset, &
+   geomSizeGlobal
 
  implicit none  
  integer(pInt) :: i, j, k
- integer(pInt), dimension(3) :: k_s
+ real(pReal), dimension(3) :: k_s
 
- write(6,'(/,a)') ' ... doing inverse laplace .................................................'
- flush(6)
+ if (worldrank == 0_pInt) then 
+   write(6,'(/,a)') ' ... doing inverse laplace .................................................'
+   flush(6)
+ endif
  
- do k = 1_pInt, grid(3)
-  k_s(3) = k - 1_pInt
-  if(k > grid(3)/2_pInt + 1_pInt) k_s(3) = k_s(3) - grid(3)                                        ! running from 0,1,...,N/2,N/2+1,-N/2,-N/2+1,...,-1
-  do j = 1_pInt, grid(2)
-    k_s(2) = j - 1_pInt
-    if(j > grid(2)/2_pInt + 1_pInt) k_s(2) = k_s(2) - grid(2)                                      ! running from 0,1,...,N/2,N/2+1,-N/2,-N/2+1,...,-1
-    do i = 1_pInt, grid1Red
-      k_s(1) = i - 1_pInt  
-      if (any(k_s /= 0_pInt)) field_fourier(i,j,k, 1:3,1:3) =  &
-                              field_fourier(i,j,k, 1:3,1:3)/ &
-                                        cmplx(-sum((2.0_pReal*PI*k_s/geomSize)*&
-                                                   (2.0_pReal*PI*k_s/geomSize)),0.0_pReal,pReal)   ! symmetry, junst running from 0,1,...,N/2,N/2+1
-enddo; enddo; enddo
-field_fourier(1,1,1,1:3,1:3) = cmplx(0.0_pReal,0.0_pReal,pReal)
+ do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, grid1Red
+   k_s = xi(1:3,i,j,k)*scaledGeomSize
+   if (any(k_s /= 0_pInt)) field_fourierMPI(1:3,1:3,i,j,k-gridOffset) =  &
+                           field_fourierMPI(1:3,1:3,i,j,k-gridOffset)/ &
+                           cmplx(-sum((2.0_pReal*PI*k_s/geomSizeGlobal)* &
+     (2.0_pReal*PI*k_s/geomSizeGlobal)),0.0_pReal,pReal)
+ enddo; enddo; enddo
+ if (gridOffset == 0_pInt) &
+  field_fourierMPI(1:3,1:3,1,1,1) = cmplx(0.0_pReal,0.0_pReal,pReal)
 
 end subroutine utilities_inverseLaplace
  
@@ -535,45 +560,51 @@ subroutine utilities_fourierConvolution(fieldAim)
    memory_efficient
  use math, only: &
    math_inv33
+ use numerics, only: &
+   worldrank 
+ use mesh, only: &
+   gridLocal, &
+   gridOffset
 
  implicit none  
  real(pReal), intent(in), dimension(3,3) :: fieldAim                                                !< desired average value of the field after convolution
  real(pReal),             dimension(3,3) :: xiDyad, temp33_Real
- real(pReal)                             :: filter                                                  !< weighting of current component
  complex(pReal),          dimension(3,3) :: temp33_complex
  integer(pInt) :: &
    i, j, k, &
    l, m, n, o
 
- write(6,'(/,a)') ' ... doing convolution .....................................................'
- flush(6)
+ if (worldrank == 0_pInt) then 
+   write(6,'(/,a)') ' ... doing convolution .....................................................'
+   flush(6)
+ endif
  
 !--------------------------------------------------------------------------------------------------
 ! do the actual spectral method calculation (mechanical equilibrium)
  if(memory_efficient) then                                                                          ! memory saving version, on-the-fly calculation of gamma_hat
-   do k = 1_pInt, grid(3); do j = 1_pInt, grid(2) ;do i = 1_pInt, grid1Red
-     if(any([i,j,k] /= 1_pInt)) then                                                                ! singular point at xi=(0.0,0.0,0.0) i.e. i=j=k=1       
+   do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2) ;do i = 1_pInt, grid1Red
+     if(any([i,j,k+gridOffset] /= 1_pInt)) then                                                     ! singular point at xi=(0.0,0.0,0.0) i.e. i=j=k=1       
        forall(l = 1_pInt:3_pInt, m = 1_pInt:3_pInt) &
          xiDyad(l,m) = xi(l, i,j,k)*xi(m, i,j,k)
        forall(l = 1_pInt:3_pInt, m = 1_pInt:3_pInt) &
          temp33_Real(l,m) = sum(C_ref(l,1:3,m,1:3)*xiDyad)
        temp33_Real = math_inv33(temp33_Real)
-       filter = utilities_getFilter(xi(1:3,i,j,k))                                                  ! weighting factor computed by getFilter function
        forall(l=1_pInt:3_pInt, m=1_pInt:3_pInt, n=1_pInt:3_pInt, o=1_pInt:3_pInt)&
-         gamma_hat(l,m,n,o, 1,1,1) =  filter*temp33_Real(l,n)*xiDyad(m,o)
+         gamma_hat(l,m,n,o, 1,1,1) =  temp33_Real(l,n)*xiDyad(m,o)
        forall(l = 1_pInt:3_pInt, m = 1_pInt:3_pInt) &
-         temp33_Complex(l,m) = sum(gamma_hat(l,m,1:3,1:3, 1,1,1) * field_fourier(i,j,k,1:3,1:3))
-       field_fourier(i,j,k,1:3,1:3) = temp33_Complex 
+         temp33_Complex(l,m) = sum(gamma_hat(l,m,1:3,1:3, 1,1,1) * field_fourierMPI(1:3,1:3,i,j,k))
+       field_fourierMPI(1:3,1:3,i,j,k) = temp33_Complex 
      endif             
    enddo; enddo; enddo
  else                                                                                               ! use precalculated gamma-operator
-   do k = 1_pInt, grid(3);  do j = 1_pInt, grid(2);  do i = 1_pInt,grid1Red
+   do k = 1_pInt, gridLocal(3);  do j = 1_pInt, gridLocal(2);  do i = 1_pInt,grid1Red
      forall(l = 1_pInt:3_pInt, m = 1_pInt:3_pInt) &
-       temp33_Complex(l,m) = sum(gamma_hat(l,m,1:3,1:3, i,j,k) * field_fourier(i,j,k,1:3,1:3))
-     field_fourier(i,j,k, 1:3,1:3) = temp33_Complex
+       temp33_Complex(l,m) = sum(gamma_hat(l,m,1:3,1:3,i,j,k) * field_fourierMPI(1:3,1:3,i,j,k))
+     field_fourierMPI(1:3,1:3,i,j,k) = temp33_Complex
    enddo; enddo; enddo
  endif
- field_fourier(1,1,1,1:3,1:3) = cmplx(fieldAim*real(product(grid),pReal),0.0_pReal,pReal)           ! singular point at xi=(0.0,0.0,0.0) i.e. i=j=k=1  
+ if (gridOffset == 0_pInt) &
+   field_fourierMPI(1:3,1:3,1,1,1) = cmplx(fieldAim/wgt,0.0_pReal,pReal) ! singular point at xi=(0.0,0.0,0.0) i.e. i=j=k=1  
 
 end subroutine utilities_fourierConvolution
  
@@ -582,7 +613,12 @@ end subroutine utilities_fourierConvolution
 !> @brief calculate root mean square of divergence of field_fourier
 !--------------------------------------------------------------------------------------------------
 real(pReal) function utilities_divergenceRMS()
- use math                                                                                           !< must use the whole module for use of FFTW
+ use math    
+ use numerics, only: &
+   worldrank                                   
+ use mesh, only: &
+   gridLocal, &
+   gridGlobal
 
  implicit none
  integer(pInt) :: i, j, k 
@@ -591,56 +627,70 @@ real(pReal) function utilities_divergenceRMS()
    err_div_max, &                                                                                   !< maximum value of divergence in Fourier space
    err_real_div_max                                                                                 !< maximum value of divergence in real space
  complex(pReal), dimension(3) ::  temp3_complex
+ PetscErrorCode :: ierr
 
- write(6,'(/,a)') ' ... calculating divergence ................................................'
- flush(6)
+ 
+ if (worldrank == 0_pInt) then 
+   write(6,'(/,a)') ' ... calculating divergence ................................................'
+   flush(6)
+ endif
 
 !--------------------------------------------------------------------------------------------------
 ! calculating RMS divergence criterion in Fourier space
  utilities_divergenceRMS = 0.0_pReal
- do k = 1_pInt, grid(3); do j = 1_pInt, grid(2)
+ do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2)
    do i = 2_pInt, grid1Red -1_pInt                                                                  ! Has somewhere a conj. complex counterpart. Therefore count it twice.
      utilities_divergenceRMS = utilities_divergenceRMS &
-           + 2.0_pReal*(sum (real(math_mul33x3_complex(field_fourier(i,j,k,1:3,1:3),&               ! (sqrt(real(a)**2 + aimag(a)**2))**2 = real(a)**2 + aimag(a)**2. do not take square root and square again
-                                           xi(1:3,i,j,k))*TWOPIIMG)**2.0_pReal)&                    ! --> sum squared L_2 norm of vector 
-                       +sum(aimag(math_mul33x3_complex(field_fourier(i,j,k,1:3,1:3),& 
-                                                          xi(1:3,i,j,k))*TWOPIIMG)**2.0_pReal))
+           + 2.0_pReal*(sum (real(math_mul33x3_complex(field_fourierMPI(1:3,1:3,i,j,k),&            ! (sqrt(real(a)**2 + aimag(a)**2))**2 = real(a)**2 + aimag(a)**2. do not take square root and square again
+                                                       xi(1:3,i,j,k))*TWOPIIMG)**2.0_pReal)&        ! --> sum squared L_2 norm of vector 
+                       +sum(aimag(math_mul33x3_complex(field_fourierMPI(1:3,1:3,i,j,k),& 
+                                                       xi(1:3,i,j,k))*TWOPIIMG)**2.0_pReal))
    enddo
    utilities_divergenceRMS = utilities_divergenceRMS &                                              ! these two layers (DC and Nyquist) do not have a conjugate complex counterpart (if grid(1) /= 1)
-              + sum( real(math_mul33x3_complex(field_fourier(1       ,j,k,1:3,1:3),&
-                                                  xi(1:3,1       ,j,k))*TWOPIIMG)**2.0_pReal)&
-              + sum(aimag(math_mul33x3_complex(field_fourier(1       ,j,k,1:3,1:3),&
-                                                  xi(1:3,1       ,j,k))*TWOPIIMG)**2.0_pReal)&
-              + sum( real(math_mul33x3_complex(field_fourier(grid1Red,j,k,1:3,1:3),&
-                                                  xi(1:3,grid1Red,j,k))*TWOPIIMG)**2.0_pReal)&
-              + sum(aimag(math_mul33x3_complex(field_fourier(grid1Red,j,k,1:3,1:3),&
-                                                  xi(1:3,grid1Red,j,k))*TWOPIIMG)**2.0_pReal)
+              + sum( real(math_mul33x3_complex(field_fourierMPI(1:3,1:3,1       ,j,k), &
+                                               xi(1:3,1       ,j,k))*TWOPIIMG)**2.0_pReal) &
+              + sum(aimag(math_mul33x3_complex(field_fourierMPI(1:3,1:3,1       ,j,k), &
+                                               xi(1:3,1       ,j,k))*TWOPIIMG)**2.0_pReal) &
+              + sum( real(math_mul33x3_complex(field_fourierMPI(1:3,1:3,grid1Red,j,k), &
+                                               xi(1:3,grid1Red,j,k))*TWOPIIMG)**2.0_pReal) &
+              + sum(aimag(math_mul33x3_complex(field_fourierMPI(1:3,1:3,grid1Red,j,k), &
+                                               xi(1:3,grid1Red,j,k))*TWOPIIMG)**2.0_pReal)
  enddo; enddo
- if(grid(1) == 1_pInt) utilities_divergenceRMS = utilities_divergenceRMS * 0.5_pReal                ! counted twice in case of grid(1) == 1
+ if(gridGlobal(1) == 1_pInt) utilities_divergenceRMS = utilities_divergenceRMS * 0.5_pReal          ! counted twice in case of grid(1) == 1
+ call MPI_Allreduce(MPI_IN_PLACE,utilities_divergenceRMS,1,MPI_DOUBLE,MPI_SUM,PETSC_COMM_WORLD,ierr)
  utilities_divergenceRMS = sqrt(utilities_divergenceRMS) * wgt                                      ! RMS in real space calculated with Parsevals theorem from Fourier space
 
 !--------------------------------------------------------------------------------------------------
 ! calculate additional divergence criteria and report
  if (debugDivergence) then                                                                          ! calculate divergence again
    err_div_max = 0.0_pReal
-   do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); do i = 1_pInt, grid1Red
-     temp3_Complex = math_mul33x3_complex(field_fourier(i,j,k,1:3,1:3)*wgt,&                        ! weighting P_fourier
+   do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); do i = 1_pInt, grid1Red
+     temp3_Complex = math_mul33x3_complex(field_fourierMPI(1:3,1:3,i,j,k)*wgt,&                     ! weighting P_fourier
                                              xi(1:3,i,j,k))*TWOPIIMG
      err_div_max = max(err_div_max,sum(abs(temp3_Complex)**2.0_pReal))
-     divFourier(i,j,k,1:3) = temp3_Complex                                                          ! need divergence NOT squared
+     divFourierMPI(1:3,i,j,k) = temp3_Complex                                                       ! need divergence NOT squared
    enddo; enddo; enddo
    
-   call fftw_execute_dft_c2r(planDiv,divFourier,divReal)                                            ! already weighted
+   call fftw_mpi_execute_dft_c2r(planDivMPI,divFourierMPI,divRealMPI)                               ! already weighted
 
-   err_real_div_RMS = sqrt(wgt*sum(divReal**2.0_pReal))                                             ! RMS in real space
-   err_real_div_max = sqrt(maxval(sum(divReal**2.0_pReal,dim=4)))                                   ! max in real space                                       
+   err_real_div_RMS = sum(divRealMPI**2.0_pReal)
+   call MPI_reduce(MPI_IN_PLACE,err_real_div_RMS,1,MPI_DOUBLE,MPI_SUM,0,PETSC_COMM_WORLD,ierr)
+   err_real_div_RMS = sqrt(wgt*err_real_div_RMS)                                                    ! RMS in real space
+   
+   err_real_div_max = maxval(sum(divRealMPI**2.0_pReal,dim=4))                                      ! max in real space                                       
+   call MPI_reduce(MPI_IN_PLACE,err_real_div_max,1,MPI_DOUBLE,MPI_MAX,0,PETSC_COMM_WORLD,ierr)
+   err_real_div_max = sqrt(err_real_div_max)
+
+   call MPI_reduce(MPI_IN_PLACE,err_div_max,1,MPI_DOUBLE,MPI_MAX,0,PETSC_COMM_WORLD,ierr)
    err_div_max      = sqrt(    err_div_max)                                                         ! max in Fourier space
    
-   write(6,'(/,1x,a,es11.4)')      'error divergence  FT  RMS = ',utilities_divergenceRMS
-   write(6,'(1x,a,es11.4)')        'error divergence Real RMS = ',err_real_div_RMS
-   write(6,'(1x,a,es11.4)')        'error divergence  FT  max = ',err_div_max
-   write(6,'(1x,a,es11.4)')        'error divergence Real max = ',err_real_div_max
-   flush(6)
+   if (worldrank == 0_pInt) then 
+     write(6,'(/,1x,a,es11.4)')      'error divergence  FT  RMS = ',utilities_divergenceRMS
+     write(6,'(1x,a,es11.4)')        'error divergence Real RMS = ',err_real_div_RMS
+     write(6,'(1x,a,es11.4)')        'error divergence  FT  max = ',err_div_max
+     write(6,'(1x,a,es11.4)')        'error divergence Real max = ',err_real_div_max
+     flush(6)
+   endif
  endif
 
 end function utilities_divergenceRMS
@@ -650,54 +700,64 @@ end function utilities_divergenceRMS
 !> @brief calculate max of curl of field_fourier
 !--------------------------------------------------------------------------------------------------
 real(pReal) function utilities_curlRMS()
- use math                                                                                           !< must use the whole module for use of FFTW
+ use math       
+ use numerics, only: &
+   worldrank                                            
+ use mesh, only: &
+   gridLocal, &
+   gridGlobal
 
  implicit none
  integer(pInt)  ::  i, j, k, l 
  complex(pReal), dimension(3,3) ::  curl_fourier
+ PetscErrorCode :: ierr
 
- write(6,'(/,a)') ' ... calculating curl ......................................................'
- flush(6)
-!--------------------------------------------------------------------------------------------------
+ if (worldrank == 0_pInt) then 
+   write(6,'(/,a)') ' ... calculating curl ......................................................'
+   flush(6)
+ endif
+
+ !--------------------------------------------------------------------------------------------------
 ! calculating max curl criterion in Fourier space
  utilities_curlRMS = 0.0_pReal
  
- do k = 1_pInt, grid(3); do j = 1_pInt, grid(2); 
+ do k = 1_pInt, gridLocal(3); do j = 1_pInt, gridLocal(2); 
  do i = 2_pInt, grid1Red - 1_pInt
    do l = 1_pInt, 3_pInt
-     curl_fourier(l,1) = (+field_fourier(i,j,k,l,3)*xi(2,i,j,k)&
-                          -field_fourier(i,j,k,l,2)*xi(3,i,j,k))*TWOPIIMG
-     curl_fourier(l,2) = (+field_fourier(i,j,k,l,1)*xi(3,i,j,k)&
-                          -field_fourier(i,j,k,l,3)*xi(1,i,j,k))*TWOPIIMG
-     curl_fourier(l,3) = (+field_fourier(i,j,k,l,2)*xi(1,i,j,k)&
-                          -field_fourier(i,j,k,l,1)*xi(2,i,j,k))*TWOPIIMG
+     curl_fourier(l,1) = (+field_fourierMPI(l,3,i,j,k)*xi(2,i,j,k)&
+                          -field_fourierMPI(l,2,i,j,k)*xi(3,i,j,k))*TWOPIIMG
+     curl_fourier(l,2) = (+field_fourierMPI(l,1,i,j,k)*xi(3,i,j,k)&
+                          -field_fourierMPI(l,3,i,j,k)*xi(1,i,j,k))*TWOPIIMG
+     curl_fourier(l,3) = (+field_fourierMPI(l,2,i,j,k)*xi(1,i,j,k)&
+                          -field_fourierMPI(l,1,i,j,k)*xi(2,i,j,k))*TWOPIIMG
    enddo
    utilities_curlRMS = utilities_curlRMS + &
-                       2.0_pReal*sum(real(curl_fourier)**2.0_pReal + aimag(curl_fourier)**2.0_pReal)
+                      2.0_pReal*sum(real(curl_fourier)**2.0_pReal + aimag(curl_fourier)**2.0_pReal)
  enddo 
  do l = 1_pInt, 3_pInt
-   curl_fourier = (+field_fourier(1,j,k,l,3)*xi(2,1,j,k)&
-                   -field_fourier(1,j,k,l,2)*xi(3,1,j,k))*TWOPIIMG
-   curl_fourier = (+field_fourier(1,j,k,l,1)*xi(3,1,j,k)&
-                   -field_fourier(1,j,k,l,3)*xi(1,1,j,k))*TWOPIIMG
-   curl_fourier = (+field_fourier(1,j,k,l,2)*xi(1,1,j,k)&
-                   -field_fourier(1,j,k,l,1)*xi(2,1,j,k))*TWOPIIMG
+   curl_fourier = (+field_fourierMPI(l,3,1,j,k)*xi(2,1,j,k)&
+                   -field_fourierMPI(l,2,1,j,k)*xi(3,1,j,k))*TWOPIIMG
+   curl_fourier = (+field_fourierMPI(l,1,1,j,k)*xi(3,1,j,k)&
+                   -field_fourierMPI(l,3,1,j,k)*xi(1,1,j,k))*TWOPIIMG
+   curl_fourier = (+field_fourierMPI(l,2,1,j,k)*xi(1,1,j,k)&
+                   -field_fourierMPI(l,1,1,j,k)*xi(2,1,j,k))*TWOPIIMG
  enddo
  utilities_curlRMS = utilities_curlRMS + &
                      2.0_pReal*sum(real(curl_fourier)**2.0_pReal + aimag(curl_fourier)**2.0_pReal)   
  do l = 1_pInt, 3_pInt  
-   curl_fourier = (+field_fourier(grid1Red,j,k,l,3)*xi(2,grid1Red,j,k)&
-                   -field_fourier(grid1Red,j,k,l,2)*xi(3,grid1Red,j,k))*TWOPIIMG
-   curl_fourier = (+field_fourier(grid1Red,j,k,l,1)*xi(3,grid1Red,j,k)&
-                   -field_fourier(grid1Red,j,k,l,3)*xi(1,grid1Red,j,k))*TWOPIIMG
-   curl_fourier = (+field_fourier(grid1Red,j,k,l,2)*xi(1,grid1Red,j,k)&
-                   -field_fourier(grid1Red,j,k,l,1)*xi(2,grid1Red,j,k))*TWOPIIMG
+   curl_fourier = (+field_fourierMPI(l,3,grid1Red,j,k)*xi(2,grid1Red,j,k)&
+                   -field_fourierMPI(l,2,grid1Red,j,k)*xi(3,grid1Red,j,k))*TWOPIIMG
+   curl_fourier = (+field_fourierMPI(l,1,grid1Red,j,k)*xi(3,grid1Red,j,k)&
+                   -field_fourierMPI(l,3,grid1Red,j,k)*xi(1,grid1Red,j,k))*TWOPIIMG
+   curl_fourier = (+field_fourierMPI(l,2,grid1Red,j,k)*xi(1,grid1Red,j,k)&
+                   -field_fourierMPI(l,1,grid1Red,j,k)*xi(2,grid1Red,j,k))*TWOPIIMG
  enddo
  utilities_curlRMS = utilities_curlRMS + &
                      2.0_pReal*sum(real(curl_fourier)**2.0_pReal + aimag(curl_fourier)**2.0_pReal) 
  enddo; enddo
+ call MPI_Allreduce(MPI_IN_PLACE,utilities_curlRMS,1,MPI_DOUBLE,MPI_SUM,PETSC_COMM_WORLD,ierr)
  utilities_curlRMS = sqrt(utilities_curlRMS) * wgt
- if(grid(1) == 1_pInt) utilities_curlRMS = utilities_curlRMS * 0.5_pReal                           ! counted twice in case of grid(1) == 1
+ if(gridGlobal(1) == 1_pInt) utilities_curlRMS = utilities_curlRMS * 0.5_pReal                      ! counted twice in case of grid(1) == 1
 
 end function utilities_curlRMS
 
@@ -708,6 +768,8 @@ end function utilities_curlRMS
 function utilities_maskedCompliance(rot_BC,mask_stress,C)
  use IO, only: &
    IO_error
+ use numerics, only: &
+   worldrank
  use math, only: &
    math_Plain3333to99, &
    math_plain99to3333, &
@@ -716,10 +778,10 @@ function utilities_maskedCompliance(rot_BC,mask_stress,C)
    math_invert
 
  implicit none
- real(pReal),              dimension(3,3,3,3) :: utilities_maskedCompliance                        !< masked compliance
- real(pReal), intent(in) , dimension(3,3,3,3) :: C                                                !< current average stiffness
- real(pReal), intent(in) , dimension(3,3)     :: rot_BC                                           !< rotation of load frame
- logical,     intent(in),  dimension(3,3)     :: mask_stress                                     !< mask of stress BC
+ real(pReal),              dimension(3,3,3,3) :: utilities_maskedCompliance                         !< masked compliance
+ real(pReal), intent(in) , dimension(3,3,3,3) :: C                                                  !< current average stiffness
+ real(pReal), intent(in) , dimension(3,3)     :: rot_BC                                             !< rotation of load frame
+ logical,     intent(in),  dimension(3,3)     :: mask_stress                                        !< mask of stress BC
  integer(pInt) :: j, k, m, n 
  logical, dimension(9) :: mask_stressVector
  real(pReal), dimension(9,9) :: temp99_Real   
@@ -740,7 +802,7 @@ function utilities_maskedCompliance(rot_BC,mask_stress,C)
 
    temp99_Real = math_Plain3333to99(math_rotate_forward3333(C,rot_BC))
 
-   if(debugGeneral) then 
+   if(debugGeneral .and. worldrank == 0_pInt) then 
      write(6,'(/,a)') ' ... updating masked compliance ............................................'
      write(6,'(/,a,/,9(9(2x,f12.7,1x)/))',advance='no') ' Stiffness C (load) / GPa =',&
                                                   transpose(temp99_Real)/1.e9_pReal
@@ -779,7 +841,7 @@ function utilities_maskedCompliance(rot_BC,mask_stress,C)
        if(m/=n .and. abs(sTimesC(m,n)) > (0.0_pReal + 10.0e-12_pReal)) errmatinv = .true.           ! off diagonal elements of S*C should be 0
      enddo
    enddo
-   if(debugGeneral .or. errmatinv) then                                                             ! report
+   if((debugGeneral .or. errmatinv) .and. (worldrank == 0_pInt)) then                               ! report
      write(formatString, '(I16.16)') size_reduced
      formatString = '(/,a,/,'//trim(formatString)//'('//trim(formatString)//'(2x,es9.2,1x)/))'
      write(6,trim(formatString),advance='no') ' C * S (load) ', &
@@ -793,7 +855,7 @@ function utilities_maskedCompliance(rot_BC,mask_stress,C)
  else
    temp99_real = 0.0_pReal
  endif
- if(debugGeneral) &                                                                                 ! report
+ if(debugGeneral .and. worldrank == 0_pInt) &                                                       ! report
    write(6,'(/,a,/,9(9(2x,f12.7,1x)/),/)',advance='no') ' Masked Compliance (load) * GPa =', &
                                                     transpose(temp99_Real*1.e9_pReal)
  flush(6)
@@ -810,10 +872,14 @@ subroutine utilities_constitutiveResponse(F_lastInc,F,temperature,timeinc,&
  use debug, only: &
    debug_reset, &
    debug_info
+ use numerics, only: &
+   worldrank
  use math, only: &
    math_transpose33, &
    math_rotate_forward33, &
    math_det33
+ use mesh, only: &
+   gridLocal
  use FEsolving, only: &
    restartWrite
  use CPFEM, only: &
@@ -831,7 +897,7 @@ subroutine utilities_constitutiveResponse(F_lastInc,F,temperature,timeinc,&
  
  implicit none
  real(pReal), intent(in)                                         :: temperature                     !< temperature (no field)
- real(pReal), intent(in), dimension(3,3,grid(1),grid(2),grid(3)) :: &
+ real(pReal), intent(in), dimension(3,3,gridLocal(1),gridLocal(2),gridLocal(3)) :: &
    F_lastInc, &                                                                                     !< target deformation gradient
    F                                                                                                !< previous deformation gradient
  real(pReal), intent(in)                                         :: timeinc                         !< loading time
@@ -840,20 +906,24 @@ subroutine utilities_constitutiveResponse(F_lastInc,F,temperature,timeinc,&
  
  real(pReal),intent(out), dimension(3,3,3,3)                     :: C_volAvg, C_minmaxAvg           !< average stiffness
  real(pReal),intent(out), dimension(3,3)                         :: P_av                            !< average PK stress
- real(pReal),intent(out), dimension(3,3,grid(1),grid(2),grid(3)) :: P                               !< PK stress
+ real(pReal),intent(out), dimension(3,3,gridLocal(1),gridLocal(2),gridLocal(3)) :: P                !< PK stress
  
  integer(pInt) :: &
    calcMode, &                                                                                      !< CPFEM mode for calculation
    j,k
  real(pReal), dimension(3,3,3,3) :: max_dPdF, min_dPdF
  real(pReal)   :: max_dPdF_norm, min_dPdF_norm, defgradDetMin, defgradDetMax, defgradDet
+ PetscErrorCode :: ierr
 
- write(6,'(/,a)') ' ... evaluating constitutive response ......................................'
+  if (worldrank == 0_pInt) then 
+   write(6,'(/,a)') ' ... evaluating constitutive response ......................................'
+   flush(6)
+ endif
  calcMode    = CPFEM_CALCRESULTS
 
  if (forwardData) then                                                                              ! aging results
    calcMode    = ior(calcMode,    CPFEM_AGERESULTS)             
-   materialpoint_F0 = reshape(F_lastInc, [3,3,1,product(grid)])
+   materialpoint_F0 = reshape(F_lastInc, [3,3,1,product(gridLocal)])
  endif
  if (cutBack) then                                                                                  ! restore saved variables
   calcMode    = iand(calcMode,    not(CPFEM_AGERESULTS)) 
@@ -862,7 +932,7 @@ subroutine utilities_constitutiveResponse(F_lastInc,F,temperature,timeinc,&
  call CPFEM_general(CPFEM_COLLECT,F_lastInc(1:3,1:3,1,1,1),F(1:3,1:3,1,1,1), &
                    temperature,timeinc,1_pInt,1_pInt)
  thermal_isothermal_temperature(:) = temperature
- materialpoint_F  = reshape(F,[3,3,1,product(grid)])
+ materialpoint_F  = reshape(F,[3,3,1,product(gridLocal)])
 
  call debug_reset()
 
@@ -871,24 +941,28 @@ subroutine utilities_constitutiveResponse(F_lastInc,F,temperature,timeinc,&
  if(debugGeneral) then
    defgradDetMax = -huge(1.0_pReal)
    defgradDetMin = +huge(1.0_pReal)
-   do j = 1_pInt, product(grid)
+   do j = 1_pInt, product(gridLocal)
      defgradDet = math_det33(materialpoint_F(1:3,1:3,1,j))
      defgradDetMax = max(defgradDetMax,defgradDet)
      defgradDetMin = min(defgradDetMin,defgradDet) 
    end do
-   write(6,'(a,1x,es11.4)') ' max determinant of deformation =', defgradDetMax
-   write(6,'(a,1x,es11.4)') ' min determinant of deformation =', defgradDetMin
-   flush(6)
+   call MPI_reduce(MPI_IN_PLACE,defgradDetMax,1,MPI_DOUBLE,MPI_MAX,0,PETSC_COMM_WORLD,ierr)
+   call MPI_reduce(MPI_IN_PLACE,defgradDetMin,1,MPI_DOUBLE,MPI_MIN,0,PETSC_COMM_WORLD,ierr)
+   if (worldrank == 0_pInt) then 
+     write(6,'(a,1x,es11.4)') ' max determinant of deformation =', defgradDetMax
+     write(6,'(a,1x,es11.4)') ' min determinant of deformation =', defgradDetMin
+     flush(6)
+   endif
  endif
   
- call CPFEM_general(calcMode,F_lastInc(1:3,1:3,1,1,1), F(1:3,1:3,1,1,1), &              ! first call calculates everything
+ call CPFEM_general(calcMode,F_lastInc(1:3,1:3,1,1,1), F(1:3,1:3,1,1,1), &                          ! first call calculates everything
                     temperature,timeinc,1_pInt,1_pInt)
 
  max_dPdF = 0.0_pReal
  max_dPdF_norm = 0.0_pReal
  min_dPdF = huge(1.0_pReal)
  min_dPdF_norm = huge(1.0_pReal)
- do k = 1_pInt, product(grid)
+ do k = 1_pInt, product(gridLocal)
    if (max_dPdF_norm < sum(materialpoint_dPdF(1:3,1:3,1:3,1:3,1,k)**2.0_pReal)) then
      max_dPdF = materialpoint_dPdF(1:3,1:3,1:3,1:3,1,k)
      max_dPdF_norm = sum(materialpoint_dPdF(1:3,1:3,1:3,1:3,1,k)**2.0_pReal)
@@ -898,23 +972,30 @@ subroutine utilities_constitutiveResponse(F_lastInc,F,temperature,timeinc,&
      min_dPdF_norm = sum(materialpoint_dPdF(1:3,1:3,1:3,1:3,1,k)**2.0_pReal)
    endif  
  end do
-
- P = reshape(materialpoint_P, [3,3,grid(1),grid(2),grid(3)])
- C_volAvg = sum(sum(materialpoint_dPdF,dim=6),dim=5) * wgt
+ call MPI_Allreduce(MPI_IN_PLACE,max_dPdF,81,MPI_DOUBLE,MPI_MAX,PETSC_COMM_WORLD,ierr)
+ call MPI_Allreduce(MPI_IN_PLACE,min_dPdF,81,MPI_DOUBLE,MPI_MIN,PETSC_COMM_WORLD,ierr)
  C_minmaxAvg = 0.5_pReal*(max_dPdF + min_dPdF)
+
+ C_volAvg = sum(sum(materialpoint_dPdF,dim=6),dim=5) * wgt
+ call MPI_Allreduce(MPI_IN_PLACE,C_volAvg,81,MPI_DOUBLE,MPI_SUM,PETSC_COMM_WORLD,ierr)
  
  call debug_info()
  
  restartWrite = .false.                                                                             ! reset restartWrite status
  cutBack = .false.                                                                                  ! reset cutBack status
  
+ P = reshape(materialpoint_P, [3,3,gridLocal(1),gridLocal(2),gridLocal(3)])
  P_av = sum(sum(sum(P,dim=5),dim=4),dim=3) * wgt                                                    ! average of P 
- if (debugRotation) &
+ call MPI_Allreduce(MPI_IN_PLACE,P_av,9,MPI_DOUBLE,MPI_SUM,PETSC_COMM_WORLD,ierr)
+ if (debugRotation .and. worldrank == 0_pInt) &
  write(6,'(/,a,/,3(3(2x,f12.4,1x)/))',advance='no') ' Piola--Kirchhoff stress (lab) / MPa =',&
                                                      math_transpose33(P_av)*1.e-6_pReal
  P_av = math_rotate_forward33(P_av,rotation_BC)
- write(6,'(/,a,/,3(3(2x,f12.4,1x)/))',advance='no') ' Piola--Kirchhoff stress / MPa =',&
+ if (worldrank == 0_pInt) then
+   write(6,'(/,a,/,3(3(2x,f12.4,1x)/))',advance='no') ' Piola--Kirchhoff stress / MPa =',&
                                                      math_transpose33(P_av)*1.e-6_pReal
+   flush(6)
+ endif
 
 end subroutine utilities_constitutiveResponse
 
@@ -923,22 +1004,26 @@ end subroutine utilities_constitutiveResponse
 !> @brief calculates forward rate, either guessing or just add delta/timeinc
 !--------------------------------------------------------------------------------------------------
 pure function utilities_calculateRate(avRate,timeinc_old,guess,field_lastInc,field)
-
+ use mesh, only: &
+   gridLocal
+   
  implicit none
  real(pReal), intent(in), dimension(3,3)                      :: avRate                             !< homogeneous addon
  real(pReal), intent(in) :: &
    timeinc_old                                                                                      !< timeinc of last step
  logical, intent(in) :: &
    guess                                                                                            !< guess along former trajectory
- real(pReal), intent(in), dimension(3,3,grid(1),grid(2),grid(3)) :: &
+ real(pReal), intent(in), dimension(3,3,gridLocal(1),gridLocal(2),gridLocal(3)) :: &
    field_lastInc, &                                                                                 !< data of previous step
    field                                                                                            !< data of current step
- real(pReal),             dimension(3,3,grid(1),grid(2),grid(3)) :: utilities_calculateRate
+ real(pReal),             dimension(3,3,gridLocal(1),gridLocal(2),gridLocal(3)) :: &
+   utilities_calculateRate
  
- if(guess) then
+ if (guess) then
    utilities_calculateRate = (field-field_lastInc) / timeinc_old
  else
-   utilities_calculateRate = spread(spread(spread(avRate,3,grid(1)),4,grid(2)),5,grid(3))
+   utilities_calculateRate = spread(spread(spread(avRate,3,gridLocal(1)),4,gridLocal(2)), &
+                                                                           5,gridLocal(3))
  endif
 
 end function utilities_calculateRate
@@ -948,24 +1033,30 @@ end function utilities_calculateRate
 !> @brief forwards a field with a pointwise given rate, if aim is given, 
 !> ensures that the average matches the aim
 !--------------------------------------------------------------------------------------------------
-pure function utilities_forwardField(timeinc,field_lastInc,rate,aim)
+function utilities_forwardField(timeinc,field_lastInc,rate,aim)
+ use mesh, only: &
+   gridLocal
 
  implicit none
  real(pReal), intent(in) :: & 
    timeinc                                                                                          !< timeinc of current step
- real(pReal), intent(in),           dimension(3,3,grid(1),grid(2),grid(3)) :: &
+ real(pReal), intent(in),           dimension(3,3,gridLocal(1),gridLocal(2),gridLocal(3)) :: &
    field_lastInc, &                                                                                 !< initial field
    rate                                                                                             !< rate by which to forward
  real(pReal), intent(in), optional, dimension(3,3) :: &
    aim                                                                                              !< average field value aim
- real(pReal),                       dimension(3,3,grid(1),grid(2),grid(3)) :: utilities_forwardField
+ real(pReal),                       dimension(3,3,gridLocal(1),gridLocal(2),gridLocal(3)) :: &
+   utilities_forwardField
  real(pReal),                       dimension(3,3)                      :: fieldDiff                !< <a + adot*t> - aim
+ PetscErrorCode :: ierr
  
  utilities_forwardField = field_lastInc + rate*timeinc
  if (present(aim)) then                                                                             !< correct to match average
-   fieldDiff = sum(sum(sum(utilities_forwardField,dim=5),dim=4),dim=3)*wgt - aim
+   fieldDiff = sum(sum(sum(utilities_forwardField,dim=5),dim=4),dim=3)*wgt
+   call MPI_Allreduce(MPI_IN_PLACE,fieldDiff,9,MPI_DOUBLE,MPI_SUM,PETSC_COMM_WORLD,ierr)
+   fieldDiff = fieldDiff - aim
    utilities_forwardField = utilities_forwardField - &
-                          spread(spread(spread(fieldDiff,3,grid(1)),4,grid(2)),5,grid(3))
+                            spread(spread(spread(fieldDiff,3,gridLocal(1)),4,gridLocal(2)),5,gridLocal(3))
  endif
 
 end function utilities_forwardField
@@ -981,6 +1072,8 @@ real(pReal) function utilities_getFilter(k)
    spectral_filter
  use math, only: &
    PI
+ use mesh, only: &
+   gridGlobal
   
  implicit none
  real(pReal),intent(in), dimension(3) :: k                                                          !< indices of frequency
@@ -988,15 +1081,26 @@ real(pReal) function utilities_getFilter(k)
  utilities_getFilter = 1.0_pReal
 
  select case (spectral_filter)
-    case ('none')                                                                                   ! default, no weighting
-    case ('cosine')                                                                                 ! cosine curve with 1 for avg and zero for highest freq
-      utilities_getFilter = product(1.0_pReal + cos(PI*k*scaledGeomSize/grid))/8.0_pReal
-    case ('gradient')                                                                               ! gradient, might need grid scaling as for cosine filter
-      utilities_getFilter = 1.0_pReal/(1.0_pReal + &
-                                       (k(1)*k(1) + k(2)*k(2) + k(3)*k(3)))
-    case default
-      call IO_error(error_ID = 892_pInt, ext_msg = trim(spectral_filter))
-  end select 
+   case ('none')                                                                                    ! default, no weighting
+   case ('cosine')                                                                                  ! cosine curve with 1 for avg and zero for highest freq
+     utilities_getFilter = product(1.0_pReal + cos(PI*k*scaledGeomSize/gridGlobal))/8.0_pReal
+   case ('gradient')                                                                                ! gradient, might need grid scaling as for cosine filter
+     utilities_getFilter = 1.0_pReal/(1.0_pReal + &
+                                      (k(1)*k(1) + k(2)*k(2) + k(3)*k(3)))
+   case default
+     call IO_error(error_ID = 892_pInt, ext_msg = trim(spectral_filter))
+ end select 
+
+ if (gridGlobal(1) /= 1_pInt .and. k(1) == real(grid1Red - 1_pInt,    pReal)/scaledGeomSize(1)) &
+   utilities_getFilter = 0.0_pReal
+ if (gridGlobal(2) /= 1_pInt .and. k(2) == real(gridGlobal(2)/2_pInt, pReal)/scaledGeomSize(2)) &
+   utilities_getFilter = 0.0_pReal                                                                  ! do not delete the whole slice in case of 2D calculation
+ if (gridGlobal(2) /= 1_pInt .and. k(2) == real(gridGlobal(2)/2_pInt + mod(gridGlobal(2),2_pInt), pReal)/scaledGeomSize(2)) &
+   utilities_getFilter = 0.0_pReal                                                                  ! do not delete the whole slice in case of 2D calculation
+ if (gridGlobal(3) /= 1_pInt .and. k(3) == real(gridGlobal(3)/2_pInt, pReal)/scaledGeomSize(3)) &
+   utilities_getFilter = 0.0_pReal                                                                  ! do not delete the whole slice in case of 2D calculation
+ if (gridGlobal(3) /= 1_pInt .and. k(3) == real(gridGlobal(3)/2_pInt + mod(gridGlobal(3),2_pInt), pReal)/scaledGeomSize(3)) &
+   utilities_getFilter = 0.0_pReal                                                                  ! do not delete the whole slice in case of 2D calculation
 
 end function utilities_getFilter
 
@@ -1009,12 +1113,12 @@ subroutine utilities_destroy()
 
  implicit none
 
- if (debugDivergence) call fftw_destroy_plan(planDiv)
- if (debugFFTW) call fftw_destroy_plan(planDebugForth)
- if (debugFFTW) call fftw_destroy_plan(planDebugBack)
- call fftw_destroy_plan(planForth)
- call fftw_destroy_plan(planBack)
- !call fftw_destroy_plan(planCoords)
+ if (debugDivergence) call fftw_destroy_plan(planDivMPI)
+ if (debugFFTW) call fftw_destroy_plan(planDebugForthMPI)
+ if (debugFFTW) call fftw_destroy_plan(planDebugBackMPI)
+ call fftw_destroy_plan(planForthMPI)
+ call fftw_destroy_plan(planBackMPI)
+ call fftw_destroy_plan(planCoordsMPI)
 
 end subroutine utilities_destroy
 
@@ -1027,55 +1131,55 @@ end subroutine utilities_destroy
 subroutine utilities_updateIPcoords(F)
  use math
  use mesh, only: &
+   gridGlobal, &
+   gridLocal, &
+   gridOffset, &
+   geomSizeGlobal, &
    mesh_ipCoordinates 
  implicit none
 
- real(pReal),   dimension(3,3,grid(1),grid(2),grid(3)), intent(in) :: F
+ real(pReal),   dimension(3,3,gridLocal(1),gridLocal(2),gridLocal(3)), intent(in) :: F
  integer(pInt) :: i, j, k, m
- integer(pInt), dimension(3) :: k_s
  real(pReal),   dimension(3) :: step, offset_coords, integrator
  real(pReal),   dimension(3,3) :: Favg
+ PetscErrorCode :: ierr
 
- field_real = 0.0_pReal
- field_real(1:grid(1),1:grid(2),1:grid(3),1:3,1:3) = reshape(F,[grid(1),grid(2),grid(3),3,3],&
-                                                               order=[4,5,1,2,3])           
+ field_realMPI = 0.0_pReal
+ field_realMPI(1:3,1:3,1:gridLocal(1),1:gridLocal(2),1:gridLocal(3)) = F
  call utilities_FFTforward()
 
- integrator = geomsize * 0.5_pReal / PI
- step = geomsize/real(grid, pReal)
+ integrator = geomSizeGlobal * 0.5_pReal / PI
+ step = geomSizeGlobal/real(gridGlobal, pReal)
  
 !--------------------------------------------------------------------------------------------------
 ! average F
- Favg = real(field_fourier(1,1,1,1:3,1:3),pReal)/real(product(grid),pReal)
+ if (gridOffset == 0_pInt) Favg = real(field_fourierMPI(1:3,1:3,1,1,1),pReal)*wgt
+ call MPI_Bcast(Favg,9,MPI_DOUBLE,0,PETSC_COMM_WORLD,ierr)
 
 !--------------------------------------------------------------------------------------------------
 ! integration in Fourier space
- field_fourier = cmplx(0.0_pReal,0.0_pReal,pReal)
- do k = 1_pInt, grid(3)
-   k_s(3) = k-1_pInt
-   if(k > grid(3)/2_pInt+1_pInt) k_s(3) = k_s(3)-grid(3)
-   do j = 1_pInt, grid(2)
-     k_s(2) = j-1_pInt
-     if(j > grid(2)/2_pInt+1_pInt) k_s(2) = k_s(2)-grid(2)
-     do i = 1_pInt, grid1Red
-       k_s(1) = i-1_pInt
-       do m = 1_pInt,3_pInt
-         field_fourier(i,j,k,m,1) = sum(field_fourier(i,j,k,m,1:3)*&
-                                       cmplx(0.0_pReal,real(k_s,pReal)*integrator,pReal))
-       enddo
-       if (any(k_s /= 0_pInt)) field_fourier(i,j,k,1:3,1) = &
-                               field_fourier(i,j,k,1:3,1) / cmplx(-sum(k_s*k_s),0.0_pReal,pReal)
+ coords_fourierMPI = cmplx(0.0_pReal, 0.0_pReal, pReal)
+ do k = 1_pInt, gridLocal(3);  do j = 1_pInt, gridLocal(2);  do i = 1_pInt,grid1Red
+   do m = 1_pInt,3_pInt
+     coords_fourierMPI(m,i,j,k) = sum(field_fourierMPI(m,1:3,i,j,k)*&
+                                   cmplx(0.0_pReal,xi(1:3,i,j,k)*scaledGeomSize*integrator,pReal))
+   enddo
+   if (any(xi(1:3,i,j,k) /= 0.0_pReal)) coords_fourierMPI(1:3,i,j,k) = &
+     coords_fourierMPI(1:3,i,j,k)/cmplx(-sum(xi(1:3,i,j,k)*scaledGeomSize*xi(1:3,i,j,k)* &
+                                                     scaledGeomSize),0.0_pReal,pReal)
  enddo; enddo; enddo
- call utilities_FFTbackward()
+ call fftw_mpi_execute_dft_c2r(planCoordsMPI,coords_fourierMPI,coords_realMPI)
 
 !--------------------------------------------------------------------------------------------------
 ! add average to fluctuation and put (0,0,0) on (0,0,0)
- offset_coords = math_mul33x3(Favg,step/2.0_pReal) - field_real(1,1,1,1:3,1)
+ if (gridOffset == 0_pInt) offset_coords = coords_realMPI(1:3,1,1,1)
+ call MPI_Bcast(offset_coords,3,MPI_DOUBLE,0,PETSC_COMM_WORLD,ierr)
+ offset_coords = math_mul33x3(Favg,step/2.0_pReal) - offset_coords
  m = 1_pInt
- do k = 1_pInt,grid(3); do j = 1_pInt,grid(2); do i = 1_pInt,grid(1)
-   mesh_ipCoordinates(1:3,1,m) = field_real(i,j,k,1:3,1) &
+ do k = 1_pInt,gridLocal(3); do j = 1_pInt,gridLocal(2); do i = 1_pInt,gridLocal(1)
+   mesh_ipCoordinates(1:3,1,m) = coords_realMPI(1:3,i,j,k) &
                                + offset_coords &
-                               + math_mul33x3(Favg,step*real([i,j,k]-1_pInt,pReal))
+                               + math_mul33x3(Favg,step*real([i,j,k+gridOffset]-1_pInt,pReal))
    m = m+1_pInt
  enddo; enddo; enddo
 
diff --git a/code/Makefile b/code/Makefile
index 563031e89..5efdadea8 100644
--- a/code/Makefile
+++ b/code/Makefile
@@ -336,7 +336,7 @@ SPECTRAL_FILES = prec.o DAMASK_interface.o IO.o libs.o numerics.o debug.o math.o
                  FEsolving.o mesh.o material.o lattice.o \
                  $(DAMAGE_FILES) $(THERMAL_FILES) $(VACANCY_FILES) $(PLASTIC_FILES) \
                  crystallite.o $(HOMOGENIZATION_FILES) CPFEM.o \
-                 DAMASK_spectral_utilities.o DAMASK_spectral_solverBasic.o \
+                 DAMASK_spectral_utilities.o \
                  DAMASK_spectral_solverAL.o DAMASK_spectral_solverBasicPETSc.o DAMASK_spectral_solverPolarisation.o
 
 DAMASK_spectral.exe: DAMASK_spectral_driver.o 
@@ -345,18 +345,15 @@ DAMASK_spectral.exe: DAMASK_spectral_driver.o
     $(SPECTRAL_FILES) $(LIBRARIES) $(SUFFIX)
     
     
-DAMASK_spectral_driver.o: DAMASK_spectral_driver.f90 DAMASK_spectral_solverBasic.o \
-                                                     DAMASK_spectral_solverAL.o \
-                                                     DAMASK_spectral_solverBasicPETSc.o \
-                                                     DAMASK_spectral_solverPolarisation.o
+DAMASK_spectral_driver.o: DAMASK_spectral_driver.f90 \
+                          DAMASK_spectral_solverAL.o \
+                          DAMASK_spectral_solverBasicPETSc.o \
+                          DAMASK_spectral_solverPolarisation.o
 	$(PREFIX) $(COMPILERNAME) $(COMPILE_MAXOPTI) -c DAMASK_spectral_driver.f90 $(SUFFIX)
 
 DAMASK_spectral_solverAL.o:    DAMASK_spectral_solverAL.f90 \
                                DAMASK_spectral_utilities.o
   
-DAMASK_spectral_solverBasic.o: DAMASK_spectral_solverBasic.f90 \
-                               DAMASK_spectral_utilities.o
-  
 DAMASK_spectral_solverPolarisation.o:    DAMASK_spectral_solverPolarisation.f90 \
                                DAMASK_spectral_utilities.o
   
@@ -370,8 +367,8 @@ DAMASK_spectral_utilities.o:   DAMASK_spectral_utilities.f90 \
 # FEM Solver
 #####################
 VPATH := ../private/FEM/code
-DAMASK_FEM.exe: COMPILE         += -DFEM -DmultiphysicsOut
-DAMASK_FEM.exe: COMPILE_MAXOPTI += -DFEM -DmultiphysicsOut
+DAMASK_FEM.exe: COMPILE         += -DFEM
+DAMASK_FEM.exe: COMPILE_MAXOPTI += -DFEM
 DAMASK_FEM.exe: MESHNAME        := ../private/FEM/code/meshFEM.f90
 DAMASK_FEM.exe: INTERFACENAME   := ../private/FEM/code/DAMASK_FEM_interface.f90
 DAMASK_FEM.exe: INCLUDE_DIRS    += -I./
diff --git a/code/mesh.f90 b/code/mesh.f90
index 5396ccf00..feb3fb587 100644
--- a/code/mesh.f90
+++ b/code/mesh.f90
@@ -15,7 +15,7 @@ module mesh
  implicit none
  private
  integer(pInt), public, protected :: &
-   mesh_NcpElems, &                                                                                 !< total number of CP elements in mesh
+   mesh_NcpElems, &                                                                                 !< total number of CP elements in local mesh
    mesh_NelemSets, &
    mesh_maxNelemInSet, &
    mesh_Nmaterials, &
@@ -29,6 +29,18 @@ module mesh
    mesh_maxNcellnodes, &                                                                            !< max number of cell nodes in any CP element
    mesh_Nelems                                                                                      !< total number of elements in mesh
 
+#ifdef Spectral
+ integer(pInt), public, protected :: &
+   mesh_NcpElemsGlobal, &                                                                           !< total number of CP elements in global mesh
+   gridGlobal(3), &
+   gridLocal (3), &
+   gridOffset
+ real(pReal), public, protected :: &
+   geomSizeGlobal(3), &
+   geomSizeLocal (3), &
+   geomSizeOffset
+#endif
+
  integer(pInt), dimension(:,:), allocatable, public, protected :: &
    mesh_element, &                                                                                  !< FEid, type(internal representation), material, texture, node indices as CP IDs
    mesh_sharedElem, &                                                                               !< entryCount and list of elements containing node
@@ -103,10 +115,15 @@ module mesh
  logical, private :: noPart                                                                         !< for cases where the ABAQUS input file does not use part/assembly information
 #endif
 
-#ifdef Spectral
- include 'fftw3.f03'
+#ifdef PETSc
+#include <petsc-finclude/petscsys.h>
 #endif
 
+#ifdef Spectral
+ include 'fftw3-mpi.f03'
+#endif
+
+
 ! These definitions should actually reside in the FE-solver specific part (different for MARC/ABAQUS)
 ! Hence, I suggest to prefix with "FE_"
 
@@ -470,6 +487,9 @@ contains
 !! Order and routines strongly depend on type of solver
 !--------------------------------------------------------------------------------------------------
 subroutine mesh_init(ip,el)
+#ifdef Spectral
+ use, intrinsic :: iso_c_binding
+#endif
  use DAMASK_interface
  use, intrinsic :: iso_fortran_env                                                                  ! to get compiler_version and compiler_options (at least for gfortran 4.6 at the moment)
  use IO, only: &
@@ -503,6 +523,9 @@ subroutine mesh_init(ip,el)
    modelName
  
  implicit none
+#ifdef Spectral
+ integer(C_INTPTR_T) :: gridMPI(3), alloc_local, local_K, local_K_offset
+#endif
  integer(pInt), parameter :: FILEUNIT = 222_pInt
  integer(pInt), intent(in) :: el, ip
  integer(pInt) :: j
@@ -540,8 +563,25 @@ subroutine mesh_init(ip,el)
  myDebug = (iand(debug_level(debug_mesh),debug_levelBasic) /= 0_pInt)
 
 #ifdef Spectral
+ call fftw_mpi_init()
  call IO_open_file(FILEUNIT,geometryFile)                                                           ! parse info from geometry file...
  if (myDebug) write(6,'(a)') ' Opened geometry file'; flush(6)
+ 
+ gridGlobal = mesh_spectral_getGrid(fileUnit)
+ gridMPI = gridGlobal
+ alloc_local = fftw_mpi_local_size_3d(gridMPI(3), gridMPI(2), gridMPI(1)/2 +1, &
+                                      MPI_COMM_WORLD, local_K, local_K_offset)
+ gridLocal(1) = gridGlobal(1)
+ gridLocal(2) = gridGlobal(2)
+ gridLocal(3) = local_K
+ gridOffset  = local_K_offset
+ 
+ geomSizeGlobal = mesh_spectral_getSize(fileUnit)
+ geomSizeLocal(1) = geomSizeGlobal(1)
+ geomSizeLocal(2) = geomSizeGlobal(2)
+ geomSizeLocal(3) = geomSizeGlobal(3)*real(gridLocal(3))/real(gridGlobal(3))
+ geomSizeOffset = geomSizeGlobal(3)*real(gridOffset)  /real(gridGlobal(3))
+ if (myDebug) write(6,'(a)') ' Grid partitioned'; flush(6)
  call mesh_spectral_count(FILEUNIT)
  if (myDebug) write(6,'(a)') ' Counted nodes/elements'; flush(6)
  call mesh_spectral_mapNodesAndElems
@@ -656,10 +696,12 @@ subroutine mesh_init(ip,el)
 #endif
 
  close (FILEUNIT)
- call mesh_tell_statistics
- call mesh_write_meshfile
- call mesh_write_cellGeom
- call mesh_write_elemGeom
+ if (worldrank == 0_pInt) then
+   call mesh_tell_statistics
+   call mesh_write_meshfile
+   call mesh_write_cellGeom
+   call mesh_write_elemGeom
+ endif
 
  if (usePingPong .and. (mesh_Nelems /= mesh_NcpElems)) &
    call IO_error(600_pInt)                                                                          ! ping-pong must be disabled when having non-DAMASK elements
@@ -1201,15 +1243,15 @@ end function mesh_spectral_getHomogenization
 !! 'mesh_Nelems', 'mesh_Nnodes' and 'mesh_NcpElems'
 !--------------------------------------------------------------------------------------------------
 subroutine mesh_spectral_count(fileUnit)
- 
+
  implicit none
  integer(pInt),              intent(in) :: fileUnit
- integer(pInt), dimension(3)            :: grid
 
- grid     = mesh_spectral_getGrid(fileUnit)
- mesh_Nelems = product(grid)
- mesh_NcpElems = mesh_Nelems
- mesh_Nnodes = product(grid+1_pInt)
+ mesh_Nelems  = product(gridLocal)
+ mesh_NcpElems= mesh_Nelems
+ mesh_Nnodes  = product(gridLocal + 1_pInt)
+ 
+ mesh_NcpElemsGlobal = product(gridGlobal)
 
 end subroutine mesh_spectral_count
 
@@ -1262,27 +1304,18 @@ subroutine mesh_spectral_build_nodes(fileUnit)
 
  implicit none
  integer(pInt),              intent(in) :: fileUnit
- integer(pInt)                          :: n
- integer(pInt), dimension(3)            :: grid
- real(pReal),   dimension(3)            :: geomSize
+ integer(pInt)                          :: n, i, j, k
 
  allocate (mesh_node0 (3,mesh_Nnodes), source = 0.0_pReal)
  allocate (mesh_node  (3,mesh_Nnodes), source = 0.0_pReal)
  
- grid     = mesh_spectral_getGrid(fileUnit)
- geomSize = mesh_spectral_getSize(fileUnit)
-
- forall (n = 0_pInt:mesh_Nnodes-1_pInt)
-   mesh_node0(1,n+1_pInt) = mesh_unitlength * &
-           geomSize(1)*real(mod(n,(grid(1)+1_pInt) ),pReal) &
-                                                  / real(grid(1),pReal)
-   mesh_node0(2,n+1_pInt) = mesh_unitlength * &
-           geomSize(2)*real(mod(n/(grid(1)+1_pInt),(grid(2)+1_pInt)),pReal) &
-                                                  / real(grid(2),pReal)
-   mesh_node0(3,n+1_pInt) = mesh_unitlength * &
-           geomSize(3)*real(mod(n/(grid(1)+1_pInt)/(grid(2)+1_pInt),(grid(3)+1_pInt)),pReal) &
-                                                  / real(grid(3),pReal)
- end forall 
+ n = 0_pInt
+ do k = 1, gridLocal(3); do j = 1, gridLocal(2); do i = 1, gridLocal(1)
+   n = n + 1_pInt
+   mesh_node0(1,n) = real(i)*geomSizeLocal(1)/real(gridLocal(1))
+   mesh_node0(2,n) = real(j)*geomSizeLocal(2)/real(gridLocal(2))
+   mesh_node0(3,n) = real(k)*geomSizeLocal(3)/real(gridLocal(3)) + geomSizeOffset
+ enddo; enddo; enddo 
 
  mesh_node = mesh_node0
 
@@ -1315,11 +1348,11 @@ subroutine mesh_spectral_build_elements(fileUnit)
    headerLength = 0_pInt, &
    maxIntCount, &
    homog, &
-   elemType
+   elemType, &
+   elemOffset
  integer(pInt),     dimension(:), allocatable :: &
-   microstructures
- integer(pInt),     dimension(3) :: &
-   grid
+   microstructures, &
+   mesh_microGlobal
  integer(pInt),     dimension(1,1) :: &
    dummySet = 0_pInt
  character(len=65536) :: &
@@ -1328,7 +1361,6 @@ subroutine mesh_spectral_build_elements(fileUnit)
  character(len=64), dimension(1) :: &
    dummyName = ''
 
- grid  = mesh_spectral_getGrid(fileUnit)
  homog = mesh_spectral_getHomogenization(fileUnit)
 
 !--------------------------------------------------------------------------------------------------
@@ -1358,7 +1390,8 @@ subroutine mesh_spectral_build_elements(fileUnit)
    maxIntCount = max(maxIntCount, i)
  enddo
  allocate (mesh_element    (4_pInt+mesh_maxNnodes,mesh_NcpElems), source = 0_pInt)
- allocate (microstructures (1_pInt+maxIntCount), source = 1_pInt)
+ allocate (microstructures (1_pInt+maxIntCount),  source = 1_pInt)
+ allocate (mesh_microGlobal(mesh_NcpElemsGlobal), source = 1_pInt)
 
 !--------------------------------------------------------------------------------------------------
 ! read in microstructures
@@ -1367,31 +1400,39 @@ subroutine mesh_spectral_build_elements(fileUnit)
    read(fileUnit,'(a65536)') line
  enddo
  
- elemType = FE_mapElemtype('C3D8R') 
  e = 0_pInt
- do while (e < mesh_NcpElems .and. microstructures(1) > 0_pInt)                                     ! fill expected number of elements, stop at end of data (or blank line!)
+ do while (e < mesh_NcpElemsGlobal .and. microstructures(1) > 0_pInt)                               ! fill expected number of elements, stop at end of data (or blank line!)
    microstructures = IO_continuousIntValues(fileUnit,maxIntCount,dummyName,dummySet,0_pInt)         ! get affected elements
    do i = 1_pInt,microstructures(1_pInt)
      e = e+1_pInt                                                                                   ! valid element entry
-     mesh_element( 1,e) = e                                                                         ! FE id
-     mesh_element( 2,e) = elemType                                                                  ! elem type
-     mesh_element( 3,e) = homog                                                                     ! homogenization
-     mesh_element( 4,e) = microstructures(1_pInt+i)                                                 ! microstructure
-     mesh_element( 5,e) = e + (e-1_pInt)/grid(1) + &
-                                       ((e-1_pInt)/(grid(1)*grid(2)))*(grid(1)+1_pInt)              ! base node
-     mesh_element( 6,e) = mesh_element(5,e) + 1_pInt
-     mesh_element( 7,e) = mesh_element(5,e) + grid(1) + 2_pInt
-     mesh_element( 8,e) = mesh_element(5,e) + grid(1) + 1_pInt
-     mesh_element( 9,e) = mesh_element(5,e) +(grid(1) + 1_pInt) * (grid(2) + 1_pInt)                ! second floor base node
-     mesh_element(10,e) = mesh_element(9,e) + 1_pInt
-     mesh_element(11,e) = mesh_element(9,e) + grid(1) + 2_pInt
-     mesh_element(12,e) = mesh_element(9,e) + grid(1) + 1_pInt
-     mesh_maxValStateVar(1) = max(mesh_maxValStateVar(1),mesh_element(3,e))                         ! needed for statistics
-     mesh_maxValStateVar(2) = max(mesh_maxValStateVar(2),mesh_element(4,e))              
+     mesh_microGlobal(e) = microstructures(1_pInt+i)
    enddo
  enddo
 
+ elemType = FE_mapElemtype('C3D8R') 
+ elemOffset = gridLocal(1)*gridLocal(2)*gridOffset
+ e = 0_pInt
+ do while (e < mesh_NcpElems)                                                                       ! fill expected number of elements, stop at end of data (or blank line!)
+   e = e+1_pInt                                                                                     ! valid element entry
+   mesh_element( 1,e) = e                                                                           ! FE id
+   mesh_element( 2,e) = elemType                                                                    ! elem type
+   mesh_element( 3,e) = homog                                                                       ! homogenization
+   mesh_element( 4,e) = mesh_microGlobal(e+elemOffset)                                              ! microstructure
+   mesh_element( 5,e) = e + (e-1_pInt)/gridLocal(1) + &
+                                     ((e-1_pInt)/(gridLocal(1)*gridLocal(2)))*(gridLocal(1)+1_pInt) ! base node
+   mesh_element( 6,e) = mesh_element(5,e) + 1_pInt
+   mesh_element( 7,e) = mesh_element(5,e) + gridLocal(1) + 2_pInt
+   mesh_element( 8,e) = mesh_element(5,e) + gridLocal(1) + 1_pInt
+   mesh_element( 9,e) = mesh_element(5,e) +(gridLocal(1) + 1_pInt) * (gridLocal(2) + 1_pInt)        ! second floor base node
+   mesh_element(10,e) = mesh_element(9,e) + 1_pInt
+   mesh_element(11,e) = mesh_element(9,e) + gridLocal(1) + 2_pInt
+   mesh_element(12,e) = mesh_element(9,e) + gridLocal(1) + 1_pInt
+   mesh_maxValStateVar(1) = max(mesh_maxValStateVar(1),mesh_element(3,e))                           ! needed for statistics
+   mesh_maxValStateVar(2) = max(mesh_maxValStateVar(2),mesh_element(4,e))              
+ enddo
+
  deallocate(microstructures)
+ deallocate(mesh_microGlobal)
  if (e /= mesh_NcpElems) call IO_error(880_pInt,e)
 
 end subroutine mesh_spectral_build_elements
@@ -1409,39 +1450,35 @@ subroutine mesh_spectral_build_ipNeighborhood(fileUnit)
  integer(pInt) :: &
   x,y,z, &
   e
- integer(pInt),     dimension(3) :: &
-  grid
  allocate(mesh_ipNeighborhood(3,mesh_maxNipNeighbors,mesh_maxNips,mesh_NcpElems),source=0_pInt)
  
- grid = mesh_spectral_getGrid(fileUnit)
-
  e = 0_pInt
- do z = 0_pInt,grid(3)-1_pInt
-   do y = 0_pInt,grid(2)-1_pInt
-     do x = 0_pInt,grid(1)-1_pInt
+ do z = 0_pInt,gridLocal(3)-1_pInt
+   do y = 0_pInt,gridLocal(2)-1_pInt
+     do x = 0_pInt,gridLocal(1)-1_pInt
        e = e + 1_pInt
-         mesh_ipNeighborhood(1,1,1,e) = z * grid(1) * grid(2) &
-                                      + y * grid(1) &
-                                      + modulo(x+1_pInt,grid(1)) &
+         mesh_ipNeighborhood(1,1,1,e) = z * gridLocal(1) * gridLocal(2) &
+                                      + y * gridLocal(1) &
+                                      + modulo(x+1_pInt,gridLocal(1)) &
                                       + 1_pInt
-         mesh_ipNeighborhood(1,2,1,e) = z * grid(1) * grid(2) &
-                                      + y * grid(1) &
-                                      + modulo(x-1_pInt,grid(1)) &
+         mesh_ipNeighborhood(1,2,1,e) = z * gridLocal(1) * gridLocal(2) &
+                                      + y * gridLocal(1) &
+                                      + modulo(x-1_pInt,gridLocal(1)) &
                                       + 1_pInt
-         mesh_ipNeighborhood(1,3,1,e) = z * grid(1) * grid(2) &
-                                      + modulo(y+1_pInt,grid(2)) * grid(1) &
+         mesh_ipNeighborhood(1,3,1,e) = z * gridLocal(1) * gridLocal(2) &
+                                      + modulo(y+1_pInt,gridLocal(2)) * gridLocal(1) &
                                       + x &
                                       + 1_pInt
-         mesh_ipNeighborhood(1,4,1,e) = z * grid(1) * grid(2) &
-                                      + modulo(y-1_pInt,grid(2)) * grid(1) &
+         mesh_ipNeighborhood(1,4,1,e) = z * gridLocal(1) * gridLocal(2) &
+                                      + modulo(y-1_pInt,gridLocal(2)) * gridLocal(1) &
                                       + x &
                                       + 1_pInt
-         mesh_ipNeighborhood(1,5,1,e) = modulo(z+1_pInt,grid(3)) * grid(1) * grid(2) &
-                                      + y * grid(1) &
+         mesh_ipNeighborhood(1,5,1,e) = modulo(z+1_pInt,gridLocal(3)) * gridLocal(1) * gridLocal(2) &
+                                      + y * gridLocal(1) &
                                       + x &
                                       + 1_pInt
-         mesh_ipNeighborhood(1,6,1,e) = modulo(z-1_pInt,grid(3)) * grid(1) * grid(2) &
-                                      + y * grid(1) &
+         mesh_ipNeighborhood(1,6,1,e) = modulo(z-1_pInt,gridLocal(3)) * gridLocal(1) * gridLocal(2) &
+                                      + y * gridLocal(1) &
                                       + x &
                                       + 1_pInt
          mesh_ipNeighborhood(2,1:6,1,e) = 1_pInt
@@ -1484,8 +1521,8 @@ function mesh_regrid(adaptive,resNewInput,minRes)
    math_mul33x3
 
  implicit none
- logical, intent(in)                                    :: adaptive                                  ! if true, choose adaptive grid based on resNewInput, otherwise keep it constant
- integer(pInt), dimension(3), optional, intent(in)      :: resNewInput                               ! f2py cannot handle optional arguments correctly (they are always present)
+ logical, intent(in)                                    :: adaptive                                 ! if true, choose adaptive grid based on resNewInput, otherwise keep it constant
+ integer(pInt), dimension(3), optional, intent(in)      :: resNewInput                              ! f2py cannot handle optional arguments correctly (they are always present)
  integer(pInt), dimension(3), optional, intent(in)      :: minRes
  integer(pInt), dimension(3)                            :: mesh_regrid, ratio, grid
  integer(pInt),                         parameter       :: FILEUNIT = 777_pInt
@@ -1576,7 +1613,7 @@ function mesh_regrid(adaptive,resNewInput,minRes)
      if (minRes(1) > 0_pInt .or. minRes(2) > 0_pInt) then
         if (minRes(3) /= 1_pInt .or. &
            mod(minRes(1),2_pInt) /= 0_pInt .or. &
-           mod(minRes(2),2_pInt) /= 0_pInt)  call IO_error(890_pInt, ext_msg = '2D minRes')        ! as f2py has problems with present, use pyf file for initialization to -1
+           mod(minRes(2),2_pInt) /= 0_pInt)  call IO_error(890_pInt, ext_msg = '2D minRes')         ! as f2py has problems with present, use pyf file for initialization to -1
    endif; endif
  else
    spatialDim = 3_pInt
@@ -1584,7 +1621,7 @@ function mesh_regrid(adaptive,resNewInput,minRes)
      if (any(minRes > 0_pInt)) then
         if (mod(minRes(1),2_pInt) /= 0_pInt .or. &
             mod(minRes(2),2_pInt) /= 0_pInt .or. &
-            mod(minRes(3),2_pInt) /= 0_pInt)  call IO_error(890_pInt, ext_msg = '3D minRes')       ! as f2py has problems with present, use pyf file for initialization to -1
+            mod(minRes(3),2_pInt) /= 0_pInt)  call IO_error(890_pInt, ext_msg = '3D minRes')        ! as f2py has problems with present, use pyf file for initialization to -1
    endif; endif
  endif
  
@@ -1601,12 +1638,12 @@ function mesh_regrid(adaptive,resNewInput,minRes)
      else
        possibleResNew(i,1:2) = [ratio(i)-1_pInt, ratio(i) + 1_pInt]
      endif
-     if (.not.present(minRes)) then                                                                ! calling from fortran, optional argument not given
+     if (.not.present(minRes)) then                                                                 ! calling from fortran, optional argument not given
        possibleResNew = possibleResNew
-     else                                                                                          ! optional argument is there
+     else                                                                                           ! optional argument is there
        if (any(minRes<1_pInt)) then
-         possibleResNew = possibleResNew                                                           ! f2py calling, but without specification (or choosing invalid values), standard from pyf = -1
-       else                                                                                        ! given useful values
+         possibleResNew = possibleResNew                                                            ! f2py calling, but without specification (or choosing invalid values), standard from pyf = -1
+       else                                                                                         ! given useful values
          forall(k = 1_pInt:3_pInt, j = 1_pInt:3_pInt) &
            possibleResNew(j,k) = max(possibleResNew(j,k), minRes(j))
        endif
@@ -1656,7 +1693,7 @@ function mesh_regrid(adaptive,resNewInput,minRes)
  N_Digits = adjustl(N_Digits)
  formatString = '(I'//trim(N_Digits)//'.'//trim(N_Digits)//',a)'
 
- call IO_write_jobFile(FILEUNIT,'IDX')                                                                   ! make it a general open-write file
+ call IO_write_jobFile(FILEUNIT,'IDX')                                                              ! make it a general open-write file
  write(FILEUNIT, '(A)') '1 header'
  write(FILEUNIT, '(A)') 'Numbered indices as per the large set'
  do i = 1_pInt, NpointsNew
@@ -1675,7 +1712,7 @@ function mesh_regrid(adaptive,resNewInput,minRes)
  N_Digits = adjustl(N_Digits)
  formatString = '(I'//trim(N_Digits)//'.'//trim(N_Digits)//',a)'
 
- call IO_write_jobFile(FILEUNIT,'idx')                                                                   ! make it a general open-write file
+ call IO_write_jobFile(FILEUNIT,'idx')                                                              ! make it a general open-write file
  write(FILEUNIT, '(A)') '1 header'
  write(FILEUNIT, '(A)') 'Numbered indices as per the small set'
  do i = 1_pInt, NpointsNew
@@ -1841,7 +1878,7 @@ function mesh_regrid(adaptive,resNewInput,minRes)
  deallocate(Tstar)
  deallocate(TstarNew)
  
-! for the state, we first have to know the size------------------------------------------------------------------ 
+! for the state, we first have to know the size----------------------------------------------------
  allocate(sizeStateConst(1,1,mesh_NcpElems))
  call IO_read_intFile(FILEUNIT,'sizeStateConst',trim(getSolverJobName()),size(sizeStateConst))
  read (FILEUNIT,rec=1) sizeStateConst
@@ -2641,7 +2678,7 @@ subroutine mesh_marc_map_elementSets(fileUnit)
       elemSet = elemSet+1_pInt
       mesh_nameElemSet(elemSet) = trim(IO_stringValue(line,myPos,4_pInt))
       mesh_mapElemSet(:,elemSet) = &
-            IO_continuousIntValues(fileUnit,mesh_maxNelemInSet,mesh_nameElemSet,mesh_mapElemSet,mesh_NelemSets)
+        IO_continuousIntValues(fileUnit,mesh_maxNelemInSet,mesh_nameElemSet,mesh_mapElemSet,mesh_NelemSets)
    endif
  enddo
  
@@ -2679,7 +2716,7 @@ subroutine mesh_marc_count_cpElements(fileUnit)
        do i=1_pInt,3_pInt+hypoelasticTableStyle  ! Skip 3 or 4 lines
          read (fileUnit,610,END=620) line
        enddo
-       mesh_NcpElems = mesh_NcpElems + IO_countContinuousIntValues(fileUnit)  ! why not simply mesh_NcpElems = IO_countContinuousIntValues(fileUnit)?
+       mesh_NcpElems = mesh_NcpElems + IO_countContinuousIntValues(fileUnit)                        ! why not simply mesh_NcpElems = IO_countContinuousIntValues(fileUnit)?
      exit
    endif
  enddo
@@ -2769,7 +2806,7 @@ subroutine mesh_marc_map_nodes(fileUnit)
    read (fileUnit,610,END=650) line
    myPos = IO_stringPos(line,maxNchunks)
    if( IO_lc(IO_stringValue(line,myPos,1_pInt)) == 'coordinates' ) then
-     read (fileUnit,610,END=650) line                                                                 ! skip crap line
+     read (fileUnit,610,END=650) line                                                               ! skip crap line
      do i = 1_pInt,mesh_Nnodes
        read (fileUnit,610,END=650) line
        mesh_mapFEtoCPnode(1_pInt,i) = IO_fixedIntValue (line,[ 0_pInt,10_pInt],1_pInt)
@@ -2816,7 +2853,7 @@ subroutine mesh_marc_build_nodes(fileUnit)
    read (fileUnit,610,END=670) line
    myPos = IO_stringPos(line,maxNchunks)
    if( IO_lc(IO_stringValue(line,myPos,1_pInt)) == 'coordinates' ) then
-     read (fileUnit,610,END=670) line                                                                 ! skip crap line
+     read (fileUnit,610,END=670) line                                                               ! skip crap line
      do i=1_pInt,mesh_Nnodes
        read (fileUnit,610,END=670) line
        m = mesh_FEasCP('node',IO_fixedIntValue(line,node_ends,1_pInt))
@@ -2865,10 +2902,10 @@ subroutine mesh_marc_count_cpSizes(fileUnit)
    read (fileUnit,610,END=630) line
    myPos = IO_stringPos(line,maxNchunks)
    if( IO_lc(IO_stringValue(line,myPos,1_pInt)) == 'connectivity' ) then
-     read (fileUnit,610,END=630) line                                                                 ! Garbage line
-     do i=1_pInt,mesh_Nelems                                                                          ! read all elements
+     read (fileUnit,610,END=630) line                                                               ! Garbage line
+     do i=1_pInt,mesh_Nelems                                                                        ! read all elements
        read (fileUnit,610,END=630) line
-       myPos = IO_stringPos(line,maxNchunks)                                                          ! limit to id and type
+       myPos = IO_stringPos(line,maxNchunks)                                                        ! limit to id and type
        e = mesh_FEasCP('elem',IO_intValue(line,myPos,1_pInt))
        if (e /= 0_pInt) then
          t = FE_mapElemtype(IO_stringValue(line,myPos,2_pInt))
@@ -2878,7 +2915,7 @@ subroutine mesh_marc_count_cpSizes(fileUnit)
          mesh_maxNips =         max(mesh_maxNips,FE_Nips(g))
          mesh_maxNipNeighbors = max(mesh_maxNipNeighbors,FE_NipNeighbors(c))
          mesh_maxNcellnodes =   max(mesh_maxNcellnodes,FE_Ncellnodes(g))
-         call IO_skipChunks(fileUnit,FE_Nnodes(t)-(myPos(1_pInt)-2_pInt))                             ! read on if FE_Nnodes exceeds node count present on current line
+         call IO_skipChunks(fileUnit,FE_Nnodes(t)-(myPos(1_pInt)-2_pInt))                           ! read on if FE_Nnodes exceeds node count present on current line
        endif
      enddo
      exit
@@ -2905,7 +2942,7 @@ subroutine mesh_marc_build_elements(fileUnit)
  implicit none
  integer(pInt), intent(in) :: fileUnit
 
- integer(pInt), parameter :: maxNchunks = 66_pInt                                                  ! limit to 64 nodes max (plus ID, type)
+ integer(pInt), parameter :: maxNchunks = 66_pInt                                                   ! limit to 64 nodes max (plus ID, type)
  integer(pInt), dimension (1_pInt+2_pInt*maxNchunks) :: myPos
  character(len=300) line
 
@@ -2921,26 +2958,26 @@ subroutine mesh_marc_build_elements(fileUnit)
    read (fileUnit,610,END=620) line
    myPos(1:1+2*1) = IO_stringPos(line,1_pInt)
    if( IO_lc(IO_stringValue(line,myPos,1_pInt)) == 'connectivity' ) then
-     read (fileUnit,610,END=620) line                                                                 ! garbage line
+     read (fileUnit,610,END=620) line                                                               ! garbage line
      do i = 1_pInt,mesh_Nelems
        read (fileUnit,610,END=620) line
        myPos = IO_stringPos(line,maxNchunks)
        e = mesh_FEasCP('elem',IO_intValue(line,myPos,1_pInt))
-       if (e /= 0_pInt) then                                                                          ! disregard non CP elems
-         mesh_element(1,e) = IO_IntValue (line,myPos,1_pInt)                                          ! FE id
-         t = FE_mapElemtype(IO_StringValue(line,myPos,2_pInt))                                        ! elem type
+       if (e /= 0_pInt) then                                                                        ! disregard non CP elems
+         mesh_element(1,e) = IO_IntValue (line,myPos,1_pInt)                                        ! FE id
+         t = FE_mapElemtype(IO_StringValue(line,myPos,2_pInt))                                      ! elem type
          mesh_element(2,e) = t
          nNodesAlreadyRead = 0_pInt
          do j = 1_pInt,myPos(1)-2_pInt
-           mesh_element(4_pInt+j,e) = mesh_FEasCP('node',IO_IntValue(line,myPos,j+2_pInt))            ! CP ids of nodes
+           mesh_element(4_pInt+j,e) = mesh_FEasCP('node',IO_IntValue(line,myPos,j+2_pInt))          ! CP ids of nodes
          enddo  
          nNodesAlreadyRead = myPos(1) - 2_pInt
-         do while(nNodesAlreadyRead < FE_Nnodes(t))                                                   ! read on if not all nodes in one line
+         do while(nNodesAlreadyRead < FE_Nnodes(t))                                                 ! read on if not all nodes in one line
            read (fileUnit,610,END=620) line
            myPos = IO_stringPos(line,maxNchunks)
            do j = 1_pInt,myPos(1)
              mesh_element(4_pInt+nNodesAlreadyRead+j,e) &
-               = mesh_FEasCP('node',IO_IntValue(line,myPos,j))                                        ! CP ids of nodes
+               = mesh_FEasCP('node',IO_IntValue(line,myPos,j))                                      ! CP ids of nodes
            enddo
            nNodesAlreadyRead = nNodesAlreadyRead + myPos(1)
          enddo
@@ -2950,33 +2987,33 @@ subroutine mesh_marc_build_elements(fileUnit)
    endif
  enddo
  
-620 rewind(fileUnit)                                                                                  ! just in case "initial state" appears before "connectivity"
+620 rewind(fileUnit)                                                                                ! just in case "initial state" appears before "connectivity"
  read (fileUnit,610,END=620) line
  do
    myPos(1:1+2*2) = IO_stringPos(line,2_pInt)
    if( (IO_lc(IO_stringValue(line,myPos,1_pInt)) == 'initial') .and. &
        (IO_lc(IO_stringValue(line,myPos,2_pInt)) == 'state') ) then
-     if (initialcondTableStyle == 2_pInt) read (fileUnit,610,END=620) line                            ! read extra line for new style
-     read (fileUnit,610,END=630) line                                                                 ! read line with index of state var
+     if (initialcondTableStyle == 2_pInt) read (fileUnit,610,END=620) line                          ! read extra line for new style
+     read (fileUnit,610,END=630) line                                                               ! read line with index of state var
      myPos(1:1+2*1) = IO_stringPos(line,1_pInt)
-     sv = IO_IntValue(line,myPos,1_pInt)                                                              ! figure state variable index
-     if( (sv == 2_pInt).or.(sv == 3_pInt) ) then                                                      ! only state vars 2 and 3 of interest
-       read (fileUnit,610,END=620) line                                                               ! read line with value of state var
+     sv = IO_IntValue(line,myPos,1_pInt)                                                            ! figure state variable index
+     if( (sv == 2_pInt).or.(sv == 3_pInt) ) then                                                    ! only state vars 2 and 3 of interest
+       read (fileUnit,610,END=620) line                                                             ! read line with value of state var
        myPos(1:1+2*1) = IO_stringPos(line,1_pInt)
-       do while (scan(IO_stringValue(line,myPos,1_pInt),'+-',back=.true.)>1)                          ! is noEfloat value?
-         myVal = nint(IO_fixedNoEFloatValue(line,[0_pInt,20_pInt],1_pInt),pInt)                       ! state var's value
-         mesh_maxValStateVar(sv-1_pInt) = max(myVal,mesh_maxValStateVar(sv-1_pInt))                   ! remember max val of homogenization and microstructure index
+       do while (scan(IO_stringValue(line,myPos,1_pInt),'+-',back=.true.)>1)                        ! is noEfloat value?
+         myVal = nint(IO_fixedNoEFloatValue(line,[0_pInt,20_pInt],1_pInt),pInt)                     ! state var's value
+         mesh_maxValStateVar(sv-1_pInt) = max(myVal,mesh_maxValStateVar(sv-1_pInt))                 ! remember max val of homogenization and microstructure index
          if (initialcondTableStyle == 2_pInt) then
-           read (fileUnit,610,END=630) line                                                           ! read extra line
-           read (fileUnit,610,END=630) line                                                           ! read extra line
+           read (fileUnit,610,END=630) line                                                         ! read extra line
+           read (fileUnit,610,END=630) line                                                         ! read extra line
          endif
-         contInts = IO_continuousIntValues&                                                           ! get affected elements
+         contInts = IO_continuousIntValues&                                                         ! get affected elements
                    (fileUnit,mesh_NcpElems,mesh_nameElemSet,mesh_mapElemSet,mesh_NelemSets)
          do i = 1_pInt,contInts(1)
            e = mesh_FEasCP('elem',contInts(1_pInt+i))
            mesh_element(1_pInt+sv,e) = myVal
          enddo
-         if (initialcondTableStyle == 0_pInt) read (fileUnit,610,END=620) line                        ! ignore IP range for old table style
+         if (initialcondTableStyle == 0_pInt) read (fileUnit,610,END=620) line                      ! ignore IP range for old table style
          read (fileUnit,610,END=630) line
          myPos(1:1+2*1) = IO_stringPos(line,1_pInt)
        enddo
@@ -3368,7 +3405,7 @@ subroutine mesh_abaqus_map_elements(fileUnit)
    endselect
  enddo
 
-660 call math_qsort(mesh_mapFEtoCPelem,1_pInt,int(size(mesh_mapFEtoCPelem,2_pInt),pInt))                  ! should be mesh_NcpElems
+660 call math_qsort(mesh_mapFEtoCPelem,1_pInt,int(size(mesh_mapFEtoCPelem,2_pInt),pInt))            ! should be mesh_NcpElems
 
  if (int(size(mesh_mapFEtoCPelem),pInt) < 2_pInt) call IO_error(error_ID=907_pInt)
 
@@ -3827,10 +3864,10 @@ subroutine mesh_build_nodeTwins
                maximumNode, &
                n1, &
                n2
- integer(pInt), dimension(mesh_Nnodes+1) :: minimumNodes, maximumNodes ! list of surface nodes (minimum and maximum coordinate value) with first entry giving the number of nodes
- real(pReal)   minCoord, maxCoord, &     ! extreme positions in one dimension
-               tolerance                 ! tolerance below which positions are assumed identical
- real(pReal), dimension(3) ::  distance  ! distance between two nodes in all three coordinates
+ integer(pInt), dimension(mesh_Nnodes+1) :: minimumNodes, maximumNodes                              ! list of surface nodes (minimum and maximum coordinate value) with first entry giving the number of nodes
+ real(pReal)   minCoord, maxCoord, &                                                                ! extreme positions in one dimension
+               tolerance                                                                            ! tolerance below which positions are assumed identical
+ real(pReal), dimension(3) ::  distance                                                             ! distance between two nodes in all three coordinates
  logical, dimension(mesh_Nnodes) :: unpaired
  
  allocate(mesh_nodeTwins(3,mesh_Nnodes))
@@ -3838,8 +3875,8 @@ subroutine mesh_build_nodeTwins
  
  tolerance = 0.001_pReal * minval(mesh_ipVolume) ** 0.333_pReal
  
- do dir = 1_pInt,3_pInt                                    ! check periodicity in directions of x,y,z
-   if (mesh_periodicSurface(dir)) then                     ! only if periodicity is requested
+ do dir = 1_pInt,3_pInt                                                                             ! check periodicity in directions of x,y,z
+   if (mesh_periodicSurface(dir)) then                                                              ! only if periodicity is requested
  
      
      !*** find out which nodes sit on the surface 
@@ -3849,7 +3886,7 @@ subroutine mesh_build_nodeTwins
      maximumNodes = 0_pInt
      minCoord = minval(mesh_node0(dir,:))
      maxCoord = maxval(mesh_node0(dir,:))
-     do node = 1_pInt,mesh_Nnodes                     ! loop through all nodes and find surface nodes
+     do node = 1_pInt,mesh_Nnodes                                                                   ! loop through all nodes and find surface nodes
        if (abs(mesh_node0(dir,node) - minCoord) <= tolerance) then
          minimumNodes(1) = minimumNodes(1) + 1_pInt
          minimumNodes(minimumNodes(1)+1_pInt) = node
@@ -3869,10 +3906,10 @@ subroutine mesh_build_nodeTwins
          do n2 = 1_pInt,maximumNodes(1)
            maximumNode = maximumNodes(n2+1_pInt)
            distance = abs(mesh_node0(:,minimumNode) - mesh_node0(:,maximumNode))
-           if (sum(distance) - distance(dir) <= tolerance) then        ! minimum possible distance (within tolerance)
+           if (sum(distance) - distance(dir) <= tolerance) then                                     ! minimum possible distance (within tolerance)
              mesh_nodeTwins(dir,minimumNode) = maximumNode
              mesh_nodeTwins(dir,maximumNode) = minimumNode
-             unpaired(maximumNode) = .false.                           ! remember this node, we don't have to look for his partner again
+             unpaired(maximumNode) = .false.                                                        ! remember this node, we don't have to look for his partner again
              exit
            endif
          enddo
@@ -4146,15 +4183,15 @@ subroutine mesh_tell_statistics
  
  myDebug = debug_level(debug_mesh)
 
- if (mesh_maxValStateVar(1) < 1_pInt) call IO_error(error_ID=170_pInt) ! no homogenization specified
- if (mesh_maxValStateVar(2) < 1_pInt) call IO_error(error_ID=180_pInt) ! no microstructure specified
+ if (mesh_maxValStateVar(1) < 1_pInt) call IO_error(error_ID=170_pInt)                              ! no homogenization specified
+ if (mesh_maxValStateVar(2) < 1_pInt) call IO_error(error_ID=180_pInt)                              ! no microstructure specified
  
  allocate (mesh_HomogMicro(mesh_maxValStateVar(1),mesh_maxValStateVar(2))); mesh_HomogMicro = 0_pInt
 do e = 1_pInt,mesh_NcpElems
-  if (mesh_element(3,e) < 1_pInt) call IO_error(error_ID=170_pInt,el=e) ! no homogenization specified
-  if (mesh_element(4,e) < 1_pInt) call IO_error(error_ID=180_pInt,el=e) ! no microstructure specified
+  if (mesh_element(3,e) < 1_pInt) call IO_error(error_ID=170_pInt,el=e)                             ! no homogenization specified
+  if (mesh_element(4,e) < 1_pInt) call IO_error(error_ID=180_pInt,el=e)                             ! no microstructure specified
   mesh_HomogMicro(mesh_element(3,e),mesh_element(4,e)) = &
-  mesh_HomogMicro(mesh_element(3,e),mesh_element(4,e)) + 1_pInt ! count combinations of homogenization and microstructure
+  mesh_HomogMicro(mesh_element(3,e),mesh_element(4,e)) + 1_pInt                                     ! count combinations of homogenization and microstructure
 enddo
 !$OMP CRITICAL (write2out)
   if (iand(myDebug,debug_levelBasic) /= 0_pInt) then
@@ -4173,8 +4210,8 @@ enddo
     write (myFmt,'(a,i32.32,a)') '(9x,a2,1x,',mesh_maxValStateVar(2),'(i8))'
     write(6,myFmt) '+-',math_range(mesh_maxValStateVar(2))
     write (myFmt,'(a,i32.32,a)') '(i8,1x,a2,1x,',mesh_maxValStateVar(2),'(i8))'
-    do i=1_pInt,mesh_maxValStateVar(1)      ! loop over all (possibly assigned) homogenizations
-      write(6,myFmt) i,'| ',mesh_HomogMicro(i,:) ! loop over all (possibly assigned) microstructures
+    do i=1_pInt,mesh_maxValStateVar(1)                                                              ! loop over all (possibly assigned) homogenizations
+      write(6,myFmt) i,'| ',mesh_HomogMicro(i,:)                                                    ! loop over all (possibly assigned) microstructures
     enddo
     write(6,'(/,a,/)') ' Input Parser: ADDITIONAL MPIE OPTIONS'
     write(6,*) 'periodic surface : ', mesh_periodicSurface