function index1d(ix, iy, iz)
  USE chunkModule
  IMPLICIT NONE
  integer ix, iy, iz, index1d
  index1d = ny*nz*ix + nz*iy + iz
end function

subroutine index3d(index, ix, iy, iz)
  USE chunkModule
  IMPLICIT NONE
  integer ix, iy, iz, index
  ix = index/(ny*nz)
  iy = mod(index,(ny*nz))/nz
  iz = mod(index,nz)
end subroutine


recursive SUBROUTINE ampimain
  USE mpi
  USE chunkModule
  IMPLICIT NONE
    interface
      function AMPI_WTIME()
        DOUBLE PRECISION :: AMPI_WTIME
      end function AMPI_WTIME
      INTEGER FUNCTION index1d(ix,iy,iz)
        INTEGER ix, iy, iz
      END FUNCTION
      SUBROUTINE index3d(index, ix, iy, iz)
        INTEGER index, ix, iy, iz
      END SUBROUTINE
    end interface

  INTEGER :: i,j,k
  INTEGER :: iter, niter
  INTEGER, DIMENSION(AMPI_STATUS_SIZE) :: status
  DOUBLE PRECISION :: error, tval, maxerr, starttime, endtime
  TYPE(chunk_type), POINTER :: chunk

  INTEGER :: thisIndex, ierr, nblocks

  CALL AMPI_Init(ierr)
  CALL AMPI_Comm_rank(AMPI_COMM_WORLD, thisIndex, ierr)
  CALL AMPI_Comm_size(AMPI_COMM_WORLD, nblocks, ierr)

  ALLOCATE(chunk)

  if(thisIndex .eq. 0) then
    niter = 30                ! some dummy proc 0 only initialization
  end if

  call AMPI_Bcast(niter, 1, AMPI_INTEGER, 0, AMPI_COMM_WORLD, ierr)

  call index3d(thisIndex, chunk%xidx, chunk%yidx, chunk%zidx)
  chunk%xp = index1d(mod(chunk%xidx+1,nx),chunk%yidx,chunk%zidx)
  chunk%xm = index1d(mod(chunk%xidx+nx-1,nx),chunk%yidx,chunk%zidx)
  chunk%yp = index1d(chunk%xidx,mod(chunk%yidx+1,ny),chunk%zidx)
  chunk%ym = index1d(chunk%xidx,mod(chunk%yidx+ny-1,ny),chunk%zidx)
  chunk%zp = index1d(chunk%xidx,chunk%yidx,mod(chunk%zidx+1,nz))
  chunk%zm = index1d(chunk%xidx,chunk%yidx,mod(chunk%zidx+nz-1,nz))
  DO i = 2, 11
    DO j = 2, 11
      DO k = 2, 11
        chunk%t(k, j, i) = 100*(i-2) + 10*(j-2) + (k-2)
      ENDDO
    ENDDO
  ENDDO

  call AMPI_Barrier(AMPI_COMM_WORLD, ierr)
  if(thisIndex .eq. 0) then
    starttime = AMPI_Wtime()
  end if

  maxerr = 0.0
  DO iter = 0,niter-1 
    !maxerr = 0.0

    chunk%sbxm = reshape(chunk%t(2,2:11,2:11), (/ 100 /))
    chunk%sbxp = reshape(chunk%t(11,2:11,2:11), (/ 100 /))
    chunk%sbym = reshape(chunk%t(2:11,2,2:11), (/ 100 /))
    chunk%sbyp = reshape(chunk%t(2:11,11,2:11), (/ 100 /))
    chunk%sbzm = reshape(chunk%t(2:11,2:11,2), (/ 100 /))
    chunk%sbzp = reshape(chunk%t(2:11,2:11,11), (/ 100 /))
    call AMPI_Send(chunk%sbxm, 100, AMPI_DOUBLE_PRECISION, chunk%xm, 0, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbxp, 100, AMPI_DOUBLE_PRECISION, chunk%xp, 1, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbym, 100, AMPI_DOUBLE_PRECISION, chunk%ym, 2, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbyp, 100, AMPI_DOUBLE_PRECISION, chunk%yp, 3, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbzm, 100, AMPI_DOUBLE_PRECISION, chunk%zm, 4, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbzp, 100, AMPI_DOUBLE_PRECISION, chunk%zp, 5, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Recv(chunk%rbxm, 100, AMPI_DOUBLE_PRECISION, chunk%xm, &
&                 1, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbxp, 100, AMPI_DOUBLE_PRECISION, chunk%xp, &
&                 0, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbym, 100, AMPI_DOUBLE_PRECISION, chunk%ym, &
&                 3, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbyp, 100, AMPI_DOUBLE_PRECISION, chunk%yp, &
&                 2, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbzm, 100, AMPI_DOUBLE_PRECISION, chunk%zm, &
&                 5, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbzp, 100, AMPI_DOUBLE_PRECISION, chunk%zp, &
&                 4, AMPI_COMM_WORLD, status, ierr)

    chunk%t(1,2:11,2:11) = reshape(chunk%rbxm,(/10,10/))
    chunk%t(12,2:11,2:11) = reshape(chunk%rbxp,(/10,10/))
    chunk%t(2:11,1,2:11) = reshape(chunk%rbym,(/10,10/))
    chunk%t(2:11,12,2:11) = reshape(chunk%rbyp,(/10,10/))
    chunk%t(2:11,2:11,1) = reshape(chunk%rbzm,(/10,10/))
    chunk%t(2:11,2:11,12) = reshape(chunk%rbzp,(/10,10/))
    DO i = 2, 11
      DO j = 2, 11
        DO k = 2, 11
          tval = ( chunk%t(k,j,i)+chunk%t(k,j,i+1)+&
&                 chunk%t(k,j,i-1)+chunk%t(k,j+1,i)+&
&                 chunk%t(k,j-1,i)+chunk%t(k+1,j,i)+chunk%t(k-1,j,i))/7.0
          !error = abs(tval-chunk%t(k,j,i))
          chunk%t(k,j,i) = tval
          !if(error > maxerr) maxerr = error
        END DO
      END DO
    ENDDO

    !call AMPI_AllReduce(maxerr, maxerr, 1, AMPI_DOUBLE_PRECISION, AMPI_MAX, &
!&                      AMPI_COMM_WORLD, ierr)
    if (thisIndex .eq. 0) then
      write(*,*) ' iter ', iter, ' time: ', AMPI_Wtime()
    endif
  END DO
  if(thisIndex .eq. 0) then
    endtime = AMPI_Wtime()
    write(*,*) 'Time per iteration = ', (endtime-starttime)/niter
  end if
  CALL AMPI_Finalize(ierr)
END SUBROUTINE
