function index1d(ix, iy, iz)
  USE chunkModule
  IMPLICIT NONE
  integer ix, iy, iz, index1d
  index1d = ny*nz*ix + nz*iy + iz
end function

subroutine index3d(index, ix, iy, iz)
  USE chunkModule
  IMPLICIT NONE
  integer ix, iy, iz, index
  ix = index/(ny*nz)
  iy = mod(index,(ny*nz))/nz
  iz = mod(index,nz)
end subroutine


subroutine jacpup(p, cp)
  use pupmod
  use chunkModule
  implicit none
  integer :: p
  type(chunkdata) :: cp
  type(chunk_type), pointer :: c

  if(pup_isupk(p)) allocate(cp%chunk)
  c => cp%chunk

  call pup(p, c%t)
  call pup(p, c%xidx)
  call pup(p, c%yidx)
  call pup(p, c%zidx)
  call pup(p, c%xm)
  call pup(p, c%xp)
  call pup(p, c%ym)
  call pup(p, c%yp)
  call pup(p, c%zm)
  call pup(p, c%zp)
  call pup(p, c%sbxm)
  call pup(p, c%sbxp)
  call pup(p, c%sbym)
  call pup(p, c%sbyp)
  call pup(p, c%sbzm)
  call pup(p, c%sbzp)
  call pup(p, c%rbxm)
  call pup(p, c%rbxp)
  call pup(p, c%rbym)
  call pup(p, c%rbyp)
  call pup(p, c%rbzm)
  call pup(p, c%rbzp)

  if(pup_ispk(p)) deallocate(cp%chunk)
end subroutine

recursive SUBROUTINE AMPI_Main
  USE ampi
  USE chunkModule
  IMPLICIT NONE
    interface
      INTEGER FUNCTION index1d(ix,iy,iz)
        INTEGER ix, iy, iz
      END FUNCTION
      SUBROUTINE index3d(index, ix, iy, iz)
        INTEGER index, ix, iy, iz
      END SUBROUTINE
    end interface

  external jacpup
  INTEGER :: i,j,k,m,cidx
  INTEGER :: iter, niter
  INTEGER, DIMENSION(AMPI_STATUS_SIZE) :: status
  DOUBLE PRECISION :: error, tval, maxerr, starttime, endtime, itertime
  TYPE(chunkdata) :: cp
  TYPE(chunk_type), pointer :: chunk

  INTEGER :: thisIndex, ierr, nblocks

  CALL AMPI_Init(ierr)
  CALL AMPI_Comm_rank(AMPI_COMM_WORLD, thisIndex, ierr)
  CALL AMPI_Comm_size(AMPI_COMM_WORLD, nblocks, ierr)

  cidx = AMPI_Register(cp, jacpup)

  allocate(cp%chunk)
  chunk => cp%chunk

  if(thisIndex .eq. 0) then
    niter = 140                ! some dummy proc 0 only initialization
  end if

  call AMPI_Bcast(niter, 1, AMPI_INTEGER, 0, AMPI_COMM_WORLD, ierr)

  call index3d(thisIndex, chunk%xidx, chunk%yidx, chunk%zidx)
  chunk%xp = index1d(mod(chunk%xidx+1,nx),chunk%yidx,chunk%zidx)
  chunk%xm = index1d(mod(chunk%xidx+nx-1,nx),chunk%yidx,chunk%zidx)
  chunk%yp = index1d(chunk%xidx,mod(chunk%yidx+1,ny),chunk%zidx)
  chunk%ym = index1d(chunk%xidx,mod(chunk%yidx+ny-1,ny),chunk%zidx)
  chunk%zp = index1d(chunk%xidx,chunk%yidx,mod(chunk%zidx+1,nz))
  chunk%zm = index1d(chunk%xidx,chunk%yidx,mod(chunk%zidx+nz-1,nz))
  DO i = 2, 21
    DO j = 2, 21
      DO k = 2, 21
        chunk%t(k, j, i) = 400*(i-2) + 20*(j-2) + (k-2)
      ENDDO
    ENDDO
  ENDDO

  call AMPI_Barrier(AMPI_COMM_WORLD, ierr)
  starttime = AMPI_Wtime()

  maxerr = 0.0
  DO iter = 1,niter 
    maxerr = 0.0

    chunk%sbxm = reshape(chunk%t(2,2:21,2:21), (/ 400 /))
    chunk%sbxp = reshape(chunk%t(21,2:21,2:21), (/ 400 /))
    chunk%sbym = reshape(chunk%t(2:21,2,2:21), (/ 400 /))
    chunk%sbyp = reshape(chunk%t(2:21,21,2:21), (/ 400 /))
    chunk%sbzm = reshape(chunk%t(2:21,2:21,2), (/ 400 /))
    chunk%sbzp = reshape(chunk%t(2:21,2:21,21), (/ 400 /))
    call AMPI_Send(chunk%sbxm, 400, AMPI_DOUBLE_PRECISION, chunk%xm, 0, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbxp, 400, AMPI_DOUBLE_PRECISION, chunk%xp, 1, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbym, 400, AMPI_DOUBLE_PRECISION, chunk%ym, 2, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbyp, 400, AMPI_DOUBLE_PRECISION, chunk%yp, 3, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbzm, 400, AMPI_DOUBLE_PRECISION, chunk%zm, 4, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Send(chunk%sbzp, 400, AMPI_DOUBLE_PRECISION, chunk%zp, 5, &
&                 AMPI_COMM_WORLD, ierr)
    call AMPI_Recv(chunk%rbxm, 400, AMPI_DOUBLE_PRECISION, chunk%xm, &
&                 1, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbxp, 400, AMPI_DOUBLE_PRECISION, chunk%xp, &
&                 0, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbym, 400, AMPI_DOUBLE_PRECISION, chunk%ym, &
&                 3, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbyp, 400, AMPI_DOUBLE_PRECISION, chunk%yp, &
&                 2, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbzm, 400, AMPI_DOUBLE_PRECISION, chunk%zm, &
&                 5, AMPI_COMM_WORLD, status, ierr)
    call AMPI_Recv(chunk%rbzp, 400, AMPI_DOUBLE_PRECISION, chunk%zp, &
&                 4, AMPI_COMM_WORLD, status, ierr)

    chunk%t(1,2:21,2:21) = reshape(chunk%rbxm,(/20,20/))
    chunk%t(22,2:21,2:21) = reshape(chunk%rbxp,(/20,20/))
    chunk%t(2:21,1,2:21) = reshape(chunk%rbym,(/20,20/))
    chunk%t(2:21,22,2:21) = reshape(chunk%rbyp,(/20,20/))
    chunk%t(2:21,2:21,1) = reshape(chunk%rbzm,(/20,20/))
    chunk%t(2:21,2:21,22) = reshape(chunk%rbzp,(/20,20/))
    DO i = 2, 21
      DO j = 2, 21
        DO k = 2, 21
          tval = ( chunk%t(k,j,i)+chunk%t(k,j,i+1)+&
&                 chunk%t(k,j,i-1)+chunk%t(k,j+1,i)+&
&                 chunk%t(k,j-1,i)+chunk%t(k+1,j,i)+chunk%t(k-1,j,i))/7.0
          error = abs(tval-chunk%t(k,j,i))
          chunk%t(k,j,i) = tval
          if(error > maxerr) maxerr = error
        END DO
      END DO
    ENDDO

    if(iter .ge. 25 .and. iter .lt. 85 .and. thisIndex .eq. 35) then
    DO m = 1, 8
      DO i = 2, 21
        DO j = 2, 21
          DO k = 2, 21
            tval = ( chunk%t(k,j,i)+chunk%t(k,j,i+1)+&
&                   chunk%t(k,j,i-1)+chunk%t(k,j+1,i)+&
&                   chunk%t(k,j-1,i)+chunk%t(k+1,j,i)+chunk%t(k-1,j,i))/7.0
            error = abs(tval-chunk%t(k,j,i))
            chunk%t(k,j,i) = tval
            if(error > maxerr) maxerr = error
          END DO
        END DO
      ENDDO
    ENDDO
    endif

    call AMPI_AllReduce(maxerr, maxerr, 1, AMPI_DOUBLE_PRECISION, AMPI_MAX, &
&                      AMPI_COMM_WORLD, ierr)
    endtime = AMPI_Wtime()
    itertime = endtime - starttime
    call AMPI_AllReduce(itertime, itertime, 1, AMPI_DOUBLE_PRECISION, AMPI_SUM,&
&                      AMPI_COMM_WORLD, ierr)
    itertime = itertime/nblocks
    if (thisIndex .eq. 0) then
      write(*,*) ' iter ', iter, ' time: ', itertime
    endif
    starttime = AMPI_Wtime()
    if (mod(iter,20) .eq. 0) then
      call AMPI_Migrate()
      chunk => cp%chunk
    endif
  END DO
  CALL AMPI_Finalize(ierr)
END SUBROUTINE
