!> Pack/unpack ("pup") routine for one chunk_type value.
!!
!! The same routine is driven in several modes through the pupper handle p:
!! the two extents h and w are always processed first, then the data array
!! chunk%t. When the pupper reports that it is unpacking, chunk%t is allocated
!! here with shape (h, w+2) before being filled; when it reports that it is
!! deleting, chunk%t is released after being processed.
!!
!! p     : pupper handle passed through to pup / fpup_isunpacking /
!!         fpup_isdeleting (provided by pupmod).
!! chunk : the chunk whose h, w and t components are processed.
subroutine jacpup(p, chunk)
  use pupmod
  use chunkModule
  implicit none
  integer :: p
  type(chunk_type) :: chunk
  integer :: error

  ! The extents must be handled before the array: on unpacking they provide
  ! the shape needed for the allocation below.
  call pup(p, chunk%h)
  call pup(p, chunk%w)

  if (fpup_isunpacking(p)) then
    allocate(chunk%t(chunk%h, chunk%w+2), stat=error)
    if (error /= 0) then
      ! Without storage the following pup of chunk%t (and every later use of
      ! the chunk) would touch an unallocated array, so stop here instead of
      ! continuing after the message.
      write(*,*) 'Error in memory allocation.'
      stop 1
    end if
  end if

  call pup(p, chunk%t)

  if (fpup_isdeleting(p)) then
    deallocate(chunk%t)
  end if
end subroutine jacpup

!> Entry point of the example: an iterative 5-point relaxation on a 2-D array
!! that is split by columns across the ranks of MPI_COMM_WORLD, which are
!! arranged as a periodic ring.
!!
!! Each rank owns a chunk with h rows and w interior columns stored in
!! chunk%t(1:h, 2:w+1). Column 1 and column w+2 are filled every iteration
!! with the boundary columns received from the left and right neighbour.
!! After each sweep the largest point-wise change over all ranks is reduced
!! with MPI_MAX and printed by rank 0. At iteration 10 MPI_Migrate is called;
!! the chunk was registered with jacpup through MPI_Register beforehand
!! (NOTE(review): both look like AMPI extensions used to move the chunk
!! between processors -- confirm against the AMPI manual).
recursive SUBROUTINE mpi_main
  USE chunkModule
  IMPLICIT NONE
  include 'mpif.h'

  external jacpup
  INTEGER :: i,j
  INTEGER :: iter, left, right
  INTEGER :: tag, tagLeft, tagRight
  INTEGER, DIMENSION(MPI_STATUS_SIZE) :: status
  DOUBLE PRECISION :: error, tval, maxerr, localerr, starttime, endtime
  INTEGER :: niter
  TYPE(chunk_type) :: chunk

  INTEGER :: thisIndex, ierr, nblocks, pupidx
  INTEGER :: allocStat

  CALL MPI_Init(ierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD, thisIndex, ierr)
  CALL MPI_Comm_size(MPI_COMM_WORLD, nblocks, ierr)

  ! Register the chunk together with its pup routine before it is filled.
  pupidx = MPI_Register(chunk, jacpup)
  chunk%h = 100
  chunk%w = 10
  allocate(chunk%t(chunk%h, chunk%w+2), stat=allocStat)
  if (allocStat /= 0) then
    ! Nothing below can work without the array; bring the whole job down.
    write(*,*) 'Error in memory allocation.'
    call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
  end if

  if(thisIndex .eq. 0) then
    niter = 30                ! some dummy proc 0 only initialization
  end if

  ! Only rank 0 set niter above; distribute it to everyone.
  call MPI_Bcast(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)

  ! Initialise the interior columns 2..w+1 over all rows 1..h.
  DO i = 1, chunk%w
    DO j = 0, chunk%h-1
      chunk%t(j+1, i+1) = 100*(i-1) + j
    ENDDO
  ENDDO

  call MPI_Barrier(MPI_COMM_WORLD, ierr)
  if(thisIndex .eq. 0) then
    starttime = MPI_Wtime()
  end if

  maxerr = 0.0d0
  ! Periodic ring neighbours.
  left = mod((thisIndex-1+nblocks), nblocks)
  right = mod((thisIndex+1), nblocks)
  DO iter = 0,niter-1
    localerr = 0.0d0
    ! A message is tagged with iteration and sending rank, so the tag
    ! expected from a neighbour is built from that neighbour's rank.
    tag = iter*nblocks+thisIndex
    tagLeft = iter*nblocks+left
    tagRight = iter*nblocks+right

    ! Halo exchange. MPI_Sendrecv replaces a blocking MPI_Send followed by
    ! MPI_Recv: with every rank sending first, completion depended on the
    ! MPI library buffering the message (and on sending to oneself when
    ! nblocks is 1). Buffers, peers, tags and message order are unchanged.
    !   first interior column -> left ; right neighbour's column -> col w+2
    call MPI_Sendrecv(chunk%t(1,2), chunk%h, MPI_DOUBLE_PRECISION, left, tag, &
                      chunk%t(1,chunk%w+2), chunk%h, MPI_DOUBLE_PRECISION, &
                      right, tagRight, MPI_COMM_WORLD, status, ierr)
    !   last interior column -> right ; left neighbour's column -> col 1
    call MPI_Sendrecv(chunk%t(1,chunk%w+1), chunk%h, MPI_DOUBLE_PRECISION, &
                      right, tag, &
                      chunk%t(1,1), chunk%h, MPI_DOUBLE_PRECISION, &
                      left, tagLeft, MPI_COMM_WORLD, status, ierr)

    ! In-place 5-point average over the interior columns; rows 1 and h are
    ! left untouched. Track the largest change made on this rank.
    DO i = 2, chunk%w+1
      DO j = 2, chunk%h-1
        tval = (chunk%t(j,i) + chunk%t(j,i+1) + chunk%t(j,i-1) &
                + chunk%t(j+1,i) + chunk%t(j-1,i)) / 5.0d0
        error = abs(tval-chunk%t(j,i))
        chunk%t(j,i) = tval
        if(error > localerr) localerr = error
      END DO
    END DO

    ! Send and receive buffers must be distinct variables: passing the same
    ! variable for both is not allowed by the MPI standard.
    call MPI_Allreduce(localerr, maxerr, 1, MPI_DOUBLE_PRECISION, MPI_MAX, &
                       MPI_COMM_WORLD, ierr)
    if (thisIndex .eq. 0) then
      write(*,*) 'error:', maxerr, ' iter ', iter, ' time: ', MPI_Wtime()
    endif
    if(iter .eq. 10) then
      call MPI_Migrate
    endif
  END DO
  if(thisIndex .eq. 0) then
    endtime = MPI_Wtime()
    write(*,*) 'Time per iteration = ', (endtime-starttime)/niter
  end if
  CALL MPI_Finalize(ierr)
END SUBROUTINE mpi_main
