!> AMPI entry point: five-point relaxation on a strip of a 2-D grid, with the
!! strips arranged in a periodic ring of ranks.
!!
!! Each rank allocates a chunk of chunk%h rows by chunk%w interior columns
!! plus one ghost column on each side (columns 1 and chunk%w+2).  Rank 0
!! chooses the iteration count and broadcasts it.  Every iteration exchanges
!! the edge columns with the left and right ring neighbours and then averages
!! each interior point with its four neighbours, updating the grid in place.
!! Rank 0 prints a line per iteration and the mean time per iteration.
recursive SUBROUTINE main
  USE mpi
  USE chunkModule
  IMPLICIT NONE
    interface
      function AMPI_WTIME()
        DOUBLE PRECISION :: AMPI_WTIME
      end function AMPI_WTIME
    end interface

  INTEGER :: i,j
  INTEGER :: iter, left, right
  INTEGER :: tag, tagLeft, tagRight
  INTEGER, DIMENSION(AMPI_STATUS_SIZE) :: status
  ! "error" is only referenced by the disabled convergence check below.
  DOUBLE PRECISION :: error, tval, maxerr, starttime, endtime
  INTEGER :: niter
  TYPE(chunk_type), POINTER :: chunk

  INTEGER :: thisIndex, ierr, nblocks

  CALL AMPI_Init(ierr)
  CALL AMPI_Comm_rank(AMPI_COMM_WORLD, thisIndex, ierr)
  CALL AMPI_Comm_size(AMPI_COMM_WORLD, nblocks, ierr)

  ! Local strip: h rows, w interior columns, two extra ghost columns.
  ALLOCATE(chunk)
  chunk%h = 100
  chunk%w = 10
  allocate(chunk%t(chunk%h, chunk%w+2))


  if(thisIndex .eq. 0) then
    niter = 30                ! some dummy proc 0 only initialization
  end if

  ! Only rank 0 has set niter; everyone else gets it from the broadcast.
  call AMPI_Bcast(niter, 1, AMPI_INTEGER, 0, AMPI_COMM_WORLD, ierr)

  ! Fill the interior columns (2 .. w+1).  The ghost columns are left
  ! untouched here; they are filled by the first halo exchange before
  ! they are read.
  DO i = 1, chunk%w
    DO j = 0, chunk%h-1
      chunk%t(j+1, i+1) = 100*(i-1) + j
    ENDDO
  ENDDO

  ! Line all ranks up before rank 0 starts the clock.
  call AMPI_Barrier(AMPI_COMM_WORLD, ierr)
  if(thisIndex .eq. 0) then
    starttime = AMPI_Wtime()
  end if

  maxerr = 0.0
  ! Periodic ring neighbours.
  left = mod((thisIndex-1+nblocks), nblocks)
  right = mod((thisIndex+1), nblocks)
  DO iter = 0,niter-1
    !maxerr = 0.0
    ! A message is tagged with the iteration and the SENDER's rank, so the
    ! receive tags are built from the neighbour's rank.
    tag = iter*nblocks+thisIndex
    tagLeft = iter*nblocks+left
    tagRight = iter*nblocks+right

    ! Halo exchange.  A combined send/receive is used instead of a blocking
    ! Send followed by a Recv: with every rank sending first around a closed
    ! ring, completion would depend on the runtime buffering the message.
    ! Buffers, counts, peers and tags are the same as a Send/Recv pair.

    ! First interior column goes left; right ghost column comes from the right.
    call AMPI_Sendrecv(chunk%t(1, 2), chunk%h, AMPI_DOUBLE_PRECISION, left, tag, &
                       chunk%t(1, chunk%w+2), chunk%h, AMPI_DOUBLE_PRECISION, right, tagRight, &
                       AMPI_COMM_WORLD, status, ierr)
    ! Last interior column goes right; left ghost column comes from the left.
    call AMPI_Sendrecv(chunk%t(1, chunk%w+1), chunk%h, AMPI_DOUBLE_PRECISION, right, tag, &
                       chunk%t(1, 1), chunk%h, AMPI_DOUBLE_PRECISION, left, tagLeft, &
                       AMPI_COMM_WORLD, status, ierr)

    ! Five-point average over the interior columns.  Rows 1 and h are never
    ! written, so they keep their initial values.  The update is in place:
    ! points visited later in the sweep read already-updated neighbours.
    DO i = 2, chunk%w+1
      DO j = 2, chunk%h-1
        tval=(chunk%t(j,i)+chunk%t(j,i+1)+chunk%t(j,i-1)+chunk%t(j+1,i)+chunk%t(j-1,i))/5.0
        !error = abs(tval-chunk%t(j,i))
        chunk%t(j,i) = tval
        !if(error > maxerr) maxerr = error
      END DO
    END DO

    ! The convergence check is disabled, so maxerr stays 0.0 in the output
    ! below.  NOTE(review): if re-enabled, the reduction must not pass
    ! maxerr as both the send and the receive buffer.
    !call AMPI_AllReduce(maxerr, maxerr, 1, AMPI_DOUBLE_PRECISION, AMPI_MAX, &
!&                      AMPI_COMM_WORLD, ierr)
    if (thisIndex .eq. 0) then
      write(*,*) 'error:', maxerr, ' iter ', iter, ' time: ', AMPI_Wtime()
    endif
  END DO
  if(thisIndex .eq. 0) then
    endtime = AMPI_Wtime()
    write(*,*) 'Time per iteration = ', (endtime-starttime)/niter
  end if

  ! Release the grid storage and the chunk itself; a POINTER target is not
  ! freed automatically when the subroutine returns.
  DEALLOCATE(chunk%t)
  DEALLOCATE(chunk)

  CALL AMPI_Finalize(ierr)
END SUBROUTINE main
