head	1.40;
access;
symbols
	charm6_1:1.40
	charm_6_0_1:1.40
	charm6_0_1:1.40
	charm6_0:1.37
	ChaNGa_1-0:1.31
	charm5_9:1.30
	charm_5-4-2:1.2
	charm_5-4-1:1.2;
locks; strict;
comment	@# @;


1.40
date	2008.12.01.16.48.52;	author gzheng;	state Exp;
branches;
next	1.39;
commitid	lVDGNpYwyIHTGMst;

1.39
date	2008.05.30.17.44.30;	author gzheng;	state Exp;
branches;
next	1.38;
commitid	xf42maFkkltGW05t;

1.38
date	2008.05.28.17.10.58;	author gzheng;	state Exp;
branches;
next	1.37;
commitid	uqomyzpeZCOSOK4t;

1.37
date	2008.01.09.22.54.21;	author gzheng;	state Exp;
branches;
next	1.36;
commitid	pVUZyfr5eckXdNMs;

1.36
date	2007.10.16.16.49.39;	author gzheng;	state Exp;
branches;
next	1.35;
commitid	19584714eb914567;

1.35
date	2007.10.15.22.22.11;	author gzheng;	state Exp;
branches;
next	1.34;
commitid	419a4713e7fc4567;

1.34
date	2007.10.03.07.49.39;	author gzheng;	state Exp;
branches;
next	1.33;
commitid	624c4703497c4567;

1.33
date	2007.09.18.02.53.11;	author gzheng;	state Exp;
branches;
next	1.32;
commitid	9f746ef3d8c4567;

1.32
date	2007.09.17.20.30.29;	author gzheng;	state Exp;
branches;
next	1.31;
commitid	66f746eee3e04567;

1.31
date	2005.07.10.19.42.14;	author gzheng;	state Exp;
branches;
next	1.30;

1.30
date	2005.03.20.17.34.08;	author gzheng;	state Exp;
branches;
next	1.29;

1.29
date	2005.02.08.05.17.58;	author gzheng;	state Exp;
branches;
next	1.28;

1.28
date	2005.02.08.00.37.35;	author gzheng;	state Exp;
branches;
next	1.27;

1.27
date	2005.01.27.04.09.54;	author gzheng;	state Exp;
branches;
next	1.26;

1.26
date	2005.01.26.17.15.29;	author gzheng;	state Exp;
branches;
next	1.25;

1.25
date	2005.01.25.17.25.33;	author gzheng;	state Exp;
branches;
next	1.24;

1.24
date	2005.01.24.04.03.45;	author gzheng;	state Exp;
branches;
next	1.23;

1.23
date	2005.01.23.07.27.50;	author gzheng;	state Exp;
branches;
next	1.22;

1.22
date	2005.01.22.18.09.27;	author gzheng;	state Exp;
branches;
next	1.21;

1.21
date	2005.01.21.20.07.36;	author gzheng;	state Exp;
branches;
next	1.20;

1.20
date	2005.01.16.17.18.51;	author gzheng;	state Exp;
branches;
next	1.19;

1.19
date	2005.01.16.17.09.39;	author gzheng;	state Exp;
branches;
next	1.18;

1.18
date	2005.01.13.16.06.45;	author gzheng;	state Exp;
branches;
next	1.17;

1.17
date	2005.01.11.08.52.48;	author gzheng;	state Exp;
branches;
next	1.16;

1.16
date	2005.01.08.09.17.40;	author gzheng;	state Exp;
branches;
next	1.15;

1.15
date	2005.01.08.00.25.06;	author gzheng;	state Exp;
branches;
next	1.14;

1.14
date	2005.01.07.15.22.58;	author gzheng;	state Exp;
branches;
next	1.13;

1.13
date	2005.01.07.08.36.38;	author gzheng;	state Exp;
branches;
next	1.12;

1.12
date	2005.01.07.07.46.49;	author gzheng;	state Exp;
branches;
next	1.11;

1.11
date	2005.01.06.22.16.17;	author gzheng;	state Exp;
branches;
next	1.10;

1.10
date	2005.01.06.21.45.50;	author gzheng;	state Exp;
branches;
next	1.9;

1.9
date	2005.01.06.20.45.44;	author gzheng;	state Exp;
branches;
next	1.8;

1.8
date	2005.01.06.17.04.30;	author gzheng;	state Exp;
branches;
next	1.7;

1.7
date	2005.01.06.09.12.22;	author gzheng;	state Exp;
branches;
next	1.6;

1.6
date	2005.01.06.06.55.54;	author gzheng;	state Exp;
branches;
next	1.5;

1.5
date	2005.01.06.06.47.10;	author gzheng;	state Exp;
branches;
next	1.4;

1.4
date	2005.01.06.05.45.26;	author gzheng;	state Exp;
branches;
next	1.3;

1.3
date	2002.02.26.07.32.03;	author gzheng;	state Exp;
branches;
next	1.2;

1.2
date	2001.02.28.20.29.17;	author gzheng;	state Exp;
branches;
next	1.1;

1.1
date	2000.12.02.05.43.02;	author olawlor;	state Exp;
branches;
next	;


desc
@@


1.40
log
@avoid starting mpd daemon everytime running charmrun
@
text
@#!/bin/sh
#
# Conv-host for MPI:
#  Translates +pN-style conv-host options into 
# mpirun -npN options.

args=""			# words passed through to mpirun, space-separated
pes=1			# processor count handed to "mpirun -np"
machinefile=""		# non-empty once the caller supplies -machinefile

while [ $# -gt 0 ]
do
	case $1 in
	+ppn)		# "+ppn N": pass both words through unchanged
		args=$args" +ppn "$2
		shift
		;;
	+ppn*)		# "+ppnN": listed before +p*, which would also match it
		args=$args" "$1
		;;
	+p)		# "+p N": the processor count is the next word
		pes=$2
		shift
		;;
	+p*)		# "+pN": strip the leading "+p" in the shell, no awk fork
		pes=${1#+p}
		;;
        -machinefile)	# put "-machinefile FILE" ahead of everything collected so far
		machinefile=$2
		args=" "$1" "$2" "$args
		shift
		;;
	*) 		# anything else goes to mpirun as-is
		args=$args" "$1
		;;
	esac
	shift
done


printf '\nRunning on %s processors: %s\n' "$pes" "$args"


if [ -n "$PBS_NODEFILE" ]
then
# PBS_NODEFILE is set, so we are already inside a job shell: launch mpirun directly
  mpirun_cmd=`which mpirun`
  if echo $mpirun_cmd | grep 'mvapich2'  > /dev/null 2>/dev/null   # the mpirun path mentions mvapich2
  then
    # start the mpd daemon only when mpdtrace reports failure (daemon not started yet)
    if ! mpdtrace > /dev/null 2>/dev/null
    then
      mvapich2-start-mpd
    fi
    mpirun -np $pes $args   # $args is deliberately unquoted so it splits back into words
#    mpdallexit   (left disabled, so the daemon stays up after the run)
  else   # any other mpirun
    test -z "$machinefile" && args=-machinefile" "$PBS_NODEFILE" "$args   # no -machinefile given: use the PBS node list
    echo mpirun -np $pes $args
    mpirun -np $pes $args
  fi
elif [ -n "$LSB_HOSTS" ]
then
# LSB_HOSTS is set (original note: Tungsten): echo the cmpirun command line, then run it; $pes is not passed
  echo cmpirun -lsf -poll -no_smp -gm_long 200000 $args 
  cmpirun -lsf -poll -no_smp -gm_long 200000 $args 
elif [ -n "$PBS_QUEUE" -o -n "$LSF_QUEUE" ]
then
# Interactive mode (a queue name is set, but no job environment): create and submit a batch job
        script="charmrun_script.$$.sh"	# generated job script, written to the current directory
        indir=`pwd`
        output="$indir/charmrun_script.$$.stdout"	# job output file; the polling loop treats its existence as "job done"
        result="$indir/charmrun_script.$$.result"	# the job script writes mpirun's exit status here
	rm -f $result
# Defaults; the hostname case below overrides them per machine
	USE_LSF=0
# Wall-clock limit in minutes: used as "walltime=N:00" (PBS) and "-W 0:N" (LSF)
	walllimit=10
	queue_stat=qstat
	queue_qsub=qsub
	queue_kill=qdel
	hostname=`hostname`
	case "$hostname" in
	turing*.turing.uiuc.edu) 
		ppn='#PBS -l nodes='$pes':ppn=1'
		extra='-machinefile $PBS_NODEFILE'	# single quotes: stays literal until the generated job script runs
		;;
	tg-login*|honest*.ncsa.uiuc.edu)
		# two processes per node: nodes = (pes + 1) / 2 rounded down; a single process gets ppn=1
		nodes=`expr \( $pes + 1 \) / 2`
 		test $pes -eq 1 && ppns=1 || ppns=2
		ppn='#PBS -l nodes='$nodes':ppn='$ppns
		extra='-machinefile $PBS_NODEFILE'
		;;
	co-login*.ncsa.uiuc.edu)
		mem='#PBS -l mem=500mb'
		ncpus="#PBS -l ncpus=$pes"
		;;
	tun*)	# LSF host: swap the queue commands for their LSF counterparts
		USE_LSF=1
		queue_stat=bjobs
		queue_qsub=bsub
		queue_kill=bkill
		;;
	abe*)
		# same node/ppn computation as the tg-login/honest case
		nodes=`expr \( $pes + 1 \) / 2`
 		test $pes -eq 1 && ppns=1 || ppns=2
		ppn='#PBS -l nodes='$nodes':ppn='$ppns
		extra='-machinefile $PBS_NODEFILE'
		;;
	*)	# any other host: request CPUs by count only
		ncpus="#PBS -l ncpus=$pes"
		;;
	esac
	if test "$USE_LSF" -eq 0	# write the job script: PBS flavour here, LSF flavour in the else branch
	then
	  mpirun=`which mpirun`	# path found at submission time; it is written into the job script
          cat > "$script" << EOF
#!/bin/sh
# This is a charmrun-generated PBS batch job script.
# The lines starting with #PBS are queuing system flags:
#
$ppn
#
$ncpus
#
#PBS -l walltime=$walllimit:00
#
$mem
#
#PBS -q $PBS_QUEUE
#
#PBS -N autobuild
#
#PBS -j oe
#
#PBS -o $output

cd $indir

cat \$PBS_NODEFILE
$mpirun -np $pes $extra $args

# Save mpirun exit status
status=\$?
echo \$status > $result
EOF
	else
# LSF flavour: resources are requested with #BSUB lines and the program is started through cmpirun
	  mpirun="cmpirun -lsf -poll -no_smp -gm_long 200000"
          cat > "$script" << EOF
#!/bin/sh
# This is a charmrun-generated LSF batch job script.
# The lines starting with #BSUB are queuing system flags:
#
#BSUB -J autobuild
#BSUB -W 0:$walllimit
#BSUB -n $pes
#BSUB -o $output

cd $indir
echo \$LSB_MCPU_HOSTS
$mpirun $args
# Save mpirun exit status
status=\$?
echo \$status > $result
EOF
	fi

End() {		# End STATUS: cancel the submitted job, clean up, exit with STATUS
	echo "Charmrun> $queue_kill $jobid ..."
	$queue_kill $jobid		# queue_kill is the queue system's cancel command (qdel or bkill)
	rm -f "$script" "tmp.$$"	# also remove the polling loop's queue-status scratch file
	exit $1
}

        echo "Submitting batch job for> $mpirun -np $pes $args"	# announce, then submit until a job ID comes back
        echo " using the command> $queue_qsub $script"
        chmod 755 $script
	while [ -z "$jobid" ]	# NOTE(review): resubmits with no pause and no limit while the ID stays empty
	do
	  [ $USE_LSF = 0 ] && jobid=`$queue_qsub $script|tail -1`	# job ID = last line printed by the submit command
	  [ $USE_LSF = 1 ] && jobid=`$queue_qsub < $script|tail -1|sed -e 's/[^0-9]*//g'`	# LSF: script on stdin; keep only the digits of the last line
	done
	echo "Job enqueued under job ID $jobid"
# From here on, signals 2 and 3 cancel the queued job through End
	trap 'End 1' 2 3
	retry=0	# consecutive failed status queries, counted by the polling loop
# Wait for the job to complete, by checking its status
        while [ true ]		# poll the queue until the job's output file shows up
        do
                $queue_stat $jobid > tmp.$$
		exitstatus=$?
                if test -f "$output"
                then
# The job is done (its output file exists): report the outcome and leave
                        rm tmp.$$
# Success needs both a result file holding 0 and the 'End of program' marker in the output
			status=`cat "$result" 2>/dev/null`; status=${status:-1}	# no result file (job hung or was killed) counts as failure
			test "$status" -eq 0 && { grep 'End of program' "$output" > /dev/null 2>&1; status=$?; }
			cat "$output"
			rm -f "$result"
			test "$status" -eq 0 && rm -f "$script" "$output"	# on failure keep both for inspection
			exit "$status"
                fi
# The job is still queued or running: show the last status line and wait
                tail -1 tmp.$$
                rm tmp.$$
# A failed status query may mean the job ID is not known to the queue right now
		if test $exitstatus -ne 0
		then
# Tolerate transient errors: give up only after more than 6 failed queries in a row
			retry=`expr $retry + 1`
			if test $retry -gt 6
			then
				echo "Charmrun> too many errors, abort!"
				exit 1
			else
				sleep 15
			fi
		else
# The query succeeded, so the job is still in the queue: reset the failure count
			retry=0
                	sleep 20
		fi
        done
else
  [ -n "$MPI_MACHINEFILE" ] && args=" -machinefile $MPI_MACHINEFILE $args"	# no queue system: optional machine file, put ahead of the program
  setarch_cmd=`which setarch 2>/dev/null`
  if [ -n "$setarch_cmd" -a -x "$setarch_cmd" ]	# setarch was found and is executable
  then
    # Run mpirun under "setarch ARCH -R", which disables randomization of
    # the virtual address space (turns on ADDR_NO_RANDOMIZE).
    cur_arch=`uname -m`
    echo "charmrun>  $setarch_cmd $cur_arch -R  mpirun -np $pes $args"
    $setarch_cmd $cur_arch -R  mpirun -np $pes $args
  else
    echo "charmrun> mpirun -np $pes $args"	# no usable setarch: plain mpirun
    mpirun -np $pes $args
  fi
fi


@


1.39
log
@fixed the shell code that detects if setarch presents.
@
text
@d50 5
a54 1
    mvapich2-start-mpd
d56 1
a56 1
    mpdallexit
@


1.38
log
@when calling mpirun using setarch -R command to start MPI program and disable the randomization of address space.
@
text
@d227 1
a227 1
  if test -n "$setarch_cmd"
@


1.37
log
@handles -machinefile better - it is important that -machinefile args appears before ./pgm.
@
text
@d226 12
a237 2
  echo "charmrun> mpirun -np $pes $args"
  mpirun -np $pes $args
@


1.36
log
@a minor fix for charmrun script change I checked in last time.
@
text
@d9 1
d28 5
d40 1
d43 1
d54 3
a56 2
    echo mpirun -np $pes -machinefile $PBS_NODEFILE $args
    mpirun -np $pes -machinefile $PBS_NODEFILE $args
d101 7
@


1.35
log
@charmrun takes MPI_MACHINEFILE environment variable to allow users to choose machinefile.
@
text
@d209 2
a210 1
  [ -n "$MPI_MACHINEFILE" ] && args="$args -machinefile $MPI_MACHINEFILE"
a214 1

@


1.34
log
@for mvapich2, starts mpd in script.
@
text
@d209 1
@


1.33
log
@fixed ppn when npes = 1.
@
text
@d39 10
a48 1
  mpirun -np $pes -machinefile $PBS_NODEFILE $args
@


1.32
log
@now knows abe.ncsa
@
text
@d69 2
a70 1
		ppn='#PBS -l nodes='$nodes':ppn=2'
@


1.31
log
@removed "myr" in generated job script for turing-new.
@
text
@d66 1
a66 1
	tg-login*)
@


1.30
log
@when run charmrun under a job shell, apply -machinefile $PBS_NODEFILE in command line.
@
text
@d63 1
a63 1
		ppn='#PBS -l nodes='$pes':ppn=1:myr'
@


1.29
log
@minor change
@
text
@d38 2
a39 1
  mpirun -np $pes $args
@


1.28
log
@a minor change for new turing.
@
text
@d34 1
a34 1
echo "Running on $pes processors: $args"
@


1.27
log
@for teragrid always use ppn=2 to improve the chance of running autobuild timely.
@
text
@d62 1
a62 1
		ppn='#PBS -l nodes='$pes':ppn=1:myr2'
d157 1
a157 1
	trap 'End 1' 2
@


1.26
log
@calling -gm_long <size> to let cmpirun switch to long protocol for larger messages. this seems to improve the situation where megatest crash on long messages on tungsten.
@
text
@d66 3
a68 1
		ncpus="#PBS -l ncpus=$pes"
@


1.25
log
@reduce job wallclock limit to 10 minutes so that it may have better chance to be schduled.
@
text
@d42 2
a43 1
  cmpirun -lsf -poll -no_smp $args
d118 1
a118 1
	  mpirun=cmpirun
d131 1
a131 1
$mpirun -lsf -poll -no_smp $args
@


1.24
log
@use queue myr2 on new turing.
@
text
@d53 2
d94 1
a94 1
#PBS -l walltime=20:00
d124 1
a124 1
#BSUB -W 0:20
@


1.23
log
@when hit "ctrl^c", cleanup job.
@
text
@d59 1
a59 1
		ppn='#PBS -l nodes='$pes':ppn=1'
a87 1
# This determines the number of nodes and pes (here $nodes and $pes):
a91 1
# This determines the wall-clock time limit (here 5 minutes):
a95 1
# Queue name (see info about other queues in web documentation)
a97 1
# Job name (default = name of script file)
a101 1
# Filename for standard output (default = <job_name>.o<job_id>)
d103 1
a103 1
#
a105 1
# This is the actual command to run the job:
d108 1
d120 1
a120 1

a123 1
# Filename for standard output (default = <job_name>.o<job_id>)
d181 1
a181 1
				echo "Charmrun> abort!"
d184 1
a184 1
				sleep 10
@


1.22
log
@make qstat more robust to tolerate trasient errors by returing a few more times.
@
text
@d55 1
d58 1
a58 1
	turing*.turing.uiuc.edu|tg-login*) 
d62 4
d74 1
d87 1
a87 1

d140 8
d157 2
d171 1
a171 1
			test $status -eq 0 && status=`grep 'End of program' $output > /dev/null 2>/dev/null`
@


1.21
log
@now support LSF as on tun.ncsa.
@
text
@a50 2
        echo "Submitting batch job for> mpirun -np $pes $args"
        echo " using the command> qsub $script"
d114 1
d128 1
a128 1
cmpirun -lsf -poll -no_smp $args
d134 2
d143 1
d148 2
a149 2
# Job ID does not exist now
                if test $? -ne 0 -o -f $output
d164 17
a180 1
                sleep 20
@


1.20
log
@increased job walltime limit to 20min
@
text
@d39 5
a43 1
elif [ -n "$PBS_QUEUE" ]
a52 1
	mpirun=`which mpirun`
d54 3
d67 5
d76 4
a79 1
        cat > $script << EOF
d114 21
d138 2
a139 1
          jobid=`qsub $script|tail -1`
d145 1
a145 1
                qstat $jobid > tmp.$$
d147 1
a147 1
                if test $? -ne 0 -a -f $output
@


1.19
log
@check if program correctly finished by grep'ing "End of Program"
@
text
@d76 1
a76 1
#PBS -l walltime=5:00
@


1.18
log
@removed  PBS -ncpus which new turing doesn't like.
@
text
@d117 1
@


1.17
log
@updated for running on teragrid
@
text
@d59 4
d73 1
a73 1
#PBS -l ncpus=$pes
@


1.16
log
@don't remove output file and job script when job error occurs.
@
text
@d53 1
a53 1
	turing*.turing.uiuc.edu) 
@


1.15
log
@use exit status of mpirun to better catch the runtime failure. also detect if pr
ogram hang.
@
text
@d57 3
d74 2
d114 2
a115 1
			rm -f $script $result $output
@


1.14
log
@a fix in echo $? in here file.
@
text
@d45 2
d70 1
a70 1

a73 7
# Export all my environment variables to the job
##PBS -V
#
# Charge job to project abc (recommended for users with multiple projects)
# [If project is invalid, a valid project will be automatically selected]
##PBS -A abc
#
d87 3
a89 1
echo STATUS: \$?
d101 2
a102 1
                if test ! $? -eq 0 -a -f $output
d106 5
a110 9
                        if `grep 'End of program' $output > /dev/null 2>/dev/null`
			then
				rm $script
                        	exec cat $output
			else
				cat $output
				rm -f $output 
				exit 1
			fi
@


1.13
log
@a minor change
@
text
@d92 1
a92 1
echo STATUS: $?
@


1.12
log
@removed several questionabale PBS directives.
@
text
@d124 1
a124 2
	echo "Charmrun> failed."
	exit 1
@


1.11
log
@a minor change to make some turing specific parameters
@
text
@a68 3
# Set memory limit to 500 Mbytes
#PBS -l mem=500mb

d73 1
a73 1
#PBS -V
d92 1
d110 1
d114 1
a114 1
				rm $output 
@


1.10
log
@test $PBS_NODEFILE to see if charmrun is runnin inside job script or not.
@
text
@d50 2
a51 2
	if [ $hostname = 'turing-2.turing.uiuc.edu' ] 
	then 
d54 2
a55 1
	fi
@


1.9
log
@make job submission script smarter by detecting unexpecting errors.
@
text
@d36 1
a36 1
if [ x"$PBS_QUEUE" = x ]
d39 2
a40 1
else
d123 3
a125 1

@


1.8
log
@use PBS_QUEUE to specify the job queue to submit.
@
text
@d46 8
d60 2
d63 1
a63 1

d91 2
a92 1
mpirun -np $pes $args
d95 4
a98 1
        jobid=`qsub $script|tail -1`
d104 1
a104 1
                if [ ! $? -eq 0 ]
@


1.7
log
@changed to check end of program to tell if a job succeed or not.
@
text
@d36 1
a36 1
if [ x"$PBS_TEST" = x ]
d61 1
a61 1
#PBS -q standard
@


1.6
log
@remove tmp files generated
@
text
@a43 1
        errout="$indir/charmrun_script.$$.stderr"
d73 2
a77 3
# Filename for standard error (default = <job_name>.e<job_id>)
#PBS -e $errout

d94 1
a94 1
                        if test -s $errout 
d96 4
a99 2
				cat $output $errout 
				rm $output $errout
a100 2
			else
                        	exec cat $output
@


1.5
log
@test error output to detect error.
@
text
@d96 8
a103 2
                        test -s $errout && cat $output && cat $errout && exit 1
                        exec cat $output
@


1.4
log
@automatically qsub a job if a queueing system (qsub) is needed to run a job.
PBS_TEST environment variable need to be set.
@
text
@d44 1
d78 1
a78 1
#PBS -e testjob.err
d96 1
d102 1
a102 1
                sleep 10
@


1.3
log
@make charmrun script recorgnize +ppn
@
text
@d36 46
d83 24
@


1.2
log
@remove the "" for the args, which doesn't work on turing
@
text
@d13 7
@


1.1
log
@Mpi charmrun translates conv-host-style parameters to mpirun-style parameters; Sim/Uth charmruns do nothing.  This should make it easier to run Charm++ programs on a variety of machines.
@
text
@d21 1
a21 1
		args=$args" "'"'$1'"'
@

