00001 #include "converse.h"
00002
00003 #include "../sockRoutines.h"
00004 #include "../sockRoutines.c"
00005 #include "../ccs-auth.h"
00006 #include "../ccs-auth.c"
00007 #include "../ccs-server.h"
00008 #include "../ccs-server.c"
00009
00010 #include <stdio.h>
00011 #include <string.h>
00012 #include <ctype.h>
00013 #include <errno.h>
00014 #include <setjmp.h>
00015 #include <stdlib.h>
00016 #include <signal.h>
00017 #include <fcntl.h>
00018 #include <time.h>
00019 #include <assert.h>
00020 #include <math.h>
00021 #if CMK_BPROC
00022 #include <sys/bproc.h>
00023 #endif
00024 #if CMK_USE_POLL
00025 #include <poll.h>
00026 #endif
00027 #include <sys/stat.h>
00028
00029
00030 #if defined(_WIN32) && !defined(__CYGWIN__)
00031
00032 #define getcwd _getcwd
00033 #define strdup _strdup
00034 #define unlink _unlink
00035 #define open _open
00036 #define fdopen _fdopen
00037 #define ftruncate _chsize
00038 #include <winbase.h>
00039 #include <direct.h>
00040 #include <io.h>
00041 #include <sys/timeb.h>
00042 #include <process.h>
00043 #define DIRSEP "\\"
00044 #define SIGBUS -1
00045 #define SIGKILL -1
00046 #define SIGQUIT -1
00047
00048
00049 #else
00050 #include <pwd.h>
00051 #include <unistd.h>
00052 #define DIRSEP "/"
00053 #endif
00054
00055 #if CMK_RSH_NOT_NEEDED
00056 # define CMK_USE_RSH 0
00057
00058 #else
00059 # define CMK_USE_RSH 1
00060 #ifdef __MINGW_H
00061 # include <rpc.h>
00062 #elif !defined(__CYGWIN__)
00063 # include <rpc/rpc.h>
00064 #else
00065 # include <w32api/rpc.h>
00066 #endif
00067 # if CMK_RSH_IS_A_COMMAND
00068 # define RSH_CMD "rsh"
00069 # endif
00070
00071 # if CMK_RSH_USE_REMSH
00072 # define RSH_CMD "remsh"
00073 # endif
00074 #endif
00075
00076 #include "daemon.h"
00077
00078
00079 #define DEBUGF(x)
00080
00081 #ifndef MAXPATHLEN
00082 #define MAXPATHLEN 1024
00083 #endif
00084
00085
00086
#ifdef HSTART

/* Only compiled for hierarchical startup builds (HSTART defined).
   NOTE(review): its use is not visible in this part of the file. */
int mynodes_start ;

#endif

/* File-local timer value; NOTE(review): usage is outside this section. */
static double ftTimer;

/* Global timer value; NOTE(review): presumably a GetClock() timestamp --
   confirm against the code that assigns it. */
double start_timer;

/* Dynamically allocated array of process ids (starts out NULL).
   NOTE(review): presumably one entry per spawned remote shell -- confirm. */
int *rsh_pids=NULL;
00098
/*
 * Return the current wall-clock time in seconds as a double.
 * On native Windows the value comes from _ftime() (millisecond fields);
 * elsewhere from gettimeofday() (microsecond fields).  A gettimeofday()
 * failure is reported with perror() and terminates the program.
 */
double GetClock(void)
{
#if defined(_WIN32) && !defined(__CYGWIN__)
  struct _timeb now;
  _ftime(&now);
  return (now.time * 1.0 + now.millitm * 1.0E-3);
#else
  struct timeval now;
  if (gettimeofday(&now, NULL) < 0) {
    perror("gettimeofday");
    exit(1);
  }
  return (now.tv_sec * 1.0 + now.tv_usec * 1.0E-6);
#endif
}
00112
00113
/*
 * Report whether the file at `path` can be opened for reading.
 * Returns 1 if fopen(path, "r") succeeds (the stream is closed again),
 * 0 otherwise.
 *
 * Defined with a prototype (rather than an old-style parameter list) so
 * that calls are type-checked, matching the other definitions in this file.
 */
int probefile(char *path)
{
  FILE *f = fopen(path, "r");
  if (f == NULL) return 0;
  fclose(f);
  return 1;
}
00122
/*
 * Return the login name of the user running this process.
 *
 * Native Windows: asks GetUserName(), falling back to the preset "dunno"
 * held in a static buffer.
 * Elsewhere: looks up the password entry for getuid().  If that fails and
 * popen() is available (CMK_HAS_POPEN), the output of "id -u -n" is used;
 * otherwise the literal "unknown" is returned.
 *
 * The returned string may be static storage, storage owned by the C
 * library, a string literal, or a strdup()ed copy; callers must not
 * modify it.
 */
char *mylogin(void)
{
#if defined(_WIN32) && !defined(__CYGWIN__)
  static char name[100]={'d','u','n','n','o',0};
  int len=100;
  GetUserName(name,&len);
  return name;
#else
  struct passwd *self;

  self = getpwuid(getuid());
  if (self==0) {
#if CMK_HAS_POPEN
    char uname[64];
    FILE *p;
    int got;

    p = popen("id -u -n", "r");
    if (p) {
      /* Bound the read to the buffer size and make sure a name was
         actually parsed before using it. */
      got = fscanf(p, "%63s", uname);
      pclose(p);
      if (got == 1) return strdup(uname);
    }
    return "unknown";
#else
    return "unknown";
#endif
  }
  return self->pw_name;
#endif
}
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
/*
 * When built with NOTIFY defined, send one UDP datagram containing the
 * current login name to a fixed address (128.174.241.211, port 6571).
 * Failure to create the socket is silently ignored.  Without NOTIFY this
 * function does nothing.
 */
void ping_developers()
{
#ifdef NOTIFY
  char payload[1000];

  skt_ip_t target_ip=skt_lookup_ip("128.174.241.211");
  unsigned int target_port=6571;
  struct sockaddr_in target=skt_build_addr(target_ip,target_port);
  SOCKET sock;

  sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (sock != INVALID_SOCKET) {
    sprintf(payload,"%s",mylogin());
    sendto(sock, payload, strlen(payload), 0,
           (struct sockaddr *)&target, sizeof(target));
    skt_close(sock);
  }
#endif
}
00184
00185
00186
00187
00188
00189
00190
/*
 * Singly linked list of path substitutions.  pathfix() replaces each
 * occurrence of s1 in a path with s2.  Note that the typedef name
 * `pathfixlist` is a POINTER to the struct.
 */
typedef struct pathfixlist {
  char *s1;                  /* substring to search for */
  char *s2;                  /* replacement text */
  struct pathfixlist *next;  /* next substitution, or NULL at the end */
} *pathfixlist;
00196
00197 pathfixlist pathfix_append(char *s1, char *s2, pathfixlist l)
00198 {
00199 pathfixlist pf = (pathfixlist)malloc(sizeof(struct pathfixlist));
00200 pf->s1 = s1;
00201 pf->s2 = s2;
00202 pf->next = l;
00203 return pf;
00204 }
00205
00206 char *pathfix(char *path, pathfixlist fixes)
00207 {
00208 char buffer[MAXPATHLEN]; pathfixlist l;
00209 char buf2[MAXPATHLEN];
00210 char *offs; int mod, len;
00211 strcpy(buffer,path);
00212 mod = 1;
00213 while (mod) {
00214 mod = 0;
00215 for (l=fixes; l; l=l->next) {
00216 len = strlen(l->s1);
00217 offs = strstr(buffer, l->s1);
00218 if (offs) {
00219 offs[0]=0;
00220 sprintf(buf2,"%s%s%s",buffer,l->s2,offs+len);
00221 strcpy(buffer,buf2);
00222 mod = 1;
00223 }
00224 }
00225 }
00226 return strdup(buffer);
00227 }
00228
00229 char *pathextfix(char *path, pathfixlist fixes, char *ext)
00230 {
00231 char *newpath = pathfix(path, fixes);
00232 char *ret;
00233 if (ext == NULL) return newpath;
00234 ret = (char *)malloc(strlen(newpath)+strlen(ext)+2);
00235 strcpy(ret, newpath);
00236 strcat(ret, ext);
00237 return ret;
00238 }
00239
00240
00241
00242
00243
00244
00245
/* Return 1 if `c` is a single or double quote character, 0 otherwise. */
int is_quote(char c)
{
  switch (c) {
  case '\'':
  case '"':
    return 1;
  default:
    return 0;
  }
}
00250
/*
 * Remove a trailing line terminator from `s` in place: a final '\n' is
 * dropped, then a final '\r' (so both "\n" and "\r\n" endings, as well as
 * a bare trailing '\r', are handled).
 *
 * Empty and one-character strings are handled without touching memory
 * before the start of `s`, and a '\r' that is not at the end of the
 * string is left alone.
 */
void zap_newline(char *s)
{
  size_t len = strlen(s);
  if (len > 0 && s[len-1] == '\n') s[--len] = '\0';
  if (len > 0 && s[len-1] == '\r') s[--len] = '\0';
}
00260
00261
/*
 * Return a newly allocated, NUL-terminated copy of the characters in
 * [lo, hi).  One leading and one trailing quote character (see is_quote)
 * are stripped if present.  The caller owns the result.
 *
 * The quote checks are only made while the range is non-empty, so an
 * empty range never reads hi[-1] and the length can never go negative.
 * Allocation failure ends the program with an error message.
 */
char *substr(char *lo, char *hi)
{
  int len;
  char *res;
  if (lo < hi && is_quote(*lo)) lo++;
  if (lo < hi && is_quote(*(hi-1))) hi--;
  len = hi-lo;
  res = (char *)malloc(1+len);
  if (res == NULL) {
    fprintf(stderr, "ERROR> Out of memory while copying a string.\n");
    exit(1);
  }
  memcpy(res, lo, len);
  res[len]=0;
  return res;
}
00274
/*
 * Compare the character range [lo, hi) with the NUL-terminated string
 * `str`.  Returns 1 when they have the same length and contents,
 * 0 otherwise.
 */
int subeqs(char *lo, char *hi, char *str)
{
  int wanted = strlen(str);
  return (hi - lo == wanted) && (memcmp(lo, str, wanted) == 0);
}
00282
00283
/* Return a pointer to the first character of `p` that is neither a space
   nor a tab (possibly the terminating NUL). */
char *skipblanks(char *p)
{
  for (; *p == ' ' || *p == '\t'; p++)
    ;
  return p;
}
00289
00290
/*
 * Skip one token starting at `p` and return a pointer just past it.
 * A token that begins with ' or " extends through the matching closing
 * quote; an unmatched quote is reported and terminates the program.
 * Any other token extends up to the next space, tab or end of string.
 */
char *skipstuff(char *p)
{
  if (*p == '\'' || *p == '"') {
    char closer = *p;
    p++;
    for (; *p != '\0' && *p != closer; p++)
      ;
    if (*p != closer) {
      fprintf(stderr, "ERROR> Unmatched quote in nodelist file.\n");
      exit(1);
    }
    return p + 1;
  }
  while (*p != '\0' && *p != ' ' && *p != '\t')
    p++;
  return p;
}
00307
00308 #if CMK_USE_RSH
00309 char *getenv_rsh()
00310 {
00311 char *e;
00312
00313 e = getenv("CONV_RSH");
00314 return e ? e : RSH_CMD;
00315 }
00316 #endif
00317
00318 #if !defined(_WIN32) || defined(__CYGWIN__)
/*
 * Return the X display to use, derived from the DISPLAY environment
 * variable, or NULL if DISPLAY is unset or contains no ':'.
 *
 * A local display (":N" or "unix:N") is rewritten as "<our IP>:N" using
 * skt_my_ip(); any other value is returned unchanged.  The result lives
 * in a static buffer that is overwritten by the next call.
 *
 * Values that do not fit the static buffer are rejected with a warning
 * (NULL is returned) instead of overflowing it.
 */
char *getenv_display()
{
  static char result[100],ipBuf[200];
  char *e, *p;

  e = getenv("DISPLAY");
  if (e==0) return NULL;
  p = strrchr(e, ':');
  if (p==0) return NULL;
  if ((e[0]==':')||(strncmp(e,"unix:",5)==0)) {
    const char *ip = skt_print_ip(ipBuf,skt_my_ip());
    if (strlen(ip)+1+strlen(p+1) >= sizeof(result)) {
      fprintf(stderr,"Warning> DISPLAY value is too long, ignored.\n");
      return NULL;
    }
    sprintf(result,"%s:%s",ip,p+1);
  }
  else {
    if (strlen(e) >= sizeof(result)) {
      fprintf(stderr,"Warning> DISPLAY value is too long, ignored.\n");
      return NULL;
    }
    strcpy(result, e);
  }
  return result;
}
/*
 * Return a copy of the DISPLAY environment variable exactly as set, or
 * NULL if it is unset or contains no ':'.  The result lives in a static
 * buffer that is overwritten by the next call.
 *
 * A value that does not fit the static buffer is rejected with a warning
 * (NULL is returned) instead of overflowing it.
 */
char *getenv_display_no_tamper()
{
  static char result[100];
  char *e;

  e = getenv("DISPLAY");
  if (e==0) return NULL;
  if (strrchr(e, ':')==0) return NULL;
  if (strlen(e) >= sizeof(result)) {
    fprintf(stderr,"Warning> DISPLAY value is too long, ignored.\n");
    return NULL;
  }
  strcpy(result, e);
  return result;
}
00346
00347 #endif
00348
00349
00350
00351
00352
00353
00354
/*
 * One registered command-line parameter.  Definitions are kept in a
 * singly linked list headed by `ppdefs`.  Note that the typedef name
 * `ppdef` is a POINTER to the struct.
 */
typedef struct ppdef
{
  /* Where the parsed value is stored; the member in use is selected
     by `type`. */
  union {
    int *i;      /* type 'i': integer */
    double *r;   /* type 'r': real */
    char **s;    /* type 's': string */
    int *f;      /* type 'f': flag */
  } where;
  const char *lname;   /* option name, without the leading option characters */
  const char *doc;     /* help text shown by pparam_printdocs() */
  char type;           /* 'i', 'r', 's' or 'f' */
  struct ppdef *next;  /* next definition in the list */
}
*ppdef;
00369
/* Head of the list of registered parameter definitions. */
static ppdef ppdefs;

/* State of the parse in progress: index of the argument being examined,
   the argument vector being parsed, and the option prefix character. */
static int pparam_pos;
static char **pparam_argv;
static char pparam_optc='-';
/* Message describing the most recent parse error (see pparam_parseopt). */
char pparam_error[100];
00376
00377 static ppdef pparam_find(lname)
00378 const char *lname;
00379 {
00380 ppdef def;
00381 for (def=ppdefs; def; def=def->next)
00382 if (strcmp(def->lname, lname)==0)
00383 return def;
00384 return 0;
00385 }
00386
00387 static ppdef pparam_cell(lname)
00388 const char *lname;
00389 {
00390 ppdef def = pparam_find(lname);
00391 if (def) return def;
00392 def = (ppdef)malloc(sizeof(struct ppdef));
00393 def->lname = lname;
00394 def->type = 's';
00395 def->doc = "(undocumented)";
00396 def->next = ppdefs;
00397 ppdefs = def;
00398 return def;
00399 }
00400
00401
00402
00403 void pparam_int(int *where,int defValue,
00404 const char *arg,const char *doc)
00405 {
00406 ppdef def = pparam_cell(arg);
00407 def->type = 'i';
00408 def->where.i = where; *where=defValue;
00409 def->lname=arg;
00410 def->doc=doc;
00411 }
00412
00413 void pparam_flag(int *where,int defValue,
00414 const char *arg,const char *doc)
00415 {
00416 ppdef def = pparam_cell(arg);
00417 def->type = 'f';
00418 def->where.f = where; *where=defValue;
00419 def->lname=arg;
00420 def->doc=doc;
00421 }
00422
00423 void pparam_real(double *where,double defValue,
00424 const char *arg,const char *doc)
00425 {
00426 ppdef def = pparam_cell(arg);
00427 def->type = 'r';
00428 def->where.r = where; *where=defValue;
00429 def->lname=arg;
00430 def->doc=doc;
00431 }
00432 void pparam_str(char **where,char *defValue,
00433 const char *arg,const char *doc)
00434 {
00435 ppdef def = pparam_cell(arg);
00436 def->type = 's';
00437 def->where.s = where; *where=defValue;
00438 def->lname=arg;
00439 def->doc=doc;
00440 }
00441
00442 static int pparam_setdef(def, value)
00443 ppdef def; char *value;
00444 {
00445 char *p;
00446 switch(def->type)
00447 {
00448 case 'i' :
00449 *def->where.i = strtol(value, &p, 10);
00450 if (*p) return -1;
00451 return 0;
00452 case 'r' :
00453 *def->where.r = strtod(value, &p);
00454 if (*p) return -1;
00455 return 0;
00456 case 's' :
00457 *def->where.s = strdup(value);
00458 return 0;
00459 case 'f' :
00460 *def->where.f = strtol(value, &p, 10);
00461 if (*p) return -1;
00462 return 0;
00463 }
00464 return -1;
00465 }
00466
00467 int pparam_set(lname, value)
00468 char *lname; char *value;
00469 {
00470 ppdef def = pparam_cell(lname);
00471 return pparam_setdef(def, value);
00472 }
00473
00474 char *pparam_getdef(def)
00475 ppdef def;
00476 {
00477 static char result[100];
00478 switch(def->type)
00479 {
00480 case 'i': sprintf(result,"%d", *def->where.i); return result;
00481 case 'r': sprintf(result,"%f",*def->where.r); return result;
00482 case 's': return *def->where.s?*def->where.s:"";
00483 case 'f': sprintf(result,"%d", *def->where.f); return result;
00484 }
00485 return NULL;
00486 }
00487
00488 void pparam_printdocs()
00489 {
00490 ppdef def; int len, maxname, maxdoc;
00491 maxname = 0;
00492 maxdoc = 0;
00493 for (def=ppdefs; def; def=def->next)
00494 {
00495 len = strlen(def->lname);
00496 if (len>maxname) maxname=len;
00497 len = strlen(def->doc);
00498 if (len>maxdoc) maxdoc=len;
00499 }
00500 fprintf(stderr,"\n");
00501 fprintf(stderr,"Charmrun Command-line Parameters:\n");
00502 for (def=ppdefs; def; def=def->next)
00503 {
00504 fprintf(stderr," %c%c%-*s ",pparam_optc,pparam_optc,maxname,def->lname);
00505 fprintf(stderr," %-*s [%s]\n",maxdoc,def->doc,pparam_getdef(def));
00506 }
00507 fprintf(stderr,"\n");
00508 }
00509
00510 void pparam_delarg(i)
00511 int i;
00512 {
00513 int j;
00514 for (j=i; pparam_argv[j]; j++)
00515 pparam_argv[j]=pparam_argv[j+1];
00516 }
00517
/*
 * Return the number of entries in the NULL-terminated vector `argv`
 * (not counting the terminator).
 *
 * Defined with a prototype (rather than an old-style parameter list) so
 * that calls are type-checked.
 */
int pparam_countargs(char **argv)
{
  int argc = 0;
  while (argv[argc] != NULL)
    argc++;
  return argc;
}
00525
00526 int pparam_parseopt()
00527 {
00528 int ok; ppdef def=NULL;
00529 char *opt = pparam_argv[pparam_pos];
00530
00531 if ((opt[1]=='+')&&(opt[2]==0))
00532 {
00533 pparam_delarg(pparam_pos);
00534 while (pparam_argv[pparam_pos]) pparam_pos++;
00535 return 0;
00536 }
00537
00538 if (opt[1]==0)
00539 {
00540 sprintf(pparam_error,"Illegal option +\n");
00541 return -1;
00542 }
00543
00544 if (opt[1]=='+') def = pparam_find(opt+2);
00545 else
00546 {
00547 char name[2];
00548 name[0]=opt[1];
00549 if (strlen(opt)<=2 || !isalpha(opt[2]))
00550 {
00551 name[1]=0;
00552 def = pparam_find(name);
00553 }
00554 }
00555 if (def==NULL)
00556 {
00557 if (opt[1]=='+')
00558 {
00559 sprintf(pparam_error,"Option %s not recognized.",opt);
00560 return -1;
00561 } else {
00562
00563 pparam_pos++;
00564 return 0;
00565 }
00566 }
00567
00568 if ((def->type=='f')&&(opt[1]!='+')&&(opt[2]))
00569 {
00570 sprintf(pparam_error,"Option %s should not include a value",opt);
00571 return -1;
00572 }
00573 if (def->type=='f')
00574 {
00575 *def->where.f = 1;
00576 pparam_delarg(pparam_pos);
00577 return 0;
00578 }
00579
00580 if ((opt[1]=='+')||(opt[2]==0))
00581 {
00582 pparam_delarg(pparam_pos);
00583 opt = pparam_argv[pparam_pos];
00584 }
00585 else opt+=2;
00586 if ((opt == 0)||(opt[0] == 0))
00587 {
00588 sprintf(pparam_error,"%s must be followed by a value.",opt);
00589 return -1;
00590 }
00591 ok = pparam_setdef(def, opt);
00592 pparam_delarg(pparam_pos);
00593 if (ok<0)
00594 {
00595 sprintf(pparam_error,"Illegal value for %s",opt);
00596 return -1;
00597 }
00598 return 0;
00599 }
00600
00601 int pparam_parsecmd(optchr, argv)
00602 char optchr; char **argv;
00603 {
00604 pparam_error[0]=0;
00605 pparam_argv = argv;
00606 pparam_optc = optchr;
00607 pparam_pos = 0;
00608 while(1)
00609 {
00610 char *opt = pparam_argv[pparam_pos];
00611 if (opt==0) break;
00612 if (opt[0]!=optchr) pparam_pos++;
00613 else if (pparam_parseopt()<0) return -1;
00614 }
00615 return 0;
00616 }
00617
00618 #ifdef HSTART
/*
 * Return a deep copy of the NULL-terminated vector `argv`, or NULL if
 * `argv` is NULL or memory runs out.  The copy has one spare slot after
 * its terminating NULL (arg_init appends "++child-charmrun" there); the
 * spare slot is also initialised to NULL.
 *
 * Every string allocation is checked; on failure everything allocated so
 * far is released and NULL is returned.
 */
char **
dupargv (char **argv)
{
  int argc, i;
  char **copy;

  if (argv == NULL)
    return NULL;

  /* the vector */
  for (argc = 0; argv[argc] != NULL; argc++);
  copy = (char **) malloc ((argc + 2) * sizeof (char *));
  if (copy == NULL)
    return NULL;

  /* the strings */
  for (i = 0; i < argc; i++)
    {
      size_t len = strlen (argv[i]);
      copy[i] = (char *) malloc (len + 1);
      if (copy[i] == NULL)
        {
          while (i > 0) free (copy[--i]);
          free (copy);
          return NULL;
        }
      memcpy (copy[i], argv[i], len + 1);
    }
  copy[argc] = NULL;
  copy[argc + 1] = NULL;
  return copy;
}
00645
00646 #endif
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
/* Limits used by the node table code. */
#define MAX_NODES 1000
#define MAX_LINE_LENGTH 1000

/* Arguments passed on to the node program (everything after the node
   program name), and their count.  Set up by arg_init(). */
char **arg_argv;
int arg_argc;

/* Values of charmrun's own command-line parameters; each is registered
   (with its default and help text) in arg_init(). */
int arg_requested_pes;   /* +p: number of processes to create */
int arg_timeout;         /* ++timeout: seconds to wait per host connection */
int arg_verbose;         /* ++verbose: print diagnostic messages */
char *arg_nodelist;      /* ++nodelist: file containing list of nodes */
char *arg_nodegroup;     /* ++nodegroup: which group of nodes to use */
char *arg_runscript;     /* ++runscript: script to run node-program with */
char *arg_charmrunip;    /* ++useip: IP address to use for charmrun */
#if CONVERSE_VERSION_VMI
char *arg_vmispecfile;   /* ++specfile: VMI device specfile */
#endif

int arg_debug;           /* ++debug */
int arg_debug_no_pause;  /* ++debug-no-pause */
int arg_debug_no_xrdb;   /* ++no-xrdb */
int arg_charmdebug;      /* ++charmdebug */
char *arg_debug_commands;/* ++debug-commands: commands run inside gdb */

int arg_local;           /* ++local: start node programs locally */
int arg_batch_spawn;     /* ++batch */
int arg_scalable_start;  /* ++scalable-start */

#ifdef HSTART
int arg_hierarchical_start;  /* ++hierarchical-start */
int arg_child_charmrun;      /* ++child-charmrun (not a user-specified flag) */
#endif
int arg_help;            /* ++help, -?, -h or --help */
int arg_ppn;             /* ++ppn: number of pes per node */
int arg_usehostname;     /* ++usehostname */

#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
int arg_read_pes=0;      /* ++readpe: number of host names to read */
#endif

#if CMK_USE_RSH
int arg_maxrsh;          /* ++maxrsh: maximum number of rsh's at a time */
char *arg_shell;         /* ++remote-shell */
int arg_in_xterm;        /* ++in-xterm */
char *arg_debugger;      /* ++debugger */
char *arg_xterm;         /* ++xterm */
char *arg_display;       /* ++display */
int arg_ssh_display;     /* ++ssh-display */
char *arg_mylogin;       /* result of mylogin(), set in arg_init() */
#endif
int arg_mpiexec;         /* ++mpiexec */
int arg_no_va_rand;      /* ++no-va-randomization */

/* Node program path and current directory.  arg_init() fills in the
   "_a" (absolute) node program path, the "_r" path as given on the
   command line, and arg_currdir_a; arg_currdir_r is not set in the
   code visible here. */
char *arg_nodeprog_a;
char *arg_nodeprog_r;
char *arg_currdir_a;
char *arg_currdir_r;

int arg_server;             /* ++server: enable CCS mode */
int arg_server_port=0;      /* ++server-port */
char *arg_server_auth=NULL; /* ++server-auth */
int replay_single=0;        /* set when +replay-detail is on the command line */

#if CMK_BPROC
int arg_startpe;         /* ++startpe */
int arg_endpe;           /* ++endpe */
int arg_singlemaster;    /* ++singlemaster */
int arg_skipmaster;      /* ++skipmaster */
#endif
00727
00728 void arg_init(int argc, char **argv)
00729 {
00730 static char buf[1024];
00731
00732 int i, local_def=0;
00733 #if CMK_CHARMRUN_LOCAL
00734 local_def=1;
00735 #endif
00736
00737 pparam_int(&arg_requested_pes, 1, "p", "number of processes to create");
00738 pparam_int(&arg_timeout, 60, "timeout", "seconds to wait per host connection");
00739 pparam_flag(&arg_verbose, 0, "verbose", "Print diagnostic messages");
00740 pparam_str(&arg_nodelist, 0, "nodelist", "file containing list of nodes");
00741 pparam_str(&arg_nodegroup,"main", "nodegroup", "which group of nodes to use");
00742 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
00743 pparam_int(&arg_read_pes, 0, "readpe", "number of host names to read into the host table");
00744 #endif
00745
00746 #if CMK_CCS_AVAILABLE
00747 pparam_flag(&arg_server, 0, "server", "Enable client-server (CCS) mode");
00748 pparam_int(&arg_server_port, 0, "server-port", "Port to listen for CCS requests");
00749 pparam_str(&arg_server_auth, 0, "server-auth", "CCS Authentication file");
00750 #endif
00751 pparam_flag(&arg_local, local_def, "local", "Start node programs locally without daemon");
00752 pparam_int(&arg_batch_spawn, 0, "batch", "Rsh several node programs at a time, avoiding overloading charmrun pe");
00753 pparam_flag(&arg_scalable_start, 0, "scalable-start", "scalable start");
00754 #ifdef HSTART
00755 pparam_flag(&arg_hierarchical_start, 0, "hierarchical-start", "hierarchical start");
00756 pparam_flag(&arg_child_charmrun, 0, "child-charmrun", "child charmrun");
00757 #endif
00758 pparam_flag(&arg_usehostname, 0, "usehostname", "Send nodes our symbolic hostname instead of IP address");
00759 pparam_str(&arg_charmrunip, 0, "useip", "Use IP address provided for charmrun IP");
00760 pparam_flag(&arg_mpiexec, 0, "mpiexec", "use mpiexec to start jobs");
00761 #if CMK_USE_RSH
00762 pparam_flag(&arg_debug, 0, "debug", "Run each node under gdb in an xterm window");
00763 pparam_flag(&arg_debug_no_pause,0, "debug-no-pause","Like debug, except doesn't pause at beginning");
00764 pparam_str(&arg_debug_commands, 0, "debug-commands", "Commands to be run inside gdb at startup");
00765 pparam_flag(&arg_debug_no_xrdb,0, "no-xrdb","Don't check xrdb");
00766
00767
00768
00769
00770
00771
00772 #if !defined(_WIN32)
00773 pparam_flag(&arg_charmdebug, 0, "charmdebug", "Used only when charmrun is started by charmdebug");
00774 #endif
00775
00776 pparam_int(&arg_maxrsh, 16, "maxrsh", "Maximum number of rsh's to run at a time");
00777 pparam_str(&arg_shell, 0, "remote-shell", "which remote shell to use");
00778 pparam_str(&arg_debugger, 0, "debugger", "which debugger to use");
00779 pparam_str(&arg_display, 0, "display", "X Display for xterm");
00780 pparam_flag(&arg_ssh_display, 0, "ssh-display", "use own X Display for each ssh session");
00781 pparam_flag(&arg_in_xterm, 0, "in-xterm", "Run each node in an xterm window");
00782 pparam_str(&arg_xterm, 0, "xterm", "which xterm to use");
00783 #endif
00784 #ifdef CMK_BPROC
00785
00786 pparam_int(&arg_startpe, 0, "startpe", "first pe to start job(SCYLD)");
00787 pparam_int(&arg_endpe, 1000000, "endpe", "last pe to start job(SCYLD)");
00788 pparam_flag(&arg_singlemaster, 0, "singlemaster", "Only assign one process to master node(SCYLD)");
00789 pparam_flag(&arg_skipmaster, 0, "skipmaster", "Donot assign any process to master node(SCYLD)");
00790 if (arg_skipmaster && arg_singlemaster) {
00791 printf("Charmrun> 'singlemaster' is ignored due to 'skipmaster'. \n");
00792 arg_singlemaster = 0;
00793 }
00794 pparam_flag(&arg_debug, 0, "debug", "turn on more verbose debug print");
00795 #endif
00796 #ifdef CONVERSE_VERSION_VMI
00797 pparam_str (&arg_vmispecfile, 0, "specfile", "device specfile to load (VMI)");
00798 #endif
00799 pparam_str(&arg_runscript, 0, "runscript", "script to run node-program with");
00800 pparam_flag(&arg_help, 0, "help", "print help messages");
00801 pparam_int(&arg_ppn, 0, "ppn", "number of pes per node");
00802 pparam_flag(&arg_no_va_rand, 0, "no-va-randomization", "Disables randomization of the virtual address space");
00803 #ifdef HSTART
00804 arg_argv = dupargv(argv);
00805 #endif
00806
00807 if (pparam_parsecmd('+', argv) < 0) {
00808 fprintf(stderr,"ERROR> syntax: %s\n",pparam_error);
00809 pparam_printdocs();
00810 exit(1);
00811 }
00812
00813
00814 for (i=0;argv[i];i++) {
00815 if (0==strcmp(argv[i],"-?") ||
00816 0==strcmp(argv[i],"-h") ||
00817 0==strcmp(argv[i],"--help"))
00818 arg_help=1;
00819 }
00820 if (arg_help) {
00821 pparam_printdocs();
00822
00823 }
00824
00825 #ifdef HSTART
00826 if (!arg_hierarchical_start || arg_child_charmrun)
00827 #endif
00828 arg_argv = argv+1;
00829 arg_argc = pparam_countargs(arg_argv);
00830 if (arg_argc<1) {
00831 fprintf(stderr,"ERROR> You must specify a node-program.\n");
00832 pparam_printdocs();
00833 exit(1);
00834 }
00835
00836 #ifdef HSTART
00837 if (!arg_hierarchical_start || arg_child_charmrun){
00838
00839 arg_argv++; arg_argc--;
00840 }
00841 else{
00842
00843 arg_argv++;arg_argc--;
00844
00845 arg_argv[arg_argc]=malloc(sizeof(char) * strlen("++child-charmrun"));
00846 strcpy(arg_argv[arg_argc++],"++child-charmrun");
00847 arg_argv[arg_argc] = NULL;
00848 }
00849 #else
00850 arg_argv++; arg_argc--;
00851 #endif
00852
00853 if (arg_server_port || arg_server_auth) arg_server=1;
00854
00855 if (arg_debug || arg_debug_no_pause) {
00856 arg_verbose=1;
00857
00858 arg_argv[arg_argc++]="++debug";
00859 }
00860
00861
00862 for (i=0;argv[i];i++) {
00863 if (0==strcmp(argv[i],"+replay-detail")) {
00864 replay_single = 1;
00865 arg_requested_pes = 1;
00866 }
00867 }
00868
00869 #ifdef CMK_BPROC
00870 if (arg_local) {
00871 fprintf(stderr,"Warning> ++local cannot be used in bproc version, ignored!\n");
00872 arg_local = 0;
00873 }
00874 #endif
00875
00876 #if CMK_USE_RSH
00877
00878 if(!arg_shell) {
00879 if (arg_mpiexec)
00880 arg_shell = "mpiexec";
00881 else
00882 arg_shell = getenv_rsh();
00883 }
00884
00885
00886 if(!arg_display)
00887 arg_display = getenv_display_no_tamper();
00888 if ((arg_debug || arg_debug_no_pause || arg_in_xterm) && (arg_display==0)) {
00889 fprintf(stderr,"ERROR> DISPLAY must be set to use debugging mode\n");
00890 exit(1);
00891 }
00892 if (arg_debug || arg_debug_no_pause)
00893 arg_timeout=8*60*60;
00894
00895
00896 if(!arg_debugger)
00897 arg_debugger = "gdb" ;
00898
00899 if(!arg_xterm)
00900 arg_xterm = "xterm" ;
00901
00902 arg_mylogin = mylogin();
00903 #endif
00904
00905 #if CONVERSE_VERSION_VMI
00906 if (!arg_vmispecfile) {
00907 arg_vmispecfile = getenv ("VMI_SPECFILE");
00908 }
00909
00910 if (!arg_vmispecfile) {
00911 fprintf (stderr, "ERROR> ++specfile not specified and VMI_SPECFILE not given in environment\n");
00912 exit (1);
00913 }
00914 #endif
00915
00916
00917 getcwd(buf, 1023);
00918 arg_currdir_a = strdup(buf);
00919
00920
00921 arg_nodeprog_r = argv[1];
00922
00923 if (arg_nodeprog_r[0]=='-' || arg_nodeprog_r[0]=='+')
00924 {
00925
00926 printf("Charmrun does not recognize the flag '%s'.\n",arg_nodeprog_r);
00927 if (arg_nodeprog_r[0]=='+')
00928 printf("Charm++'s flags need to be placed *after* the program name.\n");
00929 pparam_printdocs();
00930 exit(1);
00931 }
00932
00933
00934 #if defined(_WIN32) && !defined(__CYGWIN__)
00935 if (argv[1][1]==':' || argv[1][0]=='\\' && argv[1][1]=='\\') {
00936 #else
00937 if (argv[1][0]=='/') {
00938 #endif
00939
00940 arg_nodeprog_a = argv[1];
00941 } else {
00942 sprintf(buf,"%s%s%s",arg_currdir_a,DIRSEP,arg_nodeprog_r);
00943 arg_nodeprog_a = strdup(buf);
00944 }
00945 if (arg_scalable_start) {
00946 printf("Charmrun> scalable start enabled. \n");
00947 if (arg_debug || arg_debug_no_pause) {
00948 fprintf(stderr, "Charmrun> Error: ++scalable-start does not support debugging mode. \n");
00949 exit(1);
00950 }
00951 }
00952
00953 #ifdef HSTART
00954 if (arg_hierarchical_start) {
00955 printf("Charmrun> Hierarchical scalable start enabled. \n");
00956 if (arg_debug || arg_debug_no_pause) {
00957 fprintf(stderr, "Charmrun> Error: ++hierarchial-start does not support debugging mode. \n");
00958 exit(1);
00959 }
00960 if (arg_verbose) {
00961 fprintf(stderr, "Charmrun> Warning: you have enabled verbose output with Hierarchical startup, you may get inconsistent verbose outputs. \n++hierarchial-start does not support verbose mode. \n");
00962 }
00963
00964 }
00965 else if(arg_child_charmrun) {
00966 fprintf(stderr, "Charmrun> Error: ++child-charmrun is not a user-specified flag. \n");
00967 exit(1);
00968 }
00969 #endif
00970 if(arg_debug && arg_local){
00971 printf("++debug cannot be used with ++local.\n");
00972 exit(0);
00973 }
00974 }
00975
00976
00977
00978
00979
00980
00981
/* NOTE(review): portOk is not used in the code visible here -- confirm
   its purpose where it is referenced. */
static int portOk = 1;
/* Name of a temporary nodelist file, if nodetab_file_find() created one
   (native Windows only); nodetab_init() unlinks it after reading. */
static const char *nodetab_tempName=NULL;
00984 char *nodetab_file_find()
00985 {
00986 char buffer[MAXPATHLEN];
00987
00988
00989 if (arg_nodelist) {
00990 char *path = arg_nodelist;
00991 if (probefile(path)) return strdup(path);
00992 fprintf(stderr,"ERROR> No such nodelist file %s\n",path);
00993 exit(1);
00994 }
00995
00996 if (getenv("NODELIST")) {
00997 char *path = getenv("NODELIST");
00998 if (path && probefile(path)) return strdup(path);
00999 fprintf(stderr,"ERROR> Cannot find nodelist file %s\n",path);
01000 exit(1);
01001 }
01002
01003 if (probefile("./nodelist")) return strdup("./nodelist");
01004 #if defined(_WIN32) && !defined(__CYGWIN__)
01005 tmpnam(buffer);
01006 nodetab_tempName=strdup(buffer);
01007 #else
01008 if (getenv("HOME")) {
01009 sprintf(buffer,"%s/.nodelist",getenv("HOME"));
01010 }
01011 #endif
01012 if (!probefile(buffer))
01013 {
01014
01015 FILE *f=fopen(buffer,"w");
01016 if (f==NULL) {
01017 fprintf(stderr,"ERROR> Cannot create a 'nodelist' file.\n");
01018 exit(1);
01019 }
01020 fprintf(f,"group main\nhost localhost\n");
01021 fclose(f);
01022 }
01023 return strdup(buffer);
01024 }
01025
/*
 * One entry of the node table: a single process slot on a host.
 * Defaults are assigned by nodetab_reset(); most fields can be overridden
 * by keywords in the nodelist file (see nodetab_args()).
 */
typedef struct nodetab_host {
  char *name;             /* host name (copied in nodetab_makehost) */
  skt_ip_t ip;            /* IP address looked up from name */
  pathfixlist pathfixes;  /* path substitutions ("pathfix" keyword) */
  char *ext;              /* "ext" keyword; NULL by default */
  int cpus;               /* "cpus" keyword or ++ppn; default 1 */
  int rank;               /* index of this slot among the host's cpus */
  double speed;           /* "speed" keyword; default 1.0 */
  int nice;               /* "nice" keyword; default -100 */
  int forks;              /* default 0; NOTE(review): not set from the nodelist in the code visible here */

  int dataport;           /* default -1 until assigned */
  SOCKET ctrlfd;          /* default -1 until assigned */
#if CMK_USE_RSH
  char *shell;            /* "shell" keyword; default arg_shell */
  char *debugger ;        /* "debugger" keyword; default arg_debugger */
  char *xterm ;           /* "xterm" keyword; default arg_xterm */
  char *login;            /* "login" keyword; default arg_mylogin */
  char *passwd;           /* "passwd" keyword; default "*" */
  char *setup;            /* "setup" keyword (rest of the line); default "*" */
#endif

#if CMK_USE_IBVERBS
  ChInfiAddr *qpData;
#endif
#if CMK_USE_IBUD
  ChInfiAddr qp;
#endif


} nodetab_host;
01057
/* The node table: nodetab_size entries are in use out of a capacity of
   nodetab_max; each entry is a separately allocated nodetab_host. */
nodetab_host **nodetab_table;
int nodetab_max;
int nodetab_size;
/* Indices (into nodetab_table) of the entries whose rank is 0, and how
   many of them there are. */
int *nodetab_rank0_table;
int nodetab_rank0_size;

#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
/* NOTE(review): not used in the code visible here. */
int loaded_max_pe;
#endif
01067
01068 void nodetab_reset(nodetab_host *h)
01069 {
01070 h->name="SET_H->NAME";
01071 h->ip=_skt_invalid_ip;
01072 h->pathfixes = 0;
01073 h->ext = NULL;
01074 h->speed = 1.0;
01075 h->cpus = 1;
01076 h->rank = 0;
01077 h->nice=-100;
01078 h->forks = 0;
01079 h->dataport=-1;
01080 h->ctrlfd=-1;
01081 #if CMK_USE_RSH
01082 h->shell = arg_shell;
01083 h->debugger = arg_debugger;
01084 h->xterm = arg_xterm;
01085 h->login = arg_mylogin;
01086 h->passwd = "*";
01087 h->setup = "*";
01088 #endif
01089 }
01090
01091 void nodetab_add(nodetab_host *h)
01092 {
01093 if (h->rank == 0)
01094 nodetab_rank0_table[nodetab_rank0_size++] = nodetab_size;
01095 nodetab_table[nodetab_size] = (nodetab_host *) malloc(sizeof(nodetab_host));
01096
01097 if (arg_verbose) {
01098 char ips[200];
01099 skt_print_ip(ips,h->ip);
01100 printf("Charmrun> adding client %d: \"%s\", IP:%s\n", nodetab_size, h->name, ips);
01101 }
01102
01103 *nodetab_table[nodetab_size++] = *h;
01104 }
01105
01106 void nodetab_makehost(char *name,nodetab_host *h)
01107 {
01108 h->name=strdup(name);
01109 h->ip = skt_innode_lookup_ip(name);
01110 if (skt_ip_match(h->ip,_skt_invalid_ip)) {
01111 #ifdef CMK_BPROC
01112
01113 if (!(1 == arg_requested_pes && atoi(name)==-1))
01114 #endif
01115 {
01116 fprintf(stderr,"ERROR> Cannot obtain IP address of %s\n", name);
01117 exit(1);
01118 }
01119 }
01120 if (nodetab_size == nodetab_max) return;
01121 nodetab_add(h);
01122 }
01123
01124 char *nodetab_args(char *args,nodetab_host *h)
01125 {
01126 if (arg_ppn>0) h->cpus = arg_ppn;
01127 while(*args != 0) {
01128 char *b1 = skipblanks(args), *e1 = skipstuff(b1);
01129 char *b2 = skipblanks(e1), *e2 = skipstuff(b2);
01130 while (*b1=='+') b1++;
01131 #if CMK_USE_RSH
01132 if (subeqs(b1,e1,"login")) h->login = substr(b2,e2);
01133 else if (subeqs(b1,e1,"passwd")) h->passwd = substr(b2,e2);
01134 else if (subeqs(b1,e1,"setup")) h->setup = strdup(b2);
01135 else if (subeqs(b1,e1,"shell")) h->shell = substr(b2,e2);
01136 else if (subeqs(b1,e1,"debugger")) h->debugger = substr(b2,e2);
01137 else if (subeqs(b1,e1,"xterm")) h->xterm = substr(b2,e2);
01138 else
01139 #endif
01140 if (subeqs(b1,e1,"speed")) h->speed = atof(b2);
01141 else if (subeqs(b1,e1,"cpus")) {
01142 if (arg_ppn==0) h->cpus = atol(b2);
01143 }
01144 else if (subeqs(b1,e1,"pathfix")) {
01145 char *b3 = skipblanks(e2), *e3 = skipstuff(b3);
01146 args = skipblanks(e3);
01147 h->pathfixes=pathfix_append(substr(b2,e2),substr(b3,e3),h->pathfixes);
01148 e2 = e3;
01149 }
01150 else if (subeqs(b1,e1,"ext")) h->ext = substr(b2,e2);
01151 else if (subeqs(b1,e1,"nice")) h->nice = atoi(b2);
01152 else return args;
01153 args = skipblanks(e2);
01154 }
01155 #if CMK_SHARED_VARS_UNAVAILABLE
01156 if (h->cpus != 1) {
01157 fprintf(stderr,"Warning> Invalid cpus %d in nodelist ignored.\n", h->cpus);
01158 h->cpus = 1;
01159 }
01160 #endif
01161 return args;
01162 }
01163
01164
01165 void nodetab_init_for_local()
01166 {
01167 int tablesize, i, done=0;
01168 nodetab_host group;
01169
01170 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
01171 if(arg_read_pes == 0){
01172 arg_read_pes = arg_requested_pes;
01173 }
01174 #endif
01175
01176 tablesize = arg_requested_pes;
01177 nodetab_table=(nodetab_host**)malloc(tablesize*sizeof(nodetab_host*));
01178 nodetab_rank0_table=(int*)malloc(tablesize*sizeof(int));
01179 nodetab_max=tablesize;
01180
01181 nodetab_reset(&group);
01182 if (arg_ppn==0) arg_ppn=1;
01183 #if CMK_SHARED_VARS_UNAVAILABLE
01184 if (arg_ppn > 1) {
01185 fprintf(stderr,"Warning> Invalid ppn %d in nodelist ignored.\n", arg_ppn);
01186 arg_ppn=1;
01187 }
01188 #endif
01189 group.cpus = arg_ppn;
01190 i = 0;
01191 while (!done) {
01192 char *hostname = "127.0.0.1";
01193 for (group.rank = 0; group.rank<arg_ppn; group.rank++) {
01194 nodetab_makehost(hostname, &group);
01195 if (++i == arg_requested_pes) { done = 1; break; }
01196 }
01197 }
01198 }
01199
01200
01201 #ifdef HSTART
01202
/* Set by nodetab_init_hierarchical_start():
   branchfactor    - ceil(sqrt(number of unique hosts));
   nodes_per_child - unique hosts divided by branchfactor, rounded. */
int branchfactor;
int nodes_per_child;
/* For each run of consecutive rank-0 entries with the same host name,
   the index of the first entry of the run; and the number of such runs. */
int * nodetab_unique_table;
int nodetab_unique_size;
/* Defined later in the file. */
char *nodetab_name(int i);
01208 void nodetab_init_hierarchical_start(void)
01209 {
01210 int node_start = 0;
01211 char * node_name;
01212 nodetab_unique_size = 0;
01213 nodetab_unique_table = (int *)malloc(nodetab_rank0_size * sizeof(int));
01214 while(node_start<nodetab_rank0_size)
01215 {
01216 nodetab_unique_table[nodetab_unique_size++] = node_start;
01217 node_name = nodetab_name(node_start);
01218 do{
01219 node_start++;
01220 }
01221 while(node_start<nodetab_rank0_size&&(!strcmp(nodetab_name(node_start),node_name)));
01222
01223 }
01224 branchfactor = ceil(sqrt(nodetab_unique_size));
01225 nodes_per_child = round(nodetab_unique_size*1.0/branchfactor);
01226 }
01227 #endif
01228
/*
 * Build the global node table (nodetab_table / nodetab_rank0_table).
 *
 * For local or mpiexec runs the table is filled by nodetab_init_for_local().
 * Otherwise the nodesfile returned by nodetab_file_find() is read line by
 * line: "group" lines switch the current group, "host" lines add one table
 * entry per cpu of the host (only while the current group matches
 * arg_nodegroup), and reading stops once the requested number of entries has
 * been collected.  If fewer entries than arg_requested_pes were read (and
 * arg_requested_pes is not MAX_NODES) the entries read so far are repeated
 * round-robin via nodetab_add().
 *
 * Exits the process when the nodesfile cannot be opened, contains an
 * unrecognized command, or yields no host for the requested group.
 */
void nodetab_init()
{
FILE *f,*fopen();
char *nodesfile;
nodetab_host global,group,host;
char input_line[MAX_LINE_LENGTH];
int rightgroup, basicsize, i, remain;


if (arg_local || arg_mpiexec) {
nodetab_init_for_local();
goto fin;
}


nodesfile = nodetab_file_find();
if(arg_verbose)
fprintf(stderr, "Charmrun> using %s as nodesfile\n", nodesfile);
if (!(f = fopen(nodesfile,"r"))) {
fprintf(stderr,"ERROR> Cannot read %s: %s\n",nodesfile,strerror(errno));
exit(1);
}

/* Size the tables: message-logging builds use arg_read_pes (defaulting to
   arg_requested_pes), all other builds use arg_requested_pes. */
/* NOTE(review): the malloc results below are not checked. */
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
if(arg_read_pes == 0){
arg_read_pes = arg_requested_pes;
}
nodetab_table=(nodetab_host**)malloc(arg_read_pes*sizeof(nodetab_host*));
nodetab_rank0_table=(int*)malloc(arg_read_pes*sizeof(int));
nodetab_max=arg_read_pes;
fprintf(stderr,"arg_read_pes %d arg_requested_pes %d\n",arg_read_pes,arg_requested_pes);
#else
nodetab_table=(nodetab_host**)malloc(arg_requested_pes*sizeof(nodetab_host*));
nodetab_rank0_table=(int*)malloc(arg_requested_pes*sizeof(int));
nodetab_max=arg_requested_pes;
#endif


/* "global" holds file-wide settings, "group" the settings of the current
   group; the group named "main" is active before any "group" line. */
nodetab_reset(&global);
group=global;
rightgroup = (strcmp(arg_nodegroup,"main")==0);

while(fgets(input_line,sizeof(input_line)-1,f)!=0) {
/* Stop as soon as the table holds the requested number of entries. */
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
if (nodetab_size == arg_read_pes) break;
#else
if (nodetab_size == arg_requested_pes) break;
#endif
/* Lines starting with '#' are comments. */
if (input_line[0]=='#') continue;
zap_newline(input_line);
/* A zero return from nodetab_args() presumably means the line was consumed
   as settings; it is then applied to the current group as well -- TODO
   confirm against nodetab_args(). */
if (!nodetab_args(input_line,&global)) {

nodetab_args(input_line,&group);
}
else {
/* Split the line into: command word [b1,e1), name word [b2,e2), and the
   remaining arguments starting at b3. */
char *b1 = skipblanks(input_line), *e1 = skipstuff(b1);
char *b2 = skipblanks(e1), *e2 = skipstuff(b2);
char *b3 = skipblanks(e2);
if (subeqs(b1,e1,"host")) {
if (rightgroup) {
/* Start from the group settings, apply per-host arguments, then add one
   entry for every cpu of this host. */
host=group;
nodetab_args(b3,&host);
for (host.rank=0; host.rank<host.cpus; host.rank++)
nodetab_makehost(substr(b2,e2),&host);
}
} else if (subeqs(b1,e1, "group")) {
/* A new group starts from the global settings. */
group=global;
nodetab_args(b3,&group);
rightgroup = subeqs(b2,e2,arg_nodegroup);
} else if (b1!=b3) {
/* b1 == b3 only when the line holds nothing but blanks. */
fprintf(stderr,"ERROR> unrecognized command in nodesfile:\n");
fprintf(stderr,"ERROR> %s\n", input_line);
exit(1);
}
}
}
fclose(f);
if (nodetab_tempName!=NULL) unlink(nodetab_tempName);


/* Wrap around the hosts read so far until enough entries exist. */
basicsize = nodetab_size;
if (basicsize==0) {
fprintf(stderr,"ERROR> No hosts in group %s\n", arg_nodegroup);
exit(1);
}
while ((nodetab_size < arg_requested_pes)&&(arg_requested_pes!=MAX_NODES))
nodetab_add(nodetab_table[nodetab_size%basicsize]);

fin:

/* Clamp each entry's cpu count to the number of entries left from its
   rank-0 entry to the end of the table. */
/* NOTE(review): "remain" is only assigned when a rank-0 entry is seen, so
   this relies on the first table entry having rank 0. */
for (i=0; i<nodetab_size; i++) {
if (nodetab_table[i]->rank == 0)
remain = nodetab_size - i;
if (nodetab_table[i]->cpus > remain)
nodetab_table[i]->cpus = remain;
}

#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
loaded_max_pe = arg_requested_pes-1;
#endif
#ifdef HSTART
if(arg_hierarchical_start)
nodetab_init_hierarchical_start();
#endif

}
01335
01336
01337 nodetab_host *nodetab_getinfo(int i)
01338 {
01339 if (nodetab_table==0) {
01340 fprintf(stderr,"ERROR> Node table not initialized.\n");
01341 exit(1);
01342 }
01343 return nodetab_table[i];
01344 }
01345
01346
01347 nodetab_host *nodetab_getnodeinfo(int i)
01348 {
01349 return nodetab_getinfo(nodetab_rank0_table[i]);
01350 }
01351
01352
01353 char *nodetab_name(int i) { return nodetab_getinfo(i)->name; }
01354 pathfixlist nodetab_pathfixes(int i){ return nodetab_getinfo(i)->pathfixes; }
01355 char *nodetab_ext(int i) { return nodetab_getinfo(i)->ext; }
01356 skt_ip_t nodetab_ip(int i) { return nodetab_getinfo(i)->ip; }
01357 unsigned int nodetab_cpus(int i) { return nodetab_getinfo(i)->cpus; }
01358 unsigned int nodetab_rank(int i) { return nodetab_getinfo(i)->rank; }
01359 int nodetab_dataport(int i) { return nodetab_getinfo(i)->dataport; }
01360 int nodetab_nice(int i) { return nodetab_getinfo(i)->nice; }
01361 SOCKET nodetab_ctrlfd(int i) { return nodetab_getinfo(i)->ctrlfd;}
01362 #if CMK_USE_RSH
01363 char *nodetab_setup(int i) { return nodetab_getinfo(i)->setup; }
01364 char *nodetab_shell(int i) { return nodetab_getinfo(i)->shell; }
01365 char *nodetab_debugger(int i) { return nodetab_getinfo(i)->debugger; }
01366 char *nodetab_xterm(int i) { return nodetab_getinfo(i)->xterm; }
01367 char *nodetab_login(int i) { return nodetab_getinfo(i)->login; }
01368 char *nodetab_passwd(int i) { return nodetab_getinfo(i)->passwd; }
01369 #endif
01370
01371
01372
01373
01374
01375
01376
01377
01378
01379
01380
01381 static ChNodeinfo *nodeinfo_arr;
01382
01383 void nodeinfo_allocate(void)
01384 {
01385 nodeinfo_arr=(ChNodeinfo *)malloc(nodetab_rank0_size*sizeof(ChNodeinfo));
01386 }
01387 void nodeinfo_add(const ChSingleNodeinfo *in,SOCKET ctrlfd)
01388 {
01389 int node=ChMessageInt(in->nodeNo);
01390 ChNodeinfo i=in->info;
01391 unsigned int nt;
01392 unsigned int pe;
01393 unsigned int dataport;
01394 int lid,qpn,psn;
01395 if (node<0 || node>=nodetab_rank0_size)
01396 {fprintf(stderr,"Unexpected node %d registered!\n",node);exit(1);}
01397 nt=nodetab_rank0_table[node];
01398 i.nPE=ChMessageInt_new(nodetab_cpus(nt));
01399 if (arg_mpiexec)
01400 nodetab_getinfo(nt)->ip = i.IP;
01401 i.IP=nodetab_ip(nt);
01402 #if CMK_USE_IBVERBS
01403 nodeinfo_arr[node] = i;
01404 for (pe=0;pe<nodetab_cpus(nt);pe++){
01405 nodetab_table[nt+pe]->ctrlfd=ctrlfd;
01406 }
01407
01408 #else
01409 dataport = ChMessageInt(i.dataport);
01410 if (0==dataport)
01411 {fprintf(stderr,"Node %d could not initialize network!\n",node);exit(1);}
01412 nodeinfo_arr[node]=i;
01413 for (pe=0;pe<nodetab_cpus(nt);pe++)
01414 {
01415 nodetab_table[nt+pe]->dataport=dataport;
01416 nodetab_table[nt+pe]->ctrlfd=ctrlfd;
01417 #if CMK_USE_IBUD
01418 nodetab_table[nt+pe]->qp=i.qp;
01419 #endif
01420 }
01421 if (arg_verbose) {
01422 char ips[200];
01423 skt_print_ip(ips,nodetab_ip(nt));
01424 printf("Charmrun> client %d connected (IP=%s data_port=%d)\n", nt, ips, dataport);
01425 #if CMK_USE_IBUD
01426 printf("Charmrun> client %d lid=%d qpn=%i psn=%i\n",nt,ChMessageInt(i.qp.lid),ChMessageInt(i.qp.qpn),ChMessageInt(i.qp.psn));
01427 #endif
01428 }
01429 #endif
01430 }
01431
01432
01433
01434
01435
01436
01437
01438
01439
01440
01441
01442
01443
01444
01445
/* Heap-allocated, NUL-terminated text read from stdin but not yet consumed. */
char *input_buffer;

/*
 * Read one more chunk (at most one line) from stdin and append it to
 * input_buffer, growing the buffer as needed.  stdout is flushed first.
 * Exits the process on end-of-file or if the buffer cannot be grown.
 */
void input_extend()
{
  char line[1024];
  char *grown;
  size_t len = input_buffer ? strlen(input_buffer) : 0;

  fflush(stdout);
  if (fgets(line, sizeof(line), stdin)==0) {
    fprintf(stderr,"end-of-file on stdin");
    exit(1);
  }
  /* Keep the old pointer until realloc is known to have succeeded. */
  grown = realloc(input_buffer, len + strlen(line) + 1);
  if (grown == NULL) {
    fprintf(stderr,"Charmrun> out of memory while reading stdin\n");
    exit(1);
  }
  input_buffer = grown;
  strcpy(input_buffer+len, line);
}
01460
01461 void input_init()
01462 {
01463 input_buffer = strdup("");
01464 }
01465
01466 char *input_extract(nchars)
01467 int nchars;
01468 {
01469 char *res = substr(input_buffer, input_buffer+nchars);
01470 char *tmp = substr(input_buffer+nchars, input_buffer+strlen(input_buffer));
01471 free(input_buffer);
01472 input_buffer = tmp;
01473 return res;
01474 }
01475
01476 char *input_gets()
01477 {
01478 char *p, *res; int len;
01479 while(1) {
01480 p = strchr(input_buffer,'\n');
01481 if (p) break;
01482 input_extend();
01483 }
01484 len = p-input_buffer;
01485 res = input_extract(len+1);
01486 res[len]=0;
01487 return res;
01488 }
01489
01490
01491 char *input_scanf_chars(fmt)
01492 char *fmt;
01493 {
01494 char buf[8192]; int len, pos;
01495 static int fd; static FILE *file;
01496 fflush(stdout);
01497 if (file==0) {
01498 #if CMK_USE_MKSTEMP
01499 char tmp[128];
01500 strcpy(tmp, "/tmp/fnordXXXXXX");
01501 mkstemp(tmp);
01502 #else
01503 char *tmp=tmpnam(NULL);
01504 #endif
01505 unlink(tmp);
01506 fd = open(tmp,O_RDWR | O_CREAT | O_TRUNC, 0664);
01507 if (fd<0) {
01508 fprintf(stderr,"cannot open temp file /tmp/fnord");
01509 exit(1);
01510 }
01511 file = fdopen(fd, "r+");
01512 unlink(tmp);
01513 }
01514 while (1) {
01515 len = strlen(input_buffer);
01516 rewind(file);
01517 fwrite(input_buffer, len, 1, file);
01518 fflush(file);
01519 rewind(file);
01520 ftruncate(fd, len);
01521 fscanf(file, fmt, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf, buf);
01522 pos = ftell(file);
01523 if (pos<len) break;
01524 input_extend();
01525 }
01526 return input_extract(pos);
01527 }
01528
01529
01530
01531
01532
01533
01534
01535 #if CMK_CCS_AVAILABLE
01536
01537
01538
01539
01540
/*
 * Receive one CCS request from the CCS server socket and forward it to the
 * destination processor's control socket as a "req_fw" message.
 *
 * Returns immediately when CcsServer_recvRequest() yields nothing.  Requests
 * that check_stdio_header() claims are not forwarded.  In non-BigSim builds
 * an out-of-range processor index gets an empty reply and is dropped.
 */
void req_ccs_connect(void)
{
const void *bufs[3]; int lens[3];
/* Wire layout of the forwarded message: header followed by the CCS header. */
struct {
ChMessageHeader ch;
CcsImplHeader hdr;
} h;
void *reqData;
int pe,reqBytes;
if (0==CcsServer_recvRequest(&h.hdr,&reqData))
return;
pe=ChMessageInt(h.hdr.pe);
reqBytes=ChMessageInt(h.hdr.len);

/* A destination of -1 is delivered to processor 0. */
if (pe == -1) {

pe = 0;
}
if ((pe<=-nodetab_size || pe>=nodetab_size) && 0==replay_single) {



/* In BigSim builds this branch is empty and the request falls through. */
#if ! CMK_BIGSIM_CHARM
if (pe==-nodetab_size) fprintf(stderr,"Invalid processor index in CCS request: are you trying to do a broadcast instead?");
else fprintf(stderr,"Invalid processor index in CCS request.");
CcsServer_sendReply(&h.hdr,0,0);
free(reqData);
return;
#endif
}
else if (pe < -1) {


/* pe is negative here, so this enlarges reqBytes by |pe| ChMessageInt_t's
   (the size_t product wraps around); the destination is then taken from the
   first integer of the request data. */
reqBytes -= pe * sizeof(ChMessageInt_t);
pe = ChMessageInt(*(ChMessageInt_t*)reqData);
}

if (! check_stdio_header(&h.hdr)) {

/* LOOPBACK is a debugging switch: when set, reply locally with no data
   instead of forwarding. */
#define LOOPBACK 0
#if LOOPBACK
CcsServer_sendReply(&h.hdr,0,0);
#else
int destpe = pe;
#if CMK_BIGSIM_CHARM
destpe = destpe % nodetab_size;
#endif
if (replay_single) destpe = 0;

ChMessageHeader_new("req_fw",sizeof(h.hdr)+reqBytes,&h.ch);

/* Send header and payload in one vectored write. */
bufs[0]=&h; lens[0]=sizeof(h);
bufs[1]=reqData; lens[1]=reqBytes;
skt_sendV(nodetab_ctrlfd(destpe),2,bufs,lens);

#endif
}
free(reqData);
}
01600
01601
01602
01603
01604
01605 int req_ccs_reply_fw(ChMessage *msg,SOCKET srcFd) {
01606 int len=msg->len;
01607
01608
01609 CcsImplHeader hdr;
01610 skt_recvN(srcFd,&hdr,sizeof(hdr)); len-=sizeof(hdr);
01611
01612 #define m (4*1024)
01613 if (len<m || hdr.attr.auth)
01614 {
01615 void *data=malloc(len);
01616 skt_recvN(srcFd,data,len);
01617 CcsServer_sendReply(&hdr,len,data);
01618 free(data);
01619 }
01620 else
01621 {
01622 ChMessageInt_t outLen;
01623 int destFd;
01624 skt_abortFn old=skt_set_abort(reply_abortFn);
01625 int destErrs=0;
01626
01627 destFd=ChMessageInt(hdr.replyFd);
01628 outLen=ChMessageInt_new(len);
01629 skt_sendN(destFd,&outLen,sizeof(outLen));
01630 while(len>0) {
01631 char buf[m];
01632 int r=m; if (r>len) r=len;
01633 skt_recvN(srcFd,buf,r);
01634 if (0==destErrs)
01635 destErrs|=skt_sendN(destFd,buf,r);
01636 len-=m;
01637 #undef m
01638 }
01639 skt_close(destFd);
01640
01641 skt_set_abort(old);
01642 }
01643 return 0;
01644 }
01645
01646 #else
01647 int req_ccs_reply_fw(ChMessage *msg,SOCKET srcFd) {
01648
01649 }
01650 #endif
01651
01652
01653
01654
01655
01656
01657
01658
01659
01660
01663
01664
/* Control sockets served by charmrun; req_nClients entries. */
SOCKET *req_clients;
#ifdef HSTART
/* Sockets of child charmruns, filled in by set_sockets_list(). */
SOCKET *charmrun_fds;
#endif
/* Number of valid entries in req_clients. */
int req_nClients;
/* Number of "ending" requests received so far (see req_handle_ending). */
int req_ending=0;


/* State for the debugger helper process used with arg_charmdebug;
   gdb_info_std[1] and [2] are polled for reading in req_poll(). */
int gdb_info_pid=0;
int gdb_info_std[3];
FILE *gdb_stream=NULL;

/* Status codes returned by the req_* handlers. */
#define REQ_OK 0
#define REQ_FAILED -1
01679
01680 #ifdef HSTART
01681 int req_reply_child(SOCKET fd, char *type,
01682 const char *data, int dataLen)
01683 {
01684
01685 int status = req_reply(fd, type, data, dataLen);
01686 if(status != REQ_OK) return status;
01687 SOCKET clientFd ;
01688 skt_recvN(fd, (const char *)&clientFd, sizeof(SOCKET));
01689 skt_sendN(fd, (const char *)&clientFd, sizeof(fd));
01690 return status;
01691 }
01692 #endif
01693
01696 int req_reply(SOCKET fd, char *type,
01697 const char *data, int dataLen)
01698 {
01699 ChMessageHeader msg;
01700 if (fd == INVALID_SOCKET) return REQ_FAILED;
01701 ChMessageHeader_new(type,dataLen,&msg);
01702 skt_sendN(fd,(const char *)&msg,sizeof(msg));
01703 skt_sendN(fd,data,dataLen);
01704 return REQ_OK;
01705 }
01706
01707
01708
01709
01710
01711
01712
01713 int req_handle_initnode(ChMessage *msg,SOCKET fd)
01714 {
01715 #if CMK_USE_IBVERBS
01716 int i;
01717 ChSingleNodeinfo *nodeInfo = (ChSingleNodeinfo *)msg->data;
01718
01719 if(msg->len != sizeof(ChSingleNodeinfo) + (nodetab_rank0_size-1)*sizeof(ChInfiAddr)){
01720 fprintf(stderr,"Charmrun: Bad initnode data length. Aborting\n");
01721 exit(1);
01722 }
01723 nodeInfo->info.qpList = malloc(sizeof(ChInfiAddr)*(nodetab_rank0_size-1));
01724 memcpy((char *)nodeInfo->info.qpList,&msg->data[sizeof(ChSingleNodeinfo)],sizeof(ChInfiAddr)*(nodetab_rank0_size-1));
01725
01726
01727
01728 #else
01729 if (msg->len!=sizeof(ChSingleNodeinfo)) {
01730 fprintf(stderr,"Charmrun: Bad initnode data length. Aborting\n");
01731 fprintf(stderr,"Charmrun: possibly because: %s.\n", msg->data);
01732 exit(1);
01733 }
01734 #endif
01735 nodeinfo_add((ChSingleNodeinfo *)msg->data,fd);
01736 return REQ_OK;
01737 }
01738
01743 int req_handle_initnodetab(ChMessage *msg,SOCKET fd)
01744 {
01745 ChMessageHeader hdr;
01746 ChMessageInt_t nNodes=ChMessageInt_new(nodetab_rank0_size);
01747 ChMessageHeader_new("initnodetab",sizeof(ChMessageInt_t)+
01748 sizeof(ChNodeinfo)*nodetab_rank0_size,&hdr);
01749 skt_sendN(fd,(const char *)&hdr,sizeof(hdr));
01750 skt_sendN(fd,(const char *)&nNodes,sizeof(nNodes));
01751 skt_sendN(fd,(const char *)nodeinfo_arr,
01752 sizeof(ChNodeinfo)*nodetab_rank0_size);
01753
01754 return REQ_OK;
01755 }
01756
01757 #ifdef HSTART
01758
01759 int req_handle_initnodetab1(ChMessage *msg,SOCKET fd)
01760 {
01761 ChMessageHeader hdr;
01762 ChMessageInt_t nNodes=ChMessageInt_new(nodetab_rank0_size);
01763 ChMessageHeader_new("initnttab",sizeof(ChMessageInt_t)+
01764 sizeof(ChNodeinfo)*nodetab_rank0_size,&hdr);
01765 skt_sendN(fd,(const char *)&hdr,sizeof(hdr));
01766 skt_sendN(fd,(const char *)&nNodes,sizeof(nNodes));
01767 skt_sendN(fd,(const char *)nodeinfo_arr,
01768 sizeof(ChNodeinfo)*nodetab_rank0_size);
01769
01770 return REQ_OK;
01771 }
01772
01773
01774
01775 static int parent_charmrun_fd = -1;
01776 int req_handle_initnodedistribution(ChMessage *msg,SOCKET fd, int client)
01777 {
01778 int nodes_to_fork = nodes_per_child;
01779 int rank0_start = nodetab_unique_table[client*nodes_per_child];
01780 int rank0_finish;
01781 if(client == branchfactor -1)
01782 {
01783 nodes_to_fork = nodetab_unique_size- client*nodes_per_child;
01784 rank0_finish = nodetab_rank0_size;
01785 }
01786 else
01787 rank0_finish = nodetab_unique_table[client*nodes_per_child + nodes_to_fork];
01788 int k;
01789 ChMessageInt_t * nodemsg = (ChMessageInt_t *)malloc((rank0_finish - rank0_start)*sizeof(ChMessageInt_t));
01790 for(k =0; k <rank0_finish- rank0_start; k++)
01791 nodemsg[k] = ChMessageInt_new(nodetab_rank0_table[rank0_start+k]);
01792 ChMessageHeader hdr;
01793 ChMessageInt_t nNodes=ChMessageInt_new(rank0_finish- rank0_start);
01794 ChMessageInt_t nTotalNodes=ChMessageInt_new(nodetab_rank0_size);
01795 ChMessageHeader_new("initnodetab",sizeof(ChMessageInt_t)*2+
01796 sizeof(ChMessageInt_t)*(rank0_finish- rank0_start),&hdr);
01797 skt_sendN(fd,(const char *)&hdr,sizeof(hdr));
01798 skt_sendN(fd,(const char *)&nNodes,sizeof(nNodes));
01799 skt_sendN(fd,(const char *)&nTotalNodes,sizeof(nTotalNodes));
01800 skt_sendN(fd,(const char *)nodemsg,(rank0_finish- rank0_start)*sizeof(ChMessageInt_t));
01801 free(nodemsg);
01802 return REQ_OK;
01803 }
01804
01805 ChSingleNodeinfo * myNodesInfo;
01806 int send_myNodeInfo_to_parent()
01807 {
01808 ChMessageHeader hdr;
01809 ChMessageInt_t nNodes=ChMessageInt_new(nodetab_rank0_size);
01810 ChMessageHeader_new("initnodetab",sizeof(ChMessageInt_t)+
01811 sizeof(ChSingleNodeinfo)*nodetab_rank0_size,&hdr);
01812 skt_sendN(parent_charmrun_fd,(const char *)&hdr,sizeof(hdr));
01813 skt_sendN(parent_charmrun_fd,(const char *)&nNodes,sizeof(nNodes));
01814 skt_sendN(parent_charmrun_fd,(const char *)myNodesInfo,
01815 sizeof(ChSingleNodeinfo)*nodetab_rank0_size);
01816
01817 return REQ_OK;
01818 }
01819 void forward_nodetab_to_children()
01820 {
01821
01822 if (!skt_select1(parent_charmrun_fd,1200*1000)){
01823 exit(0);
01824 }
01825 ChMessage msg;
01826 ChMessage_recv(parent_charmrun_fd,&msg);
01827
01828 ChMessageInt_t * nodelistmsg = (ChMessageInt_t *)msg.data;
01829 int nodetab_Nodes = ChMessageInt(nodelistmsg[0]);
01830 int client;
01831 for (client=0;client<nodetab_rank0_size;client++) {
01832 SOCKET fd = req_clients[client];
01833 ChMessageHeader hdr;
01834 ChMessageInt_t nNodes=ChMessageInt_new(nodetab_Nodes);
01835 ChMessageHeader_new("initnodetab",sizeof(ChMessageInt_t)+
01836 sizeof(ChNodeinfo)*nodetab_Nodes,&hdr);
01837 skt_sendN(fd,(const char *)&hdr,sizeof(hdr));
01838 skt_sendN(fd,(const char *)&nNodes,sizeof(nNodes));
01839 skt_sendN(fd,(const char *)(nodelistmsg+1),
01840 sizeof(ChNodeinfo)*nodetab_Nodes);
01841 }
01842 }
01843
01844 void receive_nodeset_from_child(ChMessage *msg, SOCKET fd)
01845 {
01846 ChMessageInt_t * n32 = (ChMessageInt_t *)msg->data;
01847 int numOfNodes =ChMessageInt(n32[0]);
01848 ChSingleNodeinfo *childNodeInfo = (ChSingleNodeinfo*) (n32+1);
01849 int k;
01850 for(k = 0; k<numOfNodes; k++)
01851 nodeinfo_add(childNodeInfo+k,fd);
01852 }
01853
01854 void set_sockets_list(ChMessage *msg, SOCKET fd)
01855 {
01856 ChMessageInt_t * n32 = (ChMessageInt_t *)msg->data;
01857 int node_start =ChMessageInt(n32[0]);
01858 charmrun_fds[node_start/nodes_per_child] = fd;
01859 }
01860 #endif
01861
/*
 * Report a failed stdio call: when err is negative, print a warning through
 * perror() -- but only the first time this ever happens.
 */
static void checkPrintfError(int err) {
  static int warned=0;

  if (err >= 0 || warned)
    return;
  perror("charmrun WARNING> error in printf");
  warned=1;
}
01871
01872 int req_handle_print(ChMessage *msg,SOCKET fd)
01873 {
01874 checkPrintfError(printf("%s",msg->data));
01875 checkPrintfError(fflush(stdout));
01876 write_stdio_duplicate(msg->data);
01877 return REQ_OK;
01878 }
01879
01880
01881 int req_handle_printerr(ChMessage *msg,SOCKET fd)
01882 {
01883 fprintf(stderr,"%s",msg->data);
01884 fflush(stderr);
01885 write_stdio_duplicate(msg->data);
01886 return REQ_OK;
01887 }
01888
01889
01890 int req_handle_printsyn(ChMessage *msg,SOCKET fd)
01891 {
01892 checkPrintfError(printf("%s",msg->data));
01893 checkPrintfError(fflush(stdout));
01894 write_stdio_duplicate(msg->data);
01895 #ifdef HSTART
01896 if(arg_hierarchical_start)
01897 req_reply_child(fd, "printdone", "", 1);
01898 else
01899 #endif
01900 req_reply(fd, "printdone", "", 1);
01901 return REQ_OK;
01902 }
01903
01904
01905 int req_handle_printerrsyn(ChMessage *msg,SOCKET fd)
01906 {
01907 fprintf(stderr,"%s",msg->data);
01908 fflush(stderr);
01909 write_stdio_duplicate(msg->data);
01910 #ifdef HSTART
01911 if(arg_hierarchical_start)
01912 req_reply_child(fd, "printdone", "", 1);
01913 else
01914 #endif
01915 req_reply(fd, "printdone", "", 1);
01916 return REQ_OK;
01917 }
01918
01919
01920 int req_handle_ending(ChMessage *msg,SOCKET fd)
01921 {
01922 int i;
01923 req_ending++;
01924
01925 #if (!defined(_FAULT_MLOG_) && !defined(_FAULT_CAUSAL_))
01926 if (req_ending == nodetab_size)
01927 #else
01928 if(req_ending == arg_requested_pes)
01929 #endif
01930 {
01931 for (i=0;i<req_nClients;i++)
01932 skt_close(req_clients[i]);
01933 if (arg_verbose) printf("Charmrun> Graceful exit.\n");
01934 exit(0);
01935 }
01936 return REQ_OK;
01937 }
01938
01939 int req_handle_barrier(ChMessage *msg,SOCKET fd)
01940 {
01941 int i;
01942 static int barrier_count = 0;
01943 static int barrier_phase = 0;
01944 barrier_count ++;
01945 #ifdef HSTART
01946 if (barrier_count == arg_requested_pes)
01947 #else
01948 if (barrier_count == req_nClients)
01949 #endif
01950 {
01951 barrier_count = 0;
01952 barrier_phase ++;
01953 for (i=0;i<req_nClients;i++)
01954 if (REQ_OK != req_reply(req_clients[i], "barrier", "", 1))
01955 {
01956 fprintf(stderr, "req_handle_barrier socket error: %d\n", i);
01957 abort();
01958 }
01959 }
01960 return REQ_OK;
01961 }
01962
01963 int req_handle_barrier0(ChMessage *msg,SOCKET fd)
01964 {
01965 int i;
01966 static int count = 0;
01967 static SOCKET fd0;
01968 int pe = atoi(msg->data);
01969 if (pe == 0) fd0 = fd;
01970 count ++;
01971 #ifdef HSTART
01972 if (count == arg_requested_pes)
01973 #else
01974 if (count == req_nClients)
01975 #endif
01976 {
01977 req_reply(fd0, "barrier0", "", 1);
01978 count = 0;
01979 }
01980 return REQ_OK;
01981 }
01982
01983
01984 int req_handle_abort(ChMessage *msg,SOCKET fd)
01985 {
01986
01987 if (msg->len==0)
01988 fprintf(stderr,"Aborting!\n");
01989 else
01990 fprintf(stderr, "%s\n", msg->data);
01991 exit(1);
01992 }
01993
01994 int req_handle_scanf(ChMessage *msg,SOCKET fd)
01995 {
01996 char *fmt, *res, *p;
01997
01998 fmt = msg->data;
01999 fmt[msg->len-1]=0;
02000 res = input_scanf_chars(fmt);
02001 p = res; while (*p) { if (*p=='\n') *p=' '; p++; }
02002 #ifdef HSTART
02003 if(arg_hierarchical_start)
02004 req_reply_child(fd, "scanf-data", res, strlen(res)+1);
02005 else
02006 #endif
02007 req_reply(fd, "scanf-data", res, strlen(res)+1);
02008 free(res);
02009 return REQ_OK;
02010 }
02011
02012 #ifdef __FAULT__
/* Fault-tolerance helpers defined elsewhere in this file. */
void restart_node(int crashed_node);
void reconnect_crashed_client(int socket_index,int crashed_node);
void announce_crash(int socket_index,int crashed_node);

/* Node number recorded for the crash being handled; reset to 0 once the
   crash has been acknowledged (see req_handle_crashack). */
static int _last_crash = 0;
/* Index into req_clients of the socket that receives the node table once
   all crash acknowledgements have arrived. */
static int _crash_socket_index = 0;
#ifdef HSTART
/* Index into req_clients of the child charmrun that reported the crash
   (set in req_handle_crash). */
static int _crash_socket_charmrun_index = 0;
/* PE numbers forwarded to the parent charmrun with an "initnode" message
   (see req_forward_root). */
int crashed_pe_id;
int restarted_pe_id;
#endif
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
static int numCrashes=0;
/* Socket whose receive errors req_handler_dispatch() ignores; -1 when none. */
static SOCKET last_crashed_fd=-1;
#endif
02028
02033 int req_handle_crashack(ChMessage *msg,SOCKET fd)
02034 {
02035 static int count = 0;
02036 count ++;
02037 #ifdef HSTART
02038 if(arg_hierarchical_start)
02039 {
02040 if (count == nodetab_rank0_size-1) {
02041
02042
02043 printf("Charmrun> continue node: %d\n", _last_crash);
02044 req_handle_initnodetab1(NULL,req_clients[_crash_socket_charmrun_index]);
02045 _last_crash = 0;
02046 count = 0;
02047 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
02048 last_crashed_fd=-1;
02049 #endif
02050 }
02051 }
02052
02053 else
02054
02055 #endif
02056 if (count == req_nClients-1) {
02057
02058 printf("Charmrun> continue node: %d\n", _last_crash);
02059 req_handle_initnodetab(NULL,req_clients[_crash_socket_index]);
02060 _last_crash = 0;
02061 count = 0;
02062 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
02063 last_crashed_fd=-1;
02064 #endif
02065 }
02066 }
02067
02068 #ifdef HSTART
02069
02070 int set_crashed_socket_id(ChMessage *msg,SOCKET fd)
02071 {
02072 ChSingleNodeinfo *nodeInfo = (ChSingleNodeinfo *)msg->data;
02073 int nt=nodetab_rank0_table[ChMessageInt(nodeInfo->nodeNo)-mynodes_start];
02074 nodeInfo->nodeNo = ChMessageInt_new(nt);
02075
02076
02077 int pe;
02078 for (pe=0;pe<nodetab_cpus(nt);pe++)
02079 {
02080 nodetab_table[nt+pe]->ctrlfd=fd;
02081 }
02082 }
02083
02084
02085 int req_handle_crash(ChMessage *msg,SOCKET fd)
02086 {
02087
02088 ChMessageInt_t oldpe, newpe;
02089 skt_recvN(fd, (const char *)&oldpe, sizeof(oldpe));
02090 skt_recvN(fd, (const char *)&newpe, sizeof(newpe));
02091 *nodetab_table[ChMessageInt(oldpe)] = *nodetab_table[ChMessageInt(newpe)];
02092
02093 int status = req_handle_initnode(msg,fd);
02094 int i;
02095 for(i=0;i<req_nClients;i++){
02096 if(req_clients[i] == fd){
02097 break;
02098 }
02099 }
02100 _crash_socket_charmrun_index = i;
02101
02102 fprintf(stdout,"Root charmrun : Socket %d failed %d\n",fd, _crash_socket_charmrun_index);
02103 fflush(stdout);
02104 ChSingleNodeinfo *nodeInfo = (ChSingleNodeinfo *)msg->data;
02105 int crashed_node = ChMessageInt(nodeInfo->nodeNo);
02106 _last_crash = crashed_node;
02107 switch (status)
02108 {
02109 case REQ_OK: break;
02110 case REQ_FAILED:
02111 return REQ_FAILED;
02112 }
02113
02114
02115 int client;
02116 for (client=0;client<req_nClients;client++) {
02117 req_handle_initnodetab(NULL,req_clients[client]);
02118 }
02119
02120
02121 announce_crash(nodetab_rank0_size+1,crashed_node );
02122
02123 }
02124
02125 #endif
02126 #endif
02127
02128 #ifdef __FAULT__
02129 void error_in_req_serve_client(SOCKET fd){
02130 SOCKET * new_req_clients=(SOCKET *)malloc((req_nClients-1)*sizeof(SOCKET));
02131 int count=0,i;
02132 int crashed_node,crashed_pe,node_index,socket_index;
02133 fprintf(stdout,"Socket %d failed \n",fd);
02134
02135
02136 #ifdef HSTART
02137 if(arg_hierarchical_start)
02138 {
02139 for(i=mynodes_start;i<mynodes_start+nodetab_rank0_size;i++){
02140 if(nodetab_ctrlfd(i) == fd){
02141 break;
02142 }
02143 }
02144 }
02145
02146 else
02147 #endif
02148 for(i=0;i<nodetab_max;i++){
02149 if(nodetab_ctrlfd(i) == fd){
02150 break;
02151 }
02152 }
02153
02154 fflush(stdout);
02155 #if (!defined(_FAULT_MLOG_) && !defined(_FAULT_CAUSAL_))
02156 skt_close(fd);
02157 #endif
02158 crashed_pe = i;
02159 node_index = i-nodetab_rank(crashed_pe);
02160 for(i=0;i<nodetab_rank0_size;i++){
02161 if(node_index == nodetab_rank0_table[i]){
02162 break;
02163 }
02164 }
02165 crashed_node = i;
02166
02168
02169 restart_node(crashed_node);
02170
02171 fprintf(stdout,"charmrun says Processor %d failed on Node %d\n",crashed_pe,crashed_node);
02176 for(i=0;i<req_nClients;i++){
02177 if(req_clients[i] == fd){
02178 break;
02179 }
02180 }
02181 socket_index = i;
02182 reconnect_crashed_client(socket_index,crashed_node);
02183 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
02184 skt_close(fd);
02185 #endif
02186 }
02187 #endif
02188
/*
 * Route a request whose header has already been received to its handler,
 * selected by the message type string.
 *
 * msg     - message with a valid header; its data is read here.
 * replyFd - socket the request arrived on (also used for replies).
 *
 * Returns the handler's status.  Unknown request types abort the process
 * unless __FAULT__ is defined, in which case they are ignored and REQ_OK is
 * returned.
 */
int req_handler_dispatch(ChMessage *msg,SOCKET replyFd)
{
char *cmd=msg->header.type;
int recv_status;
DEBUGF(("Got request '%s'\n",cmd,replyFd));
/* "reply_fw" reads its own payload from the socket, so it is dispatched
   before the generic data receive below. */
#if CMK_CCS_AVAILABLE
if (strcmp(cmd,"reply_fw")==0) return req_ccs_reply_fw(msg,replyFd);
#endif


recv_status = ChMessageData_recv(replyFd,msg);
/* In fault-tolerant builds a receive error triggers crash handling; with
   HSTART the check below only applies when hierarchical start is off (the
   "if" binds to the statement that follows it). */
#ifdef __FAULT__
#ifdef HSTART
if(!arg_hierarchical_start)
#endif
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
if(recv_status < 0){
/* Errors on the socket recorded in last_crashed_fd are ignored. */
if(replyFd == last_crashed_fd){
return REQ_OK;
}
DEBUGF(("recv_status %d on socket %d \n",recv_status,replyFd));
error_in_req_serve_client(replyFd);
}
#else
if(recv_status < 0) error_in_req_serve_client(replyFd);
#endif
#endif

if (strcmp(cmd,"ping")==0) return REQ_OK;
else if (strcmp(cmd,"print")==0) return req_handle_print(msg,replyFd);
else if (strcmp(cmd,"printerr")==0) return req_handle_printerr(msg,replyFd);
else if (strcmp(cmd,"printsyn")==0) return req_handle_printsyn(msg,replyFd);
else if (strcmp(cmd,"printerrsyn")==0) return req_handle_printerrsyn(msg,replyFd);
else if (strcmp(cmd,"scanf")==0) return req_handle_scanf(msg,replyFd);
else if (strcmp(cmd,"barrier")==0) return req_handle_barrier(msg,replyFd);
else if (strcmp(cmd,"barrier0")==0) return req_handle_barrier0(msg,replyFd);
else if (strcmp(cmd,"ending")==0) return req_handle_ending(msg,replyFd);
else if (strcmp(cmd,"abort")==0) return req_handle_abort(msg,replyFd);
#ifdef __FAULT__
else if (strcmp(cmd,"crash_ack")==0) return req_handle_crashack(msg,replyFd);
#ifdef HSTART
/* With HSTART an "initnode" arriving here is handled by req_handle_crash(). */
else if (strcmp(cmd,"initnode")==0) return req_handle_crash(msg,replyFd);
#endif
#endif
else {
#ifndef __FAULT__
fprintf(stderr,"Charmrun> Bad control socket request '%s'\n",cmd);
abort();
return REQ_OK;
#endif
}
return REQ_OK;
}
02242
02243 void req_serve_client(SOCKET fd)
02244 {
02245 int recv_status;
02246 int status;
02247 ChMessage msg;
02248 DEBUGF(("Getting message from client...\n"));
02249 recv_status = ChMessageHeader_recv(fd,&msg);
02250 #ifdef __FAULT__
02251 #ifdef HSTART
02252 if(!arg_hierarchical_start && recv_status < 0) error_in_req_serve_client(fd);
02253 #else
02254 if(recv_status < 0) error_in_req_serve_client(fd);
02255 #endif
02256 #endif
02257
02258 DEBUGF(("Message is '%s'\n",msg.header.type));
02259 status = req_handler_dispatch(&msg,fd);
02260 switch (status)
02261 {
02262 case REQ_OK: break;
02263 case REQ_FAILED:
02264 fprintf(stderr,"Charmrun> Error processing control socket request %s\n",msg.header.type);
02265 abort();
02266 break;
02267 }
02268 ChMessage_free(&msg);
02269 }
02270
02271 #ifdef HSTART
02272 void req_forward_root(SOCKET fd)
02273 {
02274 int recv_status;
02275 int status;
02276 ChMessage msg;
02277 recv_status = ChMessage_recv(fd,&msg);
02278
02279 char *cmd=msg.header.type;
02280
02281 #ifdef __FAULT__
02282 if(recv_status < 0)
02283 {
02284 error_in_req_serve_client(fd);
02285 return;
02286 }
02287
02288
02289 if (strcmp(cmd,"initnode")==0)
02290 {
02291 set_crashed_socket_id(&msg,fd);
02292 }
02293 #endif
02294
02295 if (strcmp(cmd,"ping")!=0)
02296 {
02297 status = req_reply(parent_charmrun_fd, cmd, msg.data,ChMessageInt(msg.header.len));
02298
02299 if (strcmp(cmd,"scanf")==0 || strcmp(cmd,"printsyn")==0 || strcmp(cmd,"printerrsyn")==0)
02300 skt_sendN(parent_charmrun_fd,(const char *)&fd, sizeof(fd));
02301
02302 #ifdef __FAULT__
02303 if (strcmp(cmd,"initnode")==0)
02304 {
02305 ChMessageInt_t oldpe=ChMessageInt_new(crashed_pe_id);
02306 ChMessageInt_t newpe=ChMessageInt_new(restarted_pe_id);
02307 skt_sendN(parent_charmrun_fd,(const char *)&oldpe, sizeof(oldpe));
02308 skt_sendN(parent_charmrun_fd,(const char *)&newpe, sizeof(newpe));
02309 }
02310 #endif
02311 }
02312
02313
02314 switch (status)
02315 {
02316 case REQ_OK: break;
02317 case REQ_FAILED:
02318 abort();
02319 break;
02320 }
02321 ChMessage_free(&msg);
02322 }
02323
02324 void req_forward_client()
02325 {
02326 int recv_status;
02327 int status;
02328 ChMessage msg;
02329 recv_status = ChMessage_recv(parent_charmrun_fd,&msg);
02330 if(recv_status < 0)
02331 {
02332 int i;
02333 for (i=0;i<req_nClients;i++)
02334 skt_close(req_clients[i]);
02335 exit(0);
02336 }
02337
02338 char *cmd=msg.header.type;
02339
02340 if(strcmp(cmd, "barrier") ==0){
02341 int i;
02342 for (i=0;i<req_nClients;i++)
02343 if (REQ_OK != req_reply(req_clients[i],cmd,msg.data,ChMessageInt(msg.header.len)))
02344 {
02345 abort();
02346 }
02347 return;
02348 }
02349 #ifdef __FAULT__
02350 if(strcmp(cmd, "initnodetab") ==0){
02351 if(_last_crash ==0 )
02352 current_restart_phase++;
02353 int i;
02354 for (i=0;i<req_nClients;i++)
02355 if(_last_crash==0 || i !=_crash_socket_index)
02356 if (REQ_OK != req_reply(req_clients[i],cmd,msg.data,ChMessageInt(msg.header.len))){
02357 abort();
02358 }
02359 return;
02360 }
02361
02362 if(strcmp(cmd, "crashnode") ==0){
02363
02364 int i;
02365 for (i=0;i<req_nClients;i++)
02366 if(_last_crash==0 || i !=_crash_socket_index)
02367 if (REQ_OK != req_reply(req_clients[i],cmd,msg.data,ChMessageInt(msg.header.len)))
02368 {
02369 abort();
02370 }
02371 return;
02372 }
02373 if(strcmp(cmd, "initnttab") ==0){
02374 _last_crash = 0;
02375 if (REQ_OK != req_reply(req_clients[_crash_socket_index],"initnodetab",msg.data,ChMessageInt(msg.header.len)))
02376 {
02377 abort();
02378 }
02379 return;
02380 }
02381
02382 #endif
02383
02384 SOCKET fd;
02385
02386
02387 if(strcmp(cmd,"req_fw") ==0)
02388 {
02389 CcsImplHeader * hdr =(CcsImplHeader *)msg.data;
02390 int pe=ChMessageInt(hdr->pe);
02391 fd = nodetab_table[pe]->ctrlfd;
02392 }
02393 else if(strcmp(cmd, "barrier0") ==0)
02394 {
02395 fd = nodetab_table[0]->ctrlfd;
02396 }
02397 else
02398 skt_recvN(parent_charmrun_fd, (const char *)&fd,sizeof(SOCKET));
02399
02400 status = req_reply(fd, cmd, msg.data,ChMessageInt(msg.header.len));
02401
02402 switch (status)
02403 {
02404 case REQ_OK: break;
02405 case REQ_FAILED:
02406 abort();
02407 break;
02408 }
02409 ChMessage_free(&msg);
02410 }
02411
02412 #endif
02413
/*
 * Socket abort handler used once an error is already being handled.
 * Without __FAULT__ it exits the process with status 2; in fault-tolerant
 * builds it returns -1.  Both arguments are ignored.
 */
int ignore_socket_errors(int c,const char *m)
{
#ifdef __FAULT__
  return -1;
#else
  exit(2);
  return -1;   /* not reached */
#endif
}
02422
02423
02424
02425
02426 int socket_error_in_poll(int code,const char *msg)
02427 {
02428
02429
02430
02431 int i;
02432 skt_set_abort(ignore_socket_errors);
02433 fprintf(stderr,"Charmrun: error on request socket--\n"
02434 "%s\n",msg);
02435 #ifndef __FAULT__
02436 for (i=0;i<req_nClients;i++)
02437 skt_close(req_clients[i]);
02438 exit(1);
02439 #endif
02440 ftTimer = GetClock();
02441 return -1;
02442 }
02443
/*
 * CMK_PIPE_* : a small wrapper that lets the same calling code wait on a set
 * of descriptors with either poll() or select().
 *
 *   CMK_PIPE_DECL(maxn,delayMs)  declare the state in the current scope
 *   CMK_PIPE_SUB / CMK_PIPE_PARAM  pass that state to / receive it in a helper
 *   CMK_PIPE_ADDREAD/ADDWRITE(fd)  register a descriptor before the call
 *   CMK_PIPE_CALL()                perform the wait
 *   CMK_PIPE_CHECKREAD/CHECKWRITE(fd)  test a descriptor after the call
 */
#if CMK_USE_POLL
/* poll() variant.  The pollfd array is a function-local static that is
   allocated on first use with room for maxn entries and never freed. */
# define CMK_PIPE_DECL(maxn,delayMs) \
static struct pollfd *fds = NULL; \
int nFds_sto=0; int *nFds=&nFds_sto; \
int pollDelayMs=delayMs; \
if (fds == NULL) fds = (struct pollfd *)malloc((maxn) * sizeof(struct pollfd));
# define CMK_PIPE_SUB fds,nFds
/* Expands to two statements: the poll() call, then a reset of the entry
   counter so the CHECK macros walk the array from the start. */
# define CMK_PIPE_CALL() poll(fds, *nFds, pollDelayMs); *nFds=0

# define CMK_PIPE_PARAM struct pollfd *fds,int *nFds
# define CMK_PIPE_ADDREAD(rd_fd) \
do {fds[*nFds].fd=rd_fd; fds[*nFds].events=POLLIN; (*nFds)++;} while(0)
# define CMK_PIPE_ADDWRITE(wr_fd) \
do {fds[*nFds].fd=wr_fd; fds[*nFds].events=POLLOUT; (*nFds)++;} while(0)
/* The CHECK macros ignore their argument and consume the next array entry,
   so checks must be made in the same order the descriptors were added. */
# define CMK_PIPE_CHECKREAD(rd_fd) fds[(*nFds)++].revents&POLLIN
# define CMK_PIPE_CHECKWRITE(wr_fd) fds[(*nFds)++].revents&POLLOUT

#else

/* select() variant.  maxn is unused; the timeout is split into seconds and
   microseconds. */
# define CMK_PIPE_DECL(maxn, delayMs) \
fd_set rfds_sto,wfds_sto;\
int nFds=0; \
fd_set *rfds=&rfds_sto,*wfds=&wfds_sto; struct timeval tmo; \
FD_ZERO(rfds); FD_ZERO(wfds); \
tmo.tv_sec=delayMs/1000; tmo.tv_usec=1000*(delayMs%1000);
# define CMK_PIPE_SUB rfds,wfds
/* NOTE(review): only the read set is passed to select(); descriptors added
   with CMK_PIPE_ADDWRITE are not waited on. */
# define CMK_PIPE_CALL() select(FD_SETSIZE, rfds, 0, 0, &tmo)

# define CMK_PIPE_PARAM fd_set *rfds,fd_set *wfds
/* NOTE(review): this expands to a bare { } block rather than do/while(0),
   so it is unsafe directly before an "else". */
# define CMK_PIPE_ADDREAD(rd_fd) { assert(nFds<FD_SETSIZE);FD_SET(rd_fd,rfds); nFds++; }
# define CMK_PIPE_ADDWRITE(wr_fd) FD_SET(wr_fd,wfds)
# define CMK_PIPE_CHECKREAD(rd_fd) FD_ISSET(rd_fd,rfds)
# define CMK_PIPE_CHECKWRITE(wr_fd) FD_ISSET(wr_fd,wfds)
#endif
02478
02479
02480
02481
02482
02483 void req_poll()
02484 {
02485 int status,i;
02486 int readcount;
02487
02488 CMK_PIPE_DECL(req_nClients+5, 1000);
02489 for (i=0;i<req_nClients;i++)
02490 CMK_PIPE_ADDREAD(req_clients[i]);
02491 if (CcsServer_fd()!=INVALID_SOCKET) CMK_PIPE_ADDREAD(CcsServer_fd());
02492 if (arg_charmdebug) {
02493 CMK_PIPE_ADDREAD(0);
02494 CMK_PIPE_ADDREAD(gdb_info_std[1]);
02495 CMK_PIPE_ADDREAD(gdb_info_std[2]);
02496 }
02497
02498 skt_set_abort(socket_error_in_poll);
02499
02500 DEBUGF(("Req_poll: Calling select...\n"));
02501 status=CMK_PIPE_CALL();
02502 DEBUGF(("Req_poll: Select returned %d...\n",status));
02503
02504 if (status==0) return;
02505
02506 if (status<0){
02507 if (errno == EINTR || errno == EAGAIN) return;
02508 fflush(stdout);
02509 fflush(stderr);
02510 socket_error_in_poll(1359,"Node program terminated unexpectedly!\n");
02511 }
02512 for (i=0;i<req_nClients;i++)
02513 if (CMK_PIPE_CHECKREAD(req_clients[i]))
02514 {
02515 readcount=10;
02516
02517 do { req_serve_client(req_clients[i]); readcount--;}
02518 while (1==skt_select1(req_clients[i],0) && readcount>0);
02519 }
02520
02521 if (CcsServer_fd()!=INVALID_SOCKET)
02522 if (CMK_PIPE_CHECKREAD(CcsServer_fd())) {
02523 DEBUGF(("Activity on CCS server port...\n"));
02524 req_ccs_connect();
02525 }
02526
02527 if (arg_charmdebug) {
02528 char buf[2048];
02529 if (CMK_PIPE_CHECKREAD(0)) {
02530 int indata = read(0, buf, 5);
02531 buf[indata] = 0;
02532 if (indata < 5) fprintf(stderr,"Error reading command (%s)\n",buf);
02533 if (strncmp(buf,"info:",5)==0) {
02534
02535 char c;
02536 int num=0;
02537
02538 while (read(0, &c, 1)!=-1) {
02539 buf[num++]=c;
02540 if (c=='\n' || num >= 2045) {
02541 write(gdb_info_std[0], buf, num);
02542 if (c=='\n') break;
02543 }
02544 }
02545 }
02546
02547 }
02548
02549
02550
02551
02552
02553 if (CMK_PIPE_CHECKREAD(gdb_info_std[2])) {
02554 int indata = read(gdb_info_std[2], buf, 100);
02555
02556 if (indata > 0) {
02557 buf[indata] = 0;
02558
02559
02560
02561 fflush(gdb_stream);
02562 }
02563 } else if (CMK_PIPE_CHECKREAD(gdb_info_std[1])) {
02564 int indata = read(gdb_info_std[1], buf, 100);
02565
02566 if (indata > 0) {
02567 buf[indata] = 0;
02568
02569
02570 fprintf(gdb_stream,"%s",buf);
02571 fflush(gdb_stream);
02572 }
02573 }
02574 }
02575 }
02576
02577 #ifdef HSTART
02578 void req_poll_hierarchical()
02579 {
02580 int status,i;
02581 fd_set rfds;
02582 struct timeval tmo;
02583 int readcount;
02584
02585 skt_set_abort(socket_error_in_poll);
02586
02587 tmo.tv_sec = 1;
02588 tmo.tv_usec = 0;
02589 FD_ZERO(&rfds);
02590 for (i=0;i<req_nClients;i++)
02591 FD_SET(req_clients[i],&rfds);
02592 if (CcsServer_fd()!=INVALID_SOCKET) FD_SET(CcsServer_fd(),&rfds);
02593 if (arg_charmdebug) {
02594 FD_SET(0, &rfds);
02595 FD_SET(gdb_info_std[1], &rfds);
02596 FD_SET(gdb_info_std[2], &rfds);
02597 }
02598
02599 if(arg_child_charmrun)
02600 FD_SET(parent_charmrun_fd,&rfds);
02601 DEBUGF(("Req_poll: Calling select...\n"));
02602 status=select(FD_SETSIZE, &rfds, 0, 0, &tmo);
02603 DEBUGF(("Req_poll: Select returned %d...\n",status));
02604
02605 if (status==0) return;
02606 if (status<0){
02607 fflush(stdout);
02608 fflush(stderr);
02609 socket_error_in_poll(1359,"Node program terminated unexpectedly!\n");
02610 }
02611 for (i=0;i<req_nClients;i++)
02612 if (FD_ISSET(req_clients[i],&rfds))
02613 {
02614 readcount=10;
02615
02616 do {
02617 if(arg_child_charmrun)
02618 req_forward_root(req_clients[i]);
02619 else
02620 req_serve_client(req_clients[i]);
02621 readcount--;
02622 }
02623 while (1==skt_select1(req_clients[i],0) && readcount>0);
02624 }
02625
02626
02627 if(arg_child_charmrun)
02628
02629 if (FD_ISSET(parent_charmrun_fd,&rfds))
02630 {
02631 readcount=10;
02632 do{
02633 req_forward_client();
02634 readcount--;
02635 }
02636 while (1==skt_select1(parent_charmrun_fd,0) && readcount>0);
02637 }
02638
02639
02640 if (CcsServer_fd()!=INVALID_SOCKET)
02641 if (FD_ISSET(CcsServer_fd(),&rfds)) {
02642 DEBUGF(("Activity on CCS server port...\n"));
02643 req_ccs_connect();
02644 }
02645
02646 if (arg_charmdebug) {
02647 char buf[2048];
02648 if (FD_ISSET(0, &rfds)) {
02649 int indata = read(0, buf, 5);
02650 buf[indata] = 0;
02651 if (indata < 5) fprintf(stderr,"Error reading command (%s)\n",buf);
02652 if (strncmp(buf,"info:",5)==0) {
02653
02654 char c;
02655 int num=0;
02656
02657 while (read(0, &c, 1)!=-1) {
02658 buf[num++]=c;
02659 if (c=='\n' || num >= 2045) {
02660 write(gdb_info_std[0], buf, num);
02661 if (c=='\n') break;
02662 }
02663 }
02664 }
02665
02666 }
02667
02668
02669
02670
02671
02672 if (FD_ISSET(gdb_info_std[2], &rfds)) {
02673 int indata = read(gdb_info_std[2], buf, 100);
02674
02675 if (indata > 0) {
02676 buf[indata] = 0;
02677
02678
02679
02680 fflush(gdb_stream);
02681 }
02682 } else if (FD_ISSET(gdb_info_std[1], &rfds)) {
02683 int indata = read(gdb_info_std[1], buf, 100);
02684
02685 if (indata > 0) {
02686 buf[indata] = 0;
02687
02688
02689 fprintf(gdb_stream,"%s",buf);
02690 fflush(gdb_stream);
02691 }
02692 }
02693 }
02694 }
02695 #endif
02696
02697 static unsigned int server_port;
02698 static char server_addr[1024];
02699 static SOCKET server_fd;
02700
02701 #ifdef HSTART
02702 static skt_ip_t parent_charmrun_IP;
02703 static int parent_charmrun_port;
02704 static int parent_charmrun_pid;
02705 static int dataport;
02706 static SOCKET dataskt;
02707 int charmrun_phase =0;
02708 #endif
02709
/*
 * Abort handler used while node programs are connecting: prints `code` and
 * `msg` on stderr and terminates charmrun with exit status 1.
 * The trailing return is never reached; it only satisfies the callback's
 * int return type.
 */
int client_connect_problem(int code, const char *msg)
{
  fprintf(stderr, "Charmrun> error %d attaching to node:\n%s\n", code, msg);
  exit(1);
  return -1;
}
02718
02720 int errorcheck_one_client_connect(int client){
02721 #ifdef HSTART
02722
02723 if(arg_hierarchical_start && !arg_child_charmrun && charmrun_phase ==1)
02724 return 1;
02725 #endif
02726 unsigned int clientPort;
02727 skt_ip_t clientIP;
02728 if (arg_verbose) printf("Charmrun> Waiting for %d-th client to connect.\n",client);
02729 if (0==skt_select1(server_fd,arg_timeout*1000))
02730 client_connect_problem(client,"Timeout waiting for node-program to connect");
02731
02732
02733 req_clients[client]=skt_accept(server_fd,&clientIP,&clientPort);
02734
02735 if (req_clients[client]==SOCKET_ERROR)
02736 client_connect_problem(client,"Failure in node accept");
02737
02738 skt_tcp_no_nagle(req_clients[client]);
02739
02740 return 1;
02741 };
02742
02743
02744 #if CMK_C_INLINE
02745 inline static
02746 #endif
02747 void read_initnode_one_client(int client){
02748 ChMessage msg;
02749 if (!skt_select1(req_clients[client],arg_timeout*1000))
02750 client_connect_problem(client,"Timeout on IP request");
02751 ChMessage_recv(req_clients[client],&msg);
02752 req_handle_initnode(&msg,req_clients[client]);
02753 ChMessage_free(&msg);
02754 }
02755
02756
02757 #if CMK_IBVERBS_FAST_START
02758 void req_one_client_partinit(int client){
02759 ChMessage partStartMsg;
02760 int clientNode;
02761
02762 if(errorcheck_one_client_connect(client)){
02763 if (!skt_select1(req_clients[client],arg_timeout*1000))
02764 client_connect_problem(client,"Timeout on partial init request");
02765
02766 ChMessage_recv(req_clients[client],&partStartMsg);
02767 clientNode = ChMessageInt(*(ChMessageInt_t*)partStartMsg.data);
02768 assert(strncmp(partStartMsg.header.type,"partinit",8) == 0);
02769 ChMessage_free(&partStartMsg);
02770 }
02771
02772 };
02773 #endif
02774
02775
02776 #ifdef HSTART
02777 int nodeCount = 0;
02778
02779 void add_singlenodeinfo_to_mynodeinfo(ChMessage * msg, SOCKET ctrlfd)
02780 {
02781
02782 ChSingleNodeinfo *nodeInfo = (ChSingleNodeinfo *)msg->data;
02783
02784
02785 myNodesInfo[nodeCount].nodeNo = ChMessageInt_new(nodetab_rank0_table[ChMessageInt(nodeInfo->nodeNo)-mynodes_start]);
02786 myNodesInfo[nodeCount++].info = nodeInfo->info;
02787
02788
02789 int nt=nodetab_rank0_table[ChMessageInt(nodeInfo->nodeNo)-mynodes_start];
02790 int pe;
02791 for (pe=0;pe<nodetab_cpus(nt);pe++)
02792 {
02793 nodetab_table[nt+pe]->ctrlfd=ctrlfd;
02794 }
02795 }
02796 #endif
02797
02798 #ifndef HSTART
02799
02800 void req_set_client_connect(int start,int end) {
02801 fd_set sockset;
02802 ChMessage msg;
02803 int client,i;
02804 int done,maxdesc;
02805 int *finished;
02806 int curclient,curclientend,curclientstart;
02807
02808 curclient=curclientend=curclientstart=start;
02809
02810 finished=malloc((end-start)*sizeof(int));
02811 for(i=0;i<(end-start);i++)
02812 finished[i]=0;
02813
02814 #if CMK_USE_IBVERBS && !CMK_IBVERBS_FAST_START
02815 for (i=start;i<end;i++) {
02816 errorcheck_one_client_connect(curclientend++);
02817 }
02818 if (req_nClients > 1) {
02819
02820 for (i=start;i<end;i++)
02821 ChMessage_recv(req_clients[i],&msg);
02822 for (i=start;i<end;i++)
02823 req_reply(req_clients[i], "barrier", "", 1);
02824 }
02825 #endif
02826
02827 done=0;
02828 while(!done) {
02829
02830 #if ! CMK_USE_IBVERBS || CMK_IBVERBS_FAST_START
02831 while(curclientstart==curclientend||skt_select1(server_fd,1)!=0) {
02832 errorcheck_one_client_connect(curclientend++);
02833 }
02834 #endif
02835
02836 for(client=curclientstart;client<curclientend;client++)
02837 if(req_clients[client]>0) {
02838 if(skt_select1(req_clients[client],1)!=0) {
02839 ChMessage_recv(req_clients[client],&msg);
02840 req_handle_initnode(&msg,req_clients[client]);
02841 finished[client-start]=1;
02842 }
02843 }
02844
02845
02846
02847 done=1;
02848 for(i=curclientstart-start;i<(end-start);i++)
02849 if(finished[i]==0) {
02850 curclientstart=start+i;
02851 done=0;
02852 break;
02853 }
02854
02855 }
02856 ChMessage_free(&msg);
02857
02858 free(finished);
02859 }
02860 #else
02861
02862 void req_set_client_connect(int start,int end) {
02863 fd_set sockset;
02864 ChMessage msg;
02865 int client,i;
02866 int done,maxdesc;
02867 int *finished;
02868 int curclient,curclientend,curclientstart;
02869
02870 curclient=curclientend=curclientstart=start;
02871
02872 finished=malloc((end-start)*sizeof(int));
02873 for(i=0;i<(end-start);i++)
02874 finished[i]=0;
02875
02876 if(arg_child_charmrun && start==0 ) myNodesInfo = malloc(sizeof(ChSingleNodeinfo)*nodetab_rank0_size);
02877
02878 #if CMK_USE_IBVERBS && !CMK_IBVERBS_FAST_START
02879 for (i=start;i<end;i++) {
02880 errorcheck_one_client_connect(curclientend++);
02881 }
02882 if (req_nClients > 1) {
02883
02884 for (i=start;i<end;i++)
02885 ChMessage_recv(req_clients[i],&msg);
02886 for (i=start;i<end;i++)
02887 req_reply(req_clients[i], "barrier", "", 1);
02888 }
02889 #endif
02890
02891 done=0;
02892 while(!done) {
02893
02894 #if ! CMK_USE_IBVERBS || CMK_IBVERBS_FAST_START
02895 while(curclientstart==curclientend||skt_select1(server_fd,1)!=0) {
02896 errorcheck_one_client_connect(curclientend++);
02897 }
02898 #endif
02899
02900 for(client=curclientstart;client<curclientend;client++)
02901 if(req_clients[client]>0) {
02902 if(skt_select1(req_clients[client],1)!=0) {
02903 ChMessage_recv(req_clients[client],&msg);
02904 if(!arg_hierarchical_start)
02905 req_handle_initnode(&msg,req_clients[client]);
02906 else{
02907 if(!arg_child_charmrun)
02908 {
02909 if(charmrun_phase ==1)
02910 receive_nodeset_from_child(&msg, req_clients[client]);
02911 else
02912 set_sockets_list(&msg, req_clients[client]);
02913
02914 }
02915 else
02916 add_singlenodeinfo_to_mynodeinfo(&msg,req_clients[client] );
02917 }
02918 finished[client-start]=1;
02919 }
02920 }
02921
02922
02923 done=1;
02924 for(i=curclientstart-start;i<(end-start);i++)
02925 if(finished[i]==0) {
02926 curclientstart=start+i;
02927 done=0;
02928 break;
02929 }
02930
02931 }
02932 ChMessage_free(&msg);
02933
02934 free(finished);
02935 }
02936 #endif
02937
02938
02939
/*
 * Accept the node program for slot `client` and, once it is connected,
 * read and handle its first message via read_initnode_one_client().
 */
void req_one_client_connect(int client)
{
  if (!errorcheck_one_client_connect(client))
    return;

  read_initnode_one_client(client);
}
02947
02948 #if CMK_USE_IBVERBS
02949
02950
02951
02952
02953 void exchange_qpdata_clients(){
02954 int proc,i;
02955 for( i=0;i<nodetab_rank0_size;i++){
02956 int nt=nodetab_rank0_table[i];
02957 nodetab_table[nt]->qpData = malloc(sizeof(ChInfiAddr)*nodetab_rank0_size);
02958 }
02959 for(proc =0;proc< nodetab_rank0_size;proc++){
02960 int count=0;
02961 for(i=0;i<nodetab_rank0_size;i++){
02962 if(i == proc){
02963 }else{
02964 int nt=nodetab_rank0_table[i];
02965 nodetab_table[nt]->qpData[proc] = nodeinfo_arr[proc].qpList[count];
02966
02967 count++;
02968 }
02969 }
02970 free(nodeinfo_arr[proc].qpList);
02971 }
02972 };
02973
02974 void send_clients_nodeinfo_qpdata(){
02975 int node;
02976 int msgSize = sizeof(ChMessageInt_t)+sizeof(ChNodeinfo)*nodetab_rank0_size+sizeof(ChInfiAddr)*nodetab_rank0_size;
02977 for(node=0;node<nodetab_rank0_size;node++){
02978 int nt=nodetab_rank0_table[node];
02979
02980 ChMessageHeader hdr;
02981 ChMessageInt_t nNodes=ChMessageInt_new(nodetab_rank0_size);
02982 ChMessageHeader_new("initnodetab",msgSize,&hdr);
02983 skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&hdr,sizeof(hdr));
02984 skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&nNodes,sizeof(nNodes));
02985 skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)nodeinfo_arr,sizeof(ChNodeinfo)*nodetab_rank0_size);
02986 skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&nodetab_table[nt]->qpData[0],sizeof(ChInfiAddr)*nodetab_rank0_size);
02987 }
02988 }
02989 #endif
02990
/* Scratch buffer shared by the two timing macros below. */
struct timeval tim;
/* getthetime(x):  store the current wall-clock time in x, in seconds with a
 *                 fractional (microsecond) part.
 * getthetime1(x): store the current wall-clock time in x, whole seconds only.
 * Both are wrapped in do { } while (0) so that they behave as a single
 * statement, e.g. as the body of an unbraced if/else. */
#define getthetime(x) do { gettimeofday(&tim,NULL); (x) = tim.tv_sec + (tim.tv_usec/1000000.0); } while (0)
#define getthetime1(x) do { gettimeofday(&tim,NULL); (x) = tim.tv_sec; } while (0)
02994
02995 void req_client_connect(void)
02996 {
02997 int client;
02998 #ifdef HSTART
02999 if(!arg_hierarchical_start)
03000 #endif
03001 nodeinfo_allocate();
03002 req_nClients=nodetab_rank0_size;
03003 req_clients=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
03004 for(client=0;client<req_nClients;client++)
03005 req_clients[client]=-1;
03006
03007 skt_set_abort(client_connect_problem);
03008
03009 #if CMK_IBVERBS_FAST_START
03010 for (client=0;client<req_nClients;client++){
03011 req_one_client_partinit(client);
03012 }
03013 for (client=0;client<req_nClients;client++){
03014 read_initnode_one_client(client);
03015 }
03016 #else
03017
03018 req_set_client_connect(0,req_nClients);
03019
03020 #endif
03021
03022 if (portOk == 0) exit(1);
03023 if (arg_verbose) printf("Charmrun> All clients connected.\n");
03024 #if CMK_USE_IBVERBS
03025 exchange_qpdata_clients();
03026 send_clients_nodeinfo_qpdata();
03027 #else
03028 #ifdef HSTART
03029 if(arg_hierarchical_start) {
03030
03031 send_myNodeInfo_to_parent();
03032
03033 forward_nodetab_to_children();
03034 }
03035
03036 else
03037 #endif
03038 for (client=0;client<req_nClients;client++) {
03039 req_handle_initnodetab(NULL,req_clients[client]);
03040 }
03041
03042 #endif
03043 if (arg_verbose) printf("Charmrun> IP tables sent.\n");
03044 }
03045
03046
03047 #ifdef HSTART
03048 void req_charmrun_connect(void)
03049 {
03050
03051 int client;
03052 nodeinfo_allocate();
03053 req_nClients=branchfactor;
03054 req_clients=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
03055 charmrun_fds=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
03056 for(client=0;client<req_nClients;client++)
03057 req_clients[client]=-1;
03058
03059 skt_set_abort(client_connect_problem);
03060
03061 #if CMK_IBVERBS_FAST_START
03062 for (client=0;client<req_nClients;client++){
03063 req_one_client_partinit(client);
03064 }
03065 for (client=0;client<req_nClients;client++){
03066 read_initnode_one_client(client);
03067 }
03068 #else
03069
03070
03071 req_set_client_connect(0,req_nClients);
03072
03073 #endif
03074
03075 if (portOk == 0) exit(1);
03076 if (arg_verbose) printf("Charmrun> All clients connected.\n");
03077 #if CMK_USE_IBVERBS
03078 exchange_qpdata_clients();
03079 send_clients_nodeinfo_qpdata();
03080 #else
03081 for (client=0;client<req_nClients;client++) {
03082
03083 req_handle_initnodedistribution(NULL, charmrun_fds[client], client);
03084 }
03085
03086
03087
03088 charmrun_phase = 1;
03089
03090 skt_set_abort(client_connect_problem);
03091
03092 req_set_client_connect(0,req_nClients);
03093
03094
03095 for (client=0;client<req_nClients;client++) {
03096 req_handle_initnodetab(NULL,req_clients[client]);
03097 }
03098
03099 #endif
03100 if (arg_verbose) printf("Charmrun> IP tables sent.\n");
03101
03102 }
03103
03104 #endif
03105
03106 #ifndef CMK_BPROC
03107
03108 void start_one_node_rsh(int rank0no);
03109 void finish_one_node(int rank0no);
03110 void finish_set_nodes(int start, int stop);
03111
03112
03113
03114 void req_client_start_and_connect(void)
03115 {
03116 int client, c;
03117 int batch = arg_batch_spawn;
03118 int clientgroup,clientstart;
03119 int counter;
03120
03121 #ifdef HSTART
03122 if(!arg_hierarchical_start)
03123 #endif
03124 nodeinfo_allocate();
03125 req_nClients=nodetab_rank0_size;
03126 req_clients=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
03127
03128 skt_set_abort(client_connect_problem);
03129
03130 client=0;
03131 while(client<req_nClients) {
03132 clientstart=client;
03133
03134 for(counter=0;counter<batch;counter++) {
03135 clientgroup=start_set_node_rsh(client);
03136 client+=clientgroup;
03137 if(client>=req_nClients) {
03138 client=req_nClients;
03139 break;
03140 }
03141 }
03142 #if CMK_USE_RSH
03143
03144 if (!arg_ssh_display)
03145 #endif
03146 finish_set_nodes(clientstart,client);
03147
03148 #if CMK_IBVERBS_FAST_START
03149 for (c=clientstart;c<client;c++) {
03150 req_one_client_partinit(c);
03151 }
03152 #else
03153 req_set_client_connect(clientstart,client);
03154 #endif
03155 }
03156
03157
03158 #if CMK_IBVERBS_FAST_START
03159 for (client=0;client<req_nClients;client++){
03160 read_initnode_one_client(client);
03161 }
03162 #endif
03163 if (portOk == 0) exit(1);
03164 if (arg_verbose) printf("Charmrun> All clients connected.\n");
03165
03166 #if CMK_USE_IBVERBS
03167 exchange_qpdata_clients();
03168 send_clients_nodeinfo_qpdata();
03169 #else
03170 #ifdef HSTART
03171 if(arg_hierarchical_start) {
03172
03173 send_myNodeInfo_to_parent();
03174
03175 forward_nodetab_to_children();
03176 }
03177
03178 else
03179 #endif
03180 for (client=0;client<req_nClients;client++) {
03181 req_handle_initnodetab(NULL,req_clients[client]);
03182 }
03183
03184
03185 #endif
03186 if (arg_verbose) printf("Charmrun> IP tables sent.\n");
03187 free(rsh_pids);
03188 }
03189
03190 #endif
03191
03192
03193 void req_start_server(void)
03194 {
03195 skt_ip_t ip=skt_innode_my_ip();
03196 if (arg_local)
03197
03198 strcpy(server_addr, "127.0.0.1");
03199 else if (arg_charmrunip != NULL)
03200
03201 strcpy(server_addr, arg_charmrunip);
03202 else if ( (arg_charmrunip = getenv ("CHARMRUN_IP")) != NULL)
03203
03204 strcpy(server_addr, arg_charmrunip);
03205 else if (skt_ip_match(ip,_skt_invalid_ip)) {
03206 printf("Charmrun> Warning-- cannot find IP address for your hostname. Using loopback.\n");
03207 strcpy(server_addr, "127.0.0.1");
03208 }
03209 else if (arg_usehostname || skt_ip_match(ip,skt_lookup_ip("127.0.0.1")))
03210
03211 gethostname(server_addr,sizeof(server_addr));
03212 else
03213 skt_print_ip(server_addr,ip);
03214
03215 server_port = 0;
03216 server_fd=skt_server(&server_port);
03217
03218 if (arg_verbose) {
03219 printf("Charmrun> Charmrun = %s, port = %d\n", server_addr, server_port);
03220 }
03221
03222 #if CMK_CCS_AVAILABLE
03223 #ifdef HSTART
03224 if(!arg_hierarchical_start || (arg_hierarchical_start && !arg_child_charmrun))
03225 #endif
03226 if(arg_server == 1) CcsServer_new(NULL,&arg_server_port,arg_server_auth);
03227 #endif
03228 }
03229
03230 #ifdef HSTART
03231 int unique_node_start;
03232
03233 void parse_netstart(void)
03234 {
03235 char *ns;
03236 int nread;
03237 int port;
03238 ns = getenv("NETSTART");
03239 if (ns!=0)
03240 {
03241 char parent_charmrun_name[1024];
03242 nread = sscanf(ns, "%d%s%d%d%d",
03243 &unique_node_start,
03244 parent_charmrun_name, &parent_charmrun_port,
03245 &parent_charmrun_pid, &port);
03246 parent_charmrun_IP=skt_lookup_ip(parent_charmrun_name);
03247 mynodes_start = nodetab_unique_table[unique_node_start];
03248
03249
03250 if (nread!=5) {
03251 fprintf(stderr,"Error parsing NETSTART '%s'\n",ns);
03252 exit(1);
03253 }
03254 }
03255 #if CMK_USE_IBVERBS | CMK_USE_IBUD
03256 char *cmi_num_nodes = getenv("CmiNumNodes");
03257 if(cmi_num_nodes != NULL){
03258 sscanf(cmi_num_nodes,"%d",&_Cmi_numnodes);
03259 }
03260 #endif
03261 }
03262
03263 int nodetab_rank0_size_total;
03264
03265 void my_nodetab_store(ChMessage *msg)
03266 {
03267 ChMessageInt_t * nodelistmsg = (ChMessageInt_t *)msg->data;
03268 nodetab_rank0_size = ChMessageInt(nodelistmsg[0]);
03269 nodetab_rank0_size_total = ChMessageInt(nodelistmsg[1]);
03270 int k;
03271 for(k =0; k<nodetab_rank0_size ; k++)
03272 {
03273 nodetab_rank0_table[k] = ChMessageInt(nodelistmsg[k+2]);
03274 }
03275 }
03276
03277
03278
03279 void nodelist_obtain(void)
03280 {
03281 ChMessage nodelistmsg;
03282
03283
03284 #if CMK_USE_IBVERBS
03285 {
03286
03287
03288
03289
03290
03291
03292 }
03293 #else
03294 ChMessageHeader hdr;
03295 ChMessageInt_t node_start=ChMessageInt_new(unique_node_start);
03296 ChMessageHeader_new("initnodetab",sizeof(ChMessageInt_t),&hdr);
03297 skt_sendN(parent_charmrun_fd,(const char *)&hdr,sizeof(hdr));
03298 skt_sendN(parent_charmrun_fd,(const char *)&node_start,sizeof(node_start));
03299
03300 #endif //CMK_USE_IBVERBS
03301
03302
03303
03304
03305 if (!skt_select1(parent_charmrun_fd,1200*1000)){
03306 exit(0);
03307 }
03308 ChMessage_recv(parent_charmrun_fd,&nodelistmsg);
03309
03310 my_nodetab_store(&nodelistmsg);
03311 ChMessage_free(&nodelistmsg);
03312 }
03313
03314
03315 void init_mynodes(void)
03316 {
03317 parse_netstart();
03318 if (!skt_ip_match(parent_charmrun_IP,_skt_invalid_ip)) {
03319 dataskt=skt_server(&dataport);
03320 parent_charmrun_fd = skt_connect(parent_charmrun_IP, parent_charmrun_port, 1800);
03321 } else {
03322 parent_charmrun_fd=-1;
03323 }
03324
03325 nodelist_obtain();
03326 }
03327 #endif
03328
03329
03330
03331
03332
03333
03334 void start_nodes_daemon(void);
03335 void start_nodes_rsh(void);
03336 void start_nodes_mpiexec();
03337 #ifdef HSTART
03338 void start_next_level_charmruns(void);
03339 #endif
03340 #if CMK_BPROC
03341 void nodetab_init_for_scyld(void);
03342 void start_nodes_scyld(void);
03343 #endif
03344 void start_nodes_local(char **envp);
03345 void kill_nodes(void);
03346 void open_gdb_info(void);
03347 void read_global_segments_size(void);
03348
/* Idle callback registered with skt_set_idle() in main(): sleep(0). */
static void fast_idleFn(void)
{
  sleep(0);
}
03350 void finish_nodes(void);
03351
03352 int main(int argc, char **argv, char **envp)
03353 {
03354 srand(time(0));
03355 skt_init();
03356 skt_set_idle(fast_idleFn);
03357
03358
03359
03360 #ifdef HSTART
03361 if(!arg_child_charmrun)
03362 #endif
03363 ping_developers();
03364
03365 arg_init(argc, argv);
03366 if(arg_verbose) fprintf(stderr, "Charmrun> charmrun started...\n");
03367 start_timer = GetClock();
03368 #if CMK_BPROC
03369
03370 if (arg_nodelist)
03371 nodetab_init();
03372 else
03373 nodetab_init_for_scyld();
03374 #else
03375
03376 nodetab_init();
03377 #endif
03378
03379
03380 req_start_server();
03381
03382
03383 input_init();
03384
03385 #ifdef HSTART
03386
03387 if(arg_child_charmrun)
03388 {
03389 init_mynodes();
03390 }
03391 #endif
03392
03393 if (0!=getenv("CONV_DAEMON"))
03394 start_nodes_daemon();
03395 else
03396 #if CMK_BPROC
03397 start_nodes_scyld();
03398 #else
03399 #if CMK_USE_IBVERBS
03400 printf("Charmrun> IBVERBS version of charmrun\n");
03401 #endif
03402
03403 #ifdef HSTART
03404
03405 if(arg_hierarchical_start) {
03406 if (!arg_local) {
03407 if(!arg_child_charmrun){
03408 start_next_level_charmruns();}
03409 else {
03410 if (!arg_batch_spawn)
03411 start_nodes_rsh();
03412 else
03413 req_client_start_and_connect();
03414 }
03415 }
03416 else
03417 start_nodes_local(envp);
03418 }
03419
03420
03421 else
03422
03423 #endif
03424 {
03425 if (!arg_local) {
03426 if (!arg_batch_spawn) {
03427 if (arg_mpiexec)
03428 start_nodes_mpiexec();
03429 else
03430 start_nodes_rsh();
03431 }
03432 else
03433 req_client_start_and_connect();
03434 }
03435 else
03436 start_nodes_local(envp);
03437 }
03438 #endif
03439
03440 if (arg_charmdebug) {
03441 #if (defined(_WIN32) && !defined(__CYGWIN__)) || CMK_BPROC
03442
03443 fprintf(stderr, "Charmdebug is supported currently only with the rsh subsystem\n");
03444 abort();
03445 #else
03446
03447 printf("opening connection with node 0 for info gdb\n");
03448 read_global_segments_size();
03449 open_gdb_info();
03450 gdb_stream = fdopen(dup(2), "a");
03451 dup2(1, 2);
03452 #endif
03453 }
03454
03455
03456 if(arg_verbose) fprintf(stderr, "Charmrun> node programs all started\n");
03457
03458
03459 #ifdef HSTART
03460
03461 if(arg_hierarchical_start) {
03462 #if !CMK_RSH_KILL
03463 if (!arg_batch_spawn || (!arg_child_charmrun)) finish_nodes();
03464 #endif
03465
03466 if(!arg_child_charmrun)
03467 req_charmrun_connect();
03468 else if (!arg_batch_spawn)
03469 req_client_connect();
03470 }
03471
03472 else
03473 #endif
03474 {
03475 #if !CMK_RSH_KILL
03476 if (!arg_batch_spawn) finish_nodes();
03477 #endif
03478 if (!arg_batch_spawn) req_client_connect();
03479 }
03480 #if CMK_RSH_KILL
03481 kill_nodes();
03482 #endif
03483 if(arg_verbose) fprintf(stderr, "Charmrun> node programs all connected\n");
03484
03485 fprintf(stderr, "Charmrun> started all node programs in %.3f seconds.\n", GetClock()-start_timer);
03486
03487
03488 #ifdef HSTART
03489 if(arg_hierarchical_start)
03490 while (1) req_poll_hierarchical();
03491 else
03492 #endif
03493 while (1) req_poll();
03494
03495 }
03496
03497
03498
03499
03500
03501
03502 char *create_netstart(int node)
03503 {
03504 static char dest[1024];
03505 int port=0;
03506 if (arg_mpiexec)
03507 sprintf(dest,"$CmiMyNode %s %d %d %d",server_addr,server_port,getpid()&0x7FFF, port);
03508 else
03509 sprintf(dest,"%d %s %d %d %d",node,server_addr,server_port,getpid()&0x7FFF, port);
03510 return dest;
03511 }
03512
03513
03514
03515
03516
03517
03518
03519 void start_nodes_daemon(void)
03520 {
03521 taskStruct task;
03522 char argBuffer[5000];
03523 int i,nodeNumber;
03524
03525
03526
03527 argBuffer[0]=0;
03528 for (i=0;arg_argv[i];i++)
03529 {
03530 if (arg_verbose)
03531 printf("Charmrun> packing arg: %s\n", arg_argv[i]);
03532 strcat(argBuffer," ");
03533 strcat(argBuffer,arg_argv[i]);
03534 }
03535
03536 task.magic=ChMessageInt_new(DAEMON_MAGIC);
03537
03538
03539
03540 for (nodeNumber=0;nodeNumber<nodetab_rank0_size;nodeNumber++)
03541 {
03542 char nodeArgBuffer[5000];
03543 char *argBuf;
03544 char* arg_nodeprog_r, *arg_currdir_r;
03545 char statusCode='N';
03546 int fd;
03547 int pe0=nodetab_rank0_table[nodeNumber];
03548
03549 arg_currdir_r = pathfix(arg_currdir_a, nodetab_pathfixes(nodeNumber));
03550 strcpy(task.cwd,arg_currdir_r);
03551
03552 arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(nodeNumber), nodetab_ext(nodeNumber));
03553 strcpy(task.pgm,arg_nodeprog_r);
03554
03555 if (arg_verbose)
03556 printf("Charmrun> Starting node program %d on '%s' as %s.\n",nodeNumber,nodetab_name(pe0), arg_nodeprog_r);
03557
03558 sprintf(task.env,"NETSTART=%s",create_netstart(nodeNumber));
03559
03560 if (nodetab_nice(nodeNumber) != -100) {
03561 if(arg_verbose) fprintf(stderr, "Charmrun> +nice %d\n", nodetab_nice(nodeNumber));
03562 sprintf(nodeArgBuffer, "%s +nice %d", argBuffer, nodetab_nice(nodeNumber));
03563 argBuf = nodeArgBuffer;
03564 }
03565 else
03566 argBuf = argBuffer;
03567 task.argLength=ChMessageInt_new(strlen(argBuf));
03568
03569
03570 fd = skt_connect(nodetab_ip(pe0),
03571 DAEMON_IP_PORT,30);
03572 if (fd!=INVALID_SOCKET)
03573 {
03574 skt_sendN(fd, (const char *)&task, sizeof(task));
03575 skt_sendN(fd, (const char *)argBuf, strlen(argBuf));
03576 skt_recvN(fd, &statusCode,sizeof(char));
03577 }
03578 if (statusCode!='G')
03579 {
03580 fprintf(stderr,"Error '%c' starting remote node program on %s--\n%s\n",
03581 statusCode,nodetab_name(pe0),daemon_status2msg(statusCode));
03582 exit(1);
03583 } else if (arg_verbose)
03584 printf("Charmrun> Node program %d started.\n",nodeNumber);
03585 }
03586 }
03587
03588 #if defined(_WIN32) && !defined(__CYGWIN__)
03589
03590
03591
/*
 * Native Windows build: there is no rsh path.  start_nodes_rsh() delegates
 * to the daemon start-up; the remaining rsh/mpiexec entry points are empty
 * stubs so that the callers still link.
 */
void start_nodes_rsh()
{
  start_nodes_daemon();
}

void finish_nodes(void)
{
}

void start_one_node_rsh(int rank0no)
{
}

void finish_one_node(int rank0no)
{
}

void start_nodes_mpiexec()
{
}

/* Stub: reports that no client was started. */
int start_set_node_rsh(int client)
{
  return 0;
}

void finish_set_nodes(int start, int stop)
{
}
03600
03601 void envCat(char *dest,LPTSTR oldEnv)
03602 {
03603 char *src=oldEnv;
03604 dest+=strlen(dest);
03605 dest++;
03606 while ((*src)!='\0') {
03607 int adv=strlen(src)+1;
03608 strcpy(dest,src);
03609 dest+=adv;
03610 src+=adv;
03611 }
03612 *dest='\0';
03613 FreeEnvironmentStrings(oldEnv);
03614 }
03615
03616
03617
03618
03619 void start_nodes_local(char ** env)
03620 {
03621 int ret, i;
03622 PROCESS_INFORMATION pi;
03623 char **p;
03624
03625 char environment[10000];
03626 char cmdLine[10000];
03627
03628
03629
03630
03631
03632 strcpy(cmdLine,pparam_argv[1]);
03633 p = pparam_argv+2;
03634 while ((*p)!='\0') {
03635 strcat(cmdLine," ");
03636 strcat(cmdLine,*p);
03637 p++;
03638 }
03639
03640 for (i=0; i<arg_requested_pes; i++)
03641 {
03642 STARTUPINFO si={0};
03643
03644 sprintf(environment, "NETSTART=%s", create_netstart(i));
03645
03646 envCat(environment,GetEnvironmentStrings());
03647
03648
03649
03650 si.cb = sizeof(si);
03651 if (arg_verbose)
03652 printf("Charmrun> start %d node program on localhost.\n", i);
03653
03654 ret = CreateProcess(NULL,
03655 cmdLine,
03656 NULL,
03657 NULL,
03658 FALSE,
03659 #if 1
03660 CREATE_NEW_PROCESS_GROUP|DETACHED_PROCESS,
03661 #else
03662 CREATE_NEW_PROCESS_GROUP|CREATE_NEW_CONSOLE,
03663 #endif
03664
03665 environment,
03666 ".",
03667 &si,
03668 &pi);
03669
03670 if (ret==0)
03671 {
03672
03673
03674
03675
03676
03677
03678
03679
03680
03681
03682 int error=GetLastError();
03683 printf("startProcess failed to start process \"%s\" with status: %d\n", pparam_argv[1], error);
03684 exit(1) ;
03685 }
03686 }
03687 }
03688
03689 #elif CMK_BPROC
03690
03691 int bproc_nodeisup(int node)
03692 {
03693 int status = 0;
03694 #if CMK_BPROC_VERSION < 4
03695 if (bproc_nodestatus(node) == bproc_node_up) status = 1;
03696 if (arg_verbose)
03697 printf("Charmrun> node %d status: %s\n", node, status?"up":"down");
03698 #else
03699 char nodestatus[128];
03700 if (node == -1) {
03701 strcpy(nodestatus, "up");
03702 status = 1;
03703 }
03704 if (bproc_nodestatus(node, nodestatus, 128)) {
03705 if (strcmp(nodestatus, "up")==0) status = 1;
03706 }
03707 if (arg_verbose)
03708 printf("Charmrun> node %d status: %s\n", node, nodestatus);
03709 #endif
03710 return status;
03711 }
03712
03718 void nodetab_init_for_scyld()
03719 {
03720 int maxNodes, i, node, npes, rank;
03721 nodetab_host group;
03722 int tablesize;
03723
03724 tablesize = arg_requested_pes;
03725 maxNodes = bproc_numnodes() + 1;
03726 if (arg_endpe < maxNodes) maxNodes=arg_endpe+1;
03727 if (maxNodes > tablesize) tablesize = maxNodes;
03728 nodetab_table=(nodetab_host**)malloc(tablesize*sizeof(nodetab_host*));
03729 nodetab_rank0_table=(int*)malloc(tablesize*sizeof(int));
03730 nodetab_max=tablesize;
03731
03732 nodetab_reset(&group);
03733
03734 if (arg_ppn==0) arg_ppn=1;
03735
03736
03737
03738
03739
03740
03741
03742
03743 group.cpus = 1;
03744 group.rank = 0;
03745
03746
03747 npes = 0;
03748 for (i=-1; i<maxNodes && npes < arg_requested_pes; i++) {
03749 char hostname[256];
03750 if (!bproc_nodeisup(i)) continue;
03751 if (i!= -1 && i<arg_startpe) continue;
03752 if (i==-1 && arg_skipmaster) continue;
03753 sprintf(hostname, "%d", i);
03754 #if ! CMK_SHARED_VARS_UNAVAILABLE
03755 if (npes + arg_ppn > arg_requested_pes) group.cpus = arg_requested_pes-npes;
03756 else group.cpus = arg_ppn;
03757 #endif
03758 for (rank = 0; rank<arg_ppn; rank++) {
03759 #if ! CMK_SHARED_VARS_UNAVAILABLE
03760 group.rank = rank;
03761 #endif
03762 nodetab_makehost(hostname, &group);
03763 if (++npes == arg_requested_pes) break;
03764 }
03765 }
03766 if (nodetab_rank0_size == 0) {
03767 fprintf(stderr, "Charmrun> no slave node available!\n");
03768 exit (1);
03769 }
03770 if (arg_verbose)
03771 printf("Charmrun> There are %d slave nodes available.\n", nodetab_rank0_size-(arg_skipmaster?0:1));
03772
03773
03774 if (arg_requested_pes > npes) {
03775 int orig_size = npes;
03776 int node;
03777 int startnode = 0;
03778 if (arg_singlemaster && nodetab_rank0_size > 1 && !arg_skipmaster)
03779 startnode = arg_ppn;
03780 node = startnode;
03781 while (npes < arg_requested_pes) {
03782 #if ! CMK_SHARED_VARS_UNAVAILABLE
03783 if (npes+arg_ppn > arg_requested_pes) group.cpus = arg_requested_pes-npes;
03784 else group.cpus = arg_ppn;
03785 #endif
03786 for (rank = 0; rank<arg_ppn; rank++) {
03787 #if ! CMK_SHARED_VARS_UNAVAILABLE
03788 group.rank = rank;
03789 #endif
03790 nodetab_makehost(nodetab_name(node), &group);
03791 if (++node == orig_size) node = startnode;
03792 if (++npes == arg_requested_pes) break;
03793 }
03794 }
03795 }
03796 }
03797
03798 void start_nodes_scyld(void)
03799 {
03800 char *envp[2];
03801 int i;
03802
03803 envp[0] = (char *)malloc(256);
03804 envp[1] = 0;
03805 for (i=0;i<nodetab_rank0_size;i++)
03806 {
03807 int status = 0;
03808 int pid;
03809 int pe=nodetab_rank0_table[i];
03810 int nodeno = atoi(nodetab_name(pe));
03811
03812 if (arg_verbose)
03813 printf("Charmrun> start node program on slave node: %d.\n", nodeno);
03814 sprintf(envp[0], "NETSTART=%s", create_netstart(i));
03815 pid = 0;
03816 pid = fork();
03817 if (pid < 0) exit(1);
03818 if (pid == 0)
03819 {
03820 int fd, fd1 = dup(1);
03821 if (!(arg_debug || arg_debug_no_pause)) {
03822 if (fd = open("/dev/null", O_RDWR)) {
03823 dup2(fd, 0); dup2(fd, 1); dup2(fd, 2);
03824 }
03825 }
03826 if (nodeno == -1) {
03827 status = execve(pparam_argv[1], pparam_argv+1, envp);
03828 dup2(fd1, 1);
03829 printf("execve failed to start process \"%s\" with status: %d\n", pparam_argv[1], status);
03830 }
03831 else {
03832 status = bproc_execmove(nodeno, pparam_argv[1], pparam_argv+1, envp);
03833 dup2(fd1, 1);
03834 printf("bproc_execmove failed to start remote process \"%s\" with status: %d\n", pparam_argv[1], status);
03835 }
03836 kill(getppid(), 9);
03837 exit(1);
03838 }
03839 }
03840 free(envp[0]);
03841 }
/* Nothing to do in the BProc build: this variant is intentionally empty. */
void finish_nodes(void)
{
}
03843
03844 #else
03845
03846
03847
03848
03849
03850
03851
03852
03853 #include <sys/wait.h>
03854
extern char **environ;

/*
 * Drop from the process environment every entry that begins with the
 * given text.  The environ array is compacted in place; the entries
 * themselves are not freed.
 *
 * doomedEnv: prefix to match (for example "NAME=").
 */
void removeEnv(const char *doomedEnv)
{
  char **keep = environ;   /* next slot to write a surviving entry to */
  char **scan;

  for (scan = environ; *scan != NULL; scan++) {
    if (strncmp(*scan, doomedEnv, strlen(doomedEnv)) == 0)
      continue;            /* matches the prefix: leave it out */
    *keep++ = *scan;
  }
  *keep = NULL;
}
03867
03868 int rsh_fork(int nodeno,const char *startScript)
03869 {
03870 char **rshargv;
03871 int pid;
03872 int num=0;
03873 char *s, *e;
03874
03875
03876 s=nodetab_shell(nodeno); e=skipstuff(s);
03877 while (*s) {
03878 num++;
03879 s = skipblanks(e); e = skipstuff(s);
03880 }
03881 rshargv = (char **)malloc(sizeof(char *)*(num+6));
03882
03883 num = 0;
03884 s=nodetab_shell(nodeno); e=skipstuff(s);
03885 while (*s) {
03886 rshargv[num++]=substr(s, e);
03887 s = skipblanks(e); e = skipstuff(s);
03888 }
03889
03890 rshargv[num++]=nodetab_name(nodeno);
03891 rshargv[num++]="-l";
03892 rshargv[num++]=nodetab_login(nodeno);
03893 rshargv[num++]="/bin/sh -f";
03894 rshargv[num++]=0;
03895 if (arg_verbose) printf("Charmrun> Starting %s %s -l %s %s\n",nodetab_shell(nodeno), nodetab_name(nodeno),nodetab_login(nodeno), rshargv[num-2]);
03896
03897 pid = fork();
03898 if (pid < 0)
03899 { perror("ERROR> starting rsh"); exit(1); }
03900 if (pid == 0)
03901 {
03902 int i;
03903 int fdScript=open(startScript,O_RDONLY);
03904 unlink(startScript);
03905 dup2(fdScript,0);
03906
03907 for(i=3; i<1024; i++) close(i);
03908 execvp(rshargv[0], rshargv);
03909 fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]);
03910 exit(1);
03911 }
03912 free(rshargv);
03913 if (arg_verbose)
03914 fprintf(stderr,"Charmrun> remote shell (%s:%d) started\n",
03915 nodetab_name(nodeno),nodeno);
03916 return pid;
03917 }
03918
/*
 * Write every string of a NULL-terminated vector to f, each one preceded
 * by a single blank.
 */
void fprint_arg(FILE *f,char **argv)
{
  char **cur;

  for (cur = argv; *cur != NULL; cur++)
    fprintf(f," %s",*cur);
}
/*
 * Emit two script lines: a call of the script's Find function for
 * `program`, followed by an assignment of $loc to the variable `dest`.
 */
void rsh_Find(FILE *f,const char *program,const char *dest)
{
  fprintf(f,"Find %s\n%s=$loc\n",program,dest);
}
03931 void rsh_script(FILE *f, int nodeno, int rank0no, char **argv, int restart)
03932 {
03933 char *netstart;
03934 char *arg_nodeprog_r,*arg_currdir_r;
03935 char *dbg=nodetab_debugger(nodeno);
03936 char *host=nodetab_name(nodeno);
03937
03938 if (arg_mpiexec)
03939 fprintf(f, "#!/bin/sh\n");
03940
03941 fprintf(f,
03942 "Echo() {\n"
03943 " echo 'Charmrun remote shell(%s.%d)>' $*\n"
03944 "}\n",host,nodeno);
03945 fprintf(f,
03946 "Exit() {\n"
03947 " if [ $1 -ne 0 ]\n"
03948 " then\n"
03949 " Echo Exiting with error code $1\n"
03950 " fi\n"
03951 #if CMK_RSH_KILL
03952 " sleep 5\n"
03953 " kill -9 $$\n"
03954 #else
03955 " exit $1\n"
03956 #endif
03957 "}\n");
03958 fprintf(f,
03959 "Find() {\n"
03960 " loc=''\n"
03961 " for dir in `echo $PATH | sed -e 's/:/ /g'`\n"
03962 " do\n"
03963 " test -f \"$dir/$1\" && loc=\"$dir/$1\"\n"
03964 " done\n"
03965 " if [ \"x$loc\" = x ]\n"
03966 " then\n"
03967 " Echo $1 not found in your PATH \"($PATH)\"--\n"
03968 " Echo set your path in your ~/.charmrunrc\n"
03969 " Exit 1\n"
03970 " fi\n"
03971 "}\n");
03972
03973 if (arg_verbose) fprintf(f,"Echo 'remote responding...'\n");
03974
03975 fprintf(f,"test -f \"$HOME/.charmrunrc\" && . \"$HOME/.charmrunrc\"\n");
03976
03977
03978
03979 if (arg_display && !arg_ssh_display)
03980 fprintf(f,"DISPLAY='%s';export DISPLAY\n",arg_display);
03981
03982 #ifdef HSTART
03983 if(arg_child_charmrun)
03984 fprintf(f,"NETMAGIC=\"%d\";export NETMAGIC\n",parent_charmrun_pid&0x7FFF);
03985 else
03986 #endif
03987 fprintf(f,"NETMAGIC=\"%d\";export NETMAGIC\n",getpid()&0x7FFF);
03988
03989 if (arg_mpiexec) {
03990 fprintf(f,"CmiMyNode=$OMPI_COMM_WORLD_RANK\n");
03991 fprintf(f,"test -z \"$CmiMyNode\" && CmiMyNode=$MPIRUN_RANK\n");
03992 fprintf(f,"test -z \"$CmiMyNode\" && CmiMyNode=$PMI_RANK\n");
03993 fprintf(f,"test -z \"$CmiMyNode\" && CmiMyNode=$PMI_ID\n");
03994 fprintf(f,"test -z \"$CmiMyNode\" && (Echo Could not detect rank from environment ; Exit 1)\n");
03995 fprintf(f,"export CmiMyNode\n");
03996 }
03997 #ifdef HSTART
03998 else if(arg_hierarchical_start && arg_child_charmrun)
03999 fprintf(f,"CmiMyNode='%d'; export CmiMyNode\n",mynodes_start+rank0no);
04000 #endif
04001 else
04002 fprintf(f,"CmiMyNode='%d'; export CmiMyNode\n",rank0no);
04003
04004 #ifdef HSTART
04005 if(arg_hierarchical_start && arg_child_charmrun)
04006 netstart = create_netstart(mynodes_start+rank0no);
04007 else
04008 #endif
04009 netstart = create_netstart(rank0no);
04010 fprintf(f,"NETSTART=\"%s\";export NETSTART\n",netstart);
04011
04012 fprintf(f,"CmiMyNodeSize='%d'; export CmiMyNodeSize\n",nodetab_getnodeinfo(rank0no)->cpus);
04013
04014 if (restart || arg_mpiexec)
04015 fprintf(f,"CmiMyForks='%d'; export CmiMyForks\n",0);
04016 else
04017 fprintf(f,"CmiMyForks='%d'; export CmiMyForks\n",nodetab_getnodeinfo(rank0no)->forks);
04018
04019 if (arg_mpiexec) {
04020 fprintf(f,"CmiNumNodes=$OMPI_COMM_WORLD_SIZE\n");
04021 fprintf(f,"test -z \"$CmiNumNodes\" && CmiNumNodes=$MPIRUN_NPROCS\n");
04022 fprintf(f,"test -z \"$CmiNumNodes\" && CmiNumNodes=$PMI_SIZE\n");
04023 fprintf(f,"test -z \"$CmiNumNodes\" && (Echo Could not detect node count from environment ; Exit 1)\n");
04024 fprintf(f,"export CmiNumNodes\n");
04025 }
04026 #ifdef HSTART
04027 else if(arg_hierarchical_start && arg_child_charmrun)
04028 fprintf(f,"CmiNumNodes='%d'; export CmiNumNodes\n",nodetab_rank0_size_total);
04029 #endif
04030
04031 else
04032 fprintf(f,"CmiNumNodes='%d'; export CmiNumNodes\n",nodetab_rank0_size);
04033
04034 #if CONVERSE_VERSION_VMI
04035
04036 fprintf (f, "VMI_PROCS='%d'; export VMI_PROCS\n", arg_requested_pes);
04037 fprintf (f, "VMI_KEY='charmrun%d'; export VMI_KEY\n", getpid ());
04038 fprintf (f, "VMI_SPECFILE='%s'; export VMI_SPECFILE\n", arg_vmispecfile);
04039 #endif
04040 #ifdef CMK_G95
04041 fprintf(f,"G95_UNBUFFERED_ALL=TRUE; export G95_UNBUFFERED_ALL\n");
04042 #endif
04043 #ifdef CMK_GFORTRAN
04044 fprintf(f,"GFORTRAN_UNBUFFERED_ALL=YES; export GFORTRAN_UNBUFFERED_ALL\n");
04045 #endif
04046 #if CMK_USE_MX
04047 fprintf(f,"MX_MONOTHREAD=1; export MX_MONOTHREAD\n");
04048
04049 #endif
04050 #if CMK_AIX && CMK_SMP
04051 fprintf(f,"MALLOCMULTIHEAP=1; export MALLOCMULTIHEAP\n");
04052 #endif
04053
04054 if (arg_verbose) {
04055 printf("Charmrun> Sending \"%s\" to client %d.\n", netstart, rank0no);
04056 }
04057 fprintf(f,"PATH=\"$PATH:/bin:/usr/bin:/usr/X/bin:/usr/X11/bin:/usr/local/bin:"
04058 "/usr/X11R6/bin:/usr/openwin/bin\"\n");
04059
04060
04061 arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(nodeno), nodetab_ext(nodeno));
04062
04063
04064 arg_currdir_r = pathfix(arg_currdir_a, nodetab_pathfixes(nodeno));
04065
04066 if (arg_verbose) {
04067 printf("Charmrun> find the node program \"%s\" at \"%s\" for %d.\n", arg_nodeprog_r, arg_currdir_r, nodeno);
04068 }
04069 if (arg_debug || arg_debug_no_pause || arg_in_xterm) {
04070 rsh_Find(f,nodetab_xterm(nodeno),"F_XTERM");
04071 if(!arg_ssh_display && !arg_debug_no_xrdb)
04072 rsh_Find(f,"xrdb","F_XRDB");
04073 if(arg_verbose) fprintf(f,"Echo 'using xterm' $F_XTERM\n");
04074 }
04075
04076 if (arg_debug || arg_debug_no_pause)
04077 {
04078 rsh_Find(f,dbg,"F_DBG");
04079 if (arg_verbose) fprintf(f,"Echo 'using debugger' $F_DBG\n");
04080 }
04081
04082 if (!arg_ssh_display && !arg_debug_no_xrdb &&
04083 (arg_debug || arg_debug_no_pause || arg_in_xterm)) {
04084
04085 fprintf(f,"$F_XRDB -query > /dev/null\n");
04086 fprintf(f,"if test $? != 0\nthen\n");
04087 fprintf(f," Echo 'Cannot contact X Server '$DISPLAY'. You probably'\n");
04088 fprintf(f," Echo 'need to run xhost to authorize connections.'\n");
04089 fprintf(f," Echo '(See manual for xhost for security issues)'\n");
04090 fprintf(f," Echo 'Or try ++batch 1 ++ssh-display to rely on SSH X11 forwarding'\n");
04091 fprintf(f," Exit 1\n");
04092 fprintf(f,"fi\n");
04093 }
04094
04095 fprintf(f,"if test ! -x \"%s\"\nthen\n",arg_nodeprog_r);
04096 fprintf(f," Echo 'Cannot locate this node-program: %s'\n",arg_nodeprog_r);
04097 fprintf(f," Exit 1\n");
04098 fprintf(f,"fi\n");
04099
04100 fprintf(f,"cd \"%s\"\n",arg_currdir_r);
04101 fprintf(f,"if test $? = 1\nthen\n");
04102 fprintf(f," Echo 'Cannot propagate this current directory:'\n");
04103 fprintf(f," Echo '%s'\n",arg_currdir_r);
04104 fprintf(f," Exit 1\n");
04105 fprintf(f,"fi\n");
04106
04107 if (strcmp(nodetab_setup(nodeno),"*")) {
04108 fprintf(f,"%s\n",nodetab_setup(nodeno));
04109 fprintf(f,"if test $? = 1\nthen\n");
04110 fprintf(f," Echo 'this initialization command failed:'\n");
04111 fprintf(f," Echo '\"%s\"'\n",nodetab_setup(nodeno));
04112 fprintf(f," Echo 'edit your nodes file to fix it.'\n");
04113 fprintf(f," Exit 1\n");
04114 fprintf(f,"fi\n");
04115 }
04116
04117 fprintf(f,"rm -f /tmp/charmrun_err.$$\n");
04118 if(arg_verbose) fprintf(f,"Echo 'starting node-program...'\n");
04119
04120 fprintf(f,"(");
04121
04122 if (arg_debug || arg_debug_no_pause ) {
04123 if ( strcmp(dbg, "gdb") == 0 || strcmp(dbg, "idb") == 0 ) {
04124 fprintf(f,"cat > /tmp/charmrun_gdb.$$ << END_OF_SCRIPT\n");
04125 if ( strcmp(dbg, "idb") == 0 ) {
04126 fprintf(f,"set \\$cmdset=\"gdb\"\n");
04127 }
04128 fprintf(f,"shell /bin/rm -f /tmp/charmrun_gdb.$$\n");
04129 fprintf(f,"handle SIGPIPE nostop noprint\n");
04130 fprintf(f,"handle SIGWINCH nostop noprint\n");
04131 fprintf(f,"handle SIGWAITING nostop noprint\n");
04132 if(arg_debug_commands)
04133 fprintf(f,"%s\n", arg_debug_commands);
04134 fprintf(f,"set args");
04135 fprint_arg(f,argv);
04136 fprintf(f,"\n");
04137 if (arg_debug_no_pause) fprintf(f,"run\n");
04138 fprintf(f,"END_OF_SCRIPT\n");
04139 if (arg_runscript)
04140 fprintf(f,"\"%s\" ",arg_runscript);
04141 fprintf(f,"$F_XTERM");
04142 fprintf(f," -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
04143 if ( strcmp(dbg, "idb") == 0 )
04144 fprintf(f," -e $F_DBG %s -c /tmp/charmrun_gdb.$$ \n", arg_nodeprog_r);
04145 else
04146 fprintf(f," -e $F_DBG %s -x /tmp/charmrun_gdb.$$ \n", arg_nodeprog_r);
04147 } else if ( strcmp(dbg, "dbx") == 0 ) {
04148 fprintf(f,"cat > /tmp/charmrun_dbx.$$ << END_OF_SCRIPT\n");
04149 fprintf(f,"sh /bin/rm -f /tmp/charmrun_dbx.$$\n");
04150 fprintf(f,"dbxenv suppress_startup_message 5.0\n");
04151 fprintf(f,"ignore SIGPOLL\n");
04152 fprintf(f,"ignore SIGPIPE\n");
04153 fprintf(f,"ignore SIGWINCH\n");
04154 fprintf(f,"ignore SIGWAITING\n");
04155 if(arg_debug_commands)
04156 fprintf(f,"%s\n", arg_debug_commands);
04157 fprintf(f,"END_OF_SCRIPT\n");
04158 if (arg_runscript)
04159 fprintf(f,"\"%s\" ",arg_runscript);
04160 fprintf(f,"$F_XTERM");
04161 fprintf(f," -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
04162 fprintf(f," -e $F_DBG %s ",arg_debug_no_pause?"-r":"");
04163 if(arg_debug) {
04164 fprintf(f,"-c \'runargs ");
04165 fprint_arg(f,argv);
04166 fprintf(f,"\' ");
04167 }
04168 fprintf(f, "-s/tmp/charmrun_dbx.$$ %s",arg_nodeprog_r);
04169 if(arg_debug_no_pause)
04170 fprint_arg(f,argv);
04171 fprintf(f,"\n");
04172 } else {
04173 fprintf(stderr, "Unknown debugger: %s.\n Exiting.\n",
04174 nodetab_debugger(nodeno));
04175 }
04176 } else if (arg_in_xterm) {
04177 if(arg_verbose)
04178 fprintf(stderr, "Charmrun> node %d: xterm is %s\n",
04179 nodeno, nodetab_xterm(nodeno));
04180 fprintf(f,"cat > /tmp/charmrun_inx.$$ << END_OF_SCRIPT\n");
04181 fprintf(f,"#!/bin/sh\n");
04182 fprintf(f,"/bin/rm -f /tmp/charmrun_inx.$$\n");
04183 fprintf(f,"%s", arg_nodeprog_r);
04184 fprint_arg(f,argv);
04185 fprintf(f,"\n");
04186 fprintf(f,"echo 'program exited with code '\\$?\n");
04187 fprintf(f,"read eoln\n");
04188 fprintf(f,"END_OF_SCRIPT\n");
04189 fprintf(f,"chmod 700 /tmp/charmrun_inx.$$\n");
04190 if (arg_runscript)
04191 fprintf(f,"\"%s\" ",arg_runscript);
04192 fprintf(f,"$F_XTERM -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
04193 fprintf(f," -sl 5000");
04194 fprintf(f," -e /tmp/charmrun_inx.$$\n");
04195 } else {
04196 if (arg_runscript)
04197 fprintf(f,"\"%s\" ",arg_runscript);
04198 if (arg_no_va_rand) {
04199 if(arg_verbose) fprintf(stderr, "Charmrun> setarch -R is used.\n");
04200 fprintf(f,"setarch `uname -m` -R ");
04201 }
04202 fprintf(f,"\"%s\" ",arg_nodeprog_r);
04203 fprint_arg(f,argv);
04204 if (nodetab_nice(nodeno) != -100) {
04205 if(arg_verbose) fprintf(stderr, "Charmrun> nice -n %d\n", nodetab_nice(nodeno));
04206 fprintf(f," +nice %d ",nodetab_nice(nodeno));
04207 }
04208 fprintf(f,"\nres=$?\n");
04209
04210
04211
04212
04213 fprintf(f,
04214 "if [ $res -eq 127 ]\n"
04215 "then\n"
04216 " ( \n"
04217 " \"%s\" \n"
04218 " ldd \"%s\"\n"
04219 " ) > /tmp/charmrun_err.$$ 2>&1 \n"
04220 "fi\n",arg_nodeprog_r,arg_nodeprog_r);
04221 }
04222
04223
04224
04225
04226
04227 fprintf(f,")");
04228 fprintf(f," < /dev/null 1> /dev/null 2> /dev/null");
04229 if (!arg_mpiexec)
04230 fprintf(f, " &");
04231 fprintf(f, "\n");
04232
04233 if (arg_verbose) fprintf(f,"Echo 'rsh phase successful.'\n");
04234 fprintf(f,
04235 "sleep 1\n"
04236 "if [ -r /tmp/charmrun_err.$$ ]\n"
04237 "then\n"
04238 " cat /tmp/charmrun_err.$$ \n"
04239 " rm -f /tmp/charmrun_err.$$ \n"
04240 " Exit 1\n"
04241 "fi\n");
04242 fprintf(f,"Exit 0\n");
04243 }
04244
04245
04246
04247
04248 void read_global_segments_size() {
04249 char **rshargv;
04250 int childPid;
04251
04252
04253 arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(0), nodetab_ext(0));
04254
04255 rshargv = (char **)malloc(sizeof(char *)*6);
04256 rshargv[0]=nodetab_shell(0);
04257 rshargv[1]=nodetab_name(0);
04258 rshargv[2]="-l";
04259 rshargv[3]=nodetab_login(0);
04260 rshargv[4] = (char *)malloc(sizeof(char)*9+strlen(arg_nodeprog_r));
04261 sprintf(rshargv[4],"size -A %s",arg_nodeprog_r);
04262 rshargv[5]=0;
04263
04264 childPid = fork();
04265 if (childPid < 0) {
04266 perror("ERROR> getting the size of the global variables segments"); exit(1);
04267 } else if (childPid == 0) {
04268
04269 dup2(2, 1);
04270
04271 execvp(rshargv[0], rshargv);
04272 fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]);
04273 exit(1);
04274 } else {
04275
04276 free(rshargv[4]);
04277 free(rshargv);
04278 waitpid(childPid, NULL, 0);
04279 }
04280 }
04281
04282
04283 void open_gdb_info() {
04284 char **rshargv;
04285 int fdin[2];
04286 int fdout[2];
04287 int fderr[2];
04288 int i;
04289
04290
04291 arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(0), nodetab_ext(0));
04292
04293 rshargv = (char **)malloc(sizeof(char *)*6);
04294 rshargv[0]=nodetab_shell(0);
04295 rshargv[1]=nodetab_name(0);
04296 rshargv[2]="-l";
04297 rshargv[3]=nodetab_login(0);
04298 rshargv[4] = (char *)malloc(sizeof(char)*8+strlen(arg_nodeprog_r));
04299 sprintf(rshargv[4],"gdb -q %s",arg_nodeprog_r);
04300 rshargv[5]=0;
04301
04302 pipe(fdin);
04303 pipe(fdout);
04304 pipe(fderr);
04305
04306 gdb_info_pid = fork();
04307 if (gdb_info_pid < 0) {
04308 perror("ERROR> starting info gdb"); exit(1);
04309 } else if (gdb_info_pid == 0) {
04310
04311 close(fdin[1]);
04312 close(fdout[0]);
04313 close(fderr[0]);
04314 printf("executing: \"%s\" \"%s\" \"%s\" \"%s\" \"%s\"\n",rshargv[0],rshargv[1],rshargv[2],rshargv[3],rshargv[4]);
04315 dup2(fdin[0],0);
04316 dup2(fdout[1],1);
04317 dup2(fderr[1],2);
04318 for(i=3; i<1024; i++) close(i);
04319 execvp(rshargv[0], rshargv);
04320 fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]);
04321 exit(1);
04322 }
04323
04324 free(rshargv[4]);
04325 free(rshargv);
04326 gdb_info_std[0] = fdin[1];
04327 gdb_info_std[1] = fdout[0];
04328 gdb_info_std[2] = fderr[0];
04329 close(fdin[0]);
04330 close(fdout[1]);
04331 close(fderr[1]);
04332 }
04333 #ifdef HSTART
04334 void start_next_level_charmruns()
04335 {
04336
04337 static char buf[1024];
04338 char * nodeprog_name = strrchr(arg_nodeprog_a, '/');
04339 nodeprog_name[0] = 0;
04340 sprintf(buf,"%s%s%s",arg_nodeprog_a,DIRSEP,"charmrun");
04341 arg_nodeprog_a = strdup(buf);
04342
04343 int client;
04344 int nextIndex =0;
04345 client=0;
04346 while(nextIndex<branchfactor){
04347
04348 int rank0no = nodetab_unique_table[client];
04349 int pe=nodetab_rank0_table[rank0no];
04350 FILE *f;
04351 char startScript[200];
04352 sprintf(startScript,"/tmp/charmrun.%d.%d",getpid(),pe);
04353 f=fopen(startScript,"w");
04354 if (f==NULL) {
04355
04356 sprintf(startScript,"charmrun.%d.%d",getpid(),pe);
04357 f=fopen(startScript,"w");
04358 if (f==NULL) {
04359 fprintf(stderr,"Charmrun> Can not write file %s!\n", startScript);
04360 exit(1);
04361 }
04362 }
04363 rsh_script(f,pe,client,arg_argv,0);
04364 fclose(f);
04365 if (!rsh_pids)
04366 rsh_pids=(int *)malloc(sizeof(int)*branchfactor);
04367 rsh_pids[nextIndex++] = rsh_fork(pe,startScript);
04368 client += nodes_per_child;
04369
04370 }
04371 }
04372 #endif
04373
04374
04375 void start_one_node_rsh(int rank0no)
04376 {
04377 int pe=nodetab_rank0_table[rank0no];
04378 FILE *f;
04379 char startScript[200];
04380 sprintf(startScript,"/tmp/charmrun.%d.%d",getpid(),pe);
04381 f=fopen(startScript,"w");
04382 if (f==NULL) {
04383
04384 sprintf(startScript,"charmrun.%d.%d",getpid(),pe);
04385 f=fopen(startScript,"w");
04386 if (f==NULL) {
04387 fprintf(stderr,"Charmrun> Can not write file %s!\n", startScript);
04388 exit(1);
04389 }
04390 }
04391 rsh_script(f,pe,rank0no,arg_argv,0);
04392 fclose(f);
04393 if (!rsh_pids)
04394 rsh_pids=(int *)malloc(sizeof(int)*nodetab_rank0_size);
04395 rsh_pids[rank0no] = rsh_fork(pe,startScript);
04396 }
04397
04398 int start_set_node_rsh(int client) {
04399
04400 int clientgroup;
04401 #if CMK_SMP || defined(_WIN32)
04402 clientgroup=client+1;
04403 #else
04404
04405 #ifdef HSTART
04406 if(!arg_scalable_start && !arg_hierarchical_start)
04407 clientgroup=client+1;
04408 else {
04409 clientgroup=client;
04410 do {
04411 clientgroup++;
04412 if(clientgroup>=nodetab_rank0_size)
04413 break;
04414 if(arg_scalable_start&&!arg_hierarchical_start)
04415 if(strcmp(nodetab_name(clientgroup),nodetab_name(client)))
04416 break;
04417
04418 if(strcmp(nodetab_name(nodetab_rank0_table[clientgroup]),nodetab_name(nodetab_rank0_table[client])))
04419 break;
04420 }
04421 while(1);
04422 }
04423
04424 #else
04425 if(!arg_scalable_start)
04426 clientgroup=client+1;
04427 else {
04428 clientgroup=client;
04429 do {
04430 clientgroup++;
04431 }while(clientgroup<nodetab_rank0_size&&(!strcmp(nodetab_name(clientgroup),nodetab_name(client))));
04432 }
04433 #endif
04434
04435 #endif
04436 nodetab_getnodeinfo(client)->forks=clientgroup-client-1;
04437 start_one_node_rsh(client);
04438 return clientgroup-client;
04439 }
04440
04441 void start_nodes_rsh()
04442 {
04443 int client,clientgroup;
04444 rsh_pids=(int *)malloc(sizeof(int)*nodetab_rank0_size);
04445
04446 if (arg_verbose) printf("start_nodes_rsh\n");
04447 client=0;
04448 while(client<nodetab_rank0_size) {
04449
04450 clientgroup=start_set_node_rsh(client);
04451 client+=clientgroup;
04452 }
04453 }
04454
04455
04456
04457 int rsh_fork_one(const char *startScript)
04458 {
04459 char **rshargv;
04460 int pid;
04461 int num=0;
04462 char npes[128];
04463 char *s, *e;
04464
04465
04466 s=nodetab_shell(0); e=skipstuff(s);
04467 while (*s) {
04468 num++;
04469 s = skipblanks(e); e = skipstuff(s);
04470 }
04471 rshargv = (char **)malloc(sizeof(char *)*(num+8));
04472
04473 num = 0;
04474 s=nodetab_shell(0); e=skipstuff(s);
04475 while (*s) {
04476 rshargv[num++]=substr(s, e);
04477 s = skipblanks(e); e = skipstuff(s);
04478 }
04479
04480 rshargv[num++]="-n";
04481 sprintf(npes, "%d", nodetab_rank0_size);
04482 rshargv[num++]=npes;
04483 rshargv[num++]=(char*)startScript;
04484 rshargv[num++]=0;
04485 if (arg_verbose) printf("Charmrun> Starting %s %s \n", nodetab_shell(0), startScript);
04486
04487 pid = fork();
04488 if (pid < 0)
04489 { perror("ERROR> starting mpiexec"); exit(1); }
04490 if (pid == 0)
04491 {
04492 int i;
04493
04494
04495 for(i=3; i<1024; i++) close(i);
04496 execvp(rshargv[0], rshargv);
04497 fprintf(stderr,"Charmrun> Couldn't find mpiexec program '%s'!\n",rshargv[0]);
04498 exit(1);
04499 }
04500 free(rshargv);
04501 if (arg_verbose)
04502 fprintf(stderr,"Charmrun> mpiexec started\n");
04503 return pid;
04504 }
04505
04506 void start_nodes_mpiexec()
04507 {
04508 int i;
04509
04510 FILE *f;
04511 char startScript[200];
04512 sprintf(startScript,"./charmrun.%d",getpid());
04513 f=fopen(startScript,"w");
04514 chmod(startScript, S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IROTH);
04515 if (f==NULL) {
04516
04517 sprintf(startScript,"./charmrun.%d",getpid());
04518 f=fopen(startScript,"w");
04519 if (f==NULL) {
04520 fprintf(stderr,"Charmrun> Can not write file %s!\n", startScript);
04521 exit(1);
04522 }
04523 }
04524 rsh_script(f,0,0,arg_argv,0);
04525 fclose(f);
04526 rsh_pids=(int *)malloc(sizeof(int)*nodetab_rank0_size);
04527 rsh_pids[0]=rsh_fork_one(startScript);
04528 for (i=0; i<nodetab_rank0_size; i++)
04529 rsh_pids[i] = 0;
04530 }
04531
04532 void finish_set_nodes(int start, int stop) {
04533 int status,done,i;
04534 char *host;
04535
04536 if (!rsh_pids) return;
04537
04538 done=0;
04539 while(!done) {
04540 done=1;
04541 for(i=start;i<stop;i++) {
04542 if(rsh_pids[i]!=0) {
04543 done=0;
04544 status=0;
04545 waitpid(rsh_pids[i],&status,0);
04546 if(WIFEXITED(status)) {
04547 if (!WEXITSTATUS(status)) {
04548 rsh_pids[i]=0;
04549 } else {
04550 host=nodetab_name(nodetab_rank0_table[i]);
04551 fprintf(stderr,"Charmrun> Error %d returned from rsh (%s:%d)\n",
04552 WEXITSTATUS(status),host,i);
04553 exit(1);
04554 }
04555 }
04556 }
04557 }
04558 }
04559 }
04560
04561 void finish_nodes()
04562 {
04563 #ifdef HSTART
04564 if(arg_hierarchical_start && !arg_child_charmrun)
04565 finish_set_nodes(0, branchfactor);
04566 else
04567 #endif
04568 finish_set_nodes(0,nodetab_rank0_size);
04569 free(rsh_pids);
04570 }
04571
04572 void kill_nodes()
04573 {
04574 int rank0no;
04575 if (!rsh_pids) return;
04576
04577 for (rank0no=0;rank0no<nodetab_rank0_size;rank0no++)
04578 {
04579 const char *host=nodetab_name(nodetab_rank0_table[rank0no]);
04580 int status=0;
04581 if (arg_verbose) printf("Charmrun> waiting for rsh (%s:%d), pid %d\n",
04582 host,rank0no,rsh_pids[rank0no]);
04583 kill(rsh_pids[rank0no],9);
04584 waitpid(rsh_pids[rank0no],&status,0);
04585 }
04586 free(rsh_pids);
04587 }
04588
04589
04590
04591 void start_nodes_local(char ** env)
04592 {
04593 char **envp;
04594 int envc, rank0no, i;
04595 int extra = 0;
04596
04597 #if CMK_AIX && CMK_SMP
04598 extra = 1;
04599 #endif
04600
04601
04602 for (envc=0; env[envc]; envc++);
04603 envp = (char **)malloc((envc+2+extra+1)*sizeof(void *));
04604 for (i=0; i<envc; i++) envp[i] = env[i];
04605 envp[envc] = (char *)malloc(256);
04606 envp[envc+1] = (char *)malloc(256);
04607 #if CMK_AIX && CMK_SMP
04608 envp[envc+2] = (char *)malloc(256);
04609 sprintf(envp[envc+2], "MALLOCMULTIHEAP=1");
04610 #endif
04611 envp[envc+2+extra] = 0;
04612
04613 for (rank0no=0;rank0no<nodetab_rank0_size;rank0no++)
04614 {
04615 int status = 0;
04616 int pid;
04617 int pe=nodetab_rank0_table[rank0no];
04618
04619 if (arg_verbose)
04620 printf("Charmrun> start %d node program on localhost.\n", pe);
04621 sprintf(envp[envc], "NETSTART=%s", create_netstart(rank0no));
04622 sprintf(envp[envc+1],"CmiNumNodes=%d", nodetab_rank0_size);
04623 pid = 0;
04624 pid = fork();
04625 if (pid < 0) exit(1);
04626 if (pid == 0)
04627 {
04628 int fd, fd1 = dup(1);
04629 if (-1!=(fd = open("/dev/null", O_RDWR))) {
04630 dup2(fd, 0); dup2(fd, 1); dup2(fd, 2);
04631 }
04632 status = execve(pparam_argv[1], pparam_argv+1, envp);
04633 dup2(fd1, 1);
04634 printf("execve failed to start process \"%s\" with status: %d\n", pparam_argv[1], status);
04635 kill(getppid(), 9);
04636 exit(1);
04637 }
04638 }
04639 free(envp[envc]);
04640 free(envp[envc+1]);
04641 #if CMK_AIX && CMK_SMP
04642 free(envp[envc+2]);
04643 #endif
04644 free(envp);
04645 }
04646
04647 #ifdef __FAULT__
04648
04649 int current_restart_phase = 1;
04650
04651 void refill_nodetab_entry(int crashed_node);
04652 nodetab_host *replacement_host(int pe);
04653
04657 void restart_node(int crashed_node){
04658 int pe = nodetab_rank0_table[crashed_node];
04659 FILE *f;
04660 char startScript[200];
04661 int restart_rsh_pid;
04662 char **restart_argv;
04663 int status=0;
04664 char phase_str[10];
04665 int i;
04667 sprintf(startScript,"/tmp/charmrun.%d.%d",getpid(),pe);
04668 f=fopen(startScript,"w");
04669
04673 i=0;
04674 while(arg_argv[i]!= NULL){
04675 i++;
04676 }
04677 restart_argv = (char **)malloc(sizeof(char *)*(i+4));
04678 i=0;
04679 while(arg_argv[i]!= NULL){
04680 restart_argv[i] = arg_argv[i];
04681 i++;
04682 }
04683 restart_argv[i] = "+restartaftercrash";
04684 sprintf(phase_str,"%d", ++current_restart_phase);
04685 restart_argv[i+1]=phase_str;
04686 restart_argv[i+2] = "+restartisomalloc";
04687 restart_argv[i+3]=NULL;
04688
04689 rsh_script(f,pe,crashed_node,restart_argv,1);
04690 fclose(f);
04693 refill_nodetab_entry(crashed_node);
04695 restart_rsh_pid =rsh_fork(pe,startScript);
04697 status=0;
04698 if (arg_debug_no_pause || arg_debug) ;
04699 else {
04700 do{
04701 waitpid(restart_rsh_pid,&status,0);
04702 }while(!WIFEXITED(status));
04703 if (WEXITSTATUS(status)!=0){
04704 fprintf(stderr,"Charmrun> Error %d returned from new attempted rsh \n",
04705 WEXITSTATUS(status));
04706 exit(1);
04707 }
04708 }
04709 printf("Charmrun finished launching new process in %fs\n", GetClock()-ftTimer);
04710 }
04711
04712
04713
04714 void refill_nodetab_entry(int crashed_node){
04715 int pe = nodetab_rank0_table[crashed_node];
04716 nodetab_host *h = nodetab_table[pe];
04717 *h = *(replacement_host(pe));
04718 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
04719 fprintf(stderr,"Charmrun>>> New pe %d is on host %s \n",pe,nodetab_name(pe));
04720 #endif
04721 }
04722
04723 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
04724 nodetab_host *replacement_host(int pe){
04725 int x=loaded_max_pe+1;
04726
04727 x = x%arg_read_pes;
04728 loaded_max_pe +=1;
04729
04730
04731
04732 fprintf(stderr,"Charmrun>>> replacing pe %d with %d host %s with %s \n",pe,x,nodetab_name(pe),nodetab_name(x));
04733 return nodetab_table[x];
04734 }
04735 #else
04736 nodetab_host *replacement_host(int pe){
04737 int x=pe;
04738 while(x == pe){
04739 #ifdef HSTART
04740 if(arg_hierarchical_start){
04741 x = nodetab_rank0_table[rand()%nodetab_rank0_size];
04742 crashed_pe_id = pe;
04743 restarted_pe_id = x;
04744 }
04745 else
04746 #endif
04747 x = rand()%nodetab_size;
04748 }
04749 return nodetab_table[x];
04750 }
04751 #endif
04752
04758 void reconnect_crashed_client(int socket_index,int crashed_node){
04759 int i;
04760 unsigned int clientPort;
04761 skt_ip_t clientIP;
04762 ChSingleNodeinfo *in;
04763 if(0==skt_select1(server_fd,arg_timeout*1000)){
04764 client_connect_problem(socket_index,"Timeout waiting forrestarted node-program to connect");
04765 }
04766 req_clients[socket_index] = skt_accept(server_fd,&clientIP,&clientPort);
04767 if(req_clients[socket_index] == SOCKET_ERROR){
04768 client_connect_problem(socket_index,"Failure in restarted node accept");
04769 }else{
04770 ChMessage msg;
04771 if(!skt_select1(req_clients[socket_index],arg_timeout*1000)){
04772 client_connect_problem(socket_index,"Timeout on IP request for restarted processor");
04773 }
04774
04775 #ifdef HSTART
04776 if(arg_hierarchical_start){
04777 req_forward_root(req_clients[socket_index]);
04778 if (_last_crash != 0) {
04779 fprintf(stderr, "ERROR> Charmrun detected multiple crashes.\n");
04780 exit(1);
04781 }
04782
04783 _last_crash = crashed_node;
04784 _crash_socket_index = socket_index;
04785 return;
04786 }
04787 #endif
04788 ChMessage_recv(req_clients[socket_index],&msg);
04789 if(msg.len != sizeof(ChSingleNodeinfo)){
04790 fprintf(stderr,"Charmrun: Bad initnode data length. Aborting\n");
04791 fprintf(stderr,"Charmrun: possibly because: %s.\n", msg.data);
04792 }
04793 fprintf(stdout,"socket_index %d crashed_node %d reconnected fd %d \n",socket_index,crashed_node,req_clients[socket_index]);
04794
04797 in = (ChSingleNodeinfo *)msg.data;
04798 nodeinfo_add(in,req_clients[socket_index]);
04799 for(i=0;i<req_nClients;i++){
04800 if(i != socket_index){
04801 req_handle_initnodetab(NULL,req_clients[i]);
04802 }
04803 }
04804
04805
04806 announce_crash(socket_index,crashed_node);
04807 if (_last_crash != 0) {
04808 fprintf(stderr, "ERROR> Charmrun detected multiple crashes.\n");
04809 exit(1);
04810 }
04811 _last_crash = crashed_node;
04812 _crash_socket_index = socket_index;
04813
04814
04815
04816
04817
04818 ChMessage_free(&msg);
04819 }
04820 }
04821
04826 void announce_crash(int socket_index,int crashed_node){
04827 int i;
04828 ChMessageHeader hdr;
04829 ChMessageInt_t crashNo=ChMessageInt_new(crashed_node);
04830 ChMessageHeader_new("crashnode",sizeof(ChMessageInt_t),&hdr);
04831 for(i=0;i<req_nClients;i++){
04832 if(i != socket_index){
04833 skt_sendN(req_clients[i],(const char *)&hdr,sizeof(hdr));
04834 skt_sendN(req_clients[i],(const char *)&crashNo,sizeof(ChMessageInt_t));
04835 }
04836 }
04837 }
04838
04839 #endif
04840
04841 #endif
04842
04843
04844
04845