| [22755] | 1 | Index: ../trunk-jpl/src/m/classes/clusters/stallo.py | 
|---|
|  | 2 | =================================================================== | 
|---|
|  | 3 | --- ../trunk-jpl/src/m/classes/clusters/stallo.py       (nonexistent) | 
|---|
|  | 4 | +++ ../trunk-jpl/src/m/classes/clusters/stallo.py       (revision 21987) | 
|---|
|  | 5 | @@ -0,0 +1,171 @@ | 
|---|
|  | 6 | +import subprocess | 
|---|
|  | 7 | +from fielddisplay import fielddisplay | 
|---|
|  | 8 | +from pairoptions import pairoptions | 
|---|
|  | 9 | +from issmssh import issmssh | 
|---|
|  | 10 | +from issmscpin import issmscpin | 
|---|
|  | 11 | +from issmscpout import issmscpout | 
|---|
|  | 12 | +from QueueRequirements import QueueRequirements | 
|---|
|  | 13 | +import datetime | 
|---|
|  | 14 | +try: | 
|---|
|  | 15 | +       from stallo_settings import stallo_settings | 
|---|
|  | 16 | +except ImportError: | 
|---|
|  | 17 | +       print('You need stallo_settings.py to proceed, check presence and sys.path') | 
|---|
|  | 18 | + | 
|---|
|  | 19 | +class stallo(object): | 
|---|
|  | 20 | +       """ | 
|---|
|  | 21 | +       Stallo cluster class definition | 
|---|
|  | 22 | +       This is a SLURM queue | 
|---|
|  | 23 | +       The priorities are given to: | 
|---|
|  | 24 | +          - Large jobs | 
|---|
|  | 25 | +          - Short jobs | 
|---|
|  | 26 | +          - small number of jobs per user | 
|---|
|  | 27 | + | 
|---|
|  | 28 | +       There are some 20cpu nodes and 16cpu nodes, with 32GB (a few with 128GB) mem per node; you can ask for part of a node if you need more memory (1 node, 2 CPUs and 10GB per CPU for example). | 
|---|
|  | 29 | +       time is the walltime in minutes, mem the memory per CPU in GB and np=numnodes*cpuspernode. | 
|---|
|  | 30 | + | 
|---|
|  | 31 | +          Usage: | 
|---|
|  | 32 | +             cluster=stallo(); | 
|---|
|  | 33 | +       """ | 
|---|
|  | 34 | + | 
|---|
|  | 35 | +       def __init__(self,*args): | 
|---|
|  | 36 | +       # {{{ | 
|---|
|  | 37 | +               #default values (the user settings and options applied below may change them) | 
|---|
|  | 38 | +               self.name           = 'stallo' | 
|---|
|  | 39 | +               self.login          = '' | 
|---|
|  | 40 | +               self.numnodes       = 2 | 
|---|
|  | 41 | +               self.cpuspernode    = 20 | 
|---|
|  | 42 | +               self.mem            = 1.6 | 
|---|
|  | 43 | +               self.queue          = 'normal' | 
|---|
|  | 44 | +               self.time           = 2*60 | 
|---|
|  | 45 | +               self.codepath       = '' | 
|---|
|  | 46 | +               self.executionpath  = '' | 
|---|
|  | 47 | +               self.interactive    = 0 | 
|---|
|  | 48 | +               self.port           = [] | 
|---|
|  | 49 | +               self.accountname    = '' | 
|---|
|  | 50 | +               self.profiling      = 0 | 
|---|
|  | 51 | +               #use provided options to change fields | 
|---|
|  | 52 | +               options=pairoptions(*args) | 
|---|
|  | 53 | +               #initialize cluster using user settings if provided | 
|---|
|  | 54 | +               self=stallo_settings(self) | 
|---|
|  | 55 | +               #OK get other fields | 
|---|
|  | 56 | +               self=options.AssignObjectFields(self) | 
|---|
|  | 57 | +               self.np=self.numnodes*self.cpuspernode #total number of processes | 
|---|
|  | 58 | +       # }}} | 
|---|
|  | 59 | + | 
|---|
|  | 60 | +       def __repr__(self): | 
|---|
|  | 61 | +       # {{{ | 
|---|
|  | 62 | +               #  display the object, one fielddisplay line per field | 
|---|
|  | 63 | +               s = "class stallo object:" | 
|---|
|  | 64 | +               s = "%s\n%s"%(s,fielddisplay(self,'name','name of the cluster')) | 
|---|
|  | 65 | +               s = "%s\n%s"%(s,fielddisplay(self,'login','login')) | 
|---|
|  | 66 | +               s = "%s\n%s"%(s,fielddisplay(self,'numnodes','number of nodes')) | 
|---|
|  | 67 | +               s = "%s\n%s"%(s,fielddisplay(self,'cpuspernode','number of CPUs per node')) | 
|---|
|  | 68 | +               s = "%s\n%s"%(s,fielddisplay(self,'mem','memory per CPU')) | 
|---|
|  | 69 | +               s = "%s\n%s"%(s,fielddisplay(self,'queue','name of the queue (normal (D), short,singlenode,multinode,devel)')) | 
|---|
|  | 70 | +               s = "%s\n%s"%(s,fielddisplay(self,'time','walltime requested in minutes')) | 
|---|
|  | 71 | +               s = "%s\n%s"%(s,fielddisplay(self,'codepath','code path on the cluster')) | 
|---|
|  | 72 | +               s = "%s\n%s"%(s,fielddisplay(self,'executionpath','execution path on the cluster')) | 
|---|
|  | 73 | +               s = "%s\n%s"%(s,fielddisplay(self,'interactive','')) | 
|---|
|  | 74 | +               s = "%s\n%s"%(s,fielddisplay(self,'accountname','your cluster account')) | 
|---|
|  | 75 | +               s = "%s\n%s"%(s,fielddisplay(self,'profiling','enable profiling if 1 default is 0')) | 
|---|
|  | 76 | +               return s | 
|---|
|  | 77 | +       # }}} | 
|---|
|  | 78 | +       def checkconsistency(self,md,solution,analyses): | 
|---|
|  | 79 | +       # {{{ | 
|---|
|  | 80 | +               #Queue dictionary gives queue name as key and [max walltime, max cpus] as value | 
|---|
|  | 81 | +               queuedict = {'short':[60, 2048], | 
|---|
|  | 82 | +                            'normal':[2*24*60,2048], | 
|---|
|  | 83 | +                            'singlenode':[28*24*60,20], | 
|---|
|  | 84 | +                            'multinode':[28*24*60,2048], | 
|---|
|  | 85 | +                            'devel':[4*60,2048]} | 
|---|
|  | 86 | +               QueueRequirements(queuedict,self.queue,self.np,self.time) | 
|---|
|  | 87 | + | 
|---|
|  | 88 | +               #Miscellaneous: report empty mandatory fields and unsupported modes on md | 
|---|
|  | 89 | +               if not self.login: | 
|---|
|  | 90 | +                       md = md.checkmessage('login empty') | 
|---|
|  | 91 | +               if not self.codepath: | 
|---|
|  | 92 | +                       md = md.checkmessage('codepath empty') | 
|---|
|  | 93 | +               if not self.executionpath: | 
|---|
|  | 94 | +                       md = md.checkmessage('executionpath empty') | 
|---|
|  | 95 | +               if self.interactive==1: | 
|---|
|  | 96 | +                       md = md.checkmessage('interactive mode not implemented') | 
|---|
|  | 97 | +               return self | 
|---|
|  | 98 | +               # }}} | 
|---|
|  | 99 | +       def BuildQueueScript(self,dirname,modelname,solution,io_gather,isvalgrind,isgprof,isdakota,isoceancoupling): | 
|---|
|  | 100 | +               # {{{ | 
|---|
|  | 101 | +               #write modelname.queue in the current directory: SBATCH header, module loads, then the mpirun line | 
|---|
|  | 102 | +               executable='issm.exe' | 
|---|
|  | 103 | +               if isdakota: | 
|---|
|  | 104 | +                       #NOTE(review): IssmConfig is not imported in this file, confirm it is in scope before using dakota | 
|---|
|  | 105 | +                       version=float(IssmConfig('_DAKOTA_VERSION_')[0:2]) | 
|---|
|  | 106 | +                       if version>=6: | 
|---|
|  | 107 | +                               executable='issm_dakota.exe' | 
|---|
|  | 108 | +               if isoceancoupling: | 
|---|
|  | 109 | +                       executable='issm_ocean.exe' | 
|---|
|  | 110 | +               #SLURM walltime as days-hours:minutes:seconds, self.time being in minutes | 
|---|
|  | 111 | +               days,minutes=divmod(int(self.time),24*60) | 
|---|
|  | 112 | +               hours,minutes=divmod(minutes,60) | 
|---|
|  | 113 | +               timestring='%i-%02i:%02i:00' % (days,hours,minutes) | 
|---|
|  | 114 | +               #write queuing script | 
|---|
|  | 115 | +               shortname=modelname[0:min(12,len(modelname))] | 
|---|
|  | 116 | +               with open(modelname+'.queue','w') as fid: | 
|---|
|  | 117 | +                       fid.write('#!/bin/bash -l\n') | 
|---|
|  | 118 | +                       fid.write('#SBATCH --job-name=%s \n' % shortname) | 
|---|
|  | 119 | +                       fid.write('#SBATCH --partition %s \n' % self.queue) | 
|---|
|  | 120 | +                       fid.write('#SBATCH --nodes=%i\n' % self.numnodes) | 
|---|
|  | 121 | +                       fid.write('#SBATCH --ntasks-per-node=%i\n' % self.cpuspernode) | 
|---|
|  | 122 | +                       fid.write('#SBATCH --time=%s\n' % timestring) | 
|---|
|  | 123 | +                       #mem is in GB and may be fractional: pass whole megabytes instead of truncating 1.6GB to 1GB | 
|---|
|  | 124 | +                       fid.write('#SBATCH --mem-per-cpu=%iM\n' % int(round(1000*self.mem))) | 
|---|
|  | 125 | +                       if self.np%16==0 and self.np%20==0: | 
|---|
|  | 126 | +                               fid.write('#SBATCH --ntasks=%i\n' % self.np) | 
|---|
|  | 127 | +                       fid.write('#SBATCH --account=%s\n' % self.accountname) | 
|---|
|  | 128 | +                       fid.write('#SBATCH --output %s/%s/%s.outlog \n' % (self.executionpath,dirname,modelname)) | 
|---|
|  | 129 | +                       fid.write('#SBATCH --error %s/%s/%s.errlog \n\n' % (self.executionpath,dirname,modelname)) | 
|---|
|  | 130 | +                       fid.write('export ISSM_DIR="%s/../"\n' % self.codepath) | 
|---|
|  | 131 | +                       fid.write('module load Automake/1.15-intel-2016a\n') | 
|---|
|  | 132 | +                       fid.write('module load libtool/2.4.6-intel-2016a\n') | 
|---|
|  | 133 | +                       fid.write('module load CMake/3.5.2-intel-2016a\n') | 
|---|
|  | 134 | +                       fid.write('module load intel/2016a\n') | 
|---|
|  | 135 | +                       fid.write('module load ParMETIS/4.0.3-intel-2016a\n') | 
|---|
|  | 136 | +                       fid.write('module load MUMPS/5.1.1-intel-2016a-parmetis\n') | 
|---|
|  | 137 | +                       fid.write('module load PETSc/3.7.2-intel-2016a-Python-2.7.11\n') | 
|---|
|  | 138 | +                       fid.write('module load FFTW/3.3.4-intel-2016a\n') | 
|---|
|  | 139 | +                       fid.write('module load OpenSSL/1.0.1s-intel-2016a\n') | 
|---|
|  | 140 | +                       fid.write('cd %s/%s/\n\n' % (self.executionpath,dirname)) | 
|---|
|  | 141 | +                       launcher='mpirun -np %i %s/%s %s %s/%s %s\n' % (self.np,self.codepath,executable,str(solution),self.executionpath,dirname,modelname) | 
|---|
|  | 142 | +                       if self.profiling==1: | 
|---|
|  | 143 | +                               fid.write('module load perf-report\n') | 
|---|
|  | 144 | +                               launcher='perf-report '+launcher | 
|---|
|  | 145 | +                       fid.write(launcher) | 
|---|
|  | 146 | +               # }}} | 
|---|
|  | 147 | +       def UploadQueueJob(self,modelname,dirname,filelist): | 
|---|
|  | 148 | +               # {{{ | 
|---|
|  | 149 | +               #compress the files into one tar.gz archive named after dirname | 
|---|
|  | 150 | +               #(the command goes through a shell and the names are inserted unquoted) | 
|---|
|  | 151 | +               compressstring='tar -zcf %s.tar.gz ' % dirname | 
|---|
|  | 152 | +               compressstring+=''.join(' %s' % name for name in filelist) | 
|---|
|  | 153 | +               subprocess.call(compressstring,shell=True) | 
|---|
|  | 154 | + | 
|---|
|  | 155 | +               print('uploading input file and queueing script') | 
|---|
|  | 156 | +               issmscpout(self.name,self.executionpath,self.login,self.port,[dirname+'.tar.gz']) | 
|---|
|  | 157 | + | 
|---|
|  | 158 | +               # }}} | 
|---|
|  | 159 | +       def LaunchQueueJob(self,modelname,dirname,filelist,restart,batch): | 
|---|
|  | 160 | +               # {{{ | 
|---|
|  | 161 | +               #submit the queue script with sbatch; unless restarting, first unpack the uploaded archive into a fresh dirname (filelist and batch are unused) | 
|---|
|  | 162 | +               print('launching solution sequence on remote cluster') | 
|---|
|  | 163 | +               if restart: | 
|---|
|  | 164 | +                       launchcommand='cd %s && cd %s && sbatch %s.queue' % (self.executionpath,dirname,modelname) | 
|---|
|  | 165 | +               else: | 
|---|
|  | 166 | +                       launchcommand='cd %s && rm -rf ./%s && mkdir %s && cd %s && mv ../%s.tar.gz ./ && tar -zxf %s.tar.gz  && sbatch %s.queue' % (self.executionpath,dirname,dirname,dirname,dirname,dirname,modelname) | 
|---|
|  | 167 | +               issmssh(self.name,self.login,self.port,launchcommand) | 
|---|
|  | 168 | + | 
|---|
|  | 169 | +               # }}} | 
|---|
|  | 170 | +       def Download(self,dirname,filelist): | 
|---|
|  | 171 | +               # {{{ | 
|---|
|  | 172 | + | 
|---|
|  | 173 | +               #copy the files in filelist from executionpath/dirname on the cluster to the current directory | 
|---|
|  | 174 | +               directory='%s/%s/' % (self.executionpath,dirname) | 
|---|
|  | 175 | +               issmscpin(self.name,self.login,self.port,directory,filelist) | 
|---|
|  | 176 | +               # }}} | 
|---|