source: issm/oecreview/Archive/21724-22754/ISSM-21986-21987.diff

Last change on this file was 22755, checked in by Mathieu Morlighem, 7 years ago

CHG: added 21724-22754

File size: 6.8 KB
Index: ../trunk-jpl/src/m/classes/clusters/stallo.py
===================================================================
--- ../trunk-jpl/src/m/classes/clusters/stallo.py (nonexistent)
+++ ../trunk-jpl/src/m/classes/clusters/stallo.py (revision 21987)
@@ -0,0 +1,172 @@
+import subprocess
+from fielddisplay import fielddisplay
+from pairoptions import pairoptions
+from issmssh import issmssh
+from issmscpin import issmscpin
+from issmscpout import issmscpout
+from QueueRequirements import QueueRequirements
+from IssmConfig import IssmConfig  #used below when isdakota is set
+import datetime
+try:
+    from stallo_settings import stallo_settings
+except ImportError:
+    print 'You need stallo_settings.py to proceed, check presence and sys.path'
+
+class stallo(object):
+    """
+    Stallo cluster class definition
+    This is a SLURM queue.
+    Priority is given to:
+       - large jobs
+       - short jobs
+       - users with a small number of jobs
+
+    The cluster has 20-CPU and 16-CPU nodes with 32 GB of memory per node (a few have 128 GB).
+    You can ask for part of a node if you need more memory per CPU (for example 1 node, 2 CPUs and 10 GB per CPU).
+
+    Usage:
+       cluster=stallo();
+    """
+
+    def __init__(self,*args):
+        # {{{
+        self.name = 'stallo'
+        self.login = ''
+        self.numnodes = 2
+        self.cpuspernode = 20
+        self.mem = 1.6
+        self.queue = 'normal'
+        self.time = 2*60
+        self.codepath = ''
+        self.executionpath = ''
+        self.interactive = 0
+        self.port = []
+        self.accountname = ''
+        self.profiling = 0
+        #use provided options to change fields
+        options=pairoptions(*args)
+
+        #initialize cluster using user settings if provided
+        self=stallo_settings(self)
+        #OK get other fields
+        self=options.AssignObjectFields(self)
+        self.np=self.numnodes*self.cpuspernode
+        # }}}
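
The constructor follows the usual ISSM pairoptions pattern, so any of the default fields above can be overridden as ('name',value) pairs at instantiation. A minimal sketch (the sizes and names below are placeholders, not values from this changeset):

    cluster = stallo('numnodes',4,'cpuspernode',16,'time',24*60)
    print cluster.np    # 64: numnodes*cpuspernode, computed at the end of __init__
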
+
+    def __repr__(self):
+        # {{{
+        # display the object
+        s = "class stallo object:"
+        s = "%s\n%s"%(s,fielddisplay(self,'name','name of the cluster'))
+        s = "%s\n%s"%(s,fielddisplay(self,'login','login'))
+        s = "%s\n%s"%(s,fielddisplay(self,'numnodes','number of nodes'))
+        s = "%s\n%s"%(s,fielddisplay(self,'cpuspernode','number of CPUs per node'))
+        s = "%s\n%s"%(s,fielddisplay(self,'mem','memory per CPU in GB'))
+        s = "%s\n%s"%(s,fielddisplay(self,'queue','name of the queue (normal (default), short, singlenode, multinode, devel)'))
+        s = "%s\n%s"%(s,fielddisplay(self,'time','walltime requested in minutes'))
+        s = "%s\n%s"%(s,fielddisplay(self,'codepath','code path on the cluster'))
+        s = "%s\n%s"%(s,fielddisplay(self,'executionpath','execution path on the cluster'))
+        s = "%s\n%s"%(s,fielddisplay(self,'interactive','run interactively (not implemented)'))
+        s = "%s\n%s"%(s,fielddisplay(self,'accountname','your cluster account'))
+        s = "%s\n%s"%(s,fielddisplay(self,'profiling','set to 1 to enable profiling (default is 0)'))
+        return s
+        # }}}
+    def checkconsistency(self,md,solution,analyses):
+        # {{{
+        #Queue dictionary: queue name as key, [max walltime (minutes), max CPUs] as value
+        queuedict = {'short':[60,2048],
+                     'normal':[2*24*60,2048],
+                     'singlenode':[28*24*60,20],
+                     'multinode':[28*24*60,2048],
+                     'devel':[4*60,2048]}
+        QueueRequirements(queuedict,self.queue,self.np,self.time)
+
+        #Miscellaneous
+        if not self.login:
+            md = md.checkmessage('login empty')
+        if not self.codepath:
+            md = md.checkmessage('codepath empty')
+        if not self.executionpath:
+            md = md.checkmessage('executionpath empty')
+        if self.interactive==1:
+            md = md.checkmessage('interactive mode not implemented')
+        return self
+        # }}}
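
With the class defaults (numnodes=2, cpuspernode=20, time=2*60), the request sits well inside the 'normal' queue limits above. A rough illustration of the bound QueueRequirements is expected to enforce (this inline check is a sketch, not the actual QueueRequirements code):

    np, time = 2*20, 2*60              # defaults: 40 CPUs for 120 minutes
    maxtime, maxcpus = 2*24*60, 2048   # 'normal' entry of queuedict
    assert time <= maxtime and np <= maxcpus
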
+    def BuildQueueScript(self,dirname,modelname,solution,io_gather,isvalgrind,isgprof,isdakota,isoceancoupling):
+        # {{{
+
+        executable='issm.exe'
+        if isdakota:
+            version=IssmConfig('_DAKOTA_VERSION_')[0:2]
+            version=float(version)
+            if version>=6:
+                executable='issm_dakota.exe'
+        if isoceancoupling:
+            executable='issm_ocean.exe'
+        #write queuing script
+        shortname=modelname[0:min(12,len(modelname))]
+        fid=open(modelname+'.queue','w')
+
+        fid.write('#!/bin/bash -l\n')
+        fid.write('#SBATCH --job-name=%s \n' % shortname)
+        fid.write('#SBATCH --partition %s \n' % self.queue)
+        fid.write('#SBATCH --nodes=%i\n' % self.numnodes)
+        fid.write('#SBATCH --ntasks-per-node=%i\n' % self.cpuspernode)
+        fid.write('#SBATCH --time=%s\n' % datetime.timedelta(minutes=self.time)) #walltime is in minutes
+        fid.write('#SBATCH --mem-per-cpu=%iGB\n' % self.mem) #mem is in GB
+        if (self.np%16+self.np%20)==0:
+            fid.write('#SBATCH --ntasks=%i\n' % self.np)
+        fid.write('#SBATCH --account=%s\n' % self.accountname)
+        fid.write('#SBATCH --output %s/%s/%s.outlog \n' % (self.executionpath,dirname,modelname))
+        fid.write('#SBATCH --error %s/%s/%s.errlog \n\n' % (self.executionpath,dirname,modelname))
+
+        fid.write('export ISSM_DIR="%s/../"\n' % self.codepath)
+        fid.write('module load Automake/1.15-intel-2016a\n')
+        fid.write('module load libtool/2.4.6-intel-2016a\n')
+        fid.write('module load CMake/3.5.2-intel-2016a\n')
+        fid.write('module load intel/2016a\n')
+        fid.write('module load ParMETIS/4.0.3-intel-2016a\n')
+        fid.write('module load MUMPS/5.1.1-intel-2016a-parmetis\n')
+        fid.write('module load PETSc/3.7.2-intel-2016a-Python-2.7.11\n')
+        fid.write('module load FFTW/3.3.4-intel-2016a\n')
+        fid.write('module load OpenSSL/1.0.1s-intel-2016a\n')
+
+        fid.write('cd %s/%s/\n\n' % (self.executionpath,dirname))
+        if self.profiling==1:
+            fid.write('module load perf-report\n')
+            fid.write('perf-report mpirun -np %i %s/%s %s %s/%s %s\n' % (self.np,self.codepath,executable,str(solution),self.executionpath,dirname,modelname))
+        else:
+            fid.write('mpirun -np %i %s/%s %s %s/%s %s\n' % (self.np,self.codepath,executable,str(solution),self.executionpath,dirname,modelname))
+        fid.close()
+
+        # }}}
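
For illustration, a sketch of exercising the script generation by hand; in practice solve() calls BuildQueueScript with the real arguments, and the model name, directory, account, paths and flags below are made up for this example:

    cluster = stallo('accountname','nn1234k','codepath','~/issm/trunk-jpl/bin',
                     'executionpath','~/issm/execution')
    # writes 'Mismip.queue' in the current directory: the #SBATCH header,
    # the module loads, and the final mpirun line built above
    cluster.BuildQueueScript('run001','Mismip','StressbalanceSolution',True,False,False,False,False)
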
+    def UploadQueueJob(self,modelname,dirname,filelist):
+        # {{{
+        #compress the input files into one tarball
+        compressstring='tar -zcf %s.tar.gz ' % dirname
+        for file in filelist:
+            compressstring += ' %s' % file
+        subprocess.call(compressstring,shell=True)
+
+        print 'uploading input file and queueing script'
+        issmscpout(self.name,self.executionpath,self.login,self.port,[dirname+'.tar.gz'])
+
+        # }}}
+    def LaunchQueueJob(self,modelname,dirname,filelist,restart,batch):
+        # {{{
+
+        print 'launching solution sequence on remote cluster'
+        if restart:
+            launchcommand='cd %s && cd %s && sbatch %s.queue' % (self.executionpath,dirname,modelname)
+        else:
+            launchcommand='cd %s && rm -rf ./%s && mkdir %s && cd %s && mv ../%s.tar.gz ./ && tar -zxf %s.tar.gz && sbatch %s.queue' % (self.executionpath,dirname,dirname,dirname,dirname,dirname,modelname)
+        issmssh(self.name,self.login,self.port,launchcommand)
+
+        # }}}
+    def Download(self,dirname,filelist):
+        # {{{
+
+        #copy files from the cluster to the current directory
+        directory='%s/%s/' % (self.executionpath,dirname)
+        issmscpin(self.name,self.login,self.port,directory,filelist)
+        # }}}
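
As context, the class is meant to plug into the standard ISSM remote-execution workflow, where the cluster object is attached to the model and solve() drives BuildQueueScript, UploadQueueJob, LaunchQueueJob and Download in turn. A rough sketch with placeholder login, account and paths (not part of this changeset):

    md.cluster = stallo('login','jdoe','accountname','nn1234k','numnodes',2,'cpuspernode',20,
                        'time',4*60,'codepath','~/issm/trunk-jpl/bin','executionpath','~/issm/execution')
    md = solve(md,'Stressbalance')   # builds, uploads and submits the queue script, then downloads results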