Context Navigation

ISSM-21986-21987.diff

Last change on this file was 22755, checked in by Mathieu Morlighem, 7 years ago
CHG: added 21724-22754
File size: 6.8 KB

Rev	Line
[22755]	1	Index: ../trunk-jpl/src/m/classes/clusters/stallo.py
	2	===================================================================
	3	--- ../trunk-jpl/src/m/classes/clusters/stallo.py (nonexistent)
	4	+++ ../trunk-jpl/src/m/classes/clusters/stallo.py (revision 21987)
	5	@@ -0,0 +1,171 @@
	6	+import subprocess
	7	+from fielddisplay import fielddisplay
	8	+from pairoptions import pairoptions
	9	+from issmssh import issmssh
	10	+from issmscpin import issmscpin
	11	+from issmscpout import issmscpout
	12	+from QueueRequirements import QueueRequirements
	13	+import datetime
	14	+try:
	15	+ from stallo_settings import stallo_settings
	16	+except ImportError:
	17	+ print 'You need stallo_settings.py to proceed, check presence and sys.path'
	18	+
	19	+class stallo(object):
	20	+    """
	21	+    Stallo cluster class definition
	22	+    This is a SLURM queue
	23	+    The priorities are given to:
	24	+    - Large jobs
	25	+    - Short jobs
	26	+    - Small number of jobs per user
	27	+
	28	+    There are some 20-CPU nodes and 16-CPU nodes, with 32GB (a few with 128GB) of memory per node; you can ask for part of a node if you need more memory per CPU (for example 1 node, 2 CPUs and 10GB per CPU).
	29	+
	30	+    The 'time' field is a walltime in minutes and the 'mem' field is the memory per CPU in GB.
	31	+    Usage:
	32	+    cluster=stallo();
	33	+    """
	34	+
	35	+    def __init__(self,*args):
	36	+        # {{{
	37	+        """Set the default fields, then apply stallo_settings and the field/value pairs given in args."""
	38	+        self.name = 'stallo'
	39	+        self.login = ''
	40	+        self.numnodes = 2
	41	+        self.cpuspernode = 20
	42	+        self.mem = 1.6
	43	+        self.queue = 'normal'
	44	+        self.time = 2*60
	45	+        self.codepath = ''
	46	+        self.executionpath = ''
	47	+        self.interactive = 0
	48	+        self.port = []
	49	+        self.accountname = ''
	50	+        self.profiling = 0
	51	+        #use provided options to change fields
	52	+        options=pairoptions(*args)
	53	+        #initialize cluster using user settings if provided (stallo_settings is the module imported by this file)
	54	+        self=stallo_settings(self)
	55	+        #OK get other fields
	56	+        self=options.AssignObjectFields(self)
	57	+        self.np=self.numnodes*self.cpuspernode
	58	+        # }}}
	59	+
	60	+    def __repr__(self):
	61	+        # {{{
	62	+        """Return a multi-line description of the cluster, one fielddisplay entry per field."""
	63	+        s = "class stallo object:"
	64	+        s = "%s\n%s"%(s,fielddisplay(self,'name','name of the cluster'))
	65	+        s = "%s\n%s"%(s,fielddisplay(self,'login','login'))
	66	+        s = "%s\n%s"%(s,fielddisplay(self,'numnodes','number of nodes'))
	67	+        s = "%s\n%s"%(s,fielddisplay(self,'cpuspernode','number of CPUs per node'))
	68	+        s = "%s\n%s"%(s,fielddisplay(self,'mem','memory per CPU'))
	69	+        s = "%s\n%s"%(s,fielddisplay(self,'queue','name of the queue (normal (D), short,singlenode,multinode,devel)'))
	70	+        s = "%s\n%s"%(s,fielddisplay(self,'time','walltime requested in minutes'))
	71	+        s = "%s\n%s"%(s,fielddisplay(self,'codepath','code path on the cluster'))
	72	+        s = "%s\n%s"%(s,fielddisplay(self,'executionpath','execution path on the cluster'))
	73	+        s = "%s\n%s"%(s,fielddisplay(self,'interactive',''))
	74	+        s = "%s\n%s"%(s,fielddisplay(self,'accountname','your cluster account'))
	75	+        s = "%s\n%s"%(s,fielddisplay(self,'profiling','enable profiling if 1 default is 0'))
	76	+        return s
	77	+        # }}}
	78	+    def checkconsistency(self,md,solution,analyses):
	79	+        # {{{
	80	+        """Check the queue request with QueueRequirements, report empty login/path fields through md.checkmessage, and return self."""
	81	+        #Queue dictionary gives queue name as key and [max walltime in minutes, max cpus] as value
	82	+        queuedict = {'short':[60, 2048],
	83	+                     'normal':[2*24*60,2048],
	84	+                     'singlenode':[28*24*60,20],
	85	+                     'multinode':[28*24*60,2048],
	86	+                     'devel':[4*60,2048]}
	87	+        QueueRequirements(queuedict,self.queue,self.np,self.time)
	88	+        #Miscellaneous
	89	+        if not self.login:
	90	+            md = md.checkmessage('login empty')
	91	+        if not self.codepath:
	92	+            md = md.checkmessage('codepath empty')
	93	+        if not self.executionpath:
	94	+            md = md.checkmessage('executionpath empty')
	95	+        if self.interactive==1:
	96	+            md = md.checkmessage('interactive mode not implemented')
	97	+        return self
	98	+        # }}}
	99	+    def BuildQueueScript(self,dirname,modelname,solution,io_gather,isvalgrind,isgprof,isdakota,isoceancoupling):
	100	+        # {{{
	101	+        """Write the SLURM batch script '<modelname>.queue' in the current directory; io_gather, isvalgrind and isgprof are not used here."""
	102	+        executable='issm.exe'
	103	+        if isdakota:
	104	+            from IssmConfig import IssmConfig # NOTE(review): not imported at file level; assumes the ISSM IssmConfig module is on sys.path
	105	+            version=float(IssmConfig('_DAKOTA_VERSION_')[0:2])
	106	+            if version>=6:
	107	+                executable='issm_dakota.exe'
	108	+        if isoceancoupling:
	109	+            executable='issm_ocean.exe'
	110	+        #self.time is in minutes, sbatch --time is written here as days-hours:minutes:seconds
	111	+        days,leftover=divmod(int(self.time),24*60)
	112	+        hours,minutes=divmod(leftover,60)
	113	+        timestring='%i-%02i:%02i:00' % (days,hours,minutes)
	114	+        #write queuing script
	115	+        shortname=modelname[0:12]
	116	+        with open(modelname+'.queue','w') as fid:
	117	+            fid.write('#!/bin/bash -l\n')
	118	+            fid.write('#SBATCH --job-name=%s \n' % shortname)
	119	+            fid.write('#SBATCH --partition %s \n' % self.queue)
	120	+            fid.write('#SBATCH --nodes=%i\n' % self.numnodes)
	121	+            fid.write('#SBATCH --ntasks-per-node=%i\n' % self.cpuspernode)
	122	+            fid.write('#SBATCH --time=%s\n' % timestring) #walltime is minutes
	123	+            fid.write('#SBATCH --mem-per-cpu=%iM\n' % int(round(1024*self.mem)))# mem is in GB, written in MB so that fractional values (1.6 by default) are not truncated
	124	+            if (self.np%16+self.np%20)==0:
	125	+                fid.write('#SBATCH --ntasks=%i\n' % self.np)
	126	+            fid.write('#SBATCH --account=%s\n' % self.accountname)
	127	+            fid.write('#SBATCH --output %s/%s/%s.outlog \n' % (self.executionpath,dirname,modelname))
	128	+            fid.write('#SBATCH --error %s/%s/%s.errlog \n\n' % (self.executionpath,dirname,modelname))
	129	+
	130	+            fid.write('export ISSM_DIR="%s/../"\n' % self.codepath)
	131	+            fid.write('module load Automake/1.15-intel-2016a\n')
	132	+            fid.write('module load libtool/2.4.6-intel-2016a\n')
	133	+            fid.write('module load CMake/3.5.2-intel-2016a\n')
	134	+            fid.write('module load intel/2016a\n')
	135	+            fid.write('module load ParMETIS/4.0.3-intel-2016a\n')
	136	+            fid.write('module load MUMPS/5.1.1-intel-2016a-parmetis\n')
	137	+            fid.write('module load PETSc/3.7.2-intel-2016a-Python-2.7.11\n')
	138	+            fid.write('module load FFTW/3.3.4-intel-2016a\n')
	139	+            fid.write('module load OpenSSL/1.0.1s-intel-2016a\n')
	140	+            fid.write('cd %s/%s/\n\n' % (self.executionpath,dirname))
	141	+            if self.profiling==1:
	142	+                fid.write('module load perf-report\n')
	143	+                fid.write('perf-report mpirun -np %i %s/%s %s %s/%s %s\n' % (self.np,self.codepath,executable,str(solution),self.executionpath,dirname,modelname))
	144	+            else:
	145	+                fid.write('mpirun -np %i %s/%s %s %s/%s %s\n' % (self.np,self.codepath,executable,str(solution),self.executionpath,dirname,modelname))
	146	+        # }}}
	147	+    def UploadQueueJob(self,modelname,dirname,filelist):
	148	+        # {{{
	149	+        """Pack filelist into '<dirname>.tar.gz' and copy that archive to executionpath on the cluster with issmscpout."""
	150	+        #compress the files into one gzipped tar archive
	151	+        archive=dirname+'.tar.gz'
	152	+        #pass an argument list (no shell) so file names with spaces or shell metacharacters are handled literally
	153	+        subprocess.call(['tar','-zcf',archive]+list(filelist))
	154	+
	155	+        print('uploading input file and queueing script')
	156	+        issmscpout(self.name,self.executionpath,self.login,self.port,[archive])
	157	+
	158	+        # }}}
	159	+    def LaunchQueueJob(self,modelname,dirname,filelist,restart,batch):
	160	+        # {{{
	161	+        """Run sbatch on the cluster through issmssh; unless restart is set, the run directory is recreated and the uploaded archive unpacked first."""
	162	+        print('launching solution sequence on remote cluster')
	163	+        if restart:
	164	+            launchcommand='cd %s && cd %s && sbatch %s.queue' % (self.executionpath,dirname,modelname)
	165	+        else:
	166	+            launchcommand='cd %s && rm -rf ./%s && mkdir %s && cd %s && mv ../%s.tar.gz ./ && tar -zxf %s.tar.gz && sbatch %s.queue' % (self.executionpath,dirname,dirname,dirname,dirname,dirname,modelname)
	167	+        issmssh(self.name,self.login,self.port,launchcommand)
	168	+
	169	+        # }}}
	170	+    def Download(self,dirname,filelist):
	171	+        # {{{
	172	+        """Copy the files in filelist from '<executionpath>/<dirname>/' on the cluster with issmscpin."""
	173	+        #copy files from cluster to current directory
	174	+        directory='%s/%s/' % (self.executionpath,dirname)
	175	+        issmscpin(self.name,self.login,self.port,directory,filelist)
	176	+        # }}}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: issm/oecreview/Archive/21724-22754/ISSM-21986-21987.diff

Download in other formats: