source: issm/trunk-jpl/src/m/classes/clusters/hexagon.py@ 21425

Last change on this file since 21425 was 21425, checked in by aplach, 8 years ago

hexagon.py+vilje.py: fixed issue with walltime requests greater than 24h; timestring=str(datetime.timedelta(minutes=self.time)) does not provide the right format of HH:MM:SS for requests greater than 24h

File size: 5.6 KB
RevLine 
[20784]1import subprocess
2from fielddisplay import fielddisplay
3from pairoptions import pairoptions
4from issmssh import issmssh
5from issmscpin import issmscpin
6from issmscpout import issmscpout
7from QueueRequirements import QueueRequirements
8import datetime
9try:
10 from hexagon_settings import hexagon_settings
11except ImportError:
12 print 'You need hexagon_settings.py to proceed, check presence and sys.path'
13
class hexagon(object):
    """
    Hexagon cluster class definition.

    Hexagon nodes are built of 2*16 CPUs. A node is dedicated to a single job,
    so the best usage is 32 procs per node, as that is what is billed anyway.
    Reduce this number if you run out of memory: the total node memory is
    divided by the number of procs.

    Usage:
        cluster=hexagon();
    """

    def __init__(self, *args):
        # {{{
        """Set default field values, then apply user settings and the
        field/value options passed in *args (handled by pairoptions)."""
        self.name = 'hexagon'
        self.login = ''
        self.numnodes = 2
        self.procspernodes = 32
        self.mem = 32000
        self.queue = 'batch'
        self.time = 2*60
        self.codepath = ''
        self.executionpath = ''
        self.interactive = 0
        self.port = []
        self.accountname = ''

        #use provided options to change fields
        options = pairoptions(*args)

        #initialize cluster using user settings if provided
        self = hexagon_settings(self)

        #OK get other fields
        self = options.AssignObjectFields(self)

        #total number of procs, computed once every field has its final value
        self.np = self.numnodes*self.procspernodes
        # }}}

    def __repr__(self):
        # {{{
        """Return the multi-line display of the cluster fields."""
        described_fields = [
            ('name', 'name of the cluster'),
            ('login', 'login'),
            ('numnodes', 'number of nodes'),
            ('procspernodes', 'number of mpi procs per nodes default and optimal is 32'),
            ('mem', 'Total node memory'),
            ('queue', 'name of the queue'),
            ('time', 'walltime requested in minutes'),
            ('codepath', 'code path on the cluster'),
            ('executionpath', 'execution path on the cluster'),
            ('interactive', ''),
            ('accountname', 'your cluster account'),
        ]
        lines = ["class hexagon object:"]
        for fieldname, description in described_fields:
            lines.append(fielddisplay(self, fieldname, description))
        return "\n".join(lines)
        # }}}

    def checkconsistency(self, md, solution, analyses):
        # {{{
        """Report inconsistent cluster settings through md.checkmessage.

        solution and analyses are accepted but not used here.
        Returns self (kept as is; NOTE(review): confirm callers do not
        expect md back).
        """
        #mem should not be over 32000mb
        #numprocs should not be over 4096
        #we have cpupernodes*numberofcpus=mppwidth and mppnppn=cpupernodes,
        #Miscellaneous
        if not self.login:
            md = md.checkmessage('login empty')
        if not self.codepath:
            md = md.checkmessage('codepath empty')
        if not self.executionpath:
            md = md.checkmessage('executionpath empty')
        if self.interactive == 1:
            md = md.checkmessage('interactive mode not implemented')
        if self.mem > 32000:
            md = md.checkmessage('asking too much memory max is 32000 per node')
        return self
        # }}}

    def _walltimestring(self):
        # {{{
        """Return self.time (in minutes) formatted as HH:MM:SS.

        str(datetime.timedelta) switches to an 'N day(s), H:MM:SS' form past
        24h, so the hour count is computed by hand and never wrapped.
        """
        totalseconds = datetime.timedelta(minutes=self.time).total_seconds()
        minutes, seconds = divmod(totalseconds, 60)
        hours, minutes = divmod(minutes, 60)
        return "%02d:%02d:%02d" % (hours, minutes, seconds)
        # }}}

    def BuildQueueScript(self, dirname, modelname, solution, io_gather, isvalgrind, isgprof, isdakota):
        # {{{
        """Write the PBS submission script '<modelname>.queue' in the current directory.

        dirname is the run directory name under self.executionpath, solution
        is written on the aprun command line via str(). io_gather, isvalgrind
        and isgprof are accepted but not used by this cluster. When isdakota
        is true and the configured Dakota version is >= 6, issm_dakota.exe is
        launched instead of issm.exe.
        """
        executable = 'issm.exe'
        if isdakota:
            #IssmConfig was referenced here without being imported anywhere
            #in this file; imported locally following the 'from X import X'
            #convention of the other ISSM imports (NOTE(review): confirm
            #module name).
            from IssmConfig import IssmConfig
            version = float(IssmConfig('_DAKOTA_VERSION_')[0:2])
            if version >= 6:
                executable = 'issm_dakota.exe'

        #write queuing script
        shortname = modelname[0:min(12, len(modelname))]
        timestring = self._walltimestring()

        #context manager so the file is closed even if a write fails
        with open(modelname+'.queue', 'w') as fid:
            fid.write('#!/bin/bash\n')
            fid.write('#PBS -N %s \n' % shortname)
            fid.write('#PBS -l mppwidth=%i,mppnppn=%i\n' % (self.np, self.procspernodes))
            fid.write('#PBS -l walltime=%s\n' % timestring)  #walltime is hh:mm:ss
            #memory is requested per proc: node memory split between the procs
            fid.write('#PBS -l mppmem=%imb\n' % int(self.mem/self.procspernodes))
            fid.write('#PBS -A %s\n' % self.accountname)
            fid.write('#PBS -o %s/%s/%s.outlog \n' % (self.executionpath, dirname, modelname))
            fid.write('#PBS -e %s/%s/%s.errlog \n\n' % (self.executionpath, dirname, modelname))
            fid.write('export ISSM_DIR="%s/../"\n' % self.codepath)
            fid.write('export CRAY_ROOTFS=DSL\n')
            fid.write('module swap PrgEnv-cray/5.2.40 PrgEnv-gnu\n')
            fid.write('module load cray-petsc\n')
            fid.write('module load cray-tpsl\n')
            fid.write('module load cray-mpich\n')
            fid.write('module load gsl\n')
            fid.write('cd %s/%s/\n\n' % (self.executionpath, dirname))
            fid.write('aprun -B %s/%s %s %s/%s %s\n' % (self.codepath, executable, str(solution), self.executionpath, dirname, modelname))
        # }}}

    def UploadQueueJob(self, modelname, dirname, filelist):
        # {{{
        """Pack filelist into '<dirname>.tar.gz' and copy it to self.executionpath
        on the cluster with issmscpout. modelname is not used here."""
        #compress the files into one archive.
        compressstring = 'tar -zcf %s.tar.gz ' % dirname
        compressstring += ''.join(' %s' % filename for filename in filelist)
        subprocess.call(compressstring, shell=True)

        print('uploading input file and queueing script')
        issmscpout(self.name, self.executionpath, self.login, self.port, [dirname+'.tar.gz'])
        # }}}

    def LaunchQueueJob(self, modelname, dirname, filelist, restart, batch):
        # {{{
        """Submit '<modelname>.queue' with qsub on the cluster through issmssh.

        On restart the existing run directory is reused; otherwise it is
        wiped, recreated and the uploaded archive is unpacked into it before
        submission. filelist and batch are accepted but not used here.
        """
        print('launching solution sequence on remote cluster')
        if restart:
            launchcommand = 'cd %s && cd %s && qsub %s.queue' % (self.executionpath, dirname, modelname)
        else:
            launchcommand = 'cd %s && rm -rf ./%s && mkdir %s && cd %s && mv ../%s.tar.gz ./ && tar -zxf %s.tar.gz && qsub %s.queue' % (self.executionpath, dirname, dirname, dirname, dirname, dirname, modelname)
        issmssh(self.name, self.login, self.port, launchcommand)
        # }}}

    def Download(self, dirname, filelist):
        # {{{
        """Copy filelist from the run directory on the cluster to the current
        directory with issmscpin."""
        #copy files from cluster to current directory
        directory = '%s/%s/' % (self.executionpath, dirname)
        issmscpin(self.name, self.login, self.port, directory, filelist)
        # }}}
Note: See TracBrowser for help on using the repository browser.