source: issm/trunk/src/m/classes/clusters/hexagon.py@ 24313

Last change on this file since 24313 was 24313, checked in by Mathieu Morlighem, 5 years ago

merged trunk-jpl and trunk for revision 24310

File size: 6.3 KB
Line 
1import subprocess
2from fielddisplay import fielddisplay
3from pairoptions import pairoptions
4from issmssh import issmssh
5from issmscpin import issmscpin
6from issmscpout import issmscpout
7from IssmConfig import IssmConfig
8import datetime
9try:
10 from hexagon_settings import hexagon_settings
11except ImportError:
12 print('You need hexagon_settings.py to proceed, check presence and sys.path')
13
14
class hexagon(object):
    """
    Hexagon cluster class definition.

    Hexagon nodes are built of 2 * 16 CPUs. Nodes are dedicated to one job, so
    the best usage is 32 procs per node (16 per core), as that is what is
    billed anyway. You can reduce this number if you run out of memory, as the
    total node memory is divided by the number of procs.

    Usage:
        cluster = hexagon()
    """

    def __init__(self, *args):  # {{{
        """Set default fields, then apply user settings and the given options.

        *args is forwarded unchanged to pairoptions(); the resulting options
        are applied last, so they take precedence over hexagon_settings.
        """
        self.name = 'hexagon'
        self.login = ''
        self.numnodes = 2
        self.procspernodes = 32
        self.mem = 32000
        self.queue = 'batch'
        self.time = 2 * 60  # walltime, in minutes
        self.codepath = ''
        self.executionpath = ''
        self.interactive = 0
        self.port = []
        self.accountname = ''

        #use provided options to change fields
        options = pairoptions(*args)

        #initialize cluster using user settings if provided
        # NOTE(review): 'self' is rebound to whatever hexagon_settings returns;
        # presumably it is the same object, modified in place -- confirm.
        self = hexagon_settings(self)

        #OK get other fields
        self = options.AssignObjectFields(self)

        # Total number of processes, derived once all overrides are applied.
        self.np = self.numnodes * self.procspernodes
    # }}}

    def __repr__(self):  # {{{
        """Return a multi-line description built from fielddisplay() entries."""
        described_fields = [
            ('name', 'name of the cluster'),
            ('login', 'login'),
            ('numnodes', 'number of nodes'),
            ('procspernodes', 'number of mpi procs per nodes default and optimal is 32'),
            ('mem', 'Total node memory'),
            ('queue', 'name of the queue'),
            ('time', 'walltime requested in minutes'),
            ('codepath', 'code path on the cluster'),
            ('executionpath', 'execution path on the cluster'),
            ('interactive', ''),
            ('accountname', 'your cluster account'),
        ]
        rows = ["class hexagon object:"]
        for fieldname, description in described_fields:
            rows.append(fielddisplay(self, fieldname, description))
        return "\n".join(rows)
    # }}}

    def checkconsistency(self, md, solution, analyses):  # {{{
        """Report inconsistent cluster settings through md.checkmessage().

        Checks that login, codepath and executionpath are non-empty, that
        interactive mode is not requested, and that mem does not exceed
        32000. 'solution' and 'analyses' are accepted but not used here.

        Returns this cluster object.
        """
        #mem should not be over 32000mb
        #numprocs should not be over 4096
        #we have cpupernodes * numberofcpus = mppwidth and mppnppn = cpupernodes,
        #Miscellaneous
        if not self.login:
            md = md.checkmessage('login empty')
        if not self.codepath:
            md = md.checkmessage('codepath empty')
        if not self.executionpath:
            md = md.checkmessage('executionpath empty')
        if self.interactive == 1:
            md = md.checkmessage('interactive mode not implemented')
        if self.mem > 32000:
            md = md.checkmessage('asking too much memory max is 32000 per node')
        # NOTE(review): returns self although md is what gets reassigned above;
        # left as is because callers may rely on it -- confirm intent.
        return self
    # }}}

    def BuildQueueScript(self, dirname, modelname, solution, io_gather, isvalgrind, isgprof, isdakota, isoceancoupling):  # {{{
        """Write the PBS queue script '<modelname>.queue'.

        dirname          run directory name under self.executionpath
        modelname        base name of the script; also used for the log files
        solution         solution identifier, passed to the executable as str
        isdakota         if true and the Dakota major version is >= 6, run
                         issm_dakota.exe
        isoceancoupling  if true, run issm_ocean.exe (takes precedence)
        io_gather, isvalgrind, isgprof are accepted but not used here.
        """
        executable = 'issm.exe'
        if isdakota:
            version = IssmConfig('_DAKOTA_VERSION_')[0:2]
            version = float(version)
            if version >= 6:
                executable = 'issm_dakota.exe'
        if isoceancoupling:
            executable = 'issm_ocean.exe'

        # The job name is the model name cut to at most 12 characters.
        shortname = modelname[:12]

        # Convert the walltime from minutes to hh:mm:ss.
        timeobj = datetime.timedelta(minutes=self.time)
        m, s = divmod(timeobj.total_seconds(), 60)
        h, m = divmod(m, 60)
        timestring = "%02d:%02d:%02d" % (h, m, s)

        #write queuing script
        # The context manager closes the file even if a write fails.
        with open(modelname + '.queue', 'w') as fid:
            # The shebang and the '#PBS' directives must start in column 0,
            # option flags must be written as '-N' / '-l', and the resource
            # list must not contain spaces.
            fid.write('#!/bin/bash\n')
            fid.write('#PBS -N %s \n' % shortname)
            fid.write('#PBS -l mppwidth=%i,mppnppn=%i\n' % (self.np, self.procspernodes))
            fid.write('#PBS -l walltime=%s\n' % timestring)  #walltime is hh:mm:ss
            # Memory is requested per process: node memory split over the procs.
            fid.write('#PBS -l mppmem=%imb\n' % int(self.mem / self.procspernodes))
            fid.write('#PBS -A %s\n' % self.accountname)
            fid.write('#PBS -o %s/%s/%s.outlog \n' % (self.executionpath, dirname, modelname))
            fid.write('#PBS -e %s/%s/%s.errlog \n\n' % (self.executionpath, dirname, modelname))
            fid.write('export ISSM_DIR="%s/../"\n' % self.codepath)
            fid.write('export CRAY_ROOTFS=DSL\n')
            fid.write('module swap PrgEnv-cray/5.2.40 PrgEnv-gnu\n')
            fid.write('module load cray-petsc\n')
            fid.write('module load cray-tpsl\n')
            fid.write('module load cray-mpich\n')
            fid.write('module load gsl\n')
            fid.write('cd %s/%s/\n\n' % (self.executionpath, dirname))
            fid.write('aprun -B %s/%s %s %s/%s %s\n' % (self.codepath, executable, str(solution), self.executionpath, dirname, modelname))
    # }}}

    def UploadQueueJob(self, modelname, dirname, filelist):  # {{{
        """Pack filelist into '<dirname>.tar.gz' and copy it out with issmscpout.

        'modelname' is accepted but not used here.
        """
        #compress the files into one archive.
        # The command is run through the shell, as before, so the entries of
        # filelist are subject to normal shell word splitting.
        compressstring = 'tar -zcf %s.tar.gz %s' % (dirname, ' '.join(str(entry) for entry in filelist))
        subprocess.call(compressstring, shell=True)

        print('uploading input file and queueing script')
        issmscpout(self.name, self.executionpath, self.login, self.port, [dirname + '.tar.gz'])
    # }}}

    def LaunchQueueJob(self, modelname, dirname, filelist, restart, batch):  # {{{
        """Submit '<modelname>.queue' with qsub through issmssh.

        If 'restart' is true, the existing run directory is reused. Otherwise
        the run directory is removed, recreated, and the uploaded archive is
        moved into it and unpacked before submission. 'filelist' and 'batch'
        are accepted but not used here.
        """
        print('launching solution sequence on remote cluster')
        if restart:
            launchcommand = 'cd %s && cd %s && qsub %s.queue' % (self.executionpath, dirname, modelname)
        else:
            launchcommand = 'cd %s && rm -rf ./%s && mkdir %s && cd %s && mv ../%s.tar.gz ./ && tar -zxf %s.tar.gz && qsub %s.queue' % (self.executionpath, dirname, dirname, dirname, dirname, dirname, modelname)
        issmssh(self.name, self.login, self.port, launchcommand)
    # }}}

    def Download(self, dirname, filelist):  # {{{
        """Fetch filelist from '<executionpath>/<dirname>/' with issmscpin."""
        #copy files from cluster to current directory
        directory = '%s/%s/' % (self.executionpath, dirname)
        issmscpin(self.name, self.login, self.port, directory, filelist)
    # }}}
Note: See TracBrowser for help on using the repository browser.