Changeset 26435
- Timestamp:
- 09/13/21 13:56:40 (4 years ago)
- Location:
- issm/trunk-jpl/src/m
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
issm/trunk-jpl/src/m/classes/clusters/smce_eis.py
r26419 r26435 1 import os 2 import shutil 1 3 import subprocess 2 4 … … 8 10 from helpers import * 9 11 from IssmConfig import IssmConfig 10 from issmscpin import issmscpin11 from issmscpout import issmscpout12 12 from issmssh import issmssh 13 13 from MatlabFuncs import * 14 14 from pairoptions import pairoptions 15 from QueueRequirements import QueueRequirements16 15 17 16 class smce_eis(object): … … 25 24 26 25 def __init__(self, *args): # {{{ 27 self.name = oshostname() 28 self.login = '' 26 self.name = '52.54.75.45' 27 self.login = 'jdquinn1' 28 self.idfile = '~/.ssh/smce-aws-eis-pilot' 29 29 self.modules = [] 30 self.numnodes = 2031 self.cpuspernode = 830 self.numnodes = 8 31 self.cpuspernode = 1 32 32 self.port = 0 33 self.queue = 'general'34 33 self.time = 12 * 60 * 60 35 self.processor = ' west'34 self.processor = 'skylake' 36 35 self.srcpath = '/shared/issm/issm/trunk-jpl' 37 36 self.codepath = '/shared/issm/issm/trunk-jpl/bin' 38 self.executionpath = '' 39 self.grouplist = '' 37 self.executionpath = '/home/jdquinn1/issm-exec' 40 38 self.interactive = 0 41 self.bbftp = 0 42 self.numstreams = 8 39 self.numstreams = 1 43 40 self.hyperthreading = 0 44 41 self.email = '' … … 62 59 s += ' name: {}\n'.format(self.name) 63 60 s += ' login: {}\n'.format(self.login) 61 s += ' idfile: {}\n'.format(self.idfile) 64 62 s += ' modules: {}\n'.format(strjoin(self.modules, ', ')) 65 63 s += ' numnodes: {}\n'.format(self.numnodes) … … 67 65 s += ' np: {}\n'.format(self.nprocs()) 68 66 s += ' port: {}\n'.format(self.port) 69 s += ' queue: {}\n'.format(self.queue)70 67 s += ' time: {}\n'.format(self.time) 71 68 s += ' processor: {}\n'.format(self.processor) … … 73 70 s += ' codepath: {}\n'.format(self.codepath) 74 71 s += ' executionpath: {}\n'.format(self.executionpath) 75 s += ' grouplist: {}\n'.format(self.grouplist)76 72 s += ' interactive: {}\n'.format(self.interactive) 77 s += ' bbftp: {}\n'.format(self.bbftp)78 73 s += ' numstreams: {}\n'.format(self.numstreams) 79 74 
s += ' hyperthreading: {}\n'.format(self.hyperthreading) … … 86 81 87 82 def checkconsistency(self, md, solution, analyses): # {{{ 88 queuedict = {'long': [24 * 60 * 60, 560],89 'allnccs': [12 * 60 * 60, 6000],90 'debug': [1 * 60 * 60, 532]}91 QueueRequirements(queuedict, self.queue, self.nprocs(), self.time)92 93 83 # Now, check cluster.cpuspernode according to processor type 94 if self.processor == 'sand': 95 if self.cpuspernode > 16 or self.cpuspernode < 1: 96 md = md.checkmessage('cpuspernode should be between 1 and 16 for \'sand\' processors in hyperthreading mode') 97 elif self.processor == 'hasw': 98 if self.cpuspernode > 28 or self.cpuspernode < 1: 99 md = md.checkmessage('cpuspernode should be between 1 and 28 for \'hasw\' processors in hyperthreading mode') 84 if self.processor == 'skylake': 85 if self.cpuspernode > 14 or self.cpuspernode < 1: 86 md = md.checkmessage('cpuspernode should be between 1 and 14 for \'skyw\' processors in hyperthreading mode') 100 87 else: 101 md = md.checkmessage('unknown processor type, should be \'s and\' or \'hasw\'')88 md = md.checkmessage('unknown processor type, should be \'skylake\'') 102 89 103 90 # Miscellaneous 104 91 if not self.login: 105 92 md = md.checkmessage('login empty') 93 if self.port: 94 md = md.checkmessage('port must be set to 0 as we do not have an SSH tunnel') 106 95 if not self.codepath: 107 96 md = md.checkmessage('codepath empty') … … 129 118 130 119 fid.write('#!/bin/bash\n') 120 fid.write('#SBATCH --partition=hpc-spot \n') 131 121 fid.write('#SBATCH -J {} \n'.format(modelname)) 132 fid.write('#SBATCH --qos={} \n'.format(self.queue))133 122 fid.write('#SBATCH -o {}.outlog \n'.format(modelname)) 134 123 fid.write('#SBATCH -e {}.errlog \n'.format(modelname)) 135 fid.write('#SBATCH -n {} \n'.format(self.nprocs())) 136 fid.write('#SBATCH -N {} \n'.format(self.numnodes)) 124 fid.write('#SBATCH --nodes={} \n'.format(self.numnodes)) 125 fid.write('#SBATCH --ntasks-per-node={} \n'.format(self.cpuspernode)) 
126 fid.write('#SBATCH --cpus-per-task={} \n'.format(self.numstreams)) 137 127 fid.write('#SBATCH -t {:02d}:{:02d}:00 \n'.format(int(floor(self.time / 3600)), int(floor(self.time % 3600) / 60))) 138 fid.write('#SBATCH -A {} \n\n'.format(self.grouplist))139 128 if (self.email.find('@')>-1): 140 129 fid.write('#SBATCH --mail-user={} \n'.format(self.email)) 141 fid.write('#SBATCH --mail-type=end \n\n') 142 # fid.write('. /usr/share/modules/init/bash\n\n') 130 fid.write('#SBATCH --mail-type=BEGIN,END,FAIL \n\n') 143 131 # for i in range(len(self.modules)): 144 132 # fid.write('module load {}\n'.format(self.modules[i])) 145 133 fid.write('export MPI_GROUP_MAX=64\n\n') 146 134 fid.write('export MPI_UNBUFFERED_STDIO=true\n\n') 135 fid.write('export PATH="$PATH:/opt/slurm/bin"\n') # TODO: Figure out how to add this to PATH by sourcing environment script 147 136 fid.write('export PATH="$PATH:."\n\n') 148 137 fid.write('export ISSM_DIR="{}"\n'.format(self.srcpath)) # FIXME 138 fid.write('export HYDRA_HOST_FILE={}/{}/'.format(self.executionpath, dirname)) 139 fid.write('${USER}-hydranodes-sj:${SLURM_JOB_ID}\n') 140 #fid.write('source /usr/share/modules/init/bash\n') 141 #fid.write('source /efs/spack-justin/share/spack/setup-env.sh\n') 142 #fid.write('module load gcc-9.3.0-gcc-9.3.0-ug2hqa3\n') 143 fid.write('touch ${HYDRA_HOST_FILE}\n') 144 fid.write('/efs/mrilee/snodelist/build/snodelist -m -f "%h%[:]c" >> ${HYDRA_HOST_FILE}\n') 149 145 fid.write('source $ISSM_DIR/etc/environment.sh\n') # FIXME 150 146 fid.write('cd {}/{}/\n\n'.format(self.executionpath, dirname)) … … 187 183 directory = self.executionpath 188 184 189 if self.bbftp: 190 issmbbftpout(self.name, directory, self.login, self.port, self.numstreams, '{}.tar.gz'.format(dirname)) 191 else: 192 issmscpout(self.name, directory, self.login, self.port, ['{}.tar.gz'.format(dirname)]) 185 # NOTE: Replacement for issmscpout(self.name, directory, self.login, self.port, ['{}.tar.gz'.format(dirname)]) 186 copystring = 'cp 
{}.tar.gz /efs/issm-tmp/'.format(dirname, dirname) 187 subprocess.call(copystring, shell=True) 193 188 # }}} 194 189 … … 201 196 else: 202 197 if not isempty(restart): 203 launchcommand = 'cd {} && cd {} && sbatch {}.queue'.format(self.executionpath, dirname, modelname)198 launchcommand = 'cd {} && cd {} && /opt/slurm/bin/sbatch {}.queue'.format(self.executionpath, dirname, modelname) 204 199 else: 205 launchcommand = 'cd {} && rm -rf ./{} && mkdir {} && cd {} && mv ../{}.tar.gz ./ && tar -zxf {}.tar.gz &&sbatch {}.queue'.format(self.executionpath, dirname, dirname, dirname, dirname, dirname, modelname)200 launchcommand = 'cd {} && rm -rf ./{} && mkdir {} && cd {} && cp /efs/issm-tmp/{}.tar.gz ./ && tar -zxf {}.tar.gz && /opt/slurm/bin/sbatch {}.queue'.format(self.executionpath, dirname, dirname, dirname, dirname, dirname, modelname) 206 201 207 202 print('launching solution sequence on remote cluster') 208 issmssh(self.name, self.login, self.port, launchcommand) 203 204 # NOTE: Replacement for issmssh(self.name, self.login, self.port, launchcommand) 205 subprocess.call('ssh -l {} -i {} {} "{}"'.format(self.login, self.idfile, self.name, launchcommand), shell=True) 209 206 # }}} 210 207 211 208 def Download(self, dirname, filelist): # {{{ 212 209 # Copy files from cluster to current directory 213 if self.interactive: 214 directory = '{}/Interactive{}'.format(self.executionpath, self.interactive) 215 else: 216 directory = '{}/{}/'.format(self.executionpath, dirname) 217 218 if self.bbftp: 219 issmbbftpin(self.name, self.login, self.port, self.numstreams, directory, filelist) 220 else: 221 issmscpin(self.name, self.login, self.port, directory, filelist) 222 # }}} 210 211 # NOTE: Replacement for issmscpin(self.name, self.login, self.port, directory, filelist) 212 directory = '{}/{}/'.format(self.executionpath, dirname) 213 fileliststr = "'{" + ','.join([str(x) for x in filelist]) + "}'" 214 downloadcommand = 'scp -T -i {} {}@{}:{} {}/.'.format(self.idfile, self.login, 
self.name, os.path.join(directory, fileliststr), os.getcwd()) 215 subprocess.call(downloadcommand, shell=True) 216 # }}} -
issm/trunk-jpl/src/m/classes/model.py
r26353 r26435 169 169 s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("steadystate", "[%s %s]" % ("1x1", obj.steadystate.__class__.__name__), "parameters for steadystate solution")) 170 170 s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("transient", "[%s %s]" % ("1x1", obj.transient.__class__.__name__), "parameters for transient solution")) 171 s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("levelset", "[%s %s]" % ("1x1", obj.levelset.__class__.__name__), "parameters for moving boundaries (level -set method)"))171 s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("levelset", "[%s %s]" % ("1x1", obj.levelset.__class__.__name__), "parameters for moving boundaries (level-set method)")) 172 172 s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("calving", "[%s %s]" % ("1x1", obj.calving.__class__.__name__), "parameters for calving")) 173 173 s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("frontalforcings", "[%s %s]" % ("1x1", obj.frontalforcings.__class__.__name__), "parameters for frontalforcings")) -
issm/trunk-jpl/src/m/solve/waitonlock.py
r26353 r26435 53 53 # Prepare command if the job is not running on the local machine 54 54 if not strcmpi(oshostname(), cluster.name): 55 login = cluster.login 56 port = 0 57 if isprop(cluster, 'port'): 58 port = cluster.port 59 if port: 60 command = 'ssh -l {} -p {} localhost "[ -f {} ] && [ -f {} ]" 2>/dev/null'.format(login, port, lockfilename, logfilename) 61 elif cluster.name == 'cloud': 55 if cluster.name == 'cloud': 62 56 command = '[ -f {} ] && [ -f {} ] 2>/dev/null'.format(lockfilename, logfilename) 63 57 command = '{} sshmaster {} --user {} \'{}\''.format(starcluster(), cluster.name, cluster.login, command) 64 58 else: 65 command = 'ssh -l {} {} "[ -f {} ] && [ -f {} ]" 2>/dev/null'.format(login, cluster.name, lockfilename, logfilename) 59 command = 'ssh -l {}'.format(cluster.login) 60 if isprop(cluster, 'idfile') and cluster.idfile != '': 61 command += ' -i {}'.format(cluster.idfile) 62 port = 0 63 if isprop(cluster, 'port'): 64 port = cluster.port 65 if port: # Check if port is non-zero 66 command += ' -p {} localhost'.format(port) 67 else: 68 command += ' {}'.format(cluster.name) 69 command += ' "[ -f {} ] && [ -f {} ]" 2>/dev/null'.format(lockfilename, logfilename) 66 70 67 71 while not ispresent and elapsedtime < timelimit: … … 81 85 # UPDATE: Works in testing under Debian Linux system. Leaving comment for now so that it is easier to backtrace this issue if someone else encounters it. 82 86 # 83 if errs != '':84 raise Exception('waitonlock: check for existence of files failed: {}'.format(errs))87 #if errs != '': 88 # raise Exception('waitonlock: check for existence of files failed: {}'.format(errs)) 85 89 ispresent = not subproc.returncode 86 90 if ispresent:
Note: See TracChangeset for help on using the changeset viewer.