Changeset 26435


Ignore:
Timestamp:
09/13/21 13:56:40 (4 years ago)
Author:
jdquinn
Message:

CHG: Updates to SMCE cluster class to properly download results; waitonlock now supports ID file; minor formatting

Location:
issm/trunk-jpl/src/m
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • issm/trunk-jpl/src/m/classes/clusters/smce_eis.py

    r26419 r26435  
     1import os
     2import shutil
    13import subprocess
    24
     
    810from helpers import *
    911from IssmConfig import IssmConfig
    10 from issmscpin import issmscpin
    11 from issmscpout import issmscpout
    1212from issmssh import issmssh
    1313from MatlabFuncs import *
    1414from pairoptions import pairoptions
    15 from QueueRequirements import QueueRequirements
    1615
    1716class smce_eis(object):
     
    2524
    2625    def __init__(self, *args):  # {{{
    27         self.name = oshostname()
    28         self.login = ''
     26        self.name = '52.54.75.45'
     27        self.login = 'jdquinn1'
     28        self.idfile = '~/.ssh/smce-aws-eis-pilot'
    2929        self.modules = []
    30         self.numnodes = 20
    31         self.cpuspernode = 8
     30        self.numnodes = 8
     31        self.cpuspernode = 1
    3232        self.port = 0
    33         self.queue = 'general'
    3433        self.time = 12 * 60 * 60
    35         self.processor = 'west'
     34        self.processor = 'skylake'
    3635        self.srcpath = '/shared/issm/issm/trunk-jpl'
    3736        self.codepath = '/shared/issm/issm/trunk-jpl/bin'
    38         self.executionpath = ''
    39         self.grouplist = ''
     37        self.executionpath = '/home/jdquinn1/issm-exec'
    4038        self.interactive = 0
    41         self.bbftp = 0
    42         self.numstreams = 8
     39        self.numstreams = 1
    4340        self.hyperthreading = 0
    4441        self.email = ''
     
    6259        s += '    name: {}\n'.format(self.name)
    6360        s += '    login: {}\n'.format(self.login)
     61        s += '    idfile: {}\n'.format(self.idfile)
    6462        s += '    modules: {}\n'.format(strjoin(self.modules, ', '))
    6563        s += '    numnodes: {}\n'.format(self.numnodes)
     
    6765        s += '    np: {}\n'.format(self.nprocs())
    6866        s += '    port: {}\n'.format(self.port)
    69         s += '    queue: {}\n'.format(self.queue)
    7067        s += '    time: {}\n'.format(self.time)
    7168        s += '    processor: {}\n'.format(self.processor)
     
    7370        s += '    codepath: {}\n'.format(self.codepath)
    7471        s += '    executionpath: {}\n'.format(self.executionpath)
    75         s += '    grouplist: {}\n'.format(self.grouplist)
    7672        s += '    interactive: {}\n'.format(self.interactive)
    77         s += '    bbftp: {}\n'.format(self.bbftp)
    7873        s += '    numstreams: {}\n'.format(self.numstreams)
    7974        s += '    hyperthreading: {}\n'.format(self.hyperthreading)
     
    8681
    8782    def checkconsistency(self, md, solution, analyses):  # {{{
    88         queuedict = {'long': [24 * 60 * 60, 560],
    89                      'allnccs': [12 * 60 * 60, 6000],
    90                      'debug': [1 * 60 * 60, 532]}
    91         QueueRequirements(queuedict, self.queue, self.nprocs(), self.time)
    92 
    9383        # Now, check cluster.cpuspernode according to processor type
    94         if self.processor == 'sand':
    95             if self.cpuspernode > 16 or self.cpuspernode < 1:
    96                 md = md.checkmessage('cpuspernode should be between 1 and 16 for \'sand\' processors in hyperthreading mode')
    97         elif self.processor == 'hasw':
    98             if self.cpuspernode > 28 or self.cpuspernode < 1:
    99                 md = md.checkmessage('cpuspernode should be between 1 and 28 for \'hasw\' processors in hyperthreading mode')
     84        if self.processor == 'skylake':
     85            if self.cpuspernode > 14 or self.cpuspernode < 1:
     86                md = md.checkmessage('cpuspernode should be between 1 and 14 for \'skylake\' processors in hyperthreading mode')
    10087        else:
    101             md = md.checkmessage('unknown processor type, should be \'sand\' or \'hasw\'')
     88            md = md.checkmessage('unknown processor type, should be \'skylake\'')
    10289
    10390        # Miscellaneous
    10491        if not self.login:
    10592            md = md.checkmessage('login empty')
     93        if self.port:
     94            md = md.checkmessage('port must be set to 0 as we do not have an SSH tunnel')
    10695        if not self.codepath:
    10796            md = md.checkmessage('codepath empty')
     
    129118
    130119        fid.write('#!/bin/bash\n')
     120        fid.write('#SBATCH --partition=hpc-spot \n')
    131121        fid.write('#SBATCH -J {} \n'.format(modelname))
    132         fid.write('#SBATCH --qos={} \n'.format(self.queue))
    133122        fid.write('#SBATCH -o {}.outlog \n'.format(modelname))
    134123        fid.write('#SBATCH -e {}.errlog \n'.format(modelname))
    135         fid.write('#SBATCH -n {} \n'.format(self.nprocs()))
    136         fid.write('#SBATCH -N {} \n'.format(self.numnodes))
     124        fid.write('#SBATCH --nodes={} \n'.format(self.numnodes))
     125        fid.write('#SBATCH --ntasks-per-node={} \n'.format(self.cpuspernode))
     126        fid.write('#SBATCH --cpus-per-task={} \n'.format(self.numstreams))
    137127        fid.write('#SBATCH -t {:02d}:{:02d}:00 \n'.format(int(floor(self.time / 3600)), int(floor(self.time % 3600) / 60)))
    138         fid.write('#SBATCH -A {} \n\n'.format(self.grouplist))
    139128        if (self.email.find('@')>-1):
    140129            fid.write('#SBATCH --mail-user={} \n'.format(self.email))
    141             fid.write('#SBATCH --mail-type=end \n\n')
    142         # fid.write('. /usr/share/modules/init/bash\n\n')
     130            fid.write('#SBATCH --mail-type=BEGIN,END,FAIL \n\n')
    143131        # for i in range(len(self.modules)):
    144132        #     fid.write('module load {}\n'.format(self.modules[i]))
    145133        fid.write('export MPI_GROUP_MAX=64\n\n')
    146134        fid.write('export MPI_UNBUFFERED_STDIO=true\n\n')
     135        fid.write('export PATH="$PATH:/opt/slurm/bin"\n') # TODO: Figure out how to add this to PATH by sourcing environment script
    147136        fid.write('export PATH="$PATH:."\n\n')
    148137        fid.write('export ISSM_DIR="{}"\n'.format(self.srcpath)) # FIXME
     138        fid.write('export HYDRA_HOST_FILE={}/{}/'.format(self.executionpath, dirname))
     139        fid.write('${USER}-hydranodes-sj:${SLURM_JOB_ID}\n')
     140        #fid.write('source /usr/share/modules/init/bash\n')
     141        #fid.write('source /efs/spack-justin/share/spack/setup-env.sh\n')
     142        #fid.write('module load gcc-9.3.0-gcc-9.3.0-ug2hqa3\n')
     143        fid.write('touch ${HYDRA_HOST_FILE}\n')
     144        fid.write('/efs/mrilee/snodelist/build/snodelist -m -f "%h%[:]c" >> ${HYDRA_HOST_FILE}\n')
    149145        fid.write('source $ISSM_DIR/etc/environment.sh\n') # FIXME
    150146        fid.write('cd {}/{}/\n\n'.format(self.executionpath, dirname))
     
    187183            directory = self.executionpath
    188184
    189         if self.bbftp:
    190             issmbbftpout(self.name, directory, self.login, self.port, self.numstreams, '{}.tar.gz'.format(dirname))
    191         else:
    192             issmscpout(self.name, directory, self.login, self.port, ['{}.tar.gz'.format(dirname)])
     185        # NOTE: Replacement for issmscpout(self.name, directory, self.login, self.port, ['{}.tar.gz'.format(dirname)])
     186        copystring = 'cp {}.tar.gz /efs/issm-tmp/'.format(dirname, dirname)
     187        subprocess.call(copystring, shell=True)
    193188    # }}}
    194189
     
    201196        else:
    202197            if not isempty(restart):
    203                 launchcommand = 'cd {} && cd {} && sbatch {}.queue'.format(self.executionpath, dirname, modelname)
     198                launchcommand = 'cd {} && cd {} && /opt/slurm/bin/sbatch {}.queue'.format(self.executionpath, dirname, modelname)
    204199            else:
    205                 launchcommand = 'cd {} && rm -rf ./{} && mkdir {} && cd {} && mv ../{}.tar.gz ./ && tar -zxf {}.tar.gz && sbatch {}.queue'.format(self.executionpath, dirname, dirname, dirname, dirname, dirname, modelname)
     200                launchcommand = 'cd {} && rm -rf ./{} && mkdir {} && cd {} && cp /efs/issm-tmp/{}.tar.gz ./ && tar -zxf {}.tar.gz && /opt/slurm/bin/sbatch {}.queue'.format(self.executionpath, dirname, dirname, dirname, dirname, dirname, modelname)
    206201
    207202        print('launching solution sequence on remote cluster')
    208         issmssh(self.name, self.login, self.port, launchcommand)
     203
     204        # NOTE: Replacement for issmssh(self.name, self.login, self.port, launchcommand)
     205        subprocess.call('ssh -l {} -i {} {} "{}"'.format(self.login, self.idfile, self.name, launchcommand), shell=True)
    209206    # }}}
    210207
    211208    def Download(self, dirname, filelist):  # {{{
    212209        # Copy files from cluster to current directory
    213         if self.interactive:
    214             directory = '{}/Interactive{}'.format(self.executionpath, self.interactive)
    215         else:
    216             directory = '{}/{}/'.format(self.executionpath, dirname)
    217 
    218         if self.bbftp:
    219             issmbbftpin(self.name, self.login, self.port, self.numstreams, directory, filelist)
    220         else:
    221             issmscpin(self.name, self.login, self.port, directory, filelist)
    222     # }}}
     210   
     211        # NOTE: Replacement for issmscpin(self.name, self.login, self.port, directory, filelist)
     212        directory = '{}/{}/'.format(self.executionpath, dirname)
     213        fileliststr = "'{" + ','.join([str(x) for x in filelist]) + "}'"
     214        downloadcommand = 'scp -T -i {} {}@{}:{} {}/.'.format(self.idfile, self.login, self.name, os.path.join(directory, fileliststr), os.getcwd())
     215        subprocess.call(downloadcommand, shell=True)
     216    # }}}
  • issm/trunk-jpl/src/m/classes/model.py

    r26353 r26435  
    169169        s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("steadystate", "[%s %s]" % ("1x1", obj.steadystate.__class__.__name__), "parameters for steadystate solution"))
    170170        s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("transient", "[%s %s]" % ("1x1", obj.transient.__class__.__name__), "parameters for transient solution"))
    171         s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("levelset", "[%s %s]" % ("1x1", obj.levelset.__class__.__name__), "parameters for moving boundaries (level - set method)"))
     171        s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("levelset", "[%s %s]" % ("1x1", obj.levelset.__class__.__name__), "parameters for moving boundaries (level-set method)"))
    172172        s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("calving", "[%s %s]" % ("1x1", obj.calving.__class__.__name__), "parameters for calving"))
    173173        s = "%s\n%s" % (s, "%19s: %-22s -- %s" % ("frontalforcings", "[%s %s]" % ("1x1", obj.frontalforcings.__class__.__name__), "parameters for frontalforcings"))
  • issm/trunk-jpl/src/m/solve/waitonlock.py

    r26353 r26435  
    5353    # Prepare command if the job is not running on the local machine
    5454    if not strcmpi(oshostname(), cluster.name):
    55         login = cluster.login
    56         port = 0
    57         if isprop(cluster, 'port'):
    58             port = cluster.port
    59         if port:
    60             command = 'ssh -l {} -p {} localhost "[ -f {} ] && [ -f {} ]" 2>/dev/null'.format(login, port, lockfilename, logfilename)
    61         elif cluster.name == 'cloud':
     55        if cluster.name == 'cloud':
    6256            command = '[ -f {} ] && [ -f {} ] 2>/dev/null'.format(lockfilename, logfilename)
    6357            command = '{} sshmaster {} --user {} \'{}\''.format(starcluster(), cluster.name, cluster.login, command)
    6458        else:
    65             command = 'ssh -l {} {} "[ -f {} ] && [ -f {} ]" 2>/dev/null'.format(login, cluster.name, lockfilename, logfilename)
     59            command = 'ssh -l {}'.format(cluster.login)
     60            if isprop(cluster, 'idfile') and cluster.idfile != '':
     61                command += ' -i {}'.format(cluster.idfile)
     62            port = 0
     63            if isprop(cluster, 'port'):
     64                port = cluster.port
     65            if port: # Check if port is non-zero
     66                command += ' -p {} localhost'.format(port)
     67            else:
     68                command += ' {}'.format(cluster.name)
     69            command += ' "[ -f {} ] && [ -f {} ]" 2>/dev/null'.format(lockfilename, logfilename)
    6670
    6771    while not ispresent and elapsedtime < timelimit:
     
    8185            # UPDATE: Works in testing under Debian Linux system. Leaving comment for now so that it is easier to backtrace this issue if someone else encounters it.
    8286            #
    83             if errs != '':
    84                 raise Exception('waitonlock: check for existence of files failed: {}'.format(errs))
     87            #if errs != '':
     88            #    raise Exception('waitonlock: check for existence of files failed: {}'.format(errs))
    8589            ispresent = not subproc.returncode
    8690            if ispresent:
Note: See TracChangeset for help on using the changeset viewer.