Source code for cerise.back_end.execution_manager

import logging
import time
import traceback
from typing import cast

import cerulean
from paramiko.ssh_exception import SSHException  # type: ignore

from cerise.back_end.cwl import get_cwltool_result, is_workflow
from cerise.back_end.job_planner import InvalidJobError, JobPlanner
from cerise.back_end.job_runner import JobRunner
from cerise.back_end.local_files import ConnectionError, LocalFiles
from cerise.back_end.remote_api import RemoteApi
from cerise.back_end.remote_job_files import RemoteJobFiles
from cerise.config import Config
from cerise.job_store.job_state import JobState
from cerise.job_store.sqlite_job import SQLiteJob
from cerise.job_store.sqlite_job_store import SQLiteJobStore


class ExecutionManager:
    """Handles the execution of jobs on the remote resource.

    The execution manager monitors the job store for jobs that are
    ready to be staged in, started, cancelled, staged out, or deleted,
    and performs the required activity. It also monitors the remote
    resource, ensuring that any remote state changes are propagated to
    the job store correctly.
    """

    def __init__(self, config: Config, local_api_dir: cerulean.Path) -> None:
        """Set up the execution manager.

        Args:
            config: The configuration.
            local_api_dir: The path to the local API directory.
        """
        self._logger = logging.getLogger(__name__)

        self._update_available = False
        """Whether the installed API is older than the local one."""
        self._shutting_down = False
        """True iff we're shutting down."""

        self._job_store = SQLiteJobStore(config.get_database_location())
        """The job store to use."""
        self._local_files = LocalFiles(self._job_store, config)
        """The local files manager."""
        self._remote_api = RemoteApi(config, local_api_dir)
        """The remote API manager."""
        self._remote_refresh = config.get_remote_refresh()

        self._job_planner = JobPlanner(self._job_store, local_api_dir)
        """Determines required hardware resources."""
        self._remote_job_files = RemoteJobFiles(self._job_store, config)
        """The remote job files manager."""

        remote_cwlrunner = self._remote_api.translate_runner_location(
                config.get_remote_cwl_runner())
        self._job_runner = JobRunner(self._job_store, config,
                                     remote_cwlrunner)
        """The job runner submits jobs and checks on them."""

        # Recover the database from a crash
        with self._job_store:
            for job in self._job_store.list_jobs():
                if job.state == JobState.STAGING_IN:
                    self._remote_job_files.delete_job(job.id)
                    job.state = JobState.SUBMITTED
                if job.state == JobState.STAGING_OUT:
                    self._local_files.delete_output_dir(job.id)
                    job.state = JobState.FINISHED
                if job.state == JobState.WAITING_CR:
                    self._job_runner.cancel_job(job.id)
                if job.state == JobState.RUNNING_CR:
                    self._job_runner.cancel_job(job.id)

        # Check for updates
        self._update_available = self._remote_api.update_available()
        if self._update_available:
            self._logger.info('Specialisation update available')

        self._logger.info('Started back-end')

    def shutdown(self) -> None:
        """Requests the execution manager to execute a clean shutdown."""
        self._logger.debug('Shutdown requested')
        self._shutting_down = True

    def _delete_job(self, job_id: str, job: SQLiteJob) -> None:
        """Delete a job.

        Deletes the job from the compute resource, and if it was
        destaged, also from the local file store.

        Prerequisite: the job is in a final state.

        Args:
            job_id: The id of the job
            job: The job object
        """
        self._logger.debug('Deleting job ' + job_id)
        self._remote_job_files.delete_job(job_id)
        self._local_files.delete_output_dir(job_id)
        self._job_store.delete_job(job_id)

    def _cancel_job(self, job_id: str, job: SQLiteJob) -> None:
        """Cancel a job.

        If the job is running, the cancellation request may take some
        time to be processed by the compute resource. In that case, the
        job will remain in RUNNING_CR. Otherwise, it will be cancelled
        immediately and put in CANCELLED.

        Precondition: Job is in a _CR state.
        Postcondition: Job is in CANCELLED or RUNNING_CR.

        Args:
            job_id: The id of the job
            job: The job object
        """
        job.info('Cancelling job')
        if self._job_runner.cancel_job(job_id):
            job.state = JobState.RUNNING_CR
        else:
            job.state = JobState.CANCELLED
        job.info('Job cancelled')

    def _stage_and_start_job(self, job_id: str, job: SQLiteJob) -> None:
        """Stages, plans and starts a job.

        Precondition: Job is in STAGING_IN state.
        Postcondition: Job is in WAITING, PERMANENT_FAILURE, CANCELLED,
        or WAITING_CR.

        Args:
            job_id: The id of the job
            job: The job object
        """
        try:
            job.info('Resolving inputs')
            input_files = self._local_files.resolve_input(job_id)
        except FileNotFoundError:
            job.error('Input not found, failing with PermanentFailure')
            job.state = JobState.PERMANENT_FAILURE
            return
        except ValueError as e:
            job.error('Invalid input: {}'.format(e.args[0]))
            job.state = JobState.PERMANENT_FAILURE
            return

        if not is_workflow(cast(bytes, job.workflow_content)):
            job.error('Input is not a CWL workflow')
            job.state = JobState.PERMANENT_FAILURE
            return

        if job.try_transition(JobState.STAGING_IN_CR, JobState.CANCELLED):
            job.info('Job was cancelled while resolving input')
            return

        job.info('Resolved input, now planning')
        try:
            self._job_planner.plan_job(job_id)
        except InvalidJobError:
            job.error('Job is invalid')
            job.state = JobState.PERMANENT_FAILURE
            return
        if job.state == JobState.PERMANENT_FAILURE:
            return

        job.info('Planned job, now staging in inputs')
        workflow_content = self._remote_api.translate_workflow(
                cast(bytes, job.workflow_content))
        try:
            self._remote_job_files.stage_job(job_id, input_files,
                                             workflow_content)
        except FileNotFoundError:
            job.error('Input not found, failing with PermanentFailure')
            job.state = JobState.PERMANENT_FAILURE
            return

        job.info('Staged job, now starting')
        job.info('API versions:')
        for project_version in self._remote_api.get_projects():
            job.info(' {}'.format(project_version))
        self._job_runner.start_job(job_id)
        job.info('Started job')

        if not (job.try_transition(JobState.STAGING_IN, JobState.WAITING) or
                job.try_transition(JobState.STAGING_IN_CR,
                                   JobState.WAITING_CR)):
            self._logger.critical(
                    'Something odd happened while staging and starting')
            self._logger.critical('State is now {}'.format(job.state))
            job.state = JobState.SYSTEM_ERROR

    def _destage_job(self, job_id: str, job: SQLiteJob) -> None:
        """Get job results back from the compute resource.
        Precondition: Job is in FINISHED.
        Postcondition: Job is in SUCCESS, TEMPORARY_FAILURE,
        PERMANENT_FAILURE or CANCELLED.

        Args:
            job_id: The job's id
            job: The job object
        """
        result = get_cwltool_result(job.remote_error)

        if job.try_transition(JobState.FINISHED, JobState.STAGING_OUT):
            job.info('Starting destaging of results')
            output_files = self._remote_job_files.destage_job_output(job_id)
            self._local_files.publish_job_output(job_id, output_files)
            job.info('Results downloaded and available')

        if not (job.try_transition(JobState.STAGING_OUT, result) or
                job.try_transition(JobState.STAGING_OUT_CR,
                                   JobState.CANCELLED)):
            job.state = JobState.SYSTEM_ERROR

    def _process_jobs(self, check_remote: bool) -> bool:
        """Go through the jobs and do what needs to be done.

        Args:
            check_remote: Whether to access the remote compute resource
                to check on jobs.

        Returns:
            True iff there are currently running jobs.
        """
        # If we don't check remote, assume that we have running jobs,
        # so that we don't install updates while jobs are running.
        have_running_jobs = not check_remote

        jobs = self._job_store.list_jobs()
        for job_id in [job.id for job in jobs]:
            if self._shutting_down:
                break
            try:
                job = self._job_store.get_job(job_id)
                previous_state = job.state
                self._logger.debug('Processing job ' + job_id +
                                   ' with current state ' + job.state.value)

                if check_remote and JobState.is_remote(job.state):
                    self._logger.debug('Checking remote state')
                    self._job_runner.update_job(job_id)
                    self._remote_job_files.update_job(job_id)
                    job = self._job_store.get_job(job_id)

                have_running_jobs = (have_running_jobs or
                                     JobState.is_remote(job.state))

                if job.state == JobState.FINISHED:
                    self._destage_job(job_id, job)

                if not self._update_available:
                    if job.try_transition(JobState.SUBMITTED,
                                          JobState.STAGING_IN):
                        self._stage_and_start_job(job_id, job)
                        self._logger.debug('Staged and started job')

                if JobState.cancellation_active(job.state):
                    self._cancel_job(job_id, job)
                self._logger.debug('State is now ' + job.state.value)

                if job.please_delete and JobState.is_final(job.state):
                    self._delete_job(job_id, job)

            except (ConnectionError, IOError, EOFError, OSError,
                    SSHException) as e:
                self._logger.debug('System exception while processing job:'
                                   ' {}'.format(e))
                if isinstance(e, IOError) or isinstance(e, OSError):
                    if ('Socket' not in str(e) and
                            'Network' not in str(e) and
                            'Temporary' not in str(e) and
                            'Timeout opening channel' not in str(e)):
                        # Not a transient network problem, so give up
                        job.error('An IO error occurred while processing'
                                  ' the job: {}. Please check that your'
                                  ' network connection works, and that you'
                                  ' have enough disk space or quota on the'
                                  ' remote machine.'.format(e))
                        job.state = JobState.SYSTEM_ERROR
                        self._logger.critical(
                                'An internal error occurred when processing'
                                ' job ' + job.id)
                        self._logger.critical(traceback.format_exc())
                        return False

                # Transient connection problem, try again later
                job = self._job_store.get_job(job_id)
                job.debug('Connection problem with remote resource: {},'
                          ' will try again later'.format(e.args[0]))
                job.state = previous_state
                have_running_jobs = True

            except Exception:
                job.state = JobState.SYSTEM_ERROR
                self._logger.critical(
                        'An internal error occurred when processing job ' +
                        job.id)
                self._logger.critical(traceback.format_exc())

        return have_running_jobs
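
    # Summary of the job life cycle as driven by _process_jobs() above,
    # derived from the methods in this module; the authoritative set of
    # states is defined in cerise.job_store.job_state:
    #
    #   SUBMITTED -> STAGING_IN -> WAITING           (_stage_and_start_job)
    #   FINISHED -> STAGING_OUT -> SUCCESS,
    #       TEMPORARY_FAILURE or PERMANENT_FAILURE   (_destage_job)
    #   any *_CR state -> CANCELLED, or RUNNING_CR while the
    #       compute resource processes the request   (_cancel_job)
    #   final state with please_delete set -> job removed   (_delete_job)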

    def execute_jobs(self) -> None:
        """Run the main backend execution loop.

        This repeatedly processes jobs, but does not check the remote
        compute resource more often than specified in the remote_refresh
        configuration parameter.
        """
        with self._job_store:
            last_active = time.perf_counter() - self._remote_refresh - 1
            # Handler in run_back_end throws KeyboardInterrupt in order
            # to break the sleep call; catch it to exit gracefully
            try:
                while not self._shutting_down:
                    now = time.perf_counter()
                    check_remote = now - last_active > self._remote_refresh
                    have_running_jobs = self._process_jobs(check_remote)

                    if not have_running_jobs and self._update_available:
                        self._remote_api.install()
                        self._update_available = False

                    if check_remote:
                        last_active = time.perf_counter()
                    time.sleep(0.1)
            except KeyboardInterrupt:
                pass

        self._logger.debug('Shutting down')
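
For context, here is a minimal sketch of how this class might be driven. It is not part of the module: cerise's real entry point is its run_back_end script (referenced in the comment in execute_jobs above), so the function name run_back_end_sketch and the exact signal wiring are illustrative assumptions, and the config and local_api_dir arguments still need to be constructed from cerise.config and cerulean.

import signal


def run_back_end_sketch(config: Config,
                        local_api_dir: cerulean.Path) -> None:
    """Hypothetical driver for ExecutionManager (illustration only)."""
    manager = ExecutionManager(config, local_api_dir)

    def handle_sigterm(signum, frame):
        # Request a clean shutdown, then raise KeyboardInterrupt to
        # break the time.sleep() call inside execute_jobs().
        manager.shutdown()
        raise KeyboardInterrupt()

    signal.signal(signal.SIGTERM, handle_sigterm)
    manager.execute_jobs()    # blocks until a shutdown is requested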