import json
import logging
import re
from typing import Any, Dict, List, cast
import cerulean
from cerulean import Path
from cerise.back_end.cwl import get_files_from_binding
from cerise.back_end.file import File
from cerise.config import Config
from cerise.job_store.sqlite_job_store import SQLiteJobStore
class RemoteJobFiles:
"""Manages a remote directory structure.
Expects to be given a remote dir to work within. Inside this
directory, it makes a jobs/ directory, and inside that there
is a directory for every job.
Within each job directory are the following files:
- jobs/<job_id>/name.txt contains the user-given name of the job
- jobs/<job_id>/workflow.cwl contains the workflow to run
- jobs/<job_id>/work/ contains input and output files, and is the
working directory for the job.
- jobs/<job_id>/stdout.txt is the standard output of the CWL runner
- jobs/<job_id>/stderr.txt is the standard error of the CWL runner
"""
def __init__(self, job_store: SQLiteJobStore, config: Config) -> None:
"""Create a RemoteJobFiles object.
        Sets up the remote directory structure as well, creating the
        base directory and its jobs/ subdirectory if they do not exist.
Args:
job_store: The job store to use.
config: The configuration.
"""
self._logger = logging.getLogger(__name__)
"""Logger: The logger for this class."""
self._job_store = job_store
"""JobStore: The job store to use."""
self._username = config.get_username('files')
"""str: The remote user name to use, if any."""
self._basedir = config.get_basedir()
"""Path: The remote path to the directory where the API files are."""
# Create directories if they don't exist
self._logger.debug('basedir: {}'.format(self._basedir))
self._basedir.mkdir(0o750, parents=True, exists_ok=True)
(self._basedir / 'jobs').mkdir(parents=True, exists_ok=True)
    def stage_job(self, job_id: str, input_files: List[File],
                  workflow_content: bytes) -> None:
"""Stage a job. Copies any necessary files to
the remote resource.
Args:
job_id: The id of the job to stage
input_files: A list of input files to stage.
workflow_content: Translated contents of the workflow to be
run.
"""
self._logger.debug('Staging job ' + job_id)
with self._job_store:
job = self._job_store.get_job(job_id)
# create work dir
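            # 0o700: the job directory and its work dir are accessible to
            # the remote user only.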
self._abs_path(job_id, '').mkdir(
0o700, parents=True, exists_ok=True)
self._abs_path(job_id, 'work').mkdir(
0o700, parents=True, exists_ok=True)
job.remote_workdir_path = str(self._abs_path(job_id, 'work'))
# stage name of the job
self._add_file_to_job(job_id, 'name.txt', job.name.encode('utf-8'))
# stage workflow
self._add_file_to_job(job_id, 'workflow.cwl', workflow_content)
job.remote_workflow_path = str(
self._abs_path(job_id, 'workflow.cwl'))
# stage input files
inputs = json.loads(job.local_input)
count = 1
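            # The dicts in 'inputs' are the same objects passed to
            # _stage_input_file below, which rewrites each file's 'location'
            # to its remote path; the updated description is then written
            # out as input.json. 'count' gives every staged file a unique,
            # zero-padded prefix so inputs with the same base name do not
            # collide.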
for input_file in input_files:
if input_file.index is not None:
input_desc = inputs[input_file.name][input_file.index]
else:
input_desc = inputs[input_file.name]
count = self._stage_input_file(count, job_id, input_file,
input_desc)
# stage input description
inputs_json = json.dumps(inputs).encode('utf-8')
self._add_file_to_job(job_id, 'input.json', inputs_json)
job.remote_input_path = str(self._abs_path(job_id, 'input.json'))
# configure output
job.remote_stdout_path = str(self._abs_path(job_id, 'stdout.txt'))
job.remote_stderr_path = str(self._abs_path(job_id, 'stderr.txt'))
job.remote_system_out_path = str(self._abs_path(job_id,
'sysout.txt'))
job.remote_system_err_path = str(self._abs_path(job_id,
'syserr.txt'))
    def destage_job_output(self, job_id: str) -> List[File]:
"""Download results of the given job from the compute resource.
Args:
job_id: The id of the job to download results of.
Returns:
            A list of File objects describing the job's output files.
"""
self._logger.debug('Destaging job ' + job_id)
output_files = []
with self._job_store:
job = self._job_store.get_job(job_id)
work_dir = self._basedir / 'jobs' / job_id / 'work'
self._logger.debug("Remote output: {}".format(job.remote_output))
if job.remote_output != '':
outputs = json.loads(job.remote_output)
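                # job.remote_output holds the JSON document the CWL runner
                # printed on stdout; get_files_from_binding extracts the
                # File objects from that output binding.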
for output_file in get_files_from_binding(outputs):
self._logger.debug(
'Destage path = {} for output {}'.format(
output_file.location, output_file.name))
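                    # The runner reports each output as a file:// URI inside
                    # the remote work dir; strip that prefix so 'location'
                    # becomes a path relative to the work dir, and keep the
                    # absolute remote path in 'source' so the file can be
                    # retrieved.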
prefix = 'file://' + str(work_dir) + '/'
if not output_file.location.startswith(prefix):
raise Exception(
'Unexpected output location in cwl-runner output:'
' {}, expected it to start with: {}'
.format(output_file.location, prefix))
output_file.location = output_file.location[len(prefix):]
output_file.source = work_dir / output_file.location
output_files.append(output_file)
else:
self._logger.error(
'CWL runner did not produce any output for job {}!'.format(
job_id))
# output name and location are (immutable) str's, while source
# does not come from the store, so we're not leaking here
return output_files
    def delete_job(self, job_id: str) -> None:
"""Remove the work directory for a job.
This will remove the directory and everything in it, if it exists.
Args:
job_id: The id of the job whose work directory to delete.
"""
job_dir = self._abs_path(job_id, '')
if job_dir.exists():
job_dir.rmdir(recursive=True)
    def update_job(self, job_id: str) -> None:
"""Get status from remote resource and update store.
Args:
job_id: ID of the job to get the status of.
"""
self._logger.debug("Updating " + job_id + " from remote files")
with self._job_store:
job = self._job_store.get_job(job_id)
# get output
output = self._read_remote_file(job_id, 'stdout.txt')
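            # stdout.txt contains the CWL runner's standard output, i.e. the
            # output object JSON that destage_job_output parses later.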
if len(output) > 0:
self._logger.debug("Output:")
self._logger.debug(output)
job.remote_output = output.decode()
# get log
log = self._read_remote_file(job_id, 'stderr.txt')
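            # Append only the stderr lines that were not yet recorded to the
            # job's debug log, then store the full log in remote_error so the
            # next update knows where it left off.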
if len(log) > 0:
lines = log.decode().splitlines()
last_lines = job.remote_error.splitlines()
first_new_line = len(last_lines)
                job.debug('\n'.join(lines[first_new_line:]))
job.remote_error = log.decode()
def _stage_input_file(self, count: int, job_id: str, input_file: File,
input_desc: Dict[str, Any]) -> int:
"""Stage an input file. Copies the file to the remote resource.
        Uses count to create unique file names, returns the new count
        (i.e. the next available number).
Args:
count: The next available unique count
job_id: The job id to stage for
input_file: The input file to stage
            input_desc: The input description whose location
                (and secondaryFiles) to update.
Returns:
The updated count
"""
staged_name = _create_input_filename(
str(count).zfill(2), input_file.location)
count += 1
with self._job_store:
job = self._job_store.get_job(job_id)
job.info('Staging input file {}'.format(input_file.location))
target_path = self._abs_path(job_id, 'work/{}'.format(staged_name))
cerulean.copy(cast(Path, input_file.source), target_path)
input_desc['location'] = str(
self._abs_path(job_id, 'work/' + staged_name))
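        # Secondary files are staged recursively; their descriptions in
        # input_desc['secondaryFiles'] are updated in the same way.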
for i, secondary_file in enumerate(input_file.secondary_files):
sec_input_desc = input_desc['secondaryFiles'][i]
count = self._stage_input_file(count, job_id, secondary_file,
sec_input_desc)
return count
def _add_file_to_job(self, job_id: str, rel_path: str,
data: bytes) -> None:
"""Write a file on the remote resource containing the given raw data.
Args:
job_id: The id of the job to write data for
rel_path: A path relative to the job's directory
data: The data to write
"""
remote_path = self._abs_path(job_id, rel_path)
remote_path.write_bytes(data)
def _read_remote_file(self, job_id: str, rel_path: str) -> bytes:
"""Read data from a remote file.
Silently returns an empty result if the file does not exist.
Args:
            job_id: The id of the job from whose directory a file is read
rel_path: A path relative to the job's directory
"""
try:
return self._abs_path(job_id, rel_path).read_bytes()
except FileNotFoundError:
return bytes()
def _abs_path(self, job_id: str, rel_path: str) -> Path:
"""Return an absolute remote path given a job-relative path.
Args:
            job_id: The id of the job whose directory to use
            rel_path: A path relative to the job's directory
"""
ret = self._basedir / 'jobs' / job_id
if rel_path != '':
ret /= rel_path
return ret
def _create_input_filename(unique_prefix: str, orig_path: str) -> str:
"""Return a string containing a remote filename that
resembles the original path this file was submitted with.
Args:
unique_prefix: A unique prefix, used to avoid collisions.
orig_path: A string we will try to resemble to aid
debugging.
"""
    result = orig_path
    result = result.replace('/', '_')
    result = result.replace('?', '_')
    result = result.replace('&', '_')
    result = result.replace('=', '_')
regex = re.compile('[^a-zA-Z0-9_.-]+')
result = regex.sub('_', result)
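    # Long names are shortened to their first and last 18 characters to keep
    # staged file names a manageable length.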
if len(result) > 39:
result = result[:18] + '___' + result[-18:]
return unique_prefix + '_' + result