Source code for snakemake.script

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2015-2019, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

import inspect
import os
import tempfile
import textwrap
import sys
import pickle
import traceback
import subprocess
import collections
import re
from urllib.request import urlopen, pathname2url
from urllib.error import URLError
from itertools import islice

from snakemake.utils import format
from snakemake.logging import logger
from snakemake.exceptions import WorkflowError
from snakemake.shell import shell
from snakemake.common import MIN_PY_VERSION, escape_backslash, SNAKEMAKE_SEARCHPATH
from snakemake.io import git_content, split_git_path
from snakemake.deployment import singularity


PY_VER_RE = re.compile("Python (?P<ver_min>\d+\.\d+).*")
# TODO use this to find the right place for inserting the preamble
PY_PREAMBLE_RE = re.compile(r"from( )+__future__( )+import.*?(?P<end>[;\n])")


[docs]class REncoder:
    """Encoding Pyton data structures into R."""

[docs]    @classmethod
    def encode_numeric(cls, value):
        if value is None:
            return "as.numeric(NA)"
        return str(value)

[docs]    @classmethod
    def encode_value(cls, value):
        if value is None:
            return "NULL"
        elif isinstance(value, str):
            return repr(value)
        elif isinstance(value, dict):
            return cls.encode_dict(value)
        elif isinstance(value, bool):
            return "TRUE" if value else "FALSE"
        elif isinstance(value, int) or isinstance(value, float):
            return str(value)
        elif isinstance(value, collections.abc.Iterable):
            # convert all iterables to vectors
            return cls.encode_list(value)
        else:
            # Try to convert from numpy if numpy is present
            try:
                import numpy as np

                if isinstance(value, np.number):
                    return str(value)
            except ImportError:
                pass
        raise ValueError("Unsupported value for conversion into R: {}".format(value))

[docs]    @classmethod
    def encode_list(cls, l):
        return "c({})".format(", ".join(map(cls.encode_value, l)))

[docs]    @classmethod
    def encode_items(cls, items):
        def encode_item(item):
            name, value = item
            return '"{}" = {}'.format(name, cls.encode_value(value))

        return ", ".join(map(encode_item, items))

[docs]    @classmethod
    def encode_dict(cls, d):
        d = "list({})".format(cls.encode_items(d.items()))
        return d

[docs]    @classmethod
    def encode_namedlist(cls, namedlist):
        positional = ", ".join(map(cls.encode_value, namedlist))
        named = cls.encode_items(namedlist.items())
        source = "list("
        if positional:
            source += positional
        if named:
            source += ", " + named
        source += ")"
        return source


[docs]class JuliaEncoder:
    """Encoding Pyton data structures into Julia."""

[docs]    @classmethod
    def encode_value(cls, value):
        if value is None:
            return "nothing"
        elif isinstance(value, str):
            return repr(value)
        elif isinstance(value, dict):
            return cls.encode_dict(value)
        elif isinstance(value, bool):
            return "true" if value else "false"
        elif isinstance(value, int) or isinstance(value, float):
            return str(value)
        elif isinstance(value, collections.abc.Iterable):
            # convert all iterables to vectors
            return cls.encode_list(value)
        else:
            # Try to convert from numpy if numpy is present
            try:
                import numpy as np

                if isinstance(value, np.number):
                    return str(value)
            except ImportError:
                pass
        raise ValueError(
            "Unsupported value for conversion into Julia: {}".format(value)
        )

[docs]    @classmethod
    def encode_list(cls, l):
        return "[{}]".format(", ".join(map(cls.encode_value, l)))

[docs]    @classmethod
    def encode_items(cls, items):
        def encode_item(item):
            name, value = item
            return '"{}" => {}'.format(name, cls.encode_value(value))

        return ", ".join(map(encode_item, items))

[docs]    @classmethod
    def encode_positional_items(cls, namedlist):
        encoded = ""
        for index, value in enumerate(namedlist):
            encoded += "{} => {}, ".format(index + 1, cls.encode_value(value))
        return encoded

[docs]    @classmethod
    def encode_dict(cls, d):
        d = "Dict({})".format(cls.encode_items(d.items()))
        return d

[docs]    @classmethod
    def encode_namedlist(cls, namedlist):
        positional = cls.encode_positional_items(namedlist)
        named = cls.encode_items(namedlist.items())
        source = "Dict("
        if positional:
            source += positional
        if named:
            source += named
        source += ")"
        return source


[docs]class Snakemake:
    def __init__(
        self,
        input,
        output,
        params,
        wildcards,
        threads,
        resources,
        log,
        config,
        rulename,
        bench_iteration,
        scriptdir=None,
    ):
        # convert input and output to plain strings as some remote objects cannot
        # be pickled
        self.input = input._plainstrings()
        self.output = output._plainstrings()
        self.params = params
        self.wildcards = wildcards
        self.threads = threads
        self.resources = resources
        self.log = log._plainstrings()
        self.config = config
        self.rule = rulename
        self.bench_iteration = bench_iteration
        self.scriptdir = scriptdir

[docs]    def log_fmt_shell(self, stdout=True, stderr=True, append=False):
        """
        Return a shell redirection string to be used in `shell()` calls

        This function allows scripts and wrappers support optional `log` files
        specified in the calling rule.  If no `log` was specified, then an
        empty string "" is returned, regardless of the values of `stdout`,
        `stderr`, and `append`.

        Parameters
        ---------

        stdout : bool
            Send stdout to log

        stderr : bool
            Send stderr to log

        append : bool
            Do not overwrite the log file. Useful for sending output of
            multiple commands to the same log. Note however that the log will
            not be truncated at the start.

        The following table describes the output:

        -------- -------- -------- ----- -------------
        stdout   stderr   append   log   return value
        -------- -------- -------- ----- ------------
        True     True     True     fn    >> fn 2>&1
        True     False    True     fn    >> fn
        False    True     True     fn    2>> fn
        True     True     False    fn    > fn 2>&1
        True     False    False    fn    > fn
        False    True     False    fn    2> fn
        any      any      any      None  ""
        -------- -------- -------- ----- -----------
        """
        if not self.log:
            return ""
        lookup = {
            (True, True, True): " >> {0} 2>&1",
            (True, False, True): " >> {0}",
            (False, True, True): " 2>> {0}",
            (True, True, False): " > {0} 2>&1",
            (True, False, False): " > {0}",
            (False, True, False): " 2> {0}",
        }
        return lookup[(stdout, stderr, append)].format(self.log)


[docs]def get_source(path, basedir="."):
    source = None
    if not path.startswith("http") and not path.startswith("git+file"):
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        if not os.path.isabs(path):
            path = os.path.abspath(os.path.join(basedir, path))
        path = "file://" + path
    path = format(path, stepout=1)
    if path.startswith("file://"):
        sourceurl = "file:" + pathname2url(path[7:])
    elif path.startswith("git+file"):
        source = git_content(path)
        (root_path, file_path, version) = split_git_path(path)
        path = path.rstrip("@" + version)
    else:
        sourceurl = path

    language = None
    if path.endswith(".py"):
        language = "python"
    elif path.endswith(".R"):
        language = "r"
    elif path.endswith(".Rmd"):
        language = "rmarkdown"
    elif path.endswith(".jl"):
        language = "julia"

    if source is None:
        with urlopen(sourceurl) as source:
            return path, source.read(), language
    else:
        return path, source, language


[docs]def script(
    path,
    basedir,
    input,
    output,
    params,
    wildcards,
    threads,
    resources,
    log,
    config,
    rulename,
    conda_env,
    singularity_img,
    singularity_args,
    env_modules,
    bench_record,
    jobid,
    bench_iteration,
    cleanup_scripts,
    shadow_dir,
):
    """
    Load a script from the given basedir + path and execute it.
    Supports Python 3 and R.
    """

    f = None
    try:
        path, source, language = get_source(path, basedir)
        if language == "python":
            wrapper_path = path[7:] if path.startswith("file://") else path
            snakemake = Snakemake(
                input,
                output,
                params,
                wildcards,
                threads,
                resources,
                log,
                config,
                rulename,
                bench_iteration,
                os.path.dirname(wrapper_path),
            )
            snakemake = pickle.dumps(snakemake)
            # Obtain search path for current snakemake module.
            # The module is needed for unpickling in the script.
            # We append it at the end (as a fallback).
            searchpath = SNAKEMAKE_SEARCHPATH
            if singularity_img is not None:
                searchpath = singularity.SNAKEMAKE_MOUNTPOINT
            searchpath = '"{}"'.format(searchpath)
            # For local scripts, add their location to the path in case they use path-based imports
            if path.startswith("file://"):
                searchpath += ', "{}"'.format(os.path.dirname(path[7:]))
            preamble = textwrap.dedent(
                """
            ######## Snakemake header ########
            import sys; sys.path.extend([{searchpath}]); import pickle; snakemake = pickle.loads({snakemake}); from snakemake.logging import logger; logger.printshellcmds = {printshellcmds}; __real_file__ = __file__; __file__ = {file_override};
            ######## Original script #########
            """
            ).format(
                searchpath=escape_backslash(searchpath),
                snakemake=snakemake,
                printshellcmds=logger.printshellcmds,
                file_override=repr(os.path.realpath(wrapper_path)),
            )
        elif language == "r" or language == "rmarkdown":
            preamble = textwrap.dedent(
                """
            ######## Snakemake header ########
            library(methods)
            Snakemake <- setClass(
                "Snakemake",
                slots = c(
                    input = "list",
                    output = "list",
                    params = "list",
                    wildcards = "list",
                    threads = "numeric",
                    log = "list",
                    resources = "list",
                    config = "list",
                    rule = "character",
                    bench_iteration = "numeric",
                    scriptdir = "character",
                    source = "function"
                )
            )
            snakemake <- Snakemake(
                input = {},
                output = {},
                params = {},
                wildcards = {},
                threads = {},
                log = {},
                resources = {},
                config = {},
                rule = {},
                bench_iteration = {},
                scriptdir = {},
                source = function(...){{
                    wd <- getwd()
                    setwd(snakemake@scriptdir)
                    source(...)
                    setwd(wd)
                }}
            )

            ######## Original script #########
            """
            ).format(
                REncoder.encode_namedlist(input),
                REncoder.encode_namedlist(output),
                REncoder.encode_namedlist(params),
                REncoder.encode_namedlist(wildcards),
                threads,
                REncoder.encode_namedlist(log),
                REncoder.encode_dict(
                    {
                        name: value
                        for name, value in resources.items()
                        if name != "_cores" and name != "_nodes"
                    }
                ),
                REncoder.encode_dict(config),
                REncoder.encode_value(rulename),
                REncoder.encode_numeric(bench_iteration),
                REncoder.encode_value(
                    os.path.dirname(path[7:])
                    if path.startswith("file://")
                    else os.path.dirname(path)
                ),
            )
        elif language == "julia":
            preamble = textwrap.dedent(
                """
                    ######## Snakemake header ########
                    struct Snakemake
                        input::Dict
                        output::Dict
                        params::Dict
                        wildcards::Dict
                        threads::Int64
                        log::Dict
                        resources::Dict
                        config::Dict
                        rule::String
                        bench_iteration
                        scriptdir::String
                        #source::Any
                    end
                    snakemake = Snakemake(
                        {}, #input::Dict
                        {}, #output::Dict
                        {}, #params::Dict
                        {}, #wildcards::Dict
                        {}, #threads::Int64
                        {}, #log::Dict
                        {}, #resources::Dict
                        {}, #config::Dict
                        {}, #rule::String
                        {}, #bench_iteration::Int64
                        {}, #scriptdir::String
                        #, #source::Any
                    )
                    ######## Original script #########
                    """.format(
                    JuliaEncoder.encode_namedlist(input),
                    JuliaEncoder.encode_namedlist(output),
                    JuliaEncoder.encode_namedlist(params),
                    JuliaEncoder.encode_namedlist(wildcards),
                    JuliaEncoder.encode_value(threads),
                    JuliaEncoder.encode_namedlist(log),
                    JuliaEncoder.encode_dict(
                        {
                            name: value
                            for name, value in resources.items()
                            if name != "_cores" and name != "_nodes"
                        }
                    ),
                    JuliaEncoder.encode_dict(config),
                    JuliaEncoder.encode_value(rulename),
                    JuliaEncoder.encode_value(bench_iteration),
                    JuliaEncoder.encode_value(
                        os.path.dirname(path[7:])
                        if path.startswith("file://")
                        else os.path.dirname(path)
                    ),
                ).replace(
                    "'", '"'
                )
            )
        else:
            raise ValueError(
                "Unsupported script: Expecting either Python (.py), R (.R), RMarkdown (.Rmd) or Julia (.jl) script."
            )

        dir = ".snakemake/scripts"
        os.makedirs(dir, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            suffix="." + os.path.basename(path), dir=dir, delete=False
        ) as f:
            if not language == "rmarkdown":
                f.write(preamble.encode())
                f.write(source)
            else:
                # Insert Snakemake object after the RMarkdown header
                code = source.decode()
                pos = next(islice(re.finditer(r"---\n", code), 1, 2)).start() + 3
                f.write(str.encode(code[:pos]))
                preamble = textwrap.dedent(
                    """
                    ```{r, echo=FALSE, message=FALSE, warning=FALSE}
                    %s
                    ```
                    """
                    % preamble
                )
                f.write(preamble.encode())
                f.write(str.encode(code[pos:]))

        if language == "python":
            py_exec = sys.executable
            if conda_env is not None:
                py = os.path.join(conda_env, "bin", "python")
                if os.path.exists(py):
                    out = subprocess.check_output(
                        [py, "--version"],
                        stderr=subprocess.STDOUT,
                        universal_newlines=True,
                    )
                    ver = tuple(
                        map(int, PY_VER_RE.match(out).group("ver_min").split("."))
                    )
                    if ver >= MIN_PY_VERSION:
                        # Python version is new enough, make use of environment
                        # to execute script
                        py_exec = "python"
                    else:
                        logger.warning(
                            "Conda environment defines Python "
                            "version < {0}.{1}. Using Python of the "
                            "master process to execute "
                            "script. Note that this cannot be avoided, "
                            "because the script uses data structures from "
                            "Snakemake which are Python >={0}.{1} "
                            "only.".format(*MIN_PY_VERSION)
                        )
            if singularity_img is not None:
                # use python from image
                py_exec = "python"
            if env_modules is not None:
                # use python from environment module
                py_exec = "python"
            # use the same Python as the running process or the one from the environment
            shell("{py_exec} {f.name:q}", bench_record=bench_record)
        elif language == "r":
            if conda_env is not None and "R_LIBS" in os.environ:
                logger.warning(
                    "R script job uses conda environment but "
                    "R_LIBS environment variable is set. This "
                    "is likely not intended, as R_LIBS can "
                    "interfere with R packages deployed via "
                    "conda. Consider running `unset R_LIBS` or "
                    "remove it entirely before executing "
                    "Snakemake."
                )
            shell("Rscript --vanilla {f.name:q}", bench_record=bench_record)
        elif language == "rmarkdown":
            if len(output) != 1:
                raise WorkflowError(
                    "RMarkdown scripts (.Rmd) may only have a single output file."
                )
            out = os.path.abspath(output[0])
            shell(
                'Rscript --vanilla -e \'rmarkdown::render("{f.name}", output_file="{out}", quiet=TRUE, knit_root_dir = "{workdir}", params = list(rmd="{f.name}"))\'',
                bench_record=bench_record,
                workdir=os.getcwd(),
            )
        elif language == "julia":
            shell("julia {f.name:q}", bench_record=bench_record)

    except URLError as e:
        raise WorkflowError(e)
    finally:
        if f and cleanup_scripts:
            os.remove(f.name)
        elif f:
            logger.warning("Not cleaning up {} upon request.".format(f.name))