Source code for mg_process_files.tool.wig_indexer

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""

from __future__ import print_function

import sys
import subprocess
import shlex

import numpy as np
import h5py

from utils import logger

try:
    if hasattr(sys, '_run_from_cmdl') is True:
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, FILE_INOUT, IN
    from pycompss.api.task import task
    from pycompss.api.api import compss_wait_on
except ImportError:
    logger.warn("[Warning] Cannot import \"pycompss\" API packages.")
    logger.warn("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_INOUT, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import compss_wait_on  # pylint: disable=ungrouped-imports

from basic_modules.tool import Tool

# ------------------------------------------------------------------------------


[docs]class wigIndexerTool(Tool):
    """
    Tool for running indexers over a WIG file for use in the RESTful API
    """

    def __init__(self, configuration=None):
        """
        Init function
        """
        logger.info("WIG File Indexer")
        Tool.__init__(self)

        if configuration is None:
            configuration = {}

        self.configuration.update(configuration)

[docs]    @task(returns=bool, file_wig=FILE_IN, file_chrom=FILE_IN, file_bw=FILE_OUT,
          isModifier=False)
    def wig2bigwig(self, file_wig, file_chrom, file_bw):  # pylint: disable=no-self-use
        """
        WIG to BigWig converter

        This uses the ``wigToBigWig`` program binary provided at
        http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/
        to perform the conversion from WIG to BigWig.

        Parameters
        ----------
        file_wig : str
            Location of the wig file
        file_chrom : str
            Location of the chrom.size file
        file_bw : str
            Location of the bigWig file

        Example
        -------
        .. code-block:: python
           :linenos:

           if not self.wig2bigwig(wig_file, chrom_file, bw_file):
               output_metadata.set_exception(
                   Exception(
                       "wig2bigWig: Could not process files {}, {}.".format(*input_files)))

        """
        command_line = 'wigToBigWig ' + file_wig + ' ' + file_chrom + ' ' + file_bw + '.tmp.bw'
        try:
            args = shlex.split(command_line)
            process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            process.communicate()
        except (IOError, OSError) as msg:
            logger.fatal("I/O error({0} - twoBitInfo): {1}\n{2}".format(
                msg.errno, msg.strerror, command_line))
            out, err = process.communicate()  # pylint: disable=unused-variable
            logger.warn("wigToBigWig: " + err)
            return False

        logger.info('BIGWIG - COMMAND: ' + command_line)
        logger.info('BIGWIG - FILES: ' + file_wig + ", " + file_chrom + ", " + file_bw)

        try:
            with open(file_bw, 'wb') as f_out:
                with open(file_bw + '.tmp.bw', 'rb') as f_in:
                    f_out.write(f_in.read())
        except (IOError, OSError) as msg:
            logger.fatal("wig2BigWig - I/O error({0}): {1}\n{2}".format(
                msg.errno, msg.strerror, command_line))
            return False

        return True

[docs]    @task(returns=bool, file_id=IN, assembly=IN, file_wig=FILE_IN, file_hdf5=FILE_INOUT)
    def wig2hdf5(self, file_id, assembly, file_wig, file_hdf5):  # pylint: disable=no-self-use,too-many-branches,too-many-locals,too-many-statements
        """
        WIG to HDF5 converter

        Loads the WIG file into the HDF5 index file that gets used by the REST
        API to determine if there are files that have data in a given region.
        Overlapping regions are condensed into a single feature block rather
        than maintaining all of the detail of the original WIG file.

        Parameters
        ----------
        file_id : str
            The file_id as stored by the DMP so that it can be used for file
            retrieval later
        assembly : str
            Assembly of the genome that is getting indexed so that the
            chromosomes match
        file_wig : str
            Location of the wig file
        file_hdf5 : str
            Location of the HDF5 index file

        Example
        -------
        .. code-block:: python
           :linenos:

           if not self.wig2hdf5(file_id, assembly, wig_file, hdf5_file):
               output_metadata.set_exception(
                   Exception(
                       "wig2hdf5: Could not process files {}, {}.".format(*input_files)))

        """
        max_files = 1024
        max_chromosomes = 1024
        max_chromosome_size = 2000000000

        hdf5_in = h5py.File(file_hdf5, "a")

        if str(assembly) in hdf5_in:
            grp = hdf5_in[str(assembly)]
            # Setup the variable in the datastructure
            meta = hdf5_in['meta']  # pylint: disable=unused-variable

            dset = grp['data']
            fset = grp['files']
            cset = grp['chromosomes']
            file_idx = [i for i in fset if i != '']
            if file_id not in file_idx:
                file_idx.append(file_id)
                # pylint is unable to recognise the resize and shape methods
                dset.resize((dset.shape[0], dset.shape[1] + 1, max_chromosome_size))  # pylint: disable=no-member
            chrom_idx = [c for c in cset if c != '']

        else:
            # Create the initial dataset with minimum values
            grp = hdf5_in.create_group(str(assembly))
            # Setup the variable in the datastructure
            meta = hdf5_in.create_group('meta')  # pylint: disable=unused-variable

            dtf = h5py.special_dtype(vlen=str)
            dtc = h5py.special_dtype(vlen=str)
            fset = grp.create_dataset('files', (max_files,), dtype=dtf)
            cset = grp.create_dataset('chromosomes', (max_chromosomes,), dtype=dtc)

            file_idx = [file_id]
            chrom_idx = []

            dset = grp.create_dataset(
                'data1', (0, 1, max_chromosome_size),
                maxshape=(max_chromosomes, max_files, max_chromosome_size),
                dtype='bool', chunks=True, compression="gzip")

        # Save the list of files
        fset[0:len(file_idx)] = file_idx

        file_chrom_count = 0

        dnp = np.zeros([max_chromosome_size], dtype='bool')

        loaded = False

        start = 0
        span = 1
        step = 1
        step_type = 'variable'
        previous_chrom = ''
        previous_start = 0
        previous_end = 0

        with open(file_wig, 'r') as f_in:
            for line in f_in:  # pylint: disable=too-many-nested-blocks
                line = line.strip()
                if line[0:9] == 'fixedStep' or line[0:12] == 'variableStep':
                    start = 0
                    span = 1
                    step = 1
                    previous_chrom = ''
                    step_type = 'variable'

                    sline = line.split(' ')

                    if sline[0][0:9] == 'fixedStep':
                        step_type = 'fixed'

                    if previous_chrom != '':
                        dnp[previous_start:previous_end + 1] = 1

                    chrom = ''
                    for key_value in sline[1:]:
                        key_value.strip()
                        if not key_value:
                            continue
                        k, i = key_value.split('=')
                        if k == 'start':
                            start = int(i)
                        elif k == 'span':
                            span = int(i)
                        elif k == 'step':
                            step = int(i)
                        elif k == 'chrom':
                            chrom = i

                    file_chrom_count += 1
                    if previous_chrom != '' and previous_chrom != chrom:
                        if previous_chrom not in chrom_idx:
                            chrom_idx.append(previous_chrom)
                            cset[0:len(chrom_idx)] = chrom_idx
                            dset.resize((dset.shape[0] + 1, dset.shape[1], max_chromosome_size))

                        dset[chrom_idx.index(previous_chrom), file_idx.index(file_id), :] += dnp

                        dnp = np.zeros([max_chromosome_size], dtype='bool')
                        loaded = True

                    previous_chrom = chrom

                else:
                    loaded = False
                    if step_type == 'fixed':
                        if float(line) == 0.0:
                            if previous_start != previous_end:
                                dnp[previous_start:previous_end + 1] = 1
                                previous_start = 0
                                previous_end = 0
                        else:
                            if previous_start == 0:
                                previous_start = start
                                previous_end = start + span - 1
                            else:
                                if previous_end == start - 1:
                                    previous_end += span
                                else:
                                    dnp[previous_start:previous_end + 1] = 1
                                    previous_start = start
                                    previous_end = start + span - 1

                        start += step

                    elif step_type == 'variable':
                        sline = line.split("\t")
                        if float(sline[1]) == 0.0:
                            if previous_start != previous_end:
                                dnp[previous_start:previous_end + 1] = 1
                                previous_start = 0
                                previous_end = 0
                        else:
                            if previous_start == 0:
                                previous_start = int(sline[0])
                                previous_end = int(sline[0]) + span - 1
                            else:
                                if previous_end == int(sline[0]) - 1:
                                    previous_end += span
                                else:
                                    dnp[previous_start:previous_end + 1] = 1
                                    previous_start = int(sline[0])
                                    previous_end = int(sline[0]) + span - 1

            if loaded is False:
                if previous_chrom not in chrom_idx:
                    chrom_idx.append(chrom)
                    cset[0:len(chrom_idx)] = chrom_idx
                    dset.resize((dset.shape[0] + 1, dset.shape[1], max_chromosome_size))

                dset[chrom_idx.index(previous_chrom), file_idx.index(file_id), :] = dnp

        hdf5_in.close()

        return True

[docs]    def run(self, input_files, input_metadata, output_files):
        """
        Function to run the WIG file sorter and indexer so that the files can
        get searched as part of the REST API

        Parameters
        ----------
        input_files : dict
            wig_file : str
                Location of the wig file
            chrom_size : str
                Location of chrom.size file
            hdf5_file : str
                Location of the HDF5 index file
        meta_data : dict

        Returns
        -------
        list
            bw_file : str
                Location of the BigWig file
            hdf5_file : str
                Location of the HDF5 index file

        """
        logger.info(
            'PIPELINE FILES:', input_files["wig"], input_files["chrom_file"],
            output_files["bw_file"])
        results_1 = self.wig2bigwig(
            input_files["wig"], input_files["chrom_file"], output_files["bw_file"])
        results_1 = compss_wait_on(results_1)

        results_2 = self.wig2hdf5(
            input_files["wig"], input_metadata["wig"].meta_data["assembly"],
            input_files["wig"], input_files["hdf5_file"])
        results_2 = compss_wait_on(results_2)

        output_generated_files = {
            "bw_file": output_files["bw_file"],
            "hdf5_file": input_files["hdf5_file"]
        }
        output_metadata = {
            "bw_file": input_files["wig"],
            "hdf5_file": input_metadata["hdf5_file"]
        }

        return (output_generated_files, output_metadata)

# ------------------------------------------------------------------------------