Source code for mg_process_files.tool.bed_indexer

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""

from __future__ import print_function

import sys
import subprocess
import shlex

import numpy as np
import h5py

from utils import logger

try:
    if hasattr(sys, '_run_from_cmdl'):
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, FILE_INOUT, IN
    from pycompss.api.task import task
    from pycompss.api.api import compss_wait_on
except ImportError:
    logger.warn("[Warning] Cannot import \"pycompss\" API packages.")
    logger.warn("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_INOUT, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import compss_wait_on  # pylint: disable=ungrouped-imports

from basic_modules.tool import Tool

# ------------------------------------------------------------------------------


class bedIndexerTool(Tool):
    """
    Tool for running indexers over a BED file for use in the RESTful API
    """

    def __init__(self, configuration=None):
        """
        Init function
        """
        logger.info("BED File Indexer")
        Tool.__init__(self)

        if configuration is None:
            configuration = {}

        self.configuration.update(configuration)

    def bed_feature_length(self, file_bed):
        """
        BED Feature Length

        Function to calculate the average length of a feature in a BED file.

        Parameters
        ----------
        file_bed : str
            Location of the BED file

        Returns
        -------
        average_feature_length : int
            The average length of the features in a BED file.
        """
        total_feature_count = 0
        total_feature_length = 0

        with open(file_bed, 'r') as f_in:
            for line in f_in:
                line = line.strip()
                sline = line.split("\t")
                start = int(sline[1])
                end = int(sline[2])
                total_feature_count += 1
                total_feature_length += end - start

        return total_feature_length / total_feature_count
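
    # A minimal usage sketch (the file path is illustrative): bed2hdf5 uses
    # this average to pick its storage resolution, so a file of short
    # features, e.g. SNP calls, is indexed per base rather than in 1 kb bins.
    #
    #     indexer = bedIndexerTool()
    #     if indexer.bed_feature_length("/tmp/snps.bed") < 10:
    #         pass  # bed2hdf5 will use storage_level = 1 (per-base bits)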

    @task(returns=bool, file_sorted_bed=FILE_IN, file_chrom=FILE_IN,
          file_bb=FILE_OUT, bed_type=IN, isModifier=False)
    def bed2bigbed(self, file_sorted_bed, file_chrom, file_bb, bed_type=None):  # pylint: disable=no-self-use
        """
        BED to BigBed converter

        This uses the ``bedToBigBed`` program binary provided at
        http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/ to perform the
        conversion from bed to bigbed.

        Parameters
        ----------
        file_sorted_bed : str
            Location of the sorted BED file
        file_chrom : str
            Location of the chrom.size file
        file_bb : str
            Location of the bigBed file
        bed_type : str, optional
            BED type passed to ``bedToBigBed`` via its ``-type`` option
            (for example ``bed6`` or ``bed12+3``)

        Example
        -------
        .. code-block:: python
           :linenos:

           if not self.bed2bigbed(bed_file, chrom_file, bb_file):
               output_metadata.set_exception(
                   Exception(
                       "bed2bigbed: Could not process files {}, {}.".format(*input_files)))
        """
        command_line = 'bedToBigBed'
        if bed_type is not None:
            command_line += ' -type=' + str(bed_type)
        command_line += ' ' + file_sorted_bed + ' ' + file_chrom + ' ' + file_bb + '.tmp.bb'

        logger.info("BED 2 BIGBED: " + command_line)
        args = shlex.split(command_line)
        process_handle = subprocess.Popen(args)
        process_handle.wait()

        # Copy the temporary bigBed over to the declared output location
        with open(file_bb, 'wb') as f_out:
            with open(file_bb + '.tmp.bb', 'rb') as f_in:
                f_out.write(f_in.read())

        return True
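
    # For reference, the subprocess call above is equivalent to running the
    # UCSC binary directly (file names are illustrative); ``bedToBigBed``
    # must be on the PATH and the BED file must already be sorted:
    #
    #     bedToBigBed -type=bed6 sorted.bed chrom.size annotations.bb.tmp.bb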

    @task(returns=bool, file_id=IN, assembly=IN, file_sorted_bed=FILE_IN,
          file_hdf5=FILE_INOUT)
    def bed2hdf5(self, file_id, assembly, file_sorted_bed, file_hdf5):  # pylint: disable=no-self-use,too-many-locals,too-many-statements,too-many-branches
        """
        BED to HDF5 converter

        Loads the BED file into the HDF5 index file that gets used by the
        REST API to determine if there are files that have data in a given
        region. Overlapping regions are condensed into a single feature block
        rather than maintaining all of the detail of the original bed file.

        Parameters
        ----------
        file_id : str
            The file_id as stored by the DM-API so that it can be used for
            file retrieval later
        assembly : str
            Assembly of the genome that is getting indexed so that the
            chromosomes match
        file_sorted_bed : str
            Location of the sorted BED file
        file_hdf5 : str
            Location of the HDF5 index file

        Notes
        -----
        The level of resolution that features are recorded at is chosen
        automatically. There are 2 options: 1, which records features at
        every single base, and 1000, which groups features into 1000 bp
        chunks. The single base pair option is only used when the features
        are, on average, less than 10 bp long.

        Example
        -------
        .. code-block:: python
           :linenos:

           if not self.bed2hdf5(file_id, assembly, bed_file, hdf5_file):
               output_metadata.set_exception(
                   Exception(
                       "bed2hdf5: Could not process files {}, {}.".format(*input_files)))
        """
        max_files = 1024
        max_chromosomes = 1024
        max_chromosome_size = 2000000000

        feature_length = self.bed_feature_length(file_sorted_bed)
        storage_level = 1000
        if feature_length < 10:
            storage_level = 1

        hdf5_in = h5py.File(file_hdf5, "a")

        if str(assembly) in hdf5_in:
            grp = hdf5_in[str(assembly)]
            # Required for preparing the data object
            meta = hdf5_in['meta']  # pylint: disable=unused-variable

            dset1 = grp['data1']
            dset1k = grp['data1k']
            fset = grp['files']
            cset = grp['chromosomes']

            file_idx_1 = [fs for fs in fset[0] if fs != '']
            file_idx_1k = [fs for fs in fset[1] if fs != '']

            if file_id not in file_idx_1 and file_id not in file_idx_1k:
                if storage_level == 1000:
                    file_idx_1k.append(file_id)
                else:
                    file_idx_1.append(file_id)

                # pylint comment: resize is a valid member of the objects
                dset1.resize(
                    (dset1.shape[0], dset1.shape[1] + 1, max_chromosome_size))  # pylint: disable=no-member
                dset1k.resize(
                    (dset1k.shape[0], dset1k.shape[1] + 1, max_chromosome_size // 1000))  # pylint: disable=no-member

            chrom_idx = [c for c in cset if c != '']
        else:
            # Create the initial dataset with minimum values
            grp = hdf5_in.create_group(str(assembly))
            hdf5_in.create_group('meta')

            dtf = h5py.special_dtype(vlen=str)
            dtc = h5py.special_dtype(vlen=str)
            fset = grp.create_dataset('files', (2, max_files), dtype=dtf)
            cset = grp.create_dataset('chromosomes', (max_chromosomes,), dtype=dtc)

            file_idx_1 = []
            file_idx_1k = []
            chrom_idx = []

            logger.info(
                str(max_chromosome_size) + " " + str(max_chromosomes) + " " + str(max_files))

            dset1 = grp.create_dataset(
                'data1', (0, 1, max_chromosome_size),
                maxshape=(max_chromosomes, max_files, max_chromosome_size),
                dtype='bool', chunks=True, compression="gzip")
            dset1k = grp.create_dataset(
                'data1k', (0, 1, max_chromosome_size // 1000),
                maxshape=(max_chromosomes, max_files, max_chromosome_size // 1000),
                dtype='bool', chunks=True, compression="gzip")

            if storage_level == 1000:
                file_idx_1k.append(file_id)
            else:
                file_idx_1.append(file_id)

        # Save the list of files
        fset[0, 0:len(file_idx_1)] = file_idx_1
        fset[1, 0:len(file_idx_1k)] = file_idx_1k

        file_chrom_count = 0
        if storage_level == 1000:
            dnp = np.zeros([int(np.ceil(max_chromosome_size / 1000))], dtype='bool')
        else:
            dnp = np.zeros([max_chromosome_size], dtype='bool')

        previous_chrom = ''
        loaded = False

        with open(file_sorted_bed, 'r') as f_in:
            for line in f_in:
                line = line.strip()
                sline = line.split("\t")

                chrom = str(sline[0])
                start = int(sline[1])
                end = int(sline[2])

                loaded = False

                if chrom != previous_chrom and previous_chrom != '':
                    # Moved on to a new chromosome: flush the bit array for
                    # the previous one
                    file_chrom_count += 1
                    if previous_chrom not in chrom_idx:
                        chrom_idx.append(previous_chrom)
                        cset[0:len(chrom_idx)] = chrom_idx
                        dset1.resize(
                            (dset1.shape[0] + 1, dset1.shape[1], max_chromosome_size))
                        dset1k.resize(
                            (dset1k.shape[0] + 1, dset1k.shape[1], max_chromosome_size // 1000))

                    loaded = True
                    if storage_level == 1000:
                        dset1k[chrom_idx.index(previous_chrom),
                               file_idx_1k.index(file_id), :] = dnp
                        dnp = np.zeros(
                            [int(np.ceil(max_chromosome_size / 1000))], dtype='bool')
                    else:
                        dset1[chrom_idx.index(previous_chrom),
                              file_idx_1.index(file_id), :] = dnp
                        dnp = np.zeros([max_chromosome_size], dtype='bool')

                previous_chrom = chrom
                if storage_level == 1000:
                    dnp[int(np.floor(start / 1000)):int(np.ceil(end / 1000))] = True
                else:
                    dnp[start:end + 1] = True

        if loaded is False:
            # Flush the bit array for the final chromosome
            if previous_chrom not in chrom_idx:
                chrom_idx.append(previous_chrom)
                cset[0:len(chrom_idx)] = chrom_idx
                dset1.resize(
                    (dset1.shape[0] + 1, dset1.shape[1], max_chromosome_size))
                dset1k.resize(
                    (dset1k.shape[0] + 1, dset1k.shape[1], max_chromosome_size // 1000))

            if storage_level == 1000:
                dset1k[chrom_idx.index(previous_chrom),
                       file_idx_1k.index(file_id), :] = dnp
            else:
                dset1[chrom_idx.index(previous_chrom),
                      file_idx_1.index(file_id), :] = dnp

        hdf5_in.close()

        return True
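
    # A minimal sketch of reading the index back for a file stored at the
    # 1 kb level (the assembly name and region bounds are illustrative):
    #
    #     with h5py.File(file_hdf5, "r") as hdf5_in:
    #         grp = hdf5_in["GRCh38"]
    #         files_1k = [f for f in grp["files"][1] if f != ""]
    #         chroms = [c for c in grp["chromosomes"] if c != ""]
    #         bits = grp["data1k"][chroms.index("chr1"),
    #                              files_1k.index(file_id), :]
    #         # True if file_id has features within chr1:1200000-1300000
    #         has_data = bits[1200000 // 1000:1300000 // 1000].any()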

    def run(self, input_files, input_metadata, output_files):
        """
        Function to run the BED file sorter and indexer so that the files can
        get searched as part of the REST API

        Parameters
        ----------
        input_files : dict
            bed : str
                Location of the sorted bed file
            chrom_file : str
                Location of the chrom.size file
            hdf5_file : str
                Location of the HDF5 index file
        input_metadata : dict
            Matching metadata objects, including the file_id used to
            identify the original bed file and the genome assembly accession
        output_files : dict
            bb_file : str
                Location of the BigBed file

        Returns
        -------
        (output_generated_files, output_metadata) : tuple
            bb_file : str
                Location of the BigBed file
            hdf5_file : str
                Location of the HDF5 index file

        Example
        -------
        .. code-block:: python
           :linenos:

           import tool

           # Bed Indexer
           b = tool.bedIndexerTool(self.configuration)
           bi, bm = b.run(
               {"bed": bed_file, "chrom_file": chrom_file, "hdf5_file": hdf5_file},
               {"bed": bed_meta, "hdf5_file": hdf5_meta},
               {"bb_file": bb_file}
           )
        """
        bed_type = None
        if "bed_type" in self.configuration:
            bed_type = self.configuration["bed_type"]

        results = self.bed2bigbed(
            input_files["bed"], input_files["chrom_file"],
            output_files["bb_file"], bed_type)
        results = compss_wait_on(results)

        results = self.bed2hdf5(
            input_files["bed"],
            input_metadata["bed"].meta_data["assembly"],
            input_files["bed"],
            input_files["hdf5_file"]
        )
        results = compss_wait_on(results)

        output_generated_files = {
            "bb_file": output_files["bb_file"],
            "hdf5_file": input_files["hdf5_file"]
        }
        output_metadata = {
            "bb_file": input_metadata["bed"],
            "hdf5_file": input_metadata["hdf5_file"]
        }

        return (output_generated_files, output_metadata)

# ------------------------------------------------------------------------------
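
# Minimal standalone usage sketch. ``BedMeta`` below is an illustrative
# stand-in: ``run`` only needs objects exposing ``meta_data["assembly"]``
# for the metadata entries. All file paths are assumptions.
#
#     class BedMeta(object):
#         meta_data = {"assembly": "GRCh38"}
#
#     indexer = bedIndexerTool()
#     files_out, meta_out = indexer.run(
#         {"bed": "/data/peaks.sorted.bed",
#          "chrom_file": "/data/GRCh38.chrom.size",
#          "hdf5_file": "/data/index.hdf5"},
#         {"bed": BedMeta(), "hdf5_file": BedMeta()},
#         {"bb_file": "/data/peaks.bb"}
#     )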