Source code for tool.tb_bin

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""
from __future__ import print_function

import sys
from sys import stdout
from subprocess import PIPE, Popen
import os

from utils import logger

try:
    from cPickle import load
except ImportError:
    logger.info("Using pickle instead of cPickle")
    from pickle import load

try:
    if hasattr(sys, '_run_from_cmdl') is True:
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
    from pycompss.api.task import task
except ImportError:
    logger.info("[Warning] Cannot import \"pycompss\" API packages.")
    logger.info("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports

from basic_modules.tool import Tool

from pytadbit.parsers.hic_bam_parser import write_matrix  # pylint: disable=import-error,no-name-in-module
from pytadbit import Chromosome  # pylint: disable=import-error
from pytadbit.parsers.hic_parser import load_hic_data_from_bam  # pylint: disable=import-error,no-name-in-module
from pytadbit.mapping.analyze import hic_map  # pylint: disable=import-error


# ------------------------------------------------------------------------------

[docs]class tbBinTool(Tool):  # pylint: disable=invalid-name,too-few-public-methods
    """
    Tool for binning an adjacency matrix
    """

    def __init__(self):
        """
        Init function
        """
        logger.info("TADbit - Bin")
        Tool.__init__(self)

[docs]    @task(bamin=FILE_IN, biases=FILE_IN, resolution=IN, coord1=IN, coord2=IN,
          norm=IN, workdir=IN, raw_matrix=FILE_OUT, raw_fig=FILE_OUT,
          nrm_matrix=FILE_OUT, nrm_fig=FILE_OUT, json_matrix=FILE_OUT)
    def tb_bin(self, bamin, biases, resolution, coord1, coord2,  # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches,no-self-use
               norm, workdir, ncpus="1", metadata=None):
        """
        Function to bin to a given resolution the Hi-C
        matrix

        Parameters
        ----------
        bamin : str
            Location of the tadbit bam paired reads
        biases : str
            Location of the pickle hic biases
        resolution : int
            Resolution of the Hi-C
        coord1 : str
            Coordinate of the region to retrieve. By default all genome,
            arguments can be either one chromosome name, or the coordinate in
            the form: "-c chr3:110000000-120000000"
        coord2 : str
            Coordinate of a second region to retrieve the matrix in the
            intersection with the first region.
        norm : list
            [['raw']] normalization(s) to apply. Order matters. Choices: [norm,
            decay, raw]
        workdir : str
            Location of working directory
        ncpus : int
            Number of cpus to use

        Returns
        -------
        hic_contacts_matrix_raw : str
            Location of HiC raw matrix in text format
        hic_contacts_matrix_nrm : str
            Location of HiC normalized matrix in text format
        hic_contacts_matrix_raw_fig : str
            Location of HiC raw matrix in png format
        hic_contacts_matrix_norm_fig : str
            Location of HiC normalized matrix in png format

        """
        logger.info("TB BIN: {0}, {1}, {2}".format(bamin, resolution, workdir))

        output_metadata = {}

        if coord2 and not coord1:
            coord1, coord2 = coord2, coord1

        if not coord1:
            region1 = None
            start1 = None
            end1 = None
            region2 = None
            start2 = None
            end2 = None
        else:
            try:
                crm1, pos1 = coord1.split(':')
                start1, end1 = pos1.split('-')
                region1 = crm1
                start1 = int(start1)
                end1 = int(end1)
            except ValueError:
                region1 = coord1
                start1 = None
                end1 = None
            if coord2:
                try:
                    crm2, pos2 = coord2.split(':')
                    start2, end2 = pos2.split('-')
                    region2 = crm2
                    start2 = int(start2)
                    end2 = int(end2)
                except ValueError:
                    region2 = coord2
                    start2 = None
                    end2 = None
            else:
                region2 = None
                start2 = None
                end2 = None

        if region1:
            if region1:
                stdout.write('\nExtraction of %s' % (region1))
                if start1:
                    stdout.write(':%s-%s' % (start1, end1))
                else:
                    stdout.write(' (full chromosome)')
                if region2:
                    stdout.write(' intersection with %s' % (region2))
                    if start2:
                        stdout.write(':%s-%s\n' % (start2, end2))
                    else:
                        stdout.write(' (full chromosome)\n')
                else:
                    stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

        out_files = write_matrix(
            bamin, resolution,
            load(open(biases)) if biases else None,
            workdir, filter_exclude=[],
            normalizations=norm, half_matrix=True,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=workdir, append_to_tar=None, ncpus=ncpus,
            verbose=True)
        output_files = [out_files['RAW']]
        imx = load_hic_data_from_bam(bamin, resolution,
                                     biases=biases if biases else None,
                                     region=region1,
                                     ncpus=ncpus, tmpdir=workdir)
        hic_contacts_matrix_raw_fig = workdir+"/genomic_maps_raw.png"
        focus = None
        by_chrom = None
        if start1 is not None:
            focus = ((start1/resolution) if start1 != 0 else 1, (end1/resolution))
        else:
            if region1 is not None:
                focus = region1
            else:
                by_chrom = 'all'
                hic_contacts_matrix_raw_fig = workdir+"/genomic_maps_raw"
        hic_map(imx, resolution, savefig=hic_contacts_matrix_raw_fig,
                normalized=False, by_chrom=by_chrom, focus=focus)
        if by_chrom == 'all':
            hic_map(imx, resolution, savefig=hic_contacts_matrix_raw_fig+"/full_map.png",
                    normalized=False, by_chrom=None, focus=focus)
        output_files.append(hic_contacts_matrix_raw_fig)

        if len(norm) > 1:
            output_files.append(out_files['NRM'])
            hic_contacts_matrix_norm_fig = workdir+"/genomic_maps_nrm.png"
            if by_chrom == 'all':
                hic_contacts_matrix_norm_fig = workdir+"/genomic_maps_nrm"
            hic_map(imx, resolution, savefig=hic_contacts_matrix_norm_fig,
                    normalized=True, by_chrom=by_chrom, focus=focus)
            if by_chrom == 'all':
                hic_map(imx, resolution, savefig=hic_contacts_matrix_norm_fig+"/full_map.png",
                        normalized=True, by_chrom=None, focus=focus)
            output_files.append(hic_contacts_matrix_norm_fig)

        json_chr = Chromosome(name="VRE Chromosome", species=metadata["species"],
                              assembly=metadata["assembly"], max_tad_size=260000)
        json_chr.add_experiment(
            "exp1", resolution,
            hic_data=imx,
            silent=True)

        exp = json_chr.experiments[0]
        json_file_name = out_files['RAW'].replace(".abc", ".json")
        if start1 is not None:
            if focus[1]-focus[0] > 1200:
                logger.info("TADkit json too big, limiting to 1200 bins")
                focus = (focus[0], focus[0]+1200)
        else:
            if exp.size > 1200:
                logger.info("TADkit json too big, limiting to 1200 bins")
                focus = (1, 1200)
        exp.write_json(json_file_name, focus=focus)
        output_files.append(json_file_name)

        return (output_files, output_metadata)

[docs]    def run(self, input_files, input_metadata, output_files):  # pylint: disable=too-many-locals
        """
        The main function to the predict TAD sites for a given resolution from
        the Hi-C matrix

        Parameters
        ----------
        input_files : list
            bamin : str
                Location of the tadbit bam paired reads
            biases : str
                Location of the pickle hic biases
        input_metadata : dict
            resolution : int
                Resolution of the Hi-C
            coord1 : str
                Coordinate of the region to retrieve. By default all genome,
                arguments can be either one chromosome name, or the coordinate in
                the form: "-c chr3:110000000-120000000"
            coord2 : str
                Coordinate of a second region to retrieve the matrix in the
                intersection with the first region.
            norm : str
                [['raw']] normalization(s) to apply. Order matters. Choices: [norm,
                decay, raw]
            workdir : str
                Location of working directory
            ncpus : int
                Number of cpus to use



        Returns
        -------
        output_files : list
            List of locations for the output files.
        output_metadata : list
            List of matching metadata dict objects

        """

        bamin = input_files[0]

        if not os.path.isfile(bamin.replace('.bam', '.bam.bai')):
            logger.info('Creating bam index')
            _cmd = ['samtools', 'index', bamin]
            out, err = Popen(_cmd, stdout=PIPE, stderr=PIPE).communicate()
            logger.info(out)
            logger.info(err)

        biases = None
        if len(input_files) > 1:
            biases = input_files[1]
        resolution = 100000
        if 'resolution' in input_metadata:
            resolution = int(input_metadata['resolution'])

        coord1 = coord2 = norm = None
        ncpus = 1
        if 'ncpus' in input_metadata:
            ncpus = input_metadata['ncpus']

        if 'coord1' in input_metadata:
            coord1 = input_metadata['coord1']
        if 'coord2' in input_metadata:
            coord2 = input_metadata['coord2']
        if 'norm' in input_metadata:
            norm = input_metadata['norm']

        root_name = os.path.dirname(os.path.abspath(bamin))
        if 'workdir' in input_metadata:
            root_name = input_metadata['workdir']

        # input and output share most metadata
        project_metadata = {}
        project_metadata["species"] = input_metadata["species"]
        project_metadata["assembly"] = input_metadata["assembly"]

        output_files, output_metadata = self.tb_bin(bamin, biases, resolution,
                                                    coord1, coord2, norm, root_name,
                                                    ncpus, project_metadata)

        return (output_files, output_metadata)

# ------------------------------------------------------------------------------