Source code for tool.tb_bin

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""
from __future__ import print_function

import sys
from sys import stdout
from subprocess import PIPE, Popen
import os

from utils import logger

try:
    from cPickle import load
except ImportError:
    logger.info("Using pickle instead of cPickle")
    from pickle import load

try:
    if hasattr(sys, '_run_from_cmdl') is True:
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
    from pycompss.api.task import task
except ImportError:
    logger.info("[Warning] Cannot import \"pycompss\" API packages.")
    logger.info("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports

from basic_modules.tool import Tool

from pytadbit.parsers.hic_bam_parser import write_matrix  # pylint: disable=import-error,no-name-in-module
from pytadbit import Chromosome  # pylint: disable=import-error
from pytadbit.parsers.hic_parser import load_hic_data_from_bam  # pylint: disable=import-error,no-name-in-module
from pytadbit.mapping.analyze import hic_map  # pylint: disable=import-error


# ------------------------------------------------------------------------------

[docs]class tbBinTool(Tool): # pylint: disable=invalid-name,too-few-public-methods """ Tool for binning an adjacency matrix """ def __init__(self): """ Init function """ logger.info("TADbit - Bin") Tool.__init__(self)
[docs] @task(bamin=FILE_IN, biases=FILE_IN, resolution=IN, coord1=IN, coord2=IN, norm=IN, workdir=IN, raw_matrix=FILE_OUT, raw_fig=FILE_OUT, nrm_matrix=FILE_OUT, nrm_fig=FILE_OUT, json_matrix=FILE_OUT) def tb_bin(self, bamin, biases, resolution, coord1, coord2, # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches,no-self-use norm, workdir, ncpus="1", metadata=None): """ Function to bin to a given resolution the Hi-C matrix Parameters ---------- bamin : str Location of the tadbit bam paired reads biases : str Location of the pickle hic biases resolution : int Resolution of the Hi-C coord1 : str Coordinate of the region to retrieve. By default all genome, arguments can be either one chromosome name, or the coordinate in the form: "-c chr3:110000000-120000000" coord2 : str Coordinate of a second region to retrieve the matrix in the intersection with the first region. norm : list [['raw']] normalization(s) to apply. Order matters. Choices: [norm, decay, raw] workdir : str Location of working directory ncpus : int Number of cpus to use Returns ------- hic_contacts_matrix_raw : str Location of HiC raw matrix in text format hic_contacts_matrix_nrm : str Location of HiC normalized matrix in text format hic_contacts_matrix_raw_fig : str Location of HiC raw matrix in png format hic_contacts_matrix_norm_fig : str Location of HiC normalized matrix in png format """ logger.info("TB BIN: {0}, {1}, {2}".format(bamin, resolution, workdir)) output_metadata = {} if coord2 and not coord1: coord1, coord2 = coord2, coord1 if not coord1: region1 = None start1 = None end1 = None region2 = None start2 = None end2 = None else: try: crm1, pos1 = coord1.split(':') start1, end1 = pos1.split('-') region1 = crm1 start1 = int(start1) end1 = int(end1) except ValueError: region1 = coord1 start1 = None end1 = None if coord2: try: crm2, pos2 = coord2.split(':') start2, end2 = pos2.split('-') region2 = crm2 start2 = int(start2) end2 = int(end2) except ValueError: region2 = coord2 start2 = None end2 = None else: region2 = None start2 = None end2 = None if region1: if region1: stdout.write('\nExtraction of %s' % (region1)) if start1: stdout.write(':%s-%s' % (start1, end1)) else: stdout.write(' (full chromosome)') if region2: stdout.write(' intersection with %s' % (region2)) if start2: stdout.write(':%s-%s\n' % (start2, end2)) else: stdout.write(' (full chromosome)\n') else: stdout.write('\n') else: stdout.write('\nExtraction of full genome\n') out_files = write_matrix( bamin, resolution, load(open(biases)) if biases else None, workdir, filter_exclude=[], normalizations=norm, half_matrix=True, region1=region1, start1=start1, end1=end1, region2=region2, start2=start2, end2=end2, tmpdir=workdir, append_to_tar=None, ncpus=ncpus, verbose=True) output_files = [out_files['RAW']] imx = load_hic_data_from_bam(bamin, resolution, biases=biases if biases else None, region=region1, ncpus=ncpus, tmpdir=workdir) hic_contacts_matrix_raw_fig = workdir+"/genomic_maps_raw.png" focus = None by_chrom = None if start1 is not None: focus = ((start1/resolution) if start1 != 0 else 1, (end1/resolution)) else: if region1 is not None: focus = region1 else: by_chrom = 'all' hic_contacts_matrix_raw_fig = workdir+"/genomic_maps_raw" hic_map(imx, resolution, savefig=hic_contacts_matrix_raw_fig, normalized=False, by_chrom=by_chrom, focus=focus) if by_chrom == 'all': hic_map(imx, resolution, savefig=hic_contacts_matrix_raw_fig+"/full_map.png", normalized=False, by_chrom=None, focus=focus) output_files.append(hic_contacts_matrix_raw_fig) if len(norm) > 1: output_files.append(out_files['NRM']) hic_contacts_matrix_norm_fig = workdir+"/genomic_maps_nrm.png" if by_chrom == 'all': hic_contacts_matrix_norm_fig = workdir+"/genomic_maps_nrm" hic_map(imx, resolution, savefig=hic_contacts_matrix_norm_fig, normalized=True, by_chrom=by_chrom, focus=focus) if by_chrom == 'all': hic_map(imx, resolution, savefig=hic_contacts_matrix_norm_fig+"/full_map.png", normalized=True, by_chrom=None, focus=focus) output_files.append(hic_contacts_matrix_norm_fig) json_chr = Chromosome(name="VRE Chromosome", species=metadata["species"], assembly=metadata["assembly"], max_tad_size=260000) json_chr.add_experiment( "exp1", resolution, hic_data=imx, silent=True) exp = json_chr.experiments[0] json_file_name = out_files['RAW'].replace(".abc", ".json") if start1 is not None: if focus[1]-focus[0] > 1200: logger.info("TADkit json too big, limiting to 1200 bins") focus = (focus[0], focus[0]+1200) else: if exp.size > 1200: logger.info("TADkit json too big, limiting to 1200 bins") focus = (1, 1200) exp.write_json(json_file_name, focus=focus) output_files.append(json_file_name) return (output_files, output_metadata)
[docs] def run(self, input_files, input_metadata, output_files): # pylint: disable=too-many-locals """ The main function to the predict TAD sites for a given resolution from the Hi-C matrix Parameters ---------- input_files : list bamin : str Location of the tadbit bam paired reads biases : str Location of the pickle hic biases input_metadata : dict resolution : int Resolution of the Hi-C coord1 : str Coordinate of the region to retrieve. By default all genome, arguments can be either one chromosome name, or the coordinate in the form: "-c chr3:110000000-120000000" coord2 : str Coordinate of a second region to retrieve the matrix in the intersection with the first region. norm : str [['raw']] normalization(s) to apply. Order matters. Choices: [norm, decay, raw] workdir : str Location of working directory ncpus : int Number of cpus to use Returns ------- output_files : list List of locations for the output files. output_metadata : list List of matching metadata dict objects """ bamin = input_files[0] if not os.path.isfile(bamin.replace('.bam', '.bam.bai')): logger.info('Creating bam index') _cmd = ['samtools', 'index', bamin] out, err = Popen(_cmd, stdout=PIPE, stderr=PIPE).communicate() logger.info(out) logger.info(err) biases = None if len(input_files) > 1: biases = input_files[1] resolution = 100000 if 'resolution' in input_metadata: resolution = int(input_metadata['resolution']) coord1 = coord2 = norm = None ncpus = 1 if 'ncpus' in input_metadata: ncpus = input_metadata['ncpus'] if 'coord1' in input_metadata: coord1 = input_metadata['coord1'] if 'coord2' in input_metadata: coord2 = input_metadata['coord2'] if 'norm' in input_metadata: norm = input_metadata['norm'] root_name = os.path.dirname(os.path.abspath(bamin)) if 'workdir' in input_metadata: root_name = input_metadata['workdir'] # input and output share most metadata project_metadata = {} project_metadata["species"] = input_metadata["species"] project_metadata["assembly"] = input_metadata["assembly"] output_files, output_metadata = self.tb_bin(bamin, biases, resolution, coord1, coord2, norm, root_name, ncpus, project_metadata) return (output_files, output_metadata)
# ------------------------------------------------------------------------------