Source code for tool.tb_normalize

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""
from __future__ import print_function

import sys
import glob
import os
from subprocess import PIPE, Popen
import subprocess

from basic_modules.tool import Tool

from utils import logger

try:
    if hasattr(sys, '_run_from_cmdl') is True:
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
    from pycompss.api.task import task
    # from pycompss.api.api import compss_wait_on
    # from pycompss.api.constraint import constraint
except ImportError:
    logger.info("[Warning] Cannot import \"pycompss\" API packages.")
    logger.info("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports
    # from utils.dummy_pycompss import compss_wait_on  # pylint: disable=ungrouped-imports
    # from utils.dummy_pycompss import constraint


# ------------------------------------------------------------------------------

[docs]class tbNormalizeTool(Tool): # pylint: disable=invalid-name """ Tool for normalizing an adjacency matrix """ def __init__(self): """ Init function """ logger.info("TADbit - Normalize") Tool.__init__(self)
[docs] @task(bamin=FILE_IN, normalization=IN, resolution=IN, min_perc=IN, max_perc=IN, workdir=IN, biases=FILE_OUT, interactions_plot=FILE_OUT, filtered_bins_plot=FILE_OUT) def tb_normalize(self, bamin, normalization, resolution, min_perc, # pylint: disable=too-many-locals,too-many-statements,unused-argument,no-self-use,too-many-arguments max_perc, workdir, ncpus="1", min_count=None, fasta=None, mappability=None, rest_enzyme=None): """ Function to normalize to a given resolution the Hi-C matrix Parameters ---------- bamin : str Location of the tadbit bam paired reads normalization: str normalization(s) to apply. Order matters. Choices: [Vanilla, oneD] resolution : str Resolution of the Hi-C min_perc : str lower percentile from which consider bins as good. max_perc : str upper percentile until which consider bins as good. workdir : str Location of working directory ncpus : str Number of cpus to use min_count : str minimum number of reads mapped to a bin (recommended value could be 2500). If set this option overrides the perc_zero fasta: str Location of the fasta file with genome sequence, to compute GC content and number of restriction sites per bin. Required for oneD normalization mappability: str Location of the file with mappability, required for oneD normalization rest_enzyme: str For oneD normalization. Name of the restriction enzyme used to do the Hi-C experiment Returns ------- hic_biases : str Location of HiC biases pickle file interactions : str Location of interaction decay vs genomic distance pdf filtered_bins : str Location of filtered_bins png """ # chr_hic_data = read_matrix(matrix_file, resolution=int(resolution)) logger.info("TB NORMALIZATION: {0} {1} {2} {3} {4} {5}".format( bamin, normalization, resolution, min_perc, max_perc, workdir)) _cmd = [ 'tadbit', 'normalize', '--bam', bamin, '--normalization', normalization, '--workdir', workdir, '--resolution', resolution, '--cpus', str(ncpus) ] if min_perc: _cmd.append('--min_perc') _cmd.append(min_perc) if max_perc: _cmd.append('--max_perc') _cmd.append(max_perc) if min_count: _cmd.append('--min_count') _cmd.append(min_count) if normalization == 'oneD': _cmd.append('--fasta') _cmd.append(fasta) _cmd.append('--mappability') _cmd.append(mappability) _cmd.append('--renz') _cmd.append(rest_enzyme) output_metadata = {} output_files = [] try: _ = subprocess.check_output(_cmd, stderr=subprocess.STDOUT, cwd=workdir) except subprocess.CalledProcessError as subp_err: logger.info(subp_err.output) if not min_count: logger.info("cis/trans ratio failed, trying with min_count. Disabling plot.") _cmd.append('--min_count') _cmd.append('10') _cmd.append('--normalize_only') try: _ = subprocess.check_output(_cmd, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as subp_err: logger.fatal(subp_err.output) os.chdir(workdir+"/04_normalization") for fl_file in glob.glob("biases_*.pickle"): output_files.append(os.path.abspath(fl_file)) break for fl_file in glob.glob("interactions*.png"): output_files.append(os.path.abspath(fl_file)) break for fl_file in glob.glob("filtered_bins_*.png"): output_files.append(os.path.abspath(fl_file)) break return (output_files, output_metadata)
[docs] def run(self, input_files, input_metadata, output_files): # pylint: disable=too-many-locals """ The main function for the normalization of the Hi-C matrix to a given resolution Parameters ---------- input_files : list bamin : str Location of the tadbit bam paired reads metadata : dict normalization: str normalization(s) to apply. Order matters. Choices: [Vanilla, oneD] resolution : str Resolution of the Hi-C min_perc : str lower percentile from which consider bins as good. max_perc : str upper percentile until which consider bins as good. workdir : str Location of working directory ncpus : str Number of cpus to use min_count : str minimum number of reads mapped to a bin (recommended value could be 2500). If set this option overrides the perc_zero fasta: str Location of the fasta file with genome sequence, to compute GC content and number of restriction sites per bin. Required for oneD normalization mappability: str Location of the file with mappability, required for oneD normalization rest_enzyme: str For oneD normalization. Name of the restriction enzyme used to do the Hi-C experiment Returns ------- output_files : list List of locations for the output files. output_metadata : list List of matching metadata dict objects """ bamin = input_files[0] if not os.path.isfile(bamin.replace('.bam', '.bam.bai')): logger.info('Creating bam index') _cmd = ['samtools', 'index', bamin] out, err = Popen(_cmd, stdout=PIPE, stderr=PIPE).communicate() logger.info(out) logger.info(err) resolution = '1000000' if 'resolution' in input_metadata: resolution = input_metadata['resolution'] normalization = 'Vanilla' if 'normalization' in input_metadata: normalization = input_metadata['normalization'] min_perc = max_perc = min_count = fasta = mappability = rest_enzyme = None ncpus = 1 if 'ncpus' in input_metadata: ncpus = input_metadata['ncpus'] if 'min_perc' in input_metadata: min_perc = input_metadata['min_perc'] if 'max_perc' in input_metadata: max_perc = input_metadata['max_perc'] if 'min_count' in input_metadata: min_count = input_metadata['min_count'] if 'fasta' in input_metadata: fasta = input_metadata['fasta'] if 'mappability' in input_metadata: mappability = input_metadata['mappability'] if 'rest_enzyme' in input_metadata: rest_enzyme = input_metadata['rest_enzyme'] root_name = os.path.dirname(os.path.abspath(bamin)) if 'workdir' in input_metadata: root_name = input_metadata['workdir'] # input and output share most metadata output_files, output_metadata = self.tb_normalize(bamin, normalization, resolution, min_perc, max_perc, root_name, ncpus, min_count, fasta, mappability, rest_enzyme) return (output_files, output_metadata)
# ------------------------------------------------------------------------------