Source code for tool.tb_normalize
"""
.. See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import print_function
import sys
import glob
import os
from subprocess import PIPE, Popen
import subprocess
from basic_modules.tool import Tool
from utils import logger
try:
if hasattr(sys, '_run_from_cmdl') is True:
raise ImportError
from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
from pycompss.api.task import task
# from pycompss.api.api import compss_wait_on
# from pycompss.api.constraint import constraint
except ImportError:
logger.info("[Warning] Cannot import \"pycompss\" API packages.")
logger.info(" Using mock decorators.")
from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN # pylint: disable=ungrouped-imports
from utils.dummy_pycompss import task # pylint: disable=ungrouped-imports
# from utils.dummy_pycompss import compss_wait_on # pylint: disable=ungrouped-imports
# from utils.dummy_pycompss import constraint
# ------------------------------------------------------------------------------
[docs]class tbNormalizeTool(Tool): # pylint: disable=invalid-name
"""
Tool for normalizing an adjacency matrix
"""
def __init__(self):
"""
Init function
"""
logger.info("TADbit - Normalize")
Tool.__init__(self)
[docs] @task(bamin=FILE_IN, normalization=IN, resolution=IN, min_perc=IN,
max_perc=IN, workdir=IN, biases=FILE_OUT, interactions_plot=FILE_OUT,
filtered_bins_plot=FILE_OUT)
def tb_normalize(self, bamin, normalization, resolution, min_perc, # pylint: disable=too-many-locals,too-many-statements,unused-argument,no-self-use,too-many-arguments
max_perc, workdir, ncpus="1", min_count=None, fasta=None,
mappability=None, rest_enzyme=None):
"""
Function to normalize to a given resolution the Hi-C
matrix
Parameters
----------
bamin : str
Location of the tadbit bam paired reads
normalization: str
normalization(s) to apply. Order matters. Choices: [Vanilla, oneD]
resolution : str
Resolution of the Hi-C
min_perc : str
lower percentile from which consider bins as good.
max_perc : str
upper percentile until which consider bins as good.
workdir : str
Location of working directory
ncpus : str
Number of cpus to use
min_count : str
minimum number of reads mapped to a bin (recommended value
could be 2500). If set this option overrides the perc_zero
fasta: str
Location of the fasta file with genome sequence, to compute GC content and
number of restriction sites per bin. Required for oneD normalization
mappability: str
Location of the file with mappability, required for oneD normalization
rest_enzyme: str
For oneD normalization. Name of the restriction enzyme used to do the Hi-C experiment
Returns
-------
hic_biases : str
Location of HiC biases pickle file
interactions : str
Location of interaction decay vs genomic distance pdf
filtered_bins : str
Location of filtered_bins png
"""
# chr_hic_data = read_matrix(matrix_file, resolution=int(resolution))
logger.info("TB NORMALIZATION: {0} {1} {2} {3} {4} {5}".format(
bamin, normalization, resolution, min_perc, max_perc, workdir))
_cmd = [
'tadbit', 'normalize',
'--bam', bamin,
'--normalization', normalization,
'--workdir', workdir,
'--resolution', resolution,
'--cpus', str(ncpus)
]
if min_perc:
_cmd.append('--min_perc')
_cmd.append(min_perc)
if max_perc:
_cmd.append('--max_perc')
_cmd.append(max_perc)
if min_count:
_cmd.append('--min_count')
_cmd.append(min_count)
if normalization == 'oneD':
_cmd.append('--fasta')
_cmd.append(fasta)
_cmd.append('--mappability')
_cmd.append(mappability)
_cmd.append('--renz')
_cmd.append(rest_enzyme)
output_metadata = {}
output_files = []
try:
_ = subprocess.check_output(_cmd, stderr=subprocess.STDOUT,
cwd=workdir)
except subprocess.CalledProcessError as subp_err:
logger.info(subp_err.output)
if not min_count:
logger.info("cis/trans ratio failed, trying with min_count. Disabling plot.")
_cmd.append('--min_count')
_cmd.append('10')
_cmd.append('--normalize_only')
try:
_ = subprocess.check_output(_cmd, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as subp_err:
logger.fatal(subp_err.output)
os.chdir(workdir+"/04_normalization")
for fl_file in glob.glob("biases_*.pickle"):
output_files.append(os.path.abspath(fl_file))
break
for fl_file in glob.glob("interactions*.png"):
output_files.append(os.path.abspath(fl_file))
break
for fl_file in glob.glob("filtered_bins_*.png"):
output_files.append(os.path.abspath(fl_file))
break
return (output_files, output_metadata)
[docs] def run(self, input_files, input_metadata, output_files): # pylint: disable=too-many-locals
"""
The main function for the normalization of the Hi-C matrix to a given resolution
Parameters
----------
input_files : list
bamin : str
Location of the tadbit bam paired reads
metadata : dict
normalization: str
normalization(s) to apply. Order matters. Choices: [Vanilla, oneD]
resolution : str
Resolution of the Hi-C
min_perc : str
lower percentile from which consider bins as good.
max_perc : str
upper percentile until which consider bins as good.
workdir : str
Location of working directory
ncpus : str
Number of cpus to use
min_count : str
minimum number of reads mapped to a bin (recommended value
could be 2500). If set this option overrides the perc_zero
fasta: str
Location of the fasta file with genome sequence, to compute GC content and
number of restriction sites per bin. Required for oneD normalization
mappability: str
Location of the file with mappability, required for oneD normalization
rest_enzyme: str
For oneD normalization.
Name of the restriction enzyme used to do the Hi-C experiment
Returns
-------
output_files : list
List of locations for the output files.
output_metadata : list
List of matching metadata dict objects
"""
bamin = input_files[0]
if not os.path.isfile(bamin.replace('.bam', '.bam.bai')):
logger.info('Creating bam index')
_cmd = ['samtools', 'index', bamin]
out, err = Popen(_cmd, stdout=PIPE, stderr=PIPE).communicate()
logger.info(out)
logger.info(err)
resolution = '1000000'
if 'resolution' in input_metadata:
resolution = input_metadata['resolution']
normalization = 'Vanilla'
if 'normalization' in input_metadata:
normalization = input_metadata['normalization']
min_perc = max_perc = min_count = fasta = mappability = rest_enzyme = None
ncpus = 1
if 'ncpus' in input_metadata:
ncpus = input_metadata['ncpus']
if 'min_perc' in input_metadata:
min_perc = input_metadata['min_perc']
if 'max_perc' in input_metadata:
max_perc = input_metadata['max_perc']
if 'min_count' in input_metadata:
min_count = input_metadata['min_count']
if 'fasta' in input_metadata:
fasta = input_metadata['fasta']
if 'mappability' in input_metadata:
mappability = input_metadata['mappability']
if 'rest_enzyme' in input_metadata:
rest_enzyme = input_metadata['rest_enzyme']
root_name = os.path.dirname(os.path.abspath(bamin))
if 'workdir' in input_metadata:
root_name = input_metadata['workdir']
# input and output share most metadata
output_files, output_metadata = self.tb_normalize(bamin, normalization,
resolution, min_perc,
max_perc, root_name, ncpus,
min_count, fasta, mappability,
rest_enzyme)
return (output_files, output_metadata)
# ------------------------------------------------------------------------------