Source code for tool.tb_segment

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""
from __future__ import print_function

import sys
import os
import glob
import shutil
# from subprocess import CalledProcessError
from subprocess import PIPE
from subprocess import Popen

from basic_modules.tool import Tool
from utils import logger

from tool.common import format_utils

try:
    if hasattr(sys, '_run_from_cmdl') is True:
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
    from pycompss.api.task import task
    # from pycompss.api.api import compss_wait_on
    # from pycompss.api.constraint import constraint
except ImportError:
    logger.info("[Warning] Cannot import \"pycompss\" API packages.")
    logger.info("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports
    # from utils.dummy_pycompss import compss_wait_on # pylint: disable=ungrouped-imports
    # from utils.dummy_pycompss import constraint # pylint: disable=ungrouped-imports


# ------------------------------------------------------------------------------

class tbSegmentTool(Tool):  # pylint: disable=invalid-name
    """
    Tool for finding tads and compartments in an adjacency matrix
    """

    def __init__(self):
        """
        Init function
        """
        logger.info("TADbit - Segment")
        Tool.__init__(self)

    # NOTE(review): tad_dir and compartment_dir are declared as FILE_OUT but
    # are not parameters of tb_segment, while callers, chromosomes, fasta and
    # ncpus have no declaration - confirm this is what pycompss expects.
    @task(bamin=FILE_IN, biases=FILE_IN, resolution=IN, workdir=IN,
          tad_dir=FILE_OUT, compartment_dir=FILE_OUT)
    def tb_segment(  # pylint: disable=too-many-locals,too-many-statements,unused-argument,no-self-use,too-many-arguments
            self, bamin, biases, resolution, callers, chromosomes, workdir,
            fasta=None, ncpus="1"):
        """
        Function to find tads and compartments in the Hi-C matrix

        Runs ``tadbit segment`` and then rewrites the headers of the tsv
        files found in the result directories of the requested callers.

        Parameters
        ----------
        bamin : str
            Location of the tadbit bam paired reads
        biases : str
            Location of the pickle hic biases. Not passed to tadbit when empty
        resolution : int or str
            Resolution of the Hi-C
        callers : str
            Must contain "1" for tad calling and/or "2" for compartment
            calling
        chromosomes : str
            Value handed to the ``--chromosomes`` option. Not passed to tadbit
            when empty
        workdir : str
            Location of working directory
        fasta : str
            Value handed to the ``--fasta`` option. Not passed to tadbit when
            empty
        ncpus : int or str
            Number of cpus to use

        Returns
        -------
        output_files : list
            Result directories, tads first and compartments second, holding
            only the entries of the requested callers
        output_metadata : dict
            Always empty
        """
        logger.info("TB SEGMENT: {0} {1} {2}".format(bamin, resolution, workdir))

        # Every element of the argument list has to be a string: the
        # resolution may arrive as an int from the metadata.
        _cmd = [
            'tadbit', 'segment',
            '--nosql',
            '--mreads', bamin,
            '--workdir', workdir,
            '--resolution', str(resolution),
            '--cpu', str(ncpus)
        ]

        if '2' not in callers:
            _cmd.append('--only_tads')
        if '1' not in callers:
            _cmd.append('--only_compartments')
        if chromosomes:
            _cmd.append('--chromosomes')
            _cmd.append(chromosomes)
        if fasta:
            _cmd.append('--fasta')
            _cmd.append(fasta)
        if biases:
            _cmd.append('--biases')
            _cmd.append(biases)

        output_metadata = {}
        output_files = []

        process = Popen(_cmd, stdout=PIPE, stderr=PIPE)
        out, err = process.communicate()
        logger.info(out)
        logger.info(err)
        if process.returncode != 0:
            # Only reported: a missing result directory still makes
            # clean_headers fail below, as it did before.
            logger.info(
                "[Warning] tadbit segment exited with code {0}".format(process.returncode))

        segment_dir = os.path.join(workdir, '06_segmentation')
        nice_resolution = format_utils.nice(int(resolution))

        if '1' in callers:
            tad_dir = os.path.join(segment_dir, 'tads_%s' % (nice_resolution))
            clean_headers(tad_dir)
            output_files.append(tad_dir)
        if '2' in callers:
            cmprt_dir = os.path.join(segment_dir, 'compartments_%s' % (nice_resolution))
            clean_headers(cmprt_dir)
            output_files.append(cmprt_dir)

        return (output_files, output_metadata)

    def run(self, input_files, input_metadata, output_files):  # pylint: disable=too-many-locals
        """
        The main function to the predict TAD sites and compartments for a
        given resolution from the Hi-C matrix

        Parameters
        ----------
        input_files : list
            bamin : str
                Location of the tadbit bam paired reads
            biases : str
                Location of the pickle hic biases (optional second element)
        input_metadata : dict
            resolution : int or str
                Resolution of the Hi-C, "1000000" when missing
            workdir : str
                Location of working directory, the directory of the bam file
                when missing
            ncpus : int
                Number of cpus to use, 1 when missing
            chromosomes : str
                Forwarded to tb_segment, None when missing
            fasta : str
                Forwarded to tb_segment, None when missing
            callers : str
                Forwarded to tb_segment, "1" when missing
        output_files : list
            Not used by this function

        Returns
        -------
        output_files : list
            List of locations for the output files.
        output_metadata : dict
            Metadata returned by tb_segment
        """
        bamin = input_files[0]

        # samtools writes the index next to the bam as <bam>.bai; appending the
        # suffix avoids rewriting other ".bam" occurrences in the path.
        if not os.path.isfile(bamin + '.bai'):
            logger.info('Creating bam index')
            _cmd = ['samtools', 'index', bamin]
            process = Popen(_cmd, stdout=PIPE, stderr=PIPE)
            out, err = process.communicate()
            logger.info(out)
            logger.info(err)
            if process.returncode != 0:
                logger.info(
                    "[Warning] samtools index exited with code {0}".format(process.returncode))

        resolution = input_metadata.get('resolution', '1000000')
        ncpus = input_metadata.get('ncpus', 1)
        biases = input_files[1] if len(input_files) > 1 else None
        chromosomes = input_metadata.get('chromosomes')
        fasta = input_metadata.get('fasta')
        callers = input_metadata.get('callers', "1")
        root_name = input_metadata.get(
            'workdir', os.path.dirname(os.path.abspath(bamin)))

        # input and output share most metadata
        output_files, output_metadata = self.tb_segment(
            bamin, biases, resolution, callers, chromosomes, root_name, fasta, ncpus)

        return (output_files, output_metadata)
def clean_headers(fpath):
    """
    Replaces spaces by underscores in the headers of tsv files

    Every ``*.tsv`` file of the directory is copied to ``vre_<name>`` in the
    same directory with the spaces of its first line replaced by underscores;
    the remaining lines are copied unchanged and the original file is removed.
    Files that already carry the ``vre_`` prefix are left alone, so calling
    the function again on the same directory does not rename or delete
    previous results.

    The working directory of the process is not modified.

    Parameters
    ----------
    fpath : str
        Directory holding the tsv files

    Raises
    ------
    OSError
        If fpath can not be listed, e.g. because it does not exist
    """
    # os.listdir fails on a missing directory just like the former os.chdir
    # did, without leaving the process in another working directory.
    for file_name in sorted(os.listdir(fpath)):
        if not file_name.endswith('.tsv'):
            continue
        # Hidden files were never matched by the "*.tsv" pattern; vre_ files
        # are the output of an earlier call.
        if file_name.startswith('.') or file_name.startswith('vre_'):
            continue

        src_path = os.path.join(fpath, file_name)
        dest_path = os.path.join(fpath, 'vre_' + file_name)

        with open(src_path) as tsv_file:
            with open(dest_path, mode="w") as to_file:
                header = tsv_file.readline()
                to_file.write(header.replace(' ', '_'))
                shutil.copyfileobj(tsv_file, to_file)

        # Both handles are closed before the source is removed.
        os.unlink(src_path)