Source code for tool.forge_bsgenome

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""
from __future__ import print_function

import os
import subprocess
import sys

from utils import logger

try:
    if hasattr(sys, '_run_from_cmdl') is True:
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
    from pycompss.api.task import task
    # from pycompss.api.api import compss_wait_on
except ImportError:
    logger.warn("[Warning] Cannot import \"pycompss\" API packages.")
    logger.warn("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports
    # from utils.dummy_pycompss import compss_wait_on  # pylint: disable=ungrouped-imports

from basic_modules.tool import Tool
from basic_modules.metadata import Metadata

from tool.common import cd


# ------------------------------------------------------------------------------

[docs]class bsgenomeTool(Tool): # pylint: disable=invalid-name """ Tool for peak calling for iDamID-seq data """ def __init__(self, configuration=None): """ Initialise the tool with its configuration. Parameters ---------- configuration : dict a dictionary containing parameters that define how the operation should be carried out, which are specific to each Tool. """ logger.info("Forge BSgenome") Tool.__init__(self) if configuration is None: configuration = {} self.configuration.update(configuration)
[docs] @staticmethod def genome_to_2bit(genome, genome_2bit): """ Generate the 2bit genome file from a FASTA file Parameters ---------- genome : str Location of the FASRA genome file genome_2bit : str Location of the 2bit genome file Returns ------- bool True if successful, False if not. """ command_line_2bit = "faToTwoBit " + genome + " " + genome_2bit try: logger.info("faToTwoBit ...") # args = shlex.split(command_line_2bit) process = subprocess.Popen( command_line_2bit, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process.wait() out, err = process.communicate() logger.info(out) if process.returncode > 0: logger.warn(err) return False except (IOError, OSError) as msg: logger.fatal("I/O error({0}) - faToTwoBit: {1}\n{2}".format( msg.errno, msg.strerror, command_line_2bit)) return False return True
[docs] @staticmethod def get_chrom_size(genome_2bit, chrom_size, circ_chrom): """ Generate the chrom.size file and identify the available chromosomes in the 2Bit file. Parameters ---------- genome_2bit : str Location of the 2bit genome file chrom_size : str Location to save the chrom.size file to circ_chrom : list List of chromosomes that are known to be circular Returns ------- If successful 2 lists: [0] : List of the linear chromosomes in the 2bit file [1] : List of circular chromosomes in the 2bit file Returns (False, False) if there is an IOError """ command_line_chrom_size = "twoBitInfo " + genome_2bit + " stdout | sort -k2rn" try: logger.info("twoBitInfo ...") # args = shlex.split(command_line_chrom_size) with open(chrom_size, "w") as f_out: sub_proc_1 = subprocess.Popen( command_line_chrom_size, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sub_proc_1.wait() out, err = sub_proc_1.communicate() f_out.write(out) logger.info(out) logger.warn(err) except (IOError, OSError) as msg: logger.fatal("I/O error({0} - twoBitInfo): {1}\n{2}".format( msg.errno, msg.strerror, command_line_chrom_size)) out, err = sub_proc_1.communicate() logger.info(out) logger.warn(err) return (False, False) chrom_seq_list = [] chrom_circ_list = [] with open(chrom_size, "r") as f_in: for line in f_in: line = line.split("\t") if any(cc in line[0] for cc in circ_chrom): chrom_circ_list.append(line[0]) else: chrom_seq_list.append(line[0]) return (chrom_seq_list, chrom_circ_list)
[docs] @task( returns=int, genome=FILE_IN, circ_chrom=IN, seed_file_param=IN, genome_2bit=FILE_OUT, chrom_size=FILE_OUT, seed_file=FILE_OUT, bsgenome=FILE_OUT, isModifier=False) def bsgenome_creater( # pylint: disable=no-self-use,too-many-return-statements self, genome, circ_chrom, seed_file_param, genome_2bit, chrom_size, seed_file, bsgenome): """ Make BSgenome index files.Uses an R script that wraps the required code. Parameters ---------- genome : str circo_chrom : str Comma separated list of chromosome ids that are circular in the genome seed_file_param : dict Parameters required for the function to build the seed file genome_2bit : str chrom_size : str seed_file : str bsgenome : str Returns ------- """ if self.genome_to_2bit(genome, genome_2bit) is False: return False chrom_seq_list, chrom_circ_list = self.get_chrom_size(genome_2bit, chrom_size, circ_chrom) if chrom_seq_list is False: return False genome_split = genome_2bit.split("/") seed_file_param["seqnames"] = 'c("' + '","'.join(chrom_seq_list) + '")' number_circ_chrs = len(chrom_circ_list) if number_circ_chrs > 0: seed_file_param["circ_seqs"] = 'c("' + '","'.join(chrom_circ_list) + '")' seed_file_param["circ_seqs"] = "" seed_file_param["seqs_srcdir"] = "/".join(genome_split[0:-1]) seed_file_param["seqfile_name"] = genome_split[-1] # Create the seed file seed_order = [ "Package", "Title", "Description", "Author", "Maintainer", "License", "Version", "organism", "common_name", "provider", "provider_version", "release_date", "release_name", "organism_biocview", "BSgenomeObjname", "seqnames", "circ_seqs", "seqs_srcdir", "seqfile_name" ] with open(seed_file, "wb") as f_out: for seed_key in seed_order: logger.info(seed_key + ": " + seed_file_param[seed_key]) f_out.write(seed_key + ": " + seed_file_param[seed_key] + "\n") # Forge the BSgenomedirectory rscript = os.path.join(os.path.dirname(__file__), "../scripts/forge_bsgenome.R") command_line = "Rscript " + rscript + " --file " + seed_file logger.info("BSGENOME CMD: Rscript scripts/forge_bsgenome.R --file " + seed_file) with cd(seed_file_param["seqs_srcdir"]): # args = shlex.split(command_line) try: logger.info("command_line") process = subprocess.Popen( command_line, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process.wait() out, err = process.communicate() logger.info(out) except (IOError, OSError) as msg: logger.fatal("I/O error({0} - forge_bsgenome.R): {1}\n{2}".format( msg.errno, msg.strerror, command_line)) out, err = process.communicate() logger.info(out) logger.warn(err) return False package_build = seed_file_param["seqs_srcdir"] + "/" + seed_file_param["Package"] command_line_build = "R CMD build " + package_build command_line_check = "R CMD check " + package_build try: logger.info(command_line_build) # args = shlex.split(command_line_build) process = subprocess.Popen( command_line_build, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process.wait() out, err = process.communicate() logger.info(out) except (IOError, OSError) as msg: logger.fatal("I/O error({0} - BUILD): {1}\n{2}".format( msg.errno, msg.strerror, command_line_build)) out, err = process.communicate() logger.info(out) logger.warn(err) return False try: logger.info(command_line_check) # args = shlex.split(command_line_check) process = subprocess.Popen( command_line_check, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process.wait() out, err = process.communicate() logger.info(out) # with open(package_build + ".Rcheck/00install.out", "r") as f_in: # logger.warn(f_in.read()) # with open(package_build + ".Rcheck/00check.log", "r") as f_in: # logger.warn(f_in.read()) except (IOError, OSError) as msg: logger.fatal("I/O error({0} - CHECK): {1}\n{2}".format( msg.errno, msg.strerror, command_line_check)) out, err = process.communicate() logger.info(out) logger.warn(err) # with open(package_build + ".Rcheck/00install.out", "r") as f_in: # logger.warn(f_in.read()) with open(package_build + ".Rcheck/00check.log", "r") as f_in: logger.warn(f_in.read()) return False try: package_file_name = package_build + "_" + seed_file_param["Version"] + ".tar.gz" with open(package_file_name, "rb") as f_in: with open(bsgenome, "wb") as f_out: f_out.write(f_in.read()) except (IOError, OSError) as msg: logger.fatal("BSgenome failed to generate the index file") logger.fatal("I/O error({0}) - Change Package name: {1}\n{2}\n{3}".format( msg.errno, msg.strerror, package_build + "_" + seed_file_param["Version"] + ".tar.gz", bsgenome)) return False return True
[docs] def run(self, input_files, input_metadata, output_files): """ The main function to run iNPS for peak calling over a given BAM file and matching background BAM file. Parameters ---------- input_files : list List of input bam file locations where 0 is the bam data file and 1 is the matching background bam file metadata : dict Returns ------- output_files : list List of locations for the output files. output_metadata : list List of matching metadata dict objects """ seed_param = {} seed_param["Title"] = str(self.configuration["idear_title"]) seed_param["Description"] = str(self.configuration["idear_description"]) seed_param["Author"] = str(self.configuration["idear_provider"]) maintainer = str(self.configuration["idear_provider"]) + " <datasubs@ebi.ac.uk>" seed_param["Maintainer"] = maintainer seed_param["License"] = "Apache 2.0" seed_param["common_name"] = str(self.configuration["idear_common_name"]) seed_param["BSgenomeObjname"] = str(self.configuration["idear_common_name"]) seed_param["assembly"] = input_metadata["genome"].meta_data["assembly"] seed_param["release_name"] = input_metadata["genome"].meta_data["assembly"] seed_param["organism"] = str(self.configuration["idear_organism"]) org_split = str(self.configuration["idear_organism"]).split(" ") seed_param["organism_biocview"] = "_".join(org_split) seed_param["release_date"] = str(self.configuration["idear_release_date"]) seed_param["provider"] = str(self.configuration["idear_provider"]) seed_param["Package"] = ( "BSgenome." + seed_param["common_name"] + "." + seed_param["assembly"] ) seed_param["Version"] = "1.4.2" seed_param["provider_version"] = seed_param["assembly"] circ_chroms = [] if "idear_circ_chrom" in self.configuration: circ_chroms = str(self.configuration["idear_circ_chrom"]).split(",") self.bsgenome_creater( input_files["genome"], circ_chroms, seed_param, output_files["genome_2bit"], output_files["chrom_size"], output_files["seed_file"], output_files["bsgenome"] ) # results = compss_wait_on(results) output_metadata = { "bsgenome": Metadata( data_type=input_metadata['genome'].data_type, file_type="TAR", file_path=output_files["bsgenome"], sources=[input_metadata["genome"].file_path], taxon_id=input_metadata["genome"].taxon_id, meta_data={ "assembly": input_metadata["genome"].meta_data["assembly"], "tool": "forge_bsgenome" } ), "chrom_size": Metadata( data_type=input_metadata['genome'].data_type, file_type="TXT", file_path=output_files["chrom_size"], sources=[input_metadata["genome"].file_path], taxon_id=input_metadata["genome"].taxon_id, meta_data={ "assembly": input_metadata["genome"].meta_data["assembly"], "tool": "forge_bsgenome" } ), "genome_2bit": Metadata( data_type=input_metadata['genome'].data_type, file_type="2BIT", file_path=output_files["genome_2bit"], sources=[input_metadata["genome"].file_path], taxon_id=input_metadata["genome"].taxon_id, meta_data={ "assembly": input_metadata["genome"].meta_data["assembly"], "tool": "forge_bsgenome" } ), "seed_file": Metadata( data_type=input_metadata['genome'].data_type, file_type="TXT", file_path=output_files["seed_file"], sources=[input_metadata["genome"].file_path], taxon_id=input_metadata["genome"].taxon_id, meta_data={ "assembly": input_metadata["genome"].meta_data["assembly"], "tool": "forge_bsgenome" } ) } return (output_files, output_metadata)
# ------------------------------------------------------------------------------