Source code for tool.forge_bsgenome
"""
.. See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import print_function
import os
import subprocess
import sys
from utils import logger
try:
if hasattr(sys, '_run_from_cmdl') is True:
raise ImportError
from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
from pycompss.api.task import task
# from pycompss.api.api import compss_wait_on
except ImportError:
logger.warn("[Warning] Cannot import \"pycompss\" API packages.")
logger.warn(" Using mock decorators.")
from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN # pylint: disable=ungrouped-imports
from utils.dummy_pycompss import task # pylint: disable=ungrouped-imports
# from utils.dummy_pycompss import compss_wait_on # pylint: disable=ungrouped-imports
from basic_modules.tool import Tool
from basic_modules.metadata import Metadata
from tool.common import cd
# ------------------------------------------------------------------------------
[docs]class bsgenomeTool(Tool): # pylint: disable=invalid-name
"""
Tool for peak calling for iDamID-seq data
"""
def __init__(self, configuration=None):
"""
Initialise the tool with its configuration.
Parameters
----------
configuration : dict
a dictionary containing parameters that define how the operation
should be carried out, which are specific to each Tool.
"""
logger.info("Forge BSgenome")
Tool.__init__(self)
if configuration is None:
configuration = {}
self.configuration.update(configuration)
[docs] @staticmethod
def genome_to_2bit(genome, genome_2bit):
"""
Generate the 2bit genome file from a FASTA file
Parameters
----------
genome : str
Location of the FASRA genome file
genome_2bit : str
Location of the 2bit genome file
Returns
-------
bool
True if successful, False if not.
"""
command_line_2bit = "faToTwoBit " + genome + " " + genome_2bit
try:
logger.info("faToTwoBit ...")
# args = shlex.split(command_line_2bit)
process = subprocess.Popen(
command_line_2bit, shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
process.wait()
out, err = process.communicate()
logger.info(out)
if process.returncode > 0:
logger.warn(err)
return False
except (IOError, OSError) as msg:
logger.fatal("I/O error({0}) - faToTwoBit: {1}\n{2}".format(
msg.errno, msg.strerror, command_line_2bit))
return False
return True
[docs] @staticmethod
def get_chrom_size(genome_2bit, chrom_size, circ_chrom):
"""
Generate the chrom.size file and identify the available chromosomes in
the 2Bit file.
Parameters
----------
genome_2bit : str
Location of the 2bit genome file
chrom_size : str
Location to save the chrom.size file to
circ_chrom : list
List of chromosomes that are known to be circular
Returns
-------
If successful 2 lists:
[0] : List of the linear chromosomes in the 2bit file
[1] : List of circular chromosomes in the 2bit file
Returns (False, False) if there is an IOError
"""
command_line_chrom_size = "twoBitInfo " + genome_2bit + " stdout | sort -k2rn"
try:
logger.info("twoBitInfo ...")
# args = shlex.split(command_line_chrom_size)
with open(chrom_size, "w") as f_out:
sub_proc_1 = subprocess.Popen(
command_line_chrom_size, shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sub_proc_1.wait()
out, err = sub_proc_1.communicate()
f_out.write(out)
logger.info(out)
logger.warn(err)
except (IOError, OSError) as msg:
logger.fatal("I/O error({0} - twoBitInfo): {1}\n{2}".format(
msg.errno, msg.strerror, command_line_chrom_size))
out, err = sub_proc_1.communicate()
logger.info(out)
logger.warn(err)
return (False, False)
chrom_seq_list = []
chrom_circ_list = []
with open(chrom_size, "r") as f_in:
for line in f_in:
line = line.split("\t")
if any(cc in line[0] for cc in circ_chrom):
chrom_circ_list.append(line[0])
else:
chrom_seq_list.append(line[0])
return (chrom_seq_list, chrom_circ_list)
[docs] @task(
returns=int,
genome=FILE_IN, circ_chrom=IN, seed_file_param=IN,
genome_2bit=FILE_OUT, chrom_size=FILE_OUT, seed_file=FILE_OUT, bsgenome=FILE_OUT,
isModifier=False)
def bsgenome_creater( # pylint: disable=no-self-use,too-many-return-statements
self, genome, circ_chrom, seed_file_param,
genome_2bit, chrom_size, seed_file, bsgenome):
"""
Make BSgenome index files.Uses an R script that wraps the required code.
Parameters
----------
genome : str
circo_chrom : str
Comma separated list of chromosome ids that are circular in the genome
seed_file_param : dict
Parameters required for the function to build the seed file
genome_2bit : str
chrom_size : str
seed_file : str
bsgenome : str
Returns
-------
"""
if self.genome_to_2bit(genome, genome_2bit) is False:
return False
chrom_seq_list, chrom_circ_list = self.get_chrom_size(genome_2bit, chrom_size, circ_chrom)
if chrom_seq_list is False:
return False
genome_split = genome_2bit.split("/")
seed_file_param["seqnames"] = 'c("' + '","'.join(chrom_seq_list) + '")'
number_circ_chrs = len(chrom_circ_list)
if number_circ_chrs > 0:
seed_file_param["circ_seqs"] = 'c("' + '","'.join(chrom_circ_list) + '")'
seed_file_param["circ_seqs"] = ""
seed_file_param["seqs_srcdir"] = "/".join(genome_split[0:-1])
seed_file_param["seqfile_name"] = genome_split[-1]
# Create the seed file
seed_order = [
"Package", "Title", "Description", "Author", "Maintainer", "License", "Version",
"organism", "common_name", "provider", "provider_version", "release_date",
"release_name", "organism_biocview", "BSgenomeObjname", "seqnames", "circ_seqs",
"seqs_srcdir", "seqfile_name"
]
with open(seed_file, "wb") as f_out:
for seed_key in seed_order:
logger.info(seed_key + ": " + seed_file_param[seed_key])
f_out.write(seed_key + ": " + seed_file_param[seed_key] + "\n")
# Forge the BSgenomedirectory
rscript = os.path.join(os.path.dirname(__file__), "../scripts/forge_bsgenome.R")
command_line = "Rscript " + rscript + " --file " + seed_file
logger.info("BSGENOME CMD: Rscript scripts/forge_bsgenome.R --file " + seed_file)
with cd(seed_file_param["seqs_srcdir"]):
# args = shlex.split(command_line)
try:
logger.info("command_line")
process = subprocess.Popen(
command_line, shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
process.wait()
out, err = process.communicate()
logger.info(out)
except (IOError, OSError) as msg:
logger.fatal("I/O error({0} - forge_bsgenome.R): {1}\n{2}".format(
msg.errno, msg.strerror, command_line))
out, err = process.communicate()
logger.info(out)
logger.warn(err)
return False
package_build = seed_file_param["seqs_srcdir"] + "/" + seed_file_param["Package"]
command_line_build = "R CMD build " + package_build
command_line_check = "R CMD check " + package_build
try:
logger.info(command_line_build)
# args = shlex.split(command_line_build)
process = subprocess.Popen(
command_line_build, shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
process.wait()
out, err = process.communicate()
logger.info(out)
except (IOError, OSError) as msg:
logger.fatal("I/O error({0} - BUILD): {1}\n{2}".format(
msg.errno, msg.strerror, command_line_build))
out, err = process.communicate()
logger.info(out)
logger.warn(err)
return False
try:
logger.info(command_line_check)
# args = shlex.split(command_line_check)
process = subprocess.Popen(
command_line_check, shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
process.wait()
out, err = process.communicate()
logger.info(out)
# with open(package_build + ".Rcheck/00install.out", "r") as f_in:
# logger.warn(f_in.read())
# with open(package_build + ".Rcheck/00check.log", "r") as f_in:
# logger.warn(f_in.read())
except (IOError, OSError) as msg:
logger.fatal("I/O error({0} - CHECK): {1}\n{2}".format(
msg.errno, msg.strerror, command_line_check))
out, err = process.communicate()
logger.info(out)
logger.warn(err)
# with open(package_build + ".Rcheck/00install.out", "r") as f_in:
# logger.warn(f_in.read())
with open(package_build + ".Rcheck/00check.log", "r") as f_in:
logger.warn(f_in.read())
return False
try:
package_file_name = package_build + "_" + seed_file_param["Version"] + ".tar.gz"
with open(package_file_name, "rb") as f_in:
with open(bsgenome, "wb") as f_out:
f_out.write(f_in.read())
except (IOError, OSError) as msg:
logger.fatal("BSgenome failed to generate the index file")
logger.fatal("I/O error({0}) - Change Package name: {1}\n{2}\n{3}".format(
msg.errno, msg.strerror,
package_build + "_" + seed_file_param["Version"] + ".tar.gz",
bsgenome))
return False
return True
[docs] def run(self, input_files, input_metadata, output_files):
"""
The main function to run iNPS for peak calling over a given BAM file
and matching background BAM file.
Parameters
----------
input_files : list
List of input bam file locations where 0 is the bam data file and 1
is the matching background bam file
metadata : dict
Returns
-------
output_files : list
List of locations for the output files.
output_metadata : list
List of matching metadata dict objects
"""
seed_param = {}
seed_param["Title"] = str(self.configuration["idear_title"])
seed_param["Description"] = str(self.configuration["idear_description"])
seed_param["Author"] = str(self.configuration["idear_provider"])
maintainer = str(self.configuration["idear_provider"]) + " <datasubs@ebi.ac.uk>"
seed_param["Maintainer"] = maintainer
seed_param["License"] = "Apache 2.0"
seed_param["common_name"] = str(self.configuration["idear_common_name"])
seed_param["BSgenomeObjname"] = str(self.configuration["idear_common_name"])
seed_param["assembly"] = input_metadata["genome"].meta_data["assembly"]
seed_param["release_name"] = input_metadata["genome"].meta_data["assembly"]
seed_param["organism"] = str(self.configuration["idear_organism"])
org_split = str(self.configuration["idear_organism"]).split(" ")
seed_param["organism_biocview"] = "_".join(org_split)
seed_param["release_date"] = str(self.configuration["idear_release_date"])
seed_param["provider"] = str(self.configuration["idear_provider"])
seed_param["Package"] = (
"BSgenome." + seed_param["common_name"] + "." + seed_param["assembly"]
)
seed_param["Version"] = "1.4.2"
seed_param["provider_version"] = seed_param["assembly"]
circ_chroms = []
if "idear_circ_chrom" in self.configuration:
circ_chroms = str(self.configuration["idear_circ_chrom"]).split(",")
self.bsgenome_creater(
input_files["genome"],
circ_chroms,
seed_param,
output_files["genome_2bit"],
output_files["chrom_size"],
output_files["seed_file"],
output_files["bsgenome"]
)
# results = compss_wait_on(results)
output_metadata = {
"bsgenome": Metadata(
data_type=input_metadata['genome'].data_type,
file_type="TAR",
file_path=output_files["bsgenome"],
sources=[input_metadata["genome"].file_path],
taxon_id=input_metadata["genome"].taxon_id,
meta_data={
"assembly": input_metadata["genome"].meta_data["assembly"],
"tool": "forge_bsgenome"
}
),
"chrom_size": Metadata(
data_type=input_metadata['genome'].data_type,
file_type="TXT",
file_path=output_files["chrom_size"],
sources=[input_metadata["genome"].file_path],
taxon_id=input_metadata["genome"].taxon_id,
meta_data={
"assembly": input_metadata["genome"].meta_data["assembly"],
"tool": "forge_bsgenome"
}
),
"genome_2bit": Metadata(
data_type=input_metadata['genome'].data_type,
file_type="2BIT",
file_path=output_files["genome_2bit"],
sources=[input_metadata["genome"].file_path],
taxon_id=input_metadata["genome"].taxon_id,
meta_data={
"assembly": input_metadata["genome"].meta_data["assembly"],
"tool": "forge_bsgenome"
}
),
"seed_file": Metadata(
data_type=input_metadata['genome'].data_type,
file_type="TXT",
file_path=output_files["seed_file"],
sources=[input_metadata["genome"].file_path],
taxon_id=input_metadata["genome"].taxon_id,
meta_data={
"assembly": input_metadata["genome"].meta_data["assembly"],
"tool": "forge_bsgenome"
}
)
}
return (output_files, output_metadata)
# ------------------------------------------------------------------------------