"""
.. See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import print_function
import os
import shlex
import subprocess
import sys
from utils import logger
try:
if hasattr(sys, '_run_from_cmdl') is True:
raise ImportError
from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
from pycompss.api.task import task
from pycompss.api.constraint import constraint
from pycompss.api.api import compss_wait_on, compss_open, compss_delete_file
except ImportError:
logger.warn("[Warning] Cannot import \"pycompss\" API packages.")
logger.warn(" Using mock decorators.")
from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN # pylint: disable=ungrouped-imports
from utils.dummy_pycompss import task, constraint # pylint: disable=ungrouped-imports
from utils.dummy_pycompss import compss_wait_on, compss_open, compss_delete_file # pylint: disable=ungrouped-imports
from basic_modules.metadata import Metadata
from basic_modules.tool import Tool
from tool.bam_utils import bamUtilsTask
from tool.common import common
# ------------------------------------------------------------------------------
[docs]class macs2(Tool): # pylint: disable=invalid-name
"""
Tool for peak calling for ChIP-seq data
"""
def __init__(self, configuration=None):
"""
Init function
"""
logger.info("MACS2 Peak Caller")
Tool.__init__(self)
if configuration is None:
configuration = {}
self.configuration.update(configuration)
@staticmethod
def _macs2_runner( # pylint: disable=too-many-locals,too-many-statements,too-many-statements,too-many-arguments
name, bam_file, bai_file, macs_params,
narrowpeak, summits_bed, broadpeak, gappedpeak,
bdg_control_lambda=None, bdg_treat_pileup=None, chromosome=None,
bam_file_bgd=None, bai_file_bgd=None, bedpe=False):
"""
Function to run MACS2 for peak calling on aligned sequence files and
normalised against a provided background set of alignments.
Parameters
----------
name : str
Name to be used to identify the files
bam_file : str
Location of the aligned FASTQ files as a bam file
bai_file : str
Location of the bam index file
narrowpeak : str
Location of the output narrowpeak file
summits_bed : str
Location of the output summits bed file
broadpeak : str
Location of the output broadpeak file
gappedpeak : str
Location of the output gappedpeak file
chromosome : str
If the tool is to be run over a single chromosome the matching
chromosome name should be specified. If None then the whole bam file
is analysed
bam_file_bgd : str
Location of the aligned FASTQ files as a bam file representing
background values for the cell
bai_file_bgd : str
Location of the background bam index file
Returns
-------
narrowPeak : file
BED6+4 file - ideal for transcription factor binding site
identification
summitPeak : file
BED4+1 file - Contains the peak summit locations for everypeak
broadPeak : file
BED6+3 file - ideal for histone binding site identification
gappedPeak : file
BED12+3 file - Contains a merged set of the broad and narrow peak
files
Definitions defined for each of these files have come from the MACS2
documentation described in the docs at https://github.com/taoliu/MACS
"""
od_list = bam_file.split("/")
output_dir = "/".join(od_list[0:-1])
from tool.bam_utils import bamUtils
bam_tmp_file = bam_file.replace(".bam", "." + str(chromosome) + ".bam")
bam_utils_handle = bamUtils()
bam_utils_handle.bam_split(bam_file, bai_file, chromosome, bam_tmp_file)
common_handle = common()
# Test to see if the bam file contains paired end reads
if bam_utils_handle.bam_paired_reads(bam_file):
if bedpe:
bed_tmp_file = bam_file.replace(".bam", "." + str(chromosome) + ".bed")
bam_utils_handle.bam_to_bed(bam_tmp_file, bed_tmp_file)
macs_params.append("--format BEDPE")
else:
macs_params.append("--format BAMPE")
command_param = [
'macs2 callpeak',
" ".join(macs_params),
'-t', bam_tmp_file,
'-n', name
]
if bam_file_bgd is not None:
bam_bgd_tmp_file = bam_file_bgd.replace(".bam", "." + str(chromosome) + ".bam")
bam_utils_handle.bam_split(bam_file_bgd, bai_file_bgd, chromosome, bam_bgd_tmp_file)
bgd_command = '-c ' + bam_bgd_tmp_file
command_param.append(bgd_command)
command_param.append('--outdir ' + output_dir)
command_line = ' '.join(command_param)
aligned_count = bam_utils_handle.bam_count_reads(bam_tmp_file, aligned=True)
if int(aligned_count) > 0:
try:
args = shlex.split(command_line)
process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
proc_out, proc_err = process.communicate()
except (IOError, OSError) as msg:
logger.fatal("I/O error({0}): {1}\n{2}".format(
msg.errno, msg.strerror, command_line))
return False
if process.returncode is not 0:
logger.fatal("MACS2 ERROR: " + str(process.returncode))
logger.fatal(
"MACS2 ERROR: BAM counts: " + str(
bam_utils_handle.bam_count_reads(bam_tmp_file, aligned=True)
)
)
logger.fatal("\n\nMACS2 ERROR - out:\n\t" + str(proc_out))
logger.fatal("\n\nMACS2 ERROR - err:\n\t" + str(proc_err))
logger.info('Process Results 1:', process)
logger.info('LIST DIR 1:', os.listdir(output_dir))
output_tmp = output_dir + '/{}_{}'
common_handle.to_output_file(output_tmp.format(name, 'peaks.narrowPeak'), narrowpeak)
common_handle.to_output_file(output_tmp.format(name, 'peaks.broadPeak'), broadpeak)
common_handle.to_output_file(output_tmp.format(name, 'peaks.gappedPeak'), gappedpeak)
common_handle.to_output_file(output_tmp.format(name, 'summits.bed'), summits_bed)
common_handle.to_output_file(
output_tmp.format(name, 'control_lambda.bdg'), bdg_control_lambda)
common_handle.to_output_file(output_tmp.format(name, 'treat_pileup.bdg'), bdg_treat_pileup)
return True
[docs] @constraint(ComputingUnits="1")
@task(
returns=bool,
name=IN,
bam_file=FILE_IN,
bai_file=FILE_IN,
bam_file_bgd=FILE_IN,
bai_file_bgd=FILE_IN,
macs_params=IN,
narrowpeak=FILE_OUT,
summits_bed=FILE_OUT,
broadpeak=FILE_OUT,
gappedpeak=FILE_OUT,
rscript=FILE_OUT,
bdg_control_lambda=FILE_OUT,
bdg_treat_pileup=FILE_OUT,
chromosome=IN,
bedpe=IN,
isModifier=False)
def macs2_peak_calling( # pylint: disable=no-self-use,too-many-arguments
self, name, bam_file, bai_file, bam_file_bgd, bai_file_bgd, macs_params,
narrowpeak, summits_bed, broadpeak, gappedpeak,
bdg_control_lambda, bdg_treat_pileup, chromosome, bedpe): # pylint: disable=unused-argument
"""
Function to run MACS2 for peak calling on aligned sequence files and
normalised against a provided background set of alignments.
Parameters
----------
name : str
Name to be used to identify the files
bam_file : str
Location of the aligned FASTQ files as a bam file
bai_file : str
Location of the bam index file
bam_file_bgd : str
Location of the aligned FASTQ files as a bam file representing
background values for the cell
bai_file_bgd : str
Location of the background bam index file
narrowpeak : str
Location of the output narrowpeak file
summits_bed : str
Location of the output summits bed file
broadpeak : str
Location of the output broadpeak file
gappedpeak : str
Location of the output gappedpeak file
chromosome : str
If the tool is to be run over a single chromosome the matching
chromosome name should be specified. If None then the whole bam file
is analysed
Returns
-------
narrowPeak : file
BED6+4 file - ideal for transcription factor binding site
identification
summitPeak : file
BED4+1 file - Contains the peak summit locations for everypeak
broadPeak : file
BED6+3 file - ideal for histone binding site identification
gappedPeak : file
BED12+3 file - Contains a merged set of the broad and narrow peak
files
Definitions defined for each of these files have come from the MACS2
documentation described in the docs at https://github.com/taoliu/MACS
"""
self._macs2_runner(
name, bam_file, bai_file, macs_params,
narrowpeak, summits_bed, broadpeak, gappedpeak,
bdg_control_lambda=bdg_control_lambda, bdg_treat_pileup=bdg_treat_pileup,
chromosome=chromosome, bam_file_bgd=bam_file_bgd,
bai_file_bgd=bai_file_bgd, bedpe=bedpe)
return True
[docs] @constraint(ComputingUnits="1")
@task(
returns=bool,
name=IN,
bam_file=FILE_IN,
bai_file=FILE_IN,
macs_params=IN,
narrowpeak=FILE_OUT,
summits_bed=FILE_OUT,
broadpeak=FILE_OUT,
gappedpeak=FILE_OUT,
bdg_control_lambda=FILE_OUT,
bdg_treat_pileup=FILE_OUT,
chromosome=IN,
bedpe=IN,
isModifier=False)
def macs2_peak_calling_nobgd( # pylint: disable=too-many-arguments,no-self-use,too-many-branches
self, name, bam_file, bai_file, macs_params,
narrowpeak, summits_bed, broadpeak, gappedpeak,
bdg_control_lambda, bdg_treat_pileup, chromosome, bedpe): # pylint: disable=unused-argument
"""
Function to run MACS2 for peak calling on aligned sequence files without
a background dataset for normalisation.
Parameters
----------
name : str
Name to be used to identify the files
bam_file : str
Location of the aligned FASTQ files as a bam file
bai_file : str
Location of the bam index file
narrowpeak : str
Location of the output narrowpeak file
summits_bed : str
Location of the output summits bed file
broadpeak : str
Location of the output broadpeak file
gappedpeak : str
Location of the output gappedpeak file
chromosome : str
If the tool is to be run over a single chromosome the matching
chromosome name should be specified. If None then the whole bam file
is analysed
Returns
-------
narrowPeak : file
BED6+4 file - ideal for transcription factor binding site
identification
summitPeak : file
BED4+1 file - Contains the peak summit locations for everypeak
broadPeak : file
BED6+3 file - ideal for histone binding site identification
gappedPeak : file
BED12+3 file - Contains a merged set of the broad and narrow peak
files
Definitions defined for each of these files have come from the MACS2
documentation described in the docs at https://github.com/taoliu/MACS
"""
self._macs2_runner(
name, bam_file, bai_file, macs_params,
narrowpeak, summits_bed, broadpeak, gappedpeak,
bdg_control_lambda=bdg_control_lambda, bdg_treat_pileup=bdg_treat_pileup,
chromosome=chromosome, bedpe=bedpe)
return True
[docs] @staticmethod
def get_macs2_params(params):
"""
Function to handle to extraction of commandline parameters and formatting
them for use in the aligner for BWA ALN
Parameters
----------
params : dict
Returns
-------
list
"""
command_params = []
command_parameters = {
"macs_format_param": ["--format", True],
"macs_gsize_param": ["--gsize", True],
"macs_tsize_param": ["--tsize", True],
"macs_bw_param": ["--bw", True],
"macs_qvalue_param": ["--qvalue", True],
"macs_pvalue_param": ["--pvalue", True],
"macs_mfold_param": ["--mfold", True],
"macs_nolambda_param": ["--nolambda", False],
"macs_slocal_param": ["--slocal", True],
"macs_llocal_param": ["--llocal", True],
"macs_fix-bimodal_param": ["--fix-bimodal", False],
"macs_nomodel_param": ["--nomodel", False],
"macs_extsize_param": ["--extsize", True],
"macs_shift_param": ["--shift", True],
"macs_keep-dup_param": ["--keep-dup", True],
"macs_broad_param": ["--broad", False],
"macs_broad-cutoff_param": ["--broad-cutoff", True],
"macs_to-large_param": ["--to-large", False],
"macs_down-sample_param": ["--down-sample", False],
"macs_bdg_param": ["--bdg", False],
"macs_call-summits_param": ["--call-summits", True],
}
for param in params:
if param in command_parameters:
if command_parameters[param][1] and params[param] != "":
command_params = command_params + [
command_parameters[param][0], params[param]
]
else:
if command_parameters[param][0] and params[param] is not False:
command_params.append(command_parameters[param][0])
return command_params
[docs] def run(self, input_files, input_metadata, output_files): # pylint: disable=too-many-locals,too-many-branches,too-many-statements
"""
The main function to run MACS 2 for peak calling over a given BAM file
and matching background BAM file.
Parameters
----------
input_files : dict
List of input bam file locations where 0 is the bam data file and 1
is the matching background bam file
metadata : dict
Returns
-------
output_files : dict
List of locations for the output files.
output_metadata : dict
List of matching metadata dict objects
"""
root_name = os.path.split(input_files['bam'])
name = root_name[1].replace('.bam', '')
# input and output share most metadata
output_bed_types = {
'narrow_peak': "bed4+1",
'summits': "bed6+4",
'broad_peak': "bed6+3",
'gapped_peak': "bed12+3"
}
command_params = self.get_macs2_params(self.configuration)
bam_utils_handle = bamUtilsTask()
bam_utils_handle.bam_index(
input_files['bam'],
input_files['bam'] + '.bai'
)
if 'bam_bg' in input_files:
bam_utils_handle.bam_index(
input_files['bam_bg'],
input_files['bam_bg'] + '.bai'
)
chr_list = bam_utils_handle.bam_list_chromosomes(input_files['bam'])
chr_list = compss_wait_on(chr_list)
logger.info("MACS2 COMMAND PARAMS: " + ", ".join(command_params))
bedpe_param = False
if 'macs2_bedpe' in self.configuration:
bedpe_param = True
for chromosome in chr_list:
if 'bam_bg' in input_files:
result = self.macs2_peak_calling(
name,
str(input_files['bam']), str(input_files['bam']) + '.bai',
str(input_files['bam_bg']), str(input_files['bam_bg']) + '.bai',
command_params,
str(output_files['narrow_peak']) + "." + str(chromosome),
str(output_files['summits']) + "." + str(chromosome),
str(output_files['broad_peak']) + "." + str(chromosome),
str(output_files['gapped_peak']) + "." + str(chromosome),
os.path.join(root_name[0], name + "_control_lambda.bdg"),
os.path.join(root_name[0], name + "_treat_pvalue.bdg"),
chromosome,
bedpe_param)
else:
result = self.macs2_peak_calling_nobgd(
name,
str(input_files['bam']), str(input_files['bam']) + '.bai',
command_params,
str(output_files['narrow_peak']) + "." + str(chromosome),
str(output_files['summits']) + "." + str(chromosome),
str(output_files['broad_peak']) + "." + str(chromosome),
str(output_files['gapped_peak']) + "." + str(chromosome),
os.path.join(root_name[0], name + "_control_lambda.bdg"),
os.path.join(root_name[0], name + "_treat_pvalue.bdg"),
chromosome,
bedpe_param)
if result is False:
logger.fatal("MACS2: Something went wrong with the peak calling")
# Merge the results files into single files.
with open(output_files['narrow_peak'], 'wb') as file_np_handle:
with open(output_files['summits'], 'wb') as file_s_handle:
with open(output_files['broad_peak'], 'wb') as file_bp_handle:
with open(output_files['gapped_peak'], 'wb') as file_gp_handle:
for chromosome in chr_list:
np_file_chr = "{}.{}".format(output_files['narrow_peak'], chromosome)
s_file_chr = "{}.{}".format(output_files['summits'], chromosome)
bp_file_chr = "{}.{}".format(output_files['broad_peak'], chromosome)
gp_file_chr = "{}.{}".format(output_files['gapped_peak'], chromosome)
if hasattr(sys, '_run_from_cmdl') is True:
with open(np_file_chr, 'rb') as file_in_handle:
file_np_handle.write(file_in_handle.read())
with open(s_file_chr, 'rb') as file_in_handle:
file_s_handle.write(file_in_handle.read())
with open(bp_file_chr, 'rb') as file_in_handle:
file_bp_handle.write(file_in_handle.read())
with open(gp_file_chr, 'rb') as file_in_handle:
file_gp_handle.write(file_in_handle.read())
else:
with compss_open(np_file_chr, 'rb') as file_in_handle:
file_np_handle.write(file_in_handle.read())
with compss_open(s_file_chr, 'rb') as file_in_handle:
file_s_handle.write(file_in_handle.read())
with compss_open(bp_file_chr, 'rb') as file_in_handle:
file_bp_handle.write(file_in_handle.read())
with compss_open(gp_file_chr, 'rb') as file_in_handle:
file_gp_handle.write(file_in_handle.read())
compss_delete_file(np_file_chr)
compss_delete_file(s_file_chr)
compss_delete_file(bp_file_chr)
compss_delete_file(gp_file_chr)
output_files_created = {}
output_metadata = {}
for result_file in output_files:
if (
os.path.isfile(output_files[result_file]) is True
and os.path.getsize(output_files[result_file]) > 0
):
output_files_created[result_file] = output_files[result_file]
sources = [input_metadata["bam"].file_path]
if 'bam_bg' in input_files:
sources.append(input_metadata["bam_bg"].file_path)
output_metadata[result_file] = Metadata(
data_type="data_chip_seq",
file_type="BED",
file_path=output_files[result_file],
sources=sources,
taxon_id=input_metadata["bam"].taxon_id,
meta_data={
"assembly": input_metadata["bam"].meta_data["assembly"],
"tool": "macs2",
"bed_type": output_bed_types[result_file],
"parameters": command_params
}
)
else:
try:
os.remove(output_files[result_file])
except (OSError, IOError) as error:
logger.warn("MACS2: I/O error({0}): {1}\nMissing file: {2}".format(
error.errno, error.strerror, output_files[result_file]))
bdg_files = {
"control_lambda": os.path.join(root_name[0], name + "_control_lambda.bdg"),
"treat_pvalue": os.path.join(root_name[0], name + "_treat_pvalue.bdg")
}
for result_file in bdg_files:
if (
os.path.isfile(bdg_files[result_file]) is True
and os.path.getsize(bdg_files[result_file]) > 0
):
output_files_created[result_file] = bdg_files[result_file]
sources = [input_metadata["bam"].file_path]
if 'bam_bg' in input_files:
sources.append(input_metadata["bam_bg"].file_path)
output_metadata[result_file] = Metadata(
data_type="data_chip_seq",
file_type="BEDGRAPH",
file_path=bdg_files[result_file],
sources=sources,
taxon_id=input_metadata["bam"].taxon_id,
meta_data={
"assembly": input_metadata["bam"].meta_data["assembly"],
"tool": "macs2",
"parameters": command_params
}
)
else:
try:
os.remove(bdg_files[result_file])
except (OSError, IOError) as error:
logger.warn("MACS2: I/O error({0}): {1}\nMissing file: {2}".format(
error.errno, error.strerror, bdg_files[result_file]))
logger.info('MACS2: GENERATED FILES:', output_files)
return (output_files_created, output_metadata)
# ------------------------------------------------------------------------------