Source code for mg_process_files.tool.json_3d_indexer

"""
.. See the NOTICE file distributed with this work for additional information
   regarding copyright ownership.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""

from __future__ import print_function

import sys
import subprocess
import shlex
import json

from os import listdir
from os.path import isdir

import numpy as np
import h5py

from utils import logger

try:
    if hasattr(sys, '_run_from_cmdl'):
        raise ImportError
    from pycompss.api.parameter import FILE_IN, FILE_OUT
    from pycompss.api.task import task
    from pycompss.api.api import compss_wait_on
except ImportError:
    logger.warn("[Warning] Cannot import \"pycompss\" API packages.")
    logger.warn("          Using mock decorators.")

    from utils.dummy_pycompss import FILE_IN, FILE_OUT  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import task  # pylint: disable=ungrouped-imports
    from utils.dummy_pycompss import compss_wait_on  # pylint: disable=ungrouped-imports
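
# utils.dummy_pycompss is expected to supply no-op stand-ins so the tool can
# run without a COMPSs runtime. A minimal sketch of such stand-ins is shown
# here for orientation (an assumption about that module, not its actual
# contents):
#
#     FILE_IN = None
#     FILE_OUT = None
#
#     def task(*args, **kwargs):
#         def wrapper(function):  # return the decorated function unchanged
#             return function
#         return wrapper
#
#     def compss_wait_on(obj):  # nothing to wait on without the runtime
#         return obj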

from basic_modules.metadata import Metadata
from basic_modules.tool import Tool

# ------------------------------------------------------------------------------
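
# Layout of the HDF5 index written by json2hdf5 below, summarised as a guide
# for the reader (derived from the code, not a formal specification):
#
#     /<resolution>/data
#         int32 dataset, shape (n_bins, 1000, 3), gzip compressed. Axis 0 is
#         the genomic bin, axis 1 the model number and axis 2 the x, y, z
#         coordinates. Dataset attributes carry the TADbit object metadata
#         (title, species, assembly, restraints, ...).
#     /<resolution>/meta/model_params/<uuid>
#         [ref, cluster_id] pairs, one per model; attrs 'i' and 'j' give the
#         row range of the region within 'data', and 'chromosome', 'start'
#         and 'end' locate the region on the genome.
#     /<resolution>/meta/clusters/<uuid>/<cluster_id>
#         The model refs belonging to each cluster.
#     /<resolution>/meta/centroids/<uuid>
#         The centroid models for the region.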


class json3dIndexerTool(Tool):
    """
    Tool for running indexers over 3D JSON files for use in the RESTful API
    """

    def __init__(self, configuration=None):
        """
        Init function
        """
        print("3D JSON Model Indexer")
        Tool.__init__(self)

        if configuration is None:
            configuration = {}

        self.configuration.update(configuration)
    def unzipJSON(self, file_targz):  # pylint: disable=no-self-use
        """
        Extracts the gzipped tar archive containing all the models for
        regions of the genome based on the information within the adjacency
        matrices generated by TADbit.

        Parameters
        ----------
        file_targz : str
            Location of the archived JSON files

        Returns
        -------
        json_file_locations : list
            List of the locations of the files within the extracted archive

        Example
        -------
        .. code-block:: python
           :linenos:

           gz_file = '/home/<user>/test.tar.gz'
           json_files = self.unzipJSON(gz_file)
        """
        targz_file_dir = file_targz.split("/")
        root_dir = '/'.join(targz_file_dir[0:len(targz_file_dir) - 1])

        # Extract the archive into the directory that contains it
        command_line = 'tar -xzf ' + file_targz + ' -C ' + root_dir
        args = shlex.split(command_line)
        process = subprocess.Popen(args)
        process.wait()

        # Collect the JSON files from the extracted sub-directories
        onlyfiles = []
        for i in listdir(root_dir):
            if isdir('/'.join([root_dir, i])):
                for j in listdir('/'.join([root_dir, i])):
                    onlyfiles.append('/'.join([root_dir, i, j]))

        return onlyfiles
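
    # The extraction above shells out to 'tar'. For reference, an equivalent
    # sketch using only the standard library would be (an alternative, not
    # what unzipJSON does):
    #
    #     import tarfile
    #
    #     root_dir = '/'.join(file_targz.split("/")[:-1])
    #     with tarfile.open(file_targz, "r:gz") as archive:
    #         archive.extractall(path=root_dir)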
    @task(returns=bool, json_file_gz=FILE_IN, hdf5_file=FILE_OUT)
    def json2hdf5(self, json_file_gz, hdf5_file):  # pylint: disable=too-many-locals,too-many-statements
        """
        Genome Model Indexing

        Load the JSON files generated by TADbit into a specified HDF5 file.
        The file includes the x, y and z coordinates of all the models for
        each region along with the matching stats, clusters, TADs and
        adjacency values used during the modelling.

        Parameters
        ----------
        json_file_gz : str
            Location of the archive of JSON 3D model files generated by
            TADbit for a given dataset
        hdf5_file : str
            Location of the HDF5 index file for this dataset

        Returns
        -------
        bool
            True once all JSON model files have been indexed

        Example
        -------
        .. code-block:: python
           :linenos:

           if not self.json2hdf5(json_file_gz, hdf5_file):
               output_metadata.set_exception(
                   Exception(
                       "json2hdf5: Could not process files {}, {}.".format(*input_files)))
        """
        json_files = self.unzipJSON(json_file_gz)

        for json_file in json_files:
            with open(json_file) as json_fh:
                models = json.load(json_fh)

            metadata = models['metadata']
            objectdata = models['object']
            clusters = models['clusters']

            resolution = objectdata['resolution']
            uuid = objectdata['uuid']

            # Edit the HDF5 file
            hdf5_in = h5py.File(hdf5_file, "a")

            if str(resolution) in hdf5_in:
                grp = hdf5_in[str(resolution)]
                dset = grp['data']

                meta = grp['meta']
                mpgrp = meta['model_params']
                clustersgrp = meta['clusters']
                centroidsgrp = meta['centroids']
            else:
                # Create the initial dataset with minimum values
                grp = hdf5_in.create_group(str(resolution))
                meta = grp.create_group('meta')

                mpgrp = meta.create_group('model_params')
                clustersgrp = meta.create_group('clusters')
                centroidsgrp = meta.create_group('centroids')

                dset = grp.create_dataset(
                    'data',
                    (1, 1000, 3),
                    maxshape=(None, 1000, 3),
                    dtype='int32',
                    chunks=True,
                    compression="gzip")

                dset.attrs['title'] = objectdata['title']
                dset.attrs['experimentType'] = objectdata['experimentType']
                dset.attrs['species'] = objectdata['species']
                dset.attrs['project'] = objectdata['project']
                dset.attrs['identifier'] = objectdata['identifier']
                dset.attrs['assembly'] = objectdata['assembly']
                dset.attrs['cellType'] = objectdata['cellType']
                dset.attrs['resolution'] = objectdata['resolution']
                dset.attrs['datatype'] = objectdata['datatype']
                dset.attrs['components'] = objectdata['components']
                dset.attrs['source'] = objectdata['source']
                dset.attrs['TADbit_meta'] = json.dumps(metadata)
                dset.attrs['dependencies'] = json.dumps(objectdata['dependencies'])
                dset.attrs['restraints'] = json.dumps(models['restraints'])
                if 'hic_data' in models:
                    dset.attrs['hic_data'] = json.dumps(models['hic_data'])

            # Save the cluster membership for this region
            clustergrps = clustersgrp.create_group(str(uuid))
            cluster_size = len(clusters)
            for cluster_id in range(cluster_size):
                clustergrps.create_dataset(
                    str(cluster_id),
                    data=clusters[cluster_id],
                    chunks=True,
                    compression="gzip")

            centroidsgrp.create_dataset(
                str(uuid),
                data=models['centroids'],
                chunks=True,
                compression="gzip")

            # Number of genomic bins covered by each model in this region
            region_size = int(len(models['models'][0]['data']) / 3)

            current_size = len(dset)
            if current_size == 1:
                current_size = 0

            dset.resize((current_size + region_size, 1000, 3))

            dnp = np.zeros([region_size, 1000, 3], dtype='int32')
            model_param = []

            model_id = 0
            for model in models['models']:
                ref = model['ref']
                model_data = model['data']

                # Cluster that this model belongs to; models outside any
                # cluster get an id one past the last cluster
                cid = [ind for ind in range(len(clusters)) if ref in clusters[ind]]
                cid_size = len(cid)
                if cid_size == 0:
                    cluster_id = len(clusters)
                else:
                    cluster_id = cid[0]

                model_param.append([int(ref), int(cluster_id)])

                # Unpack the flat coordinate list into (bin, model, xyz)
                j = 0
                for i in range(0, len(model_data), 3):
                    xyz = model_data[i:i + 3]
                    dnp[j][model_id] = xyz
                    j += 1

                model_id += 1

            model_param_ds = mpgrp.create_dataset(
                str(uuid),
                data=model_param,
                chunks=True,
                compression="gzip")
            model_param_ds.attrs['i'] = current_size
            model_param_ds.attrs['j'] = current_size + region_size
            model_param_ds.attrs['chromosome'] = objectdata['chrom'][0]
            model_param_ds.attrs['start'] = int(objectdata['chromStart'][0])
            model_param_ds.attrs['end'] = int(objectdata['chromEnd'][0])

            dset[current_size:current_size + region_size, 0:1000, 0:3] += dnp

            hdf5_in.close()

        return True
    def run(self, input_files, input_metadata, output_files):
        """
        Function to index models of the genome structure generated by TADbit
        on a per dataset basis so that they can be easily distributed as part
        of the RESTful API.

        Parameters
        ----------
        input_files : dict
            models : str
                Location of the archived JSON model files
        input_metadata : dict
            models : Metadata
                Metadata of the archived model files, including the genome
                assembly accession
        output_files : dict
            index : str
                Location of the HDF5 index file

        Returns
        -------
        output_files : dict
            index : str
                Location of the HDF5 index file
        output_metadata : dict
            Matching metadata for the HDF5 index file

        Example
        -------
        .. code-block:: python
           :linenos:

           import tool

           # 3D JSON Model Indexer
           j3d = tool.json3dIndexerTool(self.configuration)
           j3di = j3d.run(
               {"models": gz_file}, {"models": gz_meta}, {"index": hdf5_file})
        """
        targz_file = input_files["models"]
        h5_file = output_files["index"]

        output_metadata = {}

        # handle error
        results = self.json2hdf5(targz_file, h5_file)
        results = compss_wait_on(results)

        output_metadata = {
            "index": Metadata(
                data_type=input_metadata["models"].data_type,
                file_type=input_metadata["models"].file_type,
                file_path=input_metadata["models"].file_path,
                sources=[],
                taxon_id=input_metadata["models"].taxon_id,
                meta_data={
                    "tool": "json_3d_indexer",
                    "assembly": input_metadata["models"].meta_data["assembly"]
                }
            )
        }

        return (output_files, output_metadata)
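
# ------------------------------------------------------------------------------

# Reading a model back out of the index (a sketch only; the file name,
# resolution and uuid values are illustrative and not defined by this module):
#
#     import h5py
#
#     hdf5_in = h5py.File('<dataset>.hdf5', 'r')
#     grp = hdf5_in['1000']                                # resolution group
#     model_params = grp['meta']['model_params']['<uuid>']
#     row_i = model_params.attrs['i']                      # first bin of the region
#     row_j = model_params.attrs['j']                      # last bin of the region
#     model_xyz = grp['data'][row_i:row_j, 0, :]           # x, y, z of model 0
#     hdf5_in.close()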