Source code for tmtk.highdim.HighDimBase

import pandas as pd
import os

from .SampleMapping import SampleMapping

from ..utils import FileBase, ValidateMixin, PathError, ClassError, TransmartBatch, summarise
from ..annotation import ChromosomalRegions


class HighDimBase(FileBase, ValidateMixin):
    """
    Base class for high dimensional data structures.
    """

    def __init__(self, params=None, path=None, parent=None):
        """
        Locate the data file and attach sample mapping and annotation information.

        :param params: parameter object for this data set. It is used when it is
            truthy and ``params.is_viable()`` returns a true value; the data file
            is then ``params.DATA_FILE`` inside ``params.dirname``.
        :param path: path to the data file, used as a fallback when ``params`` is
            missing or not viable. The path has to exist.
        :param parent: owning object. It is stored as ``self._parent`` only when
            ``params`` has a ``MAP_FILENAME`` attribute; if it also has an
            ``Annotations`` attribute, ``parent.find_annotation(self.platform)``
            is stored as ``self.annotation_file``.
        :raises PathError: if neither viable params nor an existing path is given.
        """
        if params and params.is_viable():
            self.params = params
            self.path = os.path.join(params.dirname, params.DATA_FILE)
        elif path and os.path.exists(path):
            # Test the ``path`` argument itself: this method has not assigned
            # ``self.path`` yet when this branch is evaluated.
            self.path = path
        else:
            raise PathError

        super().__init__()

        if hasattr(params, 'MAP_FILENAME'):
            self.sample_mapping = SampleMapping(os.path.join(params.dirname, params.MAP_FILENAME))
            self.platform = self.sample_mapping.platform

            self._parent = parent
            if hasattr(self._parent, 'Annotations'):
                self.annotation_file = parent.find_annotation(self.platform)

    def __str__(self):
        return 'HighDim: {} ({})'.format(self.params.datatype, self.params.dirname)

    def __repr__(self):
        return 'HighDim: {} ({})'.format(self.params.datatype, self.params.dirname)

    def _check_header_extensions(self):

        illegal_header_items = []

        for h in self.header[1:]:
            try:
                count_type = h.rsplit('.', 1)[1]
            except IndexError:
                self.msgs.error('Expected header with dot, but got {}.'.format(h))
                continue

            # Add count_type to illegal items if not allowed.
            if count_type not in self.allowed_header:
                illegal_header_items.append(count_type)

        # Create list of illegal header items.
        if illegal_header_items:
            self.msgs.error('Found illegal header items.', warning_list=illegal_header_items)
        else:
            self.msgs.okay('Header extensions are okay!')

    def _remap_to_chromosomal_regions(self, destination=None):
        """
        Remap this data file from its own annotation platform to another one.

        :param destination: target platform, either a ``ChromosomalRegions``
            object (its ``df`` attribute is used) or a ``pd.DataFrame``.
        :return: whatever ``remap_chromosomal_regions`` returns for this data
            file, the origin annotation dataframe and the destination dataframe.
        :raises Exception: if no annotation file is available for this data set.
        :raises ClassError: if ``destination`` is of neither accepted type.
        """
        # NOTE(review): imported at call time, presumably to avoid a circular
        # import between packages -- confirm before moving it to module level.
        from ..toolbox import remap_chromosomal_regions

        # The attribute may never have been assigned, so look it up defensively
        # instead of failing with an unrelated AttributeError.
        origin_annotation = getattr(self, 'annotation_file', None)
        if not origin_annotation:
            raise Exception('Cannot remap to chromosomal regions: '
                            'no annotation file is available for this data set.')

        if isinstance(destination, ChromosomalRegions):
            destination = destination.df
        elif not isinstance(destination, pd.DataFrame):
            raise ClassError(found=type(destination), expected='pd.DataFrame, or ChromosomalRegions')

        remapped = remap_chromosomal_regions(datafile=self.df,
                                             origin_platform=origin_annotation.df,
                                             destination_platform=destination)
        return remapped

    @property
    def load_to(self):
        """
        Loading namespace of a ``TransmartBatch`` created for this data set.

        The batch is built from ``self.params.path`` and is given the result of
        ``self._get_lazy_batch_items()`` as its expected items.
        """
        params_path = self.params.path
        batch = TransmartBatch(params_path, items_expected=self._get_lazy_batch_items())
        return batch.get_loading_namespace()

    def _get_lazy_batch_items(self):
        return {self.params.path: (len(self.sample_mapping.samples), self.path)}

    def _validate_missing_annotation(self):
        missing_annotations = list(self.df.iloc[:, 0][~self.df.iloc[:, 0].isin(self.annotation_file.biomarkers)])

        if missing_annotations:
            self.msgs.warning('Missing annotations found.', warning_list=missing_annotations)
        else:
            self.msgs.okay('All data items have associated annotations.')

    def _validate_missing_data_items(self):
        missing_data = list(self.annotation_file.biomarkers[~self.annotation_file.biomarkers.isin(self.df.iloc[:, 0])])

        if not missing_data:
            self.msgs.okay('The entire annotation platform seems to have associated data.')
            return

        msg = 'Data file has less data than annotations.'

        if self.params.get('ALLOW_MISSING_ANNOTATIONS', 'N') == 'Y':
            self.msgs.warning(msg, warning_list=missing_data)
        else:
            self.msgs.error(msg, warning_list=missing_data)

    def _validate_annotation_file(self):

        if hasattr(self, 'annotation_file'):
            if not self.annotation_file:
                self.msgs.error('No annotation file found for {}.'.format(self.platform))
            else:
                self.msgs.okay('Annotation file found for {}!'.format(self.platform))

    def _validate_sample_mapping(self):
        """
        Compare ``self.samples`` with the samples listed in the sample mapping.

        * Mapping samples missing from ``self.samples`` are logged as an error.
        * Samples missing from the mapping are logged as a warning when the
          ``SKIP_UNMAPPED_DATA`` parameter equals ``'Y'`` and as an error
          otherwise (the default is ``'N'``).
        * Samples present on both sides are logged as info.
        """
        in_data = pd.Series(self.samples)
        in_mapping = pd.Series(self.sample_mapping.samples)

        data_is_mapped = in_data.isin(in_mapping)
        mapping_has_data = in_mapping.isin(in_data)

        missing_from_data = set(in_mapping[~mapping_has_data])
        unmapped = set(in_data[~data_is_mapped])
        shared = set(in_data[data_is_mapped])

        if missing_from_data:
            self.msgs.error('Samples not in datafile: {}!'.format(summarise(missing_from_data)),
                            warning_list=missing_from_data)

        if unmapped:
            skipping_allowed = self.params.get('SKIP_UNMAPPED_DATA', 'N') == 'Y'
            # Same message either way; only the severity depends on the parameter.
            report = self.msgs.warning if skipping_allowed else self.msgs.error
            report('Samples not in mapping file: {}.'.format(summarise(unmapped)),
                   warning_list=unmapped)

        if shared:
            self.msgs.info('Intersection of samples: {}.'.format(summarise(shared)),
                           warning_list=shared)

    def _validate_sample_mapping_study_id(self):
        if self.sample_mapping.study_id != self._parent.study_id:
            m = 'Study_id in ({}) does not match ({}) in study.params'.\
                format(self.sample_mapping.study_id, self._parent.study_id)
            self.msgs.error(m)
        else:
            self.msgs.okay('STUDY_ID as expected from study.params.')