Source code for mowl.datasets.builtin.gda

import pathlib

from ..base import RemoteDataset, PathDataset, OWLClasses
import math
import random
import numpy as np
import gzip
import os
from java.util import HashSet

# Archive URLs, one per species/variant. Each constant is passed as ``url`` to
# the parent constructor by the dataset class of the matching name in this
# module (e.g. ``DATA_HUMAN_EL_URL`` by ``GDAHumanELDataset``).
DATA_HUMAN_URL = "https://bio2vec.net/data/mowl/gda_human.tar.gz"
DATA_HUMAN_EL_URL = "https://bio2vec.net/data/mowl/gda_human_el.tar.gz"
DATA_MOUSE_URL = "https://bio2vec.net/data/mowl/gda_mouse.tar.gz"
DATA_MOUSE_EL_URL = "https://bio2vec.net/data/mowl/gda_mouse_el.tar.gz"


class GDADataset(RemoteDataset):
    """Abstract base class for gene-disease association (GDA) datasets.

    A GDA dataset represents the gene-disease associations of one particular
    species and is built from phenotypic annotations of genes and diseases:

    * Gene annotations were taken from the `Mouse/Human Orthology with
      Phenotype Annotations
      <http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt>`_
      document.
    * Disease annotations were taken from the `HPO annotations for rare
      disease <http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa>`_
      document.

    These annotations were added to the *Unified Phenotype Ontology* (uPheno)
    to build the training ontology. Furthermore, gene-disease associations
    were obtained from the `Associations of Mouse Genes with DO Diseases
    <http://www.informatics.jax.org/downloads/reports/MGI_DO.rpt>`_ file.
    Associations for human and mouse were extracted from it to build separate
    datasets, and each set was randomly split 80:10:10; the three parts were,
    respectively, added to the training ontology and used to create the
    validation and testing ontologies.
    """

    def __init__(self, url=None):
        """Forward ``url`` unchanged to the parent class constructor.

        :param url: Value passed on as the ``url`` keyword argument. Defaults
            to ``None``.
        """
        super().__init__(url=url)

    @property
    def evaluation_classes(self):
        """Classes used for evaluation, as a ``(diseases, genes)`` tuple.

        Both elements are :class:`OWLClasses` objects built from the entries
        of ``self.classes.as_dict``:

        * *genes* holds every class whose name, with its first seven
          characters removed, is numeric;
        * *diseases* holds every class whose name contains ``"OMIM_"``.

        The tuple is computed on the first access, stored in
        ``self._evaluation_classes`` and returned as-is on later accesses.
        """
        # Guard clause: reuse the tuple stored by an earlier access.
        if self._evaluation_classes is not None:
            return self._evaluation_classes

        named_classes = self.classes.as_dict

        # NOTE(review): the 7-character offset presumably skips an "http://"
        # prefix so that only a numeric gene identifier remains -- confirm.
        gene_set = {
            owl_class
            for name, owl_class in named_classes.items()
            if name[7:].isnumeric()
        }
        disease_set = {
            owl_class
            for name, owl_class in named_classes.items()
            if "OMIM_" in name
        }

        genes = OWLClasses(gene_set)
        diseases = OWLClasses(disease_set)

        # Diseases come first in the stored tuple, genes second.
        self._evaluation_classes = (diseases, genes)
        return self._evaluation_classes

    def get_evaluation_property(self):
        """Return the evaluation property, ``"http://is_associated_with"``."""
        return "http://is_associated_with"
class GDAHumanDataset(GDADataset):
    """Gene-disease association dataset for human."""

    def __init__(self):
        """Initialize the parent class with ``DATA_HUMAN_URL`` as ``url``."""
        super().__init__(url=DATA_HUMAN_URL)
class GDAHumanELDataset(GDADataset):
    # The docstring is a raw string: in a regular string literal the ``\m`` of
    # ``\mathcal`` is an invalid escape sequence, which newer Python versions
    # warn about at compile time.
    r"""Reduced version of :class:`GDAHumanDataset`.

    The training ontology contains axioms in the :math:`\mathcal{EL}`
    language.
    """

    def __init__(self):
        """Initialize the parent class with ``DATA_HUMAN_EL_URL`` as ``url``."""
        super().__init__(url=DATA_HUMAN_EL_URL)
class GDAMouseDataset(GDADataset):
    """Gene-disease association dataset for mouse."""

    def __init__(self):
        """Initialize the parent class with ``DATA_MOUSE_URL`` as ``url``."""
        super().__init__(url=DATA_MOUSE_URL)
class GDAMouseELDataset(GDADataset):
    # The docstring is a raw string: in a regular string literal the ``\m`` of
    # ``\mathcal`` is an invalid escape sequence, which newer Python versions
    # warn about at compile time.
    r"""Reduced version of :class:`GDAMouseDataset`.

    The training ontology contains axioms in the :math:`\mathcal{EL}`
    language.
    """

    def __init__(self):
        """Initialize the parent class with ``DATA_MOUSE_EL_URL`` as ``url``."""
        super().__init__(url=DATA_MOUSE_EL_URL)