release commit
biocypher/__init__.py  (new file, 41 lines)
@@ -0,0 +1,41 @@
#!/usr/bin/env python

#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
#                 ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher: a unifying framework for biomedical knowledge graphs.
"""

__all__ = [
    "__version__",
    "__author__",
    "module_data",
    "config",
    "logfile",
    "log",
    "Driver",
    "BioCypher",
    "Resource",
]

from ._get import Resource
from ._core import BioCypher
from ._config import config, module_data
from ._logger import log, logger, logfile
from ._metadata import __author__, __version__


class Driver(BioCypher):
    # initialise parent class but log a warning
    def __init__(self, *args, **kwargs):
        logger.warning(
            "The class `Driver` is deprecated and will be removed in a future "
            "release. Please use `BioCypher` instead."
        )
        super().__init__(*args, **kwargs)
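For orientation, a minimal offline usage sketch of the public API exported above; the example node tuple and property names are hypothetical and not part of this commit:

from biocypher import BioCypher

bc = BioCypher()  # dbms, offline mode etc. are read from biocypher_config.yaml

# hypothetical 3-tuple nodes: (id, input label, properties)
nodes = [("P12345", "protein", {"name": "example protein", "taxon": 9606})]
bc.write_nodes(nodes)    # translates the tuples and writes batch import files
bc.write_import_call()   # shell script for the configured DBMS (offline mode)
bc.summary()             # ontology tree, duplicates, missing input labels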
biocypher/_config/__init__.py  (new file, 148 lines)
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Module data directory, including:
|
||||
|
||||
* The BioLink database schema
|
||||
* The default config files
|
||||
"""
|
||||
|
||||
from typing import Any, Optional
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import yaml
|
||||
import appdirs
|
||||
|
||||
__all__ = ["module_data", "module_data_path", "read_config", "config", "reset"]
|
||||
|
||||
_USER_CONFIG_DIR = appdirs.user_config_dir("biocypher", "saezlab")
|
||||
_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, "conf.yaml")
|
||||
|
||||
|
||||
class MyLoader(yaml.SafeLoader):
|
||||
def construct_scalar(self, node):
|
||||
# Check if the scalar contains double quotes and an escape sequence
|
||||
value = super().construct_scalar(node)
|
||||
q = bool(node.style == '"')
|
||||
b = bool("\\" in value.encode("unicode_escape").decode("utf-8"))
|
||||
if q and b:
|
||||
warnings.warn(
|
||||
(
|
||||
"Double quotes detected in YAML configuration scalar: "
|
||||
f"{value.encode('unicode_escape')}. "
|
||||
"These allow escape sequences and may cause problems, for "
|
||||
"instance with the Neo4j admin import files (e.g. '\\t'). "
|
||||
"Make sure you wanted to do this, and use single quotes "
|
||||
"whenever possible."
|
||||
),
|
||||
category=UserWarning,
|
||||
)
|
||||
return value
|
||||
|
||||
|
||||
def module_data_path(name: str) -> str:
|
||||
"""
|
||||
Absolute path to a YAML file shipped with the module.
|
||||
"""
|
||||
|
||||
here = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
return os.path.join(here, f"{name}.yaml")
|
||||
|
||||
|
||||
def module_data(name: str) -> Any:
|
||||
"""
|
||||
Retrieve the contents of a YAML file shipped with this module.
|
||||
"""
|
||||
|
||||
path = module_data_path(name)
|
||||
|
||||
return _read_yaml(path)
|
||||
|
||||
|
||||
def _read_yaml(path: str) -> Optional[dict]:
|
||||
if os.path.exists(path):
|
||||
with open(path, "r") as fp:
|
||||
return yaml.load(fp.read(), Loader=MyLoader)
|
||||
|
||||
|
||||
def read_config() -> dict:
|
||||
"""
|
||||
Read the module config.
|
||||
|
||||
Read and merge the built-in default, the user level and directory level
|
||||
configuration, with the latter taking precedence over the former.
|
||||
|
||||
TODO explain path configuration
|
||||
"""
|
||||
|
||||
defaults = module_data("biocypher_config")
|
||||
user = _read_yaml(_USER_CONFIG_FILE) or {}
|
||||
# TODO account for .yml?
|
||||
local = (
|
||||
_read_yaml("biocypher_config.yaml")
|
||||
or _read_yaml("config/biocypher_config.yaml")
|
||||
or {}
|
||||
)
|
||||
|
||||
for key in defaults:
|
||||
value = (
|
||||
local[key] if key in local else user[key] if key in user else None
|
||||
)
|
||||
|
||||
if value is not None:
|
||||
if isinstance(
|
||||
defaults[key], str
|
||||
): # first level config (like title)
|
||||
defaults[key] = value
|
||||
else:
|
||||
defaults[key].update(value)
|
||||
|
||||
return defaults
|
||||
|
||||
|
||||
def config(*args, **kwargs) -> Optional[Any]:
|
||||
"""
|
||||
Set or get module config parameters.
|
||||
"""
|
||||
|
||||
if args and kwargs:
|
||||
raise ValueError(
|
||||
"Setting and getting values in the same call is not allowed.",
|
||||
)
|
||||
|
||||
if args:
|
||||
result = tuple(globals()["_config"].get(key, None) for key in args)
|
||||
|
||||
return result[0] if len(result) == 1 else result
|
||||
|
||||
for key, value in kwargs.items():
|
||||
globals()["_config"][key].update(value)
|
||||
|
||||
|
||||
def reset():
|
||||
"""
|
||||
Reload configuration from the config files.
|
||||
"""
|
||||
|
||||
globals()["_config"] = read_config()
|
||||
|
||||
|
||||
reset()
|
||||
|
||||
|
||||
def update_from_file(path: str):
|
||||
"""
|
||||
Update the module configuration from a YAML file.
|
||||
"""
|
||||
|
||||
config(**_read_yaml(path))
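A minimal sketch of the get/set semantics of `config()` defined above (assuming the module is importable as `biocypher._config`; the values are illustrative):

from biocypher._config import config, reset

neo4j_settings = config("neo4j")                 # get: returns the 'neo4j' section as a dict
config(neo4j={"database_name": "my_database"})   # set: merges the given values into that section
# config("neo4j", neo4j={...}) would raise ValueError: getting and setting cannot be mixed
reset()                                          # re-read defaults, user-level and local YAML files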
biocypher/_config/biocypher_config.yaml  (new file, 141 lines)
@@ -0,0 +1,141 @@
|
||||
Title: BioCypher python module configuration file
|
||||
|
||||
## Some options are not used by default. Uncomment them to use them.
|
||||
|
||||
biocypher:
|
||||
### Required parameters ###
|
||||
## DBMS type
|
||||
|
||||
dbms: neo4j
|
||||
|
||||
## Schema configuration
|
||||
|
||||
# schema_config_path: config/schema_config.yaml
|
||||
|
||||
## Offline mode: do not connect to a running DBMS instance
|
||||
## Can be used e.g. for writing batch import files
|
||||
|
||||
offline: true
|
||||
|
||||
## Strict mode: do not allow creating new nodes or relationships without
|
||||
## specifying source, version, and license parameters
|
||||
|
||||
strict_mode: false
|
||||
|
||||
## Ontology configuration
|
||||
|
||||
head_ontology:
|
||||
url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
|
||||
root_node: entity
|
||||
# switch_label_and_id: true
|
||||
|
||||
### Optional parameters ###
|
||||
|
||||
## Logging
|
||||
# Write log to disk
|
||||
log_to_disk: true
|
||||
|
||||
# Activate more granular logging
|
||||
debug: true
|
||||
|
||||
# Change the log directory
|
||||
# log_directory: biocypher-log
|
||||
|
||||
## Data output directory
|
||||
# output_directory: biocypher-out
|
||||
|
||||
## Resource cache directory
|
||||
# cache_directory: .cache
|
||||
|
||||
## Optional tail ontologies
|
||||
|
||||
# tail_ontologies:
|
||||
# so:
|
||||
# url: test/ontologies/so.owl
|
||||
# head_join_node: sequence variant
|
||||
# tail_join_node: sequence_variant
|
||||
# switch_label_and_id: true
|
||||
# mondo:
|
||||
# url: test/ontologies/mondo.owl
|
||||
# head_join_node: disease
|
||||
# tail_join_node: disease
|
||||
# switch_label_and_id: true
|
||||
|
||||
### DBMS configuration ###
|
||||
|
||||
neo4j:
|
||||
### Neo4j configuration ###
|
||||
## Database name
|
||||
|
||||
database_name: neo4j
|
||||
|
||||
## Wipe DB before import (offline mode: --force)
|
||||
|
||||
wipe: true
|
||||
|
||||
## Neo4j authentication
|
||||
|
||||
uri: neo4j://localhost:7687
|
||||
user: neo4j
|
||||
password: neo4j
|
||||
|
||||
## Neo4j admin import batch writer settings
|
||||
|
||||
delimiter: ";"
|
||||
array_delimiter: "|"
|
||||
quote_character: "'"
|
||||
|
||||
## MultiDB functionality
|
||||
## Set to false for using community edition or older versions of Neo4j
|
||||
|
||||
multi_db: true
|
||||
|
||||
## Import options
|
||||
|
||||
skip_duplicate_nodes: false
|
||||
skip_bad_relationships: false
|
||||
|
||||
## Import call prefixes
|
||||
|
||||
# import_call_bin_prefix: bin/
|
||||
# import_call_file_prefix: path/to/files/
|
||||
|
||||
postgresql:
|
||||
### PostgreSQL configuration ###
|
||||
|
||||
# PostgreSQL connection credentials
|
||||
database_name: postgres # DB name
|
||||
user: postgres # user name
|
||||
password: postgres # password
|
||||
host: localhost # host
|
||||
port: 5432 # port
|
||||
|
||||
# PostgreSQL import batch writer settings
|
||||
quote_character: '"'
|
||||
delimiter: '\t'
|
||||
# import_call_bin_prefix: '' # path to "psql"
|
||||
# import_call_file_prefix: '/path/to/files'
|
||||
|
||||
rdf:
|
||||
### RDF configuration ###
|
||||
rdf_format: turtle
|
||||
|
||||
sqlite:
|
||||
### SQLite configuration ###
|
||||
|
||||
# SQLite connection credentials
|
||||
database_name: sqlite.db # DB name
|
||||
|
||||
# SQLite import batch writer settings
|
||||
quote_character: '"'
|
||||
delimiter: '\t'
|
||||
# import_call_bin_prefix: '' # path to "sqlite3"
|
||||
# import_call_file_prefix: '/path/to/files'
|
||||
|
||||
csv:
|
||||
### CSV/Pandas configuration ###
|
||||
delimiter: ","
|
||||
|
||||
networkx:
|
||||
### NetworkX configuration ###
|
||||
some_config: some_value # placeholder for technical reasons TODO
biocypher/_config/test_config.yaml  (new file, 5 lines)
@@ -0,0 +1,5 @@
# We test the quote detection

valid: 'This is a valid string'
also_valid: "This is also a valid string"
invalid: "\t"
biocypher/_config/test_schema_config.yaml  (new file, 140 lines)
@@ -0,0 +1,140 @@
|
||||
Title: BioCypher graph schema configuration file
|
||||
|
||||
# ---
|
||||
# "Named Things"
|
||||
# ---
|
||||
|
||||
protein:
|
||||
represented_as: node
|
||||
preferred_id: uniprot
|
||||
input_label: protein
|
||||
db_collection_name: proteins
|
||||
properties:
|
||||
name: str
|
||||
score: float
|
||||
taxon: int
|
||||
genes: str[]
|
||||
|
||||
microRNA:
|
||||
represented_as: node
|
||||
preferred_id: mirbase.mature
|
||||
input_label: mirna
|
||||
|
||||
complex:
|
||||
synonym_for: macromolecular complex
|
||||
represented_as: node
|
||||
preferred_id: complexportal
|
||||
input_label: complex
|
||||
|
||||
pathway:
|
||||
represented_as: node
|
||||
preferred_id: [reactome, wikipathways]
|
||||
input_label: [reactome, wikipathways]
|
||||
|
||||
gene:
|
||||
represented_as: node
|
||||
preferred_id: hgnc
|
||||
input_label: [hgnc, ensg]
|
||||
exclude_properties: accession
|
||||
|
||||
disease:
|
||||
represented_as: node
|
||||
preferred_id: doid
|
||||
input_label: Disease
|
||||
|
||||
side effect:
|
||||
is_a: phenotypic feature
|
||||
represented_as: node
|
||||
preferred_id: sider.effect
|
||||
input_label: sider
|
||||
|
||||
sequence variant:
|
||||
represented_as: node
|
||||
preferred_id: [clinically relevant, known, somatic]
|
||||
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
|
||||
properties:
|
||||
source: str
|
||||
original_source: str
|
||||
effect: str
|
||||
biotype: str
|
||||
|
||||
snRNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: [intact, rnacentral]
|
||||
input_label: [intact_snrna, rnacentral_snrna]
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
exclude_properties: sequence
|
||||
|
||||
DNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: ensembl
|
||||
input_label: dna
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
sequence: str
|
||||
|
||||
dsDNA sequence:
|
||||
is_a: [DNA sequence, nucleic acid entity]
|
||||
inherit_properties: True
|
||||
represented_as: node
|
||||
preferred_id: [intact, uniparc]
|
||||
input_label: [intact_dsdna, uniprot_archive_dsdna]
|
||||
|
||||
# ---
|
||||
# Associations
|
||||
# ---
|
||||
|
||||
post translational interaction:
|
||||
is_a: pairwise molecular interaction
|
||||
represented_as: node
|
||||
label_as_edge: INTERACTS_POST_TRANSLATIONAL
|
||||
input_label: post_translational
|
||||
|
||||
phosphorylation:
|
||||
is_a: post translational interaction
|
||||
represented_as: edge
|
||||
input_label: phosphorylation
|
||||
|
||||
gene to disease association:
|
||||
represented_as: edge
|
||||
label_as_edge: PERTURBED_IN_DISEASE
|
||||
input_label: [protein_disease, gene_disease]
|
||||
exclude_properties: accession
|
||||
|
||||
mutation to tissue association:
|
||||
is_a: [genotype to tissue association, entity to tissue association, association]
|
||||
represented_as: edge
|
||||
label_as_edge: Is_Mutated_In
|
||||
input_label: Gene_Is_Mutated_In_Cell_Tissue
|
||||
|
||||
variant to gene association: # -> Known.... and Somatic....
|
||||
represented_as: edge
|
||||
source: [known.sequence variant, somatic.sequence variant]
|
||||
target: gene
|
||||
input_label: [
|
||||
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
|
||||
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
|
||||
]
|
||||
|
||||
gene to gene association:
|
||||
represented_as: edge
|
||||
input_label: gene_gene
|
||||
properties:
|
||||
directional: bool
|
||||
curated: bool
|
||||
score: float
|
||||
id: str # should be removed
|
||||
|
||||
gene to variant association: # should be removed
|
||||
is_a: gene to variant association
|
||||
represented_as: edge
|
||||
input_label: gene_variant
biocypher/_config/test_schema_config_disconnected.yaml  (new file, 3 lines)
@@ -0,0 +1,3 @@
disconnected:
  represented_as: node
  label_in_input: disconnected
biocypher/_config/test_schema_config_extended.yaml  (new file, 152 lines)
@@ -0,0 +1,152 @@
|
||||
Title: BioCypher graph schema configuration file
|
||||
|
||||
# ---
|
||||
# "Named Things"
|
||||
# ---
|
||||
|
||||
protein:
|
||||
represented_as: node
|
||||
preferred_id: uniprot
|
||||
input_label: protein
|
||||
db_collection_name: proteins
|
||||
properties:
|
||||
name: str
|
||||
score: float
|
||||
taxon: int
|
||||
genes: str[]
|
||||
|
||||
microRNA:
|
||||
represented_as: node
|
||||
preferred_id: mirbase.mature
|
||||
input_label: mirna
|
||||
|
||||
complex:
|
||||
synonym_for: macromolecular complex
|
||||
represented_as: node
|
||||
preferred_id: complexportal
|
||||
input_label: complex
|
||||
|
||||
pathway:
|
||||
represented_as: node
|
||||
preferred_id: [reactome, wikipathways]
|
||||
input_label: [reactome, wikipathways]
|
||||
|
||||
gene:
|
||||
represented_as: node
|
||||
preferred_id: hgnc
|
||||
input_label: [hgnc, ensg]
|
||||
exclude_properties: accession
|
||||
|
||||
disease:
|
||||
represented_as: node
|
||||
preferred_id: doid
|
||||
input_label: Disease
|
||||
|
||||
side effect:
|
||||
is_a: phenotypic feature
|
||||
represented_as: node
|
||||
preferred_id: sider.effect
|
||||
input_label: sider
|
||||
|
||||
sequence variant:
|
||||
represented_as: node
|
||||
preferred_id: [clinically relevant, known, somatic]
|
||||
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
|
||||
properties:
|
||||
source: str
|
||||
original_source: str
|
||||
effect: str
|
||||
biotype: str
|
||||
|
||||
altered gene product level:
|
||||
represented_as: node
|
||||
input_label: agpl
|
||||
|
||||
decreased gene product level:
|
||||
represented_as: node
|
||||
input_label: agpl_decreased
|
||||
|
||||
lethal variant:
|
||||
represented_as: node
|
||||
input_label: lethal
|
||||
|
||||
snRNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: [intact, rnacentral]
|
||||
input_label: [intact_snrna, rnacentral_snrna]
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
exclude_properties: sequence
|
||||
|
||||
DNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: ensembl
|
||||
input_label: dna
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
sequence: str
|
||||
|
||||
dsDNA sequence:
|
||||
is_a: [DNA sequence, nucleic acid entity]
|
||||
inherit_properties: True
|
||||
represented_as: node
|
||||
preferred_id: [intact, uniparc]
|
||||
input_label: [intact_dsdna, uniprot_archive_dsdna]
|
||||
|
||||
# ---
|
||||
# Associations
|
||||
# ---
|
||||
|
||||
post translational interaction:
|
||||
is_a: pairwise molecular interaction
|
||||
represented_as: node
|
||||
label_as_edge: INTERACTS_POST_TRANSLATIONAL
|
||||
input_label: post_translational
|
||||
|
||||
phosphorylation:
|
||||
is_a: post translational interaction
|
||||
represented_as: edge
|
||||
use_id: false
|
||||
input_label: phosphorylation
|
||||
|
||||
gene to disease association:
|
||||
represented_as: edge
|
||||
label_as_edge: PERTURBED_IN_DISEASE
|
||||
input_label: [protein_disease, gene_disease]
|
||||
exclude_properties: accession
|
||||
|
||||
mutation to tissue association:
|
||||
is_a: [genotype to tissue association, entity to tissue association, association]
|
||||
represented_as: edge
|
||||
label_as_edge: Is_Mutated_In
|
||||
input_label: Gene_Is_Mutated_In_Cell_Tissue
|
||||
|
||||
variant to gene association: # -> Known.... and Somatic....
|
||||
represented_as: edge
|
||||
source: [known.sequence variant, somatic.sequence variant]
|
||||
target: gene
|
||||
input_label: [
|
||||
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
|
||||
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
|
||||
]
|
||||
|
||||
gene to gene association:
|
||||
represented_as: edge
|
||||
input_label: gene_gene
|
||||
properties:
|
||||
directional: bool
|
||||
curated: bool
|
||||
score: float
|
||||
|
||||
gene to variant association:
|
||||
is_a: gene to variant association
|
||||
represented_as: edge
|
||||
input_label: gene_variant
biocypher/_core.py  (new file, 734 lines)
@@ -0,0 +1,734 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher core module. Interfaces with the user and distributes tasks to
|
||||
submodules.
|
||||
"""
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
import os
|
||||
import json
|
||||
|
||||
from more_itertools import peekable
|
||||
import yaml
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from ._get import Downloader
|
||||
from ._config import config as _config
|
||||
from ._config import update_from_file as _file_update
|
||||
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
from ._mapping import OntologyMapping
|
||||
from ._ontology import Ontology
|
||||
from ._translate import Translator
|
||||
from ._deduplicate import Deduplicator
|
||||
from .output.in_memory._pandas import Pandas
|
||||
from .output.write._get_writer import DBMS_TO_CLASS, get_writer
|
||||
from .output.connect._neo4j_driver import get_driver
|
||||
|
||||
__all__ = ["BioCypher"]
|
||||
|
||||
SUPPORTED_DBMS = DBMS_TO_CLASS.keys()
|
||||
|
||||
REQUIRED_CONFIG = [
|
||||
"dbms",
|
||||
"offline",
|
||||
"strict_mode",
|
||||
"head_ontology",
|
||||
]
|
||||
|
||||
|
||||
class BioCypher:
|
||||
"""
|
||||
Orchestration of BioCypher operations. Instantiate this class to interact
|
||||
with BioCypher.
|
||||
|
||||
Args:
|
||||
|
||||
dbms (str): The database management system to use. For supported
|
||||
systems see SUPPORTED_DBMS.
|
||||
|
||||
offline (bool): Whether to run in offline mode. If True, no
|
||||
connection to the database will be made.
|
||||
|
||||
strict_mode (bool): Whether to run in strict mode. If True, the
|
||||
translator will raise an error if a node or edge does not
|
||||
provide source, version, and licence information.
|
||||
|
||||
biocypher_config_path (str): Path to the BioCypher config file.
|
||||
|
||||
schema_config_path (str): Path to the user schema config
|
||||
file.
|
||||
|
||||
head_ontology (dict): The head ontology defined by URL ('url') and root
|
||||
node ('root_node').
|
||||
|
||||
tail_ontologies (dict): The tail ontologies defined by URL and
|
||||
join nodes for both head and tail ontology.
|
||||
|
||||
output_directory (str): Path to the output directory. If not
|
||||
provided, the default value 'biocypher-out' will be used.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dbms: str = None,
|
||||
offline: bool = None,
|
||||
strict_mode: bool = None,
|
||||
biocypher_config_path: str = None,
|
||||
schema_config_path: str = None,
|
||||
head_ontology: dict = None,
|
||||
tail_ontologies: dict = None,
|
||||
output_directory: str = None,
|
||||
cache_directory: str = None,
|
||||
# legacy params
|
||||
db_name: str = None,
|
||||
):
|
||||
# Update configuration if custom path is provided
|
||||
if biocypher_config_path:
|
||||
_file_update(biocypher_config_path)
|
||||
|
||||
if db_name:
|
||||
logger.warning(
|
||||
"The parameter `db_name` is deprecated. Please set the "
|
||||
"`database_name` setting in the `biocypher_config.yaml` file "
|
||||
"instead."
|
||||
)
|
||||
_config(**{db_name: {"database_name": db_name}})
|
||||
|
||||
# Load configuration
|
||||
self.base_config = _config("biocypher")
|
||||
|
||||
# Check for required configuration
|
||||
for key in REQUIRED_CONFIG:
|
||||
if key not in self.base_config:
|
||||
raise ValueError(f"Configuration key {key} is required.")
|
||||
|
||||
# Set configuration - mandatory
|
||||
self._dbms = dbms or self.base_config["dbms"]
|
||||
|
||||
if offline is None:
|
||||
self._offline = self.base_config["offline"]
|
||||
else:
|
||||
self._offline = offline
|
||||
|
||||
if strict_mode is None:
|
||||
self._strict_mode = self.base_config["strict_mode"]
|
||||
else:
|
||||
self._strict_mode = strict_mode
|
||||
|
||||
self._schema_config_path = schema_config_path or self.base_config.get(
|
||||
"schema_config_path"
|
||||
)
|
||||
|
||||
if not self._schema_config_path:
|
||||
logger.warning("Running BioCypher without schema configuration.")
|
||||
else:
|
||||
logger.info(
|
||||
f"Running BioCypher with schema configuration from {self._schema_config_path}."
|
||||
)
|
||||
|
||||
self._head_ontology = head_ontology or self.base_config["head_ontology"]
|
||||
|
||||
# Set configuration - optional
|
||||
self._output_directory = output_directory or self.base_config.get(
|
||||
"output_directory"
|
||||
)
|
||||
self._cache_directory = cache_directory or self.base_config.get(
|
||||
"cache_directory"
|
||||
)
|
||||
self._tail_ontologies = tail_ontologies or self.base_config.get(
|
||||
"tail_ontologies"
|
||||
)
|
||||
|
||||
if self._dbms not in SUPPORTED_DBMS:
|
||||
raise ValueError(
|
||||
f"DBMS {self._dbms} not supported. "
|
||||
f"Please select from {SUPPORTED_DBMS}."
|
||||
)
|
||||
|
||||
# Initialize
|
||||
self._ontology_mapping = None
|
||||
self._deduplicator = None
|
||||
self._translator = None
|
||||
self._downloader = None
|
||||
self._ontology = None
|
||||
self._writer = None
|
||||
self._pd = None
|
||||
|
||||
def _get_deduplicator(self) -> Deduplicator:
|
||||
"""
|
||||
Create deduplicator if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._deduplicator:
|
||||
self._deduplicator = Deduplicator()
|
||||
|
||||
return self._deduplicator
|
||||
|
||||
def _get_ontology_mapping(self) -> OntologyMapping:
|
||||
"""
|
||||
Create ontology mapping if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._schema_config_path:
|
||||
self._ontology_mapping = OntologyMapping()
|
||||
|
||||
if not self._ontology_mapping:
|
||||
self._ontology_mapping = OntologyMapping(
|
||||
config_file=self._schema_config_path,
|
||||
)
|
||||
|
||||
return self._ontology_mapping
|
||||
|
||||
def _get_ontology(self) -> Ontology:
|
||||
"""
|
||||
Create ontology if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._ontology:
|
||||
self._ontology = Ontology(
|
||||
ontology_mapping=self._get_ontology_mapping(),
|
||||
head_ontology=self._head_ontology,
|
||||
tail_ontologies=self._tail_ontologies,
|
||||
)
|
||||
|
||||
return self._ontology
|
||||
|
||||
def _get_translator(self) -> Translator:
|
||||
"""
|
||||
Create translator if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._translator:
|
||||
self._translator = Translator(
|
||||
ontology=self._get_ontology(),
|
||||
strict_mode=self._strict_mode,
|
||||
)
|
||||
|
||||
return self._translator
|
||||
|
||||
def _get_writer(self):
|
||||
"""
|
||||
Create writer if in offline mode. Set as instance variable `self._writer`.
|
||||
"""
|
||||
|
||||
if self._offline:
|
||||
timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
|
||||
outdir = self._output_directory or os.path.join(
|
||||
"biocypher-out", timestamp()
|
||||
)
|
||||
self._output_directory = os.path.abspath(outdir)
|
||||
|
||||
self._writer = get_writer(
|
||||
dbms=self._dbms,
|
||||
translator=self._get_translator(),
|
||||
deduplicator=self._get_deduplicator(),
|
||||
output_directory=self._output_directory,
|
||||
strict_mode=self._strict_mode,
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError("Cannot get writer in online mode.")
|
||||
|
||||
def _get_driver(self):
|
||||
"""
|
||||
Create driver if not exists. Set as instance variable `self._driver`.
|
||||
"""
|
||||
|
||||
if not self._offline:
|
||||
self._driver = get_driver(
|
||||
dbms=self._dbms,
|
||||
translator=self._get_translator(),
|
||||
deduplicator=self._get_deduplicator(),
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError("Cannot get driver in offline mode.")
|
||||
|
||||
def write_nodes(
|
||||
self, nodes, batch_size: int = int(1e6), force: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Write nodes to database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherNode`` objects) or an iterable of
|
||||
``BioCypherNode`` objects.
|
||||
|
||||
Args:
|
||||
nodes (iterable): An iterable of nodes to write to the database.
|
||||
|
||||
batch_size (int): The batch size to use when writing to disk.
|
||||
|
||||
force (bool): Whether to force writing to the output directory even
|
||||
if the node type is not present in the schema config file.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._writer:
|
||||
self._get_writer()
|
||||
|
||||
nodes = peekable(nodes)
|
||||
if not isinstance(nodes.peek(), BioCypherNode):
|
||||
tnodes = self._translator.translate_nodes(nodes)
|
||||
else:
|
||||
tnodes = nodes
|
||||
# write node files
|
||||
return self._writer.write_nodes(
|
||||
tnodes, batch_size=batch_size, force=force
|
||||
)
|
||||
|
||||
def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
|
||||
"""
|
||||
Write edges to database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherEdge`` objects) or an iterable of
|
||||
``BioCypherEdge`` objects.
|
||||
|
||||
Args:
|
||||
edges (iterable): An iterable of edges to write to the database.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._writer:
|
||||
self._get_writer()
|
||||
|
||||
edges = peekable(edges)
|
||||
if not isinstance(edges.peek(), BioCypherEdge):
|
||||
tedges = self._translator.translate_edges(edges)
|
||||
else:
|
||||
tedges = edges
|
||||
# write edge files
|
||||
return self._writer.write_edges(tedges, batch_size=batch_size)
|
||||
|
||||
def to_df(self) -> list[pd.DataFrame]:
|
||||
"""
|
||||
Convert entities to a pandas DataFrame for each entity type and return
|
||||
a list.
|
||||
|
||||
Returns:
list[pd.DataFrame]: A list of pandas DataFrames, one per entity type.
|
||||
"""
|
||||
if not self._pd:
|
||||
raise ValueError(
|
||||
"No pandas instance found. Please call `add()` first."
|
||||
)
|
||||
|
||||
return self._pd.dfs
|
||||
|
||||
def add(self, entities) -> None:
|
||||
"""
|
||||
Function to add entities to the in-memory database. Accepts an iterable
|
||||
of tuples (if given, translates to ``BioCypherNode`` or
|
||||
``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
|
||||
``BioCypherEdge`` objects.
|
||||
|
||||
Args:
|
||||
entities (iterable): An iterable of entities to add to the database.
|
||||
Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
|
||||
4-tuples for edges (deprecated).
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
if not self._pd:
|
||||
self._pd = Pandas(
|
||||
translator=self._get_translator(),
|
||||
deduplicator=self._get_deduplicator(),
|
||||
)
|
||||
|
||||
entities = peekable(entities)
|
||||
|
||||
if (
|
||||
isinstance(entities.peek(), BioCypherNode)
|
||||
or isinstance(entities.peek(), BioCypherEdge)
|
||||
or isinstance(entities.peek(), BioCypherRelAsNode)
|
||||
):
|
||||
tentities = entities
|
||||
elif len(entities.peek()) < 4:
|
||||
tentities = self._translator.translate_nodes(entities)
|
||||
else:
|
||||
tentities = self._translator.translate_edges(entities)
|
||||
|
||||
self._pd.add_tables(tentities)
|
||||
|
||||
def add_nodes(self, nodes) -> None:
|
||||
"""
|
||||
Wrapper for ``add()`` to add nodes to the in-memory database.
|
||||
|
||||
Args:
|
||||
nodes (iterable): An iterable of node tuples to add to the database.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
self.add(nodes)
|
||||
|
||||
def add_edges(self, edges) -> None:
|
||||
"""
|
||||
Wrapper for ``add()`` to add edges to the in-memory database.
|
||||
|
||||
Args:
|
||||
edges (iterable): An iterable of edge tuples to add to the database.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
self.add(edges)
|
||||
|
||||
def merge_nodes(self, nodes) -> bool:
|
||||
"""
|
||||
Merge nodes into database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherNode`` objects) or an iterable of
|
||||
``BioCypherNode`` objects.
|
||||
|
||||
Args:
|
||||
nodes (iterable): An iterable of nodes to merge into the database.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._driver:
|
||||
self._get_driver()
|
||||
|
||||
nodes = peekable(nodes)
|
||||
if not isinstance(nodes.peek(), BioCypherNode):
|
||||
tnodes = self._translator.translate_nodes(nodes)
|
||||
else:
|
||||
tnodes = nodes
|
||||
# write node files
|
||||
return self._driver.add_biocypher_nodes(tnodes)
|
||||
|
||||
def merge_edges(self, edges) -> bool:
|
||||
"""
|
||||
Merge edges into database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherEdge`` objects) or an iterable of
|
||||
``BioCypherEdge`` objects.
|
||||
|
||||
Args:
|
||||
edges (iterable): An iterable of edges to merge into the database.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._driver:
|
||||
self._get_driver()
|
||||
|
||||
edges = peekable(edges)
|
||||
if not isinstance(edges.peek(), BioCypherEdge):
|
||||
tedges = self._translator.translate_edges(edges)
|
||||
else:
|
||||
tedges = edges
|
||||
# write edge files
|
||||
return self._driver.add_biocypher_edges(tedges)
|
||||
|
||||
# DOWNLOAD AND CACHE MANAGEMENT METHODS ###
|
||||
|
||||
def _get_downloader(self, cache_dir: Optional[str] = None):
|
||||
"""
|
||||
Create downloader if not exists.
|
||||
"""
|
||||
|
||||
if not self._downloader:
|
||||
self._downloader = Downloader(self._cache_directory)
|
||||
|
||||
def download(self, *resources) -> None:
|
||||
"""
|
||||
Use the :class:`Downloader` class to download or load from cache the
|
||||
resources given by the adapter.
|
||||
"""
|
||||
|
||||
self._get_downloader()
|
||||
return self._downloader.download(*resources)
|
||||
|
||||
# OVERVIEW AND CONVENIENCE METHODS ###
|
||||
|
||||
def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
|
||||
"""
|
||||
|
||||
Get the set of input labels encountered without an entry in the
|
||||
`schema_config.yaml` and print them to the logger.
|
||||
|
||||
Returns:
|
||||
|
||||
Optional[Dict[str, List[str]]]: A dictionary of Biolink types
|
||||
encountered without an entry in the `schema_config.yaml` file.
|
||||
|
||||
"""
|
||||
|
||||
mt = self._translator.get_missing_biolink_types()
|
||||
|
||||
if mt:
|
||||
msg = (
"Input entities not accounted for because they are not "
|
||||
f"present in the schema configuration file {self._schema_config_path} "
|
||||
"(this is not necessarily a problem, if you did not intend "
|
||||
"to include them in the database; see the log for details): \n"
|
||||
)
|
||||
for k, v in mt.items():
|
||||
msg += f" {k}: {v} \n"
|
||||
|
||||
logger.info(msg)
|
||||
return mt
|
||||
|
||||
else:
|
||||
logger.info("No missing labels in input.")
|
||||
return None
|
||||
|
||||
def log_duplicates(self) -> None:
|
||||
"""
|
||||
Get the set of duplicate nodes and edges encountered and print them to
|
||||
the logger.
|
||||
"""
|
||||
|
||||
dn = self._deduplicator.get_duplicate_nodes()
|
||||
|
||||
if dn:
|
||||
ntypes = dn[0]
|
||||
nids = dn[1]
|
||||
|
||||
msg = "Duplicate node types encountered (IDs in log): \n"
|
||||
for typ in ntypes:
|
||||
msg += f" {typ}\n"
|
||||
|
||||
logger.info(msg)
|
||||
|
||||
idmsg = "Duplicate node IDs encountered: \n"
|
||||
for _id in nids:
|
||||
idmsg += f" {_id}\n"
|
||||
|
||||
logger.debug(idmsg)
|
||||
|
||||
else:
|
||||
logger.info("No duplicate nodes in input.")
|
||||
|
||||
de = self._deduplicator.get_duplicate_edges()
|
||||
|
||||
if de:
|
||||
etypes = de[0]
|
||||
eids = de[1]
|
||||
|
||||
msg = "Duplicate edge types encountered (IDs in log): \n"
|
||||
for typ in etypes:
|
||||
msg += f" {typ}\n"
|
||||
|
||||
logger.info(msg)
|
||||
|
||||
idmsg = "Duplicate edge IDs encountered: \n"
|
||||
for _id in eids:
|
||||
idmsg += f" {_id}\n"
|
||||
|
||||
logger.debug(idmsg)
|
||||
|
||||
else:
|
||||
logger.info("No duplicate edges in input.")
|
||||
|
||||
def show_ontology_structure(self, **kwargs) -> None:
|
||||
"""
|
||||
Show the ontology structure using treelib or write to GRAPHML file.
|
||||
|
||||
Args:
|
||||
|
||||
to_disk (str): If specified, the ontology structure will be saved
|
||||
to disk as a GRAPHML file, to be opened in your favourite
|
||||
graph visualisation tool.
|
||||
|
||||
full (bool): If True, the full ontology structure will be shown,
|
||||
including all nodes and edges. If False, only the nodes and
|
||||
edges that are relevant to the extended schema will be shown.
|
||||
"""
|
||||
|
||||
if not self._ontology:
|
||||
self._get_ontology()
|
||||
|
||||
return self._ontology.show_ontology_structure(**kwargs)
|
||||
|
||||
def write_import_call(self) -> str:
|
||||
"""
|
||||
Write a shell script to import the database depending on the chosen
|
||||
DBMS.
|
||||
|
||||
Returns:
|
||||
str: path toward the file holding the import call.
|
||||
"""
|
||||
|
||||
if not self._offline:
|
||||
raise NotImplementedError(
|
||||
"Cannot write import call in online mode."
|
||||
)
|
||||
|
||||
return self._writer.write_import_call()
|
||||
|
||||
def write_schema_info(self, as_node: bool = False) -> None:
|
||||
"""
|
||||
Write an extended schema info YAML file that extends the
|
||||
`schema_config.yaml` with run-time information of the built KG. For
|
||||
instance, it includes information on whether something is present in the actual
|
||||
knowledge graph, whether it is a relationship (which is important in the
|
||||
case of representing relationships as nodes) and the actual sources and
|
||||
targets of edges. Since this file can be used in place of the original
|
||||
`schema_config.yaml` file, it indicates that it is the extended schema
|
||||
by setting `is_schema_info` to `true`.
|
||||
|
||||
We start by using the `extended_schema` dictionary from the ontology
|
||||
class instance, which contains all expanded entities and relationships.
|
||||
The information of whether something is a relationship can be gathered
|
||||
from the deduplicator instance, which keeps track of all entities that
|
||||
have been seen.
|
||||
"""
|
||||
|
||||
if not self._offline:
|
||||
raise NotImplementedError(
|
||||
"Cannot write schema info in online mode."
|
||||
)
|
||||
|
||||
ontology = self._get_ontology()
|
||||
schema = ontology.mapping.extended_schema.copy()
|
||||
schema["is_schema_info"] = True
|
||||
|
||||
deduplicator = self._get_deduplicator()
|
||||
for node in deduplicator.entity_types:
|
||||
if node in schema.keys():
|
||||
schema[node]["present_in_knowledge_graph"] = True
|
||||
schema[node]["is_relationship"] = False
|
||||
else:
|
||||
logger.info(
|
||||
f"Node {node} not present in extended schema. "
|
||||
"Skipping schema info."
|
||||
)
|
||||
|
||||
# find 'label_as_edge' cases in schema entries
|
||||
changed_labels = {}
|
||||
for k, v in schema.items():
|
||||
if not isinstance(v, dict):
|
||||
continue
|
||||
if "label_as_edge" in v.keys():
|
||||
if v["label_as_edge"] in deduplicator.seen_relationships.keys():
|
||||
changed_labels[v["label_as_edge"]] = k
|
||||
|
||||
for edge in deduplicator.seen_relationships.keys():
|
||||
if edge in changed_labels.keys():
|
||||
edge = changed_labels[edge]
|
||||
if edge in schema.keys():
|
||||
schema[edge]["present_in_knowledge_graph"] = True
|
||||
schema[edge]["is_relationship"] = True
|
||||
# TODO information about source and target nodes
|
||||
else:
|
||||
logger.info(
|
||||
f"Edge {edge} not present in extended schema. "
|
||||
"Skipping schema info."
|
||||
)
|
||||
|
||||
# write to output directory as YAML file
|
||||
path = os.path.join(self._output_directory, "schema_info.yaml")
|
||||
with open(path, "w") as f:
|
||||
f.write(yaml.dump(schema))
|
||||
|
||||
if as_node:
|
||||
# write as node
|
||||
node = BioCypherNode(
|
||||
node_id="schema_info",
|
||||
node_label="schema_info",
|
||||
properties={"schema_info": json.dumps(schema)},
|
||||
)
|
||||
self.write_nodes([node], force=True)
|
||||
|
||||
# override import call with added schema info node
|
||||
self.write_import_call()
|
||||
|
||||
return schema
|
||||
|
||||
# TRANSLATION METHODS ###
|
||||
|
||||
def translate_term(self, term: str) -> str:
|
||||
"""
|
||||
Translate a term to its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
term (str): The term to translate.
|
||||
|
||||
Returns:
|
||||
str: The BioCypher equivalent of the term.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.translate_term(term)
|
||||
|
||||
def summary(self) -> None:
|
||||
"""
|
||||
Wrapper for showing ontology structure and logging duplicates and
|
||||
missing input types.
|
||||
"""
|
||||
|
||||
self.show_ontology_structure()
|
||||
self.log_duplicates()
|
||||
self.log_missing_input_labels()
|
||||
|
||||
def reverse_translate_term(self, term: str) -> str:
|
||||
"""
|
||||
Reverse translate a term from its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
term (str): The BioCypher term to reverse translate.
|
||||
|
||||
Returns:
|
||||
str: The original term.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.reverse_translate_term(term)
|
||||
|
||||
def translate_query(self, query: str) -> str:
|
||||
"""
|
||||
Translate a query to its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
query (str): The query to translate.
|
||||
|
||||
Returns:
|
||||
str: The BioCypher equivalent of the query.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.translate(query)
|
||||
|
||||
def reverse_translate_query(self, query: str) -> str:
|
||||
"""
|
||||
Reverse translate a query from its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
query (str): The BioCypher query to reverse translate.
|
||||
|
||||
Returns:
|
||||
str: The original query.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.reverse_translate(query)
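A short sketch of the in-memory (pandas) path through `add()` and `to_df()` shown above; `dbms="csv"` is assumed to be among SUPPORTED_DBMS, and the input tuple is hypothetical:

from biocypher import BioCypher

bc = BioCypher(dbms="csv", offline=True)
bc.add([("P12345", "protein", {"name": "example"})])  # 3-tuples are translated as nodes
dfs = bc.to_df()  # list of DataFrames, one per entity type; raises if add() was never called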
biocypher/_create.py  (new file, 356 lines)
@@ -0,0 +1,356 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'create' module. Handles the creation of BioCypher node and edge
|
||||
dataclasses.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Union
|
||||
from dataclasses import field, dataclass
|
||||
import os
|
||||
|
||||
__all__ = [
|
||||
"BioCypherEdge",
|
||||
"BioCypherNode",
|
||||
"BioCypherRelAsNode",
|
||||
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BioCypherNode:
|
||||
"""
|
||||
Handoff class to represent biomedical entities as Neo4j nodes.
|
||||
|
||||
Has id, label, property dict; id and label (in the Neo4j sense of a
|
||||
label, ie, the entity descriptor after the colon, such as
|
||||
":Protein") are non-optional and called node_id and node_label to
|
||||
avoid confusion with "label" properties. Node labels are written in
|
||||
PascalCase and as nouns, as per Neo4j consensus.
|
||||
|
||||
Args:
|
||||
node_id (string): consensus "best" id for biological entity
|
||||
node_label (string): primary type of entity, capitalised
|
||||
**properties (kwargs): collection of all other properties to be
|
||||
passed to neo4j for the respective node (dict)
|
||||
|
||||
Todo:
|
||||
- check and correct small inconsistencies such as capitalisation
|
||||
of ID names ("uniprot" vs "UniProt")
|
||||
- check for correct ID patterns (eg "ENSG" + string of numbers,
|
||||
uniprot length)
|
||||
- ID conversion using pypath translation facilities for now
|
||||
"""
|
||||
|
||||
node_id: str
|
||||
node_label: str
|
||||
preferred_id: str = "id"
|
||||
properties: dict = field(default_factory=dict)
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
Add id field to properties.
|
||||
|
||||
Check for reserved keywords.
|
||||
|
||||
Replace unwanted characters in properties.
|
||||
"""
|
||||
self.properties["id"] = self.node_id
|
||||
self.properties["preferred_id"] = self.preferred_id or None
|
||||
# TODO actually make None possible here; as is, "id" is the default in
|
||||
# the dataclass as well as in the configuration file
|
||||
|
||||
if ":TYPE" in self.properties.keys():
|
||||
logger.warning(
|
||||
"Keyword ':TYPE' is reserved for Neo4j. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties[":TYPE"]
|
||||
|
||||
for k, v in self.properties.items():
|
||||
if isinstance(v, str):
|
||||
self.properties[k] = (
|
||||
v.replace(
|
||||
os.linesep,
|
||||
" ",
|
||||
)
|
||||
.replace(
|
||||
"\n",
|
||||
" ",
|
||||
)
|
||||
.replace(
|
||||
"\r",
|
||||
" ",
|
||||
)
|
||||
)
|
||||
|
||||
elif isinstance(v, list):
|
||||
# modified BioCypher, because the data contained integers in lists
|
||||
self.properties[k] = [
|
||||
(str(val) if isinstance(val, (int, float)) else val)
|
||||
.replace(os.linesep, " ")
|
||||
.replace("\n", " ")
|
||||
.replace("\r", " ")
|
||||
for val in v
|
||||
]
|
||||
|
||||
def get_id(self) -> str:
|
||||
"""
|
||||
Returns primary node identifier.
|
||||
|
||||
Returns:
|
||||
str: node_id
|
||||
"""
|
||||
return self.node_id
|
||||
|
||||
def get_label(self) -> str:
|
||||
"""
|
||||
Returns primary node label.
|
||||
|
||||
Returns:
|
||||
str: node_label
|
||||
"""
|
||||
return self.node_label
|
||||
|
||||
def get_type(self) -> str:
|
||||
"""
|
||||
Returns primary node label.
|
||||
|
||||
Returns:
|
||||
str: node_label
|
||||
"""
|
||||
return self.node_label
|
||||
|
||||
def get_preferred_id(self) -> str:
|
||||
"""
|
||||
Returns preferred id.
|
||||
|
||||
Returns:
|
||||
str: preferred_id
|
||||
"""
|
||||
return self.preferred_id
|
||||
|
||||
def get_properties(self) -> dict:
|
||||
"""
|
||||
Returns all other node properties apart from primary id and
|
||||
label as key-value pairs.
|
||||
|
||||
Returns:
|
||||
dict: properties
|
||||
"""
|
||||
return self.properties
|
||||
|
||||
def get_dict(self) -> dict:
|
||||
"""
|
||||
Return dict of id, labels, and properties.
|
||||
|
||||
Returns:
|
||||
dict: node_id and node_label as top-level key-value pairs,
|
||||
properties as second-level dict.
|
||||
"""
|
||||
return {
|
||||
"node_id": self.node_id,
|
||||
"node_label": self.node_label,
|
||||
"properties": self.properties,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BioCypherEdge:
|
||||
"""
|
||||
Handoff class to represent biomedical relationships in Neo4j.
|
||||
|
||||
Has source and target ids, label, property dict; ids and label (in
|
||||
the Neo4j sense of a label, ie, the entity descriptor after the
|
||||
colon, such as ":TARGETS") are non-optional and called source_id,
|
||||
target_id, and relationship_label to avoid confusion with properties
|
||||
called "label", which usually denotes the human-readable form.
|
||||
Relationship labels are written in UPPERCASE and as verbs, as per
|
||||
Neo4j consensus.
|
||||
|
||||
Args:
|
||||
|
||||
source_id (string): consensus "best" id for biological entity
|
||||
|
||||
target_id (string): consensus "best" id for biological entity
|
||||
|
||||
relationship_label (string): type of interaction, UPPERCASE
|
||||
|
||||
properties (dict): collection of all other properties of the
|
||||
respective edge
|
||||
|
||||
"""
|
||||
|
||||
source_id: str
|
||||
target_id: str
|
||||
relationship_label: str
|
||||
relationship_id: str = None
|
||||
properties: dict = field(default_factory=dict)
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
Check for reserved keywords.
|
||||
"""
|
||||
|
||||
if ":TYPE" in self.properties.keys():
|
||||
logger.debug(
|
||||
"Keyword ':TYPE' is reserved for Neo4j. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties[":TYPE"]
|
||||
elif "id" in self.properties.keys():
|
||||
logger.debug(
|
||||
"Keyword 'id' is reserved for Neo4j. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties["id"]
|
||||
elif "_ID" in self.properties.keys():
|
||||
logger.debug(
|
||||
"Keyword '_ID' is reserved for Postgres. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties["_ID"]
|
||||
|
||||
def get_id(self) -> Union[str, None]:
|
||||
"""
|
||||
Returns primary node identifier or None.
|
||||
|
||||
Returns:
|
||||
str: node_id
|
||||
"""
|
||||
|
||||
return self.relationship_id
|
||||
|
||||
def get_source_id(self) -> str:
|
||||
"""
|
||||
Returns primary node identifier of relationship source.
|
||||
|
||||
Returns:
|
||||
str: source_id
|
||||
"""
|
||||
return self.source_id
|
||||
|
||||
def get_target_id(self) -> str:
|
||||
"""
|
||||
Returns primary node identifier of relationship target.
|
||||
|
||||
Returns:
|
||||
str: target_id
|
||||
"""
|
||||
return self.target_id
|
||||
|
||||
def get_label(self) -> str:
|
||||
"""
|
||||
Returns relationship label.
|
||||
|
||||
Returns:
|
||||
str: relationship_label
|
||||
"""
|
||||
return self.relationship_label
|
||||
|
||||
def get_type(self) -> str:
|
||||
"""
|
||||
Returns relationship label.
|
||||
|
||||
Returns:
|
||||
str: relationship_label
|
||||
"""
|
||||
return self.relationship_label
|
||||
|
||||
def get_properties(self) -> dict:
|
||||
"""
|
||||
Returns all other relationship properties apart from primary ids
|
||||
and label as key-value pairs.
|
||||
|
||||
Returns:
|
||||
dict: properties
|
||||
"""
|
||||
return self.properties
|
||||
|
||||
def get_dict(self) -> dict:
|
||||
"""
|
||||
Return dict of ids, label, and properties.
|
||||
|
||||
Returns:
|
||||
dict: source_id, target_id and relationship_label as
|
||||
top-level key-value pairs, properties as second-level
|
||||
dict.
|
||||
"""
|
||||
return {
|
||||
"relationship_id": self.relationship_id or None,
|
||||
"source_id": self.source_id,
|
||||
"target_id": self.target_id,
|
||||
"relationship_label": self.relationship_label,
|
||||
"properties": self.properties,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BioCypherRelAsNode:
|
||||
"""
|
||||
Class to represent relationships as nodes (with in- and outgoing
|
||||
edges) as a triplet of a BioCypherNode and two BioCypherEdges. Main
|
||||
usage in type checking (instances where the receiving function needs
|
||||
to check whether it receives a relationship as a single edge or as
|
||||
a triplet).
|
||||
|
||||
Args:
|
||||
|
||||
node (BioCypherNode): node representing the relationship
|
||||
|
||||
source_edge (BioCypherEdge): edge representing the source of the
|
||||
relationship
|
||||
|
||||
target_edge (BioCypherEdge): edge representing the target of the
|
||||
relationship
|
||||
|
||||
"""
|
||||
|
||||
node: BioCypherNode
|
||||
source_edge: BioCypherEdge
|
||||
target_edge: BioCypherEdge
|
||||
|
||||
def __post_init__(self):
|
||||
if not isinstance(self.node, BioCypherNode):
|
||||
raise TypeError(
|
||||
f"BioCypherRelAsNode.node must be a BioCypherNode, "
|
||||
f"not {type(self.node)}.",
|
||||
)
|
||||
|
||||
if not isinstance(self.source_edge, BioCypherEdge):
|
||||
raise TypeError(
|
||||
f"BioCypherRelAsNode.source_edge must be a BioCypherEdge, "
|
||||
f"not {type(self.source_edge)}.",
|
||||
)
|
||||
|
||||
if not isinstance(self.target_edge, BioCypherEdge):
|
||||
raise TypeError(
|
||||
f"BioCypherRelAsNode.target_edge must be a BioCypherEdge, "
|
||||
f"not {type(self.target_edge)}.",
|
||||
)
|
||||
|
||||
def get_node(self) -> BioCypherNode:
|
||||
return self.node
|
||||
|
||||
def get_source_edge(self) -> BioCypherEdge:
|
||||
return self.source_edge
|
||||
|
||||
def get_target_edge(self) -> BioCypherEdge:
|
||||
return self.target_edge
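To illustrate the handoff dataclasses defined in `_create.py`, a hedged construction example (the IDs, labels, and the IS_SOURCE_OF/IS_TARGET_OF edge labels are hypothetical):

from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode

node = BioCypherNode(node_id="P12345", node_label="protein", properties={"name": "example"})
edge = BioCypherEdge(source_id="P12345", target_id="DOID:1234", relationship_label="PERTURBED_IN_DISEASE")

# a relationship represented as a node, plus its two connecting edges
rel = BioCypherRelAsNode(
    node=BioCypherNode(node_id="intact:1", node_label="post translational interaction"),
    source_edge=BioCypherEdge(source_id="P12345", target_id="intact:1", relationship_label="IS_SOURCE_OF"),
    target_edge=BioCypherEdge(source_id="intact:1", target_id="P67890", relationship_label="IS_TARGET_OF"),
)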
biocypher/_deduplicate.py  (new file, 147 lines)
@@ -0,0 +1,147 @@
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
|
||||
|
||||
class Deduplicator:
|
||||
"""
|
||||
Singleton class responsible for deduplicating BioCypher inputs. Maintains
sets/dictionaries of node and edge types and their unique identifiers.

Node identifiers should be globally unique (represented as a set), while
|
||||
edge identifiers are only unique per edge type (represented as a dict of
|
||||
sets, keyed by edge type).
|
||||
|
||||
Stores collection of duplicate node and edge identifiers and types for
|
||||
troubleshooting and to avoid overloading the log.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.seen_entity_ids = set()
|
||||
self.duplicate_entity_ids = set()
|
||||
|
||||
self.entity_types = set()
|
||||
self.duplicate_entity_types = set()
|
||||
|
||||
self.seen_relationships = {}
|
||||
self.duplicate_relationship_ids = set()
|
||||
self.duplicate_relationship_types = set()
|
||||
|
||||
def node_seen(self, entity: BioCypherNode) -> bool:
|
||||
"""
|
||||
Adds a node to the instance and checks if it has been seen before.
|
||||
|
||||
Args:
|
||||
node: BioCypherNode to be added.
|
||||
|
||||
Returns:
|
||||
True if the node has been seen before, False otherwise.
|
||||
"""
|
||||
if entity.get_label() not in self.entity_types:
|
||||
self.entity_types.add(entity.get_label())
|
||||
|
||||
if entity.get_id() in self.seen_entity_ids:
|
||||
self.duplicate_entity_ids.add(entity.get_id())
|
||||
if entity.get_label() not in self.duplicate_entity_types:
|
||||
logger.warning(
|
||||
f"Duplicate node type {entity.get_label()} found. "
|
||||
)
|
||||
self.duplicate_entity_types.add(entity.get_label())
|
||||
return True
|
||||
|
||||
self.seen_entity_ids.add(entity.get_id())
|
||||
return False
|
||||
|
||||
def edge_seen(self, relationship: BioCypherEdge) -> bool:
|
||||
"""
|
||||
Adds an edge to the instance and checks if it has been seen before.
|
||||
|
||||
Args:
|
||||
edge: BioCypherEdge to be added.
|
||||
|
||||
Returns:
|
||||
True if the edge has been seen before, False otherwise.
|
||||
"""
|
||||
if relationship.get_type() not in self.seen_relationships:
|
||||
self.seen_relationships[relationship.get_type()] = set()
|
||||
|
||||
# concatenate source and target if no id is present
|
||||
if not relationship.get_id():
|
||||
_id = (
|
||||
f"{relationship.get_source_id()}_{relationship.get_target_id()}"
|
||||
)
|
||||
else:
|
||||
_id = relationship.get_id()
|
||||
|
||||
if _id in self.seen_relationships[relationship.get_type()]:
|
||||
self.duplicate_relationship_ids.add(_id)
|
||||
if relationship.get_type() not in self.duplicate_relationship_types:
|
||||
logger.warning(
|
||||
f"Duplicate edge type {relationship.get_type()} found. "
|
||||
)
|
||||
self.duplicate_relationship_types.add(relationship.get_type())
|
||||
return True
|
||||
|
||||
self.seen_relationships[relationship.get_type()].add(_id)
|
||||
return False
|
||||
|
||||
def rel_as_node_seen(self, rel_as_node: BioCypherRelAsNode) -> bool:
|
||||
"""
|
||||
Adds a rel_as_node to the instance (one entity and two relationships)
|
||||
and checks if it has been seen before. Only the node is relevant for
|
||||
identifying the rel_as_node as a duplicate.
|
||||
|
||||
Args:
|
||||
rel_as_node: BioCypherRelAsNode to be added.
|
||||
|
||||
Returns:
|
||||
True if the rel_as_node has been seen before, False otherwise.
|
||||
"""
|
||||
node = rel_as_node.get_node()
|
||||
|
||||
if node.get_label() not in self.seen_relationships:
|
||||
self.seen_relationships[node.get_label()] = set()
|
||||
|
||||
# rel as node always has an id
|
||||
_id = node.get_id()
|
||||
|
||||
if _id in self.seen_relationships[node.get_type()]:
|
||||
self.duplicate_relationship_ids.add(_id)
|
||||
if node.get_type() not in self.duplicate_relationship_types:
|
||||
logger.warning(f"Duplicate edge type {node.get_type()} found. ")
|
||||
self.duplicate_relationship_types.add(node.get_type())
|
||||
return True
|
||||
|
||||
self.seen_relationships[node.get_type()].add(_id)
|
||||
return False
|
||||
|
||||
def get_duplicate_nodes(self):
|
||||
"""
|
||||
Return the duplicate node types and IDs encountered so far.

Returns:
tuple | None: (duplicate node types, duplicate node IDs), or None.
|
||||
"""
|
||||
|
||||
if self.duplicate_entity_types:
|
||||
return (self.duplicate_entity_types, self.duplicate_entity_ids)
|
||||
else:
|
||||
return None
|
||||
|
||||
def get_duplicate_edges(self):
|
||||
"""
|
||||
Return the duplicate edge types and IDs encountered so far.

Returns:
tuple | None: (duplicate edge types, duplicate edge IDs), or None.
|
||||
"""
|
||||
|
||||
if self.duplicate_relationship_types:
|
||||
return (
|
||||
self.duplicate_relationship_types,
|
||||
self.duplicate_relationship_ids,
|
||||
)
|
||||
else:
|
||||
return None
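A brief sketch of how the Deduplicator above is used (the node is a hypothetical example):

from biocypher._create import BioCypherNode
from biocypher._deduplicate import Deduplicator

dedup = Deduplicator()
node = BioCypherNode(node_id="P12345", node_label="protein")
dedup.node_seen(node)        # False: first encounter, ID is recorded
dedup.node_seen(node)        # True: duplicate ID and type are recorded
dedup.get_duplicate_nodes()  # (duplicate types, duplicate IDs) or None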
biocypher/_get.py  (new file, 443 lines)
@@ -0,0 +1,443 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher get module. Used to download and cache data from external sources.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
import shutil
|
||||
|
||||
import requests
|
||||
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from abc import ABC
|
||||
from datetime import datetime, timedelta
|
||||
from tempfile import TemporaryDirectory
|
||||
import os
|
||||
import json
|
||||
import ftplib
|
||||
|
||||
import pooch
|
||||
|
||||
from ._misc import to_list, is_nested
|
||||
|
||||
|
||||
class Resource(ABC):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
url_s: str | list[str],
|
||||
lifetime: int = 0,
|
||||
):
|
||||
"""
|
||||
|
||||
A Resource is a file, a list of files, an API request, or a list of API
|
||||
requests, any of which can be downloaded from the given URL(s) and
|
||||
cached locally. This class captures the minimum information required for a
resource to be consumed by a BioCypher adapter.
|
||||
|
||||
Args:
|
||||
name (str): The name of the resource.
|
||||
|
||||
url_s (str | list[str]): The URL or URLs of the resource.
|
||||
|
||||
lifetime (int): The lifetime of the resource in days. If 0, the
|
||||
resource is considered to be permanent.
|
||||
"""
|
||||
self.name = name
|
||||
self.url_s = url_s
|
||||
self.lifetime = lifetime
|
||||
|
||||
|
||||
class FileDownload(Resource):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
url_s: str | list[str],
|
||||
lifetime: int = 0,
|
||||
is_dir: bool = False,
|
||||
):
|
||||
"""
|
||||
Represents basic information for a File Download.
|
||||
|
||||
Args:
|
||||
name(str): The name of the File Download.
|
||||
|
||||
url_s(str|list[str]): The URL(s) of the File Download.
|
||||
|
||||
lifetime(int): The lifetime of the File Download in days. If 0, the
|
||||
File Download is cached indefinitely.
|
||||
|
||||
is_dir (bool): Whether the URL points to a directory or not.
|
||||
"""
|
||||
|
||||
super().__init__(name, url_s, lifetime)
|
||||
self.is_dir = is_dir
|
||||
|
||||
|
||||
class APIRequest(Resource):
|
||||
def __init__(self, name: str, url_s: str | list[str], lifetime: int = 0):
|
||||
"""
|
||||
Represents basic information for an API Request.
|
||||
|
||||
Args:
|
||||
name(str): The name of the API Request.
|
||||
|
||||
url_s(str|list): The URL of the API endpoint.
|
||||
|
||||
lifetime(int): The lifetime of the API Request in days. If 0, the
|
||||
API Request is cached indefinitely.
|
||||
|
||||
"""
|
||||
super().__init__(name, url_s, lifetime)
|
||||
|
||||
|
||||
class Downloader:
|
||||
def __init__(self, cache_dir: Optional[str] = None) -> None:
|
||||
"""
|
||||
The Downloader is a class that manages resources that can be downloaded
|
||||
and cached locally. It manages the lifetime of downloaded resources by
|
||||
keeping a JSON record of the download date of each resource.
|
||||
|
||||
Args:
|
||||
cache_dir (str): The directory where the resources are cached. If
|
||||
not given, a temporary directory is created.
|
||||
"""
|
||||
self.cache_dir = cache_dir or TemporaryDirectory().name
|
||||
self.cache_file = os.path.join(self.cache_dir, "cache.json")
|
||||
self.cache_dict = self._load_cache_dict()
|
||||
|
||||
def download(self, *resources: Resource):
|
||||
"""
|
||||
Download one or multiple resources. Load from cache if the resource is
|
||||
already downloaded and the cache is not expired.
|
||||
|
||||
Args:
|
||||
resources (Resource): The resource(s) to download or load from
|
||||
cache.
|
||||
|
||||
Returns:
|
||||
list[str]: The path or paths to the resource(s) that were downloaded
|
||||
or loaded from cache.
|
||||
|
||||
"""
|
||||
paths = []
|
||||
for resource in resources:
|
||||
paths.append(self._download_or_cache(resource))
|
||||
|
||||
# flatten list if it is nested
|
||||
if is_nested(paths):
|
||||
paths = [path for sublist in paths for path in sublist]
|
||||
|
||||
return paths
|
||||
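# Usage sketch (illustrative only; the URLs and resource names below are
# placeholders): downloading a file and an API response through the public
# `download` method defined above.
#
# downloader = Downloader(cache_dir="./.cache/biocypher")
# proteins = FileDownload(
#     name="example-proteins",
#     url_s="https://example.org/data/proteins.tsv.gz",
#     lifetime=7,  # re-download after one week
# )
# annotations = APIRequest(
#     name="example-annotations",
#     url_s="https://example.org/api/annotations",
# )
# paths = downloader.download(proteins, annotations)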
|
||||
def _download_or_cache(self, resource: Resource, cache: bool = True):
|
||||
"""
|
||||
Download a resource if it is not cached or exceeded its lifetime.
|
||||
|
||||
Args:
|
||||
resource (Resource): The resource to download.
|
||||
Returns:
|
||||
list[str]: The path or paths to the downloaded resource(s).
|
||||
|
||||
|
||||
"""
|
||||
expired = self._is_cache_expired(resource)
|
||||
|
||||
if expired or not cache:
|
||||
self._delete_expired_cache(resource)
|
||||
if isinstance(resource, FileDownload):
|
||||
logger.info(f"Asking for download of resource {resource.name}.")
|
||||
paths = self._download_files(cache, resource)
|
||||
elif isinstance(resource, APIRequest):
|
||||
logger.info(
|
||||
f"Asking for download of api request {resource.name}."
|
||||
)
|
||||
paths = self._download_api_request(resource)
|
||||
|
||||
else:
|
||||
raise TypeError(f"Unknown resource type: {type(resource)}")
|
||||
|
||||
else:
|
||||
paths = self.get_cached_version(resource)
|
||||
self._update_cache_record(resource)
|
||||
return paths
|
||||
|
||||
def _is_cache_expired(self, resource: Resource) -> bool:
|
||||
"""
|
||||
Check if resource or API request cache is expired.
|
||||
|
||||
Args:
|
||||
|
||||
resource (Resource): The resource or API request to download.
|
||||
|
||||
Returns:
|
||||
bool: True if cache is expired, False if not.
|
||||
"""
|
||||
cache_record = self._get_cache_record(resource)
|
||||
if cache_record:
|
||||
download_time = datetime.strptime(
|
||||
cache_record.get("date_downloaded"), "%Y-%m-%d %H:%M:%S.%f"
|
||||
)
|
||||
lifetime = timedelta(days=resource.lifetime)
|
||||
expired = download_time + lifetime < datetime.now()
|
||||
else:
|
||||
expired = True
|
||||
return expired
|
||||
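# The expiry check above reduces to simple datetime arithmetic; a small
# stand-alone illustration with made-up values:
#
# from datetime import datetime, timedelta
# download_time = datetime(2024, 1, 1, 12, 0, 0)
# lifetime = timedelta(days=7)
# expired = download_time + lifetime < datetime.now()  # True once a week has passed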
|
||||
def _delete_expired_cache(self, resource: Resource):
|
||||
cache_resource_path = os.path.join(self.cache_dir, resource.name)
|
||||
if os.path.exists(cache_resource_path) and os.path.isdir(
|
||||
cache_resource_path
|
||||
):
|
||||
shutil.rmtree(cache_resource_path)
|
||||
|
||||
def _download_files(self, cache, file_download: FileDownload):
|
||||
"""
|
||||
Download a resource given it is a file or a directory and return the
|
||||
path.
|
||||
|
||||
Args:
|
||||
cache (bool): Whether to cache the resource or not.
|
||||
file_download (FileDownload): The resource to download.
|
||||
|
||||
Returns:
|
||||
list[str]: The path or paths to the downloaded resource(s).
|
||||
"""
|
||||
if file_download.is_dir:
|
||||
files = self._get_files(file_download)
|
||||
file_download.url_s = [
|
||||
file_download.url_s + "/" + file for file in files
|
||||
]
|
||||
file_download.is_dir = False
|
||||
paths = self._download_or_cache(file_download, cache)
|
||||
elif isinstance(file_download.url_s, list):
|
||||
paths = []
|
||||
for url in file_download.url_s:
|
||||
fname = url[url.rfind("/") + 1 :].split("?")[0]
|
||||
paths.append(
|
||||
self._retrieve(
|
||||
url=url,
|
||||
fname=fname,
|
||||
path=os.path.join(self.cache_dir, file_download.name),
|
||||
)
|
||||
)
|
||||
else:
|
||||
paths = []
|
||||
fname = file_download.url_s[
|
||||
file_download.url_s.rfind("/") + 1 :
|
||||
].split("?")[0]
|
||||
results = self._retrieve(
|
||||
url=file_download.url_s,
|
||||
fname=fname,
|
||||
path=os.path.join(self.cache_dir, file_download.name),
|
||||
)
|
||||
if isinstance(results, list):
|
||||
paths.extend(results)
|
||||
else:
|
||||
paths.append(results)
|
||||
|
||||
# sometimes a compressed file contains multiple files
|
||||
# TODO ask for a list of files in the archive to be used from the
|
||||
# adapter
|
||||
return paths
|
||||
|
||||
def _download_api_request(self, api_request: APIRequest):
|
||||
"""
|
||||
Download an API request and return the path.
|
||||
|
||||
Args:
|
||||
api_request(APIRequest): The API request result that is being
|
||||
cached.
|
||||
Returns:
|
||||
list[str]: The path to the cached API request.
|
||||
|
||||
"""
|
||||
urls = (
|
||||
api_request.url_s
|
||||
if isinstance(api_request.url_s, list)
|
||||
else [api_request.url_s]
|
||||
)
|
||||
paths = []
|
||||
for url in urls:
|
||||
fname = url[url.rfind("/") + 1 :].rsplit(".", 1)[0]
|
||||
logger.info(
|
||||
f"Asking for caching API of {api_request.name} {fname}."
|
||||
)
|
||||
response = requests.get(url=url)
|
||||
|
||||
if response.status_code != 200:
|
||||
response.raise_for_status()
|
||||
response_data = response.json()
|
||||
api_path = os.path.join(
|
||||
self.cache_dir, api_request.name, f"{fname}.json"
|
||||
)
|
||||
|
||||
os.makedirs(os.path.dirname(api_path), exist_ok=True)
|
||||
with open(api_path, "w") as f:
|
||||
json.dump(response_data, f)
|
||||
logger.info(f"Caching API request to {api_path}.")
|
||||
paths.append(api_path)
|
||||
return paths
|
||||
|
||||
def get_cached_version(self, resource: Resource) -> list[str]:
|
||||
"""Get the cached version of a resource.
|
||||
|
||||
Args:
|
||||
resource(Resource): The resource to get the cached version of.
|
||||
|
||||
Returns:
|
||||
list[str]: The paths to the cached resource(s).
|
||||
|
||||
"""
|
||||
cached_location = os.path.join(self.cache_dir, resource.name)
|
||||
logger.info(f"Use cached version from {cached_location}.")
|
||||
paths = []
|
||||
for file in os.listdir(cached_location):
|
||||
paths.append(os.path.join(cached_location, file))
|
||||
return paths
|
||||
|
||||
def _retrieve(
|
||||
self,
|
||||
url: str,
|
||||
fname: str,
|
||||
path: str,
|
||||
known_hash: str = None,
|
||||
):
|
||||
"""
|
||||
Retrieve a file from a URL using Pooch. Infer type of file from
|
||||
extension and use appropriate processor.
|
||||
|
||||
Args:
|
||||
url (str): The URL to retrieve the file from.
|
||||
|
||||
fname (str): The name of the file.
|
||||
|
||||
path (str): The directory in which to store the file.

known_hash (str): Optional known hash of the downloaded file, passed
through to pooch for verification. Defaults to None.

Returns:
The local path(s) returned by pooch: a single file path, or a list of
paths when an archive is unpacked by a processor.
"""
|
||||
if fname.endswith(".zip"):
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
processor=pooch.Unzip(),
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
elif fname.endswith(".tar.gz"):
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
processor=pooch.Untar(),
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
elif fname.endswith(".gz"):
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
processor=pooch.Decompress(),
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
else:
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
def _get_files(self, file_download: FileDownload):
|
||||
"""
|
||||
Get the files contained in a directory file.
|
||||
|
||||
Args:
|
||||
file_download (FileDownload): The directory file.
|
||||
|
||||
Returns:
|
||||
list: The files contained in the directory.
|
||||
"""
|
||||
if file_download.url_s.startswith("ftp://"):
|
||||
# remove protocol
|
||||
url = file_download.url_s[6:]
|
||||
# get base url
|
||||
url = url[: url.find("/")]
|
||||
# get directory (remove initial slash as well)
|
||||
dir = file_download.url_s[7 + len(url) :]
|
||||
# get files
|
||||
ftp = ftplib.FTP(url)
|
||||
ftp.login()
|
||||
ftp.cwd(dir)
|
||||
files = ftp.nlst()
|
||||
ftp.quit()
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"Only FTP directories are supported at the moment."
|
||||
)
|
||||
|
||||
return files
|
||||
|
||||
def _load_cache_dict(self):
|
||||
"""
|
||||
Load the cache dictionary from the cache file. Create an empty cache
|
||||
file if it does not exist.
|
||||
"""
|
||||
if not os.path.exists(self.cache_dir):
|
||||
logger.info(f"Creating cache directory {self.cache_dir}.")
|
||||
os.makedirs(self.cache_dir)
|
||||
|
||||
if not os.path.exists(self.cache_file):
|
||||
logger.info(f"Creating cache file {self.cache_file}.")
|
||||
with open(self.cache_file, "w") as f:
|
||||
json.dump({}, f)
|
||||
|
||||
with open(self.cache_file, "r") as f:
|
||||
logger.info(f"Loading cache file {self.cache_file}.")
|
||||
return json.load(f)
|
||||
|
||||
def _get_cache_record(self, resource: Resource):
|
||||
"""
|
||||
Get the cache record of a resource.
|
||||
|
||||
Args:
|
||||
resource (Resource): The resource to get the cache record of.
|
||||
|
||||
Returns:
|
||||
The cache record of the resource.
|
||||
"""
|
||||
return self.cache_dict.get(resource.name, {})
|
||||
|
||||
def _update_cache_record(self, resource: Resource):
|
||||
"""
|
||||
Update the cache record of a resource.
|
||||
|
||||
Args:
|
||||
resource (Resource): The resource to update the cache record of.
|
||||
"""
|
||||
cache_record = {}
|
||||
cache_record["url"] = to_list(resource.url_s)
|
||||
cache_record["date_downloaded"] = str(datetime.now())
|
||||
cache_record["lifetime"] = resource.lifetime
|
||||
self.cache_dict[resource.name] = cache_record
|
||||
with open(self.cache_file, "w") as f:
|
||||
json.dump(self.cache_dict, f, default=str)
|
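# For reference, the cache.json maintained by _update_cache_record looks
# roughly like the following (illustrative values only):
#
# {
#     "example-proteins": {
#         "url": ["https://example.org/data/proteins.tsv.gz"],
#         "date_downloaded": "2024-01-01 12:00:00.000000",
#         "lifetime": 7
#     }
# }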
121
biocypher/_logger.py
Normal file
121
biocypher/_logger.py
Normal file
@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Configuration of the module logger.
|
||||
"""
|
||||
|
||||
__all__ = ["get_logger", "log", "logfile"]
|
||||
|
||||
from datetime import datetime
|
||||
import os
|
||||
import pydoc
|
||||
import logging
|
||||
|
||||
from biocypher import _config
|
||||
from biocypher._metadata import __version__
|
||||
|
||||
|
||||
def get_logger(name: str = "biocypher") -> logging.Logger:
|
||||
"""
|
||||
Access the module logger, creating a new one if it does not exist yet.
|
||||
|
||||
Method providing the central logger instance to the main module. It is
called only from the main submodule, :mod:`biocypher.driver`. In child modules,
|
||||
the standard Python logging facility is called
|
||||
(using ``logging.getLogger(__name__)``), automatically inheriting
|
||||
the handlers from the central logger.
|
||||
|
||||
The file handler creates a log file named after the current date and
|
||||
time. Levels to output to file and console can be set here.
|
||||
|
||||
Args:
|
||||
name:
|
||||
Name of the logger instance.
|
||||
|
||||
Returns:
|
||||
An instance of the Python :py:mod:`logging.Logger`.
|
||||
"""
|
||||
|
||||
if not logging.getLogger(name).hasHandlers():
|
||||
# create logger
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.propagate = True
|
||||
|
||||
# formatting
|
||||
file_formatter = logging.Formatter(
|
||||
"%(asctime)s\t%(levelname)s\tmodule:%(module)s\n%(message)s",
|
||||
)
|
||||
stdout_formatter = logging.Formatter("%(levelname)s -- %(message)s")
|
||||
|
||||
# file name and creation
|
||||
now = datetime.now()
|
||||
date_time = now.strftime("%Y%m%d-%H%M%S")
|
||||
|
||||
log_to_disk = _config.config("biocypher").get("log_to_disk")
|
||||
|
||||
if log_to_disk:
|
||||
logdir = (
|
||||
_config.config("biocypher").get("log_directory")
|
||||
or "biocypher-log"
|
||||
)
|
||||
os.makedirs(logdir, exist_ok=True)
|
||||
logfile = os.path.join(logdir, f"biocypher-{date_time}.log")
|
||||
|
||||
# file handler
|
||||
file_handler = logging.FileHandler(logfile)
|
||||
|
||||
if _config.config("biocypher").get("debug"):
|
||||
file_handler.setLevel(logging.DEBUG)
|
||||
else:
|
||||
file_handler.setLevel(logging.INFO)
|
||||
|
||||
file_handler.setFormatter(file_formatter)
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
# handlers
|
||||
# stream handler
|
||||
stdout_handler = logging.StreamHandler()
|
||||
stdout_handler.setLevel(logging.INFO)
|
||||
stdout_handler.setFormatter(stdout_formatter)
|
||||
|
||||
# add handlers
|
||||
logger.addHandler(stdout_handler)
|
||||
|
||||
# startup message
|
||||
logger.info(f"This is BioCypher v{__version__}.")
|
||||
if log_to_disk:
|
||||
logger.info(f"Logging into `{logfile}`.")
|
||||
else:
|
||||
logger.info("Logging into stdout.")
|
||||
|
||||
return logging.getLogger(name)
|
||||
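# As the docstring above notes, child modules do not call get_logger directly;
# a minimal sketch of the intended pattern (module name is an example):
#
# import logging
# logger = logging.getLogger(__name__)  # e.g. "biocypher._ontology"
# logger.info("message routed through the central BioCypher handlers")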
|
||||
|
||||
def logfile() -> str:
|
||||
"""
|
||||
Path to the log file (only available when logging to disk is enabled).
|
||||
"""
|
||||
|
||||
return get_logger().handlers[0].baseFilename
|
||||
|
||||
|
||||
def log():
|
||||
"""
|
||||
Browse the log file.
|
||||
"""
|
||||
|
||||
with open(logfile()) as fp:
|
||||
pydoc.pager(fp.read())
|
||||
|
||||
|
||||
logger = get_logger()
|
307
biocypher/_mapping.py
Normal file
307
biocypher/_mapping.py
Normal file
@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'mapping' module. Handles the mapping of user-defined schema to the
|
||||
underlying ontology.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Optional
|
||||
from urllib.request import urlopen
|
||||
|
||||
import yaml
|
||||
|
||||
from . import _misc
|
||||
from ._config import config as _config
|
||||
|
||||
|
||||
class OntologyMapping:
|
||||
"""
|
||||
Class to store the ontology mapping and extensions.
|
||||
"""
|
||||
|
||||
def __init__(self, config_file: str = None):
|
||||
self.schema = self._read_config(config_file)
|
||||
|
||||
self.extended_schema = self._extend_schema()
|
||||
|
||||
def _read_config(self, config_file: str = None):
|
||||
"""
|
||||
Read the configuration file and store the ontology mapping and extensions.
|
||||
"""
|
||||
if config_file is None:
|
||||
schema_config = {}
|
||||
|
||||
# load yaml file from web
|
||||
elif config_file.startswith("http"):
|
||||
with urlopen(config_file) as f:
|
||||
schema_config = yaml.safe_load(f)
|
||||
|
||||
# get graph state from config (assume file is local)
|
||||
else:
|
||||
with open(config_file, "r") as f:
|
||||
schema_config = yaml.safe_load(f)
|
||||
|
||||
return schema_config
|
||||
|
||||
def _extend_schema(self, d: Optional[dict] = None) -> dict:
|
||||
"""
|
||||
Get leaves of the tree hierarchy from the data structure dict
|
||||
contained in the `schema_config.yaml`. Creates virtual leaves
|
||||
(as children) from entries that provide more than one preferred
|
||||
id type (and corresponding inputs).
|
||||
|
||||
Args:
|
||||
d:
|
||||
Data structure dict from yaml file.
|
||||
|
||||
"""
|
||||
|
||||
d = d or self.schema
|
||||
|
||||
extended_schema = dict()
|
||||
|
||||
# first pass: get parent leaves with direct representation in ontology
|
||||
for k, v in d.items():
|
||||
# k is not an entity
|
||||
if "represented_as" not in v:
|
||||
continue
|
||||
|
||||
# preferred_id optional: if not provided, use `id`
|
||||
if not v.get("preferred_id"):
|
||||
v["preferred_id"] = "id"
|
||||
|
||||
# k is an entity that is present in the ontology
|
||||
if "is_a" not in v:
|
||||
extended_schema[k] = v
|
||||
|
||||
# second pass: "vertical" inheritance
|
||||
d = self._vertical_property_inheritance(d)
|
||||
for k, v in d.items():
|
||||
if "is_a" in v:
|
||||
# prevent loops
|
||||
if k == v["is_a"]:
|
||||
logger.warning(
|
||||
f"Loop detected in ontology mapping: {k} -> {v}. "
|
||||
"Removing item. Please fix the inheritance if you want "
|
||||
"to use this item."
|
||||
)
|
||||
continue
|
||||
|
||||
extended_schema[k] = v
|
||||
|
||||
# "horizontal" inheritance: create siblings for multiple identifiers or
|
||||
# sources -> virtual leaves or implicit children
|
||||
mi_leaves = {}
|
||||
ms_leaves = {}
|
||||
for k, v in d.items():
|
||||
# k is not an entity
|
||||
if "represented_as" not in v:
|
||||
continue
|
||||
|
||||
if isinstance(v.get("preferred_id"), list):
|
||||
mi_leaves = self._horizontal_inheritance_pid(k, v)
|
||||
extended_schema.update(mi_leaves)
|
||||
|
||||
elif isinstance(v.get("source"), list):
|
||||
ms_leaves = self._horizontal_inheritance_source(k, v)
|
||||
extended_schema.update(ms_leaves)
|
||||
|
||||
return extended_schema
|
||||
|
||||
def _vertical_property_inheritance(self, d):
|
||||
"""
|
||||
Inherit properties from parents to children and update `d` accordingly.
|
||||
"""
|
||||
for k, v in d.items():
|
||||
# k is not an entity
|
||||
if "represented_as" not in v:
|
||||
continue
|
||||
|
||||
# k is an entity that is present in the ontology
|
||||
if "is_a" not in v:
|
||||
continue
|
||||
|
||||
# "vertical" inheritance: inherit properties from parent
|
||||
if v.get("inherit_properties", False):
|
||||
# get direct ancestor
|
||||
if isinstance(v["is_a"], list):
|
||||
parent = v["is_a"][0]
|
||||
else:
|
||||
parent = v["is_a"]
|
||||
|
||||
# ensure child has properties and exclude_properties
|
||||
if "properties" not in v:
|
||||
v["properties"] = {}
|
||||
if "exclude_properties" not in v:
|
||||
v["exclude_properties"] = {}
|
||||
|
||||
# update properties of child
|
||||
parent_props = self.schema[parent].get("properties", {})
|
||||
if parent_props:
|
||||
v["properties"].update(parent_props)
|
||||
|
||||
parent_excl_props = self.schema[parent].get(
|
||||
"exclude_properties", {}
|
||||
)
|
||||
if parent_excl_props:
|
||||
v["exclude_properties"].update(parent_excl_props)
|
||||
|
||||
# update schema (d)
|
||||
d[k] = v
|
||||
|
||||
return d
|
||||
|
||||
def _horizontal_inheritance_pid(self, key, value):
|
||||
"""
|
||||
Create virtual leaves for multiple preferred id types or sources.
|
||||
|
||||
If we create virtual leaves, input_label/label_in_input always has to be
|
||||
a list.
|
||||
"""
|
||||
|
||||
leaves = {}
|
||||
|
||||
preferred_id = value["preferred_id"]
|
||||
input_label = value.get("input_label") or value["label_in_input"]
|
||||
represented_as = value["represented_as"]
|
||||
|
||||
# adjust lengths
|
||||
max_l = max(
|
||||
[
|
||||
len(_misc.to_list(preferred_id)),
|
||||
len(_misc.to_list(input_label)),
|
||||
len(_misc.to_list(represented_as)),
|
||||
],
|
||||
)
|
||||
|
||||
# adjust pid length if necessary
|
||||
if isinstance(preferred_id, str):
|
||||
pids = [preferred_id] * max_l
|
||||
else:
|
||||
pids = preferred_id
|
||||
|
||||
# adjust rep length if necessary
|
||||
if isinstance(represented_as, str):
|
||||
reps = [represented_as] * max_l
|
||||
else:
|
||||
reps = represented_as
|
||||
|
||||
for pid, lab, rep in zip(pids, input_label, reps):
|
||||
skey = pid + "." + key
|
||||
svalue = {
|
||||
"preferred_id": pid,
|
||||
"input_label": lab,
|
||||
"represented_as": rep,
|
||||
# mark as virtual
|
||||
"virtual": True,
|
||||
}
|
||||
|
||||
# inherit is_a if exists
|
||||
if "is_a" in value.keys():
|
||||
# treat as multiple inheritance
|
||||
if isinstance(value["is_a"], list):
|
||||
v = list(value["is_a"])
|
||||
v.insert(0, key)
|
||||
svalue["is_a"] = v
|
||||
|
||||
else:
|
||||
svalue["is_a"] = [key, value["is_a"]]
|
||||
|
||||
else:
|
||||
# set parent as is_a
|
||||
svalue["is_a"] = key
|
||||
|
||||
# inherit everything except core attributes
|
||||
for k, v in value.items():
|
||||
if k not in [
|
||||
"is_a",
|
||||
"preferred_id",
|
||||
"input_label",
|
||||
"label_in_input",
|
||||
"represented_as",
|
||||
]:
|
||||
svalue[k] = v
|
||||
|
||||
leaves[skey] = svalue
|
||||
|
||||
return leaves
|
||||
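# Hedged illustration of the virtual leaves created above. Given a schema
# entry such as (example values only):
#
# value = {
#     "represented_as": "node",
#     "preferred_id": ["uniprot", "ensembl"],
#     "input_label": ["uniprot_protein", "ensembl_protein"],
# }
#
# _horizontal_inheritance_pid("protein", value) would yield the keys
# "uniprot.protein" and "ensembl.protein", each marked "virtual": True and
# with "is_a" set to "protein".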
|
||||
def _horizontal_inheritance_source(self, key, value):
|
||||
"""
|
||||
Create virtual leaves for multiple sources.
|
||||
|
||||
If we create virtual leaves, input_label/label_in_input always has to be
|
||||
a list.
|
||||
"""
|
||||
|
||||
leaves = {}
|
||||
|
||||
source = value["source"]
|
||||
input_label = value.get("input_label") or value["label_in_input"]
|
||||
represented_as = value["represented_as"]
|
||||
|
||||
# adjust lengths
|
||||
src_l = len(source)
|
||||
|
||||
# adjust label length if necessary
|
||||
if isinstance(input_label, str):
|
||||
labels = [input_label] * src_l
|
||||
else:
|
||||
labels = input_label
|
||||
|
||||
# adjust rep length if necessary
|
||||
if isinstance(represented_as, str):
|
||||
reps = [represented_as] * src_l
|
||||
else:
|
||||
reps = represented_as
|
||||
|
||||
for src, lab, rep in zip(source, labels, reps):
|
||||
skey = src + "." + key
|
||||
svalue = {
|
||||
"source": src,
|
||||
"input_label": lab,
|
||||
"represented_as": rep,
|
||||
# mark as virtual
|
||||
"virtual": True,
|
||||
}
|
||||
|
||||
# inherit is_a if exists
|
||||
if "is_a" in value.keys():
|
||||
# treat as multiple inheritance
|
||||
if isinstance(value["is_a"], list):
|
||||
v = list(value["is_a"])
|
||||
v.insert(0, key)
|
||||
svalue["is_a"] = v
|
||||
|
||||
else:
|
||||
svalue["is_a"] = [key, value["is_a"]]
|
||||
|
||||
else:
|
||||
# set parent as is_a
|
||||
svalue["is_a"] = key
|
||||
|
||||
# inherit everything except core attributes
|
||||
for k, v in value.items():
|
||||
if k not in [
|
||||
"is_a",
|
||||
"source",
|
||||
"input_label",
|
||||
"label_in_input",
|
||||
"represented_as",
|
||||
]:
|
||||
svalue[k] = v
|
||||
|
||||
leaves[skey] = svalue
|
||||
|
||||
return leaves
|
71
biocypher/_metadata.py
Normal file
71
biocypher/_metadata.py
Normal file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Package metadata (version, authors, etc).
|
||||
"""
|
||||
|
||||
__all__ = ["get_metadata"]
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import importlib.metadata
|
||||
|
||||
import toml
|
||||
|
||||
_VERSION = "0.6.0"
|
||||
|
||||
|
||||
def get_metadata():
|
||||
"""
|
||||
Basic package metadata.
|
||||
|
||||
Retrieves package metadata from the current project directory or from
|
||||
the installed package.
|
||||
"""
|
||||
|
||||
here = pathlib.Path(__file__).parent
|
||||
pyproj_toml = "pyproject.toml"
|
||||
meta = {}
|
||||
|
||||
for project_dir in (here, here.parent):
|
||||
toml_path = str(project_dir.joinpath(pyproj_toml).absolute())
|
||||
|
||||
if os.path.exists(toml_path):
|
||||
pyproject = toml.load(toml_path)
|
||||
|
||||
meta = {
|
||||
"name": pyproject["tool"]["poetry"]["name"],
|
||||
"version": pyproject["tool"]["poetry"]["version"],
|
||||
"author": pyproject["tool"]["poetry"]["authors"],
|
||||
"license": pyproject["tool"]["poetry"]["license"],
|
||||
"full_metadata": pyproject,
|
||||
}
|
||||
|
||||
break
|
||||
|
||||
if not meta:
|
||||
try:
|
||||
meta = {
|
||||
k.lower(): v
|
||||
for k, v in importlib.metadata.metadata(here.name).items()
|
||||
}
|
||||
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
pass
|
||||
|
||||
meta["version"] = meta.get("version", None) or _VERSION
|
||||
|
||||
return meta
|
||||
|
||||
|
||||
metadata = get_metadata()
|
||||
__version__ = metadata.get("version", None)
|
||||
__author__ = metadata.get("author", None)
|
||||
__license__ = metadata.get("license", None)
|
264
biocypher/_misc.py
Normal file
264
biocypher/_misc.py
Normal file
@ -0,0 +1,264 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Handy functions for use in various places.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Union,
|
||||
Mapping,
|
||||
KeysView,
|
||||
Generator,
|
||||
ItemsView,
|
||||
ValuesView,
|
||||
)
|
||||
from collections.abc import Iterable
|
||||
import re
|
||||
|
||||
from treelib import Tree
|
||||
import networkx as nx
|
||||
import stringcase
|
||||
|
||||
__all__ = ["LIST_LIKE", "SIMPLE_TYPES", "ensure_iterable", "to_list"]
|
||||
|
||||
SIMPLE_TYPES = (
|
||||
bytes,
|
||||
str,
|
||||
int,
|
||||
float,
|
||||
bool,
|
||||
type(None),
|
||||
)
|
||||
|
||||
LIST_LIKE = (
|
||||
list,
|
||||
set,
|
||||
tuple,
|
||||
Generator,
|
||||
ItemsView,
|
||||
KeysView,
|
||||
Mapping,
|
||||
ValuesView,
|
||||
)
|
||||
|
||||
|
||||
def to_list(value: Any) -> list:
|
||||
"""
|
||||
Ensures that ``value`` is a list.
|
||||
"""
|
||||
|
||||
if isinstance(value, LIST_LIKE):
|
||||
value = list(value)
|
||||
|
||||
else:
|
||||
value = [value]
|
||||
|
||||
return value
|
||||
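# Doctest-style illustration of to_list (values are examples):
#
# >>> to_list("protein")
# ['protein']
# >>> to_list(("a", "b"))
# ['a', 'b']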
|
||||
|
||||
def ensure_iterable(value: Any) -> Iterable:
|
||||
"""
|
||||
Return list-like values unchanged; wrap anything else (including strings) in a tuple.
|
||||
"""
|
||||
|
||||
return value if isinstance(value, LIST_LIKE) else (value,)
|
||||
|
||||
|
||||
def create_tree_visualisation(inheritance_graph: Union[dict, nx.Graph]) -> Tree:
|
||||
"""
|
||||
Creates a visualisation of the inheritance tree using treelib.
|
||||
"""
|
||||
inheritance_tree = _get_inheritance_tree(inheritance_graph)
|
||||
classes, root = _find_root_node(inheritance_tree)
|
||||
|
||||
tree = Tree()
|
||||
tree.create_node(root, root)
|
||||
while classes:
|
||||
for child in classes:
|
||||
parent = inheritance_tree[child]
|
||||
if parent in tree.nodes.keys() or parent == root:
|
||||
tree.create_node(child, child, parent=parent)
|
||||
|
||||
for node in tree.nodes.keys():
|
||||
if node in classes:
|
||||
classes.remove(node)
|
||||
|
||||
return tree
|
||||
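# Hedged usage sketch for the function above; the node names are invented and
# the edges point from child to parent, as in the ontology graphs used
# elsewhere in this package.
#
# import networkx as nx
# g = nx.DiGraph()
# g.add_edge("protein", "entity")
# g.add_edge("gene", "entity")
# create_tree_visualisation(g).show()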
|
||||
|
||||
def _get_inheritance_tree(inheritance_graph: Union[dict, nx.Graph]) -> dict:
|
||||
"""Transforms an inheritance_graph into an inheritance_tree.
|
||||
|
||||
Args:
|
||||
inheritance_graph: A dict or nx.Graph representing the inheritance graph.
|
||||
|
||||
Returns:
|
||||
A dict representing the inheritance tree.
|
||||
"""
|
||||
if isinstance(inheritance_graph, nx.Graph):
|
||||
inheritance_tree = nx.to_dict_of_lists(inheritance_graph)
|
||||
|
||||
multiple_parents_present = _multiple_inheritance_present(
|
||||
inheritance_tree
|
||||
)
|
||||
if multiple_parents_present:
|
||||
logger.warning(
|
||||
"The ontology contains multiple inheritance (one child node "
|
||||
"has multiple parent nodes). This is not visualized in the "
|
||||
"following hierarchy tree (the child node is only added once). "
|
||||
"If you wish to browse all relationships of the parsed "
|
||||
"ontologies, write a graphml file to disk using "
|
||||
"`to_disk = <directory>` and view this file."
|
||||
)
|
||||
|
||||
# unlist values
|
||||
inheritance_tree = {k: v[0] for k, v in inheritance_tree.items() if v}
|
||||
return inheritance_tree
|
||||
elif not _multiple_inheritance_present(inheritance_graph):
|
||||
return inheritance_graph
|
||||
|
||||
|
||||
def _multiple_inheritance_present(inheritance_tree: dict) -> bool:
|
||||
"""Checks if multiple inheritance is present in the inheritance_tree."""
|
||||
return any(len(value) > 1 for value in inheritance_tree.values())
|
||||
|
||||
|
||||
def _find_root_node(inheritance_tree: dict) -> tuple[set, str]:
|
||||
classes = set(inheritance_tree.keys())
|
||||
parents = set(inheritance_tree.values())
|
||||
root = list(parents - classes)
|
||||
if len(root) > 1:
|
||||
if "entity" in root:
|
||||
root = "entity" # TODO: default: good standard?
|
||||
else:
|
||||
raise ValueError(
|
||||
"Inheritance tree cannot have more than one root node. "
|
||||
f"Found {len(root)}: {root}."
|
||||
)
|
||||
else:
|
||||
root = root[0]
|
||||
if not root:
|
||||
# find key whose value is None
|
||||
root = list(inheritance_tree.keys())[
|
||||
list(inheritance_tree.values()).index(None)
|
||||
]
|
||||
return classes, root
|
||||
|
||||
|
||||
# string conversion, adapted from Biolink Model Toolkit
|
||||
lowercase_pattern = re.compile(r"[a-zA-Z]*[a-z][a-zA-Z]*")
|
||||
underscore_pattern = re.compile(r"(?<!^)(?=[A-Z][a-z])")
|
||||
|
||||
|
||||
def from_pascal(s: str, sep: str = " ") -> str:
|
||||
underscored = underscore_pattern.sub(sep, s)
|
||||
lowercased = lowercase_pattern.sub(
|
||||
lambda match: match.group(0).lower(),
|
||||
underscored,
|
||||
)
|
||||
return lowercased
|
||||
|
||||
|
||||
def pascalcase_to_sentencecase(s: str) -> str:
|
||||
"""
|
||||
Convert PascalCase to sentence case.
|
||||
|
||||
Args:
|
||||
s: Input string in PascalCase
|
||||
|
||||
Returns:
|
||||
string in sentence case form
|
||||
"""
|
||||
return from_pascal(s, sep=" ")
|
||||
|
||||
|
||||
def snakecase_to_sentencecase(s: str) -> str:
|
||||
"""
|
||||
Convert snake_case to sentence case.
|
||||
|
||||
Args:
|
||||
s: Input string in snake_case
|
||||
|
||||
Returns:
|
||||
string in sentence case form
|
||||
"""
|
||||
return stringcase.sentencecase(s).lower()
|
||||
|
||||
|
||||
def sentencecase_to_snakecase(s: str) -> str:
|
||||
"""
|
||||
Convert sentence case to snake_case.
|
||||
|
||||
Args:
|
||||
s: Input string in sentence case
|
||||
|
||||
Returns:
|
||||
string in snake_case form
|
||||
"""
|
||||
return stringcase.snakecase(s).lower()
|
||||
|
||||
|
||||
def sentencecase_to_pascalcase(s: str, sep: str = r"\s") -> str:
|
||||
"""
|
||||
Convert sentence case to PascalCase.
|
||||
|
||||
Args:
|
||||
s: Input string in sentence case
|
||||
|
||||
Returns:
|
||||
string in PascalCase form
|
||||
"""
|
||||
return re.sub(
|
||||
r"(?:^|[" + sep + "])([a-zA-Z])",
|
||||
lambda match: match.group(1).upper(),
|
||||
s,
|
||||
)
|
||||
|
||||
|
||||
def to_lower_sentence_case(s: str) -> str:
|
||||
"""
|
||||
Convert any string to lower sentence case. Works with snake_case,
|
||||
PascalCase, and sentence case.
|
||||
|
||||
Args:
|
||||
s: Input string
|
||||
|
||||
Returns:
|
||||
string in lower sentence case form
|
||||
"""
|
||||
if "_" in s:
|
||||
return snakecase_to_sentencecase(s)
|
||||
elif " " in s:
|
||||
return s.lower()
|
||||
elif s[0].isupper():
|
||||
return pascalcase_to_sentencecase(s)
|
||||
else:
|
||||
return s
|
||||
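# Doctest-style illustration of the case conversions above:
#
# >>> to_lower_sentence_case("PascalCaseLabel")
# 'pascal case label'
# >>> to_lower_sentence_case("snake_case_label")
# 'snake case label'
# >>> sentencecase_to_pascalcase("pascal case label")
# 'PascalCaseLabel'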
|
||||
|
||||
def is_nested(lst) -> bool:
|
||||
"""
|
||||
Check if a list is nested.
|
||||
|
||||
Args:
|
||||
lst (list): The list to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the list is nested, False otherwise.
|
||||
"""
|
||||
for item in lst:
|
||||
if isinstance(item, list):
|
||||
return True
|
||||
return False
|
886
biocypher/_ontology.py
Normal file
886
biocypher/_ontology.py
Normal file
@ -0,0 +1,886 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'ontology' module. Contains classes and functions to handle parsing
|
||||
and representation of single ontologies as well as their hybridisation and
|
||||
other advanced operations.
|
||||
"""
|
||||
import os
|
||||
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from rdflib import Graph
|
||||
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
|
||||
import rdflib
|
||||
import networkx as nx
|
||||
|
||||
from ._misc import (
|
||||
to_list,
|
||||
to_lower_sentence_case,
|
||||
create_tree_visualisation,
|
||||
sentencecase_to_pascalcase,
|
||||
)
|
||||
from ._mapping import OntologyMapping
|
||||
|
||||
|
||||
class OntologyAdapter:
|
||||
"""
|
||||
Class that represents an ontology to be used in the BioCypher framework. Can
read from a variety of formats, such as OWL, RDF/XML, and Turtle (OBO is not
yet supported). The
|
||||
ontology is represented by a networkx.DiGraph object; an RDFlib graph is
|
||||
also kept. By default, the DiGraph reverses the label and identifier of the
|
||||
nodes, such that the node name in the graph is the human-readable label. The
|
||||
edges are oriented from child to parent.
|
||||
Labels are formatted in lower sentence case and underscores are replaced by spaces.
|
||||
Identifiers are taken as defined and the prefixes are removed by default.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ontology_file: str,
|
||||
root_label: str,
|
||||
ontology_file_format: Optional[str] = None,
|
||||
head_join_node_label: Optional[str] = None,
|
||||
merge_nodes: Optional[bool] = True,
|
||||
switch_label_and_id: bool = True,
|
||||
remove_prefixes: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize the OntologyAdapter class.
|
||||
|
||||
Args:
|
||||
ontology_file (str): Path to the ontology file. Can be local or
|
||||
remote.
|
||||
|
||||
root_label (str): The label of the root node in the ontology. In
|
||||
case of a tail ontology, this is the tail join node.
|
||||
|
||||
ontology_file_format (str): The format of the ontology file (e.g. "application/rdf+xml")
|
||||
If format is not passed, it is determined automatically.
|
||||
|
||||
head_join_node_label (str): Optional variable to store the label of the
|
||||
node in the head ontology that should be used to join to the
|
||||
root node of the tail ontology. Defaults to None.
|
||||
|
||||
merge_nodes (bool): If True, head and tail join nodes will be
|
||||
merged, using the label of the head join node. If False, the
|
||||
tail join node will be attached as a child of the head join
|
||||
node.
|
||||
|
||||
switch_label_and_id (bool): If True, the node names in the graph will be
|
||||
the human-readable labels. If False, the node names will be the
|
||||
identifiers. Defaults to True.
|
||||
|
||||
remove_prefixes (bool): If True, the prefixes of the identifiers will
|
||||
be removed. Defaults to True.
|
||||
"""
|
||||
|
||||
logger.info(f"Instantiating OntologyAdapter class for {ontology_file}.")
|
||||
|
||||
self._ontology_file = ontology_file
|
||||
self._root_label = root_label
|
||||
self._format = ontology_file_format
|
||||
self._merge_nodes = merge_nodes
|
||||
self._head_join_node = head_join_node_label
|
||||
self._switch_label_and_id = switch_label_and_id
|
||||
self._remove_prefixes = remove_prefixes
|
||||
|
||||
self._rdf_graph = self._load_rdf_graph(ontology_file)
|
||||
|
||||
self._nx_graph = self._rdf_to_nx(
|
||||
self._rdf_graph, root_label, switch_label_and_id
|
||||
)
|
||||
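# Hedged instantiation sketch (file location and root label are placeholders):
#
# adapter = OntologyAdapter(
#     ontology_file="https://example.org/my_ontology.owl",
#     root_label="my root class",
# )
# nx_graph = adapter.get_nx_graph()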
|
||||
def _rdf_to_nx(
|
||||
self,
|
||||
_rdf_graph: rdflib.Graph,
|
||||
root_label: str,
|
||||
switch_label_and_id: bool,
|
||||
rename_nodes: bool = True,
|
||||
) -> nx.DiGraph:
|
||||
one_to_one_triples, one_to_many_dict = self._get_relevant_rdf_triples(
|
||||
_rdf_graph
|
||||
)
|
||||
nx_graph = self._convert_to_nx(one_to_one_triples, one_to_many_dict)
|
||||
nx_graph = self._add_labels_to_nodes(nx_graph, switch_label_and_id)
|
||||
nx_graph = self._change_nodes_to_biocypher_format(
|
||||
nx_graph, switch_label_and_id, rename_nodes
|
||||
)
|
||||
nx_graph = self._get_all_ancestors(
|
||||
nx_graph, root_label, switch_label_and_id, rename_nodes
|
||||
)
|
||||
return nx.DiGraph(nx_graph)
|
||||
|
||||
def _get_relevant_rdf_triples(self, g: rdflib.Graph) -> tuple:
|
||||
one_to_one_inheritance_graph = self._get_one_to_one_inheritance_triples(
|
||||
g
|
||||
)
|
||||
intersection = self._get_multiple_inheritance_dict(g)
|
||||
return one_to_one_inheritance_graph, intersection
|
||||
|
||||
def _get_one_to_one_inheritance_triples(
|
||||
self, g: rdflib.Graph
|
||||
) -> rdflib.Graph:
|
||||
"""Get the one to one inheritance triples from the RDF graph.
|
||||
|
||||
Args:
|
||||
g (rdflib.Graph): The RDF graph
|
||||
|
||||
Returns:
|
||||
rdflib.Graph: The one to one inheritance graph
|
||||
"""
|
||||
one_to_one_inheritance_graph = Graph()
|
||||
for s, p, o in g.triples((None, rdflib.RDFS.subClassOf, None)):
|
||||
if self.has_label(s, g):
|
||||
one_to_one_inheritance_graph.add((s, p, o))
|
||||
return one_to_one_inheritance_graph
|
||||
|
||||
def _get_multiple_inheritance_dict(self, g: rdflib.Graph) -> dict:
|
||||
"""Get the multiple inheritance dictionary from the RDF graph.
|
||||
|
||||
Args:
|
||||
g (rdflib.Graph): The RDF graph
|
||||
|
||||
Returns:
|
||||
dict: The multiple inheritance dictionary
|
||||
"""
|
||||
multiple_inheritance = g.triples(
|
||||
(None, rdflib.OWL.intersectionOf, None)
|
||||
)
|
||||
intersection = {}
|
||||
for (
|
||||
node,
|
||||
has_multiple_parents,
|
||||
first_node_of_intersection_list,
|
||||
) in multiple_inheritance:
|
||||
parents = self._retrieve_rdf_linked_list(
|
||||
first_node_of_intersection_list
|
||||
)
|
||||
child_name = None
|
||||
for s_, _, _ in g.triples((None, rdflib.RDFS.subClassOf, node)):
|
||||
child_name = s_
|
||||
|
||||
# Handle SNOMED CT post-coordinated expressions
|
||||
if not child_name:
|
||||
for s_, _, _ in g.triples(
|
||||
(None, rdflib.OWL.equivalentClass, node)
|
||||
):
|
||||
child_name = s_
|
||||
|
||||
if child_name:
|
||||
intersection[node] = {
|
||||
"child_name": child_name,
|
||||
"parent_node_names": parents,
|
||||
}
|
||||
return intersection
|
||||
|
||||
def has_label(self, node: rdflib.URIRef, g: rdflib.Graph) -> bool:
|
||||
"""Does the node have a label in g?
|
||||
|
||||
Args:
|
||||
node (rdflib.URIRef): The node to check
|
||||
g (rdflib.Graph): The graph to check in
|
||||
Returns:
|
||||
bool: True if the node has a label, False otherwise
|
||||
"""
|
||||
return (node, rdflib.RDFS.label, None) in g
|
||||
|
||||
def _retrieve_rdf_linked_list(self, subject: rdflib.URIRef) -> list:
|
||||
"""Recursively retrieves a linked list from RDF.
|
||||
Example RDF list with the items [item1, item2]:
|
||||
list_node - first -> item1
|
||||
list_node - rest -> list_node2
|
||||
list_node2 - first -> item2
|
||||
list_node2 - rest -> nil
|
||||
Args:
|
||||
subject (rdflib.URIRef): One list_node of the RDF list
|
||||
Returns:
|
||||
list: The items of the RDF list
|
||||
"""
|
||||
g = self._rdf_graph
|
||||
rdf_list = []
|
||||
for s, p, o in g.triples((subject, rdflib.RDF.first, None)):
|
||||
rdf_list.append(o)
|
||||
for s, p, o in g.triples((subject, rdflib.RDF.rest, None)):
|
||||
if o != rdflib.RDF.nil:
|
||||
rdf_list.extend(self._retrieve_rdf_linked_list(o))
|
||||
return rdf_list
|
||||
|
||||
def _convert_to_nx(
|
||||
self, one_to_one: rdflib.Graph, one_to_many: dict
|
||||
) -> nx.DiGraph:
|
||||
"""Convert the one to one and one to many inheritance graphs to networkx.
|
||||
|
||||
Args:
|
||||
one_to_one (rdflib.Graph): The one to one inheritance graph
|
||||
one_to_many (dict): The one to many inheritance dictionary
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The networkx graph
|
||||
"""
|
||||
nx_graph = rdflib_to_networkx_digraph(
|
||||
one_to_one, edge_attrs=lambda s, p, o: {}, calc_weights=False
|
||||
)
|
||||
for key, value in one_to_many.items():
|
||||
nx_graph.add_edges_from(
|
||||
[
|
||||
(value["child_name"], parent)
|
||||
for parent in value["parent_node_names"]
|
||||
]
|
||||
)
|
||||
if key in nx_graph.nodes:
|
||||
nx_graph.remove_node(key)
|
||||
return nx_graph
|
||||
|
||||
def _add_labels_to_nodes(
|
||||
self, nx_graph: nx.DiGraph, switch_label_and_id: bool
|
||||
) -> nx.DiGraph:
|
||||
"""Add labels to the nodes in the networkx graph.
|
||||
|
||||
Args:
|
||||
nx_graph (nx.DiGraph): The networkx graph
|
||||
switch_label_and_id (bool): If True, id and label are switched
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The networkx graph with labels
|
||||
"""
|
||||
for node in list(nx_graph.nodes):
|
||||
nx_id, nx_label = self._get_nx_id_and_label(
|
||||
node, switch_label_and_id
|
||||
)
|
||||
if nx_id == "none":
|
||||
# remove node if it has no id
|
||||
nx_graph.remove_node(node)
|
||||
continue
|
||||
|
||||
nx_graph.nodes[node]["label"] = nx_label
|
||||
return nx_graph
|
||||
|
||||
def _change_nodes_to_biocypher_format(
|
||||
self,
|
||||
nx_graph: nx.DiGraph,
|
||||
switch_label_and_id: bool,
|
||||
rename_nodes: bool = True,
|
||||
) -> nx.DiGraph:
|
||||
"""Change the nodes in the networkx graph to BioCypher format:
|
||||
- remove the prefix of the identifier
|
||||
- switch id and label
|
||||
- adapt the labels (replace _ with space and convert to lower sentence case)
|
||||
|
||||
Args:
|
||||
nx_graph (nx.DiGraph): The networkx graph
|
||||
switch_label_and_id (bool): If True, id and label are switched
|
||||
rename_nodes (bool): If True, the nodes are renamed
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The networkx ontology graph in BioCypher format
|
||||
"""
|
||||
mapping = {
|
||||
node: self._get_nx_id_and_label(
|
||||
node, switch_label_and_id, rename_nodes
|
||||
)[0]
|
||||
for node in nx_graph.nodes
|
||||
}
|
||||
renamed = nx.relabel_nodes(nx_graph, mapping, copy=False)
|
||||
return renamed
|
||||
|
||||
def _get_all_ancestors(
|
||||
self,
|
||||
renamed: nx.DiGraph,
|
||||
root_label: str,
|
||||
switch_label_and_id: bool,
|
||||
rename_nodes: bool = True,
|
||||
) -> nx.DiGraph:
|
||||
"""Get all ancestors of the root node in the networkx graph.
|
||||
|
||||
Args:
|
||||
renamed (nx.DiGraph): The renamed networkx graph
|
||||
root_label (str): The label of the root node in the ontology
|
||||
switch_label_and_id (bool): If True, id and label are switched
|
||||
rename_nodes (bool): If True, the nodes are renamed
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The filtered networkx graph
|
||||
"""
|
||||
root = self._get_nx_id_and_label(
|
||||
self._find_root_label(self._rdf_graph, root_label),
|
||||
switch_label_and_id,
|
||||
rename_nodes,
|
||||
)[0]
|
||||
ancestors = nx.ancestors(renamed, root)
|
||||
ancestors.add(root)
|
||||
filtered_graph = renamed.subgraph(ancestors)
|
||||
return filtered_graph
|
||||
|
||||
def _get_nx_id_and_label(
|
||||
self, node, switch_id_and_label: bool, rename_nodes: bool = True
|
||||
) -> tuple[str, str]:
|
||||
"""Rename node id and label for nx graph.
|
||||
|
||||
Args:
|
||||
node (str): The node to rename
|
||||
switch_id_and_label (bool): If True, switch id and label
|
||||
|
||||
Returns:
|
||||
tuple[str, str]: The renamed node id and label
|
||||
"""
|
||||
node_id_str = self._remove_prefix(str(node))
|
||||
node_label_str = str(self._rdf_graph.value(node, rdflib.RDFS.label))
|
||||
if rename_nodes:
|
||||
node_label_str = node_label_str.replace("_", " ")
|
||||
node_label_str = to_lower_sentence_case(node_label_str)
|
||||
nx_id = node_label_str if switch_id_and_label else node_id_str
|
||||
nx_label = node_id_str if switch_id_and_label else node_label_str
|
||||
return nx_id, nx_label
|
||||
|
||||
def _find_root_label(self, g, root_label):
|
||||
# Loop through all labels in the ontology
|
||||
for label_subject, _, label_in_ontology in g.triples(
|
||||
(None, rdflib.RDFS.label, None)
|
||||
):
|
||||
# If the label is the root label, set the root node to the label's subject
|
||||
if str(label_in_ontology) == root_label:
|
||||
root = label_subject
|
||||
break
|
||||
else:
|
||||
labels_in_ontology = []
|
||||
for label_subject, _, label_in_ontology in g.triples(
|
||||
(None, rdflib.RDFS.label, None)
|
||||
):
|
||||
labels_in_ontology.append(str(label_in_ontology))
|
||||
raise ValueError(
|
||||
f"Could not find root node with label '{root_label}'. "
|
||||
f"The ontology contains the following labels: {labels_in_ontology}"
|
||||
)
|
||||
return root
|
||||
|
||||
def _remove_prefix(self, uri: str) -> str:
|
||||
"""
|
||||
Remove the prefix of a URI. URIs can contain either "#" or "/" as a
|
||||
separator between the prefix and the local name. The prefix is
|
||||
everything before the last separator.
|
||||
"""
|
||||
if self._remove_prefixes:
|
||||
return uri.rsplit("#", 1)[-1].rsplit("/", 1)[-1]
|
||||
else:
|
||||
return uri
|
||||
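# Illustration of the prefix stripping above (URIs are made-up examples):
#
#   "http://purl.obolibrary.org/obo/GO_0008150"  ->  "GO_0008150"
#   "http://example.org/onto#my_class"           ->  "my_class"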
|
||||
def _load_rdf_graph(self, ontology_file):
|
||||
"""
|
||||
Load the ontology into an RDFlib graph. The ontology file can be in
|
||||
OWL, OBO, or RDF/XML format.
|
||||
"""
|
||||
g = rdflib.Graph()
|
||||
g.parse(ontology_file, format=self._get_format(ontology_file))
|
||||
return g
|
||||
|
||||
def _get_format(self, ontology_file):
|
||||
"""
|
||||
Get the format of the ontology file.
|
||||
"""
|
||||
if self._format:
|
||||
if self._format == "owl":
|
||||
return "application/rdf+xml"
|
||||
elif self._format == "obo":
|
||||
raise NotImplementedError("OBO format not yet supported")
|
||||
elif self._format == "rdf":
|
||||
return "application/rdf+xml"
|
||||
elif self._format == "ttl":
|
||||
return self._format
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Could not determine format of ontology file {ontology_file}"
|
||||
)
|
||||
|
||||
if ontology_file.endswith(".owl"):
|
||||
return "application/rdf+xml"
|
||||
elif ontology_file.endswith(".obo"):
|
||||
raise NotImplementedError("OBO format not yet supported")
|
||||
elif ontology_file.endswith(".rdf"):
|
||||
return "application/rdf+xml"
|
||||
elif ontology_file.endswith(".ttl"):
|
||||
return "ttl"
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Could not determine format of ontology file {ontology_file}"
|
||||
)
|
||||
|
||||
def get_nx_graph(self):
|
||||
"""
|
||||
Get the networkx graph representing the ontology.
|
||||
"""
|
||||
return self._nx_graph
|
||||
|
||||
def get_rdf_graph(self):
|
||||
"""
|
||||
Get the RDFlib graph representing the ontology.
|
||||
"""
|
||||
return self._rdf_graph
|
||||
|
||||
def get_root_node(self):
|
||||
"""
|
||||
Get root node in the ontology.
|
||||
|
||||
Returns:
|
||||
root_node: If _switch_label_and_id is True, the root node label is returned,
|
||||
otherwise the root node id is returned.
|
||||
"""
|
||||
|
||||
root_node = None
|
||||
root_label = self._root_label.replace("_", " ")
|
||||
|
||||
if self._switch_label_and_id:
|
||||
root_node = to_lower_sentence_case(root_label)
|
||||
elif not self._switch_label_and_id:
|
||||
for node, data in self.get_nx_graph().nodes(data=True):
|
||||
if "label" in data and data["label"] == to_lower_sentence_case(
|
||||
root_label
|
||||
):
|
||||
root_node = node
|
||||
break
|
||||
|
||||
return root_node
|
||||
|
||||
def get_ancestors(self, node_label):
|
||||
"""
|
||||
Get the ancestors of a node in the ontology.
|
||||
"""
|
||||
return nx.dfs_preorder_nodes(self._nx_graph, node_label)
|
||||
|
||||
def get_head_join_node(self):
|
||||
"""
|
||||
Get the head join node of the ontology.
|
||||
"""
|
||||
return self._head_join_node
|
||||
|
||||
|
||||
class Ontology:
|
||||
"""
|
||||
A class that represents the ontological "backbone" of a BioCypher knowledge
|
||||
graph. The ontology can be built from a single resource, or hybridised from
|
||||
a combination of resources, with one resource being the "head" ontology,
|
||||
while an arbitrary number of other resources can become "tail" ontologies at
|
||||
arbitrary fusion points inside the "head" ontology.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_ontology: dict,
|
||||
ontology_mapping: Optional["OntologyMapping"] = None,
|
||||
tail_ontologies: Optional[dict] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the Ontology class.
|
||||
|
||||
Args:
|
||||
head_ontology (dict): Definition of the head ontology (url, root node,
and optional format settings).

ontology_mapping (OntologyMapping): The ontology mapping with user
extensions and properties. Defaults to None.

tail_ontologies (dict): Definitions of tail ontologies to be joined to
the head ontology, keyed by name. Defaults to None.
"""
|
||||
|
||||
self._head_ontology_meta = head_ontology
|
||||
self.mapping = ontology_mapping
|
||||
self._tail_ontology_meta = tail_ontologies
|
||||
|
||||
self._tail_ontologies = None
|
||||
self._nx_graph = None
|
||||
|
||||
# keep track of nodes that have been extended
|
||||
self._extended_nodes = set()
|
||||
|
||||
self._main()
|
||||
|
||||
def _main(self) -> None:
|
||||
"""
|
||||
Main method to be run on instantiation. Loads the ontologies, joins
|
||||
them, and returns the hybrid ontology. Loads only the head ontology
|
||||
if nothing else is given. Adds user extensions and properties from
|
||||
the mapping.
|
||||
"""
|
||||
self._load_ontologies()
|
||||
|
||||
if self._tail_ontologies:
|
||||
for adapter in self._tail_ontologies.values():
|
||||
head_join_node = self._get_head_join_node(adapter)
|
||||
self._join_ontologies(adapter, head_join_node)
|
||||
else:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph()
|
||||
|
||||
if self.mapping:
|
||||
self._extend_ontology()
|
||||
|
||||
# experimental: add connections of disjoint classes to entity
|
||||
# self._connect_biolink_classes()
|
||||
|
||||
self._add_properties()
|
||||
|
||||
def _load_ontologies(self) -> None:
|
||||
"""
|
||||
For each ontology, load the OntologyAdapter object and store it as an
|
||||
instance variable (head) or a dictionary (tail).
|
||||
"""
|
||||
|
||||
logger.info("Loading ontologies...")
|
||||
|
||||
self._head_ontology = OntologyAdapter(
|
||||
ontology_file=self._head_ontology_meta["url"],
|
||||
root_label=self._head_ontology_meta["root_node"],
|
||||
ontology_file_format=self._head_ontology_meta.get("format", None),
|
||||
switch_label_and_id=self._head_ontology_meta.get(
|
||||
"switch_label_and_id", True
|
||||
),
|
||||
)
|
||||
|
||||
if self._tail_ontology_meta:
|
||||
self._tail_ontologies = {}
|
||||
for key, value in self._tail_ontology_meta.items():
|
||||
self._tail_ontologies[key] = OntologyAdapter(
|
||||
ontology_file=value["url"],
|
||||
root_label=value["tail_join_node"],
|
||||
head_join_node_label=value["head_join_node"],
|
||||
ontology_file_format=value.get("format", None),
|
||||
merge_nodes=value.get("merge_nodes", True),
|
||||
switch_label_and_id=value.get("switch_label_and_id", True),
|
||||
)
|
||||
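# Hedged example of the dictionaries consumed above; URLs and labels are
# placeholders, mirroring the keys accessed in _load_ontologies.
#
# head_ontology = {
#     "url": "https://example.org/biolink-model.owl.ttl",
#     "root_node": "entity",
#     "format": "ttl",
# }
# tail_ontologies = {
#     "so": {
#         "url": "https://example.org/so.owl",
#         "tail_join_node": "sequence variant",
#         "head_join_node": "biological entity",
#         "merge_nodes": True,
#     },
# }
# ontology = Ontology(head_ontology, tail_ontologies=tail_ontologies)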
|
||||
def _get_head_join_node(self, adapter: OntologyAdapter) -> str:
|
||||
"""
|
||||
Tries to find the head join node of the given ontology adapter in the
|
||||
head ontology. If the join node is not found, the method will raise an
|
||||
error.
|
||||
|
||||
Args:
|
||||
adapter (OntologyAdapter): The ontology adapter of which to find the
|
||||
join node in the head ontology.
|
||||
"""
|
||||
|
||||
head_join_node = None
|
||||
user_defined_head_join_node_label = adapter.get_head_join_node()
|
||||
head_join_node_label_in_bc_format = to_lower_sentence_case(
|
||||
user_defined_head_join_node_label.replace("_", " ")
|
||||
)
|
||||
|
||||
if self._head_ontology._switch_label_and_id:
|
||||
head_join_node = head_join_node_label_in_bc_format
|
||||
elif not self._head_ontology._switch_label_and_id:
|
||||
for node_id, data in self._head_ontology.get_nx_graph().nodes(
|
||||
data=True
|
||||
):
|
||||
if (
|
||||
"label" in data
|
||||
and data["label"] == head_join_node_label_in_bc_format
|
||||
):
|
||||
head_join_node = node_id
|
||||
break
|
||||
|
||||
if head_join_node not in self._head_ontology.get_nx_graph().nodes:
|
||||
head_ontology = self._head_ontology._rdf_to_nx(
|
||||
self._head_ontology.get_rdf_graph(),
|
||||
self._head_ontology._root_label,
|
||||
self._head_ontology._switch_label_and_id,
|
||||
rename_nodes=False,
|
||||
)
|
||||
raise ValueError(
|
||||
f"Head join node '{head_join_node}' not found in head ontology. "
|
||||
f"The head ontology contains the following nodes: {head_ontology.nodes}."
|
||||
)
|
||||
return head_join_node
|
||||
|
||||
def _join_ontologies(
|
||||
self, adapter: OntologyAdapter, head_join_node
|
||||
) -> None:
|
||||
"""
|
||||
Joins the ontologies by adding the tail ontology as a subgraph to the
|
||||
head ontology at the specified join nodes.
|
||||
|
||||
Args:
|
||||
adapter (OntologyAdapter): The ontology adapter of the tail ontology
|
||||
to be added to the head ontology.
|
||||
"""
|
||||
|
||||
if not self._nx_graph:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
||||
|
||||
tail_join_node = adapter.get_root_node()
|
||||
tail_ontology = adapter.get_nx_graph()
|
||||
|
||||
# subtree of tail ontology at join node
|
||||
tail_ontology_subtree = nx.dfs_tree(
|
||||
tail_ontology.reverse(), tail_join_node
|
||||
).reverse()
|
||||
|
||||
# transfer node attributes from tail ontology to subtree
|
||||
for node in tail_ontology_subtree.nodes:
|
||||
tail_ontology_subtree.nodes[node].update(tail_ontology.nodes[node])
|
||||
|
||||
# if merge_nodes is False, create parent of tail join node from head
|
||||
# join node
|
||||
if not adapter._merge_nodes:
|
||||
# add head join node from head ontology to tail ontology subtree
|
||||
# as parent of tail join node
|
||||
tail_ontology_subtree.add_node(
|
||||
head_join_node,
|
||||
**self._head_ontology.get_nx_graph().nodes[head_join_node],
|
||||
)
|
||||
tail_ontology_subtree.add_edge(tail_join_node, head_join_node)
|
||||
|
||||
# else rename tail join node to match head join node if necessary
|
||||
elif not tail_join_node == head_join_node:
|
||||
tail_ontology_subtree = nx.relabel_nodes(
|
||||
tail_ontology_subtree, {tail_join_node: head_join_node}
|
||||
)
|
||||
|
||||
# combine head ontology and tail subtree
|
||||
self._nx_graph = nx.compose(self._nx_graph, tail_ontology_subtree)
|
||||
|
||||
def _extend_ontology(self) -> None:
|
||||
"""
|
||||
Adds the user extensions to the ontology. Tries to find the parent in
|
||||
the ontology, adds it if necessary, and adds the child and a directed
|
||||
edge from child to parent. Can handle multiple parents.
|
||||
"""
|
||||
|
||||
if not self._nx_graph:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
||||
|
||||
for key, value in self.mapping.extended_schema.items():
|
||||
if not value.get("is_a"):
|
||||
if self._nx_graph.has_node(value.get("synonym_for")):
|
||||
continue
|
||||
|
||||
if not self._nx_graph.has_node(key):
|
||||
raise ValueError(
|
||||
f"Node {key} not found in ontology, but also has no "
|
||||
"inheritance definition. Please check your schema for "
|
||||
"spelling errors, capitalised first letters, use of underscores, "
|
||||
"a missing `is_a` definition (SubClassOf a root node), or missing "
|
||||
"labels in the class or its superclasses."
|
||||
)
|
||||
|
||||
continue
|
||||
|
||||
parents = to_list(value.get("is_a"))
|
||||
child = key
|
||||
|
||||
while parents:
|
||||
parent = parents.pop(0)
|
||||
|
||||
if parent not in self._nx_graph.nodes:
|
||||
self._nx_graph.add_node(parent)
|
||||
self._nx_graph.nodes[parent][
|
||||
"label"
|
||||
] = sentencecase_to_pascalcase(parent)
|
||||
|
||||
# mark parent as user extension
|
||||
self._nx_graph.nodes[parent]["user_extension"] = True
|
||||
self._extended_nodes.add(parent)
|
||||
|
||||
if child not in self._nx_graph.nodes:
|
||||
self._nx_graph.add_node(child)
|
||||
self._nx_graph.nodes[child][
|
||||
"label"
|
||||
] = sentencecase_to_pascalcase(child)
|
||||
|
||||
# mark child as user extension
|
||||
self._nx_graph.nodes[child]["user_extension"] = True
|
||||
self._extended_nodes.add(child)
|
||||
|
||||
self._nx_graph.add_edge(child, parent)
|
||||
|
||||
child = parent
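# Worked sketch of the extension loop above (hypothetical schema entry):
# given `{"adverse event": {"is_a": ["clinical finding", "entity"], ...}}`,
# the loop adds the missing nodes "adverse event" and "clinical finding"
# (PascalCase labels "AdverseEvent" and "ClinicalFinding"), flags them with
# `user_extension=True`, and chains the edges
# "adverse event" -> "clinical finding" -> "entity".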
|
||||
|
||||
def _connect_biolink_classes(self) -> None:
|
||||
"""
|
||||
Experimental: Adds edges from disjoint classes to the entity node.
|
||||
"""
|
||||
|
||||
if not self._nx_graph:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
||||
|
||||
if "entity" not in self._nx_graph.nodes:
|
||||
return
|
||||
|
||||
# biolink classes that are disjoint from entity
|
||||
disjoint_classes = [
|
||||
"frequency qualifier mixin",
|
||||
"chemical entity to entity association mixin",
|
||||
"ontology class",
|
||||
"relationship quantifier",
|
||||
"physical essence or occurrent",
|
||||
"gene or gene product",
|
||||
"subject of investigation",
|
||||
]
|
||||
|
||||
for node in disjoint_classes:
|
||||
if not self._nx_graph.nodes.get(node):
|
||||
self._nx_graph.add_node(node)
|
||||
self._nx_graph.nodes[node][
|
||||
"label"
|
||||
] = sentencecase_to_pascalcase(node)
|
||||
|
||||
self._nx_graph.add_edge(node, "entity")
|
||||
|
||||
def _add_properties(self) -> None:
|
||||
"""
|
||||
For each entity in the mapping, update the ontology with the properties
|
||||
specified in the mapping. Updates synonym information in the graph,
|
||||
setting the synonym as the primary node label.
|
||||
"""
|
||||
|
||||
for key, value in self.mapping.extended_schema.items():
|
||||
if key in self._nx_graph.nodes:
|
||||
self._nx_graph.nodes[key].update(value)
|
||||
|
||||
if value.get("synonym_for"):
|
||||
# change node label to synonym
|
||||
if value["synonym_for"] not in self._nx_graph.nodes:
|
||||
raise ValueError(
|
||||
f'Node {value["synonym_for"]} not found in ontology.'
|
||||
)
|
||||
|
||||
self._nx_graph = nx.relabel_nodes(
|
||||
self._nx_graph, {value["synonym_for"]: key}
|
||||
)
|
||||
|
||||
def get_ancestors(self, node_label: str) -> list:
|
||||
"""
|
||||
Get the ancestors of a node in the ontology.
|
||||
|
||||
Args:
|
||||
node_label (str): The label of the node in the ontology.
|
||||
|
||||
Returns:
|
||||
networkx.DiGraph: A directed tree (as returned by ``nx.dfs_tree``) containing the node and its ancestors.
|
||||
"""
|
||||
return nx.dfs_tree(self._nx_graph, node_label)
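# Illustrative usage (hypothetical node label; the return value is the
# networkx tree produced by `nx.dfs_tree`, not a plain list):
#
#     ancestors = ontology.get_ancestors("protein")
#     list(ancestors.nodes)
#     # e.g. ["protein", "polypeptide", "biological entity", ...]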
|
||||
|
||||
def show_ontology_structure(self, to_disk: str = None, full: bool = False):
|
||||
"""
|
||||
Show the ontology structure using treelib or write to GRAPHML file.
|
||||
|
||||
Args:
|
||||
|
||||
to_disk (str): If specified, the ontology structure will be saved
|
||||
to disk as a GRAPHML file at the location (directory) specified
|
||||
by the `to_disk` string, to be opened in your favourite graph
|
||||
visualisation tool.
|
||||
|
||||
full (bool): If True, the full ontology structure will be shown,
|
||||
including all nodes and edges. If False, only the nodes and
|
||||
edges that are relevant to the extended schema will be shown.
|
||||
"""
|
||||
|
||||
if not full and not self.mapping.extended_schema:
|
||||
raise ValueError(
|
||||
"You are attempting to visualise a subset of the loaded "
|
||||
"ontology, but have not provided a schema configuration. "
|
||||
"To display a partial ontology graph, please provide a schema "
|
||||
"configuration file; to visualise the full graph, please use "
|
||||
"the parameter `full = True`."
|
||||
)
|
||||
|
||||
if not self._nx_graph:
|
||||
raise ValueError("Ontology not loaded.")
|
||||
|
||||
if not self._tail_ontologies:
|
||||
msg = f"Showing ontology structure based on {self._head_ontology._ontology_file}"
|
||||
|
||||
else:
|
||||
msg = f"Showing ontology structure based on {len(self._tail_ontology_meta)+1} ontologies: "
|
||||
|
||||
logger.info(msg)
|
||||
|
||||
if not full:
|
||||
# set of leaves and their intermediate parents up to the root
|
||||
filter_nodes = set(self.mapping.extended_schema.keys())
|
||||
|
||||
for node in self.mapping.extended_schema.keys():
|
||||
filter_nodes.update(self.get_ancestors(node).nodes)
|
||||
|
||||
# filter graph
|
||||
G = self._nx_graph.subgraph(filter_nodes)
|
||||
|
||||
else:
|
||||
G = self._nx_graph
|
||||
|
||||
if not to_disk:
|
||||
# create tree
|
||||
tree = create_tree_visualisation(G)
|
||||
|
||||
# add synonym information
|
||||
for node in self.mapping.extended_schema:
|
||||
if not isinstance(self.mapping.extended_schema[node], dict):
|
||||
continue
|
||||
if self.mapping.extended_schema[node].get("synonym_for"):
|
||||
tree.nodes[node].tag = (
|
||||
f"{node} = "
|
||||
f"{self.mapping.extended_schema[node].get('synonym_for')}"
|
||||
)
|
||||
|
||||
logger.info(f"\n{tree}")
|
||||
|
||||
return tree
|
||||
|
||||
else:
|
||||
# convert lists/dicts to strings for vis only
|
||||
for node in G.nodes:
|
||||
# rename node and use former id as label
|
||||
label = G.nodes[node].get("label")
|
||||
|
||||
if not label:
|
||||
label = node
|
||||
|
||||
G = nx.relabel_nodes(G, {node: label})
|
||||
G.nodes[label]["label"] = node
|
||||
|
||||
for attrib in G.nodes[label]:
|
||||
if isinstance(G.nodes[label][attrib], (list, dict)):
|
||||
G.nodes[label][attrib] = str(G.nodes[label][attrib])
|
||||
|
||||
path = os.path.join(to_disk, "ontology_structure.graphml")
|
||||
|
||||
logger.info(f"Writing ontology structure to {path}.")
|
||||
|
||||
nx.write_graphml(G, path)
|
||||
|
||||
return True
|
||||
|
||||
def get_dict(self) -> dict:
|
||||
"""
|
||||
Returns a dictionary compatible with a BioCypher node for compatibility
|
||||
with the Neo4j driver.
|
||||
"""
|
||||
|
||||
d = {
|
||||
"node_id": self._get_current_id(),
|
||||
"node_label": "BioCypher",
|
||||
"properties": {
|
||||
"schema": "self.ontology_mapping.extended_schema",
|
||||
},
|
||||
}
|
||||
|
||||
return d
|
||||
|
||||
def _get_current_id(self):
|
||||
"""
|
||||
Instantiate a version ID for the current session. For now does simple
|
||||
versioning using datetime.
|
||||
|
||||
Can later implement incremental versioning, versioning from
|
||||
config file, or manual specification via argument.
|
||||
"""
|
||||
|
||||
now = datetime.now()
|
||||
return now.strftime("v%Y%m%d-%H%M%S")
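# Worked example of the identifier format produced above:
#
#     from datetime import datetime
#     datetime(2021, 6, 23, 9, 30, 5).strftime("v%Y%m%d-%H%M%S")
#     # -> 'v20210623-093005'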
|
480
biocypher/_translate.py
Normal file
@ -0,0 +1,480 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'translation' module. Responsible for translating between the raw
|
||||
input data and the BioCypherNode and BioCypherEdge objects.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Any, Union, Optional
|
||||
from collections.abc import Iterable, Generator
|
||||
|
||||
from more_itertools import peekable
|
||||
|
||||
from . import _misc
|
||||
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
from ._ontology import Ontology
|
||||
|
||||
__all__ = ["BiolinkAdapter", "Translator"]
|
||||
|
||||
|
||||
class Translator:
|
||||
"""
|
||||
Class responsible for executing the translation process that is configured in
|
||||
the schema_config.yaml file. Creates a mapping dictionary from that file,
|
||||
and, given nodes and edges, translates them into BioCypherNodes and
|
||||
BioCypherEdges. During this process, can also filter the properties of the
|
||||
entities if the schema_config.yaml file specifies a property whitelist or
|
||||
blacklist.
|
||||
|
||||
Provides utility functions for translating between input and output labels
|
||||
and cypher queries.
|
||||
"""
|
||||
|
||||
def __init__(self, ontology: "Ontology", strict_mode: bool = False):
|
||||
"""
|
||||
Args:
|
||||
ontology:
|
||||
Instance of :py:class:`Ontology` describing the hierarchy of the
|
||||
graph; its leaves are the entities that will be direct components
|
||||
of the graph, while the intermediary nodes are additional labels
|
||||
for filtering purposes.
|
||||
strict_mode:
|
||||
If True, the translator will raise an error if input data do not
|
||||
carry source, licence, and version information.
|
||||
"""
|
||||
|
||||
self.ontology = ontology
|
||||
self.strict_mode = strict_mode
|
||||
|
||||
# record nodes without biolink type configured in schema_config.yaml
|
||||
self.notype = {}
|
||||
|
||||
# mapping functionality for translating terms and queries
|
||||
self.mappings = {}
|
||||
self.reverse_mappings = {}
|
||||
|
||||
self._update_ontology_types()
|
||||
|
||||
def translate_nodes(
|
||||
self,
|
||||
node_tuples: Iterable,
|
||||
) -> Generator[BioCypherNode, None, None]:
|
||||
"""
|
||||
Translates input node representation to a representation that
|
||||
conforms to the schema of the given BioCypher graph. For now
|
||||
requires explicit statement of node type on pass.
|
||||
|
||||
Args:
|
||||
node_tuples (list of tuples): collection of tuples
|
||||
representing individual nodes by their unique id and a type
|
||||
that is translated from the original database notation to
|
||||
the corresponding BioCypher notation.
|
||||
|
||||
"""
|
||||
|
||||
self._log_begin_translate(node_tuples, "nodes")
|
||||
|
||||
for _id, _type, _props in node_tuples:
|
||||
# check for strict mode requirements
|
||||
required_props = ["source", "licence", "version"]
|
||||
|
||||
if self.strict_mode:
|
||||
# rename 'license' to 'licence' in _props
|
||||
if _props.get("license"):
|
||||
_props["licence"] = _props.pop("license")
|
||||
|
||||
for prop in required_props:
|
||||
if prop not in _props:
|
||||
raise ValueError(
|
||||
f"Property `{prop}` missing from node {_id}. "
|
||||
"Strict mode is enabled, so this is not allowed."
|
||||
)
|
||||
|
||||
# find the node in leaves that represents ontology node type
|
||||
_ontology_class = self._get_ontology_mapping(_type)
|
||||
|
||||
if _ontology_class:
|
||||
# filter properties for those specified in schema_config if any
|
||||
_filtered_props = self._filter_props(_ontology_class, _props)
|
||||
|
||||
# preferred id
|
||||
_preferred_id = self._get_preferred_id(_ontology_class)
|
||||
|
||||
yield BioCypherNode(
|
||||
node_id=_id,
|
||||
node_label=_ontology_class,
|
||||
preferred_id=_preferred_id,
|
||||
properties=_filtered_props,
|
||||
)
|
||||
|
||||
else:
|
||||
self._record_no_type(_type, _id)
|
||||
|
||||
self._log_finish_translate("nodes")
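# Illustrative usage sketch (assumes a schema entry mapping the input label
# "uniprot_protein" to the ontology class "protein"):
#
#     nodes = translator.translate_nodes(
#         [("P05067", "uniprot_protein", {"genesymbol": "APP"})]
#     )
#     biocypher_node = next(nodes)  # BioCypherNode labelled "protein"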
|
||||
|
||||
def _get_preferred_id(self, _bl_type: str) -> str:
|
||||
"""
|
||||
Returns the preferred id for the given Biolink type.
|
||||
"""
|
||||
|
||||
return (
|
||||
self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
|
||||
if "preferred_id"
|
||||
in self.ontology.mapping.extended_schema.get(_bl_type, {})
|
||||
else "id"
|
||||
)
|
||||
|
||||
def _filter_props(self, bl_type: str, props: dict) -> dict:
|
||||
"""
|
||||
Filters properties for those specified in schema_config if any.
|
||||
"""
|
||||
|
||||
filter_props = self.ontology.mapping.extended_schema[bl_type].get(
|
||||
"properties", {}
|
||||
)
|
||||
|
||||
# strict mode: add required properties (only if there is a whitelist)
|
||||
if self.strict_mode and filter_props:
|
||||
filter_props.update(
|
||||
{"source": "str", "licence": "str", "version": "str"},
|
||||
)
|
||||
|
||||
exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
|
||||
"exclude_properties", []
|
||||
)
|
||||
|
||||
if isinstance(exclude_props, str):
|
||||
exclude_props = [exclude_props]
|
||||
|
||||
if filter_props and exclude_props:
|
||||
filtered_props = {
|
||||
k: v
|
||||
for k, v in props.items()
|
||||
if (k in filter_props.keys() and k not in exclude_props)
|
||||
}
|
||||
|
||||
elif filter_props:
|
||||
filtered_props = {
|
||||
k: v for k, v in props.items() if k in filter_props.keys()
|
||||
}
|
||||
|
||||
elif exclude_props:
|
||||
filtered_props = {
|
||||
k: v for k, v in props.items() if k not in exclude_props
|
||||
}
|
||||
|
||||
else:
|
||||
return props
|
||||
|
||||
missing_props = [
|
||||
k for k in filter_props.keys() if k not in filtered_props.keys()
|
||||
]
|
||||
# add missing properties with default values
|
||||
for k in missing_props:
|
||||
filtered_props[k] = None
|
||||
|
||||
return filtered_props
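# Worked sketch of the filtering above (hypothetical schema values, strict
# mode off): with `properties: {"name": "str", "sequence": "str"}` and
# `exclude_properties: "taxon"`, the input
# `{"name": "APP", "taxon": 9606, "length": 770}` is reduced to
# `{"name": "APP", "sequence": None}`; "taxon" and "length" are dropped, and
# the missing whitelisted key "sequence" is padded with None.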
|
||||
|
||||
def translate_edges(
|
||||
self,
|
||||
edge_tuples: Iterable,
|
||||
) -> Generator[Union[BioCypherEdge, BioCypherRelAsNode], None, None]:
|
||||
"""
|
||||
Translates input edge representation to a representation that
|
||||
conforms to the schema of the given BioCypher graph. For now
|
||||
requires explicit statement of edge type on pass.
|
||||
|
||||
Args:
|
||||
|
||||
edge_tuples (list of tuples):
|
||||
|
||||
collection of tuples representing source and target of
|
||||
an interaction via their unique ids as well as the type
|
||||
of interaction in the original database notation, which
|
||||
is translated to BioCypher notation using the ontology mapping.
|
||||
Can optionally possess its own ID.
|
||||
"""
|
||||
|
||||
self._log_begin_translate(edge_tuples, "edges")
|
||||
|
||||
# legacy: deal with 4-tuples (no edge id)
|
||||
# TODO remove for performance reasons once safe
|
||||
edge_tuples = peekable(edge_tuples)
|
||||
if len(edge_tuples.peek()) == 4:
|
||||
edge_tuples = [
|
||||
(None, src, tar, typ, props)
|
||||
for src, tar, typ, props in edge_tuples
|
||||
]
|
||||
|
||||
for _id, _src, _tar, _type, _props in edge_tuples:
|
||||
# check for strict mode requirements
|
||||
if self.strict_mode:
|
||||
if "source" not in _props:
|
||||
raise ValueError(
|
||||
f"Edge {_id if _id else (_src, _tar)} does not have a "
|
||||
"`source` property. This is required in strict mode."
|
||||
)
|
||||
if "licence" not in _props:
|
||||
raise ValueError(
|
||||
f"Edge {_id if _id else (_src, _tar)} does not have a "
|
||||
"`licence` property. This is required in strict mode."
|
||||
)
|
||||
|
||||
# match the input label (_type) to
|
||||
# a Biolink label from schema_config
|
||||
bl_type = self._get_ontology_mapping(_type)
|
||||
|
||||
if bl_type:
|
||||
# filter properties for those specified in schema_config if any
|
||||
_filtered_props = self._filter_props(bl_type, _props)
|
||||
|
||||
rep = self.ontology.mapping.extended_schema[bl_type][
|
||||
"represented_as"
|
||||
]
|
||||
|
||||
if rep == "node":
|
||||
if _id:
|
||||
# if it brings its own ID, use it
|
||||
node_id = _id
|
||||
|
||||
else:
|
||||
# source target concat
|
||||
node_id = (
|
||||
str(_src)
|
||||
+ "_"
|
||||
+ str(_tar)
|
||||
+ "_"
|
||||
+ "_".join(str(v) for v in _filtered_props.values())
|
||||
)
|
||||
|
||||
n = BioCypherNode(
|
||||
node_id=node_id,
|
||||
node_label=bl_type,
|
||||
properties=_filtered_props,
|
||||
)
|
||||
|
||||
# directionality check TODO generalise to account for
|
||||
# different descriptions of directionality or find a
|
||||
# more consistent solution for indicating directionality
|
||||
if _filtered_props.get("directed") is True:
|
||||
l1 = "IS_SOURCE_OF"
|
||||
l2 = "IS_TARGET_OF"
|
||||
|
||||
elif _filtered_props.get(
|
||||
"src_role",
|
||||
) and _filtered_props.get("tar_role"):
|
||||
l1 = _filtered_props.get("src_role")
|
||||
l2 = _filtered_props.get("tar_role")
|
||||
|
||||
else:
|
||||
l1 = l2 = "IS_PART_OF"
|
||||
|
||||
e_s = BioCypherEdge(
|
||||
source_id=_src,
|
||||
target_id=node_id,
|
||||
relationship_label=l1,
|
||||
# additional here
|
||||
)
|
||||
|
||||
e_t = BioCypherEdge(
|
||||
source_id=_tar,
|
||||
target_id=node_id,
|
||||
relationship_label=l2,
|
||||
# additional here
|
||||
)
|
||||
|
||||
yield BioCypherRelAsNode(n, e_s, e_t)
|
||||
|
||||
else:
|
||||
edge_label = self.ontology.mapping.extended_schema[
|
||||
bl_type
|
||||
].get("label_as_edge")
|
||||
|
||||
if edge_label is None:
|
||||
edge_label = bl_type
|
||||
|
||||
yield BioCypherEdge(
|
||||
relationship_id=_id,
|
||||
source_id=_src,
|
||||
target_id=_tar,
|
||||
relationship_label=edge_label,
|
||||
properties=_filtered_props,
|
||||
)
|
||||
|
||||
else:
|
||||
self._record_no_type(_type, (_src, _tar))
|
||||
|
||||
self._log_finish_translate("edges")
|
||||
|
||||
def _record_no_type(self, _type: Any, what: Any) -> None:
|
||||
"""
|
||||
Records the type of a node or edge that is not represented in the
|
||||
schema_config.
|
||||
"""
|
||||
|
||||
logger.debug(f"No ontology type defined for `{_type}`: {what}")
|
||||
|
||||
if self.notype.get(_type, None):
|
||||
self.notype[_type] += 1
|
||||
|
||||
else:
|
||||
self.notype[_type] = 1
|
||||
|
||||
def get_missing_biolink_types(self) -> dict:
|
||||
"""
|
||||
Returns a dictionary of types that were not represented in the
|
||||
schema_config.
|
||||
"""
|
||||
|
||||
return self.notype
|
||||
|
||||
@staticmethod
|
||||
def _log_begin_translate(_input: Iterable, what: str):
|
||||
n = f"{len(_input)} " if hasattr(_input, "__len__") else ""
|
||||
|
||||
logger.debug(f"Translating {n}{what} to BioCypher")
|
||||
|
||||
@staticmethod
|
||||
def _log_finish_translate(what: str):
|
||||
logger.debug(f"Finished translating {what} to BioCypher.")
|
||||
|
||||
def _update_ontology_types(self):
|
||||
"""
|
||||
Creates a dictionary to translate from input labels to ontology labels.
|
||||
|
||||
If an entity has multiple input labels, a mapping is created for each of them.
|
||||
"""
|
||||
|
||||
self._ontology_mapping = {}
|
||||
|
||||
for key, value in self.ontology.mapping.extended_schema.items():
|
||||
labels = value.get("input_label") or value.get("label_in_input")
|
||||
|
||||
if isinstance(labels, str):
|
||||
self._ontology_mapping[labels] = key
|
||||
|
||||
elif isinstance(labels, list):
|
||||
for label in labels:
|
||||
self._ontology_mapping[label] = key
|
||||
|
||||
if value.get("label_as_edge"):
|
||||
self._add_translation_mappings(labels, value["label_as_edge"])
|
||||
|
||||
else:
|
||||
self._add_translation_mappings(labels, key)
|
||||
|
||||
def _get_ontology_mapping(self, label: str) -> Optional[str]:
|
||||
"""
|
||||
For each given input type ("input_label" or "label_in_input"), find the
|
||||
corresponding ontology class in the leaves dictionary (from the
|
||||
`schema_config.yaml`).
|
||||
|
||||
Args:
|
||||
label:
|
||||
The input type to find (`input_label` or `label_in_input` in
|
||||
`schema_config.yaml`).
|
||||
"""
|
||||
|
||||
# commented out until behaviour of _update_bl_types is fixed
|
||||
return self._ontology_mapping.get(label, None)
|
||||
|
||||
def translate_term(self, term):
|
||||
"""
|
||||
Translate a single term.
|
||||
"""
|
||||
|
||||
return self.mappings.get(term, None)
|
||||
|
||||
def reverse_translate_term(self, term):
|
||||
"""
|
||||
Reverse translate a single term.
|
||||
"""
|
||||
|
||||
return self.reverse_mappings.get(term, None)
|
||||
|
||||
def translate(self, query):
|
||||
"""
|
||||
Translate a cypher query. Only translates labels as of now.
|
||||
"""
|
||||
for key in self.mappings:
|
||||
query = query.replace(":" + key, ":" + self.mappings[key])
|
||||
return query
|
||||
|
||||
def reverse_translate(self, query):
|
||||
"""
|
||||
Reverse translate a cypher query. Only translates labels as of
|
||||
now.
|
||||
"""
|
||||
for key in self.reverse_mappings:
|
||||
a = ":" + key + ")"
|
||||
b = ":" + key + "]"
|
||||
# TODO this conditional probably does not cover all cases
|
||||
if a in query or b in query:
|
||||
if isinstance(self.reverse_mappings[key], list):
|
||||
raise NotImplementedError(
|
||||
"Reverse translation of multiple inputs not "
|
||||
"implemented yet. Many-to-one mappings are "
|
||||
"not reversible. "
|
||||
f"({key} -> {self.reverse_mappings[key]})",
|
||||
)
|
||||
else:
|
||||
query = query.replace(
|
||||
a,
|
||||
":" + self.reverse_mappings[key] + ")",
|
||||
).replace(b, ":" + self.reverse_mappings[key] + "]")
|
||||
return query
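# Illustrative round trip (assumes the mappings {"uniprot_protein": "Protein"}
# and {"Protein": "uniprot_protein"} built by `_add_translation_mappings`):
#
#     translator.translate("MATCH (p:uniprot_protein) RETURN p")
#     # -> 'MATCH (p:Protein) RETURN p'
#     translator.reverse_translate("MATCH (p:Protein) RETURN p")
#     # -> 'MATCH (p:uniprot_protein) RETURN p'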
|
||||
|
||||
def _add_translation_mappings(self, original_name, biocypher_name):
|
||||
"""
|
||||
Add translation mappings for a label and name. We use here the
|
||||
PascalCase version of the BioCypher name, since sentence case is
|
||||
not useful for Cypher queries.
|
||||
"""
|
||||
if isinstance(original_name, list):
|
||||
for on in original_name:
|
||||
self.mappings[on] = self.name_sentence_to_pascal(
|
||||
biocypher_name,
|
||||
)
|
||||
else:
|
||||
self.mappings[original_name] = self.name_sentence_to_pascal(
|
||||
biocypher_name,
|
||||
)
|
||||
|
||||
if isinstance(biocypher_name, list):
|
||||
for bn in biocypher_name:
|
||||
self.reverse_mappings[
|
||||
self.name_sentence_to_pascal(
|
||||
bn,
|
||||
)
|
||||
] = original_name
|
||||
else:
|
||||
self.reverse_mappings[
|
||||
self.name_sentence_to_pascal(
|
||||
biocypher_name,
|
||||
)
|
||||
] = original_name
|
||||
|
||||
@staticmethod
|
||||
def name_sentence_to_pascal(name: str) -> str:
|
||||
"""
|
||||
Converts a name in sentence case to pascal case.
|
||||
"""
|
||||
# split on dots if dot is present
|
||||
if "." in name:
|
||||
return ".".join(
|
||||
[_misc.sentencecase_to_pascalcase(n) for n in name.split(".")],
|
||||
)
|
||||
else:
|
||||
return _misc.sentencecase_to_pascalcase(name)
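# Examples of the conversion performed above (illustrative inputs):
#
#     Translator.name_sentence_to_pascal("small molecule")
#     # -> 'SmallMolecule'
#     Translator.name_sentence_to_pascal("gene.gene product")
#     # -> 'Gene.GeneProduct'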
|
0
biocypher/output/__init__.py
Normal file
0
biocypher/output/connect/__init__.py
Normal file
422
biocypher/output/connect/_neo4j_driver.py
Normal file
@ -0,0 +1,422 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
from biocypher._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from collections.abc import Iterable
|
||||
import itertools
|
||||
|
||||
import neo4j_utils
|
||||
|
||||
from biocypher import _misc
|
||||
from biocypher._config import config as _config
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode
|
||||
from biocypher._translate import Translator
|
||||
|
||||
__all__ = ["_Neo4jDriver"]
|
||||
|
||||
|
||||
class _Neo4jDriver:
|
||||
"""
|
||||
Manages a BioCypher connection to a Neo4j database using the
|
||||
``neo4j_utils.Driver`` class.
|
||||
|
||||
Args:
|
||||
|
||||
database_name (str): The name of the database to connect to.
|
||||
|
||||
wipe (bool): Whether to wipe the database before importing.
|
||||
|
||||
uri (str): The URI of the database.
|
||||
|
||||
user (str): The username to use for authentication.
|
||||
|
||||
password (str): The password to use for authentication.
|
||||
|
||||
multi_db (bool): Whether to use multi-database mode.
|
||||
|
||||
fetch_size (int): The number of records to fetch at a time.
|
||||
|
||||
increment_version (bool): Whether to increment the version number.
|
||||
|
||||
translator (Translator): The translator to use for mapping.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
database_name: str,
|
||||
uri: str,
|
||||
user: str,
|
||||
password: str,
|
||||
multi_db: bool,
|
||||
translator: Translator,
|
||||
wipe: bool = False,
|
||||
fetch_size: int = 1000,
|
||||
increment_version: bool = True,
|
||||
):
|
||||
self.translator = translator
|
||||
|
||||
self._driver = neo4j_utils.Driver(
|
||||
db_name=database_name,
|
||||
db_uri=uri,
|
||||
db_user=user,
|
||||
db_passwd=password,
|
||||
fetch_size=fetch_size,
|
||||
wipe=wipe,
|
||||
multi_db=multi_db,
|
||||
raise_errors=True,
|
||||
)
|
||||
|
||||
# check for biocypher config in connected graph
|
||||
|
||||
if wipe:
|
||||
self.init_db()
|
||||
|
||||
if increment_version:
|
||||
# set new current version node
|
||||
self._update_meta_graph()
|
||||
|
||||
def _update_meta_graph(self):
|
||||
logger.info("Updating Neo4j meta graph.")
|
||||
|
||||
# find current version node
|
||||
db_version = self._driver.query(
|
||||
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
|
||||
)
|
||||
# add version node
|
||||
self.add_biocypher_nodes(self.translator.ontology)
|
||||
|
||||
# connect version node to previous
|
||||
if db_version[0]:
|
||||
previous = db_version[0][0]
|
||||
previous_id = previous["v"]["id"]
|
||||
e_meta = BioCypherEdge(
|
||||
previous_id,
|
||||
self.translator.ontology.get_dict().get("node_id"),
|
||||
"PRECEDES",
|
||||
)
|
||||
self.add_biocypher_edges(e_meta)
|
||||
|
||||
def init_db(self):
|
||||
"""
|
||||
Used to initialise a property graph database by setting up new
|
||||
constraints. Wipe has been performed by the ``neo4j_utils.Driver``
|
||||
class already.
|
||||
|
||||
Todo:
|
||||
- set up constraint creation interactively depending on the
|
||||
need of the database
|
||||
"""
|
||||
|
||||
logger.info("Initialising database.")
|
||||
self._create_constraints()
|
||||
|
||||
def _create_constraints(self):
|
||||
"""
|
||||
Creates constraints on node types in the graph. Used for
|
||||
initial setup.
|
||||
|
||||
Grabs leaves of the ``schema_config.yaml`` file and creates
|
||||
constraints on the id of all entities represented as nodes.
|
||||
"""
|
||||
|
||||
logger.info("Creating constraints for node types in config.")
|
||||
|
||||
major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
|
||||
# get structure
|
||||
for leaf in self.translator.ontology.mapping.extended_schema.items():
|
||||
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
|
||||
if leaf[1]["represented_as"] == "node":
|
||||
if major_neo4j_version >= 5:
|
||||
s = (
|
||||
f"CREATE CONSTRAINT `{label}_id` "
|
||||
f"IF NOT EXISTS FOR (n:`{label}`) "
|
||||
"REQUIRE n.id IS UNIQUE"
|
||||
)
|
||||
self._driver.query(s)
|
||||
else:
|
||||
s = (
|
||||
f"CREATE CONSTRAINT `{label}_id` "
|
||||
f"IF NOT EXISTS ON (n:`{label}`) "
|
||||
"ASSERT n.id IS UNIQUE"
|
||||
)
|
||||
self._driver.query(s)
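# Example of a generated statement for a hypothetical node class "protein"
# (PascalCase label "Protein") on Neo4j >= 5:
#
#     CREATE CONSTRAINT `Protein_id` IF NOT EXISTS
#     FOR (n:`Protein`) REQUIRE n.id IS UNIQUE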
|
||||
|
||||
def _get_neo4j_version(self):
|
||||
"""Get neo4j version."""
|
||||
try:
|
||||
neo4j_version = self._driver.query(
|
||||
"""
|
||||
CALL dbms.components()
|
||||
YIELD name, versions, edition
|
||||
UNWIND versions AS version
|
||||
RETURN version AS version
|
||||
""",
|
||||
)[0][0]["version"]
|
||||
return neo4j_version
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Error detecting Neo4j version: {e}. Using default version 4.0.0."
|
||||
)
|
||||
return "4.0.0"
|
||||
|
||||
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
|
||||
"""
|
||||
Generic node adder method to add any kind of input to the graph via the
|
||||
:class:`biocypher.create.BioCypherNode` class. Employs translation
|
||||
functionality and calls the :meth:`add_biocypher_nodes()` method.
|
||||
|
||||
Args:
|
||||
id_type_tuples (iterable of 3-tuple): for each node to add to
|
||||
the biocypher graph, a 3-tuple with the following layout:
|
||||
first, the (unique if constrained) ID of the node; second, the
|
||||
type of the node, capitalised or PascalCase and in noun form
|
||||
(Neo4j primary label, eg `:Protein`); and third, a dictionary
|
||||
of arbitrary properties the node should possess (can be empty).
|
||||
|
||||
Returns:
|
||||
2-tuple: the query result of :meth:`add_biocypher_nodes()`
|
||||
- first entry: data
|
||||
- second entry: Neo4j summary.
|
||||
"""
|
||||
|
||||
bn = self.translator.translate_nodes(id_type_tuples)
|
||||
return self.add_biocypher_nodes(bn)
|
||||
|
||||
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
|
||||
"""
|
||||
Generic edge adder method to add any kind of input to the graph
|
||||
via the :class:`biocypher.create.BioCypherEdge` class. Employs
|
||||
translation functionality and calls the
|
||||
:meth:`add_biocypher_edges()` method.
|
||||
|
||||
Args:
|
||||
|
||||
id_src_tar_type_tuples (iterable of 5-tuple):
|
||||
|
||||
for each edge to add to the biocypher graph, a 5-tuple
|
||||
with the following layout: first, the optional unique ID
|
||||
of the interaction. This can be `None` if there is no
|
||||
systematic identifier (which for many interactions is
|
||||
the case). Second and third, the (unique if constrained)
|
||||
IDs of the source and target nodes of the relationship;
|
||||
fourth, the type of the relationship; and fifth, a
|
||||
dictionary of arbitrary properties the edge should
|
||||
possess (can be empty).
|
||||
|
||||
Returns:
|
||||
|
||||
2-tuple: the query result of :meth:`add_biocypher_edges()`
|
||||
|
||||
- first entry: data
|
||||
- second entry: Neo4j summary.
|
||||
"""
|
||||
|
||||
bn = self.translator.translate_edges(id_src_tar_type_tuples)
|
||||
return self.add_biocypher_edges(bn)
|
||||
|
||||
def add_biocypher_nodes(
|
||||
self,
|
||||
nodes: Iterable[BioCypherNode],
|
||||
explain: bool = False,
|
||||
profile: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Accepts a node type handoff class
|
||||
(:class:`biocypher.create.BioCypherNode`) with id,
|
||||
label, and a dict of properties (passing on the type of
|
||||
property, ie, ``int``, ``str``, ...).
|
||||
|
||||
The dict retrieved by the
|
||||
:meth:`biocypher.create.BioCypherNode.get_dict()` method is
|
||||
passed into Neo4j as a map of maps, explicitly encoding node id
|
||||
and label, and adding all other properties from the 'properties'
|
||||
key of the dict. The merge is performed via APOC, matching only
|
||||
on node id to prevent duplicates. The same properties are set on
|
||||
match and on create, irrespective of the actual event.
|
||||
|
||||
Args:
|
||||
nodes:
|
||||
An iterable of :class:`biocypher.create.BioCypherNode` objects.
|
||||
explain:
|
||||
Call ``EXPLAIN`` on the CYPHER query.
|
||||
profile:
|
||||
Do profiling on the CYPHER query.
|
||||
|
||||
Returns:
|
||||
True for success, False otherwise.
|
||||
"""
|
||||
|
||||
try:
|
||||
nodes = _misc.to_list(nodes)
|
||||
|
||||
entities = [node.get_dict() for node in nodes]
|
||||
|
||||
except AttributeError:
|
||||
msg = "Nodes must have a `get_dict` method."
|
||||
logger.error(msg)
|
||||
|
||||
raise ValueError(msg)
|
||||
|
||||
logger.info(f"Merging {len(entities)} nodes.")
|
||||
|
||||
entity_query = (
|
||||
"UNWIND $entities AS ent "
|
||||
"CALL apoc.merge.node([ent.node_label], "
|
||||
"{id: ent.node_id}, ent.properties, ent.properties) "
|
||||
"YIELD node "
|
||||
"RETURN node"
|
||||
)
|
||||
|
||||
method = "explain" if explain else "profile" if profile else "query"
|
||||
|
||||
result = getattr(self._driver, method)(
|
||||
entity_query,
|
||||
parameters={
|
||||
"entities": entities,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info("Finished merging nodes.")
|
||||
|
||||
return result
|
||||
|
||||
def add_biocypher_edges(
|
||||
self,
|
||||
edges: Iterable[BioCypherEdge],
|
||||
explain: bool = False,
|
||||
profile: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Accepts an edge type handoff class
|
||||
(:class:`biocypher.create.BioCypherEdge`) with source
|
||||
and target ids, label, and a dict of properties (passing on the
|
||||
type of property, ie, int, string ...).
|
||||
|
||||
The individual edge is either passed as a singleton, in the case
|
||||
of representation as an edge in the graph, or as a 4-tuple, in
|
||||
the case of representation as a node (with two edges connecting
|
||||
to interaction partners).
|
||||
|
||||
The dict retrieved by the
|
||||
:meth:`biocypher.create.BioCypherEdge.get_dict()` method is
|
||||
passed into Neo4j as a map of maps, explicitly encoding source
|
||||
and target ids and the relationship label, and adding all edge
|
||||
properties from the 'properties' key of the dict. The merge is
|
||||
performed via APOC, matching only on source and target id to
|
||||
prevent duplicates. The same properties are set on match and on
|
||||
create, irrespective of the actual event.
|
||||
|
||||
Args:
|
||||
edges:
|
||||
An iterable of :class:`biocypher.create.BioCypherEdge` objects.
|
||||
explain:
|
||||
Call ``EXPLAIN`` on the CYPHER query.
|
||||
profile:
|
||||
Do profiling on the CYPHER query.
|
||||
|
||||
Returns:
|
||||
`True` for success, `False` otherwise.
|
||||
"""
|
||||
|
||||
edges = _misc.ensure_iterable(edges)
|
||||
edges = itertools.chain(*(_misc.ensure_iterable(i) for i in edges))
|
||||
|
||||
nodes = []
|
||||
rels = []
|
||||
|
||||
try:
|
||||
for e in edges:
|
||||
if hasattr(e, "get_node"):
|
||||
nodes.append(e.get_node())
|
||||
rels.append(e.get_source_edge().get_dict())
|
||||
rels.append(e.get_target_edge().get_dict())
|
||||
|
||||
else:
|
||||
rels.append(e.get_dict())
|
||||
|
||||
except AttributeError:
|
||||
msg = "Edges and nodes must have a `get_dict` method."
|
||||
logger.error(msg)
|
||||
|
||||
raise ValueError(msg)
|
||||
|
||||
self.add_biocypher_nodes(nodes)
|
||||
logger.info(f"Merging {len(rels)} edges.")
|
||||
|
||||
# cypher query
|
||||
|
||||
# merging only on the ids of the entities, passing the
|
||||
# properties on match and on create;
|
||||
# TODO add node labels?
|
||||
node_query = (
|
||||
"UNWIND $rels AS r "
|
||||
"MERGE (src {id: r.source_id}) "
|
||||
"MERGE (tar {id: r.target_id}) "
|
||||
)
|
||||
|
||||
self._driver.query(node_query, parameters={"rels": rels})
|
||||
|
||||
edge_query = (
|
||||
"UNWIND $rels AS r "
|
||||
"MATCH (src {id: r.source_id}) "
|
||||
"MATCH (tar {id: r.target_id}) "
|
||||
"WITH src, tar, r "
|
||||
"CALL apoc.merge.relationship"
|
||||
"(src, r.relationship_label, NULL, "
|
||||
"r.properties, tar, r.properties) "
|
||||
"YIELD rel "
|
||||
"RETURN rel"
|
||||
)
|
||||
|
||||
method = "explain" if explain else "profile" if profile else "query"
|
||||
|
||||
result = getattr(self._driver, method)(
|
||||
edge_query, parameters={"rels": rels}
|
||||
)
|
||||
|
||||
logger.info("Finished merging edges.")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_driver(
|
||||
dbms: str,
|
||||
translator: "Translator",
|
||||
):
|
||||
"""
|
||||
Function to return the writer class.
|
||||
|
||||
Returns:
|
||||
class: the writer class
|
||||
"""
|
||||
|
||||
dbms_config = _config(dbms)
|
||||
|
||||
if dbms == "neo4j":
|
||||
return _Neo4jDriver(
|
||||
database_name=dbms_config["database_name"],
|
||||
wipe=dbms_config["wipe"],
|
||||
uri=dbms_config["uri"],
|
||||
user=dbms_config["user"],
|
||||
password=dbms_config["password"],
|
||||
multi_db=dbms_config["multi_db"],
|
||||
translator=translator,
|
||||
)
|
||||
|
||||
return None
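# Illustrative usage sketch (assumes a populated "neo4j" section in the
# BioCypher configuration and an existing Translator instance):
#
#     driver = get_driver("neo4j", translator=translator)
#     driver.add_nodes([("P05067", "protein", {"genesymbol": "APP"})])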
|
0
biocypher/output/in_memory/__init__.py
Normal file
90
biocypher/output/in_memory/_pandas.py
Normal file
@ -0,0 +1,90 @@
|
||||
import pandas as pd
|
||||
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
|
||||
|
||||
class Pandas:
|
||||
def __init__(self, translator, deduplicator):
|
||||
self.translator = translator
|
||||
self.deduplicator = deduplicator
|
||||
|
||||
self.dfs = {}
|
||||
|
||||
def _separate_entity_types(self, entities):
|
||||
"""
|
||||
Given mixed iterable of BioCypher objects, separate them into lists by
|
||||
type. Also deduplicates using the `Deduplicator` instance.
|
||||
"""
|
||||
lists = {}
|
||||
for entity in entities:
|
||||
if (
|
||||
not isinstance(entity, BioCypherNode)
|
||||
and not isinstance(entity, BioCypherEdge)
|
||||
and not isinstance(entity, BioCypherRelAsNode)
|
||||
):
|
||||
raise TypeError(
|
||||
"Expected a BioCypherNode / BioCypherEdge / "
|
||||
f"BioCypherRelAsNode, got {type(entity)}."
|
||||
)
|
||||
|
||||
if isinstance(entity, BioCypherNode):
|
||||
seen = self.deduplicator.node_seen(entity)
|
||||
elif isinstance(entity, BioCypherEdge):
|
||||
seen = self.deduplicator.edge_seen(entity)
|
||||
elif isinstance(entity, BioCypherRelAsNode):
|
||||
seen = self.deduplicator.rel_as_node_seen(entity)
|
||||
|
||||
if seen:
|
||||
continue
|
||||
|
||||
if isinstance(entity, BioCypherRelAsNode):
|
||||
node = entity.get_node()
|
||||
source_edge = entity.get_source_edge()
|
||||
target_edge = entity.get_target_edge()
|
||||
|
||||
_type = node.get_type()
|
||||
if _type not in lists:
|
||||
lists[_type] = []
|
||||
lists[_type].append(node)
|
||||
|
||||
_source_type = source_edge.get_type()
|
||||
if _source_type not in lists:
|
||||
lists[_source_type] = []
|
||||
lists[_source_type].append(source_edge)
|
||||
|
||||
_target_type = target_edge.get_type()
|
||||
if _target_type not in lists:
|
||||
lists[_target_type] = []
|
||||
lists[_target_type].append(target_edge)
|
||||
continue
|
||||
|
||||
_type = entity.get_type()
|
||||
if _type not in lists:
|
||||
lists[_type] = []
|
||||
lists[_type].append(entity)
|
||||
|
||||
return lists
|
||||
|
||||
def add_tables(self, entities):
|
||||
"""
|
||||
Add Pandas dataframes for each node and edge type in the input.
|
||||
"""
|
||||
|
||||
lists = self._separate_entity_types(entities)
|
||||
|
||||
for _type, _entities in lists.items():
|
||||
self._add_entity_df(_type, _entities)
|
||||
|
||||
def _add_entity_df(self, _type, _entities):
|
||||
df = pd.DataFrame(
|
||||
pd.json_normalize([node.get_dict() for node in _entities])
|
||||
)
|
||||
# replace "properties." with "" in column names
|
||||
df.columns = [col.replace("properties.", "") for col in df.columns]
|
||||
if _type not in self.dfs:
|
||||
self.dfs[_type] = df
|
||||
else:
|
||||
self.dfs[_type] = pd.concat(
|
||||
[self.dfs[_type], df], ignore_index=True
|
||||
)
|
||||
return self.dfs[_type]
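# Illustrative usage sketch (hypothetical translator/deduplicator instances
# and node tuples):
#
#     in_memory = Pandas(translator, deduplicator)
#     in_memory.add_tables(translator.translate_nodes(node_tuples))
#     in_memory.dfs["protein"]  # one DataFrame per node or edge type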
|
0
biocypher/output/write/__init__.py
Normal file
1046
biocypher/output/write/_batch_writer.py
Normal file
File diff suppressed because it is too large
113
biocypher/output/write/_get_writer.py
Normal file
@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# Michael Hartung
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'offline' module. Handles the writing of node and edge representations
|
||||
suitable for import into a DBMS.
|
||||
"""
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write.graph._rdf import _RDFWriter
|
||||
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
|
||||
from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
|
||||
from biocypher.output.write.graph._networkx import _NetworkXWriter
|
||||
from biocypher.output.write.relational._csv import _PandasCSVWriter
|
||||
from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
|
||||
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from biocypher._config import config as _config
|
||||
|
||||
__all__ = ["get_writer", "DBMS_TO_CLASS"]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from biocypher._translate import Translator
|
||||
from biocypher._deduplicate import Deduplicator
|
||||
|
||||
DBMS_TO_CLASS = {
|
||||
"neo": _Neo4jBatchWriter,
|
||||
"neo4j": _Neo4jBatchWriter,
|
||||
"Neo4j": _Neo4jBatchWriter,
|
||||
"postgres": _PostgreSQLBatchWriter,
|
||||
"postgresql": _PostgreSQLBatchWriter,
|
||||
"PostgreSQL": _PostgreSQLBatchWriter,
|
||||
"arango": _ArangoDBBatchWriter,
|
||||
"arangodb": _ArangoDBBatchWriter,
|
||||
"ArangoDB": _ArangoDBBatchWriter,
|
||||
"sqlite": _SQLiteBatchWriter,
|
||||
"sqlite3": _SQLiteBatchWriter,
|
||||
"rdf": _RDFWriter,
|
||||
"RDF": _RDFWriter,
|
||||
"csv": _PandasCSVWriter,
|
||||
"CSV": _PandasCSVWriter,
|
||||
"pandas": _PandasCSVWriter,
|
||||
"Pandas": _PandasCSVWriter,
|
||||
"networkx": _NetworkXWriter,
|
||||
"NetworkX": _NetworkXWriter,
|
||||
}
|
||||
|
||||
|
||||
def get_writer(
|
||||
dbms: str,
|
||||
translator: "Translator",
|
||||
deduplicator: "Deduplicator",
|
||||
output_directory: str,
|
||||
strict_mode: bool,
|
||||
):
|
||||
"""
|
||||
Function to return the writer class based on the selection in the config
|
||||
file.
|
||||
|
||||
Args:
|
||||
dbms: the database management system; for options, see DBMS_TO_CLASS.
|
||||
translator: the Translator object.
|
||||
deduplicator: the Deduplicator object.
|
||||
output_directory: the directory to write the output files to.
|
||||
strict_mode: whether to use strict mode.
|
||||
|
||||
Returns:
|
||||
instance: an instance of the selected writer class.
|
||||
"""
|
||||
|
||||
dbms_config = _config(dbms)
|
||||
|
||||
writer = DBMS_TO_CLASS.get(dbms)
|
||||
|
||||
if not writer:
|
||||
raise ValueError(f"Unknown dbms: {dbms}")
|
||||
|
||||
if writer is not None:
|
||||
return writer(
|
||||
translator=translator,
|
||||
deduplicator=deduplicator,
|
||||
delimiter=dbms_config.get("delimiter"),
|
||||
array_delimiter=dbms_config.get("array_delimiter"),
|
||||
quote=dbms_config.get("quote_character"),
|
||||
output_directory=output_directory,
|
||||
db_name=dbms_config.get("database_name"),
|
||||
import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
|
||||
import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
|
||||
wipe=dbms_config.get("wipe"),
|
||||
strict_mode=strict_mode,
|
||||
skip_bad_relationships=dbms_config.get(
|
||||
"skip_bad_relationships"
|
||||
), # neo4j
|
||||
skip_duplicate_nodes=dbms_config.get(
|
||||
"skip_duplicate_nodes"
|
||||
), # neo4j
|
||||
db_user=dbms_config.get("user"), # psql
|
||||
db_password=dbms_config.get("password"), # psql
|
||||
db_port=dbms_config.get("port"), # psql
|
||||
rdf_format=dbms_config.get("rdf_format"), # rdf
|
||||
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
|
||||
)
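# Illustrative usage sketch (assumes a populated "neo4j" output section in
# the BioCypher configuration):
#
#     writer = get_writer(
#         dbms="neo4j",
#         translator=translator,
#         deduplicator=deduplicator,
#         output_directory="biocypher-out",
#         strict_mode=False,
#     )
#     writer.write_nodes(translator.translate_nodes(node_tuples))
#     writer.write_import_call()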
|
200
biocypher/output/write/_writer.py
Normal file
@ -0,0 +1,200 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Union, Optional
|
||||
from collections.abc import Iterable
|
||||
import os
|
||||
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
from biocypher._logger import logger
|
||||
from biocypher._translate import Translator
|
||||
from biocypher._deduplicate import Deduplicator
|
||||
|
||||
__all__ = ["_Writer"]
|
||||
|
||||
|
||||
class _Writer(ABC):
|
||||
"""Abstract class for writing node and edge representations to disk.
|
||||
Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
|
||||
are implemented in the child classes. Any concrete writer needs to
|
||||
implement at least:
|
||||
- _write_node_data
|
||||
- _write_edge_data
|
||||
- _construct_import_call
|
||||
- _get_import_script_name
|
||||
|
||||
Args:
|
||||
translator (Translator): Instance of :py:class:`Translator` to enable translation of
|
||||
nodes and manipulation of properties.
|
||||
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
|
||||
of nodes and edges.
|
||||
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
|
||||
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
|
||||
|
||||
Raises:
|
||||
NotImplementedError: Writer implementation must override '_write_node_data'
|
||||
NotImplementedError: Writer implementation must override '_write_edge_data'
|
||||
NotImplementedError: Writer implementation must override '_construct_import_call'
|
||||
NotImplementedError: Writer implementation must override '_get_import_script_name'
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
translator: Translator,
|
||||
deduplicator: Deduplicator,
|
||||
output_directory: Optional[str] = None,
|
||||
strict_mode: bool = False,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""Abstract class for writing node and edge representations to disk.
|
||||
|
||||
Args:
|
||||
translator (Translator): Instance of :py:class:`Translator` to enable translation of
|
||||
nodes and manipulation of properties.
|
||||
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
|
||||
of nodes and edges.
|
||||
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
|
||||
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
|
||||
"""
|
||||
self.translator = translator
|
||||
self.deduplicator = deduplicator
|
||||
self.strict_mode = strict_mode
|
||||
self.output_directory = output_directory
|
||||
|
||||
if os.path.exists(self.output_directory):
|
||||
if kwargs.get("write_to_file", True):
|
||||
logger.warning(
|
||||
f"Output directory `{self.output_directory}` already exists. "
|
||||
"If this is not planned, file consistency may be compromised."
|
||||
)
|
||||
else:
|
||||
logger.info(f"Creating output directory `{self.output_directory}`.")
|
||||
os.makedirs(self.output_directory)
|
||||
|
||||
@abstractmethod
|
||||
def _write_node_data(
|
||||
self,
|
||||
nodes: Iterable[
|
||||
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
|
||||
],
|
||||
) -> bool:
|
||||
"""Implement how to write nodes to disk.
|
||||
|
||||
Args:
|
||||
nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override '_write_node_data'"
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def _write_edge_data(
|
||||
self,
|
||||
edges: Iterable[
|
||||
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
|
||||
],
|
||||
) -> bool:
|
||||
"""Implement how to write edges to disk.
|
||||
|
||||
Args:
|
||||
edges (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override '_write_edge_data'"
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: command for importing the output files into a DBMS.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override '_construct_import_call'"
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""Returns the name of the import script.
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override '_get_import_script_name'"
|
||||
)
|
||||
|
||||
def write_nodes(
|
||||
self, nodes, batch_size: int = int(1e6), force: bool = False
|
||||
):
|
||||
"""Wrapper for writing nodes.
|
||||
|
||||
Args:
|
||||
nodes (BioCypherNode): a list or generator of nodes in
|
||||
:py:class:`BioCypherNode` format
|
||||
batch_size (int): The batch size for writing nodes.
|
||||
force (bool): Whether to force writing nodes even if their type is
|
||||
not present in the schema.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
passed = self._write_node_data(nodes)
|
||||
if not passed:
|
||||
logger.error("Error while writing node data.")
|
||||
return False
|
||||
return True
|
||||
|
||||
def write_edges(
|
||||
self, edges, batch_size: int = int(1e6), force: bool = False
|
||||
):
|
||||
"""Wrapper for writing edges.
|
||||
|
||||
Args:
|
||||
edges (BioCypherEdge): a list or generator of edges in
|
||||
:py:class:`BioCypherEdge` format
|
||||
batch_size (int): The batch size for writing edges.
|
||||
force (bool): Whether to force writing edges even if their type is
|
||||
not present in the schema.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
passed = self._write_edge_data(edges)
|
||||
if not passed:
|
||||
logger.error("Error while writing edge data.")
|
||||
return False
|
||||
return True
|
||||
|
||||
def write_import_call(self):
|
||||
"""
|
||||
Function to write the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name, to the export folder as txt.
|
||||
|
||||
Returns:
|
||||
str: The path of the file holding the import call.
|
||||
"""
|
||||
file_path = os.path.join(
|
||||
self.output_directory, self._get_import_script_name()
|
||||
)
|
||||
logger.info(
|
||||
f"Writing {self.__class__.__name__} import call to `{file_path}`."
|
||||
)
|
||||
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
f.write(self._construct_import_call())
|
||||
|
||||
return file_path
|
0
biocypher/output/write/graph/__init__.py
Normal file
241
biocypher/output/write/graph/_arangodb.py
Normal file
@ -0,0 +1,241 @@
|
||||
import os
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
|
||||
|
||||
|
||||
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to disk using the format
|
||||
specified by ArangoDB for the use of "arangoimport". Output files are
|
||||
similar to Neo4j, but with a different header format.
|
||||
"""
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default import call bin prefix for arangoimp (an empty string).
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the ArangoDB import script
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "arangodb-import-call.sh"
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of node.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.node_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.node_property_dict.items():
|
||||
# create header CSV with ID, properties, labels
|
||||
|
||||
_id = "_key"
|
||||
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
|
||||
# check if file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"File {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k in props.keys():
|
||||
props_list.append(f"{k}")
|
||||
|
||||
# create list of lists and flatten
|
||||
# removes need for empty check of property list
|
||||
out_list = [[_id], props_list]
|
||||
out_list = [val for sublist in out_list for val in sublist]
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# concatenate with delimiter
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add collection from schema config
|
||||
collection = self.translator.ontology.mapping.extended_schema[
|
||||
label
|
||||
].get("db_collection_name", None)
|
||||
|
||||
# add file path to the arangoimport statement
|
||||
# do once for each part file
|
||||
parts = self.parts.get(label, [])
|
||||
|
||||
if not parts:
|
||||
raise ValueError(
|
||||
f"No parts found for node label {label}. "
|
||||
f"Check that the data was parsed first.",
|
||||
)
|
||||
|
||||
for part in parts:
|
||||
import_call_header_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
import_call_parts_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
part,
|
||||
)
|
||||
|
||||
self.import_call_nodes.add(
|
||||
(
|
||||
import_call_header_path,
|
||||
import_call_parts_path,
|
||||
collection,
|
||||
)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.edge_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.edge_property_dict.items():
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
# paths
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
parts = f"{pascal_label}-part.*"
|
||||
|
||||
# check for file exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"Header file {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k in props.keys():
|
||||
props_list.append(f"{k}")
|
||||
|
||||
out_list = ["_from", "_key", *props_list, "_to"]
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# concatenate with delimiter
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add collection from schema config
|
||||
if not self.translator.ontology.mapping.extended_schema.get(label):
|
||||
for (
|
||||
_,
|
||||
v,
|
||||
) in self.translator.ontology.mapping.extended_schema.items():
|
||||
if v.get("label_as_edge") == label:
|
||||
collection = v.get("db_collection_name", None)
|
||||
break
|
||||
|
||||
else:
|
||||
collection = self.translator.ontology.mapping.extended_schema[
|
||||
label
|
||||
].get("db_collection_name", None)
|
||||
|
||||
# add file path to the arangoimport statement (import call path
|
||||
# may be different from actual output path)
|
||||
header_import_call_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
parts_import_call_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
parts,
|
||||
)
|
||||
self.import_call_edges.add(
|
||||
(
|
||||
header_import_call_path,
|
||||
parts_import_call_path,
|
||||
collection,
|
||||
)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for neo4j-admin import
|
||||
"""
|
||||
import_call = (
|
||||
f"{self.import_call_bin_prefix}arangoimp "
|
||||
f"--type csv "
|
||||
f'--separator="{self.escaped_delim}" '
|
||||
)
|
||||
|
||||
if self.quote == "'":
|
||||
import_call += f'--quote="{self.quote}" '
|
||||
else:
|
||||
import_call += f"--quote='{self.quote}' "
|
||||
|
||||
node_lines = ""
|
||||
|
||||
# node import calls: one line per node type
|
||||
for header_path, parts_path, collection in self.import_call_nodes:
|
||||
line = (
|
||||
f"{import_call} "
|
||||
f"--headers-file {header_path} "
|
||||
f"--file= {parts_path} "
|
||||
)
|
||||
|
||||
if collection:
|
||||
line += f"--create-collection --collection {collection} "
|
||||
|
||||
node_lines += f"{line}\n"
|
||||
|
||||
edge_lines = ""
|
||||
|
||||
# edge import calls: one line per edge type
|
||||
for header_path, parts_path, collection in self.import_call_edges:
|
||||
# mirror the node import call above (one call per edge part file);
# --create-collection-type is assumed here so new collections are edge collections
line = f"{import_call} --headers-file {header_path} --file={parts_path} "
if collection:
    line += f"--create-collection --create-collection-type edge --collection {collection} "
edge_lines += f"{line}\n"
|
||||
|
||||
return node_lines + edge_lines
|
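For orientation, a minimal sketch of the kind of shell line this writer assembles per node part file follows; the binary prefix, delimiter, quote character, collection name and file paths are invented and only illustrate the string concatenation in _construct_import_call above.

# Illustrative only: assumed prefix "bin/", delimiter ";", quote "'",
# collection "Protein" and one part file; mirrors the f-strings above.
import_call = 'bin/arangoimp --type csv --separator=";" ' + "--quote=\"'\" "
node_line = (
    f"{import_call} "
    "--headers-file import/Protein-header.csv "
    "--file=import/Protein-part000.csv "
    "--create-collection --collection Protein "
)
print(node_line)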
502
biocypher/output/write/graph/_neo4j.py
Normal file
@ -0,0 +1,502 @@
|
||||
import os
|
||||
import glob
|
||||
import pandas as pd
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
|
||||
|
||||
|
||||
class _Neo4jBatchWriter(_BatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to disk using the
|
||||
format specified by Neo4j for the use of admin import. Each batch
|
||||
writer instance has a fixed representation that needs to be passed
|
||||
at instantiation via the :py:attr:`schema` argument. The instance
|
||||
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
|
||||
to convert and extend the hierarchy.
|
||||
|
||||
This class inherits from the abstract class "_BatchWriter" and implements the
|
||||
Neo4j-specific methods:
|
||||
|
||||
- _write_node_headers
|
||||
- _write_edge_headers
|
||||
- _construct_import_call
|
||||
- _write_array_string
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""
|
||||
Constructor.
|
||||
|
||||
Checks the version of Neo4j and adds a command scope if the version is >= 5.
|
||||
|
||||
Returns:
|
||||
_Neo4jBatchWriter: An instance of the writer.
|
||||
"""
|
||||
|
||||
# Should read the configuration and setup import_call_bin_prefix.
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the neo4j admin import location
|
||||
"""
|
||||
|
||||
return "bin/"
|
||||
|
||||
def _write_array_string(self, string_list):
|
||||
"""
|
||||
Method to write the string representation of an array into a .csv file
|
||||
as required by the neo4j admin-import.
|
||||
|
||||
Args:
|
||||
string_list (list): list of ontology strings
|
||||
|
||||
Returns:
|
||||
str: The string representation of an array for the neo4j admin import
|
||||
"""
|
||||
string = self.adelim.join(string_list)
|
||||
return f"{self.quote}{string}{self.quote}"
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Writes a single CSV file for a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of node.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.node_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.node_property_dict.items():
|
||||
_id = ":ID"
|
||||
|
||||
# MeDaX dev remark:
|
||||
# FHIR data yields case-sensitive labels; e.g. 'Procedure' and 'procedure' are two distinct node types,
|
||||
# because resources are converted to more specific node classes using their "resourceType" attribute.
|
||||
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(
|
||||
parse_label(label)
|
||||
)
|
||||
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
parts = f"{pascal_label}-part.*"
|
||||
|
||||
existing_header = False
|
||||
# check if file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"Header file `{header_path}` already exists. Overwriting.",
|
||||
)
|
||||
with open(header_path, "r", encoding="utf-8") as existing:
|
||||
existing_header = existing.read().strip().split(self.delim)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k, v in props.items():
|
||||
if v in ["int", "long", "integer"]:
|
||||
props_list.append(f"{k}:long")
|
||||
elif v in ["int[]", "long[]", "integer[]"]:
|
||||
props_list.append(f"{k}:long[]")
|
||||
elif v in ["float", "double", "dbl"]:
|
||||
props_list.append(f"{k}:double")
|
||||
elif v in ["float[]", "double[]"]:
|
||||
props_list.append(f"{k}:double[]")
|
||||
elif v in ["bool", "boolean"]:
|
||||
# TODO Neo4j boolean support / spelling?
|
||||
props_list.append(f"{k}:boolean")
|
||||
elif v in ["bool[]", "boolean[]"]:
|
||||
props_list.append(f"{k}:boolean[]")
|
||||
elif v in ["str[]", "string[]"]:
|
||||
props_list.append(f"{k}:string[]")
|
||||
else:
|
||||
props_list.append(f"{k}")
|
||||
|
||||
# create list of lists and flatten
|
||||
out_list = [[_id], props_list, [":LABEL"]]
|
||||
out_list = [val for sublist in out_list for val in sublist]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# Check if header file already exists and has different columns
|
||||
if os.path.exists(header_path):
|
||||
if existing_header:
|
||||
|
||||
# Compare existing and new headers
|
||||
if set(existing_header) != set(out_list):
|
||||
|
||||
# Get part files associated with this header
|
||||
base_name = os.path.basename(header_path).replace("-header.csv", "")
|
||||
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
|
||||
|
||||
|
||||
# Find the highest numbered part file without full sorting
|
||||
highest_part = None
|
||||
highest_number = -1
|
||||
|
||||
for part_file in part_files:
|
||||
try:
|
||||
# Extract number from filename (assuming format like "part123.csv")
|
||||
file_name = os.path.basename(part_file)
|
||||
number_part = file_name.split("part")[1].split(".")[0]
|
||||
number = int(number_part)
|
||||
|
||||
if number > highest_number:
|
||||
highest_number = number
|
||||
highest_part = part_file
|
||||
except (IndexError, ValueError):
|
||||
# Skip files that don't match the expected pattern
|
||||
continue
|
||||
# Update each part file with the new columns
|
||||
for part_file in part_files:
|
||||
if part_file == highest_part:
|
||||
print(f"Skipping the highest part file: {highest_part}")
|
||||
continue
|
||||
try:
|
||||
#print("exi: ", existing_header)
|
||||
#print("out: ", out_list)
|
||||
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
|
||||
# Read the file without headers
|
||||
|
||||
# Write back to file WITHOUT including the header
|
||||
df.to_csv(part_file, sep=self.delim, index=False, header=False)
|
||||
print(f"Updated {part_file} with new columns in correct positions")
|
||||
except Exception as e:
|
||||
print(f"Error updating {part_file}: {e}")
|
||||
|
||||
# Write the new header
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
|
||||
# add file path to the neo4j-admin import statement (import call file
|
||||
# path may be different from actual file path)
|
||||
import_call_header_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
import_call_parts_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
parts,
|
||||
)
|
||||
self.import_call_nodes.add(
|
||||
(import_call_header_path, import_call_parts_path)
|
||||
)
|
||||
|
||||
return True
|
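To make the property-type mapping above concrete, here is a hedged, self-contained sketch of the header row that would result for a hypothetical node type with a string, a float and a string-array property, assuming ";" as the delimiter; property names and types are invented.

# Illustrative only: invented property dict, reduced version of the mapping above.
props = {"name": "str", "score": "float", "synonyms": "string[]"}
props_list = []
for k, v in props.items():
    if v in ["float", "double", "dbl"]:
        props_list.append(f"{k}:double")
    elif v in ["str[]", "string[]"]:
        props_list.append(f"{k}:string[]")
    else:
        props_list.append(k)
row = ";".join([":ID", *props_list, ":LABEL"])
print(row)  # :ID;name;score:double;synonyms:string[];:LABEL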
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Writes a single CSV file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.edge_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.edge_property_dict.items():
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(
|
||||
parse_label(label)
|
||||
)
|
||||
|
||||
# paths
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
parts = f"{pascal_label}-part.*"
|
||||
|
||||
# check if the file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"File {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k, v in props.items():
|
||||
if v in ["int", "long", "integer"]:
|
||||
props_list.append(f"{k}:long")
|
||||
elif v in ["int[]", "long[]", "integer[]"]:
|
||||
props_list.append(f"{k}:long[]")
|
||||
elif v in ["float", "double"]:
|
||||
props_list.append(f"{k}:double")
|
||||
elif v in ["float[]", "double[]"]:
|
||||
props_list.append(f"{k}:double[]")
|
||||
elif v in [
|
||||
"bool",
|
||||
"boolean",
|
||||
]: # TODO does Neo4j support bool?
|
||||
props_list.append(f"{k}:boolean")
|
||||
elif v in ["bool[]", "boolean[]"]:
|
||||
props_list.append(f"{k}:boolean[]")
|
||||
elif v in ["str[]", "string[]"]:
|
||||
props_list.append(f"{k}:string[]")
|
||||
else:
|
||||
props_list.append(f"{k}")
|
||||
|
||||
skip_id = False
|
||||
schema_label = None
|
||||
|
||||
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
|
||||
skip_id = True
|
||||
elif not self.translator.ontology.mapping.extended_schema.get(
|
||||
label
|
||||
):
|
||||
# find label in schema by label_as_edge
|
||||
for (
|
||||
k,
|
||||
v,
|
||||
) in self.translator.ontology.mapping.extended_schema.items():
|
||||
if v.get("label_as_edge") == label:
|
||||
schema_label = k
|
||||
break
|
||||
else:
|
||||
schema_label = label
|
||||
|
||||
out_list = [":START_ID"]
|
||||
|
||||
if schema_label:
|
||||
if (
|
||||
self.translator.ontology.mapping.extended_schema.get(
|
||||
schema_label
|
||||
).get("use_id")
|
||||
is False
|
||||
):
|
||||
skip_id = True
|
||||
|
||||
if not skip_id:
|
||||
out_list.append("id")
|
||||
|
||||
out_list.extend(props_list)
|
||||
out_list.extend([":END_ID", ":TYPE"])
|
||||
|
||||
existing_header = False
|
||||
# check if file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"Header file `{header_path}` already exists. Overwriting.",
|
||||
)
|
||||
with open(header_path, "r", encoding="utf-8") as existing:
|
||||
existing_header = existing.read().strip().split(self.delim)
|
||||
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# Check if header file already exists and has different columns
|
||||
if os.path.exists(header_path):
|
||||
if existing_header:
|
||||
|
||||
# Compare existing and new headers
|
||||
if set(existing_header) != set(out_list):
|
||||
|
||||
# Get part files associated with this header
|
||||
base_name = os.path.basename(header_path).replace("-header.csv", "")
|
||||
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
|
||||
|
||||
|
||||
# Find the highest numbered part file without full sorting
|
||||
highest_part = None
|
||||
highest_number = -1
|
||||
|
||||
for part_file in part_files:
|
||||
try:
|
||||
# Extract number from filename (assuming format like "part123.csv")
|
||||
file_name = os.path.basename(part_file)
|
||||
number_part = file_name.split("part")[1].split(".")[0]
|
||||
number = int(number_part)
|
||||
|
||||
if number > highest_number:
|
||||
highest_number = number
|
||||
highest_part = part_file
|
||||
except (IndexError, ValueError):
|
||||
# Skip files that don't match the expected pattern
|
||||
continue
|
||||
# Update each part file with the new columns
|
||||
for part_file in part_files:
|
||||
if part_file == highest_part:
|
||||
print(f"Skipping the highest part file: {highest_part}")
|
||||
continue
|
||||
try:
|
||||
print("exi: ", existing_header)
|
||||
print("out: ", out_list)
|
||||
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
|
||||
# Read the file without headers
|
||||
|
||||
# Write back to file WITHOUT including the header
|
||||
df.to_csv(part_file, sep=self.delim, index=False, header=False)
|
||||
print(f"Updated {part_file} with new columns in correct positions")
|
||||
except Exception as e:
|
||||
print(f"Error updating {part_file}: {e}")
|
||||
|
||||
# Write the new header
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add file path to the neo4j-admin import statement (import call file
|
||||
# path may be different from actual file path)
|
||||
import_call_header_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
import_call_parts_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
parts,
|
||||
)
|
||||
self.import_call_edges.add(
|
||||
(import_call_header_path, import_call_parts_path)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the neo4j admin import script
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "neo4j-admin-import-call.sh"
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for neo4j-admin import
|
||||
"""
|
||||
import_call_neo4j_v4 = self._get_import_call(
|
||||
"import", "--database=", "--force="
|
||||
)
|
||||
import_call_neo4j_v5 = self._get_import_call(
|
||||
"database import full", "", "--overwrite-destination="
|
||||
)
|
||||
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
|
||||
|
||||
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
|
||||
return import_script
|
||||
|
||||
def _get_import_call(
|
||||
self, import_cmd: str, database_cmd: str, wipe_cmd: str
|
||||
) -> str:
|
||||
"""Get parametrized import call for Neo4j 4 or 5+.
|
||||
|
||||
Args:
|
||||
import_cmd (str): The import command to use.
|
||||
database_cmd (str): The database command to use.
|
||||
wipe_cmd (str): The wipe command to use.
|
||||
|
||||
Returns:
|
||||
str: The import call.
|
||||
"""
|
||||
import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
|
||||
|
||||
import_call += f"{database_cmd}{self.db_name} "
|
||||
|
||||
import_call += f'--delimiter="{self.escaped_delim}" '
|
||||
|
||||
import_call += f'--array-delimiter="{self.escaped_adelim}" '
|
||||
|
||||
if self.quote == "'":
|
||||
import_call += f'--quote="{self.quote}" '
|
||||
else:
|
||||
import_call += f"--quote='{self.quote}' "
|
||||
|
||||
if self.wipe:
|
||||
import_call += f"{wipe_cmd}true "
|
||||
if self.skip_bad_relationships:
|
||||
import_call += "--skip-bad-relationships=true "
|
||||
if self.skip_duplicate_nodes:
|
||||
import_call += "--skip-duplicate-nodes=true "
|
||||
|
||||
# append node import calls
|
||||
for header_path, parts_path in self.import_call_nodes:
|
||||
import_call += f'--nodes="{header_path},{parts_path}" '
|
||||
|
||||
# append edge import calls
|
||||
for header_path, parts_path in self.import_call_edges:
|
||||
import_call += f'--relationships="{header_path},{parts_path}" '
|
||||
|
||||
return import_call
|
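As a rough illustration of the command assembled by _get_import_call, the Neo4j 5 variant could look like the following; the database name, delimiters, quote character and file paths are invented for the example.

# Illustrative only; all names and paths are assumptions.
example_call = (
    "bin/neo4j-admin database import full neo4j "
    '--delimiter=";" --array-delimiter="|" '
    "--quote=\"'\" "
    "--overwrite-destination=true "
    '--nodes="import/Protein-header.csv,import/Protein-part.*" '
    '--relationships="import/Interacts-header.csv,import/Interacts-part.*" '
)
print(example_call)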
||||
|
||||
|
||||
|
||||
|
||||
def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
|
||||
"""
|
||||
Adapt a CSV table to a new header structure, placing new columns in their correct positions.
|
||||
|
||||
Parameters:
|
||||
old_header (list): The original header columns
|
||||
new_header (list): The new header columns
|
||||
csv_file_path (str): Path to the CSV file
|
||||
|
||||
Returns:
|
||||
pandas.DataFrame: CSV data with the new header structure
|
||||
"""
|
||||
|
||||
# Step 1: Read the CSV data without headers
|
||||
df = pd.read_csv(csv_file_path, sep=self.delim, header=None)
|
||||
|
||||
# Step 2: If the file is empty, return empty DataFrame with new headers
|
||||
if df.empty:
|
||||
return pd.DataFrame(columns=new_header)
|
||||
|
||||
# Step 3: If column count doesn't match old_header length, handle the mismatch
|
||||
if len(df.columns) != len(old_header):
|
||||
print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})")
|
||||
# If file has fewer columns than old_header, pad with NaN
|
||||
if len(df.columns) < len(old_header):
|
||||
for i in range(len(df.columns), len(old_header)):
|
||||
df[i] = None
|
||||
# If file has more columns than old_header, truncate
|
||||
else:
|
||||
df = df.iloc[:, :len(old_header)]
|
||||
|
||||
# Step 4: Assign old header names to the dataframe
|
||||
df.columns = old_header
|
||||
|
||||
# Step 5: Create a new DataFrame with the correct structure
|
||||
new_df = pd.DataFrame(columns=new_header)
|
||||
|
||||
# Step 6: For each column in the new header, find its position in the old header
|
||||
for new_col_idx, new_col in enumerate(new_header):
|
||||
if new_col in old_header:
|
||||
# If column exists in old header, copy data
|
||||
new_df[new_col] = df[new_col]
|
||||
else:
|
||||
# If new column, add empty column
|
||||
new_df[new_col] = None
|
||||
|
||||
# Step 7: Ensure columns are in the exact order of new_header
|
||||
new_df = new_df[new_header]
|
||||
|
||||
return new_df
|
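A hedged, self-contained illustration of the column re-mapping that adapt_csv_to_new_header performs; the headers and the single data row are invented, but the copy-by-name and fill-with-None behaviour mirrors the steps above.

import pandas as pd

old_header = ["_ID", "name"]
new_header = ["_ID", "name", "synonyms"]
# one existing data row, as it would sit in a part file without header
df = pd.DataFrame([["p1", "TP53"]], columns=old_header)
new_df = pd.DataFrame(columns=new_header)
for col in new_header:
    # copy known columns by name, fill newly introduced columns with None
    new_df[col] = df[col] if col in old_header else None
print(new_df)  # one row: p1, TP53, None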
76
biocypher/output/write/graph/_networkx.py
Normal file
@ -0,0 +1,76 @@
|
||||
import pickle
|
||||
|
||||
import networkx as nx
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._writer import _Writer
|
||||
from biocypher.output.write.relational._csv import _PandasCSVWriter
|
||||
|
||||
|
||||
class _NetworkXWriter(_Writer):
|
||||
"""
|
||||
Class for writing node and edges to a networkx DiGraph.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
|
||||
self.G = nx.DiGraph()
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
|
||||
|
||||
Returns:
|
||||
str: Python code to load the csv files into Pandas dfs.
|
||||
"""
|
||||
logger.info(
|
||||
f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
|
||||
)
|
||||
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
|
||||
pickle.dump(self.G, f)
|
||||
|
||||
import_call = "import pickle\n"
|
||||
import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
|
||||
return import_call
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""Function to return the name of the import script."""
|
||||
return "import_networkx.py"
|
||||
|
||||
def _write_node_data(self, nodes) -> bool:
|
||||
passed = self.csv_writer._write_entities_to_file(nodes)
|
||||
self.add_to_networkx()
|
||||
return passed
|
||||
|
||||
def _write_edge_data(self, edges) -> bool:
|
||||
passed = self.csv_writer._write_entities_to_file(edges)
|
||||
self.add_to_networkx()
|
||||
return passed
|
||||
|
||||
def add_to_networkx(self) -> bool:
|
||||
all_dfs = self.csv_writer.stored_dfs
|
||||
node_dfs = [
|
||||
df
|
||||
for df in all_dfs.values()
|
||||
if df.columns.str.contains("node_id").any()
|
||||
]
|
||||
edge_dfs = [
|
||||
df
|
||||
for df in all_dfs.values()
|
||||
if df.columns.str.contains("source_id").any()
|
||||
and df.columns.str.contains("target_id").any()
|
||||
]
|
||||
for df in node_dfs:
|
||||
nodes = df.set_index("node_id").to_dict(orient="index")
|
||||
self.G.add_nodes_from(nodes.items())
|
||||
for df in edge_dfs:
|
||||
edges = df.set_index(["source_id", "target_id"]).to_dict(
|
||||
orient="index"
|
||||
)
|
||||
self.G.add_edges_from(
|
||||
(
|
||||
(source, target, attrs)
|
||||
for (source, target), attrs in edges.items()
|
||||
)
|
||||
)
|
||||
return True
|
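The conversion from stored DataFrames to the DiGraph can be sketched in isolation as follows; the column names node_id, source_id and target_id are the ones checked above, while the data values are invented.

import networkx as nx
import pandas as pd

node_df = pd.DataFrame(
    {"node_id": ["a", "b"], "node_label": ["protein", "protein"]}
)
edge_df = pd.DataFrame(
    {"source_id": ["a"], "target_id": ["b"], "relationship_label": ["interacts"]}
)
G = nx.DiGraph()
# remaining columns become node / edge attributes
G.add_nodes_from(node_df.set_index("node_id").to_dict(orient="index").items())
edges = edge_df.set_index(["source_id", "target_id"]).to_dict(orient="index")
G.add_edges_from((s, t, attrs) for (s, t), attrs in edges.items())
print(G.nodes(data=True), G.edges(data=True))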
515
biocypher/output/write/graph/_rdf.py
Normal file
@ -0,0 +1,515 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Loes van den Biggelaar
|
||||
# Sebastian Lobentanzer
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'offline' module. Handles the writing of node and edge representations
|
||||
suitable for import into a DBMS.
|
||||
"""
|
||||
from types import GeneratorType
|
||||
from typing import Union
|
||||
import os
|
||||
|
||||
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
|
||||
from rdflib.namespace import (
|
||||
_NAMESPACE_PREFIXES_CORE,
|
||||
_NAMESPACE_PREFIXES_RDFLIB,
|
||||
)
|
||||
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._batch_writer import _BatchWriter
|
||||
|
||||
|
||||
class _RDFWriter(_BatchWriter):
|
||||
"""
|
||||
Class to write BioCypher's property graph into an RDF format using
|
||||
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
|
||||
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
|
||||
is done keeping only the minimum information about node and edges,
|
||||
skipping all properties.
|
||||
"""
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the RDF admin import script.
|
||||
This function is applicable for RDF export.
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "rdf-import-call.sh"
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the RDF admin import location
|
||||
"""
|
||||
return "bin/"
|
||||
|
||||
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
|
||||
"""
|
||||
Function to check if the specified RDF format is supported.
|
||||
|
||||
Args:
|
||||
rdf_format (str): The RDF format to check.
|
||||
|
||||
Returns:
|
||||
bool: Returns True if rdf format supported, False otherwise.
|
||||
"""
|
||||
supported_formats = [
|
||||
"xml",
|
||||
"n3",
|
||||
"turtle",
|
||||
"nt",
|
||||
"pretty-xml",
|
||||
"trix",
|
||||
"trig",
|
||||
"nquads",
|
||||
"json-ld",
|
||||
]
|
||||
if rdf_format not in supported_formats:
|
||||
logger.error(
|
||||
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
|
||||
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
|
||||
)
|
||||
return False
|
||||
else:
|
||||
# The RDF graph does not support the 'ttl' format, only 'turtle'. However, the preferred file extension is always '.ttl'.
|
||||
if self.rdf_format == "turtle":
|
||||
self.extension = "ttl"
|
||||
elif self.rdf_format == "ttl":
|
||||
self.rdf_format = "turtle"
|
||||
self.extension = "ttl"
|
||||
else:
|
||||
self.extension = self.rdf_format
|
||||
return True
|
||||
|
||||
def _write_single_edge_list_to_file(
|
||||
self,
|
||||
edge_list: list,
|
||||
label: str,
|
||||
prop_dict: dict,
|
||||
):
|
||||
"""
|
||||
This function takes one list of biocypher edges and writes them
|
||||
to an RDF file with the given format.
|
||||
|
||||
Args:
|
||||
edge_list (list): list of BioCypherEdges to be written
|
||||
|
||||
label (str): the label (type) of the edge
|
||||
|
||||
prop_dict (dict): properties of node class passed from parsing
|
||||
function and their types
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
|
||||
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
|
||||
logger.error("Edges must be passed as type BioCypherEdge.")
|
||||
return False
|
||||
|
||||
# translate label to PascalCase
|
||||
label_pascal = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
# create file name
|
||||
file_name = os.path.join(
|
||||
self.outdir, f"{label_pascal}.{self.extension}"
|
||||
)
|
||||
|
||||
# write data in graph
|
||||
graph = Graph()
|
||||
self._init_namespaces(graph)
|
||||
|
||||
for edge in edge_list:
|
||||
rdf_subject = edge.get_source_id()
|
||||
rdf_object = edge.get_target_id()
|
||||
rdf_predicate = edge.get_id()
|
||||
rdf_properties = edge.get_properties()
|
||||
if rdf_predicate is None:
|
||||
rdf_predicate = rdf_subject + rdf_object
|
||||
|
||||
edge_label = self.translator.name_sentence_to_pascal(
|
||||
edge.get_label()
|
||||
)
|
||||
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
|
||||
graph.add((edge_uri, RDF.type, RDFS.Class))
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][rdf_predicate],
|
||||
RDF.type,
|
||||
edge_uri,
|
||||
)
|
||||
)
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][rdf_predicate],
|
||||
self.rdf_namespaces["biocypher"]["subject"],
|
||||
self.subject_to_uri(rdf_subject),
|
||||
)
|
||||
)
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][rdf_predicate],
|
||||
self.rdf_namespaces["biocypher"]["object"],
|
||||
self.subject_to_uri(rdf_object),
|
||||
)
|
||||
)
|
||||
|
||||
# add properties to the transformed edge --> node
|
||||
for key, value in rdf_properties.items():
|
||||
# only write value if it exists.
|
||||
if value:
|
||||
self.add_property_to_graph(graph, rdf_predicate, value, key)
|
||||
|
||||
graph.serialize(destination=file_name, format=self.rdf_format)
|
||||
|
||||
logger.info(
|
||||
f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def add_property_to_graph(
|
||||
self,
|
||||
graph: Graph,
|
||||
rdf_subject: str,
|
||||
rdf_object: str,
|
||||
rdf_predicate: str,
|
||||
):
|
||||
"""
|
||||
Function to add the properties to an RDF node. It takes the graph, the subject, object, and predicate of the RDF triple.
|
||||
It checks if the property is a list and adds it to the graph accordingly. Otherwise, it checks whether the string represents a list.
|
||||
If it does, it transforms it into a list and adds it to the graph. If not, it adds the property to the graph as a literal.
|
||||
If the property is neither a list nor a string, it is also added as a literal.
|
||||
|
||||
Args:
|
||||
graph (RDFLib.Graph): The RDF graph to add the nodes to.
|
||||
|
||||
rdf_subject (str): The subject of the RDF triple.
|
||||
|
||||
rdf_object (str): The object of the RDF triple.
|
||||
|
||||
rdf_predicate (str): The predicate of the RDF triple.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
if isinstance(rdf_object, list):
|
||||
for obj in rdf_object:
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
self.property_to_uri(rdf_predicate),
|
||||
Literal(obj),
|
||||
)
|
||||
)
|
||||
elif isinstance(rdf_object, str):
|
||||
if rdf_object.startswith("[") and rdf_object.endswith("]"):
|
||||
self.add_property_to_graph(
|
||||
graph,
|
||||
rdf_subject,
|
||||
self.transform_string_to_list(rdf_object),
|
||||
rdf_predicate,
|
||||
)
|
||||
else:
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
self.property_to_uri(rdf_predicate),
|
||||
Literal(rdf_object),
|
||||
)
|
||||
)
|
||||
else:
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
self.property_to_uri(rdf_predicate),
|
||||
Literal(rdf_object),
|
||||
)
|
||||
)
|
||||
|
||||
def transform_string_to_list(self, string_list: str) -> list:
|
||||
"""
|
||||
Function to transform a string representation of a list into a list.
|
||||
|
||||
Args:
|
||||
string_list (str): The string representation of the list.
|
||||
|
||||
Returns:
|
||||
list: The list representation of the input string.
|
||||
"""
|
||||
return (
|
||||
string_list.replace("[", "")
|
||||
.replace("]", "")
|
||||
.replace("'", "")
|
||||
.split(", ")
|
||||
)
|
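A small example of this normalisation, with an invented input value:

# "['GO:0005515', 'GO:0046872']" -> ['GO:0005515', 'GO:0046872']
value = "['GO:0005515', 'GO:0046872']"
as_list = value.replace("[", "").replace("]", "").replace("'", "").split(", ")
print(as_list)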
||||
|
||||
def _write_single_node_list_to_file(
|
||||
self,
|
||||
node_list: list,
|
||||
label: str,
|
||||
prop_dict: dict,
|
||||
labels: str,
|
||||
):
|
||||
"""
|
||||
This function takes a list of BioCypherNodes and writes them
|
||||
to an RDF file in the specified format.
|
||||
|
||||
Args:
|
||||
node_list (list): A list of BioCypherNodes to be written.
|
||||
|
||||
label (str): The label (type) of the nodes.
|
||||
|
||||
prop_dict (dict): A dictionary of properties and their types for the node class.
|
||||
|
||||
Returns:
|
||||
bool: True if the writing is successful, False otherwise.
|
||||
"""
|
||||
if not all(isinstance(n, BioCypherNode) for n in node_list):
|
||||
logger.error("Nodes must be passed as type BioCypherNode.")
|
||||
return False
|
||||
|
||||
# translate label to PascalCase
|
||||
label_pascal = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
# create file name
|
||||
file_name = os.path.join(
|
||||
self.outdir, f"{label_pascal}.{self.extension}"
|
||||
)
|
||||
|
||||
# write data in graph
|
||||
graph = Graph()
|
||||
self._init_namespaces(graph)
|
||||
|
||||
for n in node_list:
|
||||
rdf_subject = n.get_id()
|
||||
rdf_object = n.get_label()
|
||||
properties = n.get_properties()
|
||||
class_name = self.translator.name_sentence_to_pascal(rdf_object)
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][class_name],
|
||||
RDF.type,
|
||||
RDFS.Class,
|
||||
)
|
||||
)
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
RDF.type,
|
||||
self.rdf_namespaces["biocypher"][class_name],
|
||||
)
|
||||
)
|
||||
for key, value in properties.items():
|
||||
# only write value if it exists.
|
||||
if value:
|
||||
self.add_property_to_graph(graph, rdf_subject, value, key)
|
||||
|
||||
graph.serialize(destination=file_name, format=self.rdf_format)
|
||||
|
||||
logger.info(
|
||||
f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def write_nodes(
|
||||
self, nodes, batch_size: int = int(1e6), force: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
|
||||
|
||||
Args:
|
||||
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
|
||||
batch_size (int): The number of nodes to write in each batch.
|
||||
force (bool): Flag to force the writing even if the output file already exists.
|
||||
|
||||
Returns:
|
||||
bool: True if the writing is successful, False otherwise.
|
||||
"""
|
||||
# check if specified output format is correct
|
||||
passed = self._is_rdf_format_supported(self.rdf_format)
|
||||
if not passed:
|
||||
logger.error("Error while writing node data, wrong RDF format")
|
||||
return False
|
||||
# write node data using _write_node_data method
|
||||
passed = self._write_node_data(nodes, batch_size, force)
|
||||
if not passed:
|
||||
logger.error("Error while writing node data.")
|
||||
return False
|
||||
return True
|
||||
|
||||
def write_edges(
|
||||
self,
|
||||
edges: Union[list, GeneratorType],
|
||||
batch_size: int = int(1e6),
|
||||
) -> bool:
|
||||
"""
|
||||
Wrapper for writing edges in RDF format. It calls _write_edge_data()
|
||||
function, specifying its edge data.
|
||||
|
||||
Args:
|
||||
edges (BioCypherEdge): a list or generator of edges in
|
||||
:py:class:`BioCypherEdge` format
|
||||
batch_size (int): The number of edges to write in each batch.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# check if specified output format is correct
|
||||
passed = self._is_rdf_format_supported(self.rdf_format)
|
||||
if not passed:
|
||||
logger.error("Error while writing edge data, wrong RDF format")
|
||||
return False
|
||||
# write edge data using _write_edge_data method
|
||||
passed = self._write_edge_data(edges, batch_size=batch_size)
|
||||
if not passed:
|
||||
logger.error("Error while writing edge data.")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to write the import call.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
str: An empty string, as no import call is needed for RDF.
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _write_array_string(self, string_list):
|
||||
"""
|
||||
Abstract method to write the string representation of an array into a .csv file
|
||||
as required by the RDF admin-import.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Args:
|
||||
string_list (list): list of ontology strings
|
||||
|
||||
Returns:
|
||||
bool: Always True; arrays need no special handling for RDF output.
|
||||
"""
|
||||
|
||||
return True
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Abstract method that takes care of importing properties of a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Abstract method to write a database import-file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
def subject_to_uri(self, subject: str) -> str:
|
||||
"""
|
||||
Converts the subject to a proper URI using the available namespaces.
|
||||
If the conversion fails, it defaults to the biocypher prefix.
|
||||
|
||||
Args:
|
||||
subject (str): The subject to be converted to a URI.
|
||||
|
||||
Returns:
|
||||
str: The corresponding URI for the subject.
|
||||
"""
|
||||
try:
|
||||
_pref, _id = subject.split(":")
|
||||
|
||||
if _pref in self.rdf_namespaces.keys():
|
||||
return self.rdf_namespaces[_pref][_id]
|
||||
else:
|
||||
return self.rdf_namespaces["biocypher"][subject]
|
||||
except ValueError:
|
||||
return self.rdf_namespaces["biocypher"][subject]
|
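The prefix handling can be illustrated with two invented namespaces; a known CURIE prefix resolves against its namespace, and anything else falls back to the biocypher namespace, as in the method above.

from rdflib import Namespace

rdf_namespaces = {
    "biocypher": Namespace("https://biocypher.org/biocypher#"),
    "uniprot": Namespace("https://purl.uniprot.org/uniprot/"),  # assumed prefix
}
print(rdf_namespaces["uniprot"]["P04637"])     # known prefix: uniprot:P04637
print(rdf_namespaces["biocypher"]["some_id"])  # fallback for unknown prefixes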
||||
|
||||
def property_to_uri(self, property_name: str) -> dict[str, str]:
|
||||
"""
|
||||
Converts a property name to its corresponding URI.
|
||||
|
||||
This function takes a property name and searches for its corresponding URI in various namespaces.
|
||||
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
|
||||
|
||||
Args:
|
||||
property_name (str): The property name to be converted to a URI.
|
||||
|
||||
Returns:
|
||||
str: The corresponding URI for the input property name.
|
||||
"""
|
||||
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
|
||||
for namespace in _NAMESPACE_PREFIXES_CORE.values():
|
||||
if property_name in namespace:
|
||||
return namespace[property_name]
|
||||
|
||||
# If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
|
||||
for namespace in [SKOS, DC, DCTERMS]:
|
||||
if property_name in namespace:
|
||||
return namespace[property_name]
|
||||
|
||||
# If the property name is still not found, try other namespaces from rdflib.
|
||||
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
|
||||
if property_name in namespace:
|
||||
return namespace[property_name]
|
||||
|
||||
# If the property name is "licence", it recursively calls the function with "license" as the input.
|
||||
if property_name == "licence":
|
||||
return self.property_to_uri("license")
|
||||
|
||||
# TODO: add an option to search through manually implemented namespaces
|
||||
|
||||
# If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
|
||||
# TODO: give a warning and try to prevent this option altogether
|
||||
return self.rdf_namespaces["biocypher"][property_name]
|
||||
|
||||
def _init_namespaces(self, graph: Graph):
|
||||
"""
|
||||
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
|
||||
|
||||
This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
|
||||
If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
|
||||
the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.
|
||||
|
||||
Args:
|
||||
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
# add biocypher standard to self.rdf_namespaces
|
||||
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
|
||||
if not self.rdf_namespaces:
|
||||
self.rdf_namespaces = biocypher_standard
|
||||
else:
|
||||
self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
|
||||
|
||||
for key, value in self.rdf_namespaces.items():
|
||||
namespace = Namespace(value)
|
||||
self.rdf_namespaces[key] = namespace
|
||||
graph.bind(key, namespace)
|
0
biocypher/output/write/relational/__init__.py
Normal file
76
biocypher/output/write/relational/_csv.py
Normal file
@ -0,0 +1,76 @@
|
||||
from more_itertools import peekable
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._writer import _Writer
|
||||
from biocypher.output.in_memory._pandas import Pandas
|
||||
|
||||
|
||||
class _PandasCSVWriter(_Writer):
|
||||
"""
|
||||
Class for writing node and edge representations to a CSV file.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, write_to_file: bool = True, **kwargs):
|
||||
kwargs["write_to_file"] = write_to_file
|
||||
super().__init__(*args, **kwargs)
|
||||
self.in_memory_dfs = {}
|
||||
self.stored_dfs = {}
|
||||
self.pandas_in_memory = Pandas(
|
||||
translator=self.translator,
|
||||
deduplicator=self.deduplicator,
|
||||
)
|
||||
self.delimiter = kwargs.get("delimiter")
|
||||
if not self.delimiter:
|
||||
self.delimiter = ","
|
||||
self.write_to_file = write_to_file
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
|
||||
|
||||
Returns:
|
||||
str: Python code to load the csv files into Pandas dfs.
|
||||
"""
|
||||
import_call = "import pandas as pd\n\n"
|
||||
for df_name in self.stored_dfs.keys():
|
||||
import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
|
||||
return import_call
|
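For reference, a hedged sketch of what the generated import_pandas_csv.py could contain for a single stored DataFrame; the name "Protein" is invented.

# Illustrative only: content of the generated script for one stored DataFrame.
generated = (
    "import pandas as pd\n\n"
    "Protein = pd.read_csv('./Protein.csv', header=0, index_col=0)\n"
)
print(generated)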
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""Function to return the name of the import script."""
|
||||
return "import_pandas_csv.py"
|
||||
|
||||
def _write_node_data(self, nodes) -> bool:
|
||||
passed = self._write_entities_to_file(nodes)
|
||||
return passed
|
||||
|
||||
def _write_edge_data(self, edges) -> bool:
|
||||
passed = self._write_entities_to_file(edges)
|
||||
return passed
|
||||
|
||||
def _write_entities_to_file(self, entities: iter) -> bool:
|
||||
"""Function to output.write the entities to a CSV file.
|
||||
|
||||
Args:
|
||||
entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
||||
"""
|
||||
entities = peekable(entities)
|
||||
entity_list = self.pandas_in_memory._separate_entity_types(entities)
|
||||
for entity_type, entities in entity_list.items():
|
||||
self.in_memory_dfs[
|
||||
entity_type
|
||||
] = self.pandas_in_memory._add_entity_df(entity_type, entities)
|
||||
for entity_type in self.in_memory_dfs.keys():
|
||||
entity_df = self.in_memory_dfs[entity_type]
|
||||
if " " in entity_type or "." in entity_type:
|
||||
entity_type = entity_type.replace(" ", "_").replace(".", "_")
|
||||
if self.write_to_file:
|
||||
logger.info(
|
||||
f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
|
||||
)
|
||||
entity_df.to_csv(
|
||||
f"{self.output_directory}/{entity_type}.csv",
|
||||
sep=self.delimiter,
|
||||
)
|
||||
self.stored_dfs[entity_type] = entity_df
|
||||
self.in_memory_dfs = {}
|
||||
return True
|
320
biocypher/output/write/relational/_postgresql.py
Normal file
@ -0,0 +1,320 @@
|
||||
import os
|
||||
import glob
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._batch_writer import _BatchWriter
|
||||
|
||||
|
||||
class _PostgreSQLBatchWriter(_BatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to disk using the
|
||||
format specified by PostgreSQL for the use of "COPY FROM...". Each batch
|
||||
writer instance has a fixed representation that needs to be passed
|
||||
at instantiation via the :py:attr:`schema` argument. The instance
|
||||
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
|
||||
to convert and extend the hierarchy.
|
||||
|
||||
This class inherits from the abstract class "_BatchWriter" and implements the
|
||||
PostgreSQL-specific methods:
|
||||
|
||||
- _write_node_headers
|
||||
- _write_edge_headers
|
||||
- _construct_import_call
|
||||
- _write_array_string
|
||||
"""
|
||||
|
||||
DATA_TYPE_LOOKUP = {
|
||||
"str": "VARCHAR", # VARCHAR needs limit
|
||||
"int": "INTEGER",
|
||||
"long": "BIGINT",
|
||||
"float": "NUMERIC",
|
||||
"double": "NUMERIC",
|
||||
"dbl": "NUMERIC",
|
||||
"boolean": "BOOLEAN",
|
||||
"str[]": "VARCHAR[]",
|
||||
"string[]": "VARCHAR[]",
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._copy_from_csv_commands = set()
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the psql command
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _get_data_type(self, string) -> str:
|
||||
try:
|
||||
return self.DATA_TYPE_LOOKUP[string]
|
||||
except KeyError:
|
||||
logger.info(
|
||||
'Could not determine data type {string}. Using default "VARCHAR"'
|
||||
)
|
||||
return "VARCHAR"
|
||||
|
||||
def _write_array_string(self, string_list) -> str:
|
||||
"""
|
||||
Method to write the string representation of an array into a .csv file
|
||||
as required by the postgresql COPY command, with '{','}' brackets and ',' separation.
|
||||
|
||||
Args:
|
||||
string_list (list): list of ontology strings
|
||||
|
||||
Returns:
|
||||
str: The string representation of an array for postgres COPY
|
||||
"""
|
||||
string = ",".join(string_list)
|
||||
string = f'"{{{string}}}"'
|
||||
return string
|
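A short example of the array formatting for the PostgreSQL COPY command, with invented values:

# ["kinase", "enzyme"] -> "{kinase,enzyme}", quoted so COPY reads it as one field
string_list = ["kinase", "enzyme"]
string = ",".join(string_list)
print(f'"{{{string}}}"')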
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the psql import script
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return f"{self.db_name}-import-call.sh"
|
||||
|
||||
def _adjust_pascal_to_psql(self, string):
|
||||
string = string.replace(".", "_")
|
||||
string = string.lower()
|
||||
return string
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Writes a single CSV file for a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of node.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.node_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.node_property_dict.items():
|
||||
# create header CSV with ID, properties, labels
|
||||
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
parts = f"{pascal_label}-part*.csv"
|
||||
parts_paths = os.path.join(self.outdir, parts)
|
||||
parts_paths = glob.glob(parts_paths)
|
||||
parts_paths.sort()
|
||||
|
||||
# adjust label for import to psql
|
||||
pascal_label = self._adjust_pascal_to_psql(pascal_label)
|
||||
table_create_command_path = os.path.join(
|
||||
self.outdir,
|
||||
f"{pascal_label}-create_table.sql",
|
||||
)
|
||||
|
||||
# check if file already exists
|
||||
if os.path.exists(table_create_command_path):
|
||||
logger.warning(
|
||||
f"File {table_create_command_path} already exists. Overwriting.",
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
columns = ["_ID VARCHAR"]
|
||||
for col_name, col_type in props.items():
|
||||
col_type = self._get_data_type(col_type)
|
||||
col_name = self._adjust_pascal_to_psql(col_name)
|
||||
columns.append(f"{col_name} {col_type}")
|
||||
columns.append("_LABEL VARCHAR[]")
|
||||
|
||||
with open(table_create_command_path, "w", encoding="utf-8") as f:
|
||||
command = ""
|
||||
if self.wipe:
|
||||
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
|
||||
|
||||
# table creation requires comma separation
|
||||
command += (
|
||||
f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
|
||||
)
|
||||
f.write(command)
|
||||
|
||||
for parts_path in parts_paths:
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
parts_path = parts_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self._copy_from_csv_commands.add(
|
||||
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
|
||||
)
|
||||
|
||||
# add file path to import statement
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
table_create_command_path = table_create_command_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self.import_call_nodes.add(table_create_command_path)
|
||||
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Writes a single CSV file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.edge_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.edge_property_dict.items():
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
|
||||
parts_paths = glob.glob(parts_paths)
|
||||
parts_paths.sort()
|
||||
|
||||
# adjust label for import to psql
|
||||
pascal_label = self._adjust_pascal_to_psql(pascal_label)
|
||||
table_create_command_path = os.path.join(
|
||||
self.outdir,
|
||||
f"{pascal_label}-create_table.sql",
|
||||
)
|
||||
|
||||
# check if the file already exists
|
||||
if os.path.exists(table_create_command_path):
|
||||
logger.warning(
|
||||
f"File {table_create_command_path} already exists. Overwriting.",
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
columns = []
|
||||
for col_name, col_type in props.items():
|
||||
col_type = self._get_data_type(col_type)
|
||||
col_name = self._adjust_pascal_to_psql(col_name)
|
||||
if col_name == "_ID":
|
||||
# should ideally never happen
|
||||
raise ValueError(
|
||||
"Column name '_ID' is reserved for internal use, "
|
||||
"denoting the relationship ID. Please choose a "
|
||||
"different name for your column."
|
||||
)
|
||||
|
||||
columns.append(f"{col_name} {col_type}")
|
||||
|
||||
# create list of lists and flatten
|
||||
# removes need for empty check of property list
|
||||
out_list = [
|
||||
"_START_ID VARCHAR",
|
||||
"_ID VARCHAR",
|
||||
*columns,
|
||||
"_END_ID VARCHAR",
|
||||
"_TYPE VARCHAR",
|
||||
]
|
||||
|
||||
with open(table_create_command_path, "w", encoding="utf-8") as f:
|
||||
command = ""
|
||||
if self.wipe:
|
||||
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
|
||||
|
||||
# table creation requires comma separation
|
||||
command += (
|
||||
f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
|
||||
)
|
||||
f.write(command)
|
||||
|
||||
for parts_path in parts_paths:
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
parts_path = parts_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self._copy_from_csv_commands.add(
|
||||
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
|
||||
)
|
||||
|
||||
# add file path to import statement
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
table_create_command_path = table_create_command_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self.import_call_edges.add(table_create_command_path)
|
||||
|
||||
return True
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for postgresql import
|
||||
"""
|
||||
import_call = ""
|
||||
|
||||
# create tables
|
||||
# At this point, csv files of nodes and edges do not require differentiation
|
||||
for import_file_path in [
|
||||
*self.import_call_nodes,
|
||||
*self.import_call_edges,
|
||||
]:
|
||||
import_call += f'echo "Setup {import_file_path}..."\n'
|
||||
if self.db_password:
|
||||
# set password variable inline
|
||||
import_call += f"PGPASSWORD={self.db_password} "
|
||||
import_call += (
|
||||
f"{self.import_call_bin_prefix}psql -f {import_file_path}"
|
||||
)
|
||||
import_call += f" --dbname {self.db_name}"
|
||||
import_call += f" --host {self.db_host}"
|
||||
import_call += f" --port {self.db_port}"
|
||||
import_call += f" --user {self.db_user}"
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
# copy data to tables
|
||||
for command in self._copy_from_csv_commands:
|
||||
table_part = command.split(" ")[3]
|
||||
import_call += f'echo "Importing {table_part}..."\n'
|
||||
if self.db_password:
|
||||
# set password variable inline
|
||||
import_call += f"PGPASSWORD={self.db_password} "
|
||||
import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
|
||||
import_call += f" --dbname {self.db_name}"
|
||||
import_call += f" --host {self.db_host}"
|
||||
import_call += f" --port {self.db_port}"
|
||||
import_call += f" --user {self.db_user}"
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
return import_call
|
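As a hedged illustration, one block of the generated PostgreSQL import script could look like the following; the table name, password and connection parameters are invented.

# Illustrative only; all values are assumptions.
example_block = (
    'echo "Setup import/protein-create_table.sql..."\n'
    "PGPASSWORD=secret psql -f import/protein-create_table.sql"
    " --dbname biocypher --host localhost --port 5432 --user postgres"
    '\necho "Done!"\n'
)
print(example_block)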
51
biocypher/output/write/relational/_sqlite.py
Normal file
@ -0,0 +1,51 @@
|
||||
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
|
||||
|
||||
|
||||
class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to a SQLite database.
|
||||
It uses the _PostgreSQLBatchWriter class under the hood, which already
|
||||
implements the logic to write the nodes/edges to a relational DBMS.
|
||||
Only the import bash script differs between PostgreSQL and SQLite
|
||||
and is therefore implemented in this class.
|
||||
|
||||
- _construct_import_call
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for sqlite import
|
||||
"""
|
||||
import_call = ""
|
||||
|
||||
# create tables
|
||||
# At this point, csv files of nodes and edges do not require differentiation
|
||||
for import_file_path in [
|
||||
*self.import_call_nodes,
|
||||
*self.import_call_edges,
|
||||
]:
|
||||
import_call += f'echo "Setup {import_file_path}..."\n'
|
||||
import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
for command in self._copy_from_csv_commands:
|
||||
table_name = command.split(" ")[1]
|
||||
table_part = command.split(" ")[3].replace("'", "")
|
||||
import_call += f'echo "Importing {table_part}..."\n'
|
||||
separator = self.delim
|
||||
import_part = f".import {table_part} {table_name}"
|
||||
import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
return import_call
|
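For comparison, the SQLite variant emits two kinds of lines per type, sketched here with invented file and table names.

# Illustrative only; database, file and table names are assumptions.
setup_line = "sqlite3 biocypher.db < import/protein-create_table.sql"
load_line = (
    "sqlite3 -separator $';' biocypher.db "
    '".import import/Protein-part000.csv protein"'
)
print(setup_line)
print(load_line)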