release commit

Tom Gebhardt 2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions

10
.bumpversion.cfg Normal file

@@ -0,0 +1,10 @@
[bumpversion]
current_version = 0.6.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
serialize = {major}.{minor}.{patch}
[bumpversion:file:pyproject.toml]
[bumpversion:file:biocypher/_metadata.py]

13
.env.example Normal file

@@ -0,0 +1,13 @@
MODE=testserver
COMPLEX_PATIENTS=TRUE
FHIR_SERVER_URL=http://hapi.fhir.org/baseR4
#FHIR_SERVER_USER=
#FHIR_SERVER_PW=
#HTTP_PROXY=
#HTTPS_PROXY=
#NO_PROXY=
NUMBER_OF_PATIENTS=100
BATCH_SIZE=35


@@ -0,0 +1,41 @@
name: "Test and code quality"
description: "Run tests and code quality checks"
inputs:
NEO4J_VERSION:
description: "Neo4j version"
runs:
using: "composite"
steps:
#----------------------------------------------
# setup docker containers for testing
#----------------------------------------------
# currently only running on Linux due to technical limitations
# - name: Install Docker
# uses: douglascamata/setup-docker-macos-action@v1-alpha
# if: ${{ runner.os == 'macOS' }}
- name: Start Neo4j Docker
run: docker run --restart always --publish=7474:7474 --publish=7687:7687 --env NEO4J_AUTH=neo4j/your_password_here --env NEO4J_PLUGINS='["apoc"]' --env=NEO4J_ACCEPT_LICENSE_AGREEMENT=yes -d neo4j:${{ inputs.NEO4J_VERSION }}
shell: bash
if: ${{ runner.os == 'Linux' }}
- name: Start Postgres Docker
run: docker run --restart always --publish=5432:5432 --env POSTGRES_PASSWORD=postgres -d postgres:11.21-bullseye
shell: bash
if: ${{ runner.os == 'Linux' }}
#----------------------------------------------
# run tests and code quality checks
#----------------------------------------------
- name: Run Tests (Windows)
run: |
poetry run pytest --version
poetry run pytest --password=your_password_here
shell: bash
if: runner.os == 'Windows'
- name: Run tests (Linux and MacOS)
run: |
poetry run pytest --version
poetry run pytest --password=your_password_here
shell: bash
if: runner.os != 'Windows'
- name: Check code quality
uses: pre-commit/action@v3.0.0

33
.gitignore vendored Normal file

@@ -0,0 +1,33 @@
*~
*__pycache__
build/
docs/pypath_log/
docs/_build/
docs/biocypher-log/
docs/modules/
docs/notebooks/*.yaml
docs/notebooks/*.py
.DS_Store
.vscode
biocypher.egg-info/
*.egg
dist/
*.prof
*.coverage
*.pickle
out/*
biocypher-log/*
biocypher-out/*
*.log
dist/*
*.pye
*.pyc
*.kate-swp
.hypothesis/
.venv/
.empty
.pytest_cache
*.graphml
.idea/*
.cache
*.iml

3
.gitmodules vendored Normal file

@@ -0,0 +1,3 @@
[submodule "networkx-based"]
path = networkx-based
url = git@git.uni-greifswald.de:MeDaX/networkx-based.git

50
.pre-commit-config.yaml Normal file

@@ -0,0 +1,50 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
fail_fast: false
default_language_version:
python: python3
default_stages:
- commit
- push
minimum_pre_commit_version: 2.7.1
repos:
- repo: https://github.com/ambv/black
rev: 23.7.0
hooks:
- id: black
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
hooks:
- id: isort
additional_dependencies: [toml]
- repo: https://github.com/snok/pep585-upgrade
rev: v1.0
hooks:
- id: upgrade-type-hints
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: check-docstring-first
- id: end-of-file-fixer
- id: check-added-large-files
- id: mixed-line-ending
- id: trailing-whitespace
exclude: ^.bumpversion.cfg$
- id: check-merge-conflict
- id: check-case-conflict
- id: check-symlinks
- id: check-yaml
args: [--unsafe]
- id: check-ast
- id: fix-encoding-pragma
args: [--remove] # for Python3 codebase, it's not necessary
- id: requirements-txt-fixer
- repo: https://github.com/pre-commit/pygrep-hooks
rev: v1.10.0
hooks:
- id: python-no-eval
- id: python-use-type-annotations
- id: python-check-blanket-noqa
- id: rst-backticks
- id: rst-directive-colons
- id: rst-inline-touching-normal

36
Dockerfile Normal file

@@ -0,0 +1,36 @@
FROM python:3.9-slim
WORKDIR /app
# Copy requirements file if you have one
COPY requirements.txt .
RUN pip install --upgrade packaging
RUN pip install -r requirements.txt
# Install poetry
RUN pip install --no-cache-dir "poetry<2.0.0"
# Copy .env file
COPY .env ./
# Copy only pyproject.toml and poetry.lock (if exists) first
COPY pyproject.toml ./
COPY poetry.lock* ./
# Configure poetry to not create a virtual environment inside the container
RUN poetry config virtualenvs.create false
# Install dependencies
RUN poetry install --no-dev --no-interaction --no-ansi
# Copy your project files
COPY . .
# Make the entrypoint script executable
COPY entrypoint.sh .
RUN chmod +x entrypoint.sh
RUN sed -i 's/\r$//' /app/entrypoint.sh
ENTRYPOINT ["/app/entrypoint.sh"]

22
LICENSE Normal file

@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2022 Saez Lab
Copyright (c) 2025 MeDaX research group
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

110
README.md Normal file

@@ -0,0 +1,110 @@
# MeDaX Pipeline
## 📋 Description
The MeDaX pipeline transforms healthcare data from FHIR databases into Neo4j graph databases. This conversion enables efficient searching, querying, and analyses of interconnected health data that would otherwise be complex to retrieve using traditional SQL databases.
## ✨ Features
- Seamless conversion from FHIR to Neo4j graph structure
- Support for patient-centric data retrieval using FHIR's `$everything` operation
- Configurable batch processing for handling large datasets
- Docker-based deployment for easy setup and portability
- Compatible with public FHIR servers (e.g., HAPI FHIR) and private authenticated instances
## ⚙️ Prerequisites
- [Docker](https://docs.docker.com/engine/install/) with the [Docker Compose plugin](https://docs.docker.com/compose/install/linux/)
- A FHIR database with API access and the `$everything` operation enabled for retrieving patient data
- Alternatively: Use a public FHIR server such as [HAPI FHIR](https://hapi.fhir.org/) (default configuration)
## 🚀 Installation
### Setup
1. Clone this repository
2. Create an environment configuration file by copying `.env.example` to `.env` (see the sketch after this list)
3. Configure the environment variables in `.env`:
- For HAPI test server (default): No changes needed
- For custom FHIR server:
- Set `MODE` to any value other than `testserver`
- Uncomment and set the `FHIR_SERVER_URL`, `FHIR_SERVER_USER`, and `FHIR_SERVER_PW` variables
- Adjust `BATCH_SIZE` and `NUMBER_OF_PATIENTS` according to your needs
- Configure any required proxy settings
4. If needed, modify proxy settings in the `Dockerfile`
- Uncomment and set proxy variables
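A minimal sketch of the setup steps above, assuming a POSIX shell; the clone URL, the target directory, and `$EDITOR` are placeholders for your own values:
```bash
# 1. Clone this repository (URL and directory are placeholders)
git clone <repository-url> medax-pipeline
cd medax-pipeline

# 2. Create the environment configuration from the provided template
cp .env.example .env

# 3. Adjust the variables in .env, or keep the defaults for the HAPI test server
$EDITOR .env
```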
### Running the Pipeline
**Start the containers:**
```bash
docker compose up --build
```
**Stop and clean up (between runs):**
```bash
docker compose down --volumes
```
**Complete removal (containers and images):**
```bash
docker compose down --volumes --rmi all
```
> **Note:** Depending on your Docker installation, you might need to use `docker-compose` instead of `docker compose`.
## 🔍 Accessing the Neo4j Database
Once the pipeline has completed processing, you can access the Neo4j database:
1. Open your browser and navigate to `http://localhost:8080/`
2. Connect using the following credentials:
- Username: neo4j
- Password: neo4j
3. Set a new password when prompted and store it in a secure password manager
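As an alternative to the browser UI, you can run queries directly inside the Neo4j container. This is a hedged sketch: the container name and password are placeholders (look the name up with `docker ps`), and it assumes the container image provides `cypher-shell`, as the official Neo4j images do:
```bash
# List running containers to find the Neo4j container name
docker ps
# Open an interactive Cypher shell inside it (use the password you just set)
docker exec -it <neo4j-container-name> cypher-shell -u neo4j -p <your-new-password>
```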
## 📊 Example Queries
Here are some basic Cypher queries to get you started with exploring your health data:
```cypher
// Count all nodes by type
MATCH (n) RETURN labels(n) as NodeType, count(*) as Count;
// Find all records for a specific patient
MATCH (p:Patient {id: 'patient-id'})-[r]-(connected)
RETURN p, r, connected;
// Retrieve all medication prescriptions
MATCH (m:Medication)-[r]-(p:Patient)
RETURN m, r, p;
```
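To get an overview of what the pipeline actually produced for your data, the following additional queries may help; the first is schema-agnostic and works on any Neo4j graph, the second reuses the `Patient` label from the examples above:
```cypher
// Count all relationships by type
MATCH ()-[r]->()
RETURN type(r) AS RelationshipType, count(*) AS Count
ORDER BY Count DESC;

// Inspect which properties a Patient node carries
MATCH (p:Patient)
RETURN keys(p) AS PatientProperties
LIMIT 1;
```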
## ❓ Troubleshooting
**Common Issues:**
- **Connection refused to FHIR server**: Check your network settings and ensure the FHIR server is accessible from within the Docker container.
- **Authentication failures**: Verify your credentials in the `.env` file.
- **Container startup failures**: Ensure all required Docker ports are available and not used by other applications.
- **No data found in FHIR bundle**: Ensure that the FHIR server is up and returning patient data. Try setting the `COMPLEX_PATIENTS` variable to `FALSE` in your `.env` file; some FHIR servers might not support the FHIR search logic.
## 📚 Architecture
The MeDaX pipeline consists of the following components:
1. **FHIR Client**: Connects to the FHIR server and retrieves patient data
2. **Data Transformer**: Converts FHIR resources into graph entities and relationships
3. **Reference Processor**: Converts references to relationships
4. **BioCypher Adapter**: Prepares the transformed data for Neo4j admin import
5. **Neo4j Database**: Stores and serves the graph representation of the health data
## ✍️ Citation
If you use the MeDaX pipeline in your research, please cite DOI [10.5281/zenodo.15229077](https://doi.org/10.5281/zenodo.15229077).
## 🙏 Acknowledgements
- We are leveraging [BioCypher](https://biocypher.org) [![DOI](https://zenodo.org/badge/DOI/10.1038/s41587-023-01848-y.svg)](https://doi.org/10.1038/s41587-023-01848-y) to create the Neo4j admin input.
- Remark: We made slight adjustments to BioCypher's code to support batching.
- We used BioCypher's git template as a starting point for our development:
- Lobentanzer, S., BioCypher Consortium, & Saez-Rodriguez, J. Democratizing knowledge representation with BioCypher [Computer software]. https://github.com/biocypher/biocypher

41
biocypher/__init__.py Normal file

@@ -0,0 +1,41 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher: a unifying framework for biomedical knowledge graphs.
"""
__all__ = [
"__version__",
"__author__",
"module_data",
"config",
"logfile",
"log",
"Driver",
"BioCypher",
"Resource",
]
from ._get import Resource
from ._core import BioCypher
from ._config import config, module_data
from ._logger import log, logger, logfile
from ._metadata import __author__, __version__
class Driver(BioCypher):
# initialise parent class but log a warning
def __init__(self, *args, **kwargs):
logger.warning(
"The class `Driver` is deprecated and will be removed in a future "
"release. Please use `BioCypher` instead."
)
super().__init__(*args, **kwargs)


@@ -0,0 +1,148 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Module data directory, including:
* The BioLink database schema
* The default config files
"""
from typing import Any, Optional
import os
import warnings
import yaml
import appdirs
__all__ = ["module_data", "module_data_path", "read_config", "config", "reset"]
_USER_CONFIG_DIR = appdirs.user_config_dir("biocypher", "saezlab")
_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, "conf.yaml")
class MyLoader(yaml.SafeLoader):
def construct_scalar(self, node):
# Check if the scalar contains double quotes and an escape sequence
value = super().construct_scalar(node)
q = bool(node.style == '"')
b = bool("\\" in value.encode("unicode_escape").decode("utf-8"))
if q and b:
warnings.warn(
(
"Double quotes detected in YAML configuration scalar: "
f"{value.encode('unicode_escape')}. "
"These allow escape sequences and may cause problems, for "
"instance with the Neo4j admin import files (e.g. '\\t'). "
"Make sure you wanted to do this, and use single quotes "
"whenever possible."
),
category=UserWarning,
)
return value
def module_data_path(name: str) -> str:
"""
Absolute path to a YAML file shipped with the module.
"""
here = os.path.dirname(os.path.abspath(__file__))
return os.path.join(here, f"{name}.yaml")
def module_data(name: str) -> Any:
"""
Retrieve the contents of a YAML file shipped with this module.
"""
path = module_data_path(name)
return _read_yaml(path)
def _read_yaml(path: str) -> Optional[dict]:
if os.path.exists(path):
with open(path, "r") as fp:
return yaml.load(fp.read(), Loader=MyLoader)
def read_config() -> dict:
"""
Read the module config.
Read and merge the built-in default, the user-level, and the directory-level
configuration, with the latter taking precedence over the former.
TODO explain path configuration
"""
defaults = module_data("biocypher_config")
user = _read_yaml(_USER_CONFIG_FILE) or {}
# TODO account for .yml?
local = (
_read_yaml("biocypher_config.yaml")
or _read_yaml("config/biocypher_config.yaml")
or {}
)
for key in defaults:
value = (
local[key] if key in local else user[key] if key in user else None
)
if value is not None:
if isinstance(
defaults[key], str
): # first level config (like title)
defaults[key] = value
else:
defaults[key].update(value)
return defaults
def config(*args, **kwargs) -> Optional[Any]:
"""
Set or get module config parameters.
"""
if args and kwargs:
raise ValueError(
"Setting and getting values in the same call is not allowed.",
)
if args:
result = tuple(globals()["_config"].get(key, None) for key in args)
return result[0] if len(result) == 1 else result
for key, value in kwargs.items():
globals()["_config"][key].update(value)
def reset():
"""
Reload configuration from the config files.
"""
globals()["_config"] = read_config()
reset()
def update_from_file(path: str):
"""
Update the module configuration from a YAML file.
"""
config(**_read_yaml(path))


@@ -0,0 +1,141 @@
Title: BioCypher python module configuration file
## Some options are not used by default. Uncomment them to use them.
biocypher:
### Required parameters ###
## DBMS type
dbms: neo4j
## Schema configuration
# schema_config_path: config/schema_config.yaml
## Offline mode: do not connect to a running DBMS instance
## Can be used e.g. for writing batch import files
offline: true
## Strict mode: do not allow creating new nodes or relationships without
## specifying source, version, and license parameters
strict_mode: false
## Ontology configuration
head_ontology:
url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
root_node: entity
# switch_label_and_id: true
### Optional parameters ###
## Logging
# Write log to disk
log_to_disk: true
# Activate more granular logging
debug: true
# Change the log directory
# log_directory: biocypher-log
## Data output directory
# output_directory: biocypher-out
## Resource cache directory
# cache_directory: .cache
## Optional tail ontologies
# tail_ontologies:
# so:
# url: test/ontologies/so.owl
# head_join_node: sequence variant
# tail_join_node: sequence_variant
# switch_label_and_id: true
# mondo:
# url: test/ontologies/mondo.owl
# head_join_node: disease
# tail_join_node: disease
# switch_label_and_id: true
### DBMS configuration ###
neo4j:
### Neo4j configuration ###
## Database name
database_name: neo4j
## Wipe DB before import (offline mode: --force)
wipe: true
## Neo4j authentication
uri: neo4j://localhost:7687
user: neo4j
password: neo4j
## Neo4j admin import batch writer settings
delimiter: ";"
array_delimiter: "|"
quote_character: "'"
## MultiDB functionality
## Set to false for using community edition or older versions of Neo4j
multi_db: true
## Import options
skip_duplicate_nodes: false
skip_bad_relationships: false
## Import call prefixes
# import_call_bin_prefix: bin/
# import_call_file_prefix: path/to/files/
postgresql:
### PostgreSQL configuration ###
# PostgreSQL connection credentials
database_name: postgres # DB name
user: postgres # user name
password: postgres # password
host: localhost # host
port: 5432 # port
# PostgreSQL import batch writer settings
quote_character: '"'
delimiter: '\t'
# import_call_bin_prefix: '' # path to "psql"
# import_call_file_prefix: '/path/to/files'
rdf:
### RDF configuration ###
rdf_format: turtle
sqlite:
### SQLite configuration ###
# SQLite connection credentials
database_name: sqlite.db # DB name
# SQLite import batch writer settings
quote_character: '"'
delimiter: '\t'
# import_call_bin_prefix: '' # path to "sqlite3"
# import_call_file_prefix: '/path/to/files'
csv:
### CSV/Pandas configuration ###
delimiter: ","
networkx:
### NetworkX configuration ###
some_config: some_value # placeholder for technical reasons TODO


@@ -0,0 +1,5 @@
# We test the quote detection
valid: 'This is a valid string'
also_valid: "This is also a valid string"
invalid: "\t"


@@ -0,0 +1,140 @@
Title: BioCypher graph schema configuration file
# ---
# "Named Things"
# ---
protein:
represented_as: node
preferred_id: uniprot
input_label: protein
db_collection_name: proteins
properties:
name: str
score: float
taxon: int
genes: str[]
microRNA:
represented_as: node
preferred_id: mirbase.mature
input_label: mirna
complex:
synonym_for: macromolecular complex
represented_as: node
preferred_id: complexportal
input_label: complex
pathway:
represented_as: node
preferred_id: [reactome, wikipathways]
input_label: [reactome, wikipathways]
gene:
represented_as: node
preferred_id: hgnc
input_label: [hgnc, ensg]
exclude_properties: accession
disease:
represented_as: node
preferred_id: doid
input_label: Disease
side effect:
is_a: phenotypic feature
represented_as: node
preferred_id: sider.effect
input_label: sider
sequence variant:
represented_as: node
preferred_id: [clinically relevant, known, somatic]
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
properties:
source: str
original_source: str
effect: str
biotype: str
snRNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: [intact, rnacentral]
input_label: [intact_snrna, rnacentral_snrna]
properties:
ac: str
fullName: str
shortName: str
preferredName: str
exclude_properties: sequence
DNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: ensembl
input_label: dna
properties:
ac: str
fullName: str
shortName: str
preferredName: str
sequence: str
dsDNA sequence:
is_a: [DNA sequence, nucleic acid entity]
inherit_properties: True
represented_as: node
preferred_id: [intact, uniparc]
input_label: [intact_dsdna, uniprot_archive_dsdna]
# ---
# Associations
# ---
post translational interaction:
is_a: pairwise molecular interaction
represented_as: node
label_as_edge: INTERACTS_POST_TRANSLATIONAL
input_label: post_translational
phosphorylation:
is_a: post translational interaction
represented_as: edge
input_label: phosphorylation
gene to disease association:
represented_as: edge
label_as_edge: PERTURBED_IN_DISEASE
input_label: [protein_disease, gene_disease]
exclude_properties: accession
mutation to tissue association:
is_a: [genotype to tissue association, entity to tissue association, association]
represented_as: edge
label_as_edge: Is_Mutated_In
input_label: Gene_Is_Mutated_In_Cell_Tissue
variant to gene association: # -> Known.... and Somatic....
represented_as: edge
source: [known.sequence variant, somatic.sequence variant]
target: gene
input_label: [
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
]
gene to gene association:
represented_as: edge
input_label: gene_gene
properties:
directional: bool
curated: bool
score: float
id: str # should be removed
gene to variant association: # should be removed
is_a: gene to variant association
represented_as: edge
input_label: gene_variant


@@ -0,0 +1,3 @@
disconnected:
represented_as: node
label_in_input: disconnected


@@ -0,0 +1,152 @@
Title: BioCypher graph schema configuration file
# ---
# "Named Things"
# ---
protein:
represented_as: node
preferred_id: uniprot
input_label: protein
db_collection_name: proteins
properties:
name: str
score: float
taxon: int
genes: str[]
microRNA:
represented_as: node
preferred_id: mirbase.mature
input_label: mirna
complex:
synonym_for: macromolecular complex
represented_as: node
preferred_id: complexportal
input_label: complex
pathway:
represented_as: node
preferred_id: [reactome, wikipathways]
input_label: [reactome, wikipathways]
gene:
represented_as: node
preferred_id: hgnc
input_label: [hgnc, ensg]
exclude_properties: accession
disease:
represented_as: node
preferred_id: doid
input_label: Disease
side effect:
is_a: phenotypic feature
represented_as: node
preferred_id: sider.effect
input_label: sider
sequence variant:
represented_as: node
preferred_id: [clinically relevant, known, somatic]
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
properties:
source: str
original_source: str
effect: str
biotype: str
altered gene product level:
represented_as: node
input_label: agpl
decreased gene product level:
represented_as: node
input_label: agpl_decreased
lethal variant:
represented_as: node
input_label: lethal
snRNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: [intact, rnacentral]
input_label: [intact_snrna, rnacentral_snrna]
properties:
ac: str
fullName: str
shortName: str
preferredName: str
exclude_properties: sequence
DNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: ensembl
input_label: dna
properties:
ac: str
fullName: str
shortName: str
preferredName: str
sequence: str
dsDNA sequence:
is_a: [DNA sequence, nucleic acid entity]
inherit_properties: True
represented_as: node
preferred_id: [intact, uniparc]
input_label: [intact_dsdna, uniprot_archive_dsdna]
# ---
# Associations
# ---
post translational interaction:
is_a: pairwise molecular interaction
represented_as: node
label_as_edge: INTERACTS_POST_TRANSLATIONAL
input_label: post_translational
phosphorylation:
is_a: post translational interaction
represented_as: edge
use_id: false
input_label: phosphorylation
gene to disease association:
represented_as: edge
label_as_edge: PERTURBED_IN_DISEASE
input_label: [protein_disease, gene_disease]
exclude_properties: accession
mutation to tissue association:
is_a: [genotype to tissue association, entity to tissue association, association]
represented_as: edge
label_as_edge: Is_Mutated_In
input_label: Gene_Is_Mutated_In_Cell_Tissue
variant to gene association: # -> Known.... and Somatic....
represented_as: edge
source: [known.sequence variant, somatic.sequence variant]
target: gene
input_label: [
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
]
gene to gene association:
represented_as: edge
input_label: gene_gene
properties:
directional: bool
curated: bool
score: float
gene to variant association:
is_a: gene to variant association
represented_as: edge
input_label: gene_variant

734
biocypher/_core.py Normal file

@@ -0,0 +1,734 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher core module. Interfaces with the user and distributes tasks to
submodules.
"""
from typing import Optional
from datetime import datetime
import os
import json
from more_itertools import peekable
import yaml
import pandas as pd
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from ._get import Downloader
from ._config import config as _config
from ._config import update_from_file as _file_update
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from ._mapping import OntologyMapping
from ._ontology import Ontology
from ._translate import Translator
from ._deduplicate import Deduplicator
from .output.in_memory._pandas import Pandas
from .output.write._get_writer import DBMS_TO_CLASS, get_writer
from .output.connect._neo4j_driver import get_driver
__all__ = ["BioCypher"]
SUPPORTED_DBMS = DBMS_TO_CLASS.keys()
REQUIRED_CONFIG = [
"dbms",
"offline",
"strict_mode",
"head_ontology",
]
class BioCypher:
"""
Orchestration of BioCypher operations. Instantiate this class to interact
with BioCypher.
Args:
dbms (str): The database management system to use. For supported
systems see SUPPORTED_DBMS.
offline (bool): Whether to run in offline mode. If True, no
connection to the database will be made.
strict_mode (bool): Whether to run in strict mode. If True, the
translator will raise an error if a node or edge does not
provide source, version, and licence information.
biocypher_config_path (str): Path to the BioCypher config file.
schema_config_path (str): Path to the user schema config
file.
head_ontology (dict): The head ontology defined by URL ('url') and root
node ('root_node').
tail_ontologies (dict): The tail ontologies defined by URL and
join nodes for both head and tail ontology.
output_directory (str): Path to the output directory. If not
provided, the default value 'biocypher-out' will be used.
"""
def __init__(
self,
dbms: str = None,
offline: bool = None,
strict_mode: bool = None,
biocypher_config_path: str = None,
schema_config_path: str = None,
head_ontology: dict = None,
tail_ontologies: dict = None,
output_directory: str = None,
cache_directory: str = None,
# legacy params
db_name: str = None,
):
# Update configuration if custom path is provided
if biocypher_config_path:
_file_update(biocypher_config_path)
if db_name:
logger.warning(
"The parameter `db_name` is deprecated. Please set the "
"`database_name` setting in the `biocypher_config.yaml` file "
"instead."
)
_config(**{db_name: {"database_name": db_name}})
# Load configuration
self.base_config = _config("biocypher")
# Check for required configuration
for key in REQUIRED_CONFIG:
if key not in self.base_config:
raise ValueError(f"Configuration key {key} is required.")
# Set configuration - mandatory
self._dbms = dbms or self.base_config["dbms"]
if offline is None:
self._offline = self.base_config["offline"]
else:
self._offline = offline
if strict_mode is None:
self._strict_mode = self.base_config["strict_mode"]
else:
self._strict_mode = strict_mode
self._schema_config_path = schema_config_path or self.base_config.get(
"schema_config_path"
)
if not self._schema_config_path:
logger.warning("Running BioCypher without schema configuration.")
else:
logger.info(
f"Running BioCypher with schema configuration from {self._schema_config_path}."
)
self._head_ontology = head_ontology or self.base_config["head_ontology"]
# Set configuration - optional
self._output_directory = output_directory or self.base_config.get(
"output_directory"
)
self._cache_directory = cache_directory or self.base_config.get(
"cache_directory"
)
self._tail_ontologies = tail_ontologies or self.base_config.get(
"tail_ontologies"
)
if self._dbms not in SUPPORTED_DBMS:
raise ValueError(
f"DBMS {self._dbms} not supported. "
f"Please select from {SUPPORTED_DBMS}."
)
# Initialize
self._ontology_mapping = None
self._deduplicator = None
self._translator = None
self._downloader = None
self._ontology = None
self._writer = None
self._pd = None
def _get_deduplicator(self) -> Deduplicator:
"""
Create the deduplicator if it does not exist yet and return it.
"""
if not self._deduplicator:
self._deduplicator = Deduplicator()
return self._deduplicator
def _get_ontology_mapping(self) -> OntologyMapping:
"""
Create the ontology mapping if it does not exist yet and return it.
"""
if not self._schema_config_path:
self._ontology_mapping = OntologyMapping()
if not self._ontology_mapping:
self._ontology_mapping = OntologyMapping(
config_file=self._schema_config_path,
)
return self._ontology_mapping
def _get_ontology(self) -> Ontology:
"""
Create the ontology if it does not exist yet and return it.
"""
if not self._ontology:
self._ontology = Ontology(
ontology_mapping=self._get_ontology_mapping(),
head_ontology=self._head_ontology,
tail_ontologies=self._tail_ontologies,
)
return self._ontology
def _get_translator(self) -> Translator:
"""
Create the translator if it does not exist yet and return it.
"""
if not self._translator:
self._translator = Translator(
ontology=self._get_ontology(),
strict_mode=self._strict_mode,
)
return self._translator
def _get_writer(self):
"""
Create the writer if running in offline mode and set it as instance variable `self._writer`.
"""
if self._offline:
timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
outdir = self._output_directory or os.path.join(
"biocypher-out", timestamp()
)
self._output_directory = os.path.abspath(outdir)
self._writer = get_writer(
dbms=self._dbms,
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
output_directory=self._output_directory,
strict_mode=self._strict_mode,
)
else:
raise NotImplementedError("Cannot get writer in online mode.")
def _get_driver(self):
"""
Create the driver if it does not exist yet and set it as instance variable `self._driver`.
"""
if not self._offline:
self._driver = get_driver(
dbms=self._dbms,
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
)
else:
raise NotImplementedError("Cannot get driver in offline mode.")
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Write nodes to database. Either takes an iterable of tuples (if given,
translates to ``BioCypherNode`` objects) or an iterable of
``BioCypherNode`` objects.
Args:
nodes (iterable): An iterable of nodes to write to the database.
batch_size (int): The batch size to use when writing to disk.
force (bool): Whether to force writing to the output directory even
if the node type is not present in the schema config file.
Returns:
bool: True if successful.
"""
if not self._writer:
self._get_writer()
nodes = peekable(nodes)
if not isinstance(nodes.peek(), BioCypherNode):
tnodes = self._translator.translate_nodes(nodes)
else:
tnodes = nodes
# write node files
return self._writer.write_nodes(
tnodes, batch_size=batch_size, force=force
)
def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
"""
Write edges to database. Either takes an iterable of tuples (if given,
translates to ``BioCypherEdge`` objects) or an iterable of
``BioCypherEdge`` objects.
Args:
edges (iterable): An iterable of edges to write to the database.
Returns:
bool: True if successful.
"""
if not self._writer:
self._get_writer()
edges = peekable(edges)
if not isinstance(edges.peek(), BioCypherEdge):
tedges = self._translator.translate_edges(edges)
else:
tedges = edges
# write edge files
return self._writer.write_edges(tedges, batch_size=batch_size)
def to_df(self) -> list[pd.DataFrame]:
"""
Convert the entities previously added via ``add()`` to one pandas
DataFrame per entity type and return them as a list.
Returns:
list[pd.DataFrame]: One DataFrame per entity type.
"""
if not self._pd:
raise ValueError(
"No pandas instance found. Please call `add()` first."
)
return self._pd.dfs
def add(self, entities) -> None:
"""
Function to add entities to the in-memory database. Accepts an iterable
of tuples (if given, translates to ``BioCypherNode`` or
``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
``BioCypherEdge`` objects.
Args:
entities (iterable): An iterable of entities to add to the database.
Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
4-tuples for edges (deprecated).
Returns:
None
"""
if not self._pd:
self._pd = Pandas(
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
)
entities = peekable(entities)
if (
isinstance(entities.peek(), BioCypherNode)
or isinstance(entities.peek(), BioCypherEdge)
or isinstance(entities.peek(), BioCypherRelAsNode)
):
tentities = entities
elif len(entities.peek()) < 4:
tentities = self._translator.translate_nodes(entities)
else:
tentities = self._translator.translate_edges(entities)
self._pd.add_tables(tentities)
def add_nodes(self, nodes) -> None:
"""
Wrapper for ``add()`` to add nodes to the in-memory database.
Args:
nodes (iterable): An iterable of node tuples to add to the database.
Returns:
None
"""
self.add(nodes)
def add_edges(self, edges) -> None:
"""
Wrapper for ``add()`` to add edges to the in-memory database.
Args:
edges (iterable): An iterable of edge tuples to add to the database.
Returns:
None
"""
self.add(edges)
def merge_nodes(self, nodes) -> bool:
"""
Merge nodes into database. Either takes an iterable of tuples (if given,
translates to ``BioCypherNode`` objects) or an iterable of
``BioCypherNode`` objects.
Args:
nodes (iterable): An iterable of nodes to merge into the database.
Returns:
bool: True if successful.
"""
if not self._driver:
self._get_driver()
nodes = peekable(nodes)
if not isinstance(nodes.peek(), BioCypherNode):
tnodes = self._translator.translate_nodes(nodes)
else:
tnodes = nodes
# write node files
return self._driver.add_biocypher_nodes(tnodes)
def merge_edges(self, edges) -> bool:
"""
Merge edges into database. Either takes an iterable of tuples (if given,
translates to ``BioCypherEdge`` objects) or an iterable of
``BioCypherEdge`` objects.
Args:
edges (iterable): An iterable of edges to merge into the database.
Returns:
bool: True if successful.
"""
if not self._driver:
self._get_driver()
edges = peekable(edges)
if not isinstance(edges.peek(), BioCypherEdge):
tedges = self._translator.translate_edges(edges)
else:
tedges = edges
# write edge files
return self._driver.add_biocypher_edges(tedges)
# DOWNLOAD AND CACHE MANAGEMENT METHODS ###
def _get_downloader(self, cache_dir: Optional[str] = None):
"""
Create the downloader if it does not exist yet.
"""
if not self._downloader:
self._downloader = Downloader(self._cache_directory)
def download(self, *resources) -> None:
"""
Use the :class:`Downloader` class to download or load from cache the
resources given by the adapter.
"""
self._get_downloader()
return self._downloader.download(*resources)
# OVERVIEW AND CONVENIENCE METHODS ###
def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
"""
Get the set of input labels encountered without an entry in the
`schema_config.yaml` and print them to the logger.
Returns:
Optional[Dict[str, List[str]]]: A dictionary of Biolink types
encountered without an entry in the `schema_config.yaml` file.
"""
mt = self._translator.get_missing_biolink_types()
if mt:
msg = (
"Input entities not accounted for due to them not being "
f"present in the schema configuration file {self._schema_config_path} "
"(this is not necessarily a problem, if you did not intend "
"to include them in the database; see the log for details): \n"
)
for k, v in mt.items():
msg += f" {k}: {v} \n"
logger.info(msg)
return mt
else:
logger.info("No missing labels in input.")
return None
def log_duplicates(self) -> None:
"""
Get the set of duplicate nodes and edges encountered and print them to
the logger.
"""
dn = self._deduplicator.get_duplicate_nodes()
if dn:
ntypes = dn[0]
nids = dn[1]
msg = "Duplicate node types encountered (IDs in log): \n"
for typ in ntypes:
msg += f" {typ}\n"
logger.info(msg)
idmsg = "Duplicate node IDs encountered: \n"
for _id in nids:
idmsg += f" {_id}\n"
logger.debug(idmsg)
else:
logger.info("No duplicate nodes in input.")
de = self._deduplicator.get_duplicate_edges()
if de:
etypes = de[0]
eids = de[1]
msg = "Duplicate edge types encountered (IDs in log): \n"
for typ in etypes:
msg += f" {typ}\n"
logger.info(msg)
idmsg = "Duplicate edge IDs encountered: \n"
for _id in eids:
idmsg += f" {_id}\n"
logger.debug(idmsg)
else:
logger.info("No duplicate edges in input.")
def show_ontology_structure(self, **kwargs) -> None:
"""
Show the ontology structure using treelib or write to GRAPHML file.
Args:
to_disk (str): If specified, the ontology structure will be saved
to disk as a GRAPHML file, to be opened in your favourite
graph visualisation tool.
full (bool): If True, the full ontology structure will be shown,
including all nodes and edges. If False, only the nodes and
edges that are relevant to the extended schema will be shown.
"""
if not self._ontology:
self._get_ontology()
return self._ontology.show_ontology_structure(**kwargs)
def write_import_call(self) -> str:
"""
Write a shell script to import the database depending on the chosen
DBMS.
Returns:
str: path toward the file holding the import call.
"""
if not self._offline:
raise NotImplementedError(
"Cannot write import call in online mode."
)
return self._writer.write_import_call()
def write_schema_info(self, as_node: bool = False) -> None:
"""
Write an extended schema info YAML file that extends the
`schema_config.yaml` with run-time information of the built KG. For
instance, it includes information on whether something is present in the actual
knowledge graph, whether it is a relationship (which is important in the
case of representing relationships as nodes), and the actual sources and
targets of edges. Since this file can be used in place of the original
`schema_config.yaml` file, it indicates that it is the extended schema
by setting `is_schema_info` to `true`.
We start by using the `extended_schema` dictionary from the ontology
class instance, which contains all expanded entities and relationships.
The information of whether something is a relationship can be gathered
from the deduplicator instance, which keeps track of all entities that
have been seen.
"""
if not self._offline:
raise NotImplementedError(
"Cannot write schema info in online mode."
)
ontology = self._get_ontology()
schema = ontology.mapping.extended_schema.copy()
schema["is_schema_info"] = True
deduplicator = self._get_deduplicator()
for node in deduplicator.entity_types:
if node in schema.keys():
schema[node]["present_in_knowledge_graph"] = True
schema[node]["is_relationship"] = False
else:
logger.info(
f"Node {node} not present in extended schema. "
"Skipping schema info."
)
# find 'label_as_edge' cases in schema entries
changed_labels = {}
for k, v in schema.items():
if not isinstance(v, dict):
continue
if "label_as_edge" in v.keys():
if v["label_as_edge"] in deduplicator.seen_relationships.keys():
changed_labels[v["label_as_edge"]] = k
for edge in deduplicator.seen_relationships.keys():
if edge in changed_labels.keys():
edge = changed_labels[edge]
if edge in schema.keys():
schema[edge]["present_in_knowledge_graph"] = True
schema[edge]["is_relationship"] = True
# TODO information about source and target nodes
else:
logger.info(
f"Edge {edge} not present in extended schema. "
"Skipping schema info."
)
# write to output directory as YAML file
path = os.path.join(self._output_directory, "schema_info.yaml")
with open(path, "w") as f:
f.write(yaml.dump(schema))
if as_node:
# write as node
node = BioCypherNode(
node_id="schema_info",
node_label="schema_info",
properties={"schema_info": json.dumps(schema)},
)
self.write_nodes([node], force=True)
# override import call with added schema info node
self.write_import_call()
return schema
# TRANSLATION METHODS ###
def translate_term(self, term: str) -> str:
"""
Translate a term to its BioCypher equivalent.
Args:
term (str): The term to translate.
Returns:
str: The BioCypher equivalent of the term.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.translate_term(term)
def summary(self) -> None:
"""
Wrapper for showing ontology structure and logging duplicates and
missing input types.
"""
self.show_ontology_structure()
self.log_duplicates()
self.log_missing_input_labels()
def reverse_translate_term(self, term: str) -> str:
"""
Reverse translate a term from its BioCypher equivalent.
Args:
term (str): The BioCypher term to reverse translate.
Returns:
str: The original term.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.reverse_translate_term(term)
def translate_query(self, query: str) -> str:
"""
Translate a query to its BioCypher equivalent.
Args:
query (str): The query to translate.
Returns:
str: The BioCypher equivalent of the query.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.translate(query)
def reverse_translate_query(self, query: str) -> str:
"""
Reverse translate a query from its BioCypher equivalent.
Args:
query (str): The BioCypher query to reverse translate.
Returns:
str: The original query.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.reverse_translate(query)

356
biocypher/_create.py Normal file

@@ -0,0 +1,356 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'create' module. Handles the creation of BioCypher node and edge
dataclasses.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Union
from dataclasses import field, dataclass
import os
__all__ = [
"BioCypherEdge",
"BioCypherNode",
"BioCypherRelAsNode",
]
@dataclass(frozen=True)
class BioCypherNode:
"""
Handoff class to represent biomedical entities as Neo4j nodes.
Has id, label, property dict; id and label (in the Neo4j sense of a
label, ie, the entity descriptor after the colon, such as
":Protein") are non-optional and called node_id and node_label to
avoid confusion with "label" properties. Node labels are written in
PascalCase and as nouns, as per Neo4j consensus.
Args:
node_id (string): consensus "best" id for biological entity
node_label (string): primary type of entity, capitalised
**properties (kwargs): collection of all other properties to be
passed to neo4j for the respective node (dict)
Todo:
- check and correct small inconsistencies such as capitalisation
of ID names ("uniprot" vs "UniProt")
- check for correct ID patterns (eg "ENSG" + string of numbers,
uniprot length)
- ID conversion using pypath translation facilities for now
"""
node_id: str
node_label: str
preferred_id: str = "id"
properties: dict = field(default_factory=dict)
def __post_init__(self):
"""
Add id field to properties.
Check for reserved keywords.
Replace unwanted characters in properties.
"""
self.properties["id"] = self.node_id
self.properties["preferred_id"] = self.preferred_id or None
# TODO actually make None possible here; as is, "id" is the default in
# the dataclass as well as in the configuration file
if ":TYPE" in self.properties.keys():
logger.warning(
"Keyword ':TYPE' is reserved for Neo4j. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties[":TYPE"]
for k, v in self.properties.items():
if isinstance(v, str):
self.properties[k] = (
v.replace(
os.linesep,
" ",
)
.replace(
"\n",
" ",
)
.replace(
"\r",
" ",
)
)
elif isinstance(v, list):
# modified from upstream BioCypher, because the data contained integers in lists
self.properties[k] = [
(str(val) if isinstance(val, (int, float)) else val)
.replace(os.linesep, " ")
.replace("\n", " ")
.replace("\r", " ")
for val in v
]
def get_id(self) -> str:
"""
Returns primary node identifier.
Returns:
str: node_id
"""
return self.node_id
def get_label(self) -> str:
"""
Returns primary node label.
Returns:
str: node_label
"""
return self.node_label
def get_type(self) -> str:
"""
Returns primary node label.
Returns:
str: node_label
"""
return self.node_label
def get_preferred_id(self) -> str:
"""
Returns preferred id.
Returns:
str: preferred_id
"""
return self.preferred_id
def get_properties(self) -> dict:
"""
Returns all other node properties apart from primary id and
label as key-value pairs.
Returns:
dict: properties
"""
return self.properties
def get_dict(self) -> dict:
"""
Return dict of id, labels, and properties.
Returns:
dict: node_id and node_label as top-level key-value pairs,
properties as second-level dict.
"""
return {
"node_id": self.node_id,
"node_label": self.node_label,
"properties": self.properties,
}
@dataclass(frozen=True)
class BioCypherEdge:
"""
Handoff class to represent biomedical relationships in Neo4j.
Has source and target ids, label, property dict; ids and label (in
the Neo4j sense of a label, ie, the entity descriptor after the
colon, such as ":TARGETS") are non-optional and called source_id,
target_id, and relationship_label to avoid confusion with properties
called "label", which usually denotes the human-readable form.
Relationship labels are written in UPPERCASE and as verbs, as per
Neo4j consensus.
Args:
source_id (string): consensus "best" id for biological entity
target_id (string): consensus "best" id for biological entity
relationship_label (string): type of interaction, UPPERCASE
properties (dict): collection of all other properties of the
respective edge
"""
source_id: str
target_id: str
relationship_label: str
relationship_id: str = None
properties: dict = field(default_factory=dict)
def __post_init__(self):
"""
Check for reserved keywords.
"""
if ":TYPE" in self.properties.keys():
logger.debug(
"Keyword ':TYPE' is reserved for Neo4j. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties[":TYPE"]
elif "id" in self.properties.keys():
logger.debug(
"Keyword 'id' is reserved for Neo4j. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties["id"]
elif "_ID" in self.properties.keys():
logger.debug(
"Keyword '_ID' is reserved for Postgres. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties["_ID"]
def get_id(self) -> Union[str, None]:
"""
Returns primary node identifier or None.
Returns:
str: node_id
"""
return self.relationship_id
def get_source_id(self) -> str:
"""
Returns primary node identifier of relationship source.
Returns:
str: source_id
"""
return self.source_id
def get_target_id(self) -> str:
"""
Returns primary node identifier of relationship target.
Returns:
str: target_id
"""
return self.target_id
def get_label(self) -> str:
"""
Returns relationship label.
Returns:
str: relationship_label
"""
return self.relationship_label
def get_type(self) -> str:
"""
Returns relationship label.
Returns:
str: relationship_label
"""
return self.relationship_label
def get_properties(self) -> dict:
"""
Returns all other relationship properties apart from primary ids
and label as key-value pairs.
Returns:
dict: properties
"""
return self.properties
def get_dict(self) -> dict:
"""
Return dict of ids, label, and properties.
Returns:
dict: source_id, target_id and relationship_label as
top-level key-value pairs, properties as second-level
dict.
"""
return {
"relationship_id": self.relationship_id or None,
"source_id": self.source_id,
"target_id": self.target_id,
"relationship_label": self.relationship_label,
"properties": self.properties,
}
@dataclass(frozen=True)
class BioCypherRelAsNode:
"""
Class to represent relationships as nodes (with in- and outgoing
edges) as a triplet of a BioCypherNode and two BioCypherEdges. Main
usage in type checking (instances where the receiving function needs
to check whether it receives a relationship as a single edge or as
a triplet).
Args:
node (BioCypherNode): node representing the relationship
source_edge (BioCypherEdge): edge representing the source of the
relationship
target_edge (BioCypherEdge): edge representing the target of the
relationship
"""
node: BioCypherNode
source_edge: BioCypherEdge
target_edge: BioCypherEdge
def __post_init__(self):
if not isinstance(self.node, BioCypherNode):
raise TypeError(
f"BioCypherRelAsNode.node must be a BioCypherNode, "
f"not {type(self.node)}.",
)
if not isinstance(self.source_edge, BioCypherEdge):
raise TypeError(
f"BioCypherRelAsNode.source_edge must be a BioCypherEdge, "
f"not {type(self.source_edge)}.",
)
if not isinstance(self.target_edge, BioCypherEdge):
raise TypeError(
f"BioCypherRelAsNode.target_edge must be a BioCypherEdge, "
f"not {type(self.target_edge)}.",
)
def get_node(self) -> BioCypherNode:
return self.node
def get_source_edge(self) -> BioCypherEdge:
return self.source_edge
def get_target_edge(self) -> BioCypherEdge:
return self.target_edge

147
biocypher/_deduplicate.py Normal file

@@ -0,0 +1,147 @@
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
class Deduplicator:
"""
Singleton class responsible for deduplicating BioCypher inputs. Maintains
sets/dictionaries of node and edge types and their unique identifiers.
Node identifiers should be globally unique (represented as a set), while
edge identifiers are only unique per edge type (represented as a dict of
sets, keyed by edge type).
Stores collection of duplicate node and edge identifiers and types for
troubleshooting and to avoid overloading the log.
"""
def __init__(self):
self.seen_entity_ids = set()
self.duplicate_entity_ids = set()
self.entity_types = set()
self.duplicate_entity_types = set()
self.seen_relationships = {}
self.duplicate_relationship_ids = set()
self.duplicate_relationship_types = set()
def node_seen(self, entity: BioCypherNode) -> bool:
"""
Adds a node to the instance and checks if it has been seen before.
Args:
entity: The BioCypherNode to be added.
Returns:
True if the node has been seen before, False otherwise.
"""
if entity.get_label() not in self.entity_types:
self.entity_types.add(entity.get_label())
if entity.get_id() in self.seen_entity_ids:
self.duplicate_entity_ids.add(entity.get_id())
if entity.get_label() not in self.duplicate_entity_types:
logger.warning(
f"Duplicate node type {entity.get_label()} found. "
)
self.duplicate_entity_types.add(entity.get_label())
return True
self.seen_entity_ids.add(entity.get_id())
return False
def edge_seen(self, relationship: BioCypherEdge) -> bool:
"""
Adds an edge to the instance and checks if it has been seen before.
Args:
relationship: The BioCypherEdge to be added.
Returns:
True if the edge has been seen before, False otherwise.
"""
if relationship.get_type() not in self.seen_relationships:
self.seen_relationships[relationship.get_type()] = set()
# concatenate source and target if no id is present
if not relationship.get_id():
_id = (
f"{relationship.get_source_id()}_{relationship.get_target_id()}"
)
else:
_id = relationship.get_id()
if _id in self.seen_relationships[relationship.get_type()]:
self.duplicate_relationship_ids.add(_id)
if relationship.get_type() not in self.duplicate_relationship_types:
logger.warning(
f"Duplicate edge type {relationship.get_type()} found. "
)
self.duplicate_relationship_types.add(relationship.get_type())
return True
self.seen_relationships[relationship.get_type()].add(_id)
return False
def rel_as_node_seen(self, rel_as_node: BioCypherRelAsNode) -> bool:
"""
Adds a rel_as_node to the instance (one entity and two relationships)
and checks if it has been seen before. Only the node is relevant for
identifying the rel_as_node as a duplicate.
Args:
rel_as_node: BioCypherRelAsNode to be added.
Returns:
True if the rel_as_node has been seen before, False otherwise.
"""
node = rel_as_node.get_node()
if node.get_label() not in self.seen_relationships:
self.seen_relationships[node.get_label()] = set()
# rel as node always has an id
_id = node.get_id()
if _id in self.seen_relationships[node.get_type()]:
self.duplicate_relationship_ids.add(_id)
if node.get_type() not in self.duplicate_relationship_types:
logger.warning(f"Duplicate edge type {node.get_type()} found. ")
self.duplicate_relationship_types.add(node.get_type())
return True
self.seen_relationships[node.get_type()].add(_id)
return False
def get_duplicate_nodes(self):
"""
Return the types and IDs of duplicate nodes encountered.
Returns:
tuple: (duplicate node types, duplicate node IDs), or None if no duplicates were found.
"""
if self.duplicate_entity_types:
return (self.duplicate_entity_types, self.duplicate_entity_ids)
else:
return None
def get_duplicate_edges(self):
"""
Return the types and IDs of duplicate edges encountered.
Returns:
tuple: (duplicate edge types, duplicate edge IDs), or None if no duplicates were found.
"""
if self.duplicate_relationship_types:
return (
self.duplicate_relationship_types,
self.duplicate_relationship_ids,
)
else:
return None

443
biocypher/_get.py Normal file

@@ -0,0 +1,443 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher get module. Used to download and cache data from external sources.
"""
from __future__ import annotations
from typing import Optional
import shutil
import requests
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from abc import ABC
from datetime import datetime, timedelta
from tempfile import TemporaryDirectory
import os
import json
import ftplib
import pooch
from ._misc import to_list, is_nested
class Resource(ABC):
def __init__(
self,
name: str,
url_s: str | list[str],
lifetime: int = 0,
):
"""
A Resource is a file, a list of files, an API request, or a list of API
requests, any of which can be downloaded from the given URL(s) and
cached locally. This class implements checks of the minimum requirements
for a resource, to be implemented by a biocypher adapter.
Args:
name (str): The name of the resource.
url_s (str | list[str]): The URL or URLs of the resource.
lifetime (int): The lifetime of the resource in days. If 0, the
resource is considered to be permanent.
"""
self.name = name
self.url_s = url_s
self.lifetime = lifetime
class FileDownload(Resource):
def __init__(
self,
name: str,
url_s: str | list[str],
lifetime: int = 0,
is_dir: bool = False,
):
"""
Represents basic information for a File Download.
Args:
name(str): The name of the File Download.
url_s(str|list[str]): The URL(s) of the File Download.
lifetime(int): The lifetime of the File Download in days. If 0, the
File Download is cached indefinitely.
is_dir (bool): Whether the URL points to a directory or not.
"""
super().__init__(name, url_s, lifetime)
self.is_dir = is_dir
class APIRequest(Resource):
def __init__(self, name: str, url_s: str | list[str], lifetime: int = 0):
"""
Represents basic information for an API Request.
Args:
name(str): The name of the API Request.
url_s(str|list): The URL of the API endpoint.
lifetime(int): The lifetime of the API Request in days. If 0, the
API Request is cached indefinitely.
"""
super().__init__(name, url_s, lifetime)
class Downloader:
def __init__(self, cache_dir: Optional[str] = None) -> None:
"""
The Downloader is a class that manages resources that can be downloaded
and cached locally. It manages the lifetime of downloaded resources by
keeping a JSON record of the download date of each resource.
Args:
cache_dir (str): The directory where the resources are cached. If
not given, a temporary directory is created.
"""
self.cache_dir = cache_dir or TemporaryDirectory().name
self.cache_file = os.path.join(self.cache_dir, "cache.json")
self.cache_dict = self._load_cache_dict()
def download(self, *resources: Resource):
"""
Download one or multiple resources. Load from cache if the resource is
already downloaded and the cache is not expired.
Args:
resources (Resource): The resource(s) to download or load from
cache.
Returns:
list[str]: The path or paths to the resource(s) that were downloaded
or loaded from cache.
"""
paths = []
for resource in resources:
paths.append(self._download_or_cache(resource))
# flatten list if it is nested
if is_nested(paths):
paths = [path for sublist in paths for path in sublist]
return paths
def _download_or_cache(self, resource: Resource, cache: bool = True):
"""
Download a resource if it is not cached or its cache has exceeded its lifetime.
Args:
resource (Resource): The resource to download.
Returns:
list[str]: The path or paths to the downloaded resource(s).
"""
expired = self._is_cache_expired(resource)
if expired or not cache:
self._delete_expired_cache(resource)
if isinstance(resource, FileDownload):
logger.info(f"Asking for download of resource {resource.name}.")
paths = self._download_files(cache, resource)
elif isinstance(resource, APIRequest):
logger.info(
f"Asking for download of api request {resource.name}."
)
paths = self._download_api_request(resource)
else:
raise TypeError(f"Unknown resource type: {type(resource)}")
else:
paths = self.get_cached_version(resource)
self._update_cache_record(resource)
return paths
def _is_cache_expired(self, resource: Resource) -> bool:
"""
Check if resource or API request cache is expired.
Args:
resource (Resource): The resource or API request to download.
Returns:
bool: True if cache is expired, False if not.
"""
cache_record = self._get_cache_record(resource)
if cache_record:
download_time = datetime.strptime(
cache_record.get("date_downloaded"), "%Y-%m-%d %H:%M:%S.%f"
)
lifetime = timedelta(days=resource.lifetime)
expired = download_time + lifetime < datetime.now()
else:
expired = True
return expired
def _delete_expired_cache(self, resource: Resource):
cache_resource_path = self.cache_dir + "/" + resource.name
if os.path.exists(cache_resource_path) and os.path.isdir(
cache_resource_path
):
shutil.rmtree(cache_resource_path)
def _download_files(self, cache, file_download: FileDownload):
"""
Download a resource, which may be a single file, a list of files, or a
directory, and return the resulting path(s).
Args:
cache (bool): Whether to cache the resource or not.
file_download (FileDownload): The resource to download.
Returns:
list[str]: The path or paths to the downloaded resource(s).
"""
if file_download.is_dir:
files = self._get_files(file_download)
file_download.url_s = [
file_download.url_s + "/" + file for file in files
]
file_download.is_dir = False
paths = self._download_or_cache(file_download, cache)
elif isinstance(file_download.url_s, list):
paths = []
for url in file_download.url_s:
fname = url[url.rfind("/") + 1 :].split("?")[0]
paths.append(
self._retrieve(
url=url,
fname=fname,
path=os.path.join(self.cache_dir, file_download.name),
)
)
else:
paths = []
fname = file_download.url_s[
file_download.url_s.rfind("/") + 1 :
].split("?")[0]
results = self._retrieve(
url=file_download.url_s,
fname=fname,
path=os.path.join(self.cache_dir, file_download.name),
)
if isinstance(results, list):
paths.extend(results)
else:
paths.append(results)
# sometimes a compressed file contains multiple files
# TODO ask for a list of files in the archive to be used from the
# adapter
return paths
def _download_api_request(self, api_request: APIRequest):
"""
Download an API request and return the path.
Args:
api_request (APIRequest): The API request to perform and cache.
Returns:
list[str]: The path to the cached API request.
"""
urls = (
api_request.url_s
if isinstance(api_request.url_s, list)
else [api_request.url_s]
)
paths = []
for url in urls:
fname = url[url.rfind("/") + 1 :].rsplit(".", 1)[0]
logger.info(
f"Asking for caching API of {api_request.name} {fname}."
)
response = requests.get(url=url)
if response.status_code != 200:
response.raise_for_status()
response_data = response.json()
api_path = os.path.join(
self.cache_dir, api_request.name, f"{fname}.json"
)
os.makedirs(os.path.dirname(api_path), exist_ok=True)
with open(api_path, "w") as f:
json.dump(response_data, f)
logger.info(f"Caching API request to {api_path}.")
paths.append(api_path)
return paths
def get_cached_version(self, resource: Resource) -> list[str]:
"""Get the cached version of a resource.
Args:
resource(Resource): The resource to get the cached version of.
Returns:
list[str]: The paths to the cached resource(s).
"""
cached_location = os.path.join(self.cache_dir, resource.name)
logger.info(f"Use cached version from {cached_location}.")
paths = []
for file in os.listdir(cached_location):
paths.append(os.path.join(cached_location, file))
return paths
def _retrieve(
self,
url: str,
fname: str,
path: str,
known_hash: str = None,
):
"""
Retrieve a file from a URL using Pooch. Infer type of file from
extension and use appropriate processor.
Args:
url (str): The URL to retrieve the file from.
fname (str): The name of the file.
path (str): The path to store the file under.
known_hash (str): Known hash of the file to verify the download
against; if None, no verification is performed.
"""
if fname.endswith(".zip"):
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
processor=pooch.Unzip(),
progressbar=True,
)
elif fname.endswith(".tar.gz"):
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
processor=pooch.Untar(),
progressbar=True,
)
elif fname.endswith(".gz"):
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
processor=pooch.Decompress(),
progressbar=True,
)
else:
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
progressbar=True,
)
def _get_files(self, file_download: FileDownload):
"""
Get the files contained in a remote directory.
Args:
file_download (FileDownload): The FileDownload whose URL points to a
directory.
Returns:
list: The files contained in the directory.
"""
if file_download.url_s.startswith("ftp://"):
# remove protocol
url = file_download.url_s[6:]
# get base url
url = url[: url.find("/")]
# get directory (remove initial slash as well)
dir = file_download.url_s[7 + len(url) :]
# get files
ftp = ftplib.FTP(url)
ftp.login()
ftp.cwd(dir)
files = ftp.nlst()
ftp.quit()
else:
raise NotImplementedError(
"Only FTP directories are supported at the moment."
)
return files
def _load_cache_dict(self):
"""
Load the cache dictionary from the cache file. Create an empty cache
file if it does not exist.
"""
if not os.path.exists(self.cache_dir):
logger.info(f"Creating cache directory {self.cache_dir}.")
os.makedirs(self.cache_dir)
if not os.path.exists(self.cache_file):
logger.info(f"Creating cache file {self.cache_file}.")
with open(self.cache_file, "w") as f:
json.dump({}, f)
with open(self.cache_file, "r") as f:
logger.info(f"Loading cache file {self.cache_file}.")
return json.load(f)
def _get_cache_record(self, resource: Resource):
"""
Get the cache record of a resource.
Args:
resource (Resource): The resource to get the cache record of.
Returns:
The cache record of the resource.
"""
return self.cache_dict.get(resource.name, {})
def _update_cache_record(self, resource: Resource):
"""
Update the cache record of a resource.
Args:
resource (Resource): The resource to update the cache record of.
"""
cache_record = {}
cache_record["url"] = to_list(resource.url_s)
cache_record["date_downloaded"] = str(datetime.now())
cache_record["lifetime"] = resource.lifetime
self.cache_dict[resource.name] = cache_record
with open(self.cache_file, "w") as f:
json.dump(self.cache_dict, f, default=str)
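A minimal usage sketch of the resource and downloader classes above. The import path and the URLs are assumptions for illustration (placeholder endpoints), not values taken from this commit; network access is needed when the cache is empty.

# Sketch only; import path and URLs are assumed placeholders.
from biocypher._get import APIRequest, Downloader, FileDownload

downloader = Downloader(cache_dir=".cache")  # omit cache_dir to use a temporary directory

# A compressed file that should be re-downloaded after 7 days.
proteins = FileDownload(
    name="example_proteins",
    url_s="https://example.org/data/proteins.tsv.gz",  # placeholder URL
    lifetime=7,
)

# A JSON API response cached indefinitely (lifetime=0).
items = APIRequest(
    name="example_api",
    url_s="https://example.org/api/items.json",  # placeholder URL
    lifetime=0,
)

paths = downloader.download(proteins, items)
print(paths)  # local paths to the decompressed file and the cached JSON

The download call records the download date in cache.json, so subsequent calls within the lifetime return the cached paths instead of hitting the network again.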

121
biocypher/_logger.py Normal file
View File

@ -0,0 +1,121 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Configuration of the module logger.
"""
__all__ = ["get_logger", "log", "logfile"]
from datetime import datetime
import os
import pydoc
import logging
from biocypher import _config
from biocypher._metadata import __version__
def get_logger(name: str = "biocypher") -> logging.Logger:
"""
Access the module logger, creating a new one if it does not exist yet.
Method providing central logger instance to main module. Is called
only from main submodule, :mod:`biocypher.driver`. In child modules,
the standard Python logging facility is called
(using ``logging.getLogger(__name__)``), automatically inheriting
the handlers from the central logger.
The file handler creates a log file named after the current date and
time. Levels to output to file and console can be set here.
Args:
name:
Name of the logger instance.
Returns:
An instance of the Python :py:mod:`logging.Logger`.
"""
if not logging.getLogger(name).hasHandlers():
# create logger
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)
logger.propagate = True
# formatting
file_formatter = logging.Formatter(
"%(asctime)s\t%(levelname)s\tmodule:%(module)s\n%(message)s",
)
stdout_formatter = logging.Formatter("%(levelname)s -- %(message)s")
# file name and creation
now = datetime.now()
date_time = now.strftime("%Y%m%d-%H%M%S")
log_to_disk = _config.config("biocypher").get("log_to_disk")
if log_to_disk:
logdir = (
_config.config("biocypher").get("log_directory")
or "biocypher-log"
)
os.makedirs(logdir, exist_ok=True)
logfile = os.path.join(logdir, f"biocypher-{date_time}.log")
# file handler
file_handler = logging.FileHandler(logfile)
if _config.config("biocypher").get("debug"):
file_handler.setLevel(logging.DEBUG)
else:
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)
# handlers
# stream handler
stdout_handler = logging.StreamHandler()
stdout_handler.setLevel(logging.INFO)
stdout_handler.setFormatter(stdout_formatter)
# add handlers
logger.addHandler(stdout_handler)
# startup message
logger.info(f"This is BioCypher v{__version__}.")
if log_to_disk:
logger.info(f"Logging into `{logfile}`.")
else:
logger.info("Logging into stdout.")
return logging.getLogger(name)
def logfile() -> str:
"""
Path to the log file.
"""
return get_logger().handlers[0].baseFilename
def log():
"""
Browse the log file.
"""
with open(logfile()) as fp:
pydoc.pager(fp.read())
logger = get_logger()
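As a usage note for the logger above: child modules obtain their own logger via the standard logging facility and inherit the central handlers through propagation, while application code can call get_logger directly. A short sketch (the child module name is illustrative):

# In a hypothetical child module inside the biocypher package (name assumed):
import logging

logger = logging.getLogger(__name__)  # propagates to the central "biocypher" logger's handlers
logger.debug("Loading module %s.", __name__)

# From application code:
from biocypher._logger import get_logger, logfile

bc_logger = get_logger("biocypher")
bc_logger.info("Pipeline started.")
print(logfile())  # path of the current log file; assumes log_to_disk is enabled in the config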

307
biocypher/_mapping.py Normal file
View File

@ -0,0 +1,307 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'mapping' module. Handles the mapping of user-defined schema to the
underlying ontology.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Optional
from urllib.request import urlopen
import yaml
from . import _misc
from ._config import config as _config
class OntologyMapping:
"""
Class to store the ontology mapping and extensions.
"""
def __init__(self, config_file: str = None):
self.schema = self._read_config(config_file)
self.extended_schema = self._extend_schema()
def _read_config(self, config_file: str = None):
"""
Read the configuration file and store the ontology mapping and extensions.
"""
if config_file is None:
schema_config = {}
# load yaml file from web
elif config_file.startswith("http"):
with urlopen(config_file) as f:
schema_config = yaml.safe_load(f)
# get graph state from config (assume file is local)
else:
with open(config_file, "r") as f:
schema_config = yaml.safe_load(f)
return schema_config
def _extend_schema(self, d: Optional[dict] = None) -> dict:
"""
Get leaves of the tree hierarchy from the data structure dict
contained in the `schema_config.yaml`. Creates virtual leaves
(as children) from entries that provide more than one preferred
id type (and corresponding inputs).
Args:
d:
Data structure dict from yaml file.
"""
d = d or self.schema
extended_schema = dict()
# first pass: get parent leaves with direct representation in ontology
for k, v in d.items():
# k is not an entity
if "represented_as" not in v:
continue
# preferred_id optional: if not provided, use `id`
if not v.get("preferred_id"):
v["preferred_id"] = "id"
# k is an entity that is present in the ontology
if "is_a" not in v:
extended_schema[k] = v
# second pass: "vertical" inheritance
d = self._vertical_property_inheritance(d)
for k, v in d.items():
if "is_a" in v:
# prevent loops
if k == v["is_a"]:
logger.warning(
f"Loop detected in ontology mapping: {k} -> {v}. "
"Removing item. Please fix the inheritance if you want "
"to use this item."
)
continue
extended_schema[k] = v
# "horizontal" inheritance: create siblings for multiple identifiers or
# sources -> virtual leaves or implicit children
mi_leaves = {}
ms_leaves = {}
for k, v in d.items():
# k is not an entity
if "represented_as" not in v:
continue
if isinstance(v.get("preferred_id"), list):
mi_leaves = self._horizontal_inheritance_pid(k, v)
extended_schema.update(mi_leaves)
elif isinstance(v.get("source"), list):
ms_leaves = self._horizontal_inheritance_source(k, v)
extended_schema.update(ms_leaves)
return extended_schema
def _vertical_property_inheritance(self, d):
"""
Inherit properties from parents to children and update `d` accordingly.
"""
for k, v in d.items():
# k is not an entity
if "represented_as" not in v:
continue
# k is an entity that is present in the ontology
if "is_a" not in v:
continue
# "vertical" inheritance: inherit properties from parent
if v.get("inherit_properties", False):
# get direct ancestor
if isinstance(v["is_a"], list):
parent = v["is_a"][0]
else:
parent = v["is_a"]
# ensure child has properties and exclude_properties
if "properties" not in v:
v["properties"] = {}
if "exclude_properties" not in v:
v["exclude_properties"] = {}
# update properties of child
parent_props = self.schema[parent].get("properties", {})
if parent_props:
v["properties"].update(parent_props)
parent_excl_props = self.schema[parent].get(
"exclude_properties", {}
)
if parent_excl_props:
v["exclude_properties"].update(parent_excl_props)
# update schema (d)
d[k] = v
return d
def _horizontal_inheritance_pid(self, key, value):
"""
Create virtual leaves for multiple preferred id types.
If virtual leaves are created, input_label/label_in_input always has to
be a list of the same length as the preferred ids.
"""
leaves = {}
preferred_id = value["preferred_id"]
input_label = value.get("input_label") or value["label_in_input"]
represented_as = value["represented_as"]
# adjust lengths
max_l = max(
[
len(_misc.to_list(preferred_id)),
len(_misc.to_list(input_label)),
len(_misc.to_list(represented_as)),
],
)
# adjust pid length if necessary
if isinstance(preferred_id, str):
pids = [preferred_id] * max_l
else:
pids = preferred_id
# adjust rep length if necessary
if isinstance(represented_as, str):
reps = [represented_as] * max_l
else:
reps = represented_as
for pid, lab, rep in zip(pids, input_label, reps):
skey = pid + "." + key
svalue = {
"preferred_id": pid,
"input_label": lab,
"represented_as": rep,
# mark as virtual
"virtual": True,
}
# inherit is_a if exists
if "is_a" in value.keys():
# treat as multiple inheritance
if isinstance(value["is_a"], list):
v = list(value["is_a"])
v.insert(0, key)
svalue["is_a"] = v
else:
svalue["is_a"] = [key, value["is_a"]]
else:
# set parent as is_a
svalue["is_a"] = key
# inherit everything except core attributes
for k, v in value.items():
if k not in [
"is_a",
"preferred_id",
"input_label",
"label_in_input",
"represented_as",
]:
svalue[k] = v
leaves[skey] = svalue
return leaves
def _horizontal_inheritance_source(self, key, value):
"""
Create virtual leaves for multiple sources.
If we create virtual leaves, input_label/label_in_input always has to be
a list.
"""
leaves = {}
source = value["source"]
input_label = value.get("input_label") or value["label_in_input"]
represented_as = value["represented_as"]
# adjust lengths
src_l = len(source)
# adjust label length if necessary
if isinstance(input_label, str):
labels = [input_label] * src_l
else:
labels = input_label
# adjust rep length if necessary
if isinstance(represented_as, str):
reps = [represented_as] * src_l
else:
reps = represented_as
for src, lab, rep in zip(source, labels, reps):
skey = src + "." + key
svalue = {
"source": src,
"input_label": lab,
"represented_as": rep,
# mark as virtual
"virtual": True,
}
# inherit is_a if exists
if "is_a" in value.keys():
# treat as multiple inheritance
if isinstance(value["is_a"], list):
v = list(value["is_a"])
v.insert(0, key)
svalue["is_a"] = v
else:
svalue["is_a"] = [key, value["is_a"]]
else:
# set parent as is_a
svalue["is_a"] = key
# inherit everything except core attributes
for k, v in value.items():
if k not in [
"is_a",
"source",
"input_label",
"label_in_input",
"represented_as",
]:
svalue[k] = v
leaves[skey] = svalue
return leaves
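To make the virtual-leaf expansion above concrete, the following sketch feeds a small schema with two preferred id types through OntologyMapping; the class name and id namespaces are illustrative, not part of this commit.

# Sketch: a list of preferred ids yields one virtual leaf per id type.
import tempfile

import yaml

from biocypher._mapping import OntologyMapping

schema = {
    "protein": {
        "represented_as": "node",
        "preferred_id": ["uniprot", "ensembl"],                  # illustrative id types
        "input_label": ["uniprot_protein", "ensembl_protein"],
    },
}

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(schema, f)
    schema_path = f.name

mapping = OntologyMapping(config_file=schema_path)
print(sorted(mapping.extended_schema))
# expected: ['ensembl.protein', 'protein', 'uniprot.protein']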

71
biocypher/_metadata.py Normal file
View File

@ -0,0 +1,71 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Package metadata (version, authors, etc).
"""
__all__ = ["get_metadata"]
import os
import pathlib
import importlib.metadata
import toml
_VERSION = "0.6.0"
def get_metadata():
"""
Basic package metadata.
Retrieves package metadata from the current project directory or from
the installed package.
"""
here = pathlib.Path(__file__).parent
pyproj_toml = "pyproject.toml"
meta = {}
for project_dir in (here, here.parent):
toml_path = str(project_dir.joinpath(pyproj_toml).absolute())
if os.path.exists(toml_path):
pyproject = toml.load(toml_path)
meta = {
"name": pyproject["tool"]["poetry"]["name"],
"version": pyproject["tool"]["poetry"]["version"],
"author": pyproject["tool"]["poetry"]["authors"],
"license": pyproject["tool"]["poetry"]["license"],
"full_metadata": pyproject,
}
break
if not meta:
try:
meta = {
k.lower(): v
for k, v in importlib.metadata.metadata(here.name).items()
}
except importlib.metadata.PackageNotFoundError:
pass
meta["version"] = meta.get("version", None) or _VERSION
return meta
metadata = get_metadata()
__version__ = metadata.get("version", None)
__author__ = metadata.get("author", None)
__license__ = metadata.get("license", None)
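For completeness, a quick sketch of reading the metadata exposed by this module:

from biocypher._metadata import __version__, get_metadata

meta = get_metadata()
print(__version__)          # falls back to _VERSION when no pyproject.toml is found
print(meta.get("license"))  # taken from pyproject.toml or the installed package metadata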

264
biocypher/_misc.py Normal file
View File

@ -0,0 +1,264 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Handy functions for use in various places.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import (
Any,
Union,
Mapping,
KeysView,
Generator,
ItemsView,
ValuesView,
)
from collections.abc import Iterable
import re
from treelib import Tree
import networkx as nx
import stringcase
__all__ = ["LIST_LIKE", "SIMPLE_TYPES", "ensure_iterable", "to_list"]
SIMPLE_TYPES = (
bytes,
str,
int,
float,
bool,
type(None),
)
LIST_LIKE = (
list,
set,
tuple,
Generator,
ItemsView,
KeysView,
Mapping,
ValuesView,
)
def to_list(value: Any) -> list:
"""
Ensures that ``value`` is a list.
"""
if isinstance(value, LIST_LIKE):
value = list(value)
else:
value = [value]
return value
def ensure_iterable(value: Any) -> Iterable:
"""
Returns iterables, except strings, wraps simple types into tuple.
"""
return value if isinstance(value, LIST_LIKE) else (value,)
def create_tree_visualisation(inheritance_graph: Union[dict, nx.Graph]) -> Tree:
"""
Creates a visualisation of the inheritance tree using treelib.
"""
inheritance_tree = _get_inheritance_tree(inheritance_graph)
classes, root = _find_root_node(inheritance_tree)
tree = Tree()
tree.create_node(root, root)
while classes:
for child in classes:
parent = inheritance_tree[child]
if parent in tree.nodes.keys() or parent == root:
tree.create_node(child, child, parent=parent)
for node in tree.nodes.keys():
if node in classes:
classes.remove(node)
return tree
def _get_inheritance_tree(inheritance_graph: Union[dict, nx.Graph]) -> dict:
"""Transforms an inheritance_graph into an inheritance_tree.
Args:
inheritance_graph: A dict or nx.Graph representing the inheritance graph.
Returns:
A dict representing the inheritance tree.
"""
if isinstance(inheritance_graph, nx.Graph):
inheritance_tree = nx.to_dict_of_lists(inheritance_graph)
multiple_parents_present = _multiple_inheritance_present(
inheritance_tree
)
if multiple_parents_present:
logger.warning(
"The ontology contains multiple inheritance (one child node "
"has multiple parent nodes). This is not visualized in the "
"following hierarchy tree (the child node is only added once). "
"If you wish to browse all relationships of the parsed "
"ontologies, write a graphml file to disk using "
"`to_disk = <directory>` and view this file."
)
# unlist values
inheritance_tree = {k: v[0] for k, v in inheritance_tree.items() if v}
return inheritance_tree
elif not _multiple_inheritance_present(inheritance_graph):
return inheritance_graph
def _multiple_inheritance_present(inheritance_tree: dict) -> bool:
"""Checks if multiple inheritance is present in the inheritance_tree."""
return any(len(value) > 1 for value in inheritance_tree.values())
def _find_root_node(inheritance_tree: dict) -> tuple[set, str]:
classes = set(inheritance_tree.keys())
parents = set(inheritance_tree.values())
root = list(parents - classes)
if len(root) > 1:
if "entity" in root:
root = "entity" # TODO: default: good standard?
else:
raise ValueError(
"Inheritance tree cannot have more than one root node. "
f"Found {len(root)}: {root}."
)
else:
root = root[0]
if not root:
# find key whose value is None
root = list(inheritance_tree.keys())[
list(inheritance_tree.values()).index(None)
]
return classes, root
# string conversion, adapted from Biolink Model Toolkit
lowercase_pattern = re.compile(r"[a-zA-Z]*[a-z][a-zA-Z]*")
underscore_pattern = re.compile(r"(?<!^)(?=[A-Z][a-z])")
def from_pascal(s: str, sep: str = " ") -> str:
underscored = underscore_pattern.sub(sep, s)
lowercased = lowercase_pattern.sub(
lambda match: match.group(0).lower(),
underscored,
)
return lowercased
def pascalcase_to_sentencecase(s: str) -> str:
"""
Convert PascalCase to sentence case.
Args:
s: Input string in PascalCase
Returns:
string in sentence case form
"""
return from_pascal(s, sep=" ")
def snakecase_to_sentencecase(s: str) -> str:
"""
Convert snake_case to sentence case.
Args:
s: Input string in snake_case
Returns:
string in sentence case form
"""
return stringcase.sentencecase(s).lower()
def sentencecase_to_snakecase(s: str) -> str:
"""
Convert sentence case to snake_case.
Args:
s: Input string in sentence case
Returns:
string in snake_case form
"""
return stringcase.snakecase(s).lower()
def sentencecase_to_pascalcase(s: str, sep: str = r"\s") -> str:
"""
Convert sentence case to PascalCase.
Args:
s: Input string in sentence case
Returns:
string in PascalCase form
"""
return re.sub(
r"(?:^|[" + sep + "])([a-zA-Z])",
lambda match: match.group(1).upper(),
s,
)
def to_lower_sentence_case(s: str) -> str:
"""
Convert any string to lower sentence case. Works with snake_case,
PascalCase, and sentence case.
Args:
s: Input string
Returns:
string in lower sentence case form
"""
if "_" in s:
return snakecase_to_sentencecase(s)
elif " " in s:
return s.lower()
elif s[0].isupper():
return pascalcase_to_sentencecase(s)
else:
return s
def is_nested(lst) -> bool:
"""
Check if a list is nested.
Args:
lst (list): The list to check.
Returns:
bool: True if the list is nested, False otherwise.
"""
for item in lst:
if isinstance(item, list):
return True
return False
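A few quick calls exercising the helpers above; the expected values in the comments follow directly from the definitions in this file.

from biocypher import _misc

print(_misc.to_list("protein"))                           # ['protein']
print(_misc.ensure_iterable(42))                          # (42,)
print(_misc.pascalcase_to_sentencecase("GeneProduct"))    # 'gene product'
print(_misc.sentencecase_to_snakecase("gene product"))    # 'gene_product'
print(_misc.sentencecase_to_pascalcase("gene product"))   # 'GeneProduct'
print(_misc.to_lower_sentence_case("gene_product"))       # 'gene product'
print(_misc.is_nested([["a"], ["b"]]))                    # True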

886
biocypher/_ontology.py Normal file
View File

@ -0,0 +1,886 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'ontology' module. Contains classes and functions to handle parsing
and representation of single ontologies as well as their hybridisation and
other advanced operations.
"""
import os
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Optional
from datetime import datetime
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
import rdflib
import networkx as nx
from ._misc import (
to_list,
to_lower_sentence_case,
create_tree_visualisation,
sentencecase_to_pascalcase,
)
from ._mapping import OntologyMapping
class OntologyAdapter:
"""
Class that represents an ontology to be used in the BioCypher framework. Can
read from a variety of formats, including OWL, OBO, and RDF/XML. The
ontology is represented by a networkx.DiGraph object; an RDFlib graph is
also kept. By default, the DiGraph reverses the label and identifier of the
nodes, such that the node name in the graph is the human-readable label. The
edges are oriented from child to parent.
Labels are formatted in lower sentence case and underscores are replaced by spaces.
Identifiers are taken as defined and the prefixes are removed by default.
"""
def __init__(
self,
ontology_file: str,
root_label: str,
ontology_file_format: Optional[str] = None,
head_join_node_label: Optional[str] = None,
merge_nodes: Optional[bool] = True,
switch_label_and_id: bool = True,
remove_prefixes: bool = True,
):
"""
Initialize the OntologyAdapter class.
Args:
ontology_file (str): Path to the ontology file. Can be local or
remote.
root_label (str): The label of the root node in the ontology. In
case of a tail ontology, this is the tail join node.
ontology_file_format (str): The format of the ontology file (e.g. "application/rdf+xml")
If format is not passed, it is determined automatically.
head_join_node_label (str): Optional variable to store the label of the
node in the head ontology that should be used to join to the
root node of the tail ontology. Defaults to None.
merge_nodes (bool): If True, head and tail join nodes will be
merged, using the label of the head join node. If False, the
tail join node will be attached as a child of the head join
node.
switch_label_and_id (bool): If True, the node names in the graph will be
the human-readable labels. If False, the node names will be the
identifiers. Defaults to True.
remove_prefixes (bool): If True, the prefixes of the identifiers will
be removed. Defaults to True.
"""
logger.info(f"Instantiating OntologyAdapter class for {ontology_file}.")
self._ontology_file = ontology_file
self._root_label = root_label
self._format = ontology_file_format
self._merge_nodes = merge_nodes
self._head_join_node = head_join_node_label
self._switch_label_and_id = switch_label_and_id
self._remove_prefixes = remove_prefixes
self._rdf_graph = self._load_rdf_graph(ontology_file)
self._nx_graph = self._rdf_to_nx(
self._rdf_graph, root_label, switch_label_and_id
)
def _rdf_to_nx(
self,
_rdf_graph: rdflib.Graph,
root_label: str,
switch_label_and_id: bool,
rename_nodes: bool = True,
) -> nx.DiGraph:
one_to_one_triples, one_to_many_dict = self._get_relevant_rdf_triples(
_rdf_graph
)
nx_graph = self._convert_to_nx(one_to_one_triples, one_to_many_dict)
nx_graph = self._add_labels_to_nodes(nx_graph, switch_label_and_id)
nx_graph = self._change_nodes_to_biocypher_format(
nx_graph, switch_label_and_id, rename_nodes
)
nx_graph = self._get_all_ancestors(
nx_graph, root_label, switch_label_and_id, rename_nodes
)
return nx.DiGraph(nx_graph)
def _get_relevant_rdf_triples(self, g: rdflib.Graph) -> tuple:
one_to_one_inheritance_graph = self._get_one_to_one_inheritance_triples(
g
)
intersection = self._get_multiple_inheritance_dict(g)
return one_to_one_inheritance_graph, intersection
def _get_one_to_one_inheritance_triples(
self, g: rdflib.Graph
) -> rdflib.Graph:
"""Get the one to one inheritance triples from the RDF graph.
Args:
g (rdflib.Graph): The RDF graph
Returns:
rdflib.Graph: The one to one inheritance graph
"""
one_to_one_inheritance_graph = Graph()
for s, p, o in g.triples((None, rdflib.RDFS.subClassOf, None)):
if self.has_label(s, g):
one_to_one_inheritance_graph.add((s, p, o))
return one_to_one_inheritance_graph
def _get_multiple_inheritance_dict(self, g: rdflib.Graph) -> dict:
"""Get the multiple inheritance dictionary from the RDF graph.
Args:
g (rdflib.Graph): The RDF graph
Returns:
dict: The multiple inheritance dictionary
"""
multiple_inheritance = g.triples(
(None, rdflib.OWL.intersectionOf, None)
)
intersection = {}
for (
node,
has_multiple_parents,
first_node_of_intersection_list,
) in multiple_inheritance:
parents = self._retrieve_rdf_linked_list(
first_node_of_intersection_list
)
child_name = None
for s_, _, _ in g.triples((None, rdflib.RDFS.subClassOf, node)):
child_name = s_
# Handle Snomed CT post coordinated expressions
if not child_name:
for s_, _, _ in g.triples(
(None, rdflib.OWL.equivalentClass, node)
):
child_name = s_
if child_name:
intersection[node] = {
"child_name": child_name,
"parent_node_names": parents,
}
return intersection
def has_label(self, node: rdflib.URIRef, g: rdflib.Graph) -> bool:
"""Does the node have a label in g?
Args:
node (rdflib.URIRef): The node to check
g (rdflib.Graph): The graph to check in
Returns:
bool: True if the node has a label, False otherwise
"""
return (node, rdflib.RDFS.label, None) in g
def _retrieve_rdf_linked_list(self, subject: rdflib.URIRef) -> list:
"""Recursively retrieves a linked list from RDF.
Example RDF list with the items [item1, item2]:
list_node - first -> item1
list_node - rest -> list_node2
list_node2 - first -> item2
list_node2 - rest -> nil
Args:
subject (rdflib.URIRef): One list_node of the RDF list
Returns:
list: The items of the RDF list
"""
g = self._rdf_graph
rdf_list = []
for s, p, o in g.triples((subject, rdflib.RDF.first, None)):
rdf_list.append(o)
for s, p, o in g.triples((subject, rdflib.RDF.rest, None)):
if o != rdflib.RDF.nil:
rdf_list.extend(self._retrieve_rdf_linked_list(o))
return rdf_list
def _convert_to_nx(
self, one_to_one: rdflib.Graph, one_to_many: dict
) -> nx.DiGraph:
"""Convert the one to one and one to many inheritance graphs to networkx.
Args:
one_to_one (rdflib.Graph): The one to one inheritance graph
one_to_many (dict): The one to many inheritance dictionary
Returns:
nx.DiGraph: The networkx graph
"""
nx_graph = rdflib_to_networkx_digraph(
one_to_one, edge_attrs=lambda s, p, o: {}, calc_weights=False
)
for key, value in one_to_many.items():
nx_graph.add_edges_from(
[
(value["child_name"], parent)
for parent in value["parent_node_names"]
]
)
if key in nx_graph.nodes:
nx_graph.remove_node(key)
return nx_graph
def _add_labels_to_nodes(
self, nx_graph: nx.DiGraph, switch_label_and_id: bool
) -> nx.DiGraph:
"""Add labels to the nodes in the networkx graph.
Args:
nx_graph (nx.DiGraph): The networkx graph
switch_label_and_id (bool): If True, id and label are switched
Returns:
nx.DiGraph: The networkx graph with labels
"""
for node in list(nx_graph.nodes):
nx_id, nx_label = self._get_nx_id_and_label(
node, switch_label_and_id
)
if nx_id == "none":
# remove node if it has no id
nx_graph.remove_node(node)
continue
nx_graph.nodes[node]["label"] = nx_label
return nx_graph
def _change_nodes_to_biocypher_format(
self,
nx_graph: nx.DiGraph,
switch_label_and_id: bool,
rename_nodes: bool = True,
) -> nx.DiGraph:
"""Change the nodes in the networkx graph to BioCypher format:
- remove the prefix of the identifier
- switch id and label
- adapt the labels (replace _ with space and convert to lower sentence case)
Args:
nx_graph (nx.DiGraph): The networkx graph
switch_label_and_id (bool): If True, id and label are switched
rename_nodes (bool): If True, the nodes are renamed
Returns:
nx.DiGraph: The networkx ontology graph in BioCypher format
"""
mapping = {
node: self._get_nx_id_and_label(
node, switch_label_and_id, rename_nodes
)[0]
for node in nx_graph.nodes
}
renamed = nx.relabel_nodes(nx_graph, mapping, copy=False)
return renamed
def _get_all_ancestors(
self,
renamed: nx.DiGraph,
root_label: str,
switch_label_and_id: bool,
rename_nodes: bool = True,
) -> nx.DiGraph:
"""Get all ancestors of the root node in the networkx graph.
Args:
renamed (nx.DiGraph): The renamed networkx graph
root_label (str): The label of the root node in the ontology
switch_label_and_id (bool): If True, id and label are switched
rename_nodes (bool): If True, the nodes are renamed
Returns:
nx.DiGraph: The filtered networkx graph
"""
root = self._get_nx_id_and_label(
self._find_root_label(self._rdf_graph, root_label),
switch_label_and_id,
rename_nodes,
)[0]
ancestors = nx.ancestors(renamed, root)
ancestors.add(root)
filtered_graph = renamed.subgraph(ancestors)
return filtered_graph
def _get_nx_id_and_label(
self, node, switch_id_and_label: bool, rename_nodes: bool = True
) -> tuple[str, str]:
"""Rename node id and label for nx graph.
Args:
node (str): The node to rename
switch_id_and_label (bool): If True, switch id and label
Returns:
tuple[str, str]: The renamed node id and label
"""
node_id_str = self._remove_prefix(str(node))
node_label_str = str(self._rdf_graph.value(node, rdflib.RDFS.label))
if rename_nodes:
node_label_str = node_label_str.replace("_", " ")
node_label_str = to_lower_sentence_case(node_label_str)
nx_id = node_label_str if switch_id_and_label else node_id_str
nx_label = node_id_str if switch_id_and_label else node_label_str
return nx_id, nx_label
def _find_root_label(self, g, root_label):
# Loop through all labels in the ontology
for label_subject, _, label_in_ontology in g.triples(
(None, rdflib.RDFS.label, None)
):
# If the label is the root label, set the root node to the label's subject
if str(label_in_ontology) == root_label:
root = label_subject
break
else:
labels_in_ontology = []
for label_subject, _, label_in_ontology in g.triples(
(None, rdflib.RDFS.label, None)
):
labels_in_ontology.append(str(label_in_ontology))
raise ValueError(
f"Could not find root node with label '{root_label}'. "
f"The ontology contains the following labels: {labels_in_ontology}"
)
return root
def _remove_prefix(self, uri: str) -> str:
"""
Remove the prefix of a URI. URIs can contain either "#" or "/" as a
separator between the prefix and the local name. The prefix is
everything before the last separator.
"""
if self._remove_prefixes:
return uri.rsplit("#", 1)[-1].rsplit("/", 1)[-1]
else:
return uri
def _load_rdf_graph(self, ontology_file):
"""
Load the ontology into an RDFlib graph. The ontology file can be in
OWL, OBO, or RDF/XML format.
"""
g = rdflib.Graph()
g.parse(ontology_file, format=self._get_format(ontology_file))
return g
def _get_format(self, ontology_file):
"""
Get the format of the ontology file.
"""
if self._format:
if self._format == "owl":
return "application/rdf+xml"
elif self._format == "obo":
raise NotImplementedError("OBO format not yet supported")
elif self._format == "rdf":
return "application/rdf+xml"
elif self._format == "ttl":
return self._format
else:
raise ValueError(
f"Could not determine format of ontology file {ontology_file}"
)
if ontology_file.endswith(".owl"):
return "application/rdf+xml"
elif ontology_file.endswith(".obo"):
raise NotImplementedError("OBO format not yet supported")
elif ontology_file.endswith(".rdf"):
return "application/rdf+xml"
elif ontology_file.endswith(".ttl"):
return "ttl"
else:
raise ValueError(
f"Could not determine format of ontology file {ontology_file}"
)
def get_nx_graph(self):
"""
Get the networkx graph representing the ontology.
"""
return self._nx_graph
def get_rdf_graph(self):
"""
Get the RDFlib graph representing the ontology.
"""
return self._rdf_graph
def get_root_node(self):
"""
Get root node in the ontology.
Returns:
root_node: If _switch_label_and_id is True, the root node label is returned,
otherwise the root node id is returned.
"""
root_node = None
root_label = self._root_label.replace("_", " ")
if self._switch_label_and_id:
root_node = to_lower_sentence_case(root_label)
elif not self._switch_label_and_id:
for node, data in self.get_nx_graph().nodes(data=True):
if "label" in data and data["label"] == to_lower_sentence_case(
root_label
):
root_node = node
break
return root_node
def get_ancestors(self, node_label):
"""
Get the ancestors of a node in the ontology.
"""
return nx.dfs_preorder_nodes(self._nx_graph, node_label)
def get_head_join_node(self):
"""
Get the head join node of the ontology.
"""
return self._head_join_node
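A usage sketch for the adapter above. The ontology URL is an assumed example of a remote Turtle file (any local .owl/.ttl path works as well), and the node name passed to get_ancestors is assumed to exist in that ontology.

from biocypher._ontology import OntologyAdapter

adapter = OntologyAdapter(
    ontology_file="https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl",  # example URL
    root_label="entity",
)

graph = adapter.get_nx_graph()
print(adapter.get_root_node())              # 'entity' (the label, since switch_label_and_id=True)
print(graph.number_of_nodes())
print(list(adapter.get_ancestors("gene")))  # DFS from 'gene' towards the root (node name assumed)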
class Ontology:
"""
A class that represents the ontological "backbone" of a BioCypher knowledge
graph. The ontology can be built from a single resource, or hybridised from
a combination of resources, with one resource being the "head" ontology,
while an arbitrary number of other resources can become "tail" ontologies at
arbitrary fusion points inside the "head" ontology.
"""
def __init__(
self,
head_ontology: dict,
ontology_mapping: Optional["OntologyMapping"] = None,
tail_ontologies: Optional[dict] = None,
):
"""
Initialize the Ontology class.
Args:
head_ontology (dict): Definition of the head ontology (url, root node,
format, etc.).
ontology_mapping (OntologyMapping): The user-provided schema mapping
used to extend the ontology. Defaults to None.
tail_ontologies (dict): Definitions of the tail ontologies to be joined
to the head ontology. Defaults to None.
"""
self._head_ontology_meta = head_ontology
self.mapping = ontology_mapping
self._tail_ontology_meta = tail_ontologies
self._tail_ontologies = None
self._nx_graph = None
# keep track of nodes that have been extended
self._extended_nodes = set()
self._main()
def _main(self) -> None:
"""
Main method to be run on instantiation. Loads the ontologies, joins
them, and returns the hybrid ontology. Loads only the head ontology
if nothing else is given. Adds user extensions and properties from
the mapping.
"""
self._load_ontologies()
if self._tail_ontologies:
for adapter in self._tail_ontologies.values():
head_join_node = self._get_head_join_node(adapter)
self._join_ontologies(adapter, head_join_node)
else:
self._nx_graph = self._head_ontology.get_nx_graph()
if self.mapping:
self._extend_ontology()
# experimental: add connections of disjoint classes to entity
# self._connect_biolink_classes()
self._add_properties()
def _load_ontologies(self) -> None:
"""
For each ontology, load the OntologyAdapter object and store it as an
instance variable (head) or a dictionary (tail).
"""
logger.info("Loading ontologies...")
self._head_ontology = OntologyAdapter(
ontology_file=self._head_ontology_meta["url"],
root_label=self._head_ontology_meta["root_node"],
ontology_file_format=self._head_ontology_meta.get("format", None),
switch_label_and_id=self._head_ontology_meta.get(
"switch_label_and_id", True
),
)
if self._tail_ontology_meta:
self._tail_ontologies = {}
for key, value in self._tail_ontology_meta.items():
self._tail_ontologies[key] = OntologyAdapter(
ontology_file=value["url"],
root_label=value["tail_join_node"],
head_join_node_label=value["head_join_node"],
ontology_file_format=value.get("format", None),
merge_nodes=value.get("merge_nodes", True),
switch_label_and_id=value.get("switch_label_and_id", True),
)
def _get_head_join_node(self, adapter: OntologyAdapter) -> str:
"""
Tries to find the head join node of the given ontology adapter in the
head ontology. If the join node is not found, the method will raise an
error.
Args:
adapter (OntologyAdapter): The ontology adapter of which to find the
join node in the head ontology.
"""
head_join_node = None
user_defined_head_join_node_label = adapter.get_head_join_node()
head_join_node_label_in_bc_format = to_lower_sentence_case(
user_defined_head_join_node_label.replace("_", " ")
)
if self._head_ontology._switch_label_and_id:
head_join_node = head_join_node_label_in_bc_format
elif not self._head_ontology._switch_label_and_id:
for node_id, data in self._head_ontology.get_nx_graph().nodes(
data=True
):
if (
"label" in data
and data["label"] == head_join_node_label_in_bc_format
):
head_join_node = node_id
break
if head_join_node not in self._head_ontology.get_nx_graph().nodes:
head_ontology = self._head_ontology._rdf_to_nx(
self._head_ontology.get_rdf_graph(),
self._head_ontology._root_label,
self._head_ontology._switch_label_and_id,
rename_nodes=False,
)
raise ValueError(
f"Head join node '{head_join_node}' not found in head ontology. "
f"The head ontology contains the following nodes: {head_ontology.nodes}."
)
return head_join_node
def _join_ontologies(
self, adapter: OntologyAdapter, head_join_node
) -> None:
"""
Joins the ontologies by adding the tail ontology as a subgraph to the
head ontology at the specified join nodes.
Args:
adapter (OntologyAdapter): The ontology adapter of the tail ontology
to be added to the head ontology.
"""
if not self._nx_graph:
self._nx_graph = self._head_ontology.get_nx_graph().copy()
tail_join_node = adapter.get_root_node()
tail_ontology = adapter.get_nx_graph()
# subtree of tail ontology at join node
tail_ontology_subtree = nx.dfs_tree(
tail_ontology.reverse(), tail_join_node
).reverse()
# transfer node attributes from tail ontology to subtree
for node in tail_ontology_subtree.nodes:
tail_ontology_subtree.nodes[node].update(tail_ontology.nodes[node])
# if merge_nodes is False, create parent of tail join node from head
# join node
if not adapter._merge_nodes:
# add head join node from head ontology to tail ontology subtree
# as parent of tail join node
tail_ontology_subtree.add_node(
head_join_node,
**self._head_ontology.get_nx_graph().nodes[head_join_node],
)
tail_ontology_subtree.add_edge(tail_join_node, head_join_node)
# else rename tail join node to match head join node if necessary
elif not tail_join_node == head_join_node:
tail_ontology_subtree = nx.relabel_nodes(
tail_ontology_subtree, {tail_join_node: head_join_node}
)
# combine head ontology and tail subtree
self._nx_graph = nx.compose(self._nx_graph, tail_ontology_subtree)
def _extend_ontology(self) -> None:
"""
Adds the user extensions to the ontology. Tries to find the parent in
the ontology, adds it if necessary, and adds the child and a directed
edge from child to parent. Can handle multiple parents.
"""
if not self._nx_graph:
self._nx_graph = self._head_ontology.get_nx_graph().copy()
for key, value in self.mapping.extended_schema.items():
if not value.get("is_a"):
if self._nx_graph.has_node(value.get("synonym_for")):
continue
if not self._nx_graph.has_node(key):
raise ValueError(
f"Node {key} not found in ontology, but also has no "
"inheritance definition. Please check your schema for "
"spelling errors, first letter not in lower case, use of underscores, a missing `is_a` definition (SubClassOf a root node), or missing labels in class or super-classes."
)
continue
parents = to_list(value.get("is_a"))
child = key
while parents:
parent = parents.pop(0)
if parent not in self._nx_graph.nodes:
self._nx_graph.add_node(parent)
self._nx_graph.nodes[parent][
"label"
] = sentencecase_to_pascalcase(parent)
# mark parent as user extension
self._nx_graph.nodes[parent]["user_extension"] = True
self._extended_nodes.add(parent)
if child not in self._nx_graph.nodes:
self._nx_graph.add_node(child)
self._nx_graph.nodes[child][
"label"
] = sentencecase_to_pascalcase(child)
# mark child as user extension
self._nx_graph.nodes[child]["user_extension"] = True
self._extended_nodes.add(child)
self._nx_graph.add_edge(child, parent)
child = parent
def _connect_biolink_classes(self) -> None:
"""
Experimental: Adds edges from disjoint classes to the entity node.
"""
if not self._nx_graph:
self._nx_graph = self._head_ontology.get_nx_graph().copy()
if "entity" not in self._nx_graph.nodes:
return
# biolink classes that are disjoint from entity
disjoint_classes = [
"frequency qualifier mixin",
"chemical entity to entity association mixin",
"ontology class",
"relationship quantifier",
"physical essence or occurrent",
"gene or gene product",
"subject of investigation",
]
for node in disjoint_classes:
if not self._nx_graph.nodes.get(node):
self._nx_graph.add_node(node)
self._nx_graph.nodes[node][
"label"
] = sentencecase_to_pascalcase(node)
self._nx_graph.add_edge(node, "entity")
def _add_properties(self) -> None:
"""
For each entity in the mapping, update the ontology with the properties
specified in the mapping. Updates synonym information in the graph,
setting the synonym as the primary node label.
"""
for key, value in self.mapping.extended_schema.items():
if key in self._nx_graph.nodes:
self._nx_graph.nodes[key].update(value)
if value.get("synonym_for"):
# change node label to synonym
if value["synonym_for"] not in self._nx_graph.nodes:
raise ValueError(
f'Node {value["synonym_for"]} not found in ontology.'
)
self._nx_graph = nx.relabel_nodes(
self._nx_graph, {value["synonym_for"]: key}
)
def get_ancestors(self, node_label: str) -> list:
"""
Get the ancestors of a node in the ontology.
Args:
node_label (str): The label of the node in the ontology.
Returns:
list: A list of the ancestors of the node.
"""
return nx.dfs_tree(self._nx_graph, node_label)
def show_ontology_structure(self, to_disk: str = None, full: bool = False):
"""
Show the ontology structure using treelib or write to GRAPHML file.
Args:
to_disk (str): If specified, the ontology structure will be saved
to disk as a GRAPHML file at the location (directory) specified
by the `to_disk` string, to be opened in your favourite graph
visualisation tool.
full (bool): If True, the full ontology structure will be shown,
including all nodes and edges. If False, only the nodes and
edges that are relevant to the extended schema will be shown.
"""
if not full and not self.mapping.extended_schema:
raise ValueError(
"You are attempting to visualise a subset of the loaded"
"ontology, but have not provided a schema configuration. "
"To display a partial ontology graph, please provide a schema "
"configuration file; to visualise the full graph, please use "
"the parameter `full = True`."
)
if not self._nx_graph:
raise ValueError("Ontology not loaded.")
if not self._tail_ontologies:
msg = f"Showing ontology structure based on {self._head_ontology._ontology_file}"
else:
msg = f"Showing ontology structure based on {len(self._tail_ontology_meta)+1} ontologies: "
logger.info(msg)
if not full:
# set of leaves and their intermediate parents up to the root
filter_nodes = set(self.mapping.extended_schema.keys())
for node in self.mapping.extended_schema.keys():
filter_nodes.update(self.get_ancestors(node).nodes)
# filter graph
G = self._nx_graph.subgraph(filter_nodes)
else:
G = self._nx_graph
if not to_disk:
# create tree
tree = create_tree_visualisation(G)
# add synonym information
for node in self.mapping.extended_schema:
if not isinstance(self.mapping.extended_schema[node], dict):
continue
if self.mapping.extended_schema[node].get("synonym_for"):
tree.nodes[node].tag = (
f"{node} = "
f"{self.mapping.extended_schema[node].get('synonym_for')}"
)
logger.info(f"\n{tree}")
return tree
else:
# convert lists/dicts to strings for vis only
for node in G.nodes:
# rename node and use former id as label
label = G.nodes[node].get("label")
if not label:
label = node
G = nx.relabel_nodes(G, {node: label})
G.nodes[label]["label"] = node
for attrib in G.nodes[label]:
if type(G.nodes[label][attrib]) in [list, dict]:
G.nodes[label][attrib] = str(G.nodes[label][attrib])
path = os.path.join(to_disk, "ontology_structure.graphml")
logger.info(f"Writing ontology structure to {path}.")
nx.write_graphml(G, path)
return True
def get_dict(self) -> dict:
"""
Returns a dictionary compatible with a BioCypher node for compatibility
with the Neo4j driver.
"""
d = {
"node_id": self._get_current_id(),
"node_label": "BioCypher",
"properties": {
"schema": "self.ontology_mapping.extended_schema",
},
}
return d
def _get_current_id(self):
"""
Instantiate a version ID for the current session. For now does simple
versioning using datetime.
Can later implement incremental versioning, versioning from
config file, or manual specification via argument.
"""
now = datetime.now()
return now.strftime("v%Y%m%d-%H%M%S")
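The Ontology class consumes plain dictionaries describing the head and tail ontologies; the keys below mirror those read in _load_ontologies, while the URLs, file paths, and join nodes are illustrative assumptions.

from biocypher._mapping import OntologyMapping
from biocypher._ontology import Ontology

head = {
    "url": "https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl",  # example URL
    "root_node": "entity",
    "switch_label_and_id": True,
}
tails = {
    "so": {  # arbitrary key
        "url": "test/ontologies/so.owl",       # assumed local file
        "head_join_node": "sequence variant",  # assumed to exist in the head ontology
        "tail_join_node": "sequence_variant",  # assumed root label of the tail ontology
        "merge_nodes": True,
    },
}

ontology = Ontology(
    head_ontology=head,
    ontology_mapping=OntologyMapping("config/schema_config.yaml"),  # path assumed
    tail_ontologies=tails,
)
ontology.show_ontology_structure()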

480
biocypher/_translate.py Normal file
View File

@ -0,0 +1,480 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'translation' module. Responsible for translating between the raw
input data and the BioCypherNode and BioCypherEdge objects.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Any, Union, Optional
from collections.abc import Iterable, Generator
from more_itertools import peekable
from . import _misc
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from ._ontology import Ontology
__all__ = ["BiolinkAdapter", "Translator"]
class Translator:
"""
Class responsible for executing the translation process that is configured in
the schema_config.yaml file. Creates a mapping dictionary from that file,
and, given nodes and edges, translates them into BioCypherNodes and
BioCypherEdges. During this process, can also filter the properties of the
entities if the schema_config.yaml file specifies a property whitelist or
blacklist.
Provides utility functions for translating between input and output labels
and cypher queries.
"""
def __init__(self, ontology: "Ontology", strict_mode: bool = False):
"""
Args:
ontology:
The Ontology object whose extended schema details the leaves of
the hierarchy tree representing the structure of the graph; the
leaves are the entities that will be direct components of the
graph, while the intermediary nodes are additional labels for
filtering purposes.
strict_mode:
If True, the translator will raise an error if input data do not
carry source, licence, and version information.
"""
self.ontology = ontology
self.strict_mode = strict_mode
# record nodes without biolink type configured in schema_config.yaml
self.notype = {}
# mapping functionality for translating terms and queries
self.mappings = {}
self.reverse_mappings = {}
self._update_ontology_types()
def translate_nodes(
self,
node_tuples: Iterable,
) -> Generator[BioCypherNode, None, None]:
"""
Translates input node representation to a representation that
conforms to the schema of the given BioCypher graph. For now
requires explicit statement of node type on pass.
Args:
node_tuples (list of tuples): collection of tuples
representing individual nodes by their unique id and a type
that is translated from the original database notation to
the corresponding BioCypher notation.
"""
self._log_begin_translate(node_tuples, "nodes")
for _id, _type, _props in node_tuples:
# check for strict mode requirements
required_props = ["source", "licence", "version"]
if self.strict_mode:
# rename 'license' to 'licence' in _props
if _props.get("license"):
_props["licence"] = _props.pop("license")
for prop in required_props:
if prop not in _props:
raise ValueError(
f"Property `{prop}` missing from node {_id}. "
"Strict mode is enabled, so this is not allowed."
)
# find the node in leaves that represents ontology node type
_ontology_class = self._get_ontology_mapping(_type)
if _ontology_class:
# filter properties for those specified in schema_config if any
_filtered_props = self._filter_props(_ontology_class, _props)
# preferred id
_preferred_id = self._get_preferred_id(_ontology_class)
yield BioCypherNode(
node_id=_id,
node_label=_ontology_class,
preferred_id=_preferred_id,
properties=_filtered_props,
)
else:
self._record_no_type(_type, _id)
self._log_finish_translate("nodes")
def _get_preferred_id(self, _bl_type: str) -> str:
"""
Returns the preferred id for the given Biolink type.
"""
return (
self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
if "preferred_id"
in self.ontology.mapping.extended_schema.get(_bl_type, {})
else "id"
)
def _filter_props(self, bl_type: str, props: dict) -> dict:
"""
Filters properties for those specified in schema_config if any.
"""
filter_props = self.ontology.mapping.extended_schema[bl_type].get(
"properties", {}
)
# strict mode: add required properties (only if there is a whitelist)
if self.strict_mode and filter_props:
filter_props.update(
{"source": "str", "licence": "str", "version": "str"},
)
exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
"exclude_properties", []
)
if isinstance(exclude_props, str):
exclude_props = [exclude_props]
if filter_props and exclude_props:
filtered_props = {
k: v
for k, v in props.items()
if (k in filter_props.keys() and k not in exclude_props)
}
elif filter_props:
filtered_props = {
k: v for k, v in props.items() if k in filter_props.keys()
}
elif exclude_props:
filtered_props = {
k: v for k, v in props.items() if k not in exclude_props
}
else:
return props
missing_props = [
k for k in filter_props.keys() if k not in filtered_props.keys()
]
# add missing properties with default values
for k in missing_props:
filtered_props[k] = None
return filtered_props
def translate_edges(
self,
edge_tuples: Iterable,
) -> Generator[Union[BioCypherEdge, BioCypherRelAsNode], None, None]:
"""
Translates input edge representation to a representation that
conforms to the schema of the given BioCypher graph. For now
requires explicit statement of edge type on pass.
Args:
edge_tuples (list of tuples):
collection of tuples representing source and target of
an interaction via their unique ids as well as the type
of interaction in the original database notation, which
is translated to BioCypher notation using the `leaves`.
Can optionally possess its own ID.
"""
self._log_begin_translate(edge_tuples, "edges")
# legacy: deal with 4-tuples (no edge id)
# TODO remove for performance reasons once safe
edge_tuples = peekable(edge_tuples)
if len(edge_tuples.peek()) == 4:
edge_tuples = [
(None, src, tar, typ, props)
for src, tar, typ, props in edge_tuples
]
for _id, _src, _tar, _type, _props in edge_tuples:
# check for strict mode requirements
if self.strict_mode:
if not "source" in _props:
raise ValueError(
f"Edge {_id if _id else (_src, _tar)} does not have a `source` property.",
" This is required in strict mode.",
)
if not "licence" in _props:
raise ValueError(
f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property.",
" This is required in strict mode.",
)
# match the input label (_type) to
# a Biolink label from schema_config
bl_type = self._get_ontology_mapping(_type)
if bl_type:
# filter properties for those specified in schema_config if any
_filtered_props = self._filter_props(bl_type, _props)
rep = self.ontology.mapping.extended_schema[bl_type][
"represented_as"
]
if rep == "node":
if _id:
# if it brings its own ID, use it
node_id = _id
else:
# source target concat
node_id = (
str(_src)
+ "_"
+ str(_tar)
+ "_"
+ "_".join(str(v) for v in _filtered_props.values())
)
n = BioCypherNode(
node_id=node_id,
node_label=bl_type,
properties=_filtered_props,
)
# directionality check TODO generalise to account for
# different descriptions of directionality or find a
# more consistent solution for indicating directionality
if _filtered_props.get("directed") == True:
l1 = "IS_SOURCE_OF"
l2 = "IS_TARGET_OF"
elif _filtered_props.get(
"src_role",
) and _filtered_props.get("tar_role"):
l1 = _filtered_props.get("src_role")
l2 = _filtered_props.get("tar_role")
else:
l1 = l2 = "IS_PART_OF"
e_s = BioCypherEdge(
source_id=_src,
target_id=node_id,
relationship_label=l1,
# additional here
)
e_t = BioCypherEdge(
source_id=_tar,
target_id=node_id,
relationship_label=l2,
# additional here
)
yield BioCypherRelAsNode(n, e_s, e_t)
else:
edge_label = self.ontology.mapping.extended_schema[
bl_type
].get("label_as_edge")
if edge_label is None:
edge_label = bl_type
yield BioCypherEdge(
relationship_id=_id,
source_id=_src,
target_id=_tar,
relationship_label=edge_label,
properties=_filtered_props,
)
else:
self._record_no_type(_type, (_src, _tar))
self._log_finish_translate("edges")
def _record_no_type(self, _type: Any, what: Any) -> None:
"""
Records the type of a node or edge that is not represented in the
schema_config.
"""
logger.debug(f"No ontology type defined for `{_type}`: {what}")
if self.notype.get(_type, None):
self.notype[_type] += 1
else:
self.notype[_type] = 1
def get_missing_biolink_types(self) -> dict:
"""
Returns a dictionary of types that were not represented in the
schema_config.
"""
return self.notype
@staticmethod
def _log_begin_translate(_input: Iterable, what: str):
n = f"{len(_input)} " if hasattr(_input, "__len__") else ""
logger.debug(f"Translating {n}{what} to BioCypher")
@staticmethod
def _log_finish_translate(what: str):
logger.debug(f"Finished translating {what} to BioCypher.")
def _update_ontology_types(self):
"""
Creates a dictionary to translate from input labels to ontology labels.
If multiple input labels, creates mapping for each.
"""
self._ontology_mapping = {}
for key, value in self.ontology.mapping.extended_schema.items():
labels = value.get("input_label") or value.get("label_in_input")
if isinstance(labels, str):
self._ontology_mapping[labels] = key
elif isinstance(labels, list):
for label in labels:
self._ontology_mapping[label] = key
if value.get("label_as_edge"):
self._add_translation_mappings(labels, value["label_as_edge"])
else:
self._add_translation_mappings(labels, key)
def _get_ontology_mapping(self, label: str) -> Optional[str]:
"""
For each given input type ("input_label" or "label_in_input"), find the
corresponding ontology class in the leaves dictionary (from the
`schema_config.yaml`).
Args:
label:
The input type to find (`input_label` or `label_in_input` in
`schema_config.yaml`).
"""
return self._ontology_mapping.get(label, None)
def translate_term(self, term):
"""
Translate a single term.
"""
return self.mappings.get(term, None)
def reverse_translate_term(self, term):
"""
Reverse translate a single term.
"""
return self.reverse_mappings.get(term, None)
def translate(self, query):
"""
Translate a cypher query. Only translates labels as of now.
"""
for key in self.mappings:
query = query.replace(":" + key, ":" + self.mappings[key])
return query
def reverse_translate(self, query):
"""
Reverse translate a cypher query. Only translates labels as of
now.
"""
for key in self.reverse_mappings:
a = ":" + key + ")"
b = ":" + key + "]"
# TODO this conditional probably does not cover all cases
if a in query or b in query:
if isinstance(self.reverse_mappings[key], list):
raise NotImplementedError(
"Reverse translation of multiple inputs not "
"implemented yet. Many-to-one mappings are "
"not reversible. "
f"({key} -> {self.reverse_mappings[key]})",
)
else:
query = query.replace(
a,
":" + self.reverse_mappings[key] + ")",
).replace(b, ":" + self.reverse_mappings[key] + "]")
return query
def _add_translation_mappings(self, original_name, biocypher_name):
"""
Add translation mappings for a label and name. We use here the
PascalCase version of the BioCypher name, since sentence case is
not useful for Cypher queries.
"""
if isinstance(original_name, list):
for on in original_name:
self.mappings[on] = self.name_sentence_to_pascal(
biocypher_name,
)
else:
self.mappings[original_name] = self.name_sentence_to_pascal(
biocypher_name,
)
if isinstance(biocypher_name, list):
for bn in biocypher_name:
self.reverse_mappings[
self.name_sentence_to_pascal(
bn,
)
] = original_name
else:
self.reverse_mappings[
self.name_sentence_to_pascal(
biocypher_name,
)
] = original_name
@staticmethod
def name_sentence_to_pascal(name: str) -> str:
"""
Converts a name in sentence case to pascal case.
"""
# split on dots if dot is present
if "." in name:
return ".".join(
[_misc.sentencecase_to_pascalcase(n) for n in name.split(".")],
)
else:
return _misc.sentencecase_to_pascalcase(name)
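For illustration, the label translation and PascalCase conversion above amount to a dictionary lookup plus a case conversion. The following is a minimal standalone sketch of the idea, not the BioCypher API: the mapping and labels are invented, and the real conversion lives in _misc.sentencecase_to_pascalcase.

def sentence_to_pascal(name: str) -> str:
    # "small molecule" -> "SmallMolecule"; dotted names are converted per segment
    return ".".join(
        "".join(word.capitalize() for word in part.split(" "))
        for part in name.split(".")
    )

# hypothetical input-label -> PascalCase ontology-label mapping
mappings = {"uniprot_protein": sentence_to_pascal("protein")}

query = "MATCH (p:uniprot_protein) RETURN p"
for key, value in mappings.items():
    query = query.replace(":" + key, ":" + value)

print(query)  # MATCH (p:Protein) RETURN p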

View File

@ -0,0 +1,422 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
"""
import subprocess
from biocypher._logger import logger
logger.debug(f"Loading module {__name__}.")
from collections.abc import Iterable
import itertools
import neo4j_utils
from biocypher import _misc
from biocypher._config import config as _config
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._translate import Translator
__all__ = ["_Neo4jDriver"]
class _Neo4jDriver:
"""
Manages a BioCypher connection to a Neo4j database using the
``neo4j_utils.Driver`` class.
Args:
database_name (str): The name of the database to connect to.
wipe (bool): Whether to wipe the database before importing.
uri (str): The URI of the database.
user (str): The username to use for authentication.
password (str): The password to use for authentication.
multi_db (bool): Whether to use multi-database mode.
fetch_size (int): The number of records to fetch at a time.
increment_version (bool): Whether to increment the version number.
translator (Translator): The translator to use for mapping.
"""
def __init__(
self,
database_name: str,
uri: str,
user: str,
password: str,
multi_db: bool,
translator: Translator,
wipe: bool = False,
fetch_size: int = 1000,
increment_version: bool = True,
):
self.translator = translator
self._driver = neo4j_utils.Driver(
db_name=database_name,
db_uri=uri,
db_user=user,
db_passwd=password,
fetch_size=fetch_size,
wipe=wipe,
multi_db=multi_db,
raise_errors=True,
)
# check for biocypher config in connected graph
if wipe:
self.init_db()
if increment_version:
# set new current version node
self._update_meta_graph()
def _update_meta_graph(self):
logger.info("Updating Neo4j meta graph.")
# find current version node
db_version = self._driver.query(
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
)
# add version node
self.add_biocypher_nodes(self.translator.ontology)
# connect version node to previous
if db_version[0]:
previous = db_version[0][0]
previous_id = previous["v"]["id"]
e_meta = BioCypherEdge(
previous_id,
self.translator.ontology.get_dict().get("node_id"),
"PRECEDES",
)
self.add_biocypher_edges(e_meta)
def init_db(self):
"""
Used to initialise a property graph database by setting up new
constraints. Wipe has been performed by the ``neo4j_utils.Driver``
class already.
Todo:
- set up constraint creation interactively depending on the
need of the database
"""
logger.info("Initialising database.")
self._create_constraints()
def _create_constraints(self):
"""
Creates constraints on node types in the graph. Used for
initial setup.
Grabs leaves of the ``schema_config.yaml`` file and creates
constraints on the id of all entities represented as nodes.
"""
logger.info("Creating constraints for node types in config.")
major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
# get structure
for leaf in self.translator.ontology.mapping.extended_schema.items():
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
if leaf[1]["represented_as"] == "node":
if major_neo4j_version >= 5:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS FOR (n:`{label}`) "
"REQUIRE n.id IS UNIQUE"
)
self._driver.query(s)
else:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS ON (n:`{label}`) "
"ASSERT n.id IS UNIQUE"
)
self._driver.query(s)
def _get_neo4j_version(self):
"""Get neo4j version."""
try:
neo4j_version = self._driver.query(
"""
CALL dbms.components()
YIELD name, versions, edition
UNWIND versions AS version
RETURN version AS version
""",
)[0][0]["version"]
return neo4j_version
except Exception as e:
logger.warning(
f"Error detecting Neo4j version: {e} use default version 4.0.0."
)
return "4.0.0"
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic node adder method to add any kind of input to the graph via the
:class:`biocypher.create.BioCypherNode` class. Employs translation
functionality and calls the :meth:`add_biocypher_nodes()` method.
Args:
id_type_tuples (iterable of 3-tuple): for each node to add to
the biocypher graph, a 3-tuple with the following layout:
first, the (unique if constrained) ID of the node; second, the
type of the node, capitalised or PascalCase and in noun form
(Neo4j primary label, e.g. `:Protein`); and third, a dictionary
of arbitrary properties the node should possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_nodes()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_nodes(id_type_tuples)
return self.add_biocypher_nodes(bn)
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic edge adder method to add any kind of input to the graph
via the :class:`biocypher.create.BioCypherEdge` class. Employs
translation functionality and calls the
:meth:`add_biocypher_edges()` method.
Args:
id_src_tar_type_tuples (iterable of 5-tuple):
for each edge to add to the biocypher graph, a 5-tuple
with the following layout: first, the optional unique ID
of the interaction. This can be `None` if there is no
systematic identifier (which for many interactions is
the case). Second and third, the (unique if constrained)
IDs of the source and target nodes of the relationship;
fourth, the type of the relationship; and fifth, a
dictionary of arbitrary properties the edge should
possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_edges()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_edges(id_src_tar_type_tuples)
return self.add_biocypher_edges(bn)
def add_biocypher_nodes(
self,
nodes: Iterable[BioCypherNode],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts a node type handoff class
(:class:`biocypher.create.BioCypherNode`) with id,
label, and a dict of properties (passing on the type of
property, i.e., ``int``, ``str``, ...).
The dict retrieved by the
:meth:`biocypher.create.BioCypherNode.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding node id
and label, and adding all other properties from the 'properties'
key of the dict. The merge is performed via APOC, matching only
on node id to prevent duplicates. The same properties are set on
match and on create, irrespective of the actual event.
Args:
nodes:
An iterable of :class:`biocypher.create.BioCypherNode` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
True for success, False otherwise.
"""
try:
nodes = _misc.to_list(nodes)
entities = [node.get_dict() for node in nodes]
except AttributeError:
msg = "Nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
logger.info(f"Merging {len(entities)} nodes.")
entity_query = (
"UNWIND $entities AS ent "
"CALL apoc.merge.node([ent.node_label], "
"{id: ent.node_id}, ent.properties, ent.properties) "
"YIELD node "
"RETURN node"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
entity_query,
parameters={
"entities": entities,
},
)
logger.info("Finished merging nodes.")
return result
def add_biocypher_edges(
self,
edges: Iterable[BioCypherEdge],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts an edge type handoff class
(:class:`biocypher.create.BioCypherEdge`) with source
and target ids, label, and a dict of properties (passing on the
type of property, i.e., int, string ...).
The individual edge is either passed as a singleton, in the case
of representation as an edge in the graph, or as a 4-tuple, in
the case of representation as a node (with two edges connecting
to interaction partners).
The dict retrieved by the
:meth:`biocypher.create.BioCypherEdge.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding source
and target ids and the relationship label, and adding all edge
properties from the 'properties' key of the dict. The merge is
performed via APOC, matching only on source and target id to
prevent duplicates. The same properties are set on match and on
create, irrespective of the actual event.
Args:
edges:
An iterable of :class:`biocypher.create.BioCypherEdge` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
`True` for success, `False` otherwise.
"""
edges = _misc.ensure_iterable(edges)
edges = itertools.chain(*(_misc.ensure_iterable(i) for i in edges))
nodes = []
rels = []
try:
for e in edges:
if hasattr(e, "get_node"):
nodes.append(e.get_node())
rels.append(e.get_source_edge().get_dict())
rels.append(e.get_target_edge().get_dict())
else:
rels.append(e.get_dict())
except AttributeError:
msg = "Edges and nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
self.add_biocypher_nodes(nodes)
logger.info(f"Merging {len(rels)} edges.")
# cypher query
# merging only on the ids of the entities, passing the
# properties on match and on create;
# TODO add node labels?
node_query = (
"UNWIND $rels AS r "
"MERGE (src {id: r.source_id}) "
"MERGE (tar {id: r.target_id}) "
)
self._driver.query(node_query, parameters={"rels": rels})
edge_query = (
"UNWIND $rels AS r "
"MATCH (src {id: r.source_id}) "
"MATCH (tar {id: r.target_id}) "
"WITH src, tar, r "
"CALL apoc.merge.relationship"
"(src, r.relationship_label, NULL, "
"r.properties, tar, r.properties) "
"YIELD rel "
"RETURN rel"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
edge_query, parameters={"rels": rels}
)
logger.info("Finished merging edges.")
return result
def get_driver(
dbms: str,
translator: "Translator",
):
"""
Function to return the DBMS driver instance.
Returns:
instance: an instance of the selected driver class, or None if the DBMS is not supported.
"""
dbms_config = _config(dbms)
if dbms == "neo4j":
return _Neo4jDriver(
database_name=dbms_config["database_name"],
wipe=dbms_config["wipe"],
uri=dbms_config["uri"],
user=dbms_config["user"],
password=dbms_config["password"],
multi_db=dbms_config["multi_db"],
translator=translator,
)
return None
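The expected shapes of the inputs to add_nodes and add_edges, as described in the docstrings above, can be sketched as plain tuples; the identifiers and properties below are invented for illustration.

# 3-tuples for add_nodes: (node id, node type, properties dict)
node_tuples = [
    ("uniprot:P12345", "Protein", {"name": "example protein"}),
    ("uniprot:P67890", "Protein", {}),
]

# 5-tuples for add_edges: (optional edge id, source id, target id, type, properties dict)
edge_tuples = [
    (None, "uniprot:P12345", "uniprot:P67890", "Interacts_With", {"source": "example"}),
]

# requires a running Neo4j instance and a configured Translator:
# driver = get_driver("neo4j", translator)
# driver.add_nodes(node_tuples)
# driver.add_edges(edge_tuples)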

View File

@ -0,0 +1,90 @@
import pandas as pd
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
class Pandas:
def __init__(self, translator, deduplicator):
self.translator = translator
self.deduplicator = deduplicator
self.dfs = {}
def _separate_entity_types(self, entities):
"""
Given mixed iterable of BioCypher objects, separate them into lists by
type. Also deduplicates using the `Deduplicator` instance.
"""
lists = {}
for entity in entities:
if (
not isinstance(entity, BioCypherNode)
and not isinstance(entity, BioCypherEdge)
and not isinstance(entity, BioCypherRelAsNode)
):
raise TypeError(
"Expected a BioCypherNode / BioCypherEdge / "
f"BioCypherRelAsNode, got {type(entity)}."
)
if isinstance(entity, BioCypherNode):
seen = self.deduplicator.node_seen(entity)
elif isinstance(entity, BioCypherEdge):
seen = self.deduplicator.edge_seen(entity)
elif isinstance(entity, BioCypherRelAsNode):
seen = self.deduplicator.rel_as_node_seen(entity)
if seen:
continue
if isinstance(entity, BioCypherRelAsNode):
node = entity.get_node()
source_edge = entity.get_source_edge()
target_edge = entity.get_target_edge()
_type = node.get_type()
if _type not in lists:
lists[_type] = []
lists[_type].append(node)
_source_type = source_edge.get_type()
if _source_type not in lists:
lists[_source_type] = []
lists[_source_type].append(source_edge)
_target_type = target_edge.get_type()
if _target_type not in lists:
lists[_target_type] = []
lists[_target_type].append(target_edge)
continue
_type = entity.get_type()
if _type not in lists:
lists[_type] = []
lists[_type].append(entity)
return lists
def add_tables(self, entities):
"""
Add Pandas dataframes for each node and edge type in the input.
"""
lists = self._separate_entity_types(entities)
for _type, _entities in lists.items():
self._add_entity_df(_type, _entities)
def _add_entity_df(self, _type, _entities):
df = pd.DataFrame(
pd.json_normalize([node.get_dict() for node in _entities])
)
# replace "properties." with "" in column names
df.columns = [col.replace("properties.", "") for col in df.columns]
if _type not in self.dfs:
self.dfs[_type] = df
else:
self.dfs[_type] = pd.concat(
[self.dfs[_type], df], ignore_index=True
)
return self.dfs[_type]
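The flattening performed by _add_entity_df can be reproduced in isolation. Below is a small standalone pandas sketch using dicts shaped like BioCypherNode.get_dict() output; the ids and properties are invented for illustration.

import pandas as pd

records = [
    {"node_id": "uniprot:P12345", "node_label": "Protein", "properties": {"name": "example"}},
    {"node_id": "uniprot:P67890", "node_label": "Protein", "properties": {"name": "another"}},
]

df = pd.json_normalize(records)
# nested properties arrive as "properties.<key>" columns; strip the prefix
df.columns = [col.replace("properties.", "") for col in df.columns]
print(df.columns.tolist())  # ['node_id', 'node_label', 'name']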

View File

File diff suppressed because it is too large

View File

@ -0,0 +1,113 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# Michael Hartung
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from biocypher._logger import logger
from biocypher.output.write.graph._rdf import _RDFWriter
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
from biocypher.output.write.graph._networkx import _NetworkXWriter
from biocypher.output.write.relational._csv import _PandasCSVWriter
from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
logger.debug(f"Loading module {__name__}.")
from typing import TYPE_CHECKING
from biocypher._config import config as _config
__all__ = ["get_writer", "DBMS_TO_CLASS"]
if TYPE_CHECKING:
from biocypher._translate import Translator
from biocypher._deduplicate import Deduplicator
DBMS_TO_CLASS = {
"neo": _Neo4jBatchWriter,
"neo4j": _Neo4jBatchWriter,
"Neo4j": _Neo4jBatchWriter,
"postgres": _PostgreSQLBatchWriter,
"postgresql": _PostgreSQLBatchWriter,
"PostgreSQL": _PostgreSQLBatchWriter,
"arango": _ArangoDBBatchWriter,
"arangodb": _ArangoDBBatchWriter,
"ArangoDB": _ArangoDBBatchWriter,
"sqlite": _SQLiteBatchWriter,
"sqlite3": _SQLiteBatchWriter,
"rdf": _RDFWriter,
"RDF": _RDFWriter,
"csv": _PandasCSVWriter,
"CSV": _PandasCSVWriter,
"pandas": _PandasCSVWriter,
"Pandas": _PandasCSVWriter,
"networkx": _NetworkXWriter,
"NetworkX": _NetworkXWriter,
}
def get_writer(
dbms: str,
translator: "Translator",
deduplicator: "Deduplicator",
output_directory: str,
strict_mode: bool,
):
"""
Function to return the writer class based on the selection in the config
file.
Args:
dbms: the database management system; for options, see DBMS_TO_CLASS.
translator: the Translator object.
deduplicator: the Deduplicator object.
output_directory: the directory to write the output files to.
strict_mode: whether to use strict mode.
Returns:
instance: an instance of the selected writer class.
"""
dbms_config = _config(dbms)
writer = DBMS_TO_CLASS.get(dbms)
if writer is None:
raise ValueError(f"Unknown dbms: {dbms}")
return writer(
translator=translator,
deduplicator=deduplicator,
delimiter=dbms_config.get("delimiter"),
array_delimiter=dbms_config.get("array_delimiter"),
quote=dbms_config.get("quote_character"),
output_directory=output_directory,
db_name=dbms_config.get("database_name"),
import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
wipe=dbms_config.get("wipe"),
strict_mode=strict_mode,
skip_bad_relationships=dbms_config.get(
"skip_bad_relationships"
), # neo4j
skip_duplicate_nodes=dbms_config.get(
"skip_duplicate_nodes"
), # neo4j
db_user=dbms_config.get("user"), # psql
db_password=dbms_config.get("password"), # psql
db_port=dbms_config.get("port"), # psql
rdf_format=dbms_config.get("rdf_format"), # rdf
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
)
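The writer selection above is a plain, case-sensitive dictionary lookup followed by instantiation with config-derived keyword arguments. A self-contained sketch of the dispatch pattern, with a dummy writer class standing in for the real ones:

class _DummyWriter:
    def __init__(self, **kwargs):
        self.kwargs = kwargs

REGISTRY = {"csv": _DummyWriter}

def pick_writer(dbms: str, **kwargs):
    writer_cls = REGISTRY.get(dbms)
    if writer_cls is None:
        raise ValueError(f"Unknown dbms: {dbms}")
    return writer_cls(**kwargs)

writer = pick_writer("csv", output_directory="biocypher-out")
print(type(writer).__name__)  # _DummyWriter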

View File

@ -0,0 +1,200 @@
from abc import ABC, abstractmethod
from typing import Union, Optional
from collections.abc import Iterable
import os
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from biocypher._logger import logger
from biocypher._translate import Translator
from biocypher._deduplicate import Deduplicator
__all__ = ["_Writer"]
class _Writer(ABC):
"""Abstract class for writing node and edge representations to disk.
Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
are implemented in the child classes. Any concrete writer needs to
implement at least:
- _write_node_data
- _write_edge_data
- _construct_import_call
- _get_import_script_name
Args:
translator (Translator): Instance of :py:class:`Translator` to enable translation of
nodes and manipulation of properties.
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
of nodes and edges.
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
Raises:
NotImplementedError: Writer implementation must override '_write_node_data'
NotImplementedError: Writer implementation must override '_write_edge_data'
NotImplementedError: Writer implementation must override '_construct_import_call'
NotImplementedError: Writer implementation must override '_get_import_script_name'
"""
def __init__(
self,
translator: Translator,
deduplicator: Deduplicator,
output_directory: Optional[str] = None,
strict_mode: bool = False,
*args,
**kwargs,
):
"""Abstract class for writing node and edge representations to disk.
Args:
translator (Translator): Instance of :py:class:`Translator` to enable translation of
nodes and manipulation of properties.
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
of nodes and edges.
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
"""
self.translator = translator
self.deduplicator = deduplicator
self.strict_mode = strict_mode
self.output_directory = output_directory
if os.path.exists(self.output_directory):
if kwargs.get("write_to_file", True):
logger.warning(
f"Output directory `{self.output_directory}` already exists. "
"If this is not planned, file consistency may be compromised."
)
else:
logger.info(f"Creating output directory `{self.output_directory}`.")
os.makedirs(self.output_directory)
@abstractmethod
def _write_node_data(
self,
nodes: Iterable[
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
],
) -> bool:
"""Implement how to output.write nodes to disk.
Args:
nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: The return value. True for success, False otherwise.
"""
raise NotImplementedError(
"Writer implementation must override 'write_nodes'"
)
@abstractmethod
def _write_edge_data(
self,
edges: Iterable[
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
],
) -> bool:
"""Implement how to output.write edges to disk.
Args:
edges (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: The return value. True for success, False otherwise.
"""
raise NotImplementedError(
"Writer implementation must override 'write_edges'"
)
@abstractmethod
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: command for importing the output files into a DBMS.
"""
raise NotImplementedError(
"Writer implementation must override '_construct_import_call'"
)
@abstractmethod
def _get_import_script_name(self) -> str:
"""Returns the name of the import script.
Returns:
str: The name of the import script (ending in .sh)
"""
raise NotImplementedError(
"Writer implementation must override '_get_import_script_name'"
)
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
):
"""Wrapper for writing nodes.
Args:
nodes (BioCypherNode): a list or generator of nodes in
:py:class:`BioCypherNode` format
batch_size (int): The batch size for writing nodes.
force (bool): Whether to force writing nodes even if their type is
not present in the schema.
Returns:
bool: The return value. True for success, False otherwise.
"""
passed = self._write_node_data(nodes)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self, edges, batch_size: int = int(1e6), force: bool = False
):
"""Wrapper for writing edges.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The batch size for writing edges.
force (bool): Whether to force writing edges even if their type is
not present in the schema.
Returns:
bool: The return value. True for success, False otherwise.
"""
passed = self._write_edge_data(edges)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def write_import_call(self):
"""
Function to write the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name, to the export folder as txt.
Returns:
str: The path of the file holding the import call.
"""
file_path = os.path.join(
self.output_directory, self._get_import_script_name()
)
logger.info(
f"Writing {self.__class__.__name__} import call to `{file_path}`."
)
with open(file_path, "w", encoding="utf-8") as f:
f.write(self._construct_import_call())
return file_path
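A concrete writer only needs to provide the four abstract methods listed above. The following is a minimal sketch of such a subclass; the class name and trivial bodies are invented for illustration, and instantiating it still requires Translator and Deduplicator instances.

from biocypher.output.write._writer import _Writer

class _NoOpWriter(_Writer):
    """Toy writer that discards all data; shows the required interface only."""

    def _write_node_data(self, nodes) -> bool:
        return True

    def _write_edge_data(self, edges) -> bool:
        return True

    def _construct_import_call(self) -> str:
        return "echo 'nothing to import'"

    def _get_import_script_name(self) -> str:
        return "noop-import-call.sh"

# writer = _NoOpWriter(translator=translator, deduplicator=deduplicator, output_directory="out")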

View File

@ -0,0 +1,241 @@
import os
from biocypher._logger import logger
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
"""
Class for writing node and edge representations to disk using the format
specified by ArangoDB for the use of "arangoimport". Output files are
similar to Neo4j, but with a different header format.
"""
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default prefix for the arangoimport binary (empty by default)
"""
return ""
def _get_import_script_name(self) -> str:
"""
Returns the name of the ArangoDB import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "arangodb-import-call.sh"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
_id = "_key"
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [[_id], props_list]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the arangoimport statement
# do once for each part file
parts = self.parts.get(label, [])
if not parts:
raise ValueError(
f"No parts found for node label {label}. "
f"Check that the data was parsed first.",
)
for part in parts:
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
part,
)
self.import_call_nodes.add(
(
import_call_header_path,
import_call_parts_path,
collection,
)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check for file exists
if os.path.exists(header_path):
logger.warning(
f"Header file {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
out_list = ["_from", "_key", *props_list, "_to"]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = None
if not self.translator.ontology.mapping.extended_schema.get(label):
for (
_,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
collection = v.get("db_collection_name", None)
break
else:
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the arangoimport statement (import call path
# may be different from actual output path)
header_import_call_path = os.path.join(
self.import_call_file_prefix,
header,
)
parts_import_call_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(
header_import_call_path,
parts_import_call_path,
collection,
)
)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for arangoimport
"""
import_call = (
f"{self.import_call_bin_prefix}arangoimp "
f"--type csv "
f'--separator="{self.escaped_delim}" '
)
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
node_lines = ""
# node import calls: one line per node type
for header_path, parts_path, collection in self.import_call_nodes:
line = (
f"{import_call} "
f"--headers-file {header_path} "
f"--file= {parts_path} "
)
if collection:
line += f"--create-collection --collection {collection} "
node_lines += f"{line}\n"
edge_lines = ""
# edge import calls: one line per edge type
for header_path, parts_path, collection in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return node_lines + edge_lines
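Each entry of import_call_nodes and import_call_edges is a (header path, parts path, collection) tuple that becomes one import line. Below is a standalone sketch of the string assembly with made-up paths; the flags are taken from the node loop above.

base = "arangoimp --type csv --separator=';'"

def import_line(header_path, parts_path, collection=None):
    line = f"{base} --headers-file {header_path} --file={parts_path}"
    if collection:
        line += f" --create-collection --collection {collection}"
    return line

print(import_line("Protein-header.csv", "Protein-part.*", "proteins"))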

View File

@ -0,0 +1,502 @@
import os
import glob
import pandas as pd
from biocypher._logger import logger
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
class _Neo4jBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by Neo4j for the use of admin import. Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
Neo4j-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
def __init__(self, *args, **kwargs):
"""
Constructor.
Checks the version of Neo4j and adds a command scope if version >= 5.
Returns:
_Neo4jBatchWriter: An instance of the writer.
"""
# Should read the configuration and setup import_call_bin_prefix.
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the neo4j admin import location
"""
return "bin/"
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the neo4j admin-import.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for the neo4j admin import
"""
string = self.adelim.join(string_list)
return f"{self.quote}{string}{self.quote}"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
_id = ":ID"
##MeDaX dev remark:
##From Fhir data we get case sensitive labels. E.g. 'Procedure' and 'procedure' are two distinct node types.
##Because we are converting Resources to more specific node classes using their "resourceType" attribute.
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double", "dbl"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in ["bool", "boolean"]:
# TODO Neo4j boolean support / spelling?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
# create list of lists and flatten
out_list = [[_id], props_list, [":LABEL"]]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write back to file WITHOUT including the header
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.warning(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to neo4j admin import statement (import call file
# path may be different from actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_nodes.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check for file exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in [
"bool",
"boolean",
]: # TODO does Neo4j support bool?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
skip_id = False
schema_label = None
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
skip_id = True
elif not self.translator.ontology.mapping.extended_schema.get(
label
):
# find label in schema by label_as_edge
for (
k,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
schema_label = k
break
else:
schema_label = label
out_list = [":START_ID"]
if schema_label:
if (
self.translator.ontology.mapping.extended_schema.get(
schema_label
).get("use_id")
is False
):
skip_id = True
if not skip_id:
out_list.append("id")
out_list.extend(props_list)
out_list.extend([":END_ID", ":TYPE"])
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
logger.debug(f"Existing header: {existing_header}")
logger.debug(f"New header: {out_list}")
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write back to file WITHOUT including the header
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.warning(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to neo4j admin import statement (import call file
# path may be different from actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _get_import_script_name(self) -> str:
"""
Returns the name of the neo4j admin import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "neo4j-admin-import-call.sh"
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for neo4j-admin import
"""
import_call_neo4j_v4 = self._get_import_call(
"import", "--database=", "--force="
)
import_call_neo4j_v5 = self._get_import_call(
"database import full", "", "--overwrite-destination="
)
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
return import_script
def _get_import_call(
self, import_cmd: str, database_cmd: str, wipe_cmd: str
) -> str:
"""Get parametrized import call for Neo4j 4 or 5+.
Args:
import_cmd (str): The import command to use.
database_cmd (str): The database command to use.
wipe_cmd (str): The wipe command to use.
Returns:
str: The import call.
"""
import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
import_call += f"{database_cmd}{self.db_name} "
import_call += f'--delimiter="{self.escaped_delim}" '
import_call += f'--array-delimiter="{self.escaped_adelim}" '
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
if self.wipe:
import_call += f"{wipe_cmd}true "
if self.skip_bad_relationships:
import_call += "--skip-bad-relationships=true "
if self.skip_duplicate_nodes:
import_call += "--skip-duplicate-nodes=true "
# append node import calls
for header_path, parts_path in self.import_call_nodes:
import_call += f'--nodes="{header_path},{parts_path}" '
# append edge import calls
for header_path, parts_path in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return import_call
def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
"""
Adapt a CSV table to a new header structure, placing new columns in their correct positions.
Parameters:
old_header (list): The original header columns
new_header (list): The new header columns
csv_file_path (str): Path to the CSV file
Returns:
pandas.DataFrame: CSV data with the new header structure
"""
# Step 1: Read the CSV data without headers
df = pd.read_csv(csv_file_path, sep=self.delim, header=None)
# Step 2: If the file is empty, return empty DataFrame with new headers
if df.empty:
return pd.DataFrame(columns=new_header)
# Step 3: If column count doesn't match old_header length, handle the mismatch
if len(df.columns) != len(old_header):
print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})")
# If file has fewer columns than old_header, pad with NaN
if len(df.columns) < len(old_header):
for i in range(len(df.columns), len(old_header)):
df[i] = None
# If file has more columns than old_header, truncate
else:
df = df.iloc[:, :len(old_header)]
# Step 4: Assign old header names to the dataframe
df.columns = old_header
# Step 5: Create a new DataFrame with the correct structure
new_df = pd.DataFrame(columns=new_header)
# Step 6: For each column in the new header, find its position in the old header
for new_col_idx, new_col in enumerate(new_header):
if new_col in old_header:
# If column exists in old header, copy data
new_df[new_col] = df[new_col]
else:
# If new column, add empty column
new_df[new_col] = None
# Step 7: Ensure columns are in the exact order of new_header
new_df = new_df[new_header]
return new_df
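The column realignment done by adapt_csv_to_new_header can be exercised on a small in-memory CSV. The sketch below mirrors the effect (existing columns keep their data, new columns are added empty, order follows the new header) rather than calling the method itself; headers and rows are invented.

import io
import pandas as pd

old_header = [":ID", "name", ":LABEL"]
new_header = [":ID", "name", "species", ":LABEL"]

csv_text = "p1;alpha;Protein\np2;beta;Protein\n"
df = pd.read_csv(io.StringIO(csv_text), sep=";", header=None)
df.columns = old_header

# same effect as the method: keep existing data, add empty new columns, reorder
new_df = df.reindex(columns=new_header)
print(new_df.columns.tolist())  # [':ID', 'name', 'species', ':LABEL']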

View File

@ -0,0 +1,76 @@
import pickle
import networkx as nx
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.write.relational._csv import _PandasCSVWriter
class _NetworkXWriter(_Writer):
"""
Class for writing node and edges to a networkx DiGraph.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
self.G = nx.DiGraph()
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
logger.info(
f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
)
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
pickle.dump(self.G, f)
import_call = "import pickle\n"
import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_networkx.py"
def _write_node_data(self, nodes) -> bool:
passed = self.csv_writer._write_entities_to_file(nodes)
self.add_to_networkx()
return passed
def _write_edge_data(self, edges) -> bool:
passed = self.csv_writer._write_entities_to_file(edges)
self.add_to_networkx()
return passed
def add_to_networkx(self) -> bool:
all_dfs = self.csv_writer.stored_dfs
node_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("node_id").any()
]
edge_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("source_id").any()
and df.columns.str.contains("target_id").any()
]
for df in node_dfs:
nodes = df.set_index("node_id").to_dict(orient="index")
self.G.add_nodes_from(nodes.items())
for df in edge_dfs:
edges = df.set_index(["source_id", "target_id"]).to_dict(
orient="index"
)
self.G.add_edges_from(
(
(source, target, attrs)
for (source, target), attrs in edges.items()
)
)
return True
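The DataFrame-to-graph conversion in add_to_networkx can be reproduced standalone; a short sketch with invented node and edge tables:

import networkx as nx
import pandas as pd

node_df = pd.DataFrame({"node_id": ["p1", "p2"], "name": ["alpha", "beta"]})
edge_df = pd.DataFrame({"source_id": ["p1"], "target_id": ["p2"], "weight": [0.9]})

G = nx.DiGraph()
nodes = node_df.set_index("node_id").to_dict(orient="index")
G.add_nodes_from(nodes.items())
edges = edge_df.set_index(["source_id", "target_id"]).to_dict(orient="index")
G.add_edges_from(
    (source, target, attrs) for (source, target), attrs in edges.items()
)
print(G.number_of_nodes(), G.number_of_edges())  # 2 1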

View File

@ -0,0 +1,515 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Loes van den Biggelaar
# Sebastian Lobentanzer
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from types import GeneratorType
from typing import Union
import os
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
from rdflib.namespace import (
_NAMESPACE_PREFIXES_CORE,
_NAMESPACE_PREFIXES_RDFLIB,
)
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _RDFWriter(_BatchWriter):
"""
Class to write BioCypher's property graph into an RDF format using
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
is done keeping only the minimum information about node and edges,
skipping all properties.
"""
def _get_import_script_name(self) -> str:
"""
Returns the name of the RDF admin import script.
This function is applicable for RDF export.
Returns:
str: The name of the import script (ending in .sh)
"""
return "rdf-import-call.sh"
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the RDF admin import location
"""
return "bin/"
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
"""
Function to check if the specified RDF format is supported.
Args:
rdf_format (str): The RDF format to check.
Returns:
bool: Returns True if rdf format supported, False otherwise.
"""
supported_formats = [
"xml",
"n3",
"turtle",
"nt",
"pretty-xml",
"trix",
"trig",
"nquads",
"json-ld",
]
if rdf_format not in supported_formats:
logger.error(
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
)
return False
else:
# RDF graph does not support 'ttl' format, only 'turtle' format. However, the preferred file extension is always '.ttl'.
if self.rdf_format == "turtle":
self.extension = "ttl"
elif self.rdf_format == "ttl":
self.rdf_format = "turtle"
self.extension = "ttl"
else:
self.extension = self.rdf_format
return True
def _write_single_edge_list_to_file(
self,
edge_list: list,
label: str,
prop_dict: dict,
):
"""
This function takes one list of biocypher edges and writes them
to an RDF file with the given format.
Args:
edge_list (list): list of BioCypherEdges to be written
label (str): the label (type) of the edge
prop_dict (dict): properties of node class passed from parsing
function and their types
Returns:
bool: The return value. True for success, False otherwise.
"""
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
logger.error("Edges must be passed as type BioCypherEdge.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for edge in edge_list:
rdf_subject = edge.get_source_id()
rdf_object = edge.get_target_id()
rdf_predicate = edge.get_id()
rdf_properties = edge.get_properties()
if rdf_predicate is None:
rdf_predicate = rdf_subject + rdf_object
edge_label = self.translator.name_sentence_to_pascal(
edge.get_label()
)
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
graph.add((edge_uri, RDF.type, RDFS.Class))
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
RDF.type,
edge_uri,
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["subject"],
self.subject_to_uri(rdf_subject),
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["object"],
self.subject_to_uri(rdf_object),
)
)
# add properties to the transformed edge --> node
for key, value in rdf_properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_predicate, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def add_property_to_graph(
self,
graph: Graph,
rdf_subject: str,
rdf_object: str,
rdf_predicate: str,
):
"""
Function to add the properties to an RDF node. It takes the graph, the subject, object, and predicate of the RDF triple.
It checks if the property is a list and adds it to the graph accordingly. Otherwise, it checks if the string represents a list.
If it does, it transforms it to a list and adds it to the graph. If not, it adds the property to the graph as a literal.
If the property is neither a list nor a string, it is also added as a literal.
Args:
graph (RDFLib.Graph): The RDF graph to add the nodes to.
rdf_subject (str): The subject of the RDF triple.
rdf_object (str): The object of the RDF triple.
rdf_predicate (str): The predicate of the RDF triple.
Returns:
None
"""
if isinstance(rdf_object, list):
for obj in rdf_object:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(obj),
)
)
elif isinstance(rdf_object, str):
if rdf_object.startswith("[") and rdf_object.endswith("]"):
self.add_property_to_graph(
graph,
rdf_subject,
self.transform_string_to_list(rdf_object),
rdf_predicate,
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
def transform_string_to_list(self, string_list: str) -> list:
"""
Function to transform a string representation of a list into a list.
Args:
string_list (str): The string representation of the list.
Returns:
list: The list representation of the input string.
"""
return (
string_list.replace("[", "")
.replace("]", "")
.replace("'", "")
.split(", ")
)
def _write_single_node_list_to_file(
self,
node_list: list,
label: str,
prop_dict: dict,
labels: str,
):
"""
This function takes a list of BioCypherNodes and writes them
to an RDF file in the specified format.
Args:
node_list (list): A list of BioCypherNodes to be written.
label (str): The label (type) of the nodes.
prop_dict (dict): A dictionary of properties and their types for the node class.
Returns:
bool: True if the writing is successful, False otherwise.
"""
if not all(isinstance(n, BioCypherNode) for n in node_list):
logger.error("Nodes must be passed as type BioCypherNode.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for n in node_list:
rdf_subject = n.get_id()
rdf_object = n.get_label()
properties = n.get_properties()
class_name = self.translator.name_sentence_to_pascal(rdf_object)
graph.add(
(
self.rdf_namespaces["biocypher"][class_name],
RDF.type,
RDFS.Class,
)
)
graph.add(
(
self.subject_to_uri(rdf_subject),
RDF.type,
self.rdf_namespaces["biocypher"][class_name],
)
)
for key, value in properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_subject, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
Args:
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
batch_size (int): The number of nodes to write in each batch.
force (bool): Flag to force the writing even if the output file already exists.
Returns:
bool: True if the writing is successful, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing node data, wrong RDF format")
return False
# write node data using _write_node_data method
passed = self._write_node_data(nodes, batch_size, force)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self,
edges: Union[list, GeneratorType],
batch_size: int = int(1e6),
) -> bool:
"""
Wrapper for writing edges in RDF format. It calls the _write_edge_data()
function, specifying its edge data.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The number of edges to write in each batch.
Returns:
bool: The return value. True for success, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing edge data, wrong RDF format")
return False
# write edge data using _write_edge_data method
passed = self._write_edge_data(edges, batch_size=batch_size)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def _construct_import_call(self) -> str:
"""
Function to write the import call.
This function is not applicable for RDF.
Returns:
str: An empty string, since no import call is needed for RDF.
"""
return ""
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the RDF admin-import.
This function is not applicable for RDF.
Args:
string_list (list): list of ontology strings
Returns:
bool: Always True, since arrays are handled natively in RDF output.
"""
return True
def _write_node_headers(self):
"""
Abstract method that takes care of importing properties of a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def _write_edge_headers(self):
"""
Abstract method to write a database import-file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def subject_to_uri(self, subject: str) -> str:
"""
Converts the subject to a proper URI using the available namespaces.
If the conversion fails, it defaults to the biocypher prefix.
Args:
subject (str): The subject to be converted to a URI.
Returns:
str: The corresponding URI for the subject.
"""
try:
_pref, _id = subject.split(":")
if _pref in self.rdf_namespaces.keys():
return self.rdf_namespaces[_pref][_id]
else:
return self.rdf_namespaces["biocypher"][subject]
except ValueError:
return self.rdf_namespaces["biocypher"][subject]
def property_to_uri(self, property_name: str) -> str:
"""
Converts a property name to its corresponding URI.
This function takes a property name and searches for its corresponding URI in various namespaces.
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
Args:
property_name (str): The property name to be converted to a URI.
Returns:
str: The corresponding URI for the input property name.
"""
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
for namespace in _NAMESPACE_PREFIXES_CORE.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
for namespace in [SKOS, DC, DCTERMS]:
if property_name in namespace:
return namespace[property_name]
# If the property name is still not found, try other namespaces from rdflib.
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is "licence", it recursively calls the function with "license" as the input.
if property_name == "licence":
return self.property_to_uri("license")
# TODO: add an option to search trough manually implemented namespaces
# If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
# TODO: give a warning and try to prevent this option altogether
return self.rdf_namespaces["biocypher"][property_name]
def _init_namespaces(self, graph: Graph):
"""
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.
Args:
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
Returns:
None
"""
# add biocypher standard to self.rdf_namespaces
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
if not self.rdf_namespaces:
self.rdf_namespaces = biocypher_standard
else:
self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
for key, value in self.rdf_namespaces.items():
namespace = Namespace(value)
self.rdf_namespaces[key] = namespace
graph.bind(key, namespace)

View File

@ -0,0 +1,76 @@
from more_itertools import peekable
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.in_memory._pandas import Pandas
class _PandasCSVWriter(_Writer):
"""
Class for writing node and edge representations to a CSV file.
"""
def __init__(self, *args, write_to_file: bool = True, **kwargs):
kwargs["write_to_file"] = write_to_file
super().__init__(*args, **kwargs)
self.in_memory_dfs = {}
self.stored_dfs = {}
self.pandas_in_memory = Pandas(
translator=self.translator,
deduplicator=self.deduplicator,
)
self.delimiter = kwargs.get("delimiter")
if not self.delimiter:
self.delimiter = ","
self.write_to_file = write_to_file
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
import_call = "import pandas as pd\n\n"
for df_name in self.stored_dfs.keys():
import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_pandas_csv.py"
def _write_node_data(self, nodes) -> bool:
passed = self._write_entities_to_file(nodes)
return passed
def _write_edge_data(self, edges) -> bool:
passed = self._write_entities_to_file(edges)
return passed
def _write_entities_to_file(self, entities) -> bool:
"""Function to write the entities to a CSV file.
Args:
entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
"""
entities = peekable(entities)
entity_list = self.pandas_in_memory._separate_entity_types(entities)
for entity_type, entities in entity_list.items():
self.in_memory_dfs[
entity_type
] = self.pandas_in_memory._add_entity_df(entity_type, entities)
for entity_type in self.in_memory_dfs.keys():
entity_df = self.in_memory_dfs[entity_type]
if " " in entity_type or "." in entity_type:
entity_type = entity_type.replace(" ", "_").replace(".", "_")
if self.write_to_file:
logger.info(
f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
)
entity_df.to_csv(
f"{self.output_directory}/{entity_type}.csv",
sep=self.delimiter,
)
self.stored_dfs[entity_type] = entity_df
self.in_memory_dfs = {}
return True

View File

@ -0,0 +1,320 @@
import os
import glob
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _PostgreSQLBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by PostgreSQL for the use of "COPY FROM...". Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
PostgreSQL-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
DATA_TYPE_LOOKUP = {
"str": "VARCHAR", # VARCHAR needs limit
"int": "INTEGER",
"long": "BIGINT",
"float": "NUMERIC",
"double": "NUMERIC",
"dbl": "NUMERIC",
"boolean": "BOOLEAN",
"str[]": "VARCHAR[]",
"string[]": "VARCHAR[]",
}
def __init__(self, *args, **kwargs):
self._copy_from_csv_commands = set()
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the psql command
"""
return ""
def _get_data_type(self, string) -> str:
try:
return self.DATA_TYPE_LOOKUP[string]
except KeyError:
logger.info(
f'Could not determine data type {string}. Using default "VARCHAR"'
)
return "VARCHAR"
def _write_array_string(self, string_list) -> str:
"""
Method to write the string representation of an array into a .csv file
as required by the PostgreSQL COPY command, with '{','}' brackets and ',' separation.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for postgres COPY
"""
string = ",".join(string_list)
string = f'"{{{string}}}"'
return string
def _get_import_script_name(self) -> str:
"""
Returns the name of the psql import script
Returns:
str: The name of the import script (ending in .sh)
"""
return f"{self.db_name}-import-call.sh"
def _adjust_pascal_to_psql(self, string):
string = string.replace(".", "_")
string = string.lower()
return string
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts = f"{pascal_label}-part*.csv"
parts_paths = os.path.join(self.outdir, parts)
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = ["_ID VARCHAR"]
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
columns.append(f"{col_name} {col_type}")
columns.append("_LABEL VARCHAR[]")
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_nodes.add(table_create_command_path)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = []
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
if col_name == "_id":
# should ideally never happen
raise ValueError(
"Column name '_ID' is reserved for internal use, "
"denoting the relationship ID. Please choose a "
"different name for your column."
)
columns.append(f"{col_name} {col_type}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [
"_START_ID VARCHAR",
"_ID VARCHAR",
*columns,
"_END_ID VARCHAR",
"_TYPE VARCHAR",
]
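# resulting column order in the table: _START_ID, _ID, <edge properties>, _END_ID, _TYPE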
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_edges.add(table_create_command_path)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for postgresql import
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
if self.db_password:
# set password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += (
f"{self.import_call_bin_prefix}psql -f {import_file_path}"
)
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
# copy data to tables
for command in self._copy_from_csv_commands:
table_part = command.split(" ")[3]
import_call += f'echo "Importing {table_part}..."\n'
if self.db_password:
# set password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call

View File

@ -0,0 +1,51 @@
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
"""
Class for writing node and edge representations to a SQLite database.
It uses the _PostgreSQLBatchWriter class under the hood, which already
implements the logic to write the nodes/edges to a relational DBMS.
Only the import bash script differs between PostgreSQL and SQLite
and is therefore implemented in this class.
- _construct_import_call
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for sqlite import
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
import_call += '\necho "Done!"\n'
import_call += "\n"
for command in self._copy_from_csv_commands:
table_name = command.split(" ")[1]
table_part = command.split(" ")[3].replace("'", "")
import_call += f'echo "Importing {table_part}..."\n'
separator = self.delim
import_part = f".import {table_part} {table_name}"
import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call

View File

@ -0,0 +1,18 @@
# add your settings here (overriding the defaults)
biocypher:
dbms: neo4j
offline: true
#debug: true
output_directory: /neo4j_import # comment this line out for debugging, so that BioCypher creates a new folder for each run in /biocypher-out
schema_config_path: config/automated_schema.yaml
head_ontology:
url: config/head_ontology/biolink-model.owl.ttl
root_node: entity
neo4j:
delimiter: '\t'
array_delimiter: '|'
skip_duplicate_nodes: true
skip_bad_relationships: true

File diff suppressed because it is too large

View File

@ -0,0 +1,283 @@
Title: BioCypher graph schema configuration file
# This configuration file establishes the hierarchy and connectivity in a newly
# set-up BioCypher property graph database. Naming should adhere to Biolink
# nomenclature (available at https://biolink.github.io/biolink-model/ or via
# the python module 'biolink-model-toolkit').
# The BioCypher YAML file specifies only the leaves of the hierarchy tree of
# the desired graph; the hierarchical structure of entities will be derived
# from the Biolink model + BRO model. Thus, only the immediate constituents
# of the graph need to be specified in the schema config.
# ---
# "Named Things"
# ---
# The implementation of named things is fairly straightforward, since they are
# usually represented in node form, which is also the Biolink recommendation.
# The same is not true for associations.
#
# A little more complex is the representation of aggregates of named things.
clinicalStatus:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: clinicalStatus
properties:
coding_system: str
label: str
coding_code: str
Condition:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Condition
properties:
input_format: HL7 FHIR
data_specification: Medical Informatics Initiative Germany Core Data Set, Basic Modules
diagnosis:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: diagnosis
properties:
type.coding_code: str
sequence: str
label: str
type.coding_system: str
DiagnosticReport:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: DiagnosticReport
properties:
resourceType: str
label: str
status: str
id: str
Encounter:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Encounter
properties:
resourceType: str
label: str
status: str
id: str
identifier:
is_a: Attribute
represented_as: node
preferred_id: fhir_id
label_in_input: identifier
properties:
label: str
value: str
system: str
interpretation: #
is_a: named thing
represented_as: node
preferred_id: fhir_id
label_in_input: interpretation
properties:
extension.valueCoding_system: str
extension_url: str
extension.valueCoding_display: str
coding_code: str
coding_system: str
label: str
extension.valueCoding_code: str
maritalStatus:
is_a: OrganismAttribute
represented_as: node
preferred_id: fhir_id
label_in_input: maritalStatus
properties:
label: str
coding_system: str
coding_code: str
Observation:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Observation
properties:
resourceType: str
label: str
effectiveDateTime: str
status: str
id: str
Organization:
is_a: AdministrativeEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Organization
properties:
label: str
id: str
name: str
resourceType: str
Patient:
is_a: Human
represented_as: node
preferred_id: fhir_id
label_in_input: Patient
properties:
resourceType: str
label: str
gender: str
id: str
birthDate: str
Procedure:
# is_a: Procedure
represented_as: node
preferred_id: fhir_id
label_in_input: Procedure
properties:
label: str
performedDateTime: str
resourceType: str
status: str
id: str
referenceRange: #
is_a: named thing
represented_as: node
preferred_id: fhir_id
label_in_input: referenceRange
properties:
high_system: str
high_value: str
high_code: str
label: str
high_unit: str
search: #
is_a: named thing
represented_as: node
preferred_id: fhir_id
label_in_input: search
properties:
label: str
mode: str
type:
is_a: Attribute
represented_as: node
preferred_id: fhir_id
label_in_input: type
properties:
coding_system: str
label: str
coding_code: str
coding_display: str
verificationStatus:
is_a: Attribute
represented_as: node
preferred_id: fhir_id
label_in_input: verificationStatus
properties:
coding_system: str
label: str
coding_code: str
coding_display: str
# ---
# Associations
# ---
# Associations are not supposed to be represented in node form as per the
# specifications of Biolink. However, in an analytic context, it often makes
# sense to represent interactions as nodes in Neo4j, because it enables, for
# instance, the annotation of a relationship with a publication as source of
# evidence (also known as reification in the knowledge graph world).
# The Biolink specifications for these types of relationships do
# not go into depth; for example, the hierarchy for molecular interactions
# (ie, "associations") ends at "PairwiseMolecularInteraction", there are no
# explicit terms for protein-protein-interaction, phosphorylation, miRNA-
# targeting, etc. Biolink proposes to use interaction identifiers from
# ontologies, such as https://www.ebi.ac.uk/ols/ontologies/mi/.
# association to connect anything to an identifier node
# if functional, includes:
# IDENTIFIED_BY_Condition_Identifier,
# IDENTIFIED_BY_DiagnosticReport_Identifier,
# IDENTIFIED_BY_Encounter_Identifier,
# IDENTIFIED_BY_Observation_Identifier,
# IDENTIFIED_BY_Organization_Identifier
# IDENTIFIED_BY_Patient_Identifier,
# IDENTIFIED_BY_Procedure_Identifier
condition to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Condition_Identifier
diagnostic report to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_DiagnosticReport_Identifier
observation to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Observation_Identifier
observation derived from observation association:
is_a: association
represented_as: edge
label_in_input: DERIVED_FROM_Observation_Observation
observation has member observation association:
is_a: association
represented_as: edge
label_in_input: HAS_MEMBER_Observation_Observation
procedure to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Procedure_Identifier
procedure to diagnostic report association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Procedure_Identifier
procedure reasoned by observation association:
is_a: association
represented_as: edge
label_in_input: HAS_REASON_REFERENCE_Procedure_Observation
procedure performer is practitioner association:
is_a: association
represented_as: edge
label_in_input: HAS_ACTOR_ProcedurePerformer_Practitioner
#represented_as: edge
#label_in_input: DERIVED_FROM_Observation_Observation:
#represented_as: edge
#label_in_input: DERIVED_FROM_Observation_Observation
#protein interaction:
# is_a: Pairwise molecular interaction
# represented_as: edge
# label_in_input: protein_protein_interaction
#protein to disease association:
# is_a: Association
# represented_as: edge
# label_in_input: protein_disease_association

76
docker-compose.yml Normal file
View File

@ -0,0 +1,76 @@
services:
neo4j:
image: neo4j:5.7
environment:
- NEO4J_AUTH=${NEO4J_AUTH:-neo4j/password}
- NEO4J_PLUGINS=["apoc"]
- NEO4J_server_config_strict__validation_enabled=false
- NEO4J_apoc_export_file_enabled=true
- NEO4J_apoc_import_file_enabled=true
- NEO4J_apoc_import_file_use__neo4j__config=true
- SHARED_PATH=/neo4j_import
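# Import handshake with the python_app container (see entrypoint.sh): once the
# marker file /neo4j_import/ready-to-import appears, the loop below stops Neo4j,
# runs the generated neo4j-admin-import-call.sh, writes /neo4j_import/import-complete
# and starts Neo4j again.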
command: >
bash -c '
echo "running cmd from docker compose" &&
#neo4j start &&
while true; do
if [ -f /neo4j_import/ready-to-import ]; then
echo "Starting import process..."
neo4j stop &&
bash /neo4j_import/neo4j-admin-import-call.sh &&
rm /neo4j_import/ready-to-import &&
touch /neo4j_import/import-complete &&
chmod 777 /neo4j_import/import-complete
neo4j start
echo "The container is running. STR+C will end the bash command and thus, the neo4j container"
fi
sleep 10
done
'
healthcheck:
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"]
interval: 10s
timeout: 5s
retries: 5
ports:
- "8080:7474"
- "8081:7687"
volumes:
- neo4j_data:/neo4j_data
- neo4j_logs:/neo4j_logs
- neo4j_import:/neo4j_import
- ${INPUT_DATA_PATH:-./data}:/input_data
- ./init-scripts:/init-scripts
- ./importData:/importData
python_app:
build:
context: .
dockerfile: Dockerfile
args:
HTTP_PROXY: ${HTTP_PROXY}
HTTPS_PROXY: ${HTTPS_PROXY}
NO_PROXY: ${NO_PROXY}
env_file:
- .env
environment:
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=${NEO4J_USER:-neo4j}
- NEO4J_PASSWORD=${NEO4J_PASSWORD:-password}
- INPUT_DATA_PATH=/input_data
- POETRY_VIRTUALENVS_CREATE=false
- NEO4J_dbms_directories_import=/neo4j_import
volumes:
- neo4j_import:/neo4j_import
- ${INPUT_DATA_PATH:-./data}:/input_data
- ./importData:/importData # Share the import data directory
# depends_on:
# neo4j:
# condition: service_healthy
# Define named volumes
volumes:
neo4j_data:
neo4j_logs:
neo4j_import:

38
entrypoint.sh Normal file
View File

@ -0,0 +1,38 @@
#!/bin/bash
set -e
chmod -R 777 /neo4j_import # make the dir accessible for both the python app and neo4j
#echo "Waiting for Neo4j to be ready... ..."
#python wait-for-neo4j.py
#if [ $? -ne 0 ]; then
# echo "Failed to connect to Neo4j"
# exit 1
#fi
echo "Running Python data processing script..."
poetry run python import_fhir_to_nx_diGraph.py
echo "Running Neo4j import..."
# Wait a bit before attempting database operations
sleep 5
while [ ! -f /neo4j_import/shell-scipt-complete ]; do
echo "Waiting for shell-script file"
sleep 5
done
# Create a signal file that we've prepared the data
touch /neo4j_import/ready-to-import
chmod -R 777 /neo4j_import/ready-to-import
# Wait for import to complete by monitoring a completion file
echo "Waiting for Neo4j import to complete..."
while [ ! -f /neo4j_import/import-complete ]; do
echo "Waiting for import-complete file"
sleep 5
done
echo "Database setup complete!"

66
fhirImport.py Normal file
View File

@ -0,0 +1,66 @@
import requests
from typing import List, Dict, Any
from dotenv import load_dotenv
import os
from requests.auth import HTTPBasicAuth
# Load environment variables from .env file
load_dotenv()
def getBundle(url: str, search: str):
headers = {
'Accept': 'application/fhir+json',
'Content-Type': 'application/fhir+json'
}
# Get configuration from environment variables
mode = os.getenv('MODE')
fhir_server = os.getenv('FHIR_SERVER_URL')
username = None
password = None
if mode != 'testserver':
username = os.getenv('FHIR_SERVER_USER')
password = os.getenv('FHIR_SERVER_PW')
if not fhir_server:
raise ValueError("FHIR_SERVER_URL not found in environment variables")
if mode != 'testserver' and (not username or not password):
raise ValueError("FHIR_SERVER_USER and FHIR_SERVER_PW must be set in environment variables")
# Setup basic authentication (only used when not talking to the test server)
auth = HTTPBasicAuth(username, password)
if url is not None:
link = url + '?_format=json'
else:
link = fhir_server + search + '&_format=json'
#print(link)
if mode != 'testserver':
response = requests.get(
link,
headers=headers,
auth=auth
)
else:
response = requests.get(
link,
headers=headers,
)
return response
def getPatientEverything(id: str):
search = '/Patient/' + id + '/$everything?'
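# resulting request (built in getBundle): <FHIR_SERVER_URL>/Patient/<id>/$everything?&_format=json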
return getBundle(None, search)
# Example usage
if __name__ == "__main__":
bundles = getBundle(None, '/Patient?_count=10') # example query; adjust the search as needed
data = bundles.json()
# Process the bundles
for entry in data['entry']:
print(f"{entry['fullUrl']}")

View File

@ -0,0 +1,43 @@
import json
import networkx as nx
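# Example of the produced structure: loading {"resource": {"code": {"text": "x"}}}
# under a root node "abc_bundle" creates the nodes "abc_bundle",
# "abc_bundle.resource" and "abc_bundle.resource.code", connected by edges whose
# edge_type is the JSON key; the leaf value "x" is stored as attribute "text" on
# "abc_bundle.resource.code".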
def add_nodes_from_dict(graph, parent_node, current_dict):
for key, value in current_dict.items():
if isinstance(value, dict):
# Create a new node for the nested dictionary
new_node = f"{parent_node}.{key}"
graph.add_node(new_node, label=key)
# Add an edge from the parent node to the new node
graph.add_edge(parent_node, new_node, edge_type=key)
# Recurse into the nested dictionary
add_nodes_from_dict(graph, new_node, value)
elif isinstance(value, list):
# if list doesn't contain any nested dictionaries, make it a value in the node
if not any(isinstance(item, dict) for item in value):
graph.nodes[parent_node][key] = value
else:
# Process each dictionary in the list
for index, item in enumerate(value):
if isinstance(item, dict):
if len(value)>1:
item_node = f"{parent_node}.{key}[{index}]"
else:
item_node = f"{parent_node}.{key}"
graph.add_node(item_node, label=key)
graph.add_edge(parent_node, item_node, edge_type=key)
add_nodes_from_dict(graph, item_node, item)
else:
# For non-dict and non-list values, add them as attributes to the parent node
graph.nodes[parent_node][key] = value
def add_json_to_networkx(json_data, bundle_name, graph):
if not isinstance(graph, nx.DiGraph):
raise ValueError("The provided graph must be a networkx.DiGraph")
root_node = bundle_name+'_bundle'
graph.add_node(root_node, label='root')
add_nodes_from_dict(graph, root_node, json_data)

View File

@ -0,0 +1,40 @@
import networkx as nx
class Resource:
def __init__(self, resource_type):
self.resource_type = resource_type
def create_resource_class(resource_type):
return type(resource_type, (Resource,), {})
def set_resource_type(graph):
for node, data in graph.nodes(data=True):
print(node, data)
print("-----------------------------")
nodes_to_replace = []
for node, data in graph.nodes(data=True):
print(isinstance(node, Resource), node, type(node))
if isinstance(node, Resource):
print("Found a resource: ", node)
resource_type = node.resource_type
if resource_type:
# Dynamically create a new class based on the resource_type
NewResourceClass = create_resource_class(resource_type)
new_node = NewResourceClass(resource_type)
nodes_to_replace.append((node, new_node, data))
else:
print(f"Warning: Node {node} is a resource but has no resource_type")
# Replace old nodes with new ones
for old_node, new_node, data in nodes_to_replace:
graph.add_node(new_node, **data)
for pred in graph.predecessors(old_node):
graph.add_edge(pred, new_node)
for succ in graph.successors(old_node):
graph.add_edge(new_node, succ)
graph.remove_node(old_node)
"""
for node, data in graph.nodes(data=True):
print(node, data) """

View File

@ -0,0 +1,102 @@
import networkx as nx
def parse_synthea_reference(ref):
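# Synthea references come in three forms (see the bundled test data):
#   "urn:uuid:<id>"                      -> take the id (split on ':')
#   "<Type>?identifier=<system>|<value>" -> take the value (split on '|')
#   "#<contained-id>"                    -> contained resource, treated as 'mock'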
if not ref.startswith('#'):
#print("reference: ", ref)
if '?' in ref and '|' in ref:
parsed_ref = ref.split('|')[1]
# elif '/' in ref:
# parsed_ref = ref.split('/')[1]
else:
parsed_ref = ref.split(':')[2]
else:
parsed_ref = 'mock'
return(parsed_ref)
def process_references(graph):
isSynthea = False
nodes_with_reference = [[n, attr['reference']] for n, attr in graph.nodes(data=True) if 'reference' in attr]
directly_referenced_nodes = []
indirectly_referenced_nodes = []
dummy_references = []
if isSynthea:
nodes_with_mock_reference = []
for i in range(len(nodes_with_reference)):
reference = nodes_with_reference[i][1]
parsed_reference = parse_synthea_reference(reference)
if parsed_reference != 'mock':
nodes_with_reference[i].append(parsed_reference)
else:
nodes_with_mock_reference.append(i)
for i in sorted(nodes_with_mock_reference, reverse=True):
del nodes_with_reference[i]
id_to_node = {data["id"]: node for node, data in graph.nodes(data=True) if "id" in data}
id_to_identifier_node = {data["value"]: node for node, data in graph.nodes(data=True) if ("value" in data and data['label'] == 'identifier')}
for i in nodes_with_reference:
ref_id=i[2]
if ref_id in id_to_node.keys():
directly_referenced_nodes.append([i[0], id_to_node[ref_id]])
elif ref_id in id_to_identifier_node.keys():
indirectly_referenced_nodes.append([i[0], id_to_identifier_node[ref_id]])
#else:
# print("KEY ERROR: Key neither in to_node nor in to_identifier_node", i)
for i in indirectly_referenced_nodes:
node_from=list(graph.predecessors(i[0]))[0]
node_to=list(graph.predecessors(i[1]))[0]
ref_type=graph.nodes[i[0]]['label']
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
else:
#for node, data in graph.nodes(data=True):
# if "id" in data:
# if not "resourceType" in data:
# print("FAILS AT: ", data, node)
id_to_node = {data["resourceType"]+'/'+data["id"]: node for node, data in graph.nodes(data=True) if ("id" in data and "resourceType" in data)}
for i in nodes_with_reference:
ref_id=i[1]
if ref_id in id_to_node.keys():
directly_referenced_nodes.append([i[0], id_to_node[ref_id]])
else:
dummy_references.append([i[0], ref_id])
for i in directly_referenced_nodes:
node_from=list(graph.predecessors(i[0]))[0]
node_to=i[1]
ref_type=graph.nodes[i[0]]['label']
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
for i in dummy_references:
#print(i)
node_to='dummy_' + i[1]
graph.add_node(node_to, label='dummy', unique_id=i[1])
node_from=list(graph.predecessors(i[0]))[0]
ref_type=graph.nodes[i[0]]['label']
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
#graph.remove_nodes_from([i[0] for i in nodes_with_reference])
graph.remove_nodes_from([i[0] for i in directly_referenced_nodes])
graph.remove_nodes_from([i[0] for i in indirectly_referenced_nodes])
graph.remove_nodes_from([i[0] for i in dummy_references])
nodes_to_remove = [n for n, attr in graph.nodes(data=True) if attr.get('label') in ['root', 'entry', 'request']]
graph.remove_nodes_from(nodes_to_remove)
#graph.remove_nodes_from(list(nx.isolates(graph)))

View File

@ -0,0 +1,107 @@
import networkx as nx
def find_paths(graph, start_node):
def is_leaf(node):
#Checks if a node is a leaf (no outgoing edges)
return graph.out_degree(node) == 0
def custom_dfs(path, reference_count):
#Performs a DFS to find paths for both patterns
current_node = path[-1]
'''if the current node is labeled 'resource', the path length is greater than 3,
and we have exactly one 'reference' edge in the path'''
if len(path) > 3 and graph.nodes[current_node].get('label') == 'resource' and reference_count == 1:
# add path to the list of property paths containing a reference
reference_paths.append(list(path))
'''if the current node is a leaf node (no outgoing edges),
the path length is greater than 2, and we have no references in the path'''
if len(path) > 2 and is_leaf(current_node) and reference_count == 0:
'''add path to the dictionary of property paths ending in leaves,
by the corresponding property key'''
leaf_paths.setdefault(path[1].split('.')[-1], []).extend(list(path))
# check neighbors
for neighbor in graph.successors(current_node):
edge_type = graph.edges[current_node, neighbor].get('edge_type', None)
new_reference_count = reference_count + (1 if edge_type == 'reference' else 0)
# continue the search only if we have at most one 'reference' edge so far
if new_reference_count <= 1:
custom_dfs(path + [neighbor], new_reference_count)
reference_paths = []
leaf_paths = {}
custom_dfs([start_node], 0)
return reference_paths, leaf_paths
def property_convolution(graph):
# Find all nodes with label 'resource'
resource_nodes = [n for n, attr in graph.nodes(data=True) if attr.get('label') == 'resource']
#print("Got all nodes with label 'resource'", flush=True)
'''collect all paths starting with a resource node, that contain one reference edge,
end with a resource node and are >3 nodes long'''
'''collect all paths starting with a resource node, that do not contain reference edges,
end with a leaf node and are >2 nodes long'''
property_paths_with_reference = []
property_paths_with_leaves = {}
for resource_node in resource_nodes:
temp_ref_paths, temp_leaf_paths = find_paths(graph, resource_node)
# add paths to the list of property paths containing a reference, for all nodes
property_paths_with_reference.extend(temp_ref_paths)
# add paths to the dictionary of property paths ending in leaves, by the corresponding resource key
property_paths_with_leaves[resource_node] = temp_leaf_paths
# print("Collected all paths", flush=True)
# transfer reference edge to first property node for all reference paths
for i in property_paths_with_reference:
ref_edge_data = graph.get_edge_data(i[-2], i[-1])
ref_type = ref_edge_data.get('reference_type')
graph.remove_edge(i[-2], i[-1])
graph.add_edge(i[1], i[-1], edge_type='reference', reference_type=ref_type)
'''after transference, add the modified reference path (that now ends in a leaf)
to the dictionary of leaf paths, by corresponding resource and property keys'''
property_paths_with_leaves[i[0]].setdefault(i[1].split('.')[-1], []).extend(i[:-1])
#print("Transfered all references edges", flush=True)
'''create a list of collections of property paths ending in leaves,
removing duplicate nodes from each path collection'''
list_property_paths_with_leaves = [list(dict.fromkeys(i)) for j in property_paths_with_leaves.values() for i in j.values()]
nodes_to_remove=[]
for i in list_property_paths_with_leaves:
for j in range(len(i)-1, 1, -1):
source_attributes = graph.nodes[i[j]]
marker='|'.join(i[j].split('resource.')[1].split('.')[1:])
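# e.g. a node "...resource.code.coding" passes its "system" attribute to the
# first property node "...resource.code" as "coding_system"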
# transfer attributes to first property node
for attr, value in source_attributes.items():
if attr != 'label':
graph.nodes[i[1]][marker+'_'+attr] = value
nodes_to_remove.append(i[j])
#print("Transferred attributes for all paths", flush=True)
graph.remove_nodes_from(nodes_to_remove)
for i in resource_nodes:
unique_resource_id = graph.nodes[i]['resourceType']+'/'+graph.nodes[i]['id']
graph.nodes[i]['unique_id'] = unique_resource_id
for j in graph.successors(i):
if graph[i][j].get('edge_type') != 'reference':
graph.nodes[j]['unique_id'] = unique_resource_id+'/'+j.split('.')[-1]

View File

@ -0,0 +1,276 @@
from biocypher import BioCypher
import networkx as nx
import json
import os
import sys
import re
import uuid
import gc
from dotenv import load_dotenv
from graphCreation import create_graph
from graphCreation.process_references import process_references
from graphCreation.property_convolution import property_convolution
from schema_config_generation import write_automated_schema
from fhirImport import getPatientEverything, getBundle
def load_multiple_fhir_patients(n):
#graph = nx.DiGraph()
init = True
ids = []
#get n ids
nextIds = True
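# page through the Patient search results via the bundle's 'next' link until
# n patient ids have been collected or no further page exists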
while len(ids) < n and nextIds:
if init:
complex = os.getenv('COMPLEX_PATIENTS')
if complex and complex.upper() != 'TRUE':
bundle = getBundle(None, '/Patient?_count=' + str(n))
else:
bundle = getBundle(None, '/Patient?_has:Observation:subject:status=final&_count=' + str(n))
else:
bundle = getBundle(None, nextLink)
if not 'entry' in bundle.json():
print("ERROR -- No data found in the fhir bundle. Check the request and if the server is up and responding")
sys.exit(1)
for entry in bundle.json()['entry']:
ids.append(entry['resource']['id'])
nextIds = False
for l in bundle.json()['link']:
if l['relation'] == "next":
nextLink = l['url']
nextIds = True
if len(ids) < n:
n = len(ids)
batchSize = int(os.getenv('BATCH_SIZE'))
c = 0
print(len(ids))
#get bundle for each ID
for id in ids:
c += 1
bundle = getPatientEverything(id).json()
bundle = replace_single_quotes(bundle) ### maybe not needed for German data
if init:
graph = nx.DiGraph()
init = False
create_graph.add_json_to_networkx(bundle, id + '_bundle', graph)
if c % 50 == 0:
print("---------- ", c, " patients loaded ----------", flush=True)
if c % batchSize == 0 or c == n:
print(c, " patients imported, reducing graph", flush = True)
process_references(graph)
property_convolution(graph)
lastChunk = False
if n == c:
lastChunk = True
runBioCypher(graph, lastChunk)
init = True
print(graph)
del graph
gc.collect()
def replace_single_quotes(obj):
if isinstance(obj, str): # If it's a string, replace single quotes
return obj.replace("'", "''")
elif isinstance(obj, dict): # If it's a dictionary, process each key-value pair
return {key: replace_single_quotes(value) for key, value in obj.items()}
elif isinstance(obj, list): # If it's a list, process each item
return [replace_single_quotes(item) for item in obj]
else:
return obj # Leave other data types unchanged
def main():
## create networkX and run improvement scripts
print("Creating the graph...", flush=True)
nPatients = int(os.getenv('NUMBER_OF_PATIENTS'))
load_multiple_fhir_patients(nPatients)
def runBioCypher(nxGraph, final):
#get lists of node and edge types
print("Generate auto schema...", flush=True)
write_automated_schema(nxGraph, 'config/automated_schema.yaml', 'config/manual_schema_config.yaml')
# create Biocypher driver
bc = BioCypher(
biocypher_config_path="config/biocypher_config.yaml",
)
#bc.show_ontology_structure() #very extensive
# BioCypher preparation
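# node_generator and edge_generator yield the tuples expected by
# BioCypher.write_nodes / write_edges as used below:
#   nodes: (node_id, node_label, properties_dict)
#   edges: (edge_id, source_id, target_id, edge_label, properties_dict)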
def node_generator():
for node in nxGraph.nodes():
label = nxGraph.nodes[node].get('label')
if label == "resource":
label = nxGraph.nodes[node].get('resourceType')
nxGraph.nodes[node]['label'] = label.capitalize()
label = label.capitalize()
unq_id = nxGraph.nodes[node].get('unique_id', False)
if(nxGraph.nodes[node].get('label') in ['search', 'meta', 'link']):
#print("skipped a node: ", nxGraph.nodes[node].get('label'))
continue
label = nxGraph.nodes[node].get('label')
if(label == 'dummy'):
#print("SKIPPED dummy node: ", unq_id)
continue
yield(
nxGraph.nodes[node].get('unique_id', node), # remark: returns the 'unique_id' attribute if it exists, otherwise the networkx node identifier
label,
nxGraph.nodes[node] # get properties
)
def edge_generator():
for edge in nxGraph.edges(data = True):
source, target, attributes = edge
sLabel = nxGraph.nodes[source].get('label')
if sLabel == 'resource':
sLabel = nxGraph.nodes[source].get('resourceType')
tLabel = nxGraph.nodes[target].get('label')
if tLabel == 'resource':
tLabel = nxGraph.nodes[target].get('resourceType')
label = sLabel.capitalize() + '_to_' + tLabel
yield(
attributes.get('id', str(uuid.uuid4())), # Edge ID (if exists, otherwise use nx internal id)
nxGraph.nodes[source].get('unique_id', source),
nxGraph.nodes[target].get('unique_id', target),
label,
attributes # All edge attributes
)
#import nodes
bc.write_nodes(node_generator())
bc.write_edges(edge_generator())
#write the import script -- we are creating our own script since BC would only consider the last batch as an input
if final:
print("CREATING THE SCRIPT")
generate_neo4j_import_script()
with open('/neo4j_import/shell-scipt-complete', 'w') as f:
f.write('Import completed successfully')
print("FHIR import completed successfully")
def generate_neo4j_import_script(directory_path="/neo4j_import/", output_file="neo4j-admin-import-call.sh"):
"""
Reads files in a directory and generates a Neo4j import shell script.
Args:
directory_path (str): Path to the directory containing CSV files
output_file (str): Name of the output shell script file
Returns:
str: Path to the generated shell script
"""
# Get all files in the directory
all_files = os.listdir(directory_path)
# Dictionary to store entity types (nodes and relationships)
entity_types = {}
# Find all header files and use them to identify entity types
for filename in all_files:
if '-header.csv' in filename:
entity_name = filename.split('-header.csv')[0]
# Check if it's a relationship (contains "To" and "Association")
is_relationship = "To" in entity_name and "Association" in entity_name
# Store in entity_types dictionary
if is_relationship:
entity_type = "relationships"
else:
entity_type = "nodes"
# Initialize the entity if not already present
if entity_name not in entity_types:
entity_types[entity_name] = {
"type": entity_type,
"header": f"/neo4j_import/{filename}",
"has_parts": False
}
# Check for part files for each entity
for entity_name in entity_types:
# Create pattern to match part files for this entity
part_pattern = f"{entity_name}-part"
# Check if any file matches the pattern
for filename in all_files:
if part_pattern in filename:
entity_types[entity_name]["has_parts"] = True
break
# Generate the import commands
nodes_command = ""
relationships_command = ""
for entity_name, info in entity_types.items():
if info["has_parts"]:
# Create the command string with wildcard for part files
command = f" --{info['type']}=\"{info['header']},/neo4j_import/{entity_name}-part.*\""
# Add to appropriate command string
if info['type'] == "nodes":
nodes_command += command
else: # relationships
relationships_command += command
# Create the shell script content
script_content = """#!/bin/bash
version=$(bin/neo4j-admin --version | cut -d '.' -f 1)
if [[ $version -ge 5 ]]; then
\tbin/neo4j-admin database import full neo4j --delimiter="\\t" --array-delimiter="|" --quote="'" --overwrite-destination=true --skip-bad-relationships=true --skip-duplicate-nodes=true{nodes}{relationships}
else
\tbin/neo4j-admin import --database=neo4j --delimiter="\\t" --array-delimiter="|" --quote="'" --force=true --skip-bad-relationships=true --skip-duplicate-nodes=true{nodes}{relationships}
fi
""".format(nodes=nodes_command, relationships=relationships_command)
# Write the script to file
script_path = os.path.join(directory_path, output_file)
with open(script_path, 'w') as f:
f.write(script_content)
# Make the script executable
os.chmod(script_path, 0o755)
print("Shell import script created", flush=True)
if __name__ == "__main__":
main()

199
import_nx_diGraph.py Normal file
View File

@ -0,0 +1,199 @@
from biocypher import BioCypher
import networkx as nx
import json
import os
import uuid
#from networkx_based import create_graph
from graphCreation import create_graph
#from networkx_based.process_references import process_references
from graphCreation.process_references import process_references
#from networkx_based.property_convolution import property_convolution
from graphCreation.property_convolution import property_convolution
from schema_config_generation import write_automated_schema
#from networkx_based.node_typing import set_ressource_type
from graphCreation.node_typing import set_resource_type
def load_multiple_fhir_bundles(directory_path):
graph = nx.DiGraph()
init = True
#limit = 2
# Iterate over all files in the directory
for filename in os.listdir(directory_path):
if filename.endswith('.json'): # Assuming FHIR bundles are in JSON format
file_path = os.path.join(directory_path, filename)
with open(file_path, 'r') as f:
bundle_json = json.load(f)
#fix all strings to enable ' in neo4j
fixedQuotes = replace_single_quotes(bundle_json)
if init:
#print(bundle_json, filename, graph)
create_graph.add_json_to_networkx(fixedQuotes, filename, graph)
init = False
else:
create_graph.add_json_to_networkx(fixedQuotes, filename, graph)
print("Imported: ", filename)
#if limit == 0:
# return graph
#limit = limit - 1
return graph
def replace_single_quotes(obj):
if isinstance(obj, str): # If it's a string, replace single quotes
return obj.replace("'", "''")
elif isinstance(obj, dict): # If it's a dictionary, process each key-value pair
return {key: replace_single_quotes(value) for key, value in obj.items()}
elif isinstance(obj, list): # If it's a list, process each item
return [replace_single_quotes(item) for item in obj]
else:
return obj # Leave other data types unchanged
def main():
#get a list of nodes that should be imported
## create networkX and run improvement scripts
print("Creating the graph...", flush=True)
nxGraph = load_multiple_fhir_bundles('./testData/') # 'mockData' for unit test data, 'testData' for Synthea files
print(nxGraph)
print("Reducing references...", flush=True)
process_references(nxGraph)
print(nxGraph)
print("Convolute references...", flush=True)
property_convolution(nxGraph)
print(nxGraph)
#Set types of all resource nodes to resource_type
#set_resource_type(nxGraph)
#get lists of node and edge types
""" all_nLabels = set()
all_eLabels = set()
for node, attrs in nxGraph.nodes(data=True):
for attr_name, attr_value in attrs.items():
if attr_name == "label":
all_nLabels.add(attr_value)
for nt in all_nLabels:
print(nt)
print("-" * 50)
for u, v, attrs in nxGraph.edges(data=True):
u_label = nxGraph.nodes[u]['label']
if u_label == "resource":
u_label = nxGraph.nodes[u]['resourceType']
v_label = nxGraph.nodes[v]['label']
if v_label == "resource":
v_label = nxGraph.nodes[v]['resourceType']
all_eLabels.add(u_label + " to " + v_label)
for et in all_eLabels:
print(et)
print("-" * 50)
print("...end")
return """
print("Generate auto schema...")
write_automated_schema(nxGraph, 'config/automated_schema.yaml')
# create Biocypher driver
bc = BioCypher(
biocypher_config_path="config/biocypher_config.yaml",
#schema_config_path="/config/manual_schema_config.yaml"
)
bc.show_ontology_structure()
# BioCypher preparation
## node generator: extract id, label and property dictionary
def node_generator():
for node in nxGraph.nodes():
""" #single qoutes break neo4j import, e.g. 'CHILDREN'S Hospital'
checkDisplay = nxGraph.nodes[node].get('display')
if checkDisplay:
checkDisplay = checkDisplay.replace("'", "''")
nxGraph.nodes[node]['display'] = checkDisplay
#print("------->", nxGraph.nodes[node].get('display'))
checkName = nxGraph.nodes[node].get('name')
if checkName:
checkName = checkName.replace("'", "''")
nxGraph.nodes[node]['name'] = checkName
#print("------->", nxGraph.nodes[node].get('name')) """
label = nxGraph.nodes[node].get('label')
if label == "resource":
label = nxGraph.nodes[node].get('resourceType')
'''
elif label == 'identifier':
label = nxGraph.nodes[node].get('system')
print('/' in label)
if '/' in label:
lastSlash = label.rfind('/') + 1
label = label[lastSlash:] + '-ID'
elif label == 'telecom':
label = nxGraph.nodes[node].get('system')
print('/' in label)
if '/' in label:
lastSlash = label.rfind('/') + 1
label = 'telecom-' + label[lastSlash:]
elif label == 'address':
extension = nxGraph.nodes[node].get('extension_url')
print("EX!: ", extension)
if extension:
lastSlash = extension.rfind('/') + 1
label = label + '-' + extension[lastSlash:]
'''
yield(
nxGraph.nodes[node].get('id', node), # remark: returns the 'id' attribute if it exists, otherwise the networkx node identifier
label,
nxGraph.nodes[node] # get properties
)
def edge_generator():
for edge in nxGraph.edges(data = True):
source, target, attributes = edge
sLabel = nxGraph.nodes[source].get('label')
if sLabel == 'resource':
sLabel = nxGraph.nodes[source].get('resourceType')
tLabel = nxGraph.nodes[target].get('label')
if tLabel == 'resource':
tLabel = nxGraph.nodes[target].get('resourceType')
label = sLabel + '_to_' + tLabel
yield(
attributes.get('id', str(uuid.uuid4())), # Edge ID (if exists, otherwise use nx internal id)
nxGraph.nodes[source].get('id', source),
nxGraph.nodes[target].get('id', target),
label,
attributes # All edge attributes
)
#import nodes
bc.write_nodes(node_generator())
bc.write_edges(edge_generator())
#write the import script
bc.write_import_call()
if __name__ == "__main__":
#print("Called import script. Should run its main function now...")
main()

View File

@ -0,0 +1,8 @@
// Example initialization script - modify according to your schema
CREATE CONSTRAINT IF NOT EXISTS FOR (n:YourLabel) REQUIRE n.id IS UNIQUE;
CREATE INDEX IF NOT EXISTS FOR (n:YourLabel) ON (n.someProperty);
// Add any other initialization queries here
// For example:
// CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE n.email IS UNIQUE;
// CREATE INDEX IF NOT EXISTS FOR (n:Product) ON (n.sku);

View File

@ -0,0 +1,210 @@
{
"resourceType": "Bundle",
"type": "transaction",
"entry": [ {
"fullUrl": "urn:uuid:a7a285c0-4714-dd3c-4837-8719c9b67873",
"resource": {
"resourceType": "Patient",
"id": "a7a285c0-4714-dd3c-4837-8719c9b67873",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient" ]
},
"text": {
"status": "generated",
"div": "<div xmlns=\"http://www.w3.org/1999/xhtml\">Generated by <a href=\"https://github.com/synthetichealth/synthea\">Synthea</a>.Version identifier: 3c23908\n . Person seed: -5557164924473669144 Population seed: 1693908535569</div>"
},
"extension": [ {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race",
"extension": [ {
"url": "ombCategory",
"valueCoding": {
"system": "urn:oid:2.16.840.1.113883.6.238",
"code": "2106-3",
"display": "White"
}
}, {
"url": "text",
"valueString": "White"
} ]
}, {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity",
"extension": [ {
"url": "ombCategory",
"valueCoding": {
"system": "urn:oid:2.16.840.1.113883.6.238",
"code": "2186-5",
"display": "Not Hispanic or Latino"
}
}, {
"url": "text",
"valueString": "Not Hispanic or Latino"
} ]
}, {
"url": "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName",
"valueString": "Leana211 Sauer652"
}, {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex",
"valueCode": "M"
}, {
"url": "http://hl7.org/fhir/StructureDefinition/patient-birthPlace",
"valueAddress": {
"city": "Quincy",
"state": "Massachusetts",
"country": "US"
}
}, {
"url": "http://synthetichealth.github.io/synthea/disability-adjusted-life-years",
"valueDecimal": 0.0
}, {
"url": "http://synthetichealth.github.io/synthea/quality-adjusted-life-years",
"valueDecimal": 1.0
} ],
"identifier": [
{
"system": "https://github.com/synthetichealth/synthea",
"value": "a7a285c0-4714-dd3c-4837-8719c9b67873"
},
{
"type": {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v2-0203",
"code": "MR",
"display": "Medical Record Number"
} ],
"text": "Medical Record Number"
},
"system": "http://hospital.smarthealthit.org",
"value": "a7a285c0-4714-dd3c-4837-8719c9b67873"
}, {
"type": {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v2-0203",
"code": "SS",
"display": "Social Security Number"
} ],
"text": "Social Security Number"
},
"system": "http://hl7.org/fhir/sid/us-ssn",
"value": "999-89-9528"
} ],
"name": [ {
"use": "official",
"family": "Schoen8",
"given": [ "Johnny786", "Vince741" ]
} ],
"telecom": [ {
"system": "phone",
"value": "555-753-6560",
"use": "home"
} ],
"gender": "male",
"birthDate": "2021-05-22",
"address": [ {
"extension": [ {
"url": "http://hl7.org/fhir/StructureDefinition/geolocation",
"extension": [ {
"url": "latitude",
"valueDecimal": 42.05921178859317
}, {
"url": "longitude",
"valueDecimal": -70.79219595855132
} ]
} ],
"line": [ "463 Rempel Ranch Unit 81" ],
"city": "Pembroke",
"state": "MA",
"postalCode": "00000",
"country": "US"
} ],
"maritalStatus": {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus",
"code": "S",
"display": "Never Married"
} ],
"text": "Never Married"
},
"multipleBirthBoolean": false,
"communication": [ {
"language": {
"coding": [ {
"system": "urn:ietf:bcp:47",
"code": "en-US",
"display": "English (United States)"
} ],
"text": "English (United States)"
}
} ]
},
"request": {
"method": "POST",
"url": "Patient"
}
}, {
"fullUrl": "urn:uuid:0eb53bda-2881-5e8e-3597-87a9430af96a",
"resource": {
"resourceType": "Encounter",
"id": "0eb53bda-2881-5e8e-3597-87a9430af96a",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-encounter" ]
},
"identifier": [ {
"use": "official",
"system": "https://github.com/synthetichealth/synthea",
"value": "0eb53bda-2881-5e8e-3597-87a9430af96a"
} ],
"status": "finished",
"class": {
"system": "http://terminology.hl7.org/CodeSystem/v3-ActCode",
"code": "AMB"
},
"type": [ {
"coding": [ {
"system": "http://snomed.info/sct",
"code": "410620009",
"display": "Well child visit (procedure)"
} ],
"text": "Well child visit (procedure)"
} ],
"subject": {
"reference": "urn:uuid:a7a285c0-4714-dd3c-4837-8719c9b67873",
"display": "Johnny786 Vince741 Schoen8"
},
"participant": [ {
"type": [ {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v3-ParticipationType",
"code": "PPRF",
"display": "primary performer"
} ],
"text": "primary performer"
} ],
"period": {
"start": "2021-05-22T00:13:45+02:00",
"end": "2021-05-22T00:28:45+02:00"
},
"individual": {
"reference": "Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999942599",
"display": "Dr. Regenia619 Bosco882"
}
} ],
"period": {
"start": "2021-05-22T00:13:45+02:00",
"end": "2021-05-22T00:28:45+02:00"
},
"location": [ {
"location": {
"reference": "Location?identifier=https://github.com/synthetichealth/synthea|6e3d04a3-9064-33e4-b8b5-63bb468d7629",
"display": "UNITED MEDICAL CARE LLC"
}
} ],
"serviceProvider": {
"reference": "Organization?identifier=https://github.com/synthetichealth/synthea|4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
"display": "UNITED MEDICAL CARE LLC"
}
},
"request": {
"method": "POST",
"url": "Encounter"
}
}]}

View File

@ -0,0 +1,98 @@
{
"resourceType": "Bundle",
"type": "batch",
"entry": [ {
"fullUrl": "urn:uuid:4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
"resource": {
"resourceType": "Organization",
"id": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-organization" ]
},
"extension": [ {
"url": "http://synthetichealth.github.io/synthea/utilization-encounters-extension",
"valueInteger": 9
}, {
"url": "http://synthetichealth.github.io/synthea/utilization-procedures-extension",
"valueInteger": 2
}, {
"url": "http://synthetichealth.github.io/synthea/utilization-labs-extension",
"valueInteger": 1
}, {
"url": "http://synthetichealth.github.io/synthea/utilization-prescriptions-extension",
"valueInteger": 3
} ],
"identifier": [ {
"system": "https://github.com/synthetichealth/synthea",
"value": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
} ],
"active": true,
"type": [ {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/organization-type",
"code": "prov",
"display": "Healthcare Provider"
} ],
"text": "Healthcare Provider"
} ],
"name": "UNITED MEDICAL CARE LLC",
"telecom": [ {
"system": "phone",
"value": "5089715500"
} ],
"address": [ {
"line": [ "28 RIVERSIDE DR STE 101" ],
"city": "PEMBROKE",
"state": "MA",
"postalCode": "023594947",
"country": "US"
} ]
},
"request": {
"method": "POST",
"url": "Organization",
"ifNoneExist": "identifier=https://github.com/synthetichealth/synthea|4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
}
}, {
"fullUrl": "urn:uuid:6e3d04a3-9064-33e4-b8b5-63bb468d7629",
"resource": {
"resourceType": "Location",
"id": "6e3d04a3-9064-33e4-b8b5-63bb468d7629",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-location" ]
},
"identifier": [ {
"system": "https://github.com/synthetichealth/synthea",
"value": "6e3d04a3-9064-33e4-b8b5-63bb468d7629"
} ],
"status": "active",
"name": "UNITED MEDICAL CARE LLC",
"telecom": [ {
"system": "phone",
"value": "5089715500"
} ],
"address": {
"line": [ "28 RIVERSIDE DR STE 101" ],
"city": "PEMBROKE",
"state": "MA",
"postalCode": "023594947",
"country": "US"
},
"position": {
"longitude": -70.77534154695786,
"latitude": 42.11004715
},
"managingOrganization": {
"identifier": {
"system": "https://github.com/synthetichealth/synthea",
"value": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
},
"display": "UNITED MEDICAL CARE LLC"
}
},
"request": {
"method": "POST",
"url": "Location",
"ifNoneExist": "identifier=https://github.com/synthetichealth/synthea|6e3d04a3-9064-33e4-b8b5-63bb468d7629"
}
}]}

View File

@ -0,0 +1,50 @@
{
"resourceType": "Bundle",
"type": "batch",
"entry": [ {
"fullUrl": "urn:uuid:0368f101-0e65-3251-a809-566ebd6b2c2a",
"resource": {
"resourceType": "Practitioner",
"id": "0368f101-0e65-3251-a809-566ebd6b2c2a",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-practitioner" ]
},
"extension": [ {
"url": "http://synthetichealth.github.io/synthea/utilization-encounters-extension",
"valueInteger": 9
} ],
"identifier": [ {
"system": "http://hl7.org/fhir/sid/us-npi",
"value": "9999942599"
} ],
"active": true,
"name": [ {
"family": "Bosco882",
"given": [ "Regenia619" ],
"prefix": [ "Dr." ]
} ],
"telecom": [ {
"extension": [ {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-direct",
"valueBoolean": true
} ],
"system": "email",
"value": "Regenia619.Bosco882@example.com",
"use": "work"
} ],
"address": [ {
"line": [ "28 RIVERSIDE DR STE 101" ],
"city": "PEMBROKE",
"state": "MA",
"postalCode": "023594947",
"country": "US"
} ],
"gender": "female"
},
"request": {
"method": "POST",
"url": "Practitioner",
"ifNoneExist": "identifier=http://hl7.org/fhir/sid/us-npi|9999942599"
}
}]}
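These Synthea fixtures are plain FHIR batch bundles: each entry carries a POST request with an ifNoneExist clause, so loading the same file twice does not duplicate resources. A minimal sketch of pushing one such bundle to a test FHIR server with requests (the server URL and file name are illustrative assumptions, not part of the pipeline):

import json
import requests

# a batch Bundle is POSTed to the server base URL; the per-entry
# request.method/url and ifNoneExist drive the conditional creates
with open("practitioner_bundle.json") as f:   # illustrative file name
    bundle = json.load(f)

resp = requests.post(
    "http://hapi.fhir.org/baseR4",            # illustrative public test server
    json=bundle,
    headers={"Content-Type": "application/fhir+json"},
    timeout=60,
)
resp.raise_for_status()
# each bundle entry gets its own response status, e.g. "201 Created" or "200 OK"
print(resp.json()["entry"][0]["response"]["status"])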

44
pipeline.puml Normal file
View File

@ -0,0 +1,44 @@
@startuml "MeDaX pipeline"
left to right direction
actor admin
database "fhir server" as fhir
node "docker compose" as compose{
node "python app" as pyApp {
[scripts]
[nodeGenerator] as ngen
[edgeGenerator] as egen
[BioCypher] as BC
file "generated Schema" as gSchema
file "manual Schema" as mSchema
mSchema --> scripts : input
scripts --> gSchema : generates
scripts --> ngen : generates
scripts --> egen : generates
gSchema --> BC : input
ngen --> BC : input
egen --> BC : input
}
node "neo4j app" as neoApp{
database "neo4j GDB" as neoDB
[web server] as neoServer
neoDB --> neoServer
}
folder "admin files" as afiles {
file nodes
file edges
file "import script" as iscript
}
admin -[dashed]-> compose : triggers
BC --> afiles : exports
fhir --> scripts : http request
afiles --> neoApp : input
}
actor user
user --> neoServer : uses
neoApp --> pyApp : kills
@enduml
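In the diagram, the node and edge generators feed BioCypher, which exports the admin import files consumed by the Neo4j container. A minimal sketch of that hand-off, assuming BioCypher's documented Python interface (the BioCypher class with write_nodes / write_edges / write_import_call); the generator contents and file names are illustrative, not this repository's actual modules:

from biocypher import BioCypher

def node_generator():
    # BioCypher node tuples: (id, label, properties)
    yield ("patient-1", "Patient", {"gender": "female"})

def edge_generator():
    # BioCypher edge tuples: (id, source id, target id, label, properties)
    yield (None, "encounter-1", "patient-1", "Encounter_to_Patient", {})

bc = BioCypher(schema_config_path="generated_schema.yaml")
bc.write_nodes(node_generator())
bc.write_edges(edge_generator())
bc.write_import_call()  # writes the neo4j-admin import script into the output folder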

3315
poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

132
pyproject.toml Normal file
View File

@ -0,0 +1,132 @@
[tool.poetry]
name = "MeDaX pipeline"
version = "1.0.0"
description = "A unifying framework for biomedical research knowledge graphs"
authors = [
"Ilya Mazien",
"Tom Gebhardt",
"Lea Michaelis",
"Ron Henkel",
"Benjamin Winter",
"Dagmar Waltemath",
"Judith Wodke"
]
license = "MIT"
packages = [
{ include = "biocypher" }
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Natural Language :: English",
"Topic :: Scientific/Engineering :: Bio-Informatics"
]
repository = "https://github.com/biocypher/biocypher"
readme = "README.md"
[project.urls]
Homepage = "https://www.medizin.uni-greifswald.de/medizininformatik/research/current-projects/medax/"
[tool.poetry.dependencies]
python = "^3.9"
PyYAML = ">=5.0"
more_itertools = "*"
appdirs = "*"
treelib = "1.6.4"
rdflib = "^6.2.0"
networkx = "^3.0"
stringcase = "^1.2.0"
neo4j-utils = "0.0.7"
pandas = "^2.0.1"
pooch = "^1.7.0"
tqdm = "^4.65.0"
[tool.poetry.group.dev.dependencies]
sphinx = ">=5.0.0"
sphinx-design = "^0.3.0"
sphinx-rtd-theme = ">=1.0.0"
sphinx-last-updated-by-git = ">=0.3"
sphinx-autodoc-typehints = ">=1.18.0"
myst-parser = "^0.18.0"
yapf = "^0.32.0"
pytest = ">=6.0"
tox = ">=3.20.1"
pre-commit = ">=2.17.0"
bump2version = "*"
coverage = ">=6.0"
pytest-cov = "^3.0.0"
hypothesis = "^6.50.1"
isort = "^5.10.1"
ipython = "^8.7.0"
ipykernel = "^6.23.1"
sphinxext-opengraph = "^0.8.2"
coverage-badge = "^1.1.0"
nbsphinx = "^0.9.2"
black = "^23.9.1"
flake8 = "^6.1.0"
[build-system]
requires = ["poetry-core<2.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.poetry.urls]
"Bug Tracker" = "https://github.com/biocypher/biocypher/issues"
[tool.pytest.ini_options]
log_cli = true
log_level = "INFO"
markers = [
"requires_neo4j: Requires connection to a Neo4j server",
"requires_postgresql: Requires connection to a PostgreSQL server",
"inject_driver_args(driver_args): Arguments for the Driver",
]
[tool.black]
line-length = 80
target-version = ['py310']
include = '\.pyi?$'
exclude = '''
(
/(
\.eggs
| \.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
)/
)
'''
[tool.isort]
from_first = true
line_length = 80
multi_line_output = 3
include_trailing_comma = true
use_parentheses = true
known_num="numpy,pandas"
sections = "FUTURE,STDLIB,THIRDPARTY,NUM,FIRSTPARTY,LOCALFOLDER"
no_lines_before="LOCALFOLDER"
balanced_wrapping = true
force_grid_wrap = 0
length_sort = "1"
indent = " "
profile = "black"
[tool.flake8]
ignore = ["E203", "D200", "D202", "D401", "D105", "W504"]
per-file-ignores = [
"docs/source/conf.py:D100",
"tests/*:D100,D101,D102",
"*/__init__.py:F401"
]
max-line-length = 80
count = true

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
requests==2.31.0
python-dotenv==1.0.0
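These two packages cover the extraction side: python-dotenv reads the server configuration from the environment and requests pulls the resources. A minimal sketch of such a fetch, assuming an illustrative FHIR_SERVER_URL variable and the public R4 test endpoint as fallback:

import os

import requests
from dotenv import load_dotenv

load_dotenv()  # pick up the .env configuration
base = os.getenv("FHIR_SERVER_URL", "http://hapi.fhir.org/baseR4")

# a FHIR search returns a searchset Bundle; _count caps the page size
resp = requests.get(f"{base}/Patient", params={"_count": 10}, timeout=30)
resp.raise_for_status()
bundle = resp.json()
print(bundle["resourceType"], "with", len(bundle.get("entry", [])), "entries")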

188
schema_config_generation.py Normal file
View File

@ -0,0 +1,188 @@
#!/usr/bin/env python
# coding: utf-8
"""Generate a BioCypher schema configuration (YAML) from a FHIR-derived networkx graph."""
from pathlib import Path

import yaml


def write_automated_schema(graph, filePath, mSchemaPath):
    """Extract all node and edge types from the graph and write a basic YAML schema to filePath.

    An already generated schema at filePath or, failing that, the manual schema at
    mSchemaPath is loaded first so that curated entries are kept and only extended.
    """
    schemaData = {
        'nodes': {},
        'edges': {}
    }
    if Path(filePath).exists():
        schemaData = loadManualSchema(filePath)
    elif mSchemaPath:
        print("using the manual schema")
        schemaData = loadManualSchema(mSchemaPath)

    # collect node types and their properties; all property types default to str
    for node in graph.nodes():
        label = graph.nodes[node].get('label')
        if label == 'resource':
            label = graph.nodes[node].get('resourceType')
        label = label.capitalize()
        if label not in schemaData['nodes']:
            schemaData['nodes'][label] = {}
        if 'properties' not in schemaData['nodes'][label]:
            schemaData['nodes'][label]['properties'] = {}
        for k in graph.nodes[node].keys():
            schemaData['nodes'][label]['properties'][k] = 'str'

    with open(filePath, 'w') as file:
        # write node entries, falling back to defaults where the schema gives no value
        for n in schemaData['nodes']:
            temp = n + ':\n'
            temp += '  is_a: ' + schemaData['nodes'][n].get('is_a', 'named thing') + '\n'
            temp += '  represented_as: ' + schemaData['nodes'][n].get('represented_as', 'node') + '\n'
            temp += '  preferred_id: ' + schemaData['nodes'][n].get('preferred_id', 'fhir_id') + '\n'
            temp += '  label_in_input: ' + schemaData['nodes'][n].get('label_in_input', n) + '\n'
            temp += '  properties:\n'
            # take property names and types from schemaData where they exist
            for pKey, pType in schemaData['nodes'][n].get('properties', {}).items():
                temp += '    ' + pKey + ': ' + str(pType) + '\n'
            temp += '\n'
            file.write(temp)
            file.write('\n')

        # extract all relationship types and generate the basic YAML part for edges;
        # skip source/target pairs the schema already covers under one of its phrasings
        # (TODO: merge missing attributes into those existing entries)
        knownPhrasings = [' to ', ' derived from ', ' has member ', ' reasoned by ', ' is ']
        for u, v, a in graph.edges(data=True):
            source_label = graph.nodes[u].get('label')
            target_label = graph.nodes[v].get('label')
            if source_label == 'resource':
                source_label = graph.nodes[u].get('resourceType', str(u))
            if target_label == 'resource':
                target_label = graph.nodes[v].get('resourceType', str(v))
            source_label = source_label.capitalize()
            if any(source_label + phrasing + target_label + ' association' in schemaData['edges']
                   for phrasing in knownPhrasings):
                continue
            schemaData['edges'][source_label + ' to ' + target_label + ' association'] = {
                'is_a': 'association',
                'represented_as': 'edge',
                'label_in_input': source_label + '_to_' + target_label,
                # property types default to str, mirroring the node handling
                'properties': {k: 'str' for k in a},
            }

        for label in schemaData['edges']:
            temp = label + ':\n'
            for key in schemaData['edges'][label]:
                if key == 'properties':
                    temp += '  properties:\n'
                    for prop, pType in schemaData['edges'][label][key].items():
                        temp += '    ' + prop + ': ' + str(pType) + '\n'
                else:
                    temp += '  ' + key + ': ' + str(schemaData['edges'][label][key]) + '\n'
            temp += '\n'
            file.write(temp)
def loadManualSchema(path):
    """Load an existing schema YAML and split its entries into node and edge definitions."""
    schemaData = {
        'nodes': {},
        'edges': {}
    }
    with open(path, 'r') as file:
        # comments in the YAML are dropped by the parser
        data = yaml.safe_load(file)
    for label, attrs in data.items():
        if label == 'Title':
            continue
        cLabel = label.capitalize()
        # assuming uniqueness in the schema file: if the same type exists twice, the last entry wins
        if attrs["represented_as"] == 'node':
            schemaData['nodes'][cLabel] = attrs
        else:
            schemaData['edges'][cLabel] = attrs
    return schemaData
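
A usage sketch for the two functions above, assuming the module is importable as schema_config_generation and that the graph follows the pipeline's conventions (nodes labelled 'resource' carry a resourceType attribute); node ids and file names are illustrative:

import networkx as nx

from schema_config_generation import write_automated_schema

g = nx.DiGraph()
g.add_node("pat-1", label="resource", resourceType="Patient", gender="female")
g.add_node("enc-1", label="resource", resourceType="Encounter", status="finished")
g.add_edge("enc-1", "pat-1", relation="subject")

# writes Patient/Encounter node entries and an
# "Encounter to Patient association" edge entry to the YAML file;
# pass a manual schema path instead of None to seed curated entries
write_automated_schema(g, "generated_schema.yaml", None)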

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff