release commit
commit a9db0be88a
10
.bumpversion.cfg
Normal file
@@ -0,0 +1,10 @@
[bumpversion]
current_version = 0.6.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
serialize = {major}.{minor}.{patch}

[bumpversion:file:pyproject.toml]

[bumpversion:file:biocypher/_metadata.py]
13
.env.example
Normal file
@@ -0,0 +1,13 @@
MODE=testserver
COMPLEX_PATIENTS=TRUE
FHIR_SERVER_URL=http://hapi.fhir.org/baseR4
#FHIR_SERVER_USER=
#FHIR_SERVER_PW=
#HTTP_PROXY=
#HTTPS_PROXY=
#NO_PROXY=
NUMBER_OF_PATIENTS=100
BATCH_SIZE=35
41
.gitea/actions/test/action.yaml
Normal file
@@ -0,0 +1,41 @@
|
||||
name: "Test and code quality"
|
||||
description: "Run tests and code quality checks"
|
||||
inputs:
|
||||
NEO4J_VERSION:
|
||||
description: "Neo4j version"
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
#----------------------------------------------
|
||||
# setup docker containers for testing
|
||||
#----------------------------------------------
|
||||
# currently only running on Linux due to technical limitations
|
||||
# - name: Install Docker
|
||||
# uses: douglascamata/setup-docker-macos-action@v1-alpha
|
||||
# if: ${{ runner.os == 'macOS' }}
|
||||
- name: Start Neo4j Docker
|
||||
run: docker run --restart always --publish=7474:7474 --publish=7687:7687 --env NEO4J_AUTH=neo4j/your_password_here --env NEO4J_PLUGINS='["apoc"]' --env=NEO4J_ACCEPT_LICENSE_AGREEMENT=yes -d neo4j:${{ inputs.NEO4J_VERSION }}
|
||||
shell: bash
|
||||
if: ${{ runner.os == 'Linux' }}
|
||||
- name: Start Postgres Docker
|
||||
run: docker run --restart always --publish=5432:5432 --env POSTGRES_PASSWORD=postgres -d postgres:11.21-bullseye
|
||||
shell: bash
|
||||
if: ${{ runner.os == 'Linux' }}
|
||||
#----------------------------------------------
|
||||
# run tests and code quality checks
|
||||
#----------------------------------------------
|
||||
- name: Run Tests (Windows)
|
||||
run: |
|
||||
poetry run pytest --version
|
||||
poetry run pytest --password=your_password_here
|
||||
shell: bash
|
||||
if: runner.os == 'Windows'
|
||||
- name: Run tests (Linux and MacOS)
|
||||
run: |
|
||||
poetry run pytest --version
|
||||
poetry run pytest --password=your_password_here
|
||||
shell: bash
|
||||
if: runner.os != 'Windows'
|
||||
- name: Check code quality
|
||||
uses: pre-commit/action@v3.0.0
|
33
.gitignore
vendored
Normal file
@@ -0,0 +1,33 @@
*~
*__pycache__
build/
docs/pypath_log/
docs/_build/
docs/biocypher-log/
docs/modules/
docs/notebooks/*.yaml
docs/notebooks/*.py
.DS_Store
.vscode
biocypher.egg-info/
*.egg
dist/
*.prof
*.coverage
*.pickle
out/*
biocypher-log/*
biocypher-out/*
*.log
dist/*
*.pye
*.pyc
*.kate-swp
.hypothesis/
.venv/
.empty
.pytest_cache
*.graphml
.idea/*
.cache
*.iml
3
.gitmodules
vendored
Normal file
@@ -0,0 +1,3 @@
[submodule "networkx-based"]
path = networkx-based
url = git@git.uni-greifswald.de:MeDaX/networkx-based.git
50
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
# See https://pre-commit.com for more information
|
||||
# See https://pre-commit.com/hooks.html for more hooks
|
||||
fail_fast: false
|
||||
default_language_version:
|
||||
python: python3
|
||||
default_stages:
|
||||
- commit
|
||||
- push
|
||||
minimum_pre_commit_version: 2.7.1
|
||||
repos:
|
||||
- repo: https://github.com/ambv/black
|
||||
rev: 23.7.0
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/timothycrosley/isort
|
||||
rev: 5.12.0
|
||||
hooks:
|
||||
- id: isort
|
||||
additional_dependencies: [toml]
|
||||
- repo: https://github.com/snok/pep585-upgrade
|
||||
rev: v1.0
|
||||
hooks:
|
||||
- id: upgrade-type-hints
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: check-docstring-first
|
||||
- id: end-of-file-fixer
|
||||
- id: check-added-large-files
|
||||
- id: mixed-line-ending
|
||||
- id: trailing-whitespace
|
||||
exclude: ^.bumpversion.cfg$
|
||||
- id: check-merge-conflict
|
||||
- id: check-case-conflict
|
||||
- id: check-symlinks
|
||||
- id: check-yaml
|
||||
args: [--unsafe]
|
||||
- id: check-ast
|
||||
- id: fix-encoding-pragma
|
||||
args: [--remove] # for Python3 codebase, it's not necessary
|
||||
- id: requirements-txt-fixer
|
||||
- repo: https://github.com/pre-commit/pygrep-hooks
|
||||
rev: v1.10.0
|
||||
hooks:
|
||||
- id: python-no-eval
|
||||
- id: python-use-type-annotations
|
||||
- id: python-check-blanket-noqa
|
||||
- id: rst-backticks
|
||||
- id: rst-directive-colons
|
||||
- id: rst-inline-touching-normal
|
36
Dockerfile
Normal file
@@ -0,0 +1,36 @@
FROM python:3.9-slim

WORKDIR /app

# Copy requirements file if you have one
COPY requirements.txt .

RUN pip install --upgrade packaging
RUN pip install -r requirements.txt
# Install poetry
RUN pip install --no-cache-dir "poetry<2.0.0"

# Copy .env file
COPY .env ./

# Copy only pyproject.toml and poetry.lock (if exists) first
COPY pyproject.toml ./
COPY poetry.lock* ./

# Configure poetry to not create a virtual environment inside the container
RUN poetry config virtualenvs.create false

# Install dependencies
RUN poetry install --no-dev --no-interaction --no-ansi

# Copy your project files
COPY . .

# Make the entrypoint script executable
COPY entrypoint.sh .
RUN chmod +x entrypoint.sh

RUN sed -i 's/\r$//' /app/entrypoint.sh
ENTRYPOINT ["/app/entrypoint.sh"]
22
LICENSE
Normal file
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2022 Saez Lab
Copyright (c) 2025 MeDaX research group

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
110
README.md
Normal file
@@ -0,0 +1,110 @@
|
||||
# MeDaX Pipeline
|
||||
|
||||
## 📋 Description
|
||||
The MeDaX pipeline transforms healthcare data from FHIR databases into Neo4j graph databases. This conversion enables efficient searching, querying, and analysis of interconnected health data that would otherwise be complex to retrieve from traditional SQL databases.
|
||||
|
||||
## ✨ Features
|
||||
- Seamless conversion from FHIR to Neo4j graph structure
|
||||
- Support for patient-centric data retrieval using FHIR's `$everything` operation
|
||||
- Configurable batch processing for handling large datasets
|
||||
- Docker-based deployment for easy setup and portability
|
||||
- Compatible with public FHIR servers (e.g., HAPI FHIR) and private authenticated instances
|
||||
|
||||
## ⚙️ Prerequisites
|
||||
- [Docker](https://docs.docker.com/engine/install/) with the [Docker Compose plugin](https://docs.docker.com/compose/install/linux/)
|
||||
- A FHIR database with API access and the `$everything` operation enabled for retrieving patient data (a quick connectivity check is sketched at the end of this section)
|
||||
- Alternatively: Use a public FHIR server such as [HAPI FHIR](https://hapi.fhir.org/) (default configuration)
|
||||
|
||||
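Before running the pipeline, it can help to verify that the FHIR endpoint is reachable and supports `$everything`. The sketch below uses the public HAPI server from the default configuration; the patient id is a placeholder you would take from the first response:

```bash
# list one patient on the server (public HAPI test server used as an example)
curl -s 'http://hapi.fhir.org/baseR4/Patient?_count=1'

# request the $everything bundle for a patient
# (<patient-id> is a placeholder -- use an id returned by the call above)
curl -s 'http://hapi.fhir.org/baseR4/Patient/<patient-id>/$everything'
```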
## 🚀 Installation
|
||||
|
||||
### Setup
|
||||
1. Clone this repository
|
||||
2. Create an environment configuration file `.env` (for example, by copying `.env.example`)
|
||||
3. Configure the environment variables in `.env`:
|
||||
- For HAPI test server (default): No changes needed
|
||||
- For custom FHIR server:
|
||||
- Set `MODE` to any value other than `testserver`
|
||||
- Uncomment and set the `FHIR_SERVER_URL`, `FHIR_SERVER_USER`, and `FHIR_SERVER_PW` variables (see the example below)
|
||||
- Adjust `BATCH_SIZE` and `NUMBER_OF_PATIENTS` according to your needs
|
||||
- Configure any required proxy settings
|
||||
|
||||
4. If needed, modify proxy settings in the `Dockerfile`
|
||||
- Uncomment and set proxy variables
|
||||
|
||||
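For example, a custom-server configuration could look like the sketch below (created e.g. by copying `.env.example` to `.env` and editing it). The variable names come from `.env.example` in this repository; the mode value, server URL, and credentials are placeholders:

```bash
# .env -- sketch for a custom, authenticated FHIR server (placeholder values)
MODE=custom
COMPLEX_PATIENTS=TRUE
FHIR_SERVER_URL=https://fhir.example.org/baseR4
FHIR_SERVER_USER=my_user
FHIR_SERVER_PW=my_password
# uncomment only if your network requires a proxy
#HTTP_PROXY=
#HTTPS_PROXY=
#NO_PROXY=
NUMBER_OF_PATIENTS=100
BATCH_SIZE=35
```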
### Running the Pipeline
|
||||
|
||||
**Start the containers:**
|
||||
```bash
|
||||
docker compose up --build
|
||||
```
|
||||
|
||||
**Stop and clean up (between runs):**
|
||||
```bash
|
||||
docker compose down --volumes
|
||||
```
|
||||
|
||||
**Complete removal (containers and images):**
|
||||
```bash
|
||||
docker compose down --volumes --rmi all
|
||||
```
|
||||
|
||||
> **Note:** Depending on your Docker installation, you might need to use `docker-compose` instead of `docker compose`.
|
||||
|
||||
## 🔍 Accessing the Neo4j Database
|
||||
|
||||
Once the pipeline has completed processing, you can access the Neo4j database:
|
||||
|
||||
1. Open your browser and navigate to `http://localhost:8080/`
|
||||
2. Connect using the following credentials:
|
||||
- Username: neo4j
|
||||
- Password: neo4j
|
||||
3. Set a new password and store it in a secure password manager
|
||||
|
||||
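As an optional check (not part of the documented workflow), you can also query the database from the command line with `cypher-shell` inside the Neo4j container; the container name below is a placeholder, look it up with `docker ps` first:

```bash
# <neo4j-container> and <your-new-password> are placeholders
docker exec -it <neo4j-container> cypher-shell -u neo4j -p '<your-new-password>' \
  'MATCH (n) RETURN count(n) AS nodes;'
```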
## 📊 Example Queries
|
||||
|
||||
Here are some basic Cypher queries to get you started with exploring your health data:
|
||||
|
||||
```cypher
|
||||
// Count all nodes by type
|
||||
MATCH (n) RETURN labels(n) as NodeType, count(*) as Count;
|
||||
|
||||
// Find all records for a specific patient
|
||||
MATCH (p:Patient {id: 'patient-id'})-[r]-(connected)
|
||||
RETURN p, r, connected;
|
||||
|
||||
// Retrieve all medication prescriptions
|
||||
MATCH (m:Medication)-[r]-(p:Patient)
|
||||
RETURN m, r, p;
|
||||
```
|
||||
|
||||
## ❓ Troubleshooting
|
||||
|
||||
**Common Issues:**
|
||||
|
||||
- **Connection refused to FHIR server**: Check your network settings and ensure the FHIR server is accessible from within the Docker container.
|
||||
- **Authentication failures**: Verify your credentials in the `.env` file.
|
||||
- **Container startup failures**: Ensure all required Docker ports are available and not used by other applications.
|
||||
- **No data found in FHIR bundle**: Ensure that the FHIR server is up and returning patient data. Try setting the `COMPLEX_PATIENTS` variable to `FALSE` in your `.env` file; some FHIR servers might not support the required FHIR search logic.
|
||||
|
||||
## 📚 Architecture
|
||||
|
||||
The MeDaX pipeline consists of the following components:
|
||||
|
||||
1. **FHIR Client**: Connects to the FHIR server and retrieves patient data
|
||||
2. **Data Transformer**: Converts FHIR resources into graph entities and relationships
|
||||
3. **Reference Processor**: Converts references to relationships
|
||||
4. **BioCypher Adapter**: Prepares the transformed data for Neo4j admin import
|
||||
5. **Neo4j Database**: Stores and serves the graph representation of the health data
|
||||
|
||||
## ✍️ Citation
|
||||
|
||||
If you use the MeDaX pipeline in your research, please cite DOI [10.5281/zenodo.15229077](https://doi.org/10.5281/zenodo.15229077).
|
||||
|
||||
## 🙏 Acknowledgements
|
||||
|
||||
- We are leveraging [BioCypher](https://biocypher.org) ([doi:10.1038/s41587-023-01848-y](https://doi.org/10.1038/s41587-023-01848-y)) to create the Neo4j admin import files.
|
||||
- Remark: We made slight adjustments to BioCypher's code to support batching.
|
||||
- We used BioCypher's git template as a starting point for our development:
|
||||
- Lobentanzer, S., BioCypher Consortium, & Saez-Rodriguez, J. Democratizing knowledge representation with BioCypher [Computer software]. https://github.com/biocypher/biocypher
|
||||
|
||||
|
41
biocypher/__init__.py
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher: a unifying framework for biomedical knowledge graphs.
|
||||
"""
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"__author__",
|
||||
"module_data",
|
||||
"config",
|
||||
"logfile",
|
||||
"log",
|
||||
"Driver",
|
||||
"BioCypher",
|
||||
"Resource",
|
||||
]
|
||||
|
||||
from ._get import Resource
|
||||
from ._core import BioCypher
|
||||
from ._config import config, module_data
|
||||
from ._logger import log, logger, logfile
|
||||
from ._metadata import __author__, __version__
|
||||
|
||||
|
||||
class Driver(BioCypher):
|
||||
# initialise parent class but log a warning
|
||||
def __init__(self, *args, **kwargs):
|
||||
logger.warning(
|
||||
"The class `Driver` is deprecated and will be removed in a future "
|
||||
"release. Please use `BioCypher` instead."
|
||||
)
|
||||
super().__init__(*args, **kwargs)
|
148
biocypher/_config/__init__.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Module data directory, including:
|
||||
|
||||
* The BioLink database schema
|
||||
* The default config files
|
||||
"""
|
||||
|
||||
from typing import Any, Optional
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import yaml
|
||||
import appdirs
|
||||
|
||||
__all__ = ["module_data", "module_data_path", "read_config", "config", "reset"]
|
||||
|
||||
_USER_CONFIG_DIR = appdirs.user_config_dir("biocypher", "saezlab")
|
||||
_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, "conf.yaml")
|
||||
|
||||
|
||||
class MyLoader(yaml.SafeLoader):
|
||||
def construct_scalar(self, node):
|
||||
# Check if the scalar contains double quotes and an escape sequence
|
||||
value = super().construct_scalar(node)
|
||||
q = bool(node.style == '"')
|
||||
b = bool("\\" in value.encode("unicode_escape").decode("utf-8"))
|
||||
if q and b:
|
||||
warnings.warn(
|
||||
(
|
||||
"Double quotes detected in YAML configuration scalar: "
|
||||
f"{value.encode('unicode_escape')}. "
|
||||
"These allow escape sequences and may cause problems, for "
|
||||
"instance with the Neo4j admin import files (e.g. '\\t'). "
|
||||
"Make sure you wanted to do this, and use single quotes "
|
||||
"whenever possible."
|
||||
),
|
||||
category=UserWarning,
|
||||
)
|
||||
return value
|
||||
|
||||
|
||||
def module_data_path(name: str) -> str:
|
||||
"""
|
||||
Absolute path to a YAML file shipped with the module.
|
||||
"""
|
||||
|
||||
here = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
return os.path.join(here, f"{name}.yaml")
|
||||
|
||||
|
||||
def module_data(name: str) -> Any:
|
||||
"""
|
||||
Retrieve the contents of a YAML file shipped with this module.
|
||||
"""
|
||||
|
||||
path = module_data_path(name)
|
||||
|
||||
return _read_yaml(path)
|
||||
|
||||
|
||||
def _read_yaml(path: str) -> Optional[dict]:
|
||||
if os.path.exists(path):
|
||||
with open(path, "r") as fp:
|
||||
return yaml.load(fp.read(), Loader=MyLoader)
|
||||
|
||||
|
||||
def read_config() -> dict:
|
||||
"""
|
||||
Read the module config.
|
||||
|
||||
Read and merge the built-in default, the user level and directory level
|
||||
configuration, with the latter taking precedence over the former.
|
||||
|
||||
TODO explain path configuration
|
||||
"""
|
||||
|
||||
defaults = module_data("biocypher_config")
|
||||
user = _read_yaml(_USER_CONFIG_FILE) or {}
|
||||
# TODO account for .yml?
|
||||
local = (
|
||||
_read_yaml("biocypher_config.yaml")
|
||||
or _read_yaml("config/biocypher_config.yaml")
|
||||
or {}
|
||||
)
|
||||
|
||||
for key in defaults:
|
||||
value = (
|
||||
local[key] if key in local else user[key] if key in user else None
|
||||
)
|
||||
|
||||
if value is not None:
|
||||
if isinstance(
|
||||
defaults[key], str
|
||||
): # first level config (like title)
|
||||
defaults[key] = value
|
||||
else:
|
||||
defaults[key].update(value)
|
||||
|
||||
return defaults
|
||||
|
||||
|
||||
def config(*args, **kwargs) -> Optional[Any]:
|
||||
"""
|
||||
Set or get module config parameters.
|
||||
"""
|
||||
|
||||
if args and kwargs:
|
||||
raise ValueError(
|
||||
"Setting and getting values in the same call is not allowed.",
|
||||
)
|
||||
|
||||
if args:
|
||||
result = tuple(globals()["_config"].get(key, None) for key in args)
|
||||
|
||||
return result[0] if len(result) == 1 else result
|
||||
|
||||
for key, value in kwargs.items():
|
||||
globals()["_config"][key].update(value)
|
||||
|
||||
|
||||
def reset():
|
||||
"""
|
||||
Reload configuration from the config files.
|
||||
"""
|
||||
|
||||
globals()["_config"] = read_config()
|
||||
|
||||
|
||||
reset()
|
||||
|
||||
|
||||
def update_from_file(path: str):
|
||||
"""
|
||||
Update the module configuration from a YAML file.
|
||||
"""
|
||||
|
||||
config(**_read_yaml(path))
|
141
biocypher/_config/biocypher_config.yaml
Normal file
@@ -0,0 +1,141 @@
|
||||
Title: BioCypher python module configuration file
|
||||
|
||||
## Some options are not used by default. Uncomment them to use them.
|
||||
|
||||
biocypher:
|
||||
### Required parameters ###
|
||||
## DBMS type
|
||||
|
||||
dbms: neo4j
|
||||
|
||||
## Schema configuration
|
||||
|
||||
# schema_config_path: config/schema_config.yaml
|
||||
|
||||
## Offline mode: do not connect to a running DBMS instance
|
||||
## Can be used e.g. for writing batch import files
|
||||
|
||||
offline: true
|
||||
|
||||
## Strict mode: do not allow to create new nodes or relationships without
|
||||
## specifying source, version, and license parameters
|
||||
|
||||
strict_mode: false
|
||||
|
||||
## Ontology configuration
|
||||
|
||||
head_ontology:
|
||||
url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
|
||||
root_node: entity
|
||||
# switch_label_and_id: true
|
||||
|
||||
### Optional parameters ###
|
||||
|
||||
## Logging
|
||||
# Write log to disk
|
||||
log_to_disk: true
|
||||
|
||||
# Activate more granular logging
|
||||
debug: true
|
||||
|
||||
# Change the log directory
|
||||
# log_directory: biocypher-log
|
||||
|
||||
## Data output directory
|
||||
# output_directory: biocypher-out
|
||||
|
||||
## Resource cache directory
|
||||
# cache_directory: .cache
|
||||
|
||||
## Optional tail ontologies
|
||||
|
||||
# tail_ontologies:
|
||||
# so:
|
||||
# url: test/ontologies/so.owl
|
||||
# head_join_node: sequence variant
|
||||
# tail_join_node: sequence_variant
|
||||
# switch_label_and_id: true
|
||||
# mondo:
|
||||
# url: test/ontologies/mondo.owl
|
||||
# head_join_node: disease
|
||||
# tail_join_node: disease
|
||||
# switch_label_and_id: true
|
||||
|
||||
### DBMS configuration ###
|
||||
|
||||
neo4j:
|
||||
### Neo4j configuration ###
|
||||
## Database name
|
||||
|
||||
database_name: neo4j
|
||||
|
||||
## Wipe DB before import (offline mode: --force)
|
||||
|
||||
wipe: true
|
||||
|
||||
## Neo4j authentication
|
||||
|
||||
uri: neo4j://localhost:7687
|
||||
user: neo4j
|
||||
password: neo4j
|
||||
|
||||
## Neo4j admin import batch writer settings
|
||||
|
||||
delimiter: ";"
|
||||
array_delimiter: "|"
|
||||
quote_character: "'"
|
||||
|
||||
## MultiDB functionality
|
||||
## Set to false for using community edition or older versions of Neo4j
|
||||
|
||||
multi_db: true
|
||||
|
||||
## Import options
|
||||
|
||||
skip_duplicate_nodes: false
|
||||
skip_bad_relationships: false
|
||||
|
||||
## Import call prefixes
|
||||
|
||||
# import_call_bin_prefix: bin/
|
||||
# import_call_file_prefix: path/to/files/
|
||||
|
||||
postgresql:
|
||||
### PostgreSQL configuration ###
|
||||
|
||||
# PostgreSQL connection credentials
|
||||
database_name: postgres # DB name
|
||||
user: postgres # user name
|
||||
password: postgres # password
|
||||
host: localhost # host
|
||||
port: 5432 # port
|
||||
|
||||
# PostgreSQL import batch writer settings
|
||||
quote_character: '"'
|
||||
delimiter: '\t'
|
||||
# import_call_bin_prefix: '' # path to "psql"
|
||||
# import_call_file_prefix: '/path/to/files'
|
||||
|
||||
rdf:
|
||||
### RDF configuration ###
|
||||
rdf_format: turtle
|
||||
|
||||
sqlite:
|
||||
### SQLite configuration ###
|
||||
|
||||
# SQLite connection credentials
|
||||
database_name: sqlite.db # DB name
|
||||
|
||||
# SQLite import batch writer settings
|
||||
quote_character: '"'
|
||||
delimiter: '\t'
|
||||
# import_call_bin_prefix: '' # path to "sqlite3"
|
||||
# import_call_file_prefix: '/path/to/files'
|
||||
|
||||
csv:
|
||||
### CSV/Pandas configuration ###
|
||||
delimiter: ","
|
||||
|
||||
networkx:
|
||||
### NetworkX configuration ###
|
||||
some_config: some_value # placeholder for technical reasons TODO
|
5
biocypher/_config/test_config.yaml
Normal file
@@ -0,0 +1,5 @@
# We test the quote detection

valid: 'This is a valid string'
also_valid: "This is also a valid string"
invalid: "\t"
140
biocypher/_config/test_schema_config.yaml
Normal file
@@ -0,0 +1,140 @@
|
||||
Title: BioCypher graph schema configuration file
|
||||
|
||||
# ---
|
||||
# "Named Things"
|
||||
# ---
|
||||
|
||||
protein:
|
||||
represented_as: node
|
||||
preferred_id: uniprot
|
||||
input_label: protein
|
||||
db_collection_name: proteins
|
||||
properties:
|
||||
name: str
|
||||
score: float
|
||||
taxon: int
|
||||
genes: str[]
|
||||
|
||||
microRNA:
|
||||
represented_as: node
|
||||
preferred_id: mirbase.mature
|
||||
input_label: mirna
|
||||
|
||||
complex:
|
||||
synonym_for: macromolecular complex
|
||||
represented_as: node
|
||||
preferred_id: complexportal
|
||||
input_label: complex
|
||||
|
||||
pathway:
|
||||
represented_as: node
|
||||
preferred_id: [reactome, wikipathways]
|
||||
input_label: [reactome, wikipathways]
|
||||
|
||||
gene:
|
||||
represented_as: node
|
||||
preferred_id: hgnc
|
||||
input_label: [hgnc, ensg]
|
||||
exclude_properties: accession
|
||||
|
||||
disease:
|
||||
represented_as: node
|
||||
preferred_id: doid
|
||||
input_label: Disease
|
||||
|
||||
side effect:
|
||||
is_a: phenotypic feature
|
||||
represented_as: node
|
||||
preferred_id: sider.effect
|
||||
input_label: sider
|
||||
|
||||
sequence variant:
|
||||
represented_as: node
|
||||
preferred_id: [clinically relevant, known, somatic]
|
||||
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
|
||||
properties:
|
||||
source: str
|
||||
original_source: str
|
||||
effect: str
|
||||
biotype: str
|
||||
|
||||
snRNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: [intact, rnacentral]
|
||||
input_label: [intact_snrna, rnacentral_snrna]
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
exclude_properties: sequence
|
||||
|
||||
DNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: ensembl
|
||||
input_label: dna
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
sequence: str
|
||||
|
||||
dsDNA sequence:
|
||||
is_a: [DNA sequence, nucleic acid entity]
|
||||
inherit_properties: True
|
||||
represented_as: node
|
||||
preferred_id: [intact, uniparc]
|
||||
input_label: [intact_dsdna, uniprot_archive_dsdna]
|
||||
|
||||
# ---
|
||||
# Associations
|
||||
# ---
|
||||
|
||||
post translational interaction:
|
||||
is_a: pairwise molecular interaction
|
||||
represented_as: node
|
||||
label_as_edge: INTERACTS_POST_TRANSLATIONAL
|
||||
input_label: post_translational
|
||||
|
||||
phosphorylation:
|
||||
is_a: post translational interaction
|
||||
represented_as: edge
|
||||
input_label: phosphorylation
|
||||
|
||||
gene to disease association:
|
||||
represented_as: edge
|
||||
label_as_edge: PERTURBED_IN_DISEASE
|
||||
input_label: [protein_disease, gene_disease]
|
||||
exclude_properties: accession
|
||||
|
||||
mutation to tissue association:
|
||||
is_a: [genotype to tissue association, entity to tissue association, association]
|
||||
represented_as: edge
|
||||
label_as_edge: Is_Mutated_In
|
||||
input_label: Gene_Is_Mutated_In_Cell_Tissue
|
||||
|
||||
variant to gene association: # -> Known.... and Somatic....
|
||||
represented_as: edge
|
||||
source: [known.sequence variant, somatic.sequence variant]
|
||||
target: gene
|
||||
input_label: [
|
||||
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
|
||||
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
|
||||
]
|
||||
|
||||
gene to gene association:
|
||||
represented_as: edge
|
||||
input_label: gene_gene
|
||||
properties:
|
||||
directional: bool
|
||||
curated: bool
|
||||
score: float
|
||||
id: str # should be removed
|
||||
|
||||
gene to variant association: # should be removed
|
||||
is_a: gene to variant association
|
||||
represented_as: edge
|
||||
input_label: gene_variant
|
3
biocypher/_config/test_schema_config_disconnected.yaml
Normal file
@@ -0,0 +1,3 @@
disconnected:
  represented_as: node
  label_in_input: disconnected
152
biocypher/_config/test_schema_config_extended.yaml
Normal file
@@ -0,0 +1,152 @@
|
||||
Title: BioCypher graph schema configuration file
|
||||
|
||||
# ---
|
||||
# "Named Things"
|
||||
# ---
|
||||
|
||||
protein:
|
||||
represented_as: node
|
||||
preferred_id: uniprot
|
||||
input_label: protein
|
||||
db_collection_name: proteins
|
||||
properties:
|
||||
name: str
|
||||
score: float
|
||||
taxon: int
|
||||
genes: str[]
|
||||
|
||||
microRNA:
|
||||
represented_as: node
|
||||
preferred_id: mirbase.mature
|
||||
input_label: mirna
|
||||
|
||||
complex:
|
||||
synonym_for: macromolecular complex
|
||||
represented_as: node
|
||||
preferred_id: complexportal
|
||||
input_label: complex
|
||||
|
||||
pathway:
|
||||
represented_as: node
|
||||
preferred_id: [reactome, wikipathways]
|
||||
input_label: [reactome, wikipathways]
|
||||
|
||||
gene:
|
||||
represented_as: node
|
||||
preferred_id: hgnc
|
||||
input_label: [hgnc, ensg]
|
||||
exclude_properties: accession
|
||||
|
||||
disease:
|
||||
represented_as: node
|
||||
preferred_id: doid
|
||||
input_label: Disease
|
||||
|
||||
side effect:
|
||||
is_a: phenotypic feature
|
||||
represented_as: node
|
||||
preferred_id: sider.effect
|
||||
input_label: sider
|
||||
|
||||
sequence variant:
|
||||
represented_as: node
|
||||
preferred_id: [clinically relevant, known, somatic]
|
||||
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
|
||||
properties:
|
||||
source: str
|
||||
original_source: str
|
||||
effect: str
|
||||
biotype: str
|
||||
|
||||
altered gene product level:
|
||||
represented_as: node
|
||||
input_label: agpl
|
||||
|
||||
decreased gene product level:
|
||||
represented_as: node
|
||||
input_label: agpl_decreased
|
||||
|
||||
lethal variant:
|
||||
represented_as: node
|
||||
input_label: lethal
|
||||
|
||||
snRNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: [intact, rnacentral]
|
||||
input_label: [intact_snrna, rnacentral_snrna]
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
exclude_properties: sequence
|
||||
|
||||
DNA sequence:
|
||||
is_a: nucleic acid entity
|
||||
represented_as: node
|
||||
preferred_id: ensembl
|
||||
input_label: dna
|
||||
properties:
|
||||
ac: str
|
||||
fullName: str
|
||||
shortName: str
|
||||
preferredName: str
|
||||
sequence: str
|
||||
|
||||
dsDNA sequence:
|
||||
is_a: [DNA sequence, nucleic acid entity]
|
||||
inherit_properties: True
|
||||
represented_as: node
|
||||
preferred_id: [intact, uniparc]
|
||||
input_label: [intact_dsdna, uniprot_archive_dsdna]
|
||||
|
||||
# ---
|
||||
# Associations
|
||||
# ---
|
||||
|
||||
post translational interaction:
|
||||
is_a: pairwise molecular interaction
|
||||
represented_as: node
|
||||
label_as_edge: INTERACTS_POST_TRANSLATIONAL
|
||||
input_label: post_translational
|
||||
|
||||
phosphorylation:
|
||||
is_a: post translational interaction
|
||||
represented_as: edge
|
||||
use_id: false
|
||||
input_label: phosphorylation
|
||||
|
||||
gene to disease association:
|
||||
represented_as: edge
|
||||
label_as_edge: PERTURBED_IN_DISEASE
|
||||
input_label: [protein_disease, gene_disease]
|
||||
exclude_properties: accession
|
||||
|
||||
mutation to tissue association:
|
||||
is_a: [genotype to tissue association, entity to tissue association, association]
|
||||
represented_as: edge
|
||||
label_as_edge: Is_Mutated_In
|
||||
input_label: Gene_Is_Mutated_In_Cell_Tissue
|
||||
|
||||
variant to gene association: # -> Known.... and Somatic....
|
||||
represented_as: edge
|
||||
source: [known.sequence variant, somatic.sequence variant]
|
||||
target: gene
|
||||
input_label: [
|
||||
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
|
||||
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
|
||||
]
|
||||
|
||||
gene to gene association:
|
||||
represented_as: edge
|
||||
input_label: gene_gene
|
||||
properties:
|
||||
directional: bool
|
||||
curated: bool
|
||||
score: float
|
||||
|
||||
gene to variant association:
|
||||
is_a: gene to variant association
|
||||
represented_as: edge
|
||||
input_label: gene_variant
|
734
biocypher/_core.py
Normal file
@@ -0,0 +1,734 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher core module. Interfaces with the user and distributes tasks to
|
||||
submodules.
|
||||
"""
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
import os
|
||||
import json
|
||||
|
||||
from more_itertools import peekable
|
||||
import yaml
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from ._get import Downloader
|
||||
from ._config import config as _config
|
||||
from ._config import update_from_file as _file_update
|
||||
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
from ._mapping import OntologyMapping
|
||||
from ._ontology import Ontology
|
||||
from ._translate import Translator
|
||||
from ._deduplicate import Deduplicator
|
||||
from .output.in_memory._pandas import Pandas
|
||||
from .output.write._get_writer import DBMS_TO_CLASS, get_writer
|
||||
from .output.connect._neo4j_driver import get_driver
|
||||
|
||||
__all__ = ["BioCypher"]
|
||||
|
||||
SUPPORTED_DBMS = DBMS_TO_CLASS.keys()
|
||||
|
||||
REQUIRED_CONFIG = [
|
||||
"dbms",
|
||||
"offline",
|
||||
"strict_mode",
|
||||
"head_ontology",
|
||||
]
|
||||
|
||||
|
||||
class BioCypher:
|
||||
"""
|
||||
Orchestration of BioCypher operations. Instantiate this class to interact
|
||||
with BioCypher.
|
||||
|
||||
Args:
|
||||
|
||||
dbms (str): The database management system to use. For supported
|
||||
systems see SUPPORTED_DBMS.
|
||||
|
||||
offline (bool): Whether to run in offline mode. If True, no
|
||||
connection to the database will be made.
|
||||
|
||||
strict_mode (bool): Whether to run in strict mode. If True, the
|
||||
translator will raise an error if a node or edge does not
|
||||
provide source, version, and licence information.
|
||||
|
||||
biocypher_config_path (str): Path to the BioCypher config file.
|
||||
|
||||
schema_config_path (str): Path to the user schema config
|
||||
file.
|
||||
|
||||
head_ontology (dict): The head ontology defined by URL ('url') and root
|
||||
node ('root_node').
|
||||
|
||||
tail_ontologies (dict): The tail ontologies defined by URL and
|
||||
join nodes for both head and tail ontology.
|
||||
|
||||
output_directory (str): Path to the output directory. If not
|
||||
provided, the default value 'biocypher-out' will be used.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dbms: str = None,
|
||||
offline: bool = None,
|
||||
strict_mode: bool = None,
|
||||
biocypher_config_path: str = None,
|
||||
schema_config_path: str = None,
|
||||
head_ontology: dict = None,
|
||||
tail_ontologies: dict = None,
|
||||
output_directory: str = None,
|
||||
cache_directory: str = None,
|
||||
# legacy params
|
||||
db_name: str = None,
|
||||
):
|
||||
# Update configuration if custom path is provided
|
||||
if biocypher_config_path:
|
||||
_file_update(biocypher_config_path)
|
||||
|
||||
if db_name:
|
||||
logger.warning(
|
||||
"The parameter `db_name` is deprecated. Please set the "
|
||||
"`database_name` setting in the `biocypher_config.yaml` file "
|
||||
"instead."
|
||||
)
|
||||
_config(**{db_name: {"database_name": db_name}})
|
||||
|
||||
# Load configuration
|
||||
self.base_config = _config("biocypher")
|
||||
|
||||
# Check for required configuration
|
||||
for key in REQUIRED_CONFIG:
|
||||
if key not in self.base_config:
|
||||
raise ValueError(f"Configuration key {key} is required.")
|
||||
|
||||
# Set configuration - mandatory
|
||||
self._dbms = dbms or self.base_config["dbms"]
|
||||
|
||||
if offline is None:
|
||||
self._offline = self.base_config["offline"]
|
||||
else:
|
||||
self._offline = offline
|
||||
|
||||
if strict_mode is None:
|
||||
self._strict_mode = self.base_config["strict_mode"]
|
||||
else:
|
||||
self._strict_mode = strict_mode
|
||||
|
||||
self._schema_config_path = schema_config_path or self.base_config.get(
|
||||
"schema_config_path"
|
||||
)
|
||||
|
||||
if not self._schema_config_path:
|
||||
logger.warning("Running BioCypher without schema configuration.")
|
||||
else:
|
||||
logger.info(
|
||||
f"Running BioCypher with schema configuration from {self._schema_config_path}."
|
||||
)
|
||||
|
||||
self._head_ontology = head_ontology or self.base_config["head_ontology"]
|
||||
|
||||
# Set configuration - optional
|
||||
self._output_directory = output_directory or self.base_config.get(
|
||||
"output_directory"
|
||||
)
|
||||
self._cache_directory = cache_directory or self.base_config.get(
|
||||
"cache_directory"
|
||||
)
|
||||
self._tail_ontologies = tail_ontologies or self.base_config.get(
|
||||
"tail_ontologies"
|
||||
)
|
||||
|
||||
if self._dbms not in SUPPORTED_DBMS:
|
||||
raise ValueError(
|
||||
f"DBMS {self._dbms} not supported. "
|
||||
f"Please select from {SUPPORTED_DBMS}."
|
||||
)
|
||||
|
||||
# Initialize
|
||||
self._ontology_mapping = None
|
||||
self._deduplicator = None
|
||||
self._translator = None
|
||||
self._downloader = None
|
||||
self._ontology = None
|
||||
self._writer = None
|
||||
self._pd = None
|
||||
|
||||
def _get_deduplicator(self) -> Deduplicator:
|
||||
"""
|
||||
Create deduplicator if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._deduplicator:
|
||||
self._deduplicator = Deduplicator()
|
||||
|
||||
return self._deduplicator
|
||||
|
||||
def _get_ontology_mapping(self) -> OntologyMapping:
|
||||
"""
|
||||
Create ontology mapping if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._schema_config_path:
|
||||
self._ontology_mapping = OntologyMapping()
|
||||
|
||||
if not self._ontology_mapping:
|
||||
self._ontology_mapping = OntologyMapping(
|
||||
config_file=self._schema_config_path,
|
||||
)
|
||||
|
||||
return self._ontology_mapping
|
||||
|
||||
def _get_ontology(self) -> Ontology:
|
||||
"""
|
||||
Create ontology if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._ontology:
|
||||
self._ontology = Ontology(
|
||||
ontology_mapping=self._get_ontology_mapping(),
|
||||
head_ontology=self._head_ontology,
|
||||
tail_ontologies=self._tail_ontologies,
|
||||
)
|
||||
|
||||
return self._ontology
|
||||
|
||||
def _get_translator(self) -> Translator:
|
||||
"""
|
||||
Create translator if not exists and return.
|
||||
"""
|
||||
|
||||
if not self._translator:
|
||||
self._translator = Translator(
|
||||
ontology=self._get_ontology(),
|
||||
strict_mode=self._strict_mode,
|
||||
)
|
||||
|
||||
return self._translator
|
||||
|
||||
def _get_writer(self):
|
||||
"""
|
||||
Create writer if not online. Set as instance variable `self._writer`.
|
||||
"""
|
||||
|
||||
if self._offline:
|
||||
timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
|
||||
outdir = self._output_directory or os.path.join(
|
||||
"biocypher-out", timestamp()
|
||||
)
|
||||
self._output_directory = os.path.abspath(outdir)
|
||||
|
||||
self._writer = get_writer(
|
||||
dbms=self._dbms,
|
||||
translator=self._get_translator(),
|
||||
deduplicator=self._get_deduplicator(),
|
||||
output_directory=self._output_directory,
|
||||
strict_mode=self._strict_mode,
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError("Cannot get writer in online mode.")
|
||||
|
||||
def _get_driver(self):
|
||||
"""
|
||||
Create driver if not exists. Set as instance variable `self._driver`.
|
||||
"""
|
||||
|
||||
if not self._offline:
|
||||
self._driver = get_driver(
|
||||
dbms=self._dbms,
|
||||
translator=self._get_translator(),
|
||||
deduplicator=self._get_deduplicator(),
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError("Cannot get driver in offline mode.")
|
||||
|
||||
def write_nodes(
|
||||
self, nodes, batch_size: int = int(1e6), force: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Write nodes to database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherNode`` objects) or an iterable of
|
||||
``BioCypherNode`` objects.
|
||||
|
||||
Args:
|
||||
nodes (iterable): An iterable of nodes to write to the database.
|
||||
|
||||
batch_size (int): The batch size to use when writing to disk.
|
||||
|
||||
force (bool): Whether to force writing to the output directory even
|
||||
if the node type is not present in the schema config file.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._writer:
|
||||
self._get_writer()
|
||||
|
||||
nodes = peekable(nodes)
|
||||
if not isinstance(nodes.peek(), BioCypherNode):
|
||||
tnodes = self._translator.translate_nodes(nodes)
|
||||
else:
|
||||
tnodes = nodes
|
||||
# write node files
|
||||
return self._writer.write_nodes(
|
||||
tnodes, batch_size=batch_size, force=force
|
||||
)
|
||||
|
||||
def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
|
||||
"""
|
||||
Write edges to database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherEdge`` objects) or an iterable of
|
||||
``BioCypherEdge`` objects.
|
||||
|
||||
Args:
|
||||
edges (iterable): An iterable of edges to write to the database.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._writer:
|
||||
self._get_writer()
|
||||
|
||||
edges = peekable(edges)
|
||||
if not isinstance(edges.peek(), BioCypherEdge):
|
||||
tedges = self._translator.translate_edges(edges)
|
||||
else:
|
||||
tedges = edges
|
||||
# write edge files
|
||||
return self._writer.write_edges(tedges, batch_size=batch_size)
|
||||
|
||||
def to_df(self) -> list[pd.DataFrame]:
|
||||
"""
|
||||
Convert entities to a pandas DataFrame for each entity type and return
|
||||
a list.
|
||||
|
||||
Args:
|
||||
entities (iterable): An iterable of entities to convert to a
|
||||
DataFrame.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: A pandas DataFrame.
|
||||
"""
|
||||
if not self._pd:
|
||||
raise ValueError(
|
||||
"No pandas instance found. Please call `add()` first."
|
||||
)
|
||||
|
||||
return self._pd.dfs
|
||||
|
||||
def add(self, entities) -> None:
|
||||
"""
|
||||
Function to add entities to the in-memory database. Accepts an iterable
|
||||
of tuples (if given, translates to ``BioCypherNode`` or
|
||||
``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
|
||||
``BioCypherEdge`` objects.
|
||||
|
||||
Args:
|
||||
entities (iterable): An iterable of entities to add to the database.
|
||||
Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
|
||||
4-tuples for edges (deprecated).
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
if not self._pd:
|
||||
self._pd = Pandas(
|
||||
translator=self._get_translator(),
|
||||
deduplicator=self._get_deduplicator(),
|
||||
)
|
||||
|
||||
entities = peekable(entities)
|
||||
|
||||
if (
|
||||
isinstance(entities.peek(), BioCypherNode)
|
||||
or isinstance(entities.peek(), BioCypherEdge)
|
||||
or isinstance(entities.peek(), BioCypherRelAsNode)
|
||||
):
|
||||
tentities = entities
|
||||
elif len(entities.peek()) < 4:
|
||||
tentities = self._translator.translate_nodes(entities)
|
||||
else:
|
||||
tentities = self._translator.translate_edges(entities)
|
||||
|
||||
self._pd.add_tables(tentities)
|
||||
|
||||
def add_nodes(self, nodes) -> None:
|
||||
"""
|
||||
Wrapper for ``add()`` to add nodes to the in-memory database.
|
||||
|
||||
Args:
|
||||
nodes (iterable): An iterable of node tuples to add to the database.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
self.add(nodes)
|
||||
|
||||
def add_edges(self, edges) -> None:
|
||||
"""
|
||||
Wrapper for ``add()`` to add edges to the in-memory database.
|
||||
|
||||
Args:
|
||||
edges (iterable): An iterable of edge tuples to add to the database.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
self.add(edges)
|
||||
|
||||
def merge_nodes(self, nodes) -> bool:
|
||||
"""
|
||||
Merge nodes into database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherNode`` objects) or an iterable of
|
||||
``BioCypherNode`` objects.
|
||||
|
||||
Args:
|
||||
nodes (iterable): An iterable of nodes to merge into the database.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._driver:
|
||||
self._get_driver()
|
||||
|
||||
nodes = peekable(nodes)
|
||||
if not isinstance(nodes.peek(), BioCypherNode):
|
||||
tnodes = self._translator.translate_nodes(nodes)
|
||||
else:
|
||||
tnodes = nodes
|
||||
# write node files
|
||||
return self._driver.add_biocypher_nodes(tnodes)
|
||||
|
||||
def merge_edges(self, edges) -> bool:
|
||||
"""
|
||||
Merge edges into database. Either takes an iterable of tuples (if given,
|
||||
translates to ``BioCypherEdge`` objects) or an iterable of
|
||||
``BioCypherEdge`` objects.
|
||||
|
||||
Args:
|
||||
edges (iterable): An iterable of edges to merge into the database.
|
||||
|
||||
Returns:
|
||||
bool: True if successful.
|
||||
"""
|
||||
|
||||
if not self._driver:
|
||||
self._get_driver()
|
||||
|
||||
edges = peekable(edges)
|
||||
if not isinstance(edges.peek(), BioCypherEdge):
|
||||
tedges = self._translator.translate_edges(edges)
|
||||
else:
|
||||
tedges = edges
|
||||
# write edge files
|
||||
return self._driver.add_biocypher_edges(tedges)
|
||||
|
||||
# DOWNLOAD AND CACHE MANAGEMENT METHODS ###
|
||||
|
||||
def _get_downloader(self, cache_dir: Optional[str] = None):
|
||||
"""
|
||||
Create downloader if not exists.
|
||||
"""
|
||||
|
||||
if not self._downloader:
|
||||
self._downloader = Downloader(self._cache_directory)
|
||||
|
||||
def download(self, *resources) -> None:
|
||||
"""
|
||||
Use the :class:`Downloader` class to download or load from cache the
|
||||
resources given by the adapter.
|
||||
"""
|
||||
|
||||
self._get_downloader()
|
||||
return self._downloader.download(*resources)
|
||||
|
||||
# OVERVIEW AND CONVENIENCE METHODS ###
|
||||
|
||||
def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
|
||||
"""
|
||||
|
||||
Get the set of input labels encountered without an entry in the
|
||||
`schema_config.yaml` and print them to the logger.
|
||||
|
||||
Returns:
|
||||
|
||||
Optional[Dict[str, List[str]]]: A dictionary of Biolink types
|
||||
encountered without an entry in the `schema_config.yaml` file.
|
||||
|
||||
"""
|
||||
|
||||
mt = self._translator.get_missing_biolink_types()
|
||||
|
||||
if mt:
|
||||
msg = (
|
||||
"Input entities not accounted for due to them not being "
|
||||
f"present in the schema configuration file {self._schema_config_path} "
|
||||
"(this is not necessarily a problem, if you did not intend "
|
||||
"to include them in the database; see the log for details): \n"
|
||||
)
|
||||
for k, v in mt.items():
|
||||
msg += f" {k}: {v} \n"
|
||||
|
||||
logger.info(msg)
|
||||
return mt
|
||||
|
||||
else:
|
||||
logger.info("No missing labels in input.")
|
||||
return None
|
||||
|
||||
def log_duplicates(self) -> None:
|
||||
"""
|
||||
Get the set of duplicate nodes and edges encountered and print them to
|
||||
the logger.
|
||||
"""
|
||||
|
||||
dn = self._deduplicator.get_duplicate_nodes()
|
||||
|
||||
if dn:
|
||||
ntypes = dn[0]
|
||||
nids = dn[1]
|
||||
|
||||
msg = "Duplicate node types encountered (IDs in log): \n"
|
||||
for typ in ntypes:
|
||||
msg += f" {typ}\n"
|
||||
|
||||
logger.info(msg)
|
||||
|
||||
idmsg = "Duplicate node IDs encountered: \n"
|
||||
for _id in nids:
|
||||
idmsg += f" {_id}\n"
|
||||
|
||||
logger.debug(idmsg)
|
||||
|
||||
else:
|
||||
logger.info("No duplicate nodes in input.")
|
||||
|
||||
de = self._deduplicator.get_duplicate_edges()
|
||||
|
||||
if de:
|
||||
etypes = de[0]
|
||||
eids = de[1]
|
||||
|
||||
msg = "Duplicate edge types encountered (IDs in log): \n"
|
||||
for typ in etypes:
|
||||
msg += f" {typ}\n"
|
||||
|
||||
logger.info(msg)
|
||||
|
||||
idmsg = "Duplicate edge IDs encountered: \n"
|
||||
for _id in eids:
|
||||
idmsg += f" {_id}\n"
|
||||
|
||||
logger.debug(idmsg)
|
||||
|
||||
else:
|
||||
logger.info("No duplicate edges in input.")
|
||||
|
||||
def show_ontology_structure(self, **kwargs) -> None:
|
||||
"""
|
||||
Show the ontology structure using treelib or write to GRAPHML file.
|
||||
|
||||
Args:
|
||||
|
||||
to_disk (str): If specified, the ontology structure will be saved
|
||||
to disk as a GRAPHML file, to be opened in your favourite
|
||||
graph visualisation tool.
|
||||
|
||||
full (bool): If True, the full ontology structure will be shown,
|
||||
including all nodes and edges. If False, only the nodes and
|
||||
edges that are relevant to the extended schema will be shown.
|
||||
"""
|
||||
|
||||
if not self._ontology:
|
||||
self._get_ontology()
|
||||
|
||||
return self._ontology.show_ontology_structure(**kwargs)
|
||||
|
||||
def write_import_call(self) -> str:
|
||||
"""
|
||||
Write a shell script to import the database depending on the chosen
|
||||
DBMS.
|
||||
|
||||
Returns:
|
||||
str: path toward the file holding the import call.
|
||||
"""
|
||||
|
||||
if not self._offline:
|
||||
raise NotImplementedError(
|
||||
"Cannot write import call in online mode."
|
||||
)
|
||||
|
||||
return self._writer.write_import_call()
|
||||
|
||||
def write_schema_info(self, as_node: bool = False) -> None:
|
||||
"""
|
||||
Write an extended schema info YAML file that extends the
|
||||
`schema_config.yaml` with run-time information of the built KG. For
|
||||
instance, it includes information on whether something is present in the actual
|
||||
knowledge graph, whether it is a relationship (which is important in the
|
||||
case of representing relationships as nodes) and the actual sources and
|
||||
targets of edges. Since this file can be used in place of the original
|
||||
`schema_config.yaml` file, it indicates that it is the extended schema
|
||||
by setting `is_schema_info` to `true`.
|
||||
|
||||
We start by using the `extended_schema` dictionary from the ontology
|
||||
class instance, which contains all expanded entities and relationships.
|
||||
The information of whether something is a relationship can be gathered
|
||||
from the deduplicator instance, which keeps track of all entities that
|
||||
have been seen.
|
||||
"""
|
||||
|
||||
if not self._offline:
|
||||
raise NotImplementedError(
|
||||
"Cannot write schema info in online mode."
|
||||
)
|
||||
|
||||
ontology = self._get_ontology()
|
||||
schema = ontology.mapping.extended_schema.copy()
|
||||
schema["is_schema_info"] = True
|
||||
|
||||
deduplicator = self._get_deduplicator()
|
||||
for node in deduplicator.entity_types:
|
||||
if node in schema.keys():
|
||||
schema[node]["present_in_knowledge_graph"] = True
|
||||
schema[node]["is_relationship"] = False
|
||||
else:
|
||||
logger.info(
|
||||
f"Node {node} not present in extended schema. "
|
||||
"Skipping schema info."
|
||||
)
|
||||
|
||||
# find 'label_as_edge' cases in schema entries
|
||||
changed_labels = {}
|
||||
for k, v in schema.items():
|
||||
if not isinstance(v, dict):
|
||||
continue
|
||||
if "label_as_edge" in v.keys():
|
||||
if v["label_as_edge"] in deduplicator.seen_relationships.keys():
|
||||
changed_labels[v["label_as_edge"]] = k
|
||||
|
||||
for edge in deduplicator.seen_relationships.keys():
|
||||
if edge in changed_labels.keys():
|
||||
edge = changed_labels[edge]
|
||||
if edge in schema.keys():
|
||||
schema[edge]["present_in_knowledge_graph"] = True
|
||||
schema[edge]["is_relationship"] = True
|
||||
# TODO information about source and target nodes
|
||||
else:
|
||||
logger.info(
|
||||
f"Edge {edge} not present in extended schema. "
|
||||
"Skipping schema info."
|
||||
)
|
||||
|
||||
# write to output directory as YAML file
|
||||
path = os.path.join(self._output_directory, "schema_info.yaml")
|
||||
with open(path, "w") as f:
|
||||
f.write(yaml.dump(schema))
|
||||
|
||||
if as_node:
|
||||
# write as node
|
||||
node = BioCypherNode(
|
||||
node_id="schema_info",
|
||||
node_label="schema_info",
|
||||
properties={"schema_info": json.dumps(schema)},
|
||||
)
|
||||
self.write_nodes([node], force=True)
|
||||
|
||||
# override import call with added schema info node
|
||||
self.write_import_call()
|
||||
|
||||
return schema
|
||||
|
||||
# TRANSLATION METHODS ###
|
||||
|
||||
def translate_term(self, term: str) -> str:
|
||||
"""
|
||||
Translate a term to its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
term (str): The term to translate.
|
||||
|
||||
Returns:
|
||||
str: The BioCypher equivalent of the term.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.translate_term(term)
|
||||
|
||||
def summary(self) -> None:
|
||||
"""
|
||||
Wrapper for showing ontology structure and logging duplicates and
|
||||
missing input types.
|
||||
"""
|
||||
|
||||
self.show_ontology_structure()
|
||||
self.log_duplicates()
|
||||
self.log_missing_input_labels()
|
||||
|
||||
def reverse_translate_term(self, term: str) -> str:
|
||||
"""
|
||||
Reverse translate a term from its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
term (str): The BioCypher term to reverse translate.
|
||||
|
||||
Returns:
|
||||
str: The original term.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.reverse_translate_term(term)
|
||||
|
||||
def translate_query(self, query: str) -> str:
|
||||
"""
|
||||
Translate a query to its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
query (str): The query to translate.
|
||||
|
||||
Returns:
|
||||
str: The BioCypher equivalent of the query.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.translate(query)
|
||||
|
||||
def reverse_translate_query(self, query: str) -> str:
|
||||
"""
|
||||
Reverse translate a query from its BioCypher equivalent.
|
||||
|
||||
Args:
|
||||
query (str): The BioCypher query to reverse translate.
|
||||
|
||||
Returns:
|
||||
str: The original query.
|
||||
"""
|
||||
|
||||
# instantiate adapter if not exists
|
||||
self.start_ontology()
|
||||
|
||||
return self._translator.reverse_translate(query)
|
356
biocypher/_create.py
Normal file
@@ -0,0 +1,356 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'create' module. Handles the creation of BioCypher node and edge
|
||||
dataclasses.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Union
|
||||
from dataclasses import field, dataclass
|
||||
import os
|
||||
|
||||
__all__ = [
|
||||
"BioCypherEdge",
|
||||
"BioCypherNode",
|
||||
"BioCypherRelAsNode",
|
||||
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BioCypherNode:
|
||||
"""
|
||||
Handoff class to represent biomedical entities as Neo4j nodes.
|
||||
|
||||
Has id, label, property dict; id and label (in the Neo4j sense of a
|
||||
label, ie, the entity descriptor after the colon, such as
|
||||
":Protein") are non-optional and called node_id and node_label to
|
||||
avoid confusion with "label" properties. Node labels are written in
|
||||
PascalCase and as nouns, as per Neo4j consensus.
|
||||
|
||||
Args:
|
||||
node_id (string): consensus "best" id for biological entity
|
||||
node_label (string): primary type of entity, capitalised
|
||||
**properties (kwargs): collection of all other properties to be
|
||||
passed to neo4j for the respective node (dict)
|
||||
|
||||
Todo:
|
||||
- check and correct small inconsistencies such as capitalisation
|
||||
of ID names ("uniprot" vs "UniProt")
|
||||
- check for correct ID patterns (eg "ENSG" + string of numbers,
|
||||
uniprot length)
|
||||
- ID conversion using pypath translation facilities for now
|
||||
"""
|
||||
|
||||
node_id: str
|
||||
node_label: str
|
||||
preferred_id: str = "id"
|
||||
properties: dict = field(default_factory=dict)
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
Add id field to properties.
|
||||
|
||||
Check for reserved keywords.
|
||||
|
||||
Replace unwanted characters in properties.
|
||||
"""
|
||||
self.properties["id"] = self.node_id
|
||||
self.properties["preferred_id"] = self.preferred_id or None
|
||||
# TODO actually make None possible here; as is, "id" is the default in
|
||||
# the dataclass as well as in the configuration file
|
||||
|
||||
if ":TYPE" in self.properties.keys():
|
||||
logger.warning(
|
||||
"Keyword ':TYPE' is reserved for Neo4j. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties[":TYPE"]
|
||||
|
||||
for k, v in self.properties.items():
|
||||
if isinstance(v, str):
|
||||
self.properties[k] = (
|
||||
v.replace(
|
||||
os.linesep,
|
||||
" ",
|
||||
)
|
||||
.replace(
|
||||
"\n",
|
||||
" ",
|
||||
)
|
||||
.replace(
|
||||
"\r",
|
||||
" ",
|
||||
)
|
||||
)
|
||||
|
||||
elif isinstance(v, list):
|
||||
# modified BioCypher, because the data contained integers in lists
|
||||
self.properties[k] = [
|
||||
(str(val) if isinstance(val, (int, float)) else val)
|
||||
.replace(os.linesep, " ")
|
||||
.replace("\n", " ")
|
||||
.replace("\r", " ")
|
||||
for val in v
|
||||
]
|
||||
|
||||
def get_id(self) -> str:
|
||||
"""
|
||||
Returns primary node identifier.
|
||||
|
||||
Returns:
|
||||
str: node_id
|
||||
"""
|
||||
return self.node_id
|
||||
|
||||
def get_label(self) -> str:
|
||||
"""
|
||||
Returns primary node label.
|
||||
|
||||
Returns:
|
||||
str: node_label
|
||||
"""
|
||||
return self.node_label
|
||||
|
||||
def get_type(self) -> str:
|
||||
"""
|
||||
Returns primary node label.
|
||||
|
||||
Returns:
|
||||
str: node_label
|
||||
"""
|
||||
return self.node_label
|
||||
|
||||
def get_preferred_id(self) -> str:
|
||||
"""
|
||||
Returns preferred id.
|
||||
|
||||
Returns:
|
||||
str: preferred_id
|
||||
"""
|
||||
return self.preferred_id
|
||||
|
||||
def get_properties(self) -> dict:
|
||||
"""
|
||||
Returns all other node properties apart from primary id and
|
||||
label as key-value pairs.
|
||||
|
||||
Returns:
|
||||
dict: properties
|
||||
"""
|
||||
return self.properties
|
||||
|
||||
def get_dict(self) -> dict:
|
||||
"""
|
||||
Return dict of id, labels, and properties.
|
||||
|
||||
Returns:
|
||||
dict: node_id and node_label as top-level key-value pairs,
|
||||
properties as second-level dict.
|
||||
"""
|
||||
return {
|
||||
"node_id": self.node_id,
|
||||
"node_label": self.node_label,
|
||||
"properties": self.properties,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BioCypherEdge:
|
||||
"""
|
||||
Handoff class to represent biomedical relationships in Neo4j.
|
||||
|
||||
Has source and target ids, label, property dict; ids and label (in
|
||||
the Neo4j sense of a label, ie, the entity descriptor after the
|
||||
colon, such as ":TARGETS") are non-optional and called source_id,
|
||||
target_id, and relationship_label to avoid confusion with properties
|
||||
called "label", which usually denotes the human-readable form.
|
||||
Relationship labels are written in UPPERCASE and as verbs, as per
|
||||
Neo4j consensus.
|
||||
|
||||
Args:
|
||||
|
||||
source_id (string): consensus "best" id for biological entity
|
||||
|
||||
target_id (string): consensus "best" id for biological entity
|
||||
|
||||
relationship_label (string): type of interaction, UPPERCASE
|
||||
|
||||
properties (dict): collection of all other properties of the
|
||||
respective edge
|
||||
|
||||
"""
|
||||
|
||||
source_id: str
|
||||
target_id: str
|
||||
relationship_label: str
|
||||
relationship_id: str = None
|
||||
properties: dict = field(default_factory=dict)
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
Check for reserved keywords.
|
||||
"""
|
||||
|
||||
if ":TYPE" in self.properties.keys():
|
||||
logger.debug(
|
||||
"Keyword ':TYPE' is reserved for Neo4j. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties[":TYPE"]
|
||||
elif "id" in self.properties.keys():
|
||||
logger.debug(
|
||||
"Keyword 'id' is reserved for Neo4j. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties["id"]
|
||||
elif "_ID" in self.properties.keys():
|
||||
logger.debug(
|
||||
"Keyword '_ID' is reserved for Postgres. "
|
||||
"Removing from properties.",
|
||||
# "Renaming to 'type'."
|
||||
)
|
||||
# self.properties["type"] = self.properties[":TYPE"]
|
||||
del self.properties["_ID"]
|
||||
|
||||
def get_id(self) -> Union[str, None]:
|
||||
"""
|
||||
Returns the relationship identifier, or None if not set.
|
||||
|
||||
Returns:
|
||||
str: relationship_id
|
||||
"""
|
||||
|
||||
return self.relationship_id
|
||||
|
||||
def get_source_id(self) -> str:
|
||||
"""
|
||||
Returns primary node identifier of relationship source.
|
||||
|
||||
Returns:
|
||||
str: source_id
|
||||
"""
|
||||
return self.source_id
|
||||
|
||||
def get_target_id(self) -> str:
|
||||
"""
|
||||
Returns primary node identifier of relationship target.
|
||||
|
||||
Returns:
|
||||
str: target_id
|
||||
"""
|
||||
return self.target_id
|
||||
|
||||
def get_label(self) -> str:
|
||||
"""
|
||||
Returns relationship label.
|
||||
|
||||
Returns:
|
||||
str: relationship_label
|
||||
"""
|
||||
return self.relationship_label
|
||||
|
||||
def get_type(self) -> str:
|
||||
"""
|
||||
Returns relationship label.
|
||||
|
||||
Returns:
|
||||
str: relationship_label
|
||||
"""
|
||||
return self.relationship_label
|
||||
|
||||
def get_properties(self) -> dict:
|
||||
"""
|
||||
Returns all other relationship properties apart from primary ids
|
||||
and label as key-value pairs.
|
||||
|
||||
Returns:
|
||||
dict: properties
|
||||
"""
|
||||
return self.properties
|
||||
|
||||
def get_dict(self) -> dict:
|
||||
"""
|
||||
Return dict of ids, label, and properties.
|
||||
|
||||
Returns:
|
||||
dict: source_id, target_id and relationship_label as
|
||||
top-level key-value pairs, properties as second-level
|
||||
dict.
|
||||
"""
|
||||
return {
|
||||
"relationship_id": self.relationship_id or None,
|
||||
"source_id": self.source_id,
|
||||
"target_id": self.target_id,
|
||||
"relationship_label": self.relationship_label,
|
||||
"properties": self.properties,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BioCypherRelAsNode:
|
||||
"""
|
||||
Class to represent relationships as nodes (with in- and outgoing
|
||||
edges) as a triplet of a BioCypherNode and two BioCypherEdges. Main
|
||||
usage in type checking (instances where the receiving function needs
|
||||
to check whether it receives a relationship as a single edge or as
|
||||
a triplet).
|
||||
|
||||
Args:
|
||||
|
||||
node (BioCypherNode): node representing the relationship
|
||||
|
||||
source_edge (BioCypherEdge): edge representing the source of the
|
||||
relationship
|
||||
|
||||
target_edge (BioCypherEdge): edge representing the target of the
|
||||
relationship
|
||||
|
||||
"""
|
||||
|
||||
node: BioCypherNode
|
||||
source_edge: BioCypherEdge
|
||||
target_edge: BioCypherEdge
|
||||
|
||||
def __post_init__(self):
|
||||
if not isinstance(self.node, BioCypherNode):
|
||||
raise TypeError(
|
||||
f"BioCypherRelAsNode.node must be a BioCypherNode, "
|
||||
f"not {type(self.node)}.",
|
||||
)
|
||||
|
||||
if not isinstance(self.source_edge, BioCypherEdge):
|
||||
raise TypeError(
|
||||
f"BioCypherRelAsNode.source_edge must be a BioCypherEdge, "
|
||||
f"not {type(self.source_edge)}.",
|
||||
)
|
||||
|
||||
if not isinstance(self.target_edge, BioCypherEdge):
|
||||
raise TypeError(
|
||||
f"BioCypherRelAsNode.target_edge must be a BioCypherEdge, "
|
||||
f"not {type(self.target_edge)}.",
|
||||
)
|
||||
|
||||
def get_node(self) -> BioCypherNode:
|
||||
return self.node
|
||||
|
||||
def get_source_edge(self) -> BioCypherEdge:
|
||||
return self.source_edge
|
||||
|
||||
def get_target_edge(self) -> BioCypherEdge:
|
||||
return self.target_edge
|
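For orientation, a minimal usage sketch of the dataclasses defined above. This is illustrative only and not part of the committed file; the identifiers and labels are invented.

node = BioCypherNode(node_id="P12345", node_label="Protein", properties={"name": "example"})
edge = BioCypherEdge(source_id="P12345", target_id="P67890", relationship_label="INTERACTS_WITH")
# __post_init__ copies the primary id into the property dict
assert node.get_properties()["id"] == "P12345"
assert edge.get_dict()["relationship_label"] == "INTERACTS_WITH"

# relationships that carry their own node (e.g. reified interactions) are passed as a triplet
rel_as_node = BioCypherRelAsNode(
    node=BioCypherNode(node_id="interaction1", node_label="PostTranslationalInteraction"),
    source_edge=BioCypherEdge(source_id="P12345", target_id="interaction1", relationship_label="IS_SOURCE_OF"),
    target_edge=BioCypherEdge(source_id="P67890", target_id="interaction1", relationship_label="IS_TARGET_OF"),
)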
147
biocypher/_deduplicate.py
Normal file
@ -0,0 +1,147 @@
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
|
||||
|
||||
class Deduplicator:
|
||||
"""
|
||||
Singleton class responsible for deduplicating BioCypher inputs. Maintains
|
||||
sets/dictionaries of node and edge types and their unique identifiers.
|
||||
|
||||
Node identifiers should be globally unique (represented as a set), while
|
||||
edge identifiers are only unique per edge type (represented as a dict of
|
||||
sets, keyed by edge type).
|
||||
|
||||
Stores collection of duplicate node and edge identifiers and types for
|
||||
troubleshooting and to avoid overloading the log.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.seen_entity_ids = set()
|
||||
self.duplicate_entity_ids = set()
|
||||
|
||||
self.entity_types = set()
|
||||
self.duplicate_entity_types = set()
|
||||
|
||||
self.seen_relationships = {}
|
||||
self.duplicate_relationship_ids = set()
|
||||
self.duplicate_relationship_types = set()
|
||||
|
||||
def node_seen(self, entity: BioCypherNode) -> bool:
|
||||
"""
|
||||
Adds a node to the instance and checks if it has been seen before.
|
||||
|
||||
Args:
|
||||
entity: BioCypherNode to be added.
|
||||
|
||||
Returns:
|
||||
True if the node has been seen before, False otherwise.
|
||||
"""
|
||||
if entity.get_label() not in self.entity_types:
|
||||
self.entity_types.add(entity.get_label())
|
||||
|
||||
if entity.get_id() in self.seen_entity_ids:
|
||||
self.duplicate_entity_ids.add(entity.get_id())
|
||||
if entity.get_label() not in self.duplicate_entity_types:
|
||||
logger.warning(
|
||||
f"Duplicate node type {entity.get_label()} found. "
|
||||
)
|
||||
self.duplicate_entity_types.add(entity.get_label())
|
||||
return True
|
||||
|
||||
self.seen_entity_ids.add(entity.get_id())
|
||||
return False
|
||||
|
||||
def edge_seen(self, relationship: BioCypherEdge) -> bool:
|
||||
"""
|
||||
Adds an edge to the instance and checks if it has been seen before.
|
||||
|
||||
Args:
|
||||
relationship: BioCypherEdge to be added.
|
||||
|
||||
Returns:
|
||||
True if the edge has been seen before, False otherwise.
|
||||
"""
|
||||
if relationship.get_type() not in self.seen_relationships:
|
||||
self.seen_relationships[relationship.get_type()] = set()
|
||||
|
||||
# concatenate source and target if no id is present
|
||||
if not relationship.get_id():
|
||||
_id = (
|
||||
f"{relationship.get_source_id()}_{relationship.get_target_id()}"
|
||||
)
|
||||
else:
|
||||
_id = relationship.get_id()
|
||||
|
||||
if _id in self.seen_relationships[relationship.get_type()]:
|
||||
self.duplicate_relationship_ids.add(_id)
|
||||
if relationship.get_type() not in self.duplicate_relationship_types:
|
||||
logger.warning(
|
||||
f"Duplicate edge type {relationship.get_type()} found. "
|
||||
)
|
||||
self.duplicate_relationship_types.add(relationship.get_type())
|
||||
return True
|
||||
|
||||
self.seen_relationships[relationship.get_type()].add(_id)
|
||||
return False
|
||||
|
||||
def rel_as_node_seen(self, rel_as_node: BioCypherRelAsNode) -> bool:
|
||||
"""
|
||||
Adds a rel_as_node to the instance (one entity and two relationships)
|
||||
and checks if it has been seen before. Only the node is relevant for
|
||||
identifying the rel_as_node as a duplicate.
|
||||
|
||||
Args:
|
||||
rel_as_node: BioCypherRelAsNode to be added.
|
||||
|
||||
Returns:
|
||||
True if the rel_as_node has been seen before, False otherwise.
|
||||
"""
|
||||
node = rel_as_node.get_node()
|
||||
|
||||
if node.get_label() not in self.seen_relationships:
|
||||
self.seen_relationships[node.get_label()] = set()
|
||||
|
||||
# rel as node always has an id
|
||||
_id = node.get_id()
|
||||
|
||||
if _id in self.seen_relationships[node.get_type()]:
|
||||
self.duplicate_relationship_ids.add(_id)
|
||||
if node.get_type() not in self.duplicate_relationship_types:
|
||||
logger.warning(f"Duplicate edge type {node.get_type()} found. ")
|
||||
self.duplicate_relationship_types.add(node.get_type())
|
||||
return True
|
||||
|
||||
self.seen_relationships[node.get_type()].add(_id)
|
||||
return False
|
||||
|
||||
def get_duplicate_nodes(self):
|
||||
"""
|
||||
Return the types and identifiers of duplicate nodes, if any.
|
||||
|
||||
Returns:
|
||||
tuple: duplicate node types and duplicate node ids, or None if there are no duplicates
|
||||
"""
|
||||
|
||||
if self.duplicate_entity_types:
|
||||
return (self.duplicate_entity_types, self.duplicate_entity_ids)
|
||||
else:
|
||||
return None
|
||||
|
||||
def get_duplicate_edges(self):
|
||||
"""
|
||||
Return the types and identifiers of duplicate edges, if any.
|
||||
|
||||
Returns:
|
||||
tuple: duplicate edge types and duplicate edge ids, or None if there are no duplicates
|
||||
"""
|
||||
|
||||
if self.duplicate_relationship_types:
|
||||
return (
|
||||
self.duplicate_relationship_types,
|
||||
self.duplicate_relationship_ids,
|
||||
)
|
||||
else:
|
||||
return None
|
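An illustrative sketch of the intended Deduplicator call pattern (not part of the commit; the node id and label are invented):

dedup = Deduplicator()
first = BioCypherNode(node_id="P12345", node_label="Protein")
second = BioCypherNode(node_id="P12345", node_label="Protein")
assert dedup.node_seen(first) is False   # first occurrence is recorded
assert dedup.node_seen(second) is True   # repeated id is flagged; the type is logged once
assert dedup.get_duplicate_nodes() == ({"Protein"}, {"P12345"})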
443
biocypher/_get.py
Normal file
@ -0,0 +1,443 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher get module. Used to download and cache data from external sources.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
import shutil
|
||||
|
||||
import requests
|
||||
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from abc import ABC
|
||||
from datetime import datetime, timedelta
|
||||
from tempfile import TemporaryDirectory
|
||||
import os
|
||||
import json
|
||||
import ftplib
|
||||
|
||||
import pooch
|
||||
|
||||
from ._misc import to_list, is_nested
|
||||
|
||||
|
||||
class Resource(ABC):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
url_s: str | list[str],
|
||||
lifetime: int = 0,
|
||||
):
|
||||
"""
|
||||
|
||||
A Resource is a file, a list of files, an API request, or a list of API
|
||||
requests, any of which can be downloaded from the given URL(s) and
|
||||
cached locally. This class defines the minimum requirements for
|
||||
a resource, to be provided by a BioCypher adapter.
|
||||
|
||||
Args:
|
||||
name (str): The name of the resource.
|
||||
|
||||
url_s (str | list[str]): The URL or URLs of the resource.
|
||||
|
||||
lifetime (int): The lifetime of the resource in days. If 0, the
|
||||
resource is considered to be permanent.
|
||||
"""
|
||||
self.name = name
|
||||
self.url_s = url_s
|
||||
self.lifetime = lifetime
|
||||
|
||||
|
||||
class FileDownload(Resource):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
url_s: str | list[str],
|
||||
lifetime: int = 0,
|
||||
is_dir: bool = False,
|
||||
):
|
||||
"""
|
||||
Represents basic information for a File Download.
|
||||
|
||||
Args:
|
||||
name(str): The name of the File Download.
|
||||
|
||||
url_s(str|list[str]): The URL(s) of the File Download.
|
||||
|
||||
lifetime(int): The lifetime of the File Download in days. If 0, the
|
||||
File Download is cached indefinitely.
|
||||
|
||||
is_dir (bool): Whether the URL points to a directory or not.
|
||||
"""
|
||||
|
||||
super().__init__(name, url_s, lifetime)
|
||||
self.is_dir = is_dir
|
||||
|
||||
|
||||
class APIRequest(Resource):
|
||||
def __init__(self, name: str, url_s: str | list[str], lifetime: int = 0):
|
||||
"""
|
||||
Represents basic information for an API Request.
|
||||
|
||||
Args:
|
||||
name(str): The name of the API Request.
|
||||
|
||||
url_s(str|list): The URL of the API endpoint.
|
||||
|
||||
lifetime(int): The lifetime of the API Request in days. If 0, the
|
||||
API Request is cached indefinitely.
|
||||
|
||||
"""
|
||||
super().__init__(name, url_s, lifetime)
|
||||
|
||||
|
||||
class Downloader:
|
||||
def __init__(self, cache_dir: Optional[str] = None) -> None:
|
||||
"""
|
||||
The Downloader is a class that manages resources that can be downloaded
|
||||
and cached locally. It manages the lifetime of downloaded resources by
|
||||
keeping a JSON record of the download date of each resource.
|
||||
|
||||
Args:
|
||||
cache_dir (str): The directory where the resources are cached. If
|
||||
not given, a temporary directory is created.
|
||||
"""
|
||||
self.cache_dir = cache_dir or TemporaryDirectory().name
|
||||
self.cache_file = os.path.join(self.cache_dir, "cache.json")
|
||||
self.cache_dict = self._load_cache_dict()
|
||||
|
||||
def download(self, *resources: Resource):
|
||||
"""
|
||||
Download one or multiple resources. Load from cache if the resource is
|
||||
already downloaded and the cache is not expired.
|
||||
|
||||
Args:
|
||||
resources (Resource): The resource(s) to download or load from
|
||||
cache.
|
||||
|
||||
Returns:
|
||||
list[str]: The path or paths to the resource(s) that were downloaded
|
||||
or loaded from cache.
|
||||
|
||||
"""
|
||||
paths = []
|
||||
for resource in resources:
|
||||
paths.append(self._download_or_cache(resource))
|
||||
|
||||
# flatten list if it is nested
|
||||
if is_nested(paths):
|
||||
paths = [path for sublist in paths for path in sublist]
|
||||
|
||||
return paths
|
||||
|
||||
def _download_or_cache(self, resource: Resource, cache: bool = True):
|
||||
"""
|
||||
Download a resource if it is not cached or has exceeded its lifetime.
|
||||
|
||||
Args:
|
||||
resource (Resource): The resource to download.
|
||||
Returns:
|
||||
list[str]: The path or paths to the downloaded resource(s).
|
||||
|
||||
|
||||
"""
|
||||
expired = self._is_cache_expired(resource)
|
||||
|
||||
if expired or not cache:
|
||||
self._delete_expired_cache(resource)
|
||||
if isinstance(resource, FileDownload):
|
||||
logger.info(f"Asking for download of resource {resource.name}.")
|
||||
paths = self._download_files(cache, resource)
|
||||
elif isinstance(resource, APIRequest):
|
||||
logger.info(
|
||||
f"Asking for download of api request {resource.name}."
|
||||
)
|
||||
paths = self._download_api_request(resource)
|
||||
|
||||
else:
|
||||
raise TypeError(f"Unknown resource type: {type(resource)}")
|
||||
|
||||
else:
|
||||
paths = self.get_cached_version(resource)
|
||||
self._update_cache_record(resource)
|
||||
return paths
|
||||
|
||||
def _is_cache_expired(self, resource: Resource) -> bool:
|
||||
"""
|
||||
Check if resource or API request cache is expired.
|
||||
|
||||
Args:
|
||||
|
||||
resource (Resource): The resource or API request to download.
|
||||
|
||||
Returns:
|
||||
bool: True if cache is expired, False if not.
|
||||
"""
|
||||
cache_record = self._get_cache_record(resource)
|
||||
if cache_record:
|
||||
download_time = datetime.strptime(
|
||||
cache_record.get("date_downloaded"), "%Y-%m-%d %H:%M:%S.%f"
|
||||
)
|
||||
lifetime = timedelta(days=resource.lifetime)
|
||||
expired = download_time + lifetime < datetime.now()
|
||||
else:
|
||||
expired = True
|
||||
return expired
|
||||
|
||||
def _delete_expired_cache(self, resource: Resource):
|
||||
cache_resource_path = self.cache_dir + "/" + resource.name
|
||||
if os.path.exists(cache_resource_path) and os.path.isdir(
|
||||
cache_resource_path
|
||||
):
|
||||
shutil.rmtree(cache_resource_path)
|
||||
|
||||
def _download_files(self, cache, file_download: FileDownload):
|
||||
"""
|
||||
Download a resource given it is a file or a directory and return the
|
||||
path.
|
||||
|
||||
Args:
|
||||
cache (bool): Whether to cache the resource or not.
|
||||
file_download (FileDownload): The resource to download.
|
||||
|
||||
Returns:
|
||||
list[str]: The path or paths to the downloaded resource(s).
|
||||
"""
|
||||
if file_download.is_dir:
|
||||
files = self._get_files(file_download)
|
||||
file_download.url_s = [
|
||||
file_download.url_s + "/" + file for file in files
|
||||
]
|
||||
file_download.is_dir = False
|
||||
paths = self._download_or_cache(file_download, cache)
|
||||
elif isinstance(file_download.url_s, list):
|
||||
paths = []
|
||||
for url in file_download.url_s:
|
||||
fname = url[url.rfind("/") + 1 :].split("?")[0]
|
||||
paths.append(
|
||||
self._retrieve(
|
||||
url=url,
|
||||
fname=fname,
|
||||
path=os.path.join(self.cache_dir, file_download.name),
|
||||
)
|
||||
)
|
||||
else:
|
||||
paths = []
|
||||
fname = file_download.url_s[
|
||||
file_download.url_s.rfind("/") + 1 :
|
||||
].split("?")[0]
|
||||
results = self._retrieve(
|
||||
url=file_download.url_s,
|
||||
fname=fname,
|
||||
path=os.path.join(self.cache_dir, file_download.name),
|
||||
)
|
||||
if isinstance(results, list):
|
||||
paths.extend(results)
|
||||
else:
|
||||
paths.append(results)
|
||||
|
||||
# sometimes a compressed file contains multiple files
|
||||
# TODO ask for a list of files in the archive to be used from the
|
||||
# adapter
|
||||
return paths
|
||||
|
||||
def _download_api_request(self, api_request: APIRequest):
|
||||
"""
|
||||
Download an API request and return the path.
|
||||
|
||||
Args:
|
||||
api_request(APIRequest): The API request result that is being
|
||||
cached.
|
||||
Returns:
|
||||
list[str]: The path to the cached API request.
|
||||
|
||||
"""
|
||||
urls = (
|
||||
api_request.url_s
|
||||
if isinstance(api_request.url_s, list)
|
||||
else [api_request.url_s]
|
||||
)
|
||||
paths = []
|
||||
for url in urls:
|
||||
fname = url[url.rfind("/") + 1 :].rsplit(".", 1)[0]
|
||||
logger.info(
|
||||
f"Asking for caching API of {api_request.name} {fname}."
|
||||
)
|
||||
response = requests.get(url=url)
|
||||
|
||||
if response.status_code != 200:
|
||||
response.raise_for_status()
|
||||
response_data = response.json()
|
||||
api_path = os.path.join(
|
||||
self.cache_dir, api_request.name, f"{fname}.json"
|
||||
)
|
||||
|
||||
os.makedirs(os.path.dirname(api_path), exist_ok=True)
|
||||
with open(api_path, "w") as f:
|
||||
json.dump(response_data, f)
|
||||
logger.info(f"Caching API request to {api_path}.")
|
||||
paths.append(api_path)
|
||||
return paths
|
||||
|
||||
def get_cached_version(self, resource: Resource) -> list[str]:
|
||||
"""Get the cached version of a resource.
|
||||
|
||||
Args:
|
||||
resource(Resource): The resource to get the cached version of.
|
||||
|
||||
Returns:
|
||||
list[str]: The paths to the cached resource(s).
|
||||
|
||||
"""
|
||||
cached_location = os.path.join(self.cache_dir, resource.name)
|
||||
logger.info(f"Use cached version from {cached_location}.")
|
||||
paths = []
|
||||
for file in os.listdir(cached_location):
|
||||
paths.append(os.path.join(cached_location, file))
|
||||
return paths
|
||||
|
||||
def _retrieve(
|
||||
self,
|
||||
url: str,
|
||||
fname: str,
|
||||
path: str,
|
||||
known_hash: str = None,
|
||||
):
|
||||
"""
|
||||
Retrieve a file from a URL using Pooch. Infer type of file from
|
||||
extension and use appropriate processor.
|
||||
|
||||
Args:
|
||||
url (str): The URL to retrieve the file from.
|
||||
|
||||
fname (str): The name of the file.
|
||||
|
||||
path (str): The path to the file.
|
||||
"""
|
||||
if fname.endswith(".zip"):
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
processor=pooch.Unzip(),
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
elif fname.endswith(".tar.gz"):
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
processor=pooch.Untar(),
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
elif fname.endswith(".gz"):
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
processor=pooch.Decompress(),
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
else:
|
||||
return pooch.retrieve(
|
||||
url=url,
|
||||
known_hash=known_hash,
|
||||
fname=fname,
|
||||
path=path,
|
||||
progressbar=True,
|
||||
)
|
||||
|
||||
def _get_files(self, file_download: FileDownload):
|
||||
"""
|
||||
Get the files contained in a directory file.
|
||||
|
||||
Args:
|
||||
file_download (FileDownload): The directory file.
|
||||
|
||||
Returns:
|
||||
list: The files contained in the directory.
|
||||
"""
|
||||
if file_download.url_s.startswith("ftp://"):
|
||||
# remove protocol
|
||||
url = file_download.url_s[6:]
|
||||
# get base url
|
||||
url = url[: url.find("/")]
|
||||
# get directory (remove initial slash as well)
|
||||
dir = file_download.url_s[7 + len(url) :]
|
||||
# get files
|
||||
ftp = ftplib.FTP(url)
|
||||
ftp.login()
|
||||
ftp.cwd(dir)
|
||||
files = ftp.nlst()
|
||||
ftp.quit()
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"Only FTP directories are supported at the moment."
|
||||
)
|
||||
|
||||
return files
|
||||
|
||||
def _load_cache_dict(self):
|
||||
"""
|
||||
Load the cache dictionary from the cache file. Create an empty cache
|
||||
file if it does not exist.
|
||||
"""
|
||||
if not os.path.exists(self.cache_dir):
|
||||
logger.info(f"Creating cache directory {self.cache_dir}.")
|
||||
os.makedirs(self.cache_dir)
|
||||
|
||||
if not os.path.exists(self.cache_file):
|
||||
logger.info(f"Creating cache file {self.cache_file}.")
|
||||
with open(self.cache_file, "w") as f:
|
||||
json.dump({}, f)
|
||||
|
||||
with open(self.cache_file, "r") as f:
|
||||
logger.info(f"Loading cache file {self.cache_file}.")
|
||||
return json.load(f)
|
||||
|
||||
def _get_cache_record(self, resource: Resource):
|
||||
"""
|
||||
Get the cache record of a resource.
|
||||
|
||||
Args:
|
||||
resource (Resource): The resource to get the cache record of.
|
||||
|
||||
Returns:
|
||||
The cache record of the resource.
|
||||
"""
|
||||
return self.cache_dict.get(resource.name, {})
|
||||
|
||||
def _update_cache_record(self, resource: Resource):
|
||||
"""
|
||||
Update the cache record of a resource.
|
||||
|
||||
Args:
|
||||
resource (Resource): The resource to update the cache record of.
|
||||
"""
|
||||
cache_record = {}
|
||||
cache_record["url"] = to_list(resource.url_s)
|
||||
cache_record["date_downloaded"] = str(datetime.now())
|
||||
cache_record["lifetime"] = resource.lifetime
|
||||
self.cache_dict[resource.name] = cache_record
|
||||
with open(self.cache_file, "w") as f:
|
||||
json.dump(self.cache_dict, f, default=str)
|
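A sketch of how the Downloader is meant to be driven; the URLs and resource names below are placeholders, not endpoints referenced by this commit.

downloader = Downloader(cache_dir="./biocypher-cache")
data_file = FileDownload(
    name="example-file",
    url_s="https://example.org/data.csv",
    lifetime=7,  # re-download after seven days
)
api = APIRequest(name="example-api", url_s="https://example.org/api/items")
# downloads on the first call, reuses the cached copies while they are within their lifetime
paths = downloader.download(data_file, api)
print(paths)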
121
biocypher/_logger.py
Normal file
@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Configuration of the module logger.
|
||||
"""
|
||||
|
||||
__all__ = ["get_logger", "log", "logfile"]
|
||||
|
||||
from datetime import datetime
|
||||
import os
|
||||
import pydoc
|
||||
import logging
|
||||
|
||||
from biocypher import _config
|
||||
from biocypher._metadata import __version__
|
||||
|
||||
|
||||
def get_logger(name: str = "biocypher") -> logging.Logger:
|
||||
"""
|
||||
Access the module logger, creating a new one if it does not exist yet.
|
||||
|
||||
Method providing central logger instance to main module. Is called
|
||||
only from main submodule, :mod:`biocypher.driver`. In child modules,
|
||||
the standard Python logging facility is called
|
||||
(using ``logging.getLogger(__name__)``), automatically inheriting
|
||||
the handlers from the central logger.
|
||||
|
||||
The file handler creates a log file named after the current date and
|
||||
time. Levels to output to file and console can be set here.
|
||||
|
||||
Args:
|
||||
name:
|
||||
Name of the logger instance.
|
||||
|
||||
Returns:
|
||||
An instance of the Python :py:mod:`logging.Logger`.
|
||||
"""
|
||||
|
||||
if not logging.getLogger(name).hasHandlers():
|
||||
# create logger
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.propagate = True
|
||||
|
||||
# formatting
|
||||
file_formatter = logging.Formatter(
|
||||
"%(asctime)s\t%(levelname)s\tmodule:%(module)s\n%(message)s",
|
||||
)
|
||||
stdout_formatter = logging.Formatter("%(levelname)s -- %(message)s")
|
||||
|
||||
# file name and creation
|
||||
now = datetime.now()
|
||||
date_time = now.strftime("%Y%m%d-%H%M%S")
|
||||
|
||||
log_to_disk = _config.config("biocypher").get("log_to_disk")
|
||||
|
||||
if log_to_disk:
|
||||
logdir = (
|
||||
_config.config("biocypher").get("log_directory")
|
||||
or "biocypher-log"
|
||||
)
|
||||
os.makedirs(logdir, exist_ok=True)
|
||||
logfile = os.path.join(logdir, f"biocypher-{date_time}.log")
|
||||
|
||||
# file handler
|
||||
file_handler = logging.FileHandler(logfile)
|
||||
|
||||
if _config.config("biocypher").get("debug"):
|
||||
file_handler.setLevel(logging.DEBUG)
|
||||
else:
|
||||
file_handler.setLevel(logging.INFO)
|
||||
|
||||
file_handler.setFormatter(file_formatter)
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
# handlers
|
||||
# stream handler
|
||||
stdout_handler = logging.StreamHandler()
|
||||
stdout_handler.setLevel(logging.INFO)
|
||||
stdout_handler.setFormatter(stdout_formatter)
|
||||
|
||||
# add handlers
|
||||
logger.addHandler(stdout_handler)
|
||||
|
||||
# startup message
|
||||
logger.info(f"This is BioCypher v{__version__}.")
|
||||
if log_to_disk:
|
||||
logger.info(f"Logging into `{logfile}`.")
|
||||
else:
|
||||
logger.info("Logging into stdout.")
|
||||
|
||||
return logging.getLogger(name)
|
||||
|
||||
|
||||
def logfile() -> str:
|
||||
"""
|
||||
Path to the log file.
|
||||
"""
|
||||
|
||||
return get_logger().handlers[0].baseFilename
|
||||
|
||||
|
||||
def log():
|
||||
"""
|
||||
Browse the log file.
|
||||
"""
|
||||
|
||||
with open(logfile()) as fp:
|
||||
pydoc.pager(fp.read())
|
||||
|
||||
|
||||
logger = get_logger()
|
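Typical use of the logging setup above (illustrative): the central logger is created once at import time, and child modules inherit its handlers through the standard logging facility.

from biocypher._logger import logger, logfile

logger.info("Adapter started.")  # written to stdout, and to the log file if log_to_disk is enabled
print(logfile())                 # path of the current log file (assumes the file handler was added)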
307
biocypher/_mapping.py
Normal file
@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'mapping' module. Handles the mapping of user-defined schema to the
|
||||
underlying ontology.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Optional
|
||||
from urllib.request import urlopen
|
||||
|
||||
import yaml
|
||||
|
||||
from . import _misc
|
||||
from ._config import config as _config
|
||||
|
||||
|
||||
class OntologyMapping:
|
||||
"""
|
||||
Class to store the ontology mapping and extensions.
|
||||
"""
|
||||
|
||||
def __init__(self, config_file: str = None):
|
||||
self.schema = self._read_config(config_file)
|
||||
|
||||
self.extended_schema = self._extend_schema()
|
||||
|
||||
def _read_config(self, config_file: str = None):
|
||||
"""
|
||||
Read the configuration file and store the ontology mapping and extensions.
|
||||
"""
|
||||
if config_file is None:
|
||||
schema_config = {}
|
||||
|
||||
# load yaml file from web
|
||||
elif config_file.startswith("http"):
|
||||
with urlopen(config_file) as f:
|
||||
schema_config = yaml.safe_load(f)
|
||||
|
||||
# get graph state from config (assume file is local)
|
||||
else:
|
||||
with open(config_file, "r") as f:
|
||||
schema_config = yaml.safe_load(f)
|
||||
|
||||
return schema_config
|
||||
|
||||
def _extend_schema(self, d: Optional[dict] = None) -> dict:
|
||||
"""
|
||||
Get leaves of the tree hierarchy from the data structure dict
|
||||
contained in the `schema_config.yaml`. Creates virtual leaves
|
||||
(as children) from entries that provide more than one preferred
|
||||
id type (and corresponding inputs).
|
||||
|
||||
Args:
|
||||
d:
|
||||
Data structure dict from yaml file.
|
||||
|
||||
"""
|
||||
|
||||
d = d or self.schema
|
||||
|
||||
extended_schema = dict()
|
||||
|
||||
# first pass: get parent leaves with direct representation in ontology
|
||||
for k, v in d.items():
|
||||
# k is not an entity
|
||||
if "represented_as" not in v:
|
||||
continue
|
||||
|
||||
# preferred_id optional: if not provided, use `id`
|
||||
if not v.get("preferred_id"):
|
||||
v["preferred_id"] = "id"
|
||||
|
||||
# k is an entity that is present in the ontology
|
||||
if "is_a" not in v:
|
||||
extended_schema[k] = v
|
||||
|
||||
# second pass: "vertical" inheritance
|
||||
d = self._vertical_property_inheritance(d)
|
||||
for k, v in d.items():
|
||||
if "is_a" in v:
|
||||
# prevent loops
|
||||
if k == v["is_a"]:
|
||||
logger.warning(
|
||||
f"Loop detected in ontology mapping: {k} -> {v}. "
|
||||
"Removing item. Please fix the inheritance if you want "
|
||||
"to use this item."
|
||||
)
|
||||
continue
|
||||
|
||||
extended_schema[k] = v
|
||||
|
||||
# "horizontal" inheritance: create siblings for multiple identifiers or
|
||||
# sources -> virtual leaves or implicit children
|
||||
mi_leaves = {}
|
||||
ms_leaves = {}
|
||||
for k, v in d.items():
|
||||
# k is not an entity
|
||||
if "represented_as" not in v:
|
||||
continue
|
||||
|
||||
if isinstance(v.get("preferred_id"), list):
|
||||
mi_leaves = self._horizontal_inheritance_pid(k, v)
|
||||
extended_schema.update(mi_leaves)
|
||||
|
||||
elif isinstance(v.get("source"), list):
|
||||
ms_leaves = self._horizontal_inheritance_source(k, v)
|
||||
extended_schema.update(ms_leaves)
|
||||
|
||||
return extended_schema
|
||||
|
||||
def _vertical_property_inheritance(self, d):
|
||||
"""
|
||||
Inherit properties from parents to children and update `d` accordingly.
|
||||
"""
|
||||
for k, v in d.items():
|
||||
# k is not an entity
|
||||
if "represented_as" not in v:
|
||||
continue
|
||||
|
||||
# k is an entity that is present in the ontology
|
||||
if "is_a" not in v:
|
||||
continue
|
||||
|
||||
# "vertical" inheritance: inherit properties from parent
|
||||
if v.get("inherit_properties", False):
|
||||
# get direct ancestor
|
||||
if isinstance(v["is_a"], list):
|
||||
parent = v["is_a"][0]
|
||||
else:
|
||||
parent = v["is_a"]
|
||||
|
||||
# ensure child has properties and exclude_properties
|
||||
if "properties" not in v:
|
||||
v["properties"] = {}
|
||||
if "exclude_properties" not in v:
|
||||
v["exclude_properties"] = {}
|
||||
|
||||
# update properties of child
|
||||
parent_props = self.schema[parent].get("properties", {})
|
||||
if parent_props:
|
||||
v["properties"].update(parent_props)
|
||||
|
||||
parent_excl_props = self.schema[parent].get(
|
||||
"exclude_properties", {}
|
||||
)
|
||||
if parent_excl_props:
|
||||
v["exclude_properties"].update(parent_excl_props)
|
||||
|
||||
# update schema (d)
|
||||
d[k] = v
|
||||
|
||||
return d
|
||||
|
||||
def _horizontal_inheritance_pid(self, key, value):
|
||||
"""
|
||||
Create virtual leaves for multiple preferred id types or sources.
|
||||
|
||||
If we create virtual leaves, input_label/label_in_input always has to be
|
||||
a list.
|
||||
"""
|
||||
|
||||
leaves = {}
|
||||
|
||||
preferred_id = value["preferred_id"]
|
||||
input_label = value.get("input_label") or value["label_in_input"]
|
||||
represented_as = value["represented_as"]
|
||||
|
||||
# adjust lengths
|
||||
max_l = max(
|
||||
[
|
||||
len(_misc.to_list(preferred_id)),
|
||||
len(_misc.to_list(input_label)),
|
||||
len(_misc.to_list(represented_as)),
|
||||
],
|
||||
)
|
||||
|
||||
# adjust pid length if necessary
|
||||
if isinstance(preferred_id, str):
|
||||
pids = [preferred_id] * max_l
|
||||
else:
|
||||
pids = preferred_id
|
||||
|
||||
# adjust rep length if necessary
|
||||
if isinstance(represented_as, str):
|
||||
reps = [represented_as] * max_l
|
||||
else:
|
||||
reps = represented_as
|
||||
|
||||
for pid, lab, rep in zip(pids, input_label, reps):
|
||||
skey = pid + "." + key
|
||||
svalue = {
|
||||
"preferred_id": pid,
|
||||
"input_label": lab,
|
||||
"represented_as": rep,
|
||||
# mark as virtual
|
||||
"virtual": True,
|
||||
}
|
||||
|
||||
# inherit is_a if exists
|
||||
if "is_a" in value.keys():
|
||||
# treat as multiple inheritance
|
||||
if isinstance(value["is_a"], list):
|
||||
v = list(value["is_a"])
|
||||
v.insert(0, key)
|
||||
svalue["is_a"] = v
|
||||
|
||||
else:
|
||||
svalue["is_a"] = [key, value["is_a"]]
|
||||
|
||||
else:
|
||||
# set parent as is_a
|
||||
svalue["is_a"] = key
|
||||
|
||||
# inherit everything except core attributes
|
||||
for k, v in value.items():
|
||||
if k not in [
|
||||
"is_a",
|
||||
"preferred_id",
|
||||
"input_label",
|
||||
"label_in_input",
|
||||
"represented_as",
|
||||
]:
|
||||
svalue[k] = v
|
||||
|
||||
leaves[skey] = svalue
|
||||
|
||||
return leaves
|
||||
|
||||
def _horizontal_inheritance_source(self, key, value):
|
||||
"""
|
||||
Create virtual leaves for multiple sources.
|
||||
|
||||
If we create virtual leaves, input_label/label_in_input always has to be
|
||||
a list.
|
||||
"""
|
||||
|
||||
leaves = {}
|
||||
|
||||
source = value["source"]
|
||||
input_label = value.get("input_label") or value["label_in_input"]
|
||||
represented_as = value["represented_as"]
|
||||
|
||||
# adjust lengths
|
||||
src_l = len(source)
|
||||
|
||||
# adjust label length if necessary
|
||||
if isinstance(input_label, str):
|
||||
labels = [input_label] * src_l
|
||||
else:
|
||||
labels = input_label
|
||||
|
||||
# adjust rep length if necessary
|
||||
if isinstance(represented_as, str):
|
||||
reps = [represented_as] * src_l
|
||||
else:
|
||||
reps = represented_as
|
||||
|
||||
for src, lab, rep in zip(source, labels, reps):
|
||||
skey = src + "." + key
|
||||
svalue = {
|
||||
"source": src,
|
||||
"input_label": lab,
|
||||
"represented_as": rep,
|
||||
# mark as virtual
|
||||
"virtual": True,
|
||||
}
|
||||
|
||||
# inherit is_a if exists
|
||||
if "is_a" in value.keys():
|
||||
# treat as multiple inheritance
|
||||
if isinstance(value["is_a"], list):
|
||||
v = list(value["is_a"])
|
||||
v.insert(0, key)
|
||||
svalue["is_a"] = v
|
||||
|
||||
else:
|
||||
svalue["is_a"] = [key, value["is_a"]]
|
||||
|
||||
else:
|
||||
# set parent as is_a
|
||||
svalue["is_a"] = key
|
||||
|
||||
# inherit everything except core attributes
|
||||
for k, v in value.items():
|
||||
if k not in [
|
||||
"is_a",
|
||||
"source",
|
||||
"input_label",
|
||||
"label_in_input",
|
||||
"represented_as",
|
||||
]:
|
||||
svalue[k] = v
|
||||
|
||||
leaves[skey] = svalue
|
||||
|
||||
return leaves
|
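An illustrative sketch of loading a schema configuration with the mapping class above; the file name is a placeholder.

mapping = OntologyMapping(config_file="schema_config.yaml")
for entity, settings in mapping.extended_schema.items():
    print(entity, settings.get("preferred_id"), settings.get("represented_as"))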
71
biocypher/_metadata.py
Normal file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Package metadata (version, authors, etc).
|
||||
"""
|
||||
|
||||
__all__ = ["get_metadata"]
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import importlib.metadata
|
||||
|
||||
import toml
|
||||
|
||||
_VERSION = "0.6.0"
|
||||
|
||||
|
||||
def get_metadata():
|
||||
"""
|
||||
Basic package metadata.
|
||||
|
||||
Retrieves package metadata from the current project directory or from
|
||||
the installed package.
|
||||
"""
|
||||
|
||||
here = pathlib.Path(__file__).parent
|
||||
pyproj_toml = "pyproject.toml"
|
||||
meta = {}
|
||||
|
||||
for project_dir in (here, here.parent):
|
||||
toml_path = str(project_dir.joinpath(pyproj_toml).absolute())
|
||||
|
||||
if os.path.exists(toml_path):
|
||||
pyproject = toml.load(toml_path)
|
||||
|
||||
meta = {
|
||||
"name": pyproject["tool"]["poetry"]["name"],
|
||||
"version": pyproject["tool"]["poetry"]["version"],
|
||||
"author": pyproject["tool"]["poetry"]["authors"],
|
||||
"license": pyproject["tool"]["poetry"]["license"],
|
||||
"full_metadata": pyproject,
|
||||
}
|
||||
|
||||
break
|
||||
|
||||
if not meta:
|
||||
try:
|
||||
meta = {
|
||||
k.lower(): v
|
||||
for k, v in importlib.metadata.metadata(here.name).items()
|
||||
}
|
||||
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
pass
|
||||
|
||||
meta["version"] = meta.get("version", None) or _VERSION
|
||||
|
||||
return meta
|
||||
|
||||
|
||||
metadata = get_metadata()
|
||||
__version__ = metadata.get("version", None)
|
||||
__author__ = metadata.get("author", None)
|
||||
__license__ = metadata.get("license", None)
|
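Illustrative use of the metadata helpers above:

meta = get_metadata()
print(meta["version"])  # resolved from pyproject.toml, installed package metadata, or the _VERSION fallback
print(__version__, __author__, __license__)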
264
biocypher/_misc.py
Normal file
@ -0,0 +1,264 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
Handy functions for use in various places.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Union,
|
||||
Mapping,
|
||||
KeysView,
|
||||
Generator,
|
||||
ItemsView,
|
||||
ValuesView,
|
||||
)
|
||||
from collections.abc import Iterable
|
||||
import re
|
||||
|
||||
from treelib import Tree
|
||||
import networkx as nx
|
||||
import stringcase
|
||||
|
||||
__all__ = ["LIST_LIKE", "SIMPLE_TYPES", "ensure_iterable", "to_list"]
|
||||
|
||||
SIMPLE_TYPES = (
|
||||
bytes,
|
||||
str,
|
||||
int,
|
||||
float,
|
||||
bool,
|
||||
type(None),
|
||||
)
|
||||
|
||||
LIST_LIKE = (
|
||||
list,
|
||||
set,
|
||||
tuple,
|
||||
Generator,
|
||||
ItemsView,
|
||||
KeysView,
|
||||
Mapping,
|
||||
ValuesView,
|
||||
)
|
||||
|
||||
|
||||
def to_list(value: Any) -> list:
|
||||
"""
|
||||
Ensures that ``value`` is a list.
|
||||
"""
|
||||
|
||||
if isinstance(value, LIST_LIKE):
|
||||
value = list(value)
|
||||
|
||||
else:
|
||||
value = [value]
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def ensure_iterable(value: Any) -> Iterable:
|
||||
"""
|
||||
Returns iterables unchanged; wraps strings and other simple types into a tuple.
|
||||
"""
|
||||
|
||||
return value if isinstance(value, LIST_LIKE) else (value,)
|
||||
|
||||
|
||||
def create_tree_visualisation(inheritance_graph: Union[dict, nx.Graph]) -> Tree:
|
||||
"""
|
||||
Creates a visualisation of the inheritance tree using treelib.
|
||||
"""
|
||||
inheritance_tree = _get_inheritance_tree(inheritance_graph)
|
||||
classes, root = _find_root_node(inheritance_tree)
|
||||
|
||||
tree = Tree()
|
||||
tree.create_node(root, root)
|
||||
while classes:
|
||||
for child in classes:
|
||||
parent = inheritance_tree[child]
|
||||
if parent in tree.nodes.keys() or parent == root:
|
||||
tree.create_node(child, child, parent=parent)
|
||||
|
||||
for node in tree.nodes.keys():
|
||||
if node in classes:
|
||||
classes.remove(node)
|
||||
|
||||
return tree
|
||||
|
||||
|
||||
def _get_inheritance_tree(inheritance_graph: Union[dict, nx.Graph]) -> dict:
|
||||
"""Transforms an inheritance_graph into an inheritance_tree.
|
||||
|
||||
Args:
|
||||
inheritance_graph: A dict or nx.Graph representing the inheritance graph.
|
||||
|
||||
Returns:
|
||||
A dict representing the inheritance tree.
|
||||
"""
|
||||
if isinstance(inheritance_graph, nx.Graph):
|
||||
inheritance_tree = nx.to_dict_of_lists(inheritance_graph)
|
||||
|
||||
multiple_parents_present = _multiple_inheritance_present(
|
||||
inheritance_tree
|
||||
)
|
||||
if multiple_parents_present:
|
||||
logger.warning(
|
||||
"The ontology contains multiple inheritance (one child node "
|
||||
"has multiple parent nodes). This is not visualized in the "
|
||||
"following hierarchy tree (the child node is only added once). "
|
||||
"If you wish to browse all relationships of the parsed "
|
||||
"ontologies, write a graphml file to disk using "
|
||||
"`to_disk = <directory>` and view this file."
|
||||
)
|
||||
|
||||
# unlist values
|
||||
inheritance_tree = {k: v[0] for k, v in inheritance_tree.items() if v}
|
||||
return inheritance_tree
|
||||
elif not _multiple_inheritance_present(inheritance_graph):
|
||||
return inheritance_graph
|
||||
|
||||
|
||||
def _multiple_inheritance_present(inheritance_tree: dict) -> bool:
|
||||
"""Checks if multiple inheritance is present in the inheritance_tree."""
|
||||
return any(len(value) > 1 for value in inheritance_tree.values())
|
||||
|
||||
|
||||
def _find_root_node(inheritance_tree: dict) -> tuple[set, str]:
|
||||
classes = set(inheritance_tree.keys())
|
||||
parents = set(inheritance_tree.values())
|
||||
root = list(parents - classes)
|
||||
if len(root) > 1:
|
||||
if "entity" in root:
|
||||
root = "entity" # TODO: default: good standard?
|
||||
else:
|
||||
raise ValueError(
|
||||
"Inheritance tree cannot have more than one root node. "
|
||||
f"Found {len(root)}: {root}."
|
||||
)
|
||||
else:
|
||||
root = root[0]
|
||||
if not root:
|
||||
# find key whose value is None
|
||||
root = list(inheritance_tree.keys())[
|
||||
list(inheritance_tree.values()).index(None)
|
||||
]
|
||||
return classes, root
|
||||
|
||||
|
||||
# string conversion, adapted from Biolink Model Toolkit
|
||||
lowercase_pattern = re.compile(r"[a-zA-Z]*[a-z][a-zA-Z]*")
|
||||
underscore_pattern = re.compile(r"(?<!^)(?=[A-Z][a-z])")
|
||||
|
||||
|
||||
def from_pascal(s: str, sep: str = " ") -> str:
|
||||
underscored = underscore_pattern.sub(sep, s)
|
||||
lowercased = lowercase_pattern.sub(
|
||||
lambda match: match.group(0).lower(),
|
||||
underscored,
|
||||
)
|
||||
return lowercased
|
||||
|
||||
|
||||
def pascalcase_to_sentencecase(s: str) -> str:
|
||||
"""
|
||||
Convert PascalCase to sentence case.
|
||||
|
||||
Args:
|
||||
s: Input string in PascalCase
|
||||
|
||||
Returns:
|
||||
string in sentence case form
|
||||
"""
|
||||
return from_pascal(s, sep=" ")
|
||||
|
||||
|
||||
def snakecase_to_sentencecase(s: str) -> str:
|
||||
"""
|
||||
Convert snake_case to sentence case.
|
||||
|
||||
Args:
|
||||
s: Input string in snake_case
|
||||
|
||||
Returns:
|
||||
string in sentence case form
|
||||
"""
|
||||
return stringcase.sentencecase(s).lower()
|
||||
|
||||
|
||||
def sentencecase_to_snakecase(s: str) -> str:
|
||||
"""
|
||||
Convert sentence case to snake_case.
|
||||
|
||||
Args:
|
||||
s: Input string in sentence case
|
||||
|
||||
Returns:
|
||||
string in snake_case form
|
||||
"""
|
||||
return stringcase.snakecase(s).lower()
|
||||
|
||||
|
||||
def sentencecase_to_pascalcase(s: str, sep: str = r"\s") -> str:
|
||||
"""
|
||||
Convert sentence case to PascalCase.
|
||||
|
||||
Args:
|
||||
s: Input string in sentence case
|
||||
|
||||
Returns:
|
||||
string in PascalCase form
|
||||
"""
|
||||
return re.sub(
|
||||
r"(?:^|[" + sep + "])([a-zA-Z])",
|
||||
lambda match: match.group(1).upper(),
|
||||
s,
|
||||
)
|
||||
|
||||
|
||||
def to_lower_sentence_case(s: str) -> str:
|
||||
"""
|
||||
Convert any string to lower sentence case. Works with snake_case,
|
||||
PascalCase, and sentence case.
|
||||
|
||||
Args:
|
||||
s: Input string
|
||||
|
||||
Returns:
|
||||
string in lower sentence case form
|
||||
"""
|
||||
if "_" in s:
|
||||
return snakecase_to_sentencecase(s)
|
||||
elif " " in s:
|
||||
return s.lower()
|
||||
elif s[0].isupper():
|
||||
return pascalcase_to_sentencecase(s)
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
def is_nested(lst) -> bool:
|
||||
"""
|
||||
Check if a list is nested.
|
||||
|
||||
Args:
|
||||
lst (list): The list to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the list is nested, False otherwise.
|
||||
"""
|
||||
for item in lst:
|
||||
if isinstance(item, list):
|
||||
return True
|
||||
return False
|
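A few illustrative calls against the helpers above (inputs are invented):

assert to_list("a") == ["a"]
assert ensure_iterable(1) == (1,)
assert pascalcase_to_sentencecase("GeneProduct") == "gene product"
assert sentencecase_to_pascalcase("gene product") == "GeneProduct"
assert is_nested([[1], [2]]) is True

# build a small child -> parent graph and render it as a tree
graph = nx.DiGraph([("protein", "entity"), ("gene", "entity")])
tree = create_tree_visualisation(graph)
tree.show()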
886
biocypher/_ontology.py
Normal file
@ -0,0 +1,886 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'ontology' module. Contains classes and functions to handle parsing
|
||||
and representation of single ontologies as well as their hybridisation and
|
||||
other advanced operations.
|
||||
"""
|
||||
import os
|
||||
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from rdflib import Graph
|
||||
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
|
||||
import rdflib
|
||||
import networkx as nx
|
||||
|
||||
from ._misc import (
|
||||
to_list,
|
||||
to_lower_sentence_case,
|
||||
create_tree_visualisation,
|
||||
sentencecase_to_pascalcase,
|
||||
)
|
||||
from ._mapping import OntologyMapping
|
||||
|
||||
|
||||
class OntologyAdapter:
|
||||
"""
|
||||
Class that represents an ontology to be used in the BioCypher framework. Can
|
||||
read from a variety of formats, including OWL, OBO, and RDF/XML. The
|
||||
ontology is represented by a networkx.DiGraph object; an RDFlib graph is
|
||||
also kept. By default, the DiGraph reverses the label and identifier of the
|
||||
nodes, such that the node name in the graph is the human-readable label. The
|
||||
edges are oriented from child to parent.
|
||||
Labels are formatted in lower sentence case and underscores are replaced by spaces.
|
||||
Identifiers are taken as defined and the prefixes are removed by default.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ontology_file: str,
|
||||
root_label: str,
|
||||
ontology_file_format: Optional[str] = None,
|
||||
head_join_node_label: Optional[str] = None,
|
||||
merge_nodes: Optional[bool] = True,
|
||||
switch_label_and_id: bool = True,
|
||||
remove_prefixes: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize the OntologyAdapter class.
|
||||
|
||||
Args:
|
||||
ontology_file (str): Path to the ontology file. Can be local or
|
||||
remote.
|
||||
|
||||
root_label (str): The label of the root node in the ontology. In
|
||||
case of a tail ontology, this is the tail join node.
|
||||
|
||||
ontology_file_format (str): The format of the ontology file (e.g. "application/rdf+xml")
|
||||
If format is not passed, it is determined automatically.
|
||||
|
||||
head_join_node_label (str): Optional variable to store the label of the
|
||||
node in the head ontology that should be used to join to the
|
||||
root node of the tail ontology. Defaults to None.
|
||||
|
||||
merge_nodes (bool): If True, head and tail join nodes will be
|
||||
merged, using the label of the head join node. If False, the
|
||||
tail join node will be attached as a child of the head join
|
||||
node.
|
||||
|
||||
switch_label_and_id (bool): If True, the node names in the graph will be
|
||||
the human-readable labels. If False, the node names will be the
|
||||
identifiers. Defaults to True.
|
||||
|
||||
remove_prefixes (bool): If True, the prefixes of the identifiers will
|
||||
be removed. Defaults to True.
|
||||
"""
|
||||
|
||||
logger.info(f"Instantiating OntologyAdapter class for {ontology_file}.")
|
||||
|
||||
self._ontology_file = ontology_file
|
||||
self._root_label = root_label
|
||||
self._format = ontology_file_format
|
||||
self._merge_nodes = merge_nodes
|
||||
self._head_join_node = head_join_node_label
|
||||
self._switch_label_and_id = switch_label_and_id
|
||||
self._remove_prefixes = remove_prefixes
|
||||
|
||||
self._rdf_graph = self._load_rdf_graph(ontology_file)
|
||||
|
||||
self._nx_graph = self._rdf_to_nx(
|
||||
self._rdf_graph, root_label, switch_label_and_id
|
||||
)
|
||||
|
||||
def _rdf_to_nx(
|
||||
self,
|
||||
_rdf_graph: rdflib.Graph,
|
||||
root_label: str,
|
||||
switch_label_and_id: bool,
|
||||
rename_nodes: bool = True,
|
||||
) -> nx.DiGraph:
|
||||
one_to_one_triples, one_to_many_dict = self._get_relevant_rdf_triples(
|
||||
_rdf_graph
|
||||
)
|
||||
nx_graph = self._convert_to_nx(one_to_one_triples, one_to_many_dict)
|
||||
nx_graph = self._add_labels_to_nodes(nx_graph, switch_label_and_id)
|
||||
nx_graph = self._change_nodes_to_biocypher_format(
|
||||
nx_graph, switch_label_and_id, rename_nodes
|
||||
)
|
||||
nx_graph = self._get_all_ancestors(
|
||||
nx_graph, root_label, switch_label_and_id, rename_nodes
|
||||
)
|
||||
return nx.DiGraph(nx_graph)
|
||||
|
||||
def _get_relevant_rdf_triples(self, g: rdflib.Graph) -> tuple:
|
||||
one_to_one_inheritance_graph = self._get_one_to_one_inheritance_triples(
|
||||
g
|
||||
)
|
||||
intersection = self._get_multiple_inheritance_dict(g)
|
||||
return one_to_one_inheritance_graph, intersection
|
||||
|
||||
def _get_one_to_one_inheritance_triples(
|
||||
self, g: rdflib.Graph
|
||||
) -> rdflib.Graph:
|
||||
"""Get the one to one inheritance triples from the RDF graph.
|
||||
|
||||
Args:
|
||||
g (rdflib.Graph): The RDF graph
|
||||
|
||||
Returns:
|
||||
rdflib.Graph: The one to one inheritance graph
|
||||
"""
|
||||
one_to_one_inheritance_graph = Graph()
|
||||
for s, p, o in g.triples((None, rdflib.RDFS.subClassOf, None)):
|
||||
if self.has_label(s, g):
|
||||
one_to_one_inheritance_graph.add((s, p, o))
|
||||
return one_to_one_inheritance_graph
|
||||
|
||||
def _get_multiple_inheritance_dict(self, g: rdflib.Graph) -> dict:
|
||||
"""Get the multiple inheritance dictionary from the RDF graph.
|
||||
|
||||
Args:
|
||||
g (rdflib.Graph): The RDF graph
|
||||
|
||||
Returns:
|
||||
dict: The multiple inheritance dictionary
|
||||
"""
|
||||
multiple_inheritance = g.triples(
|
||||
(None, rdflib.OWL.intersectionOf, None)
|
||||
)
|
||||
intersection = {}
|
||||
for (
|
||||
node,
|
||||
has_multiple_parents,
|
||||
first_node_of_intersection_list,
|
||||
) in multiple_inheritance:
|
||||
parents = self._retrieve_rdf_linked_list(
|
||||
first_node_of_intersection_list
|
||||
)
|
||||
child_name = None
|
||||
for s_, _, _ in g.triples((None, rdflib.RDFS.subClassOf, node)):
|
||||
child_name = s_
|
||||
|
||||
# Handle SNOMED CT post-coordinated expressions
|
||||
if not child_name:
|
||||
for s_, _, _ in g.triples(
|
||||
(None, rdflib.OWL.equivalentClass, node)
|
||||
):
|
||||
child_name = s_
|
||||
|
||||
if child_name:
|
||||
intersection[node] = {
|
||||
"child_name": child_name,
|
||||
"parent_node_names": parents,
|
||||
}
|
||||
return intersection
|
||||
|
||||
def has_label(self, node: rdflib.URIRef, g: rdflib.Graph) -> bool:
|
||||
"""Does the node have a label in g?
|
||||
|
||||
Args:
|
||||
node (rdflib.URIRef): The node to check
|
||||
g (rdflib.Graph): The graph to check in
|
||||
Returns:
|
||||
bool: True if the node has a label, False otherwise
|
||||
"""
|
||||
return (node, rdflib.RDFS.label, None) in g
|
||||
|
||||
def _retrieve_rdf_linked_list(self, subject: rdflib.URIRef) -> list:
|
||||
"""Recursively retrieves a linked list from RDF.
|
||||
Example RDF list with the items [item1, item2]:
|
||||
list_node - first -> item1
|
||||
list_node - rest -> list_node2
|
||||
list_node2 - first -> item2
|
||||
list_node2 - rest -> nil
|
||||
Args:
|
||||
subject (rdflib.URIRef): One list_node of the RDF list
|
||||
Returns:
|
||||
list: The items of the RDF list
|
||||
"""
|
||||
g = self._rdf_graph
|
||||
rdf_list = []
|
||||
for s, p, o in g.triples((subject, rdflib.RDF.first, None)):
|
||||
rdf_list.append(o)
|
||||
for s, p, o in g.triples((subject, rdflib.RDF.rest, None)):
|
||||
if o != rdflib.RDF.nil:
|
||||
rdf_list.extend(self._retrieve_rdf_linked_list(o))
|
||||
return rdf_list
|
||||
|
||||
def _convert_to_nx(
|
||||
self, one_to_one: rdflib.Graph, one_to_many: dict
|
||||
) -> nx.DiGraph:
|
||||
"""Convert the one to one and one to many inheritance graphs to networkx.
|
||||
|
||||
Args:
|
||||
one_to_one (rdflib.Graph): The one to one inheritance graph
|
||||
one_to_many (dict): The one to many inheritance dictionary
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The networkx graph
|
||||
"""
|
||||
nx_graph = rdflib_to_networkx_digraph(
|
||||
one_to_one, edge_attrs=lambda s, p, o: {}, calc_weights=False
|
||||
)
|
||||
for key, value in one_to_many.items():
|
||||
nx_graph.add_edges_from(
|
||||
[
|
||||
(value["child_name"], parent)
|
||||
for parent in value["parent_node_names"]
|
||||
]
|
||||
)
|
||||
if key in nx_graph.nodes:
|
||||
nx_graph.remove_node(key)
|
||||
return nx_graph
|
||||
|
||||
def _add_labels_to_nodes(
|
||||
self, nx_graph: nx.DiGraph, switch_label_and_id: bool
|
||||
) -> nx.DiGraph:
|
||||
"""Add labels to the nodes in the networkx graph.
|
||||
|
||||
Args:
|
||||
nx_graph (nx.DiGraph): The networkx graph
|
||||
switch_label_and_id (bool): If True, id and label are switched
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The networkx graph with labels
|
||||
"""
|
||||
for node in list(nx_graph.nodes):
|
||||
nx_id, nx_label = self._get_nx_id_and_label(
|
||||
node, switch_label_and_id
|
||||
)
|
||||
if nx_id == "none":
|
||||
# remove node if it has no id
|
||||
nx_graph.remove_node(node)
|
||||
continue
|
||||
|
||||
nx_graph.nodes[node]["label"] = nx_label
|
||||
return nx_graph
|
||||
|
||||
def _change_nodes_to_biocypher_format(
|
||||
self,
|
||||
nx_graph: nx.DiGraph,
|
||||
switch_label_and_id: bool,
|
||||
rename_nodes: bool = True,
|
||||
) -> nx.DiGraph:
|
||||
"""Change the nodes in the networkx graph to BioCypher format:
|
||||
- remove the prefix of the identifier
|
||||
- switch id and label
|
||||
- adapt the labels (replace _ with space and convert to lower sentence case)
|
||||
|
||||
Args:
|
||||
nx_graph (nx.DiGraph): The networkx graph
|
||||
switch_label_and_id (bool): If True, id and label are switched
|
||||
rename_nodes (bool): If True, the nodes are renamed
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The networkx ontology graph in BioCypher format
|
||||
"""
|
||||
mapping = {
|
||||
node: self._get_nx_id_and_label(
|
||||
node, switch_label_and_id, rename_nodes
|
||||
)[0]
|
||||
for node in nx_graph.nodes
|
||||
}
|
||||
renamed = nx.relabel_nodes(nx_graph, mapping, copy=False)
|
||||
return renamed
|
||||
|
||||
def _get_all_ancestors(
|
||||
self,
|
||||
renamed: nx.DiGraph,
|
||||
root_label: str,
|
||||
switch_label_and_id: bool,
|
||||
rename_nodes: bool = True,
|
||||
) -> nx.DiGraph:
|
||||
"""Get all ancestors of the root node in the networkx graph.
|
||||
|
||||
Args:
|
||||
renamed (nx.DiGraph): The renamed networkx graph
|
||||
root_label (str): The label of the root node in the ontology
|
||||
switch_label_and_id (bool): If True, id and label are switched
|
||||
rename_nodes (bool): If True, the nodes are renamed
|
||||
|
||||
Returns:
|
||||
nx.DiGraph: The filtered networkx graph
|
||||
"""
|
||||
root = self._get_nx_id_and_label(
|
||||
self._find_root_label(self._rdf_graph, root_label),
|
||||
switch_label_and_id,
|
||||
rename_nodes,
|
||||
)[0]
|
||||
ancestors = nx.ancestors(renamed, root)
|
||||
ancestors.add(root)
|
||||
filtered_graph = renamed.subgraph(ancestors)
|
||||
return filtered_graph
|
||||
|
||||
def _get_nx_id_and_label(
|
||||
self, node, switch_id_and_label: bool, rename_nodes: bool = True
|
||||
) -> tuple[str, str]:
|
||||
"""Rename node id and label for nx graph.
|
||||
|
||||
Args:
|
||||
node (str): The node to rename
|
||||
switch_id_and_label (bool): If True, switch id and label
|
||||
|
||||
Returns:
|
||||
tuple[str, str]: The renamed node id and label
|
||||
"""
|
||||
node_id_str = self._remove_prefix(str(node))
|
||||
node_label_str = str(self._rdf_graph.value(node, rdflib.RDFS.label))
|
||||
if rename_nodes:
|
||||
node_label_str = node_label_str.replace("_", " ")
|
||||
node_label_str = to_lower_sentence_case(node_label_str)
|
||||
nx_id = node_label_str if switch_id_and_label else node_id_str
|
||||
nx_label = node_id_str if switch_id_and_label else node_label_str
|
||||
return nx_id, nx_label
|
||||
|
||||
def _find_root_label(self, g, root_label):
|
||||
# Loop through all labels in the ontology
|
||||
for label_subject, _, label_in_ontology in g.triples(
|
||||
(None, rdflib.RDFS.label, None)
|
||||
):
|
||||
# If the label is the root label, set the root node to the label's subject
|
||||
if str(label_in_ontology) == root_label:
|
||||
root = label_subject
|
||||
break
|
||||
else:
|
||||
labels_in_ontology = []
|
||||
for label_subject, _, label_in_ontology in g.triples(
|
||||
(None, rdflib.RDFS.label, None)
|
||||
):
|
||||
labels_in_ontology.append(str(label_in_ontology))
|
||||
raise ValueError(
|
||||
f"Could not find root node with label '{root_label}'. "
|
||||
f"The ontology contains the following labels: {labels_in_ontology}"
|
||||
)
|
||||
return root
|
||||
|
||||
def _remove_prefix(self, uri: str) -> str:
|
||||
"""
|
||||
Remove the prefix of a URI. URIs can contain either "#" or "/" as a
|
||||
separator between the prefix and the local name. The prefix is
|
||||
everything before the last separator.
|
||||
"""
|
||||
if self._remove_prefixes:
|
||||
return uri.rsplit("#", 1)[-1].rsplit("/", 1)[-1]
|
||||
else:
|
||||
return uri
|
||||
|
||||
def _load_rdf_graph(self, ontology_file):
|
||||
"""
|
||||
Load the ontology into an RDFlib graph. The ontology file can be in
|
||||
OWL, OBO, or RDF/XML format.
|
||||
"""
|
||||
g = rdflib.Graph()
|
||||
g.parse(ontology_file, format=self._get_format(ontology_file))
|
||||
return g
|
||||
|
||||
def _get_format(self, ontology_file):
|
||||
"""
|
||||
Get the format of the ontology file.
|
||||
"""
|
||||
if self._format:
|
||||
if self._format == "owl":
|
||||
return "application/rdf+xml"
|
||||
elif self._format == "obo":
|
||||
raise NotImplementedError("OBO format not yet supported")
|
||||
elif self._format == "rdf":
|
||||
return "application/rdf+xml"
|
||||
elif self._format == "ttl":
|
||||
return self._format
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Could not determine format of ontology file {ontology_file}"
|
||||
)
|
||||
|
||||
if ontology_file.endswith(".owl"):
|
||||
return "application/rdf+xml"
|
||||
elif ontology_file.endswith(".obo"):
|
||||
raise NotImplementedError("OBO format not yet supported")
|
||||
elif ontology_file.endswith(".rdf"):
|
||||
return "application/rdf+xml"
|
||||
elif ontology_file.endswith(".ttl"):
|
||||
return "ttl"
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Could not determine format of ontology file {ontology_file}"
|
||||
)
|
||||
|
||||
def get_nx_graph(self):
|
||||
"""
|
||||
Get the networkx graph representing the ontology.
|
||||
"""
|
||||
return self._nx_graph
|
||||
|
||||
def get_rdf_graph(self):
|
||||
"""
|
||||
Get the RDFlib graph representing the ontology.
|
||||
"""
|
||||
return self._rdf_graph
|
||||
|
||||
def get_root_node(self):
|
||||
"""
|
||||
Get root node in the ontology.
|
||||
|
||||
Returns:
|
||||
root_node: If _switch_label_and_id is True, the root node label is returned,
|
||||
otherwise the root node id is returned.
|
||||
"""
|
||||
|
||||
root_node = None
|
||||
root_label = self._root_label.replace("_", " ")
|
||||
|
||||
if self._switch_label_and_id:
|
||||
root_node = to_lower_sentence_case(root_label)
|
||||
elif not self._switch_label_and_id:
|
||||
for node, data in self.get_nx_graph().nodes(data=True):
|
||||
if "label" in data and data["label"] == to_lower_sentence_case(
|
||||
root_label
|
||||
):
|
||||
root_node = node
|
||||
break
|
||||
|
||||
return root_node
|
||||
|
||||
def get_ancestors(self, node_label):
|
||||
"""
|
||||
Get the ancestors of a node in the ontology.
|
||||
"""
|
||||
return nx.dfs_preorder_nodes(self._nx_graph, node_label)
|
||||
|
||||
def get_head_join_node(self):
|
||||
"""
|
||||
Get the head join node of the ontology.
|
||||
"""
|
||||
return self._head_join_node
|
||||
|
||||
|
||||
class Ontology:
|
||||
"""
|
||||
A class that represents the ontological "backbone" of a BioCypher knowledge
|
||||
graph. The ontology can be built from a single resource, or hybridised from
|
||||
a combination of resources, with one resource being the "head" ontology,
|
||||
while an arbitrary number of other resources can become "tail" ontologies at
|
||||
arbitrary fusion points inside the "head" ontology.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_ontology: dict,
|
||||
ontology_mapping: Optional["OntologyMapping"] = None,
|
||||
tail_ontologies: Optional[dict] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the Ontology class.
|
||||
|
||||
Args:
|
||||
head_ontology (dict): Definition of the head ontology, with keys for
the URL, root node, and optional format and switch_label_and_id
settings.
ontology_mapping (OntologyMapping): The schema mapping used to extend
the ontology with user-defined classes and their properties.
Defaults to None.
|
||||
|
||||
tail_ontologies (dict): A dictionary of tail ontology definitions that will be
|
||||
added to the head ontology. Defaults to None.
|
||||
"""
|
||||
|
||||
self._head_ontology_meta = head_ontology
|
||||
self.mapping = ontology_mapping
|
||||
self._tail_ontology_meta = tail_ontologies
|
||||
|
||||
self._tail_ontologies = None
|
||||
self._nx_graph = None
|
||||
|
||||
# keep track of nodes that have been extended
|
||||
self._extended_nodes = set()
|
||||
|
||||
self._main()
|
||||
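# Illustrative instantiation (hedged sketch; URLs and join node labels are
# hypothetical and only mirror the keys read in _load_ontologies below):
#
#     ontology = Ontology(
#         head_ontology={
#             "url": "https://example.org/biolink-model.owl.ttl",
#             "root_node": "entity",
#             "format": "ttl",
#         },
#         tail_ontologies={
#             "so": {
#                 "url": "https://example.org/so.owl",
#                 "head_join_node": "sequence variant",
#                 "tail_join_node": "sequence_variant",
#             },
#         },
#     )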
|
||||
def _main(self) -> None:
|
||||
"""
|
||||
Main method to be run on instantiation. Loads the ontologies, joins
|
||||
them, and returns the hybrid ontology. Loads only the head ontology
|
||||
if nothing else is given. Adds user extensions and properties from
|
||||
the mapping.
|
||||
"""
|
||||
self._load_ontologies()
|
||||
|
||||
if self._tail_ontologies:
|
||||
for adapter in self._tail_ontologies.values():
|
||||
head_join_node = self._get_head_join_node(adapter)
|
||||
self._join_ontologies(adapter, head_join_node)
|
||||
else:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph()
|
||||
|
||||
if self.mapping:
|
||||
self._extend_ontology()
|
||||
|
||||
# experimental: add connections of disjoint classes to entity
|
||||
# self._connect_biolink_classes()
|
||||
|
||||
self._add_properties()
|
||||
|
||||
def _load_ontologies(self) -> None:
|
||||
"""
|
||||
For each ontology, load the OntologyAdapter object and store it as an
|
||||
instance variable (head) or a dictionary (tail).
|
||||
"""
|
||||
|
||||
logger.info("Loading ontologies...")
|
||||
|
||||
self._head_ontology = OntologyAdapter(
|
||||
ontology_file=self._head_ontology_meta["url"],
|
||||
root_label=self._head_ontology_meta["root_node"],
|
||||
ontology_file_format=self._head_ontology_meta.get("format", None),
|
||||
switch_label_and_id=self._head_ontology_meta.get(
|
||||
"switch_label_and_id", True
|
||||
),
|
||||
)
|
||||
|
||||
if self._tail_ontology_meta:
|
||||
self._tail_ontologies = {}
|
||||
for key, value in self._tail_ontology_meta.items():
|
||||
self._tail_ontologies[key] = OntologyAdapter(
|
||||
ontology_file=value["url"],
|
||||
root_label=value["tail_join_node"],
|
||||
head_join_node_label=value["head_join_node"],
|
||||
ontology_file_format=value.get("format", None),
|
||||
merge_nodes=value.get("merge_nodes", True),
|
||||
switch_label_and_id=value.get("switch_label_and_id", True),
|
||||
)
|
||||
|
||||
def _get_head_join_node(self, adapter: OntologyAdapter) -> str:
|
||||
"""
|
||||
Tries to find the head join node of the given ontology adapter in the
|
||||
head ontology. If the join node is not found, the method will raise an
|
||||
error.
|
||||
|
||||
Args:
|
||||
adapter (OntologyAdapter): The ontology adapter of which to find the
|
||||
join node in the head ontology.
|
||||
"""
|
||||
|
||||
head_join_node = None
|
||||
user_defined_head_join_node_label = adapter.get_head_join_node()
|
||||
head_join_node_label_in_bc_format = to_lower_sentence_case(
|
||||
user_defined_head_join_node_label.replace("_", " ")
|
||||
)
|
||||
|
||||
if self._head_ontology._switch_label_and_id:
|
||||
head_join_node = head_join_node_label_in_bc_format
|
||||
elif not self._head_ontology._switch_label_and_id:
|
||||
for node_id, data in self._head_ontology.get_nx_graph().nodes(
|
||||
data=True
|
||||
):
|
||||
if (
|
||||
"label" in data
|
||||
and data["label"] == head_join_node_label_in_bc_format
|
||||
):
|
||||
head_join_node = node_id
|
||||
break
|
||||
|
||||
if head_join_node not in self._head_ontology.get_nx_graph().nodes:
|
||||
head_ontology = self._head_ontology._rdf_to_nx(
|
||||
self._head_ontology.get_rdf_graph(),
|
||||
self._head_ontology._root_label,
|
||||
self._head_ontology._switch_label_and_id,
|
||||
rename_nodes=False,
|
||||
)
|
||||
raise ValueError(
|
||||
f"Head join node '{head_join_node}' not found in head ontology. "
|
||||
f"The head ontology contains the following nodes: {head_ontology.nodes}."
|
||||
)
|
||||
return head_join_node
|
||||
|
||||
def _join_ontologies(
|
||||
self, adapter: OntologyAdapter, head_join_node
|
||||
) -> None:
|
||||
"""
|
||||
Joins the ontologies by adding the tail ontology as a subgraph to the
|
||||
head ontology at the specified join nodes.
|
||||
|
||||
Args:
|
||||
adapter (OntologyAdapter): The ontology adapter of the tail ontology
|
||||
to be added to the head ontology.
|
||||
"""
|
||||
|
||||
if not self._nx_graph:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
||||
|
||||
tail_join_node = adapter.get_root_node()
|
||||
tail_ontology = adapter.get_nx_graph()
|
||||
|
||||
# subtree of tail ontology at join node
|
||||
tail_ontology_subtree = nx.dfs_tree(
|
||||
tail_ontology.reverse(), tail_join_node
|
||||
).reverse()
|
||||
|
||||
# transfer node attributes from tail ontology to subtree
|
||||
for node in tail_ontology_subtree.nodes:
|
||||
tail_ontology_subtree.nodes[node].update(tail_ontology.nodes[node])
|
||||
|
||||
# if merge_nodes is False, create parent of tail join node from head
|
||||
# join node
|
||||
if not adapter._merge_nodes:
|
||||
# add head join node from head ontology to tail ontology subtree
|
||||
# as parent of tail join node
|
||||
tail_ontology_subtree.add_node(
|
||||
head_join_node,
|
||||
**self._head_ontology.get_nx_graph().nodes[head_join_node],
|
||||
)
|
||||
tail_ontology_subtree.add_edge(tail_join_node, head_join_node)
|
||||
|
||||
# else rename tail join node to match head join node if necessary
|
||||
elif not tail_join_node == head_join_node:
|
||||
tail_ontology_subtree = nx.relabel_nodes(
|
||||
tail_ontology_subtree, {tail_join_node: head_join_node}
|
||||
)
|
||||
|
||||
# combine head ontology and tail subtree
|
||||
self._nx_graph = nx.compose(self._nx_graph, tail_ontology_subtree)
|
||||
|
||||
def _extend_ontology(self) -> None:
|
||||
"""
|
||||
Adds the user extensions to the ontology. Tries to find the parent in
|
||||
the ontology, adds it if necessary, and adds the child and a directed
|
||||
edge from child to parent. Can handle multiple parents.
|
||||
"""
|
||||
|
||||
if not self._nx_graph:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
||||
|
||||
for key, value in self.mapping.extended_schema.items():
|
||||
if not value.get("is_a"):
|
||||
if self._nx_graph.has_node(value.get("synonym_for")):
|
||||
continue
|
||||
|
||||
if not self._nx_graph.has_node(key):
|
||||
raise ValueError(
|
||||
f"Node {key} not found in ontology, but also has no "
|
||||
"inheritance definition. Please check your schema for "
|
||||
"spelling errors, first letter not in lower case, use of underscores, a missing `is_a` definition (SubClassOf a root node), or missing labels in class or super-classes."
|
||||
)
|
||||
|
||||
continue
|
||||
|
||||
parents = to_list(value.get("is_a"))
|
||||
child = key
|
||||
|
||||
while parents:
|
||||
parent = parents.pop(0)
|
||||
|
||||
if parent not in self._nx_graph.nodes:
|
||||
self._nx_graph.add_node(parent)
|
||||
self._nx_graph.nodes[parent][
|
||||
"label"
|
||||
] = sentencecase_to_pascalcase(parent)
|
||||
|
||||
# mark parent as user extension
|
||||
self._nx_graph.nodes[parent]["user_extension"] = True
|
||||
self._extended_nodes.add(parent)
|
||||
|
||||
if child not in self._nx_graph.nodes:
|
||||
self._nx_graph.add_node(child)
|
||||
self._nx_graph.nodes[child][
|
||||
"label"
|
||||
] = sentencecase_to_pascalcase(child)
|
||||
|
||||
# mark child as user extension
|
||||
self._nx_graph.nodes[child]["user_extension"] = True
|
||||
self._extended_nodes.add(child)
|
||||
|
||||
self._nx_graph.add_edge(child, parent)
|
||||
|
||||
child = parent
|
||||
|
||||
def _connect_biolink_classes(self) -> None:
|
||||
"""
|
||||
Experimental: Adds edges from disjoint classes to the entity node.
|
||||
"""
|
||||
|
||||
if not self._nx_graph:
|
||||
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
||||
|
||||
if "entity" not in self._nx_graph.nodes:
|
||||
return
|
||||
|
||||
# biolink classes that are disjoint from entity
|
||||
disjoint_classes = [
|
||||
"frequency qualifier mixin",
|
||||
"chemical entity to entity association mixin",
|
||||
"ontology class",
|
||||
"relationship quantifier",
|
||||
"physical essence or occurrent",
|
||||
"gene or gene product",
|
||||
"subject of investigation",
|
||||
]
|
||||
|
||||
for node in disjoint_classes:
|
||||
if not self._nx_graph.nodes.get(node):
|
||||
self._nx_graph.add_node(node)
|
||||
self._nx_graph.nodes[node][
|
||||
"label"
|
||||
] = sentencecase_to_pascalcase(node)
|
||||
|
||||
self._nx_graph.add_edge(node, "entity")
|
||||
|
||||
def _add_properties(self) -> None:
|
||||
"""
|
||||
For each entity in the mapping, update the ontology with the properties
|
||||
specified in the mapping. Updates synonym information in the graph,
|
||||
setting the synonym as the primary node label.
|
||||
"""
|
||||
|
||||
for key, value in self.mapping.extended_schema.items():
|
||||
if key in self._nx_graph.nodes:
|
||||
self._nx_graph.nodes[key].update(value)
|
||||
|
||||
if value.get("synonym_for"):
|
||||
# change node label to synonym
|
||||
if value["synonym_for"] not in self._nx_graph.nodes:
|
||||
raise ValueError(
|
||||
f'Node {value["synonym_for"]} not found in ontology.'
|
||||
)
|
||||
|
||||
self._nx_graph = nx.relabel_nodes(
|
||||
self._nx_graph, {value["synonym_for"]: key}
|
||||
)
|
||||
|
||||
def get_ancestors(self, node_label: str) -> list:
|
||||
"""
|
||||
Get the ancestors of a node in the ontology.
|
||||
|
||||
Args:
|
||||
node_label (str): The label of the node in the ontology.
|
||||
|
||||
Returns:
|
||||
list: A list of the ancestors of the node.
|
||||
"""
|
||||
return nx.dfs_tree(self._nx_graph, node_label)
|
||||
|
||||
def show_ontology_structure(self, to_disk: str = None, full: bool = False):
|
||||
"""
|
||||
Show the ontology structure using treelib or write to GRAPHML file.
|
||||
|
||||
Args:
|
||||
|
||||
to_disk (str): If specified, the ontology structure will be saved
|
||||
to disk as a GRAPHML file at the location (directory) specified
|
||||
by the `to_disk` string, to be opened in your favourite graph
|
||||
visualisation tool.
|
||||
|
||||
full (bool): If True, the full ontology structure will be shown,
|
||||
including all nodes and edges. If False, only the nodes and
|
||||
edges that are relevant to the extended schema will be shown.
|
||||
"""
|
||||
|
||||
if not full and not self.mapping.extended_schema:
|
||||
raise ValueError(
|
||||
"You are attempting to visualise a subset of the loaded"
|
||||
"ontology, but have not provided a schema configuration. "
|
||||
"To display a partial ontology graph, please provide a schema "
|
||||
"configuration file; to visualise the full graph, please use "
|
||||
"the parameter `full = True`."
|
||||
)
|
||||
|
||||
if not self._nx_graph:
|
||||
raise ValueError("Ontology not loaded.")
|
||||
|
||||
if not self._tail_ontologies:
|
||||
msg = f"Showing ontology structure based on {self._head_ontology._ontology_file}"
|
||||
|
||||
else:
|
||||
msg = f"Showing ontology structure based on {len(self._tail_ontology_meta)+1} ontologies: "
|
||||
|
||||
logger.info(msg)
|
||||
|
||||
if not full:
|
||||
# set of leaves and their intermediate parents up to the root
|
||||
filter_nodes = set(self.mapping.extended_schema.keys())
|
||||
|
||||
for node in self.mapping.extended_schema.keys():
|
||||
filter_nodes.update(self.get_ancestors(node).nodes)
|
||||
|
||||
# filter graph
|
||||
G = self._nx_graph.subgraph(filter_nodes)
|
||||
|
||||
else:
|
||||
G = self._nx_graph
|
||||
|
||||
if not to_disk:
|
||||
# create tree
|
||||
tree = create_tree_visualisation(G)
|
||||
|
||||
# add synonym information
|
||||
for node in self.mapping.extended_schema:
|
||||
if not isinstance(self.mapping.extended_schema[node], dict):
|
||||
continue
|
||||
if self.mapping.extended_schema[node].get("synonym_for"):
|
||||
tree.nodes[node].tag = (
|
||||
f"{node} = "
|
||||
f"{self.mapping.extended_schema[node].get('synonym_for')}"
|
||||
)
|
||||
|
||||
logger.info(f"\n{tree}")
|
||||
|
||||
return tree
|
||||
|
||||
else:
|
||||
# convert lists/dicts to strings for vis only
|
||||
for node in G.nodes:
|
||||
# rename node and use former id as label
|
||||
label = G.nodes[node].get("label")
|
||||
|
||||
if not label:
|
||||
label = node
|
||||
|
||||
G = nx.relabel_nodes(G, {node: label})
|
||||
G.nodes[label]["label"] = node
|
||||
|
||||
for attrib in G.nodes[label]:
|
||||
if type(G.nodes[label][attrib]) in [list, dict]:
|
||||
G.nodes[label][attrib] = str(G.nodes[label][attrib])
|
||||
|
||||
path = os.path.join(to_disk, "ontology_structure.graphml")
|
||||
|
||||
logger.info(f"Writing ontology structure to {path}.")
|
||||
|
||||
nx.write_graphml(G, path)
|
||||
|
||||
return True
|
||||
|
||||
def get_dict(self) -> dict:
|
||||
"""
|
||||
Returns a dictionary compatible with a BioCypher node for compatibility
|
||||
with the Neo4j driver.
|
||||
"""
|
||||
|
||||
d = {
|
||||
"node_id": self._get_current_id(),
|
||||
"node_label": "BioCypher",
|
||||
"properties": {
|
||||
"schema": "self.ontology_mapping.extended_schema",
|
||||
},
|
||||
}
|
||||
|
||||
return d
|
||||
|
||||
def _get_current_id(self):
|
||||
"""
|
||||
Instantiate a version ID for the current session. For now does simple
|
||||
versioning using datetime.
|
||||
|
||||
Can later implement incremental versioning, versioning from
|
||||
config file, or manual specification via argument.
|
||||
"""
|
||||
|
||||
now = datetime.now()
|
||||
return now.strftime("v%Y%m%d-%H%M%S")
|
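# Example (illustrative): _get_current_id() returns a timestamp-based version
# string such as "v20240612-153045", which get_dict() above uses as the
# node_id of the "BioCypher" meta node.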
480
biocypher/_translate.py
Normal file
480
biocypher/_translate.py
Normal file
@ -0,0 +1,480 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'translation' module. Responsible for translating between the raw
|
||||
input data and the BioCypherNode and BioCypherEdge objects.
|
||||
"""
|
||||
from ._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import Any, Union, Optional
|
||||
from collections.abc import Iterable, Generator
|
||||
|
||||
from more_itertools import peekable
|
||||
|
||||
from . import _misc
|
||||
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
from ._ontology import Ontology
|
||||
|
||||
__all__ = ["BiolinkAdapter", "Translator"]
|
||||
|
||||
|
||||
class Translator:
|
||||
"""
|
||||
Class responsible for executing the translation process that is configured in
|
||||
the schema_config.yaml file. Creates a mapping dictionary from that file,
|
||||
and, given nodes and edges, translates them into BioCypherNodes and
|
||||
BioCypherEdges. During this process, can also filter the properties of the
|
||||
entities if the schema_config.yaml file specifies a property whitelist or
|
||||
blacklist.
|
||||
|
||||
Provides utility functions for translating between input and output labels
|
||||
and cypher queries.
|
||||
"""
|
||||
|
||||
def __init__(self, ontology: "Ontology", strict_mode: bool = False):
|
||||
"""
|
||||
Args:
|
||||
ontology:
|
||||
The Ontology object describing the class hierarchy of the graph;
|
||||
its leaves are the entities that will be direct components of the
|
||||
graph, while the intermediary nodes are additional labels for
|
||||
filtering purposes.
|
||||
strict_mode:
|
||||
If True, the translator will raise an error if input data do not
|
||||
carry source, licence, and version information.
|
||||
"""
|
||||
|
||||
self.ontology = ontology
|
||||
self.strict_mode = strict_mode
|
||||
|
||||
# record nodes without biolink type configured in schema_config.yaml
|
||||
self.notype = {}
|
||||
|
||||
# mapping functionality for translating terms and queries
|
||||
self.mappings = {}
|
||||
self.reverse_mappings = {}
|
||||
|
||||
self._update_ontology_types()
|
||||
|
||||
def translate_nodes(
|
||||
self,
|
||||
node_tuples: Iterable,
|
||||
) -> Generator[BioCypherNode, None, None]:
|
||||
"""
|
||||
Translates input node representation to a representation that
|
||||
conforms to the schema of the given BioCypher graph. For now
|
||||
requires explicit statement of node type on pass.
|
||||
|
||||
Args:
|
||||
node_tuples (list of tuples): collection of tuples
|
||||
representing individual nodes by their unique id and a type
|
||||
that is translated from the original database notation to
|
||||
the corresponding BioCypher notation.
|
||||
|
||||
"""
|
||||
|
||||
self._log_begin_translate(node_tuples, "nodes")
|
||||
|
||||
for _id, _type, _props in node_tuples:
|
||||
# check for strict mode requirements
|
||||
required_props = ["source", "licence", "version"]
|
||||
|
||||
if self.strict_mode:
|
||||
# rename 'license' to 'licence' in _props
|
||||
if _props.get("license"):
|
||||
_props["licence"] = _props.pop("license")
|
||||
|
||||
for prop in required_props:
|
||||
if prop not in _props:
|
||||
raise ValueError(
|
||||
f"Property `{prop}` missing from node {_id}. "
|
||||
"Strict mode is enabled, so this is not allowed."
|
||||
)
|
||||
|
||||
# find the node in leaves that represents ontology node type
|
||||
_ontology_class = self._get_ontology_mapping(_type)
|
||||
|
||||
if _ontology_class:
|
||||
# filter properties for those specified in schema_config if any
|
||||
_filtered_props = self._filter_props(_ontology_class, _props)
|
||||
|
||||
# preferred id
|
||||
_preferred_id = self._get_preferred_id(_ontology_class)
|
||||
|
||||
yield BioCypherNode(
|
||||
node_id=_id,
|
||||
node_label=_ontology_class,
|
||||
preferred_id=_preferred_id,
|
||||
properties=_filtered_props,
|
||||
)
|
||||
|
||||
else:
|
||||
self._record_no_type(_type, _id)
|
||||
|
||||
self._log_finish_translate("nodes")
|
||||
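# Illustrative usage (hedged; identifiers, labels, and property values are
# hypothetical):
#
#     node_tuples = [
#         ("uniprot:P12345", "protein",
#          {"source": "UniProt", "licence": "CC BY 4.0", "version": "2024_01"}),
#     ]
#     nodes = list(translator.translate_nodes(node_tuples))
#     # each yielded BioCypherNode carries the ontology class mapped from the
#     # "protein" input label and the properties filtered as described above.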
|
||||
def _get_preferred_id(self, _bl_type: str) -> str:
|
||||
"""
|
||||
Returns the preferred id for the given Biolink type.
|
||||
"""
|
||||
|
||||
return (
|
||||
self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
|
||||
if "preferred_id"
|
||||
in self.ontology.mapping.extended_schema.get(_bl_type, {})
|
||||
else "id"
|
||||
)
|
||||
|
||||
def _filter_props(self, bl_type: str, props: dict) -> dict:
|
||||
"""
|
||||
Filters properties for those specified in schema_config if any.
|
||||
"""
|
||||
|
||||
filter_props = self.ontology.mapping.extended_schema[bl_type].get(
|
||||
"properties", {}
|
||||
)
|
||||
|
||||
# strict mode: add required properties (only if there is a whitelist)
|
||||
if self.strict_mode and filter_props:
|
||||
filter_props.update(
|
||||
{"source": "str", "licence": "str", "version": "str"},
|
||||
)
|
||||
|
||||
exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
|
||||
"exclude_properties", []
|
||||
)
|
||||
|
||||
if isinstance(exclude_props, str):
|
||||
exclude_props = [exclude_props]
|
||||
|
||||
if filter_props and exclude_props:
|
||||
filtered_props = {
|
||||
k: v
|
||||
for k, v in props.items()
|
||||
if (k in filter_props.keys() and k not in exclude_props)
|
||||
}
|
||||
|
||||
elif filter_props:
|
||||
filtered_props = {
|
||||
k: v for k, v in props.items() if k in filter_props.keys()
|
||||
}
|
||||
|
||||
elif exclude_props:
|
||||
filtered_props = {
|
||||
k: v for k, v in props.items() if k not in exclude_props
|
||||
}
|
||||
|
||||
else:
|
||||
return props
|
||||
|
||||
missing_props = [
|
||||
k for k in filter_props.keys() if k not in filtered_props.keys()
|
||||
]
|
||||
# add missing properties with default values
|
||||
for k in missing_props:
|
||||
filtered_props[k] = None
|
||||
|
||||
return filtered_props
|
||||
|
||||
def translate_edges(
|
||||
self,
|
||||
edge_tuples: Iterable,
|
||||
) -> Generator[Union[BioCypherEdge, BioCypherRelAsNode], None, None]:
|
||||
"""
|
||||
Translates input edge representation to a representation that
|
||||
conforms to the schema of the given BioCypher graph. For now
|
||||
requires explicit statement of edge type on pass.
|
||||
|
||||
Args:
|
||||
|
||||
edge_tuples (list of tuples):
|
||||
|
||||
collection of tuples representing source and target of
|
||||
an interaction via their unique ids as well as the type
|
||||
of interaction in the original database notation, which
|
||||
is translated to BioCypher notation using the `leaves`.
|
||||
Can optionally possess its own ID.
|
||||
"""
|
||||
|
||||
self._log_begin_translate(edge_tuples, "edges")
|
||||
|
||||
# legacy: deal with 4-tuples (no edge id)
|
||||
# TODO remove for performance reasons once safe
|
||||
edge_tuples = peekable(edge_tuples)
|
||||
if len(edge_tuples.peek()) == 4:
|
||||
edge_tuples = [
|
||||
(None, src, tar, typ, props)
|
||||
for src, tar, typ, props in edge_tuples
|
||||
]
|
||||
|
||||
for _id, _src, _tar, _type, _props in edge_tuples:
|
||||
# check for strict mode requirements
|
||||
if self.strict_mode:
|
||||
if not "source" in _props:
|
||||
raise ValueError(
|
||||
f"Edge {_id if _id else (_src, _tar)} does not have a `source` property.",
|
||||
" This is required in strict mode.",
|
||||
)
|
||||
if not "licence" in _props:
|
||||
raise ValueError(
|
||||
f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property.",
|
||||
" This is required in strict mode.",
|
||||
)
|
||||
|
||||
# match the input label (_type) to
|
||||
# a Biolink label from schema_config
|
||||
bl_type = self._get_ontology_mapping(_type)
|
||||
|
||||
if bl_type:
|
||||
# filter properties for those specified in schema_config if any
|
||||
_filtered_props = self._filter_props(bl_type, _props)
|
||||
|
||||
rep = self.ontology.mapping.extended_schema[bl_type][
|
||||
"represented_as"
|
||||
]
|
||||
|
||||
if rep == "node":
|
||||
if _id:
|
||||
# if it brings its own ID, use it
|
||||
node_id = _id
|
||||
|
||||
else:
|
||||
# source target concat
|
||||
node_id = (
|
||||
str(_src)
|
||||
+ "_"
|
||||
+ str(_tar)
|
||||
+ "_"
|
||||
+ "_".join(str(v) for v in _filtered_props.values())
|
||||
)
|
||||
|
||||
n = BioCypherNode(
|
||||
node_id=node_id,
|
||||
node_label=bl_type,
|
||||
properties=_filtered_props,
|
||||
)
|
||||
|
||||
# directionality check TODO generalise to account for
|
||||
# different descriptions of directionality or find a
|
||||
# more consistent solution for indicating directionality
|
||||
if _filtered_props.get("directed") is True:
|
||||
l1 = "IS_SOURCE_OF"
|
||||
l2 = "IS_TARGET_OF"
|
||||
|
||||
elif _filtered_props.get(
|
||||
"src_role",
|
||||
) and _filtered_props.get("tar_role"):
|
||||
l1 = _filtered_props.get("src_role")
|
||||
l2 = _filtered_props.get("tar_role")
|
||||
|
||||
else:
|
||||
l1 = l2 = "IS_PART_OF"
|
||||
|
||||
e_s = BioCypherEdge(
|
||||
source_id=_src,
|
||||
target_id=node_id,
|
||||
relationship_label=l1,
|
||||
# additional here
|
||||
)
|
||||
|
||||
e_t = BioCypherEdge(
|
||||
source_id=_tar,
|
||||
target_id=node_id,
|
||||
relationship_label=l2,
|
||||
# additional here
|
||||
)
|
||||
|
||||
yield BioCypherRelAsNode(n, e_s, e_t)
|
||||
|
||||
else:
|
||||
edge_label = self.ontology.mapping.extended_schema[
|
||||
bl_type
|
||||
].get("label_as_edge")
|
||||
|
||||
if edge_label is None:
|
||||
edge_label = bl_type
|
||||
|
||||
yield BioCypherEdge(
|
||||
relationship_id=_id,
|
||||
source_id=_src,
|
||||
target_id=_tar,
|
||||
relationship_label=edge_label,
|
||||
properties=_filtered_props,
|
||||
)
|
||||
|
||||
else:
|
||||
self._record_no_type(_type, (_src, _tar))
|
||||
|
||||
self._log_finish_translate("edges")
|
||||
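# Illustrative usage (hedged; identifiers and labels are hypothetical):
#
#     edge_tuples = [
#         (None, "uniprot:P12345", "uniprot:Q67890", "interacts_with",
#          {"source": "IntAct", "licence": "CC BY 4.0"}),
#     ]
#     edges = list(translator.translate_edges(edge_tuples))
#     # legacy 4-tuples without an edge id are also accepted and padded with
#     # a None id before translation, as handled above.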
|
||||
def _record_no_type(self, _type: Any, what: Any) -> None:
|
||||
"""
|
||||
Records the type of a node or edge that is not represented in the
|
||||
schema_config.
|
||||
"""
|
||||
|
||||
logger.debug(f"No ontology type defined for `{_type}`: {what}")
|
||||
|
||||
if self.notype.get(_type, None):
|
||||
self.notype[_type] += 1
|
||||
|
||||
else:
|
||||
self.notype[_type] = 1
|
||||
|
||||
def get_missing_biolink_types(self) -> dict:
|
||||
"""
|
||||
Returns a dictionary of types that were not represented in the
|
||||
schema_config.
|
||||
"""
|
||||
|
||||
return self.notype
|
||||
|
||||
@staticmethod
|
||||
def _log_begin_translate(_input: Iterable, what: str):
|
||||
n = f"{len(_input)} " if hasattr(_input, "__len__") else ""
|
||||
|
||||
logger.debug(f"Translating {n}{what} to BioCypher")
|
||||
|
||||
@staticmethod
|
||||
def _log_finish_translate(what: str):
|
||||
logger.debug(f"Finished translating {what} to BioCypher.")
|
||||
|
||||
def _update_ontology_types(self):
|
||||
"""
|
||||
Creates a dictionary to translate from input labels to ontology labels.
|
||||
|
||||
If multiple input labels, creates mapping for each.
|
||||
"""
|
||||
|
||||
self._ontology_mapping = {}
|
||||
|
||||
for key, value in self.ontology.mapping.extended_schema.items():
|
||||
labels = value.get("input_label") or value.get("label_in_input")
|
||||
|
||||
if isinstance(labels, str):
|
||||
self._ontology_mapping[labels] = key
|
||||
|
||||
elif isinstance(labels, list):
|
||||
for label in labels:
|
||||
self._ontology_mapping[label] = key
|
||||
|
||||
if value.get("label_as_edge"):
|
||||
self._add_translation_mappings(labels, value["label_as_edge"])
|
||||
|
||||
else:
|
||||
self._add_translation_mappings(labels, key)
|
||||
|
||||
def _get_ontology_mapping(self, label: str) -> Optional[str]:
|
||||
"""
|
||||
For each given input type ("input_label" or "label_in_input"), find the
|
||||
corresponding ontology class in the leaves dictionary (from the
|
||||
`schema_config.yaml`).
|
||||
|
||||
Args:
|
||||
label:
|
||||
The input type to find (`input_label` or `label_in_input` in
|
||||
`schema_config.yaml`).
|
||||
"""
|
||||
|
||||
# commented out until behaviour of _update_bl_types is fixed
|
||||
return self._ontology_mapping.get(label, None)
|
||||
|
||||
def translate_term(self, term):
|
||||
"""
|
||||
Translate a single term.
|
||||
"""
|
||||
|
||||
return self.mappings.get(term, None)
|
||||
|
||||
def reverse_translate_term(self, term):
|
||||
"""
|
||||
Reverse translate a single term.
|
||||
"""
|
||||
|
||||
return self.reverse_mappings.get(term, None)
|
||||
|
||||
def translate(self, query):
|
||||
"""
|
||||
Translate a cypher query. Only translates labels as of now.
|
||||
"""
|
||||
for key in self.mappings:
|
||||
query = query.replace(":" + key, ":" + self.mappings[key])
|
||||
return query
|
||||
|
||||
def reverse_translate(self, query):
|
||||
"""
|
||||
Reverse translate a cypher query. Only translates labels as of
|
||||
now.
|
||||
"""
|
||||
for key in self.reverse_mappings:
|
||||
a = ":" + key + ")"
|
||||
b = ":" + key + "]"
|
||||
# TODO this conditional probably does not cover all cases
|
||||
if a in query or b in query:
|
||||
if isinstance(self.reverse_mappings[key], list):
|
||||
raise NotImplementedError(
|
||||
"Reverse translation of multiple inputs not "
|
||||
"implemented yet. Many-to-one mappings are "
|
||||
"not reversible. "
|
||||
f"({key} -> {self.reverse_mappings[key]})",
|
||||
)
|
||||
else:
|
||||
query = query.replace(
|
||||
a,
|
||||
":" + self.reverse_mappings[key] + ")",
|
||||
).replace(b, ":" + self.reverse_mappings[key] + "]")
|
||||
return query
|
||||
|
||||
def _add_translation_mappings(self, original_name, biocypher_name):
|
||||
"""
|
||||
Add translation mappings for a label and name. We use here the
|
||||
PascalCase version of the BioCypher name, since sentence case is
|
||||
not useful for Cypher queries.
|
||||
"""
|
||||
if isinstance(original_name, list):
|
||||
for on in original_name:
|
||||
self.mappings[on] = self.name_sentence_to_pascal(
|
||||
biocypher_name,
|
||||
)
|
||||
else:
|
||||
self.mappings[original_name] = self.name_sentence_to_pascal(
|
||||
biocypher_name,
|
||||
)
|
||||
|
||||
if isinstance(biocypher_name, list):
|
||||
for bn in biocypher_name:
|
||||
self.reverse_mappings[
|
||||
self.name_sentence_to_pascal(
|
||||
bn,
|
||||
)
|
||||
] = original_name
|
||||
else:
|
||||
self.reverse_mappings[
|
||||
self.name_sentence_to_pascal(
|
||||
biocypher_name,
|
||||
)
|
||||
] = original_name
|
||||
|
||||
@staticmethod
|
||||
def name_sentence_to_pascal(name: str) -> str:
|
||||
"""
|
||||
Converts a name in sentence case to pascal case.
|
||||
"""
|
||||
# split on dots if dot is present
|
||||
if "." in name:
|
||||
return ".".join(
|
||||
[_misc.sentencecase_to_pascalcase(n) for n in name.split(".")],
|
||||
)
|
||||
else:
|
||||
return _misc.sentencecase_to_pascalcase(name)
|
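# Example (illustrative): name_sentence_to_pascal("small molecule") returns
# "SmallMolecule", and "gene.gene product" becomes "Gene.GeneProduct",
# converting each dot-separated segment as implemented above.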
0
biocypher/output/__init__.py
Normal file
0
biocypher/output/__init__.py
Normal file
0
biocypher/output/connect/__init__.py
Normal file
0
biocypher/output/connect/__init__.py
Normal file
422
biocypher/output/connect/_neo4j_driver.py
Normal file
422
biocypher/output/connect/_neo4j_driver.py
Normal file
@ -0,0 +1,422 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# ...
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
from biocypher._logger import logger
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from collections.abc import Iterable
|
||||
import itertools
|
||||
|
||||
import neo4j_utils
|
||||
|
||||
from biocypher import _misc
|
||||
from biocypher._config import config as _config
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode
|
||||
from biocypher._translate import Translator
|
||||
|
||||
__all__ = ["_Neo4jDriver"]
|
||||
|
||||
|
||||
class _Neo4jDriver:
|
||||
"""
|
||||
Manages a BioCypher connection to a Neo4j database using the
|
||||
``neo4j_utils.Driver`` class.
|
||||
|
||||
Args:
|
||||
|
||||
database_name (str): The name of the database to connect to.
|
||||
|
||||
wipe (bool): Whether to wipe the database before importing.
|
||||
|
||||
uri (str): The URI of the database.
|
||||
|
||||
user (str): The username to use for authentication.
|
||||
|
||||
password (str): The password to use for authentication.
|
||||
|
||||
multi_db (bool): Whether to use multi-database mode.
|
||||
|
||||
fetch_size (int): The number of records to fetch at a time.
|
||||
|
||||
increment_version (bool): Whether to increment the version number.
|
||||
|
||||
translator (Translator): The translator to use for mapping.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
database_name: str,
|
||||
uri: str,
|
||||
user: str,
|
||||
password: str,
|
||||
multi_db: bool,
|
||||
translator: Translator,
|
||||
wipe: bool = False,
|
||||
fetch_size: int = 1000,
|
||||
increment_version: bool = True,
|
||||
):
|
||||
self.translator = translator
|
||||
|
||||
self._driver = neo4j_utils.Driver(
|
||||
db_name=database_name,
|
||||
db_uri=uri,
|
||||
db_user=user,
|
||||
db_passwd=password,
|
||||
fetch_size=fetch_size,
|
||||
wipe=wipe,
|
||||
multi_db=multi_db,
|
||||
raise_errors=True,
|
||||
)
|
||||
|
||||
# check for biocypher config in connected graph
|
||||
|
||||
if wipe:
|
||||
self.init_db()
|
||||
|
||||
if increment_version:
|
||||
# set new current version node
|
||||
self._update_meta_graph()
|
||||
|
||||
def _update_meta_graph(self):
|
||||
logger.info("Updating Neo4j meta graph.")
|
||||
|
||||
# find current version node
|
||||
db_version = self._driver.query(
|
||||
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
|
||||
)
|
||||
# add version node
|
||||
self.add_biocypher_nodes(self.translator.ontology)
|
||||
|
||||
# connect version node to previous
|
||||
if db_version[0]:
|
||||
previous = db_version[0][0]
|
||||
previous_id = previous["v"]["id"]
|
||||
e_meta = BioCypherEdge(
|
||||
previous_id,
|
||||
self.translator.ontology.get_dict().get("node_id"),
|
||||
"PRECEDES",
|
||||
)
|
||||
self.add_biocypher_edges(e_meta)
|
||||
|
||||
def init_db(self):
|
||||
"""
|
||||
Used to initialise a property graph database by setting up new
|
||||
constraints. Wipe has been performed by the ``neo4j_utils.Driver``
|
||||
class already.
|
||||
|
||||
Todo:
|
||||
- set up constraint creation interactively depending on the
|
||||
need of the database
|
||||
"""
|
||||
|
||||
logger.info("Initialising database.")
|
||||
self._create_constraints()
|
||||
|
||||
def _create_constraints(self):
|
||||
"""
|
||||
Creates constraints on node types in the graph. Used for
|
||||
initial setup.
|
||||
|
||||
Grabs leaves of the ``schema_config.yaml`` file and creates
|
||||
constraints on the id of all entities represented as nodes.
|
||||
"""
|
||||
|
||||
logger.info("Creating constraints for node types in config.")
|
||||
|
||||
major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
|
||||
# get structure
|
||||
for leaf in self.translator.ontology.mapping.extended_schema.items():
|
||||
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
|
||||
if leaf[1]["represented_as"] == "node":
|
||||
if major_neo4j_version >= 5:
|
||||
s = (
|
||||
f"CREATE CONSTRAINT `{label}_id` "
|
||||
f"IF NOT EXISTS FOR (n:`{label}`) "
|
||||
"REQUIRE n.id IS UNIQUE"
|
||||
)
|
||||
self._driver.query(s)
|
||||
else:
|
||||
s = (
|
||||
f"CREATE CONSTRAINT `{label}_id` "
|
||||
f"IF NOT EXISTS ON (n:`{label}`) "
|
||||
"ASSERT n.id IS UNIQUE"
|
||||
)
|
||||
self._driver.query(s)
|
||||
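# Resulting Cypher for a "protein" leaf (illustrative), on Neo4j >= 5:
#
#     CREATE CONSTRAINT `Protein_id` IF NOT EXISTS
#     FOR (n:`Protein`) REQUIRE n.id IS UNIQUE
#
# On Neo4j 4.x the older `ON (n:...) ASSERT n.id IS UNIQUE` form built above
# is issued instead.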
|
||||
def _get_neo4j_version(self):
|
||||
"""Get neo4j version."""
|
||||
try:
|
||||
neo4j_version = self._driver.query(
|
||||
"""
|
||||
CALL dbms.components()
|
||||
YIELD name, versions, edition
|
||||
UNWIND versions AS version
|
||||
RETURN version AS version
|
||||
""",
|
||||
)[0][0]["version"]
|
||||
return neo4j_version
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Error detecting Neo4j version: {e} use default version 4.0.0."
|
||||
)
|
||||
return "4.0.0"
|
||||
|
||||
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
|
||||
"""
|
||||
Generic node adder method to add any kind of input to the graph via the
|
||||
:class:`biocypher.create.BioCypherNode` class. Employs translation
|
||||
functionality and calls the :meth:`add_biocypher_nodes()` method.
|
||||
|
||||
Args:
|
||||
id_type_tuples (iterable of 3-tuple): for each node to add to
|
||||
the biocypher graph, a 3-tuple with the following layout:
|
||||
first, the (unique if constrained) ID of the node; second, the
|
||||
type of the node, capitalised or PascalCase and in noun form
|
||||
(Neo4j primary label, eg `:Protein`); and third, a dictionary
|
||||
of arbitrary properties the node should possess (can be empty).
|
||||
|
||||
Returns:
|
||||
2-tuple: the query result of :meth:`add_biocypher_nodes()`
|
||||
- first entry: data
|
||||
- second entry: Neo4j summary.
|
||||
"""
|
||||
|
||||
bn = self.translator.translate_nodes(id_type_tuples)
|
||||
return self.add_biocypher_nodes(bn)
|
||||
|
||||
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
|
||||
"""
|
||||
Generic edge adder method to add any kind of input to the graph
|
||||
via the :class:`biocypher.create.BioCypherEdge` class. Employs
|
||||
translation functionality and calls the
|
||||
:meth:`add_biocypher_edges()` method.
|
||||
|
||||
Args:
|
||||
|
||||
id_src_tar_type_tuples (iterable of 5-tuple):
|
||||
|
||||
for each edge to add to the biocypher graph, a 5-tuple
|
||||
with the following layout: first, the optional unique ID
|
||||
of the interaction. This can be `None` if there is no
|
||||
systematic identifier (which for many interactions is
|
||||
the case). Second and third, the (unique if constrained)
|
||||
IDs of the source and target nodes of the relationship;
|
||||
fourth, the type of the relationship; and fifth, a
|
||||
dictionary of arbitrary properties the edge should
|
||||
possess (can be empty).
|
||||
|
||||
Returns:
|
||||
|
||||
2-tuple: the query result of :meth:`add_biocypher_edges()`
|
||||
|
||||
- first entry: data
|
||||
- second entry: Neo4j summary.
|
||||
"""
|
||||
|
||||
bn = self.translator.translate_edges(id_src_tar_type_tuples)
|
||||
return self.add_biocypher_edges(bn)
|
||||
|
||||
def add_biocypher_nodes(
|
||||
self,
|
||||
nodes: Iterable[BioCypherNode],
|
||||
explain: bool = False,
|
||||
profile: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Accepts a node type handoff class
|
||||
(:class:`biocypher.create.BioCypherNode`) with id,
|
||||
label, and a dict of properties (passing on the type of
|
||||
property, i.e., ``int``, ``str``, ...).
|
||||
|
||||
The dict retrieved by the
|
||||
:meth:`biocypher.create.BioCypherNode.get_dict()` method is
|
||||
passed into Neo4j as a map of maps, explicitly encoding node id
|
||||
and label, and adding all other properties from the 'properties'
|
||||
key of the dict. The merge is performed via APOC, matching only
|
||||
on node id to prevent duplicates. The same properties are set on
|
||||
match and on create, irrespective of the actual event.
|
||||
|
||||
Args:
|
||||
nodes:
|
||||
An iterable of :class:`biocypher.create.BioCypherNode` objects.
|
||||
explain:
|
||||
Call ``EXPLAIN`` on the CYPHER query.
|
||||
profile:
|
||||
Do profiling on the CYPHER query.
|
||||
|
||||
Returns:
|
||||
True for success, False otherwise.
|
||||
"""
|
||||
|
||||
try:
|
||||
nodes = _misc.to_list(nodes)
|
||||
|
||||
entities = [node.get_dict() for node in nodes]
|
||||
|
||||
except AttributeError:
|
||||
msg = "Nodes must have a `get_dict` method."
|
||||
logger.error(msg)
|
||||
|
||||
raise ValueError(msg)
|
||||
|
||||
logger.info(f"Merging {len(entities)} nodes.")
|
||||
|
||||
entity_query = (
|
||||
"UNWIND $entities AS ent "
|
||||
"CALL apoc.merge.node([ent.node_label], "
|
||||
"{id: ent.node_id}, ent.properties, ent.properties) "
|
||||
"YIELD node "
|
||||
"RETURN node"
|
||||
)
|
||||
|
||||
method = "explain" if explain else "profile" if profile else "query"
|
||||
|
||||
result = getattr(self._driver, method)(
|
||||
entity_query,
|
||||
parameters={
|
||||
"entities": entities,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info("Finished merging nodes.")
|
||||
|
||||
return result
|
||||
|
||||
def add_biocypher_edges(
|
||||
self,
|
||||
edges: Iterable[BioCypherEdge],
|
||||
explain: bool = False,
|
||||
profile: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Accepts an edge type handoff class
|
||||
(:class:`biocypher.create.BioCypherEdge`) with source
|
||||
and target ids, label, and a dict of properties (passing on the
|
||||
type of property, i.e., int, string ...).
|
||||
|
||||
The individual edge is either passed as a singleton, in the case
|
||||
of representation as an edge in the graph, or as a 4-tuple, in
|
||||
the case of representation as a node (with two edges connecting
|
||||
to interaction partners).
|
||||
|
||||
The dict retrieved by the
|
||||
:meth:`biocypher.create.BioCypherEdge.get_dict()` method is
|
||||
passed into Neo4j as a map of maps, explicitly encoding source
|
||||
and target ids and the relationship label, and adding all edge
|
||||
properties from the 'properties' key of the dict. The merge is
|
||||
performed via APOC, matching only on source and target id to
|
||||
prevent duplicates. The same properties are set on match and on
|
||||
create, irrespective of the actual event.
|
||||
|
||||
Args:
|
||||
edges:
|
||||
An iterable of :class:`biocypher.create.BioCypherEdge` objects.
|
||||
explain:
|
||||
Call ``EXPLAIN`` on the CYPHER query.
|
||||
profile:
|
||||
Do profiling on the CYPHER query.
|
||||
|
||||
Returns:
|
||||
`True` for success, `False` otherwise.
|
||||
"""
|
||||
|
||||
edges = _misc.ensure_iterable(edges)
|
||||
edges = itertools.chain(*(_misc.ensure_iterable(i) for i in edges))
|
||||
|
||||
nodes = []
|
||||
rels = []
|
||||
|
||||
try:
|
||||
for e in edges:
|
||||
if hasattr(e, "get_node"):
|
||||
nodes.append(e.get_node())
|
||||
rels.append(e.get_source_edge().get_dict())
|
||||
rels.append(e.get_target_edge().get_dict())
|
||||
|
||||
else:
|
||||
rels.append(e.get_dict())
|
||||
|
||||
except AttributeError:
|
||||
msg = "Edges and nodes must have a `get_dict` method."
|
||||
logger.error(msg)
|
||||
|
||||
raise ValueError(msg)
|
||||
|
||||
self.add_biocypher_nodes(nodes)
|
||||
logger.info(f"Merging {len(rels)} edges.")
|
||||
|
||||
# cypher query
|
||||
|
||||
# merging only on the ids of the entities, passing the
|
||||
# properties on match and on create;
|
||||
# TODO add node labels?
|
||||
node_query = (
|
||||
"UNWIND $rels AS r "
|
||||
"MERGE (src {id: r.source_id}) "
|
||||
"MERGE (tar {id: r.target_id}) "
|
||||
)
|
||||
|
||||
self._driver.query(node_query, parameters={"rels": rels})
|
||||
|
||||
edge_query = (
|
||||
"UNWIND $rels AS r "
|
||||
"MATCH (src {id: r.source_id}) "
|
||||
"MATCH (tar {id: r.target_id}) "
|
||||
"WITH src, tar, r "
|
||||
"CALL apoc.merge.relationship"
|
||||
"(src, r.relationship_label, NULL, "
|
||||
"r.properties, tar, r.properties) "
|
||||
"YIELD rel "
|
||||
"RETURN rel"
|
||||
)
|
||||
|
||||
method = "explain" if explain else "profile" if profile else "query"
|
||||
|
||||
result = getattr(self._driver, method)(
|
||||
edge_query, parameters={"rels": rels}
|
||||
)
|
||||
|
||||
logger.info("Finished merging edges.")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_driver(
|
||||
dbms: str,
|
||||
translator: "Translator",
|
||||
):
|
||||
"""
|
||||
Function to return the writer class.
|
||||
|
||||
Returns:
|
||||
class: the writer class
|
||||
"""
|
||||
|
||||
dbms_config = _config(dbms)
|
||||
|
||||
if dbms == "neo4j":
|
||||
return _Neo4jDriver(
|
||||
database_name=dbms_config["database_name"],
|
||||
wipe=dbms_config["wipe"],
|
||||
uri=dbms_config["uri"],
|
||||
user=dbms_config["user"],
|
||||
password=dbms_config["password"],
|
||||
multi_db=dbms_config["multi_db"],
|
||||
translator=translator,
|
||||
)
|
||||
|
||||
return None
|
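# Illustrative call (hedged; assumes a "neo4j" section in the BioCypher
# configuration providing database_name, uri, user, password, wipe, and
# multi_db, as read above):
#
#     driver = get_driver(dbms="neo4j", translator=translator)
#     driver.add_nodes(id_type_tuples)
#     driver.add_edges(id_src_tar_type_tuples)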
0
biocypher/output/in_memory/__init__.py
Normal file
0
biocypher/output/in_memory/__init__.py
Normal file
90
biocypher/output/in_memory/_pandas.py
Normal file
90
biocypher/output/in_memory/_pandas.py
Normal file
@ -0,0 +1,90 @@
|
||||
import pandas as pd
|
||||
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
|
||||
|
||||
class Pandas:
|
||||
def __init__(self, translator, deduplicator):
|
||||
self.translator = translator
|
||||
self.deduplicator = deduplicator
|
||||
|
||||
self.dfs = {}
|
||||
|
||||
def _separate_entity_types(self, entities):
|
||||
"""
|
||||
Given mixed iterable of BioCypher objects, separate them into lists by
|
||||
type. Also deduplicates using the `Deduplicator` instance.
|
||||
"""
|
||||
lists = {}
|
||||
for entity in entities:
|
||||
if (
|
||||
not isinstance(entity, BioCypherNode)
|
||||
and not isinstance(entity, BioCypherEdge)
|
||||
and not isinstance(entity, BioCypherRelAsNode)
|
||||
):
|
||||
raise TypeError(
|
||||
"Expected a BioCypherNode / BioCypherEdge / "
|
||||
f"BioCypherRelAsNode, got {type(entity)}."
|
||||
)
|
||||
|
||||
if isinstance(entity, BioCypherNode):
|
||||
seen = self.deduplicator.node_seen(entity)
|
||||
elif isinstance(entity, BioCypherEdge):
|
||||
seen = self.deduplicator.edge_seen(entity)
|
||||
elif isinstance(entity, BioCypherRelAsNode):
|
||||
seen = self.deduplicator.rel_as_node_seen(entity)
|
||||
|
||||
if seen:
|
||||
continue
|
||||
|
||||
if isinstance(entity, BioCypherRelAsNode):
|
||||
node = entity.get_node()
|
||||
source_edge = entity.get_source_edge()
|
||||
target_edge = entity.get_target_edge()
|
||||
|
||||
_type = node.get_type()
|
||||
if _type not in lists:
|
||||
lists[_type] = []
|
||||
lists[_type].append(node)
|
||||
|
||||
_source_type = source_edge.get_type()
|
||||
if _source_type not in lists:
|
||||
lists[_source_type] = []
|
||||
lists[_source_type].append(source_edge)
|
||||
|
||||
_target_type = target_edge.get_type()
|
||||
if _target_type not in lists:
|
||||
lists[_target_type] = []
|
||||
lists[_target_type].append(target_edge)
|
||||
continue
|
||||
|
||||
_type = entity.get_type()
|
||||
if _type not in lists:
|
||||
lists[_type] = []
|
||||
lists[_type].append(entity)
|
||||
|
||||
return lists
|
||||
|
||||
def add_tables(self, entities):
|
||||
"""
|
||||
Add Pandas dataframes for each node and edge type in the input.
|
||||
"""
|
||||
|
||||
lists = self._separate_entity_types(entities)
|
||||
|
||||
for _type, _entities in lists.items():
|
||||
self._add_entity_df(_type, _entities)
|
||||
|
||||
def _add_entity_df(self, _type, _entities):
|
||||
df = pd.DataFrame(
|
||||
pd.json_normalize([node.get_dict() for node in _entities])
|
||||
)
|
||||
# replace "properties." with "" in column names
|
||||
df.columns = [col.replace("properties.", "") for col in df.columns]
|
||||
if _type not in self.dfs:
|
||||
self.dfs[_type] = df
|
||||
else:
|
||||
self.dfs[_type] = pd.concat(
|
||||
[self.dfs[_type], df], ignore_index=True
|
||||
)
|
||||
return self.dfs[_type]
|
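# Illustrative usage (hedged; the "protein" key is hypothetical and depends on
# the node types present in the input):
#
#     in_memory = Pandas(translator, deduplicator)
#     in_memory.add_tables(translator.translate_nodes(node_tuples))
#     protein_df = in_memory.dfs.get("protein")
#     # one DataFrame per node/edge type, with "properties.*" columns
#     # flattened as implemented in _add_entity_df above.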
0
biocypher/output/write/__init__.py
Normal file
0
biocypher/output/write/__init__.py
Normal file
1046
biocypher/output/write/_batch_writer.py
Normal file
1046
biocypher/output/write/_batch_writer.py
Normal file
File diff suppressed because it is too large
113
biocypher/output/write/_get_writer.py
Normal file
113
biocypher/output/write/_get_writer.py
Normal file
@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Sebastian Lobentanzer
|
||||
# Michael Hartung
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'offline' module. Handles the writing of node and edge representations
|
||||
suitable for import into a DBMS.
|
||||
"""
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write.graph._rdf import _RDFWriter
|
||||
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
|
||||
from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
|
||||
from biocypher.output.write.graph._networkx import _NetworkXWriter
|
||||
from biocypher.output.write.relational._csv import _PandasCSVWriter
|
||||
from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
|
||||
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
|
||||
|
||||
logger.debug(f"Loading module {__name__}.")
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from biocypher._config import config as _config
|
||||
|
||||
__all__ = ["get_writer", "DBMS_TO_CLASS"]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from biocypher._translate import Translator
|
||||
from biocypher._deduplicate import Deduplicator
|
||||
|
||||
DBMS_TO_CLASS = {
|
||||
"neo": _Neo4jBatchWriter,
|
||||
"neo4j": _Neo4jBatchWriter,
|
||||
"Neo4j": _Neo4jBatchWriter,
|
||||
"postgres": _PostgreSQLBatchWriter,
|
||||
"postgresql": _PostgreSQLBatchWriter,
|
||||
"PostgreSQL": _PostgreSQLBatchWriter,
|
||||
"arango": _ArangoDBBatchWriter,
|
||||
"arangodb": _ArangoDBBatchWriter,
|
||||
"ArangoDB": _ArangoDBBatchWriter,
|
||||
"sqlite": _SQLiteBatchWriter,
|
||||
"sqlite3": _SQLiteBatchWriter,
|
||||
"rdf": _RDFWriter,
|
||||
"RDF": _RDFWriter,
|
||||
"csv": _PandasCSVWriter,
|
||||
"CSV": _PandasCSVWriter,
|
||||
"pandas": _PandasCSVWriter,
|
||||
"Pandas": _PandasCSVWriter,
|
||||
"networkx": _NetworkXWriter,
|
||||
"NetworkX": _NetworkXWriter,
|
||||
}
|
||||
|
||||
|
||||
def get_writer(
|
||||
dbms: str,
|
||||
translator: "Translator",
|
||||
deduplicator: "Deduplicator",
|
||||
output_directory: str,
|
||||
strict_mode: bool,
|
||||
):
|
||||
"""
|
||||
Function to return the writer class based on the selection in the config
|
||||
file.
|
||||
|
||||
Args:
|
||||
dbms: the database management system; for options, see DBMS_TO_CLASS.
|
||||
translator: the Translator object.
|
||||
deduplicator: the Deduplicator object.
|
||||
output_directory: the directory to write the output files to.
|
||||
strict_mode: whether to use strict mode.
|
||||
|
||||
Returns:
|
||||
instance: an instance of the selected writer class.
|
||||
"""
|
||||
|
||||
dbms_config = _config(dbms)
|
||||
|
||||
writer = DBMS_TO_CLASS.get(dbms)
|
||||
|
||||
if not writer:
|
||||
raise ValueError(f"Unknown dbms: {dbms}")
|
||||
|
||||
if writer is not None:
|
||||
return writer(
|
||||
translator=translator,
|
||||
deduplicator=deduplicator,
|
||||
delimiter=dbms_config.get("delimiter"),
|
||||
array_delimiter=dbms_config.get("array_delimiter"),
|
||||
quote=dbms_config.get("quote_character"),
|
||||
output_directory=output_directory,
|
||||
db_name=dbms_config.get("database_name"),
|
||||
import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
|
||||
import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
|
||||
wipe=dbms_config.get("wipe"),
|
||||
strict_mode=strict_mode,
|
||||
skip_bad_relationships=dbms_config.get(
|
||||
"skip_bad_relationships"
|
||||
), # neo4j
|
||||
skip_duplicate_nodes=dbms_config.get(
|
||||
"skip_duplicate_nodes"
|
||||
), # neo4j
|
||||
db_user=dbms_config.get("user"), # psql
|
||||
db_password=dbms_config.get("password"), # psql
|
||||
db_port=dbms_config.get("port"), # psql
|
||||
rdf_format=dbms_config.get("rdf_format"), # rdf
|
||||
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
|
||||
)
|
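# Illustrative call (hedged; the dbms section keys mirror those read above):
#
#     writer = get_writer(
#         dbms="neo4j",
#         translator=translator,
#         deduplicator=deduplicator,
#         output_directory="biocypher-out/example-run",
#         strict_mode=False,
#     )
#     writer.write_nodes(nodes)  # hypothetical downstream call on the writer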
200
biocypher/output/write/_writer.py
Normal file
200
biocypher/output/write/_writer.py
Normal file
@ -0,0 +1,200 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Union, Optional
|
||||
from collections.abc import Iterable
|
||||
import os
|
||||
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
||||
from biocypher._logger import logger
|
||||
from biocypher._translate import Translator
|
||||
from biocypher._deduplicate import Deduplicator
|
||||
|
||||
__all__ = ["_Writer"]
|
||||
|
||||
|
||||
class _Writer(ABC):
|
||||
"""Abstract class for writing node and edge representations to disk.
|
||||
Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
|
||||
are implemented in the child classes. Any concrete writer needs to
|
||||
implement at least:
|
||||
- _write_node_data
|
||||
- _write_edge_data
|
||||
- _construct_import_call
|
||||
- _get_import_script_name
|
||||
|
||||
Args:
|
||||
translator (Translator): Instance of :py:class:`Translator` to enable translation of
|
||||
nodes and manipulation of properties.
|
||||
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
|
||||
of nodes and edges.
|
||||
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
|
||||
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
|
||||
|
||||
Raises:
|
||||
NotImplementedError: Writer implementation must override '_write_node_data'
|
||||
NotImplementedError: Writer implementation must override '_write_edge_data'
|
||||
NotImplementedError: Writer implementation must override '_construct_import_call'
|
||||
NotImplementedError: Writer implementation must override '_get_import_script_name'
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
translator: Translator,
|
||||
deduplicator: Deduplicator,
|
||||
output_directory: Optional[str] = None,
|
||||
strict_mode: bool = False,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""Abstract class for writing node and edge representations to disk.
|
||||
|
||||
Args:
|
||||
translator (Translator): Instance of :py:class:`Translator` to enable translation of
|
||||
nodes and manipulation of properties.
|
||||
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
|
||||
of nodes and edges.
|
||||
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
|
||||
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
|
||||
"""
|
||||
self.translator = translator
|
||||
self.deduplicator = deduplicator
|
||||
self.strict_mode = strict_mode
|
||||
self.output_directory = output_directory
|
||||
|
||||
if os.path.exists(self.output_directory):
|
||||
if kwargs.get("write_to_file", True):
|
||||
logger.warning(
|
||||
f"Output directory `{self.output_directory}` already exists. "
|
||||
"If this is not planned, file consistency may be compromised."
|
||||
)
|
||||
else:
|
||||
logger.info(f"Creating output directory `{self.output_directory}`.")
|
||||
os.makedirs(self.output_directory)
|
||||
|
||||
@abstractmethod
|
||||
def _write_node_data(
|
||||
self,
|
||||
nodes: Iterable[
|
||||
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
|
||||
],
|
||||
) -> bool:
|
||||
"""Implement how to output.write nodes to disk.
|
||||
|
||||
Args:
|
||||
nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override 'write_nodes'"
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def _write_edge_data(
|
||||
self,
|
||||
edges: Iterable[
|
||||
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
|
||||
],
|
||||
) -> bool:
|
||||
"""Implement how to output.write edges to disk.
|
||||
|
||||
Args:
|
||||
edges (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override 'write_edges'"
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: command for importing the output files into a DBMS.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override '_construct_import_call'"
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""Returns the name of the import script.
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"Writer implementation must override '_get_import_script_name'"
|
||||
)
|
||||
|
||||
def write_nodes(
|
||||
self, nodes, batch_size: int = int(1e6), force: bool = False
|
||||
):
|
||||
"""Wrapper for writing nodes.
|
||||
|
||||
Args:
|
||||
nodes (BioCypherNode): a list or generator of nodes in
|
||||
:py:class:`BioCypherNode` format
|
||||
batch_size (int): The batch size for writing nodes.
|
||||
force (bool): Whether to force writing nodes even if their type is
|
||||
not present in the schema.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
passed = self._write_node_data(nodes)
|
||||
if not passed:
|
||||
logger.error("Error while writing node data.")
|
||||
return False
|
||||
return True
|
||||
|
||||
def write_edges(
|
||||
self, edges, batch_size: int = int(1e6), force: bool = False
|
||||
):
|
||||
"""Wrapper for writing edges.
|
||||
|
||||
Args:
|
||||
edges (BioCypherEdge): a list or generator of edges in
|
||||
:py:class:`BioCypherEdge` format
|
||||
batch_size (int): The batch size for writing edges.
|
||||
force (bool): Whether to force writing edges even if their type is
|
||||
not present in the schema.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
passed = self._write_edge_data(edges)
|
||||
if not passed:
|
||||
logger.error("Error while writing edge data.")
|
||||
return False
|
||||
return True
|
||||
|
||||
def write_import_call(self):
|
||||
"""
|
||||
Function to write the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name, to the export folder as txt.
|
||||
|
||||
Returns:
|
||||
str: The path of the file holding the import call.
|
||||
"""
|
||||
file_path = os.path.join(
|
||||
self.output_directory, self._get_import_script_name()
|
||||
)
|
||||
logger.info(
|
||||
f"Writing {self.__class__.__name__} import call to `{file_path}`."
|
||||
)
|
||||
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
f.write(self._construct_import_call())
|
||||
|
||||
return file_path
|
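To illustrate the contract of `_Writer`, here is a minimal hypothetical subclass; the class name and its trivial bodies are invented for illustration only (the real writers follow in the modules below):

class _NullWriter(_Writer):
    """Hypothetical writer that discards all data; shows the required overrides."""

    def _write_node_data(self, nodes) -> bool:
        # a real writer would serialize the BioCypherNode objects here
        return True

    def _write_edge_data(self, edges) -> bool:
        # likewise for BioCypherEdge / BioCypherRelAsNode objects
        return True

    def _construct_import_call(self) -> str:
        # nothing to import into, so the import call is empty
        return ""

    def _get_import_script_name(self) -> str:
        return "null-import-call.sh"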
0
biocypher/output/write/graph/__init__.py
Normal file
241
biocypher/output/write/graph/_arangodb.py
Normal file
@ -0,0 +1,241 @@
|
||||
import os
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
|
||||
|
||||
|
||||
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to disk using the format
|
||||
specified by ArangoDB for the use of "arangoimport". Output files are
|
||||
similar to Neo4j, but with a different header format.
|
||||
"""
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default prefix for the ArangoDB import binary location
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the ArangoDB import script
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "arangodb-import-call.sh"
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of node.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.node_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.node_property_dict.items():
|
||||
# create header CSV with ID, properties, labels
|
||||
|
||||
_id = "_key"
|
||||
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
|
||||
# check if file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"File {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k in props.keys():
|
||||
props_list.append(f"{k}")
|
||||
|
||||
# create list of lists and flatten
|
||||
# removes need for empty check of property list
|
||||
out_list = [[_id], props_list]
|
||||
out_list = [val for sublist in out_list for val in sublist]
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# concatenate with delimiter
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add collection from schema config
|
||||
collection = self.translator.ontology.mapping.extended_schema[
|
||||
label
|
||||
].get("db_collection_name", None)
|
||||
|
||||
# add file path to the arangoimport statement
|
||||
# do once for each part file
|
||||
parts = self.parts.get(label, [])
|
||||
|
||||
if not parts:
|
||||
raise ValueError(
|
||||
f"No parts found for node label {label}. "
|
||||
f"Check that the data was parsed first.",
|
||||
)
|
||||
|
||||
for part in parts:
|
||||
import_call_header_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
import_call_parts_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
part,
|
||||
)
|
||||
|
||||
self.import_call_nodes.add(
|
||||
(
|
||||
import_call_header_path,
|
||||
import_call_parts_path,
|
||||
collection,
|
||||
)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.edge_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.edge_property_dict.items():
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
# paths
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
parts = f"{pascal_label}-part.*"
|
||||
|
||||
# check for file exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"Header file {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k in props.keys():
|
||||
props_list.append(f"{k}")
|
||||
|
||||
out_list = ["_from", "_key", *props_list, "_to"]
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# concatenate with delimiter
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add collection from schema config
|
||||
if not self.translator.ontology.mapping.extended_schema.get(label):
|
||||
for (
|
||||
_,
|
||||
v,
|
||||
) in self.translator.ontology.mapping.extended_schema.items():
|
||||
if v.get("label_as_edge") == label:
|
||||
collection = v.get("db_collection_name", None)
|
||||
break
|
||||
|
||||
else:
|
||||
collection = self.translator.ontology.mapping.extended_schema[
|
||||
label
|
||||
].get("db_collection_name", None)
|
||||
|
||||
# add file path to the arangoimport statement (import call path
|
||||
# may be different from actual output path)
|
||||
header_import_call_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
parts_import_call_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
parts,
|
||||
)
|
||||
self.import_call_edges.add(
|
||||
(
|
||||
header_import_call_path,
|
||||
parts_import_call_path,
|
||||
collection,
|
||||
)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for the ArangoDB import
|
||||
"""
|
||||
import_call = (
|
||||
f"{self.import_call_bin_prefix}arangoimp "
|
||||
f"--type csv "
|
||||
f'--separator="{self.escaped_delim}" '
|
||||
)
|
||||
|
||||
if self.quote == "'":
|
||||
import_call += f'--quote="{self.quote}" '
|
||||
else:
|
||||
import_call += f"--quote='{self.quote}' "
|
||||
|
||||
node_lines = ""
|
||||
|
||||
# node import calls: one line per node type
|
||||
for header_path, parts_path, collection in self.import_call_nodes:
|
||||
line = (
|
||||
f"{import_call} "
|
||||
f"--headers-file {header_path} "
|
||||
f"--file= {parts_path} "
|
||||
)
|
||||
|
||||
if collection:
|
||||
line += f"--create-collection --collection {collection} "
|
||||
|
||||
node_lines += f"{line}\n"
|
||||
|
||||
edge_lines = ""
|
||||
|
||||
# edge import calls: one line per edge type
|
||||
for header_path, parts_path, collection in self.import_call_edges:
|
||||
line = f"{import_call} --headers-file {header_path} --file={parts_path} "
|
||||
if collection:
|
||||
line += f"--create-collection --collection {collection} "
|
||||
edge_lines += f"{line}\n"
|
||||
|
||||
return node_lines + edge_lines
|
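For illustration, assuming a `;` delimiter, a `"` quote character, and a node label `Protein` mapped to the collection `proteins`, one node line of the generated script would look roughly like the following (file names shortened; actual header and part paths depend on the configured import call prefix):

arangoimp --type csv --separator=";" --quote='"' --headers-file Protein-header.csv --file=Protein-part000.csv --create-collection --collection proteins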
502
biocypher/output/write/graph/_neo4j.py
Normal file
@ -0,0 +1,502 @@
|
||||
import os
|
||||
import glob
|
||||
import pandas as pd
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
|
||||
|
||||
|
||||
class _Neo4jBatchWriter(_BatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to disk using the
|
||||
format specified by Neo4j for the use of admin import. Each batch
|
||||
writer instance has a fixed representation that needs to be passed
|
||||
at instantiation via the :py:attr:`schema` argument. The instance
|
||||
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
|
||||
to convert and extend the hierarchy.
|
||||
|
||||
This class inherits from the abstract class "_BatchWriter" and implements the
|
||||
Neo4j-specific methods:
|
||||
|
||||
- _write_node_headers
|
||||
- _write_edge_headers
|
||||
- _construct_import_call
|
||||
- _write_array_string
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""
|
||||
Constructor.
|
||||
|
||||
Check the version of Neo4j and adds a command scope if version >= 5.
|
||||
|
||||
Returns:
|
||||
_Neo4jBatchWriter: An instance of the writer.
|
||||
"""
|
||||
|
||||
# Should read the configuration and setup import_call_bin_prefix.
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the neo4j admin import location
|
||||
"""
|
||||
|
||||
return "bin/"
|
||||
|
||||
def _write_array_string(self, string_list):
|
||||
"""
|
||||
Abstract method to write the string representation of an array into a .csv file
|
||||
as required by the neo4j admin-import.
|
||||
|
||||
Args:
|
||||
string_list (list): list of ontology strings
|
||||
|
||||
Returns:
|
||||
str: The string representation of an array for the neo4j admin import
|
||||
"""
|
||||
string = self.adelim.join(string_list)
|
||||
return f"{self.quote}{string}{self.quote}"
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of node.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.node_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.node_property_dict.items():
|
||||
_id = ":ID"
|
||||
|
||||
##MeDaX dev remark:
|
||||
##From Fhir data we get case sensitive labels. E.g. 'Procedure' and 'procedure' are two distinct node types.
|
||||
##Because we are converting Resources to more specific node classes using their "resourceType" attribute.
|
||||
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(
|
||||
parse_label(label)
|
||||
)
|
||||
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
parts = f"{pascal_label}-part.*"
|
||||
|
||||
existing_header = False
|
||||
# check if file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"Header file `{header_path}` already exists. Overwriting.",
|
||||
)
|
||||
with open(header_path, "r", encoding="utf-8") as existing:
|
||||
existing_header = existing.read().strip().split(self.delim)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k, v in props.items():
|
||||
if v in ["int", "long", "integer"]:
|
||||
props_list.append(f"{k}:long")
|
||||
elif v in ["int[]", "long[]", "integer[]"]:
|
||||
props_list.append(f"{k}:long[]")
|
||||
elif v in ["float", "double", "dbl"]:
|
||||
props_list.append(f"{k}:double")
|
||||
elif v in ["float[]", "double[]"]:
|
||||
props_list.append(f"{k}:double[]")
|
||||
elif v in ["bool", "boolean"]:
|
||||
# TODO Neo4j boolean support / spelling?
|
||||
props_list.append(f"{k}:boolean")
|
||||
elif v in ["bool[]", "boolean[]"]:
|
||||
props_list.append(f"{k}:boolean[]")
|
||||
elif v in ["str[]", "string[]"]:
|
||||
props_list.append(f"{k}:string[]")
|
||||
else:
|
||||
props_list.append(f"{k}")
|
||||
|
||||
# create list of lists and flatten
|
||||
out_list = [[_id], props_list, [":LABEL"]]
|
||||
out_list = [val for sublist in out_list for val in sublist]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# Check if header file already exists and has different columns
|
||||
if os.path.exists(header_path):
|
||||
if existing_header:
|
||||
#existing_header = existing.read().strip().split(self.delim)
|
||||
# Compare existing and new headers
|
||||
if set(existing_header) != set(out_list):
|
||||
|
||||
# Get part files associated with this header
|
||||
base_name = os.path.basename(header_path).replace("-header.csv", "")
|
||||
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
|
||||
|
||||
|
||||
# Find the highest numbered part file without full sorting
|
||||
highest_part = None
|
||||
highest_number = -1
|
||||
|
||||
for part_file in part_files:
|
||||
try:
|
||||
# Extract number from filename (assuming format like "part123.csv")
|
||||
file_name = os.path.basename(part_file)
|
||||
number_part = file_name.split("part")[1].split(".")[0]
|
||||
number = int(number_part)
|
||||
|
||||
if number > highest_number:
|
||||
highest_number = number
|
||||
highest_part = part_file
|
||||
except (IndexError, ValueError):
|
||||
# Skip files that don't match the expected pattern
|
||||
continue
|
||||
# Update each part file with the new columns
|
||||
for part_file in part_files:
|
||||
if part_file == highest_part:
|
||||
print(f"Skipping the highest part file: {highest_part}")
|
||||
continue
|
||||
try:
|
||||
#print("exi: ", existing_header)
|
||||
#print("out: ", out_list)
|
||||
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
|
||||
# Read the file without headers
|
||||
|
||||
# Write back to file WITHOUT including the header
|
||||
df.to_csv(part_file, sep=self.delim, index=False, header=False)
|
||||
print(f"Updated {part_file} with new columns in correct positions")
|
||||
except Exception as e:
|
||||
print(f"Error updating {part_file}: {e}")
|
||||
|
||||
# Write the new header
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
|
||||
# add file path to neo4j-admin import statement (import call file
|
||||
# path may be different from actual file path)
|
||||
import_call_header_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
import_call_parts_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
parts,
|
||||
)
|
||||
self.import_call_nodes.add(
|
||||
(import_call_header_path, import_call_parts_path)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Writes single CSV file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.edge_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.edge_property_dict.items():
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(
|
||||
parse_label(label)
|
||||
)
|
||||
|
||||
# paths
|
||||
header = f"{pascal_label}-header.csv"
|
||||
header_path = os.path.join(
|
||||
self.outdir,
|
||||
header,
|
||||
)
|
||||
parts = f"{pascal_label}-part.*"
|
||||
|
||||
# check for file exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"File {header_path} already exists. Overwriting."
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
props_list = []
|
||||
for k, v in props.items():
|
||||
if v in ["int", "long", "integer"]:
|
||||
props_list.append(f"{k}:long")
|
||||
elif v in ["int[]", "long[]", "integer[]"]:
|
||||
props_list.append(f"{k}:long[]")
|
||||
elif v in ["float", "double"]:
|
||||
props_list.append(f"{k}:double")
|
||||
elif v in ["float[]", "double[]"]:
|
||||
props_list.append(f"{k}:double[]")
|
||||
elif v in [
|
||||
"bool",
|
||||
"boolean",
|
||||
]: # TODO does Neo4j support bool?
|
||||
props_list.append(f"{k}:boolean")
|
||||
elif v in ["bool[]", "boolean[]"]:
|
||||
props_list.append(f"{k}:boolean[]")
|
||||
elif v in ["str[]", "string[]"]:
|
||||
props_list.append(f"{k}:string[]")
|
||||
else:
|
||||
props_list.append(f"{k}")
|
||||
|
||||
skip_id = False
|
||||
schema_label = None
|
||||
|
||||
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
|
||||
skip_id = True
|
||||
elif not self.translator.ontology.mapping.extended_schema.get(
|
||||
label
|
||||
):
|
||||
# find label in schema by label_as_edge
|
||||
for (
|
||||
k,
|
||||
v,
|
||||
) in self.translator.ontology.mapping.extended_schema.items():
|
||||
if v.get("label_as_edge") == label:
|
||||
schema_label = k
|
||||
break
|
||||
else:
|
||||
schema_label = label
|
||||
|
||||
out_list = [":START_ID"]
|
||||
|
||||
if schema_label:
|
||||
if (
|
||||
self.translator.ontology.mapping.extended_schema.get(
|
||||
schema_label
|
||||
).get("use_id")
|
||||
is False
|
||||
):
|
||||
skip_id = True
|
||||
|
||||
if not skip_id:
|
||||
out_list.append("id")
|
||||
|
||||
out_list.extend(props_list)
|
||||
out_list.extend([":END_ID", ":TYPE"])
|
||||
|
||||
existing_header = False
|
||||
# check if file already exists
|
||||
if os.path.exists(header_path):
|
||||
logger.warning(
|
||||
f"Header file `{header_path}` already exists. Overwriting.",
|
||||
)
|
||||
with open(header_path, "r", encoding="utf-8") as existing:
|
||||
existing_header = existing.read().strip().split(self.delim)
|
||||
|
||||
|
||||
with open(header_path, "w", encoding="utf-8") as f:
|
||||
# Check if header file already exists and has different columns
|
||||
if os.path.exists(header_path):
|
||||
if existing_header:
|
||||
#existing_header = existing.read().strip().split(self.delim)
|
||||
# Compare existing and new headers
|
||||
if set(existing_header) != set(out_list):
|
||||
|
||||
# Get part files associated with this header
|
||||
base_name = os.path.basename(header_path).replace("-header.csv", "")
|
||||
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
|
||||
|
||||
|
||||
# Find the highest numbered part file without full sorting
|
||||
highest_part = None
|
||||
highest_number = -1
|
||||
|
||||
for part_file in part_files:
|
||||
try:
|
||||
# Extract number from filename (assuming format like "part123.csv")
|
||||
file_name = os.path.basename(part_file)
|
||||
number_part = file_name.split("part")[1].split(".")[0]
|
||||
number = int(number_part)
|
||||
|
||||
if number > highest_number:
|
||||
highest_number = number
|
||||
highest_part = part_file
|
||||
except (IndexError, ValueError):
|
||||
# Skip files that don't match the expected pattern
|
||||
continue
|
||||
# Update each part file with the new columns
|
||||
for part_file in part_files:
|
||||
if part_file == highest_part:
|
||||
print(f"Skipping the highest part file: {highest_part}")
|
||||
continue
|
||||
try:
|
||||
print("exi: ", existing_header)
|
||||
print("out: ", out_list)
|
||||
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
|
||||
# Read the file without headers
|
||||
|
||||
# Write back to file WITHOUT including the header
|
||||
df.to_csv(part_file, sep=self.delim, index=False, header=False)
|
||||
print(f"Updated {part_file} with new columns in correct positions")
|
||||
except Exception as e:
|
||||
print(f"Error updating {part_file}: {e}")
|
||||
|
||||
# Write the new header
|
||||
row = self.delim.join(out_list)
|
||||
f.write(row)
|
||||
|
||||
# add file path to neo4j-admin import statement (import call file
|
||||
# path may be different from actual file path)
|
||||
import_call_header_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
header,
|
||||
)
|
||||
import_call_parts_path = os.path.join(
|
||||
self.import_call_file_prefix,
|
||||
parts,
|
||||
)
|
||||
self.import_call_edges.add(
|
||||
(import_call_header_path, import_call_parts_path)
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the neo4j admin import script
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "neo4j-admin-import-call.sh"
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for neo4j-admin import
|
||||
"""
|
||||
import_call_neo4j_v4 = self._get_import_call(
|
||||
"import", "--database=", "--force="
|
||||
)
|
||||
import_call_neo4j_v5 = self._get_import_call(
|
||||
"database import full", "", "--overwrite-destination="
|
||||
)
|
||||
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
|
||||
|
||||
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
|
||||
return import_script
|
||||
|
||||
def _get_import_call(
|
||||
self, import_cmd: str, database_cmd: str, wipe_cmd: str
|
||||
) -> str:
|
||||
"""Get parametrized import call for Neo4j 4 or 5+.
|
||||
|
||||
Args:
|
||||
import_cmd (str): The import command to use.
|
||||
database_cmd (str): The database command to use.
|
||||
wipe_cmd (str): The wipe command to use.
|
||||
|
||||
Returns:
|
||||
str: The import call.
|
||||
"""
|
||||
import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
|
||||
|
||||
import_call += f"{database_cmd}{self.db_name} "
|
||||
|
||||
import_call += f'--delimiter="{self.escaped_delim}" '
|
||||
|
||||
import_call += f'--array-delimiter="{self.escaped_adelim}" '
|
||||
|
||||
if self.quote == "'":
|
||||
import_call += f'--quote="{self.quote}" '
|
||||
else:
|
||||
import_call += f"--quote='{self.quote}' "
|
||||
|
||||
if self.wipe:
|
||||
import_call += f"{wipe_cmd}true "
|
||||
if self.skip_bad_relationships:
|
||||
import_call += "--skip-bad-relationships=true "
|
||||
if self.skip_duplicate_nodes:
|
||||
import_call += "--skip-duplicate-nodes=true "
|
||||
|
||||
# append node import calls
|
||||
for header_path, parts_path in self.import_call_nodes:
|
||||
import_call += f'--nodes="{header_path},{parts_path}" '
|
||||
|
||||
# append edge import calls
|
||||
for header_path, parts_path in self.import_call_edges:
|
||||
import_call += f'--relationships="{header_path},{parts_path}" '
|
||||
|
||||
return import_call
|
||||
|
||||
|
||||
|
||||
|
||||
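For illustration, with the default `bin/` prefix, database name `neo4j`, delimiter `;`, array delimiter `|`, and one node and one edge type, the script assembled by `_construct_import_call` and `_get_import_call` has roughly this shape (file names and paths are placeholders):

#!/bin/bash
version=$(bin/neo4j-admin --version | cut -d '.' -f 1)
if [[ $version -ge 5 ]]; then
    bin/neo4j-admin database import full neo4j --delimiter=";" --array-delimiter="|" --quote="'" --nodes="Protein-header.csv,Protein-part.*" --relationships="Interaction-header.csv,Interaction-part.*"
else
    bin/neo4j-admin import --database=neo4j --delimiter=";" --array-delimiter="|" --quote="'" --nodes="Protein-header.csv,Protein-part.*" --relationships="Interaction-header.csv,Interaction-part.*"
fi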
def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
|
||||
"""
|
||||
Adapt a CSV table to a new header structure, placing new columns in their correct positions.
|
||||
|
||||
Parameters:
|
||||
old_header (list): The original header columns
|
||||
new_header (list): The new header columns
|
||||
csv_file_path (str): Path to the CSV file
|
||||
|
||||
Returns:
|
||||
pandas.DataFrame: CSV data with the new header structure
|
||||
"""
|
||||
|
||||
# Step 1: Read the CSV data without headers
|
||||
df = pd.read_csv(csv_file_path, sep=self.delim, header=None)
|
||||
|
||||
# Step 2: If the file is empty, return empty DataFrame with new headers
|
||||
if df.empty:
|
||||
return pd.DataFrame(columns=new_header)
|
||||
|
||||
# Step 3: If column count doesn't match old_header length, handle the mismatch
|
||||
if len(df.columns) != len(old_header):
|
||||
print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})")
|
||||
# If file has fewer columns than old_header, pad with NaN
|
||||
if len(df.columns) < len(old_header):
|
||||
for i in range(len(df.columns), len(old_header)):
|
||||
df[i] = None
|
||||
# If file has more columns than old_header, truncate
|
||||
else:
|
||||
df = df.iloc[:, :len(old_header)]
|
||||
|
||||
# Step 4: Assign old header names to the dataframe
|
||||
df.columns = old_header
|
||||
|
||||
# Step 5: Create a new DataFrame with the correct structure
|
||||
new_df = pd.DataFrame(columns=new_header)
|
||||
|
||||
# Step 6: For each column in the new header, find its position in the old header
|
||||
for new_col_idx, new_col in enumerate(new_header):
|
||||
if new_col in old_header:
|
||||
# If column exists in old header, copy data
|
||||
new_df[new_col] = df[new_col]
|
||||
else:
|
||||
# If new column, add empty column
|
||||
new_df[new_col] = None
|
||||
|
||||
# Step 7: Ensure columns are in the exact order of new_header
|
||||
new_df = new_df[new_header]
|
||||
|
||||
return new_df
|
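A minimal sketch of how `adapt_csv_to_new_header` realigns an existing part file when the header gains a column; the header lists and file name are invented for illustration, and `writer` stands for an existing `_Neo4jBatchWriter` instance:

# hypothetical case: the new schema adds a "taxon" property in the middle
old_header = [":ID", "name", ":LABEL"]
new_header = [":ID", "name", "taxon", ":LABEL"]

df = writer.adapt_csv_to_new_header(old_header, new_header, "Protein-part000.csv")

# rows now carry an empty "taxon" column in the correct position; write back
# without a header row, since neo4j-admin import reads column names from the
# separate header file
df.to_csv("Protein-part000.csv", sep=writer.delim, index=False, header=False)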
76
biocypher/output/write/graph/_networkx.py
Normal file
@ -0,0 +1,76 @@
|
||||
import pickle
|
||||
|
||||
import networkx as nx
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._writer import _Writer
|
||||
from biocypher.output.write.relational._csv import _PandasCSVWriter
|
||||
|
||||
|
||||
class _NetworkXWriter(_Writer):
|
||||
"""
|
||||
Class for writing node and edges to a networkx DiGraph.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
|
||||
self.G = nx.DiGraph()
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
|
||||
|
||||
Returns:
|
||||
str: Python code to load the pickled networkx graph.
|
||||
"""
|
||||
logger.info(
|
||||
f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
|
||||
)
|
||||
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
|
||||
pickle.dump(self.G, f)
|
||||
|
||||
import_call = "import pickle\n"
|
||||
import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
|
||||
return import_call
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""Function to return the name of the import script."""
|
||||
return "import_networkx.py"
|
||||
|
||||
def _write_node_data(self, nodes) -> bool:
|
||||
passed = self.csv_writer._write_entities_to_file(nodes)
|
||||
self.add_to_networkx()
|
||||
return passed
|
||||
|
||||
def _write_edge_data(self, edges) -> bool:
|
||||
passed = self.csv_writer._write_entities_to_file(edges)
|
||||
self.add_to_networkx()
|
||||
return passed
|
||||
|
||||
def add_to_networkx(self) -> bool:
|
||||
all_dfs = self.csv_writer.stored_dfs
|
||||
node_dfs = [
|
||||
df
|
||||
for df in all_dfs.values()
|
||||
if df.columns.str.contains("node_id").any()
|
||||
]
|
||||
edge_dfs = [
|
||||
df
|
||||
for df in all_dfs.values()
|
||||
if df.columns.str.contains("source_id").any()
|
||||
and df.columns.str.contains("target_id").any()
|
||||
]
|
||||
for df in node_dfs:
|
||||
nodes = df.set_index("node_id").to_dict(orient="index")
|
||||
self.G.add_nodes_from(nodes.items())
|
||||
for df in edge_dfs:
|
||||
edges = df.set_index(["source_id", "target_id"]).to_dict(
|
||||
orient="index"
|
||||
)
|
||||
self.G.add_edges_from(
|
||||
(
|
||||
(source, target, attrs)
|
||||
for (source, target), attrs in edges.items()
|
||||
)
|
||||
)
|
||||
return True
|
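The import call written by this class is itself Python; after a run it can be used essentially verbatim, assuming the working directory is the BioCypher output directory:

import pickle

with open("./networkx_graph.pkl", "rb") as f:
    G_loaded = pickle.load(f)

print(G_loaded.number_of_nodes(), G_loaded.number_of_edges())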
515
biocypher/output/write/graph/_rdf.py
Normal file
@ -0,0 +1,515 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Loes van den Biggelaar
|
||||
# Sebastian Lobentanzer
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'offline' module. Handles the writing of node and edge representations
|
||||
suitable for import into a DBMS.
|
||||
"""
|
||||
from types import GeneratorType
|
||||
from typing import Union
|
||||
import os
|
||||
|
||||
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
|
||||
from rdflib.namespace import (
|
||||
_NAMESPACE_PREFIXES_CORE,
|
||||
_NAMESPACE_PREFIXES_RDFLIB,
|
||||
)
|
||||
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._batch_writer import _BatchWriter
|
||||
|
||||
|
||||
class _RDFWriter(_BatchWriter):
|
||||
"""
|
||||
Class to write BioCypher's property graph into an RDF format using
|
||||
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
|
||||
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
|
||||
is done keeping only the minimum information about node and edges,
|
||||
skipping all properties.
|
||||
"""
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the RDF admin import script.
|
||||
This function is not applicable for RDF export.
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "rdf-import-call.sh"
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the RDF admin import location
|
||||
"""
|
||||
return "bin/"
|
||||
|
||||
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
|
||||
"""
|
||||
Function to check if the specified RDF format is supported.
|
||||
|
||||
Args:
|
||||
rdf_format (str): The RDF format to check.
|
||||
|
||||
Returns:
|
||||
bool: Returns True if rdf format supported, False otherwise.
|
||||
"""
|
||||
supported_formats = [
|
||||
"xml",
|
||||
"n3",
|
||||
"turtle",
|
||||
"nt",
|
||||
"pretty-xml",
|
||||
"trix",
|
||||
"trig",
|
||||
"nquads",
|
||||
"json-ld",
|
||||
]
|
||||
if rdf_format not in supported_formats:
|
||||
logger.error(
|
||||
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
|
||||
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
|
||||
)
|
||||
return False
|
||||
else:
|
||||
# RDF graph does not support the 'ttl' format, only 'turtle'. However, the preferred file extension is always '.ttl'.
|
||||
if self.rdf_format == "turtle":
|
||||
self.extension = "ttl"
|
||||
elif self.rdf_format == "ttl":
|
||||
self.rdf_format = "turtle"
|
||||
self.extension = "ttl"
|
||||
else:
|
||||
self.extension = self.rdf_format
|
||||
return True
|
||||
|
||||
def _write_single_edge_list_to_file(
|
||||
self,
|
||||
edge_list: list,
|
||||
label: str,
|
||||
prop_dict: dict,
|
||||
):
|
||||
"""
|
||||
This function takes one list of biocypher edges and writes them
|
||||
to an RDF file with the given format.
|
||||
|
||||
Args:
|
||||
edge_list (list): list of BioCypherEdges to be written
|
||||
|
||||
label (str): the label (type) of the edge
|
||||
|
||||
prop_dict (dict): properties of node class passed from parsing
|
||||
function and their types
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
|
||||
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
|
||||
logger.error("Edges must be passed as type BioCypherEdge.")
|
||||
return False
|
||||
|
||||
# translate label to PascalCase
|
||||
label_pascal = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
# create file name
|
||||
file_name = os.path.join(
|
||||
self.outdir, f"{label_pascal}.{self.extension}"
|
||||
)
|
||||
|
||||
# write data in graph
|
||||
graph = Graph()
|
||||
self._init_namespaces(graph)
|
||||
|
||||
for edge in edge_list:
|
||||
rdf_subject = edge.get_source_id()
|
||||
rdf_object = edge.get_target_id()
|
||||
rdf_predicate = edge.get_id()
|
||||
rdf_properties = edge.get_properties()
|
||||
if rdf_predicate is None:
|
||||
rdf_predicate = rdf_subject + rdf_object
|
||||
|
||||
edge_label = self.translator.name_sentence_to_pascal(
|
||||
edge.get_label()
|
||||
)
|
||||
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
|
||||
graph.add((edge_uri, RDF.type, RDFS.Class))
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][rdf_predicate],
|
||||
RDF.type,
|
||||
edge_uri,
|
||||
)
|
||||
)
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][rdf_predicate],
|
||||
self.rdf_namespaces["biocypher"]["subject"],
|
||||
self.subject_to_uri(rdf_subject),
|
||||
)
|
||||
)
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][rdf_predicate],
|
||||
self.rdf_namespaces["biocypher"]["object"],
|
||||
self.subject_to_uri(rdf_object),
|
||||
)
|
||||
)
|
||||
|
||||
# add properties to the transformed edge --> node
|
||||
for key, value in rdf_properties.items():
|
||||
# only write value if it exists.
|
||||
if value:
|
||||
self.add_property_to_graph(graph, rdf_predicate, value, key)
|
||||
|
||||
graph.serialize(destination=file_name, format=self.rdf_format)
|
||||
|
||||
logger.info(
|
||||
f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def add_property_to_graph(
|
||||
self,
|
||||
graph: Graph,
|
||||
rdf_subject: str,
|
||||
rdf_object: str,
|
||||
rdf_predicate: str,
|
||||
):
|
||||
"""
|
||||
Function to add the properties to an RDF node. It takes the graph, the subject, object, and predicate of the RDF triple.
|
||||
It checks if the property is a list and adds it to the graph accordingly. Otherwise, it checks whether the string represents a list.
|
||||
If it does, it transforms it into a list and adds it to the graph. If not, it adds the property to the graph as a literal.
|
||||
If the property is neither a list nor a string, it is also added as a literal.
|
||||
|
||||
Args:
|
||||
graph (RDFLib.Graph): The RDF graph to add the nodes to.
|
||||
|
||||
rdf_subject (str): The subject of the RDF triple.
|
||||
|
||||
rdf_object (str): The object of the RDF triple.
|
||||
|
||||
rdf_predicate (str): The predicate of the RDF triple.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
if isinstance(rdf_object, list):
|
||||
for obj in rdf_object:
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
self.property_to_uri(rdf_predicate),
|
||||
Literal(obj),
|
||||
)
|
||||
)
|
||||
elif isinstance(rdf_object, str):
|
||||
if rdf_object.startswith("[") and rdf_object.endswith("]"):
|
||||
self.add_property_to_graph(
|
||||
graph,
|
||||
rdf_subject,
|
||||
self.transform_string_to_list(rdf_object),
|
||||
rdf_predicate,
|
||||
)
|
||||
else:
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
self.property_to_uri(rdf_predicate),
|
||||
Literal(rdf_object),
|
||||
)
|
||||
)
|
||||
else:
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
self.property_to_uri(rdf_predicate),
|
||||
Literal(rdf_object),
|
||||
)
|
||||
)
|
||||
|
||||
def transform_string_to_list(self, string_list: str) -> list:
|
||||
"""
|
||||
Function to transform a string representation of a list into a list.
|
||||
|
||||
Args:
|
||||
string_list (str): The string representation of the list.
|
||||
|
||||
Returns:
|
||||
list: The list representation of the input string.
|
||||
"""
|
||||
return (
|
||||
string_list.replace("[", "")
|
||||
.replace("]", "")
|
||||
.replace("'", "")
|
||||
.split(", ")
|
||||
)
|
||||
|
||||
def _write_single_node_list_to_file(
|
||||
self,
|
||||
node_list: list,
|
||||
label: str,
|
||||
prop_dict: dict,
|
||||
labels: str,
|
||||
):
|
||||
"""
|
||||
This function takes a list of BioCypherNodes and writes them
|
||||
to an RDF file in the specified format.
|
||||
|
||||
Args:
|
||||
node_list (list): A list of BioCypherNodes to be written.
|
||||
|
||||
label (str): The label (type) of the nodes.
|
||||
|
||||
prop_dict (dict): A dictionary of properties and their types for the node class.
|
||||
|
||||
Returns:
|
||||
bool: True if the writing is successful, False otherwise.
|
||||
"""
|
||||
if not all(isinstance(n, BioCypherNode) for n in node_list):
|
||||
logger.error("Nodes must be passed as type BioCypherNode.")
|
||||
return False
|
||||
|
||||
# translate label to PascalCase
|
||||
label_pascal = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
# create file name
|
||||
file_name = os.path.join(
|
||||
self.outdir, f"{label_pascal}.{self.extension}"
|
||||
)
|
||||
|
||||
# write data in graph
|
||||
graph = Graph()
|
||||
self._init_namespaces(graph)
|
||||
|
||||
for n in node_list:
|
||||
rdf_subject = n.get_id()
|
||||
rdf_object = n.get_label()
|
||||
properties = n.get_properties()
|
||||
class_name = self.translator.name_sentence_to_pascal(rdf_object)
|
||||
graph.add(
|
||||
(
|
||||
self.rdf_namespaces["biocypher"][class_name],
|
||||
RDF.type,
|
||||
RDFS.Class,
|
||||
)
|
||||
)
|
||||
graph.add(
|
||||
(
|
||||
self.subject_to_uri(rdf_subject),
|
||||
RDF.type,
|
||||
self.rdf_namespaces["biocypher"][class_name],
|
||||
)
|
||||
)
|
||||
for key, value in properties.items():
|
||||
# only write value if it exists.
|
||||
if value:
|
||||
self.add_property_to_graph(graph, rdf_subject, value, key)
|
||||
|
||||
graph.serialize(destination=file_name, format=self.rdf_format)
|
||||
|
||||
logger.info(
|
||||
f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def write_nodes(
|
||||
self, nodes, batch_size: int = int(1e6), force: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
|
||||
|
||||
Args:
|
||||
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
|
||||
batch_size (int): The number of nodes to write in each batch.
|
||||
force (bool): Flag to force the writing even if the output file already exists.
|
||||
|
||||
Returns:
|
||||
bool: True if the writing is successful, False otherwise.
|
||||
"""
|
||||
# check if specified output format is correct
|
||||
passed = self._is_rdf_format_supported(self.rdf_format)
|
||||
if not passed:
|
||||
logger.error("Error while writing node data, wrong RDF format")
|
||||
return False
|
||||
# write node data using _write_node_data method
|
||||
passed = self._write_node_data(nodes, batch_size, force)
|
||||
if not passed:
|
||||
logger.error("Error while writing node data.")
|
||||
return False
|
||||
return True
|
||||
|
||||
def write_edges(
|
||||
self,
|
||||
edges: Union[list, GeneratorType],
|
||||
batch_size: int = int(1e6),
|
||||
) -> bool:
|
||||
"""
|
||||
Wrapper for writing edges in RDF format. It calls _write_edge_data()
|
||||
function, specifying its edge data.
|
||||
|
||||
Args:
|
||||
edges (BioCypherEdge): a list or generator of edges in
|
||||
:py:class:`BioCypherEdge` format
|
||||
batch_size (int): The number of edges to write in each batch.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# check if specified output format is correct
|
||||
passed = self._is_rdf_format_supported(self.rdf_format)
|
||||
if not passed:
|
||||
logger.error("Error while writing edge data, wrong RDF format")
|
||||
return False
|
||||
# write edge data using _write_edge_data method
|
||||
passed = self._write_edge_data(edges, batch_size=batch_size)
|
||||
if not passed:
|
||||
logger.error("Error while writing edge data.")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to write the import call.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
str: An empty string, since no import call is needed for RDF.
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _write_array_string(self, string_list):
|
||||
"""
|
||||
Abstract method to write the string representation of an array into a .csv file
|
||||
as required by the RDF admin-import.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Args:
|
||||
string_list (list): list of ontology strings
|
||||
|
||||
Returns:
|
||||
bool: True, since array serialization is not applicable for RDF.
|
||||
"""
|
||||
|
||||
return True
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Abstract method that takes care of importing properties of a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Abstract method to write a database import-file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
def subject_to_uri(self, subject: str) -> str:
|
||||
"""
|
||||
Converts the subject to a proper URI using the available namespaces.
|
||||
If the conversion fails, it defaults to the biocypher prefix.
|
||||
|
||||
Args:
|
||||
subject (str): The subject to be converted to a URI.
|
||||
|
||||
Returns:
|
||||
str: The corresponding URI for the subject.
|
||||
"""
|
||||
try:
|
||||
_pref, _id = subject.split(":")
|
||||
|
||||
if _pref in self.rdf_namespaces.keys():
|
||||
return self.rdf_namespaces[_pref][_id]
|
||||
else:
|
||||
return self.rdf_namespaces["biocypher"][subject]
|
||||
except ValueError:
|
||||
return self.rdf_namespaces["biocypher"][subject]
|
||||
|
||||
def property_to_uri(self, property_name: str) -> dict[str, str]:
|
||||
"""
|
||||
Converts a property name to its corresponding URI.
|
||||
|
||||
This function takes a property name and searches for its corresponding URI in various namespaces.
|
||||
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
|
||||
|
||||
Args:
|
||||
property_name (str): The property name to be converted to a URI.
|
||||
|
||||
Returns:
|
||||
str: The corresponding URI for the input property name.
|
||||
"""
|
||||
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
|
||||
for namespace in _NAMESPACE_PREFIXES_CORE.values():
|
||||
if property_name in namespace:
|
||||
return namespace[property_name]
|
||||
|
||||
# If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
|
||||
for namespace in [SKOS, DC, DCTERMS]:
|
||||
if property_name in namespace:
|
||||
return namespace[property_name]
|
||||
|
||||
# If the property name is still not found, try other namespaces from rdflib.
|
||||
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
|
||||
if property_name in namespace:
|
||||
return namespace[property_name]
|
||||
|
||||
# If the property name is "licence", it recursively calls the function with "license" as the input.
|
||||
if property_name == "licence":
|
||||
return self.property_to_uri("license")
|
||||
|
||||
# TODO: add an option to search through manually implemented namespaces
|
||||
|
||||
# If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
|
||||
# TODO: give a warning and try to prevent this option altogether
|
||||
return self.rdf_namespaces["biocypher"][property_name]
|
||||
|
||||
def _init_namespaces(self, graph: Graph):
|
||||
"""
|
||||
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
|
||||
|
||||
This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
|
||||
If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
|
||||
the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.
|
||||
|
||||
Args:
|
||||
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
# add biocypher standard to self.rdf_namespaces
|
||||
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
|
||||
if not self.rdf_namespaces:
|
||||
self.rdf_namespaces = biocypher_standard
|
||||
else:
|
||||
self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
|
||||
|
||||
for key, value in self.rdf_namespaces.items():
|
||||
namespace = Namespace(value)
|
||||
self.rdf_namespaces[key] = namespace
|
||||
graph.bind(key, namespace)
|
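To make the namespace resolution concrete, a small sketch of `subject_to_uri` behaviour, assuming `writer` is an `_RDFWriter` instance and that a `uniprot` namespace has been configured (the URL is illustrative):

from rdflib import Graph

# hypothetical namespace configuration, normally read from biocypher_config.yaml
writer.rdf_namespaces = {"uniprot": "https://purl.uniprot.org/uniprot/"}

graph = Graph()
writer._init_namespaces(graph)  # also adds the biocypher default namespace

writer.subject_to_uri("uniprot:P12345")
# -> https://purl.uniprot.org/uniprot/P12345 (prefix found in rdf_namespaces)

writer.subject_to_uri("some_plain_id")
# -> https://biocypher.org/biocypher#some_plain_id (fallback to the biocypher namespace)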
0
biocypher/output/write/relational/__init__.py
Normal file
76
biocypher/output/write/relational/_csv.py
Normal file
@ -0,0 +1,76 @@
|
||||
from more_itertools import peekable
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._writer import _Writer
|
||||
from biocypher.output.in_memory._pandas import Pandas
|
||||
|
||||
|
||||
class _PandasCSVWriter(_Writer):
|
||||
"""
|
||||
Class for writing node and edge representations to a CSV file.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, write_to_file: bool = True, **kwargs):
|
||||
kwargs["write_to_file"] = write_to_file
|
||||
super().__init__(*args, **kwargs)
|
||||
self.in_memory_dfs = {}
|
||||
self.stored_dfs = {}
|
||||
self.pandas_in_memory = Pandas(
|
||||
translator=self.translator,
|
||||
deduplicator=self.deduplicator,
|
||||
)
|
||||
self.delimiter = kwargs.get("delimiter")
|
||||
if not self.delimiter:
|
||||
self.delimiter = ","
|
||||
self.write_to_file = write_to_file
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
|
||||
|
||||
Returns:
|
||||
str: Python code to load the csv files into Pandas dfs.
|
||||
"""
|
||||
import_call = "import pandas as pd\n\n"
|
||||
for df_name in self.stored_dfs.keys():
|
||||
import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
|
||||
return import_call
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""Function to return the name of the import script."""
|
||||
return "import_pandas_csv.py"
|
||||
|
||||
def _write_node_data(self, nodes) -> bool:
|
||||
passed = self._write_entities_to_file(nodes)
|
||||
return passed
|
||||
|
||||
def _write_edge_data(self, edges) -> bool:
|
||||
passed = self._write_entities_to_file(edges)
|
||||
return passed
|
||||
|
||||
def _write_entities_to_file(self, entities: iter) -> bool:
|
||||
"""Function to output.write the entities to a CSV file.
|
||||
|
||||
Args:
|
||||
entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
||||
"""
|
||||
entities = peekable(entities)
|
||||
entity_list = self.pandas_in_memory._separate_entity_types(entities)
|
||||
for entity_type, entities in entity_list.items():
|
||||
self.in_memory_dfs[
|
||||
entity_type
|
||||
] = self.pandas_in_memory._add_entity_df(entity_type, entities)
|
||||
for entity_type in self.in_memory_dfs.keys():
|
||||
entity_df = self.in_memory_dfs[entity_type]
|
||||
if " " in entity_type or "." in entity_type:
|
||||
entity_type = entity_type.replace(" ", "_").replace(".", "_")
|
||||
if self.write_to_file:
|
||||
logger.info(
|
||||
f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
|
||||
)
|
||||
entity_df.to_csv(
|
||||
f"{self.output_directory}/{entity_type}.csv",
|
||||
sep=self.delimiter,
|
||||
)
|
||||
self.stored_dfs[entity_type] = entity_df
|
||||
self.in_memory_dfs = {}
|
||||
return True
|
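The import script produced by `_construct_import_call` is plain pandas code; for a run that wrote, say, `Protein.csv` and `Interaction.csv` (names illustrative), its content would be roughly:

import pandas as pd

Protein = pd.read_csv('./Protein.csv', header=0, index_col=0)
Interaction = pd.read_csv('./Interaction.csv', header=0, index_col=0)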
320
biocypher/output/write/relational/_postgresql.py
Normal file
@ -0,0 +1,320 @@
|
||||
import os
|
||||
import glob
|
||||
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._batch_writer import _BatchWriter
|
||||
|
||||
|
||||
class _PostgreSQLBatchWriter(_BatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to disk using the
|
||||
format specified by PostgreSQL for the use of "COPY FROM...". Each batch
|
||||
writer instance has a fixed representation that needs to be passed
|
||||
at instantiation via the :py:attr:`schema` argument. The instance
|
||||
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
|
||||
to convert and extend the hierarchy.
|
||||
|
||||
This class inherits from the abstract class "_BatchWriter" and implements the
|
||||
PostgreSQL-specific methods:
|
||||
|
||||
- _write_node_headers
|
||||
- _write_edge_headers
|
||||
- _construct_import_call
|
||||
- _write_array_string
|
||||
"""
|
||||
|
||||
DATA_TYPE_LOOKUP = {
|
||||
"str": "VARCHAR", # VARCHAR needs limit
|
||||
"int": "INTEGER",
|
||||
"long": "BIGINT",
|
||||
"float": "NUMERIC",
|
||||
"double": "NUMERIC",
|
||||
"dbl": "NUMERIC",
|
||||
"boolean": "BOOLEAN",
|
||||
"str[]": "VARCHAR[]",
|
||||
"string[]": "VARCHAR[]",
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._copy_from_csv_commands = set()
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the psql command
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _get_data_type(self, string) -> str:
|
||||
try:
|
||||
return self.DATA_TYPE_LOOKUP[string]
|
||||
except KeyError:
|
||||
logger.info(
|
||||
f'Could not determine data type {string}. Using default "VARCHAR"'
|
||||
)
|
||||
return "VARCHAR"
|
||||
|
||||
def _write_array_string(self, string_list) -> str:
|
||||
"""
|
||||
Abstract method to write the string representation of an array into a .csv file
|
||||
as required by the postgresql COPY command, with '{','}' brackets and ',' separation.
|
||||
|
||||
Args:
|
||||
string_list (list): list of ontology strings
|
||||
|
||||
Returns:
|
||||
str: The string representation of an array for postgres COPY
|
||||
"""
|
||||
string = ",".join(string_list)
|
||||
string = f'"{{{string}}}"'
|
||||
return string
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the psql import script
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return f"{self.db_name}-import-call.sh"
|
||||
|
||||
def _adjust_pascal_to_psql(self, string):
|
||||
string = string.replace(".", "_")
|
||||
string = string.lower()
|
||||
return string
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Writes a single SQL file per graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing the CREATE TABLE statement for this type of node.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.node_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.node_property_dict.items():
|
||||
# create header CSV with ID, properties, labels
|
||||
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
parts = f"{pascal_label}-part*.csv"
|
||||
parts_paths = os.path.join(self.outdir, parts)
|
||||
parts_paths = glob.glob(parts_paths)
|
||||
parts_paths.sort()
|
||||
|
||||
# adjust label for import to psql
|
||||
pascal_label = self._adjust_pascal_to_psql(pascal_label)
|
||||
table_create_command_path = os.path.join(
|
||||
self.outdir,
|
||||
f"{pascal_label}-create_table.sql",
|
||||
)
|
||||
|
||||
# check if file already exists
|
||||
if os.path.exists(table_create_command_path):
|
||||
logger.warning(
|
||||
f"File {table_create_command_path} already exists. Overwriting.",
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
columns = ["_ID VARCHAR"]
|
||||
for col_name, col_type in props.items():
|
||||
col_type = self._get_data_type(col_type)
|
||||
col_name = self._adjust_pascal_to_psql(col_name)
|
||||
columns.append(f"{col_name} {col_type}")
|
||||
columns.append("_LABEL VARCHAR[]")
|
||||
|
||||
with open(table_create_command_path, "w", encoding="utf-8") as f:
|
||||
command = ""
|
||||
if self.wipe:
|
||||
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
|
||||
|
||||
# table creation requires comma separation
|
||||
command += (
|
||||
f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
|
||||
)
|
||||
f.write(command)
|
||||
|
||||
for parts_path in parts_paths:
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
parts_path = parts_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self._copy_from_csv_commands.add(
|
||||
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
|
||||
)
|
||||
|
||||
# add file path to import statement
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
table_create_command_path = table_create_command_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self.import_call_nodes.add(table_create_command_path)
|
||||
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Writes a single SQL file per graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing the CREATE TABLE statement for this type of edge.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
# load headers from data parse
|
||||
if not self.edge_property_dict:
|
||||
logger.error(
|
||||
"Header information not found. Was the data parsed first?",
|
||||
)
|
||||
return False
|
||||
|
||||
for label, props in self.edge_property_dict.items():
|
||||
# translate label to PascalCase
|
||||
pascal_label = self.translator.name_sentence_to_pascal(label)
|
||||
|
||||
parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
|
||||
parts_paths = glob.glob(parts_paths)
|
||||
parts_paths.sort()
|
||||
|
||||
# adjust label for import to psql
|
||||
pascal_label = self._adjust_pascal_to_psql(pascal_label)
|
||||
table_create_command_path = os.path.join(
|
||||
self.outdir,
|
||||
f"{pascal_label}-create_table.sql",
|
||||
)
|
||||
|
||||
# check if the file already exists
|
||||
if os.path.exists(table_create_command_path):
|
||||
logger.warning(
|
||||
f"File {table_create_command_path} already exists. Overwriting.",
|
||||
)
|
||||
|
||||
# concatenate key:value in props
|
||||
columns = []
|
||||
for col_name, col_type in props.items():
|
||||
col_type = self._get_data_type(col_type)
|
||||
col_name = self._adjust_pascal_to_psql(col_name)
|
||||
if col_name == "_ID":
|
||||
# should ideally never happen
|
||||
raise ValueError(
|
||||
"Column name '_ID' is reserved for internal use, "
|
||||
"denoting the relationship ID. Please choose a "
|
||||
"different name for your column."
|
||||
)
|
||||
|
||||
columns.append(f"{col_name} {col_type}")
|
||||
|
||||
# create list of lists and flatten
|
||||
# removes need for empty check of property list
|
||||
out_list = [
|
||||
"_START_ID VARCHAR",
|
||||
"_ID VARCHAR",
|
||||
*columns,
|
||||
"_END_ID VARCHAR",
|
||||
"_TYPE VARCHAR",
|
||||
]
|
||||
|
||||
with open(table_create_command_path, "w", encoding="utf-8") as f:
|
||||
command = ""
|
||||
if self.wipe:
|
||||
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
|
||||
|
||||
# table creation requires comma separation
|
||||
command += (
|
||||
f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
|
||||
)
|
||||
f.write(command)
|
||||
|
||||
for parts_path in parts_paths:
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
parts_path = parts_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self._copy_from_csv_commands.add(
|
||||
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
|
||||
)
|
||||
|
||||
# add file path to import statement
|
||||
# if import_call_file_prefix is set, replace actual path
|
||||
# with prefix
|
||||
if self.import_call_file_prefix != self.outdir:
|
||||
table_create_command_path = table_create_command_path.replace(
|
||||
self.outdir,
|
||||
self.import_call_file_prefix,
|
||||
)
|
||||
|
||||
self.import_call_edges.add(table_create_command_path)
|
||||
|
||||
return True
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for postgresql import
|
||||
"""
|
||||
import_call = ""
|
||||
|
||||
# create tables
|
||||
# At this point, csv files of nodes and edges do not require differentiation
|
||||
for import_file_path in [
|
||||
*self.import_call_nodes,
|
||||
*self.import_call_edges,
|
||||
]:
|
||||
import_call += f'echo "Setup {import_file_path}..."\n'
|
||||
if self.db_password:
|
||||
# set password variable inline
|
||||
import_call += f"PGPASSWORD={self.db_password} "
|
||||
import_call += (
|
||||
f"{self.import_call_bin_prefix}psql -f {import_file_path}"
|
||||
)
|
||||
import_call += f" --dbname {self.db_name}"
|
||||
import_call += f" --host {self.db_host}"
|
||||
import_call += f" --port {self.db_port}"
|
||||
import_call += f" --user {self.db_user}"
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
# copy data to tables
|
||||
for command in self._copy_from_csv_commands:
|
||||
table_part = command.split(" ")[3]
|
||||
import_call += f'echo "Importing {table_part}..."\n'
|
||||
if self.db_password:
|
||||
# set password variable inline
|
||||
import_call += f"PGPASSWORD={self.db_password} "
|
||||
import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
|
||||
import_call += f" --dbname {self.db_name}"
|
||||
import_call += f" --host {self.db_host}"
|
||||
import_call += f" --port {self.db_port}"
|
||||
import_call += f" --user {self.db_user}"
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
return import_call
|
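For orientation, here is a minimal standalone sketch (not part of the diff) of the two string conversions the PostgreSQL writer above relies on: array serialisation for COPY and PascalCase-to-table-name normalisation. The function names and example values are illustrative only.

def write_array_string(string_list):
    # COPY expects arrays as a quoted, comma-separated brace list: "{a,b,c}"
    return f'"{{{",".join(string_list)}}}"'

def adjust_pascal_to_psql(label):
    # PascalCase labels become lower-case, dot-free table names
    return label.replace(".", "_").lower()

print(write_array_string(["protein", "gene"]))        # "{protein,gene}"
print(adjust_pascal_to_psql("Biolink.NamedThing"))     # biolink_namedthing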
51
biocypher/output/write/relational/_sqlite.py
Normal file
@ -0,0 +1,51 @
|
||||
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
|
||||
|
||||
|
||||
class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
|
||||
"""
|
||||
Class for writing node and edge representations to a SQLite database.
|
||||
It uses the _PostgreSQLBatchWriter class under the hood, which already
|
||||
implements the logic to write the nodes/edges to a relational DBMS.
|
||||
Only the import bash script differs between PostgreSQL and SQLite
|
||||
and is therefore implemented in this class.
|
||||
|
||||
- _construct_import_call
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _construct_import_call(self) -> str:
|
||||
"""
|
||||
Function to construct the import call detailing folder and
|
||||
individual node and edge headers and data files, as well as
|
||||
delimiters and database name. Built after all data has been
|
||||
processed to ensure that nodes are called before any edges.
|
||||
|
||||
Returns:
|
||||
str: a bash command for sqlite import
|
||||
"""
|
||||
import_call = ""
|
||||
|
||||
# create tables
|
||||
# At this point, csv files of nodes and edges do not require differentiation
|
||||
for import_file_path in [
|
||||
*self.import_call_nodes,
|
||||
*self.import_call_edges,
|
||||
]:
|
||||
import_call += f'echo "Setup {import_file_path}..."\n'
|
||||
import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
for command in self._copy_from_csv_commands:
|
||||
table_name = command.split(" ")[1]
|
||||
table_part = command.split(" ")[3].replace("'", "")
|
||||
import_call += f'echo "Importing {table_part}..."\n'
|
||||
separator = self.delim
|
||||
import_part = f".import {table_part} {table_name}"
|
||||
import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
|
||||
import_call += '\necho "Done!"\n'
|
||||
import_call += "\n"
|
||||
|
||||
return import_call
|
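A small sketch of what one generated SQLite import step looks like; it mirrors the two commands emitted per table by the loop above. The database name, delimiter and file paths are assumed values for illustration, not taken from the code.

db_name = "biocypher.db"                                # assumed
delim = "\\t"                                           # two-character escape, as in the config
table_name = "patient"
part_file = "/biocypher-out/Patient-part000.csv"        # assumed path

create_step = f"sqlite3 {db_name} < /biocypher-out/patient-create_table.sql"
import_step = f"sqlite3 -separator $'{delim}' {db_name} \".import {part_file} {table_name}\""

print(create_step)
print(import_step)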
18
config/biocypher_config.yaml
Normal file
@ -0,0 +1,18 
|
||||
# add your settings here (overriding the defaults)
|
||||
|
||||
biocypher:
|
||||
dbms: neo4j
|
||||
offline: true
|
||||
#debug: true
|
||||
output_directory: /neo4j_import # comment this line out when debugging so that BioCypher creates a new folder for each run in /biocypher-out
|
||||
schema_config_path: config/automated_schema.yaml
|
||||
|
||||
head_ontology:
|
||||
url: config/head_ontology/biolink-model.owl.ttl
|
||||
root_node: entity
|
||||
|
||||
neo4j:
|
||||
delimiter: '\t'
|
||||
array_delimiter: '|'
|
||||
skip_duplicate_nodes: true
|
||||
skip_bad_relationships: true
|
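For context, this config is consumed by the import scripts later in this commit roughly as in the sketch below (it assumes the biocypher package is installed and the file exists at this path); dbms, offline mode, output_directory and schema_config_path are all read from the YAML file.

from biocypher import BioCypher

bc = BioCypher(biocypher_config_path="config/biocypher_config.yaml")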
9088
config/head_ontology/biolink-model.owl.ttl
Normal file
File diff suppressed because it is too large
283
config/manual_schema_config.yaml
Normal file
@ -0,0 +1,283 
|
||||
Title: BioCypher graph schema configuration file
|
||||
|
||||
# This configuration file establishes the hierarchy and connectivity in a newly
|
||||
# set-up BioCypher property graph database. Naming should adhere to Biolink
|
||||
# nomenclature (available at https://biolink.github.io/biolink-model/ or via
|
||||
# the python module 'biolink-model-toolkit').
|
||||
|
||||
# The BioCypher YAML file specifies only the leaves of the hierarchy tree of
|
||||
# the desired graph; the hierarchical structure of entities will be derived
|
||||
# from the Biolink model + BRO model. Thus, only the immediate constituents
|
||||
# of the graph need to be specified in the schema config.
|
||||
|
||||
|
||||
# ---
|
||||
# "Named Things"
|
||||
# ---
|
||||
# The implementation of named things is fairly straightforward, since they are
|
||||
# usually represented in node form, which is also the Biolink recommendation.
|
||||
# The same is not true for associations.
|
||||
#
|
||||
# A little more complex is the representation of aggregates of named things.
|
||||
|
||||
clinicalStatus:
|
||||
is_a: ClinicalEntity
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: clinicalStatus
|
||||
properties:
|
||||
coding_system: str
|
||||
label: str
|
||||
coding_code: str
|
||||
|
||||
Condition:
|
||||
is_a: ClinicalEntity
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: Condition
|
||||
properties:
|
||||
input_format: HL7 FHIR
|
||||
data_specification: Medical Informatics Initiative Germany Core Data Set, Basic Modules
|
||||
|
||||
diagnosis:
|
||||
is_a: ClinicalEntity
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: diagnosis
|
||||
properties:
|
||||
type.coding_code: str
|
||||
sequence: str
|
||||
label: str
|
||||
type.coding_system: str
|
||||
|
||||
DiagnosticReport:
|
||||
is_a: ClinicalEntity
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: DiagnosticReport
|
||||
properties:
|
||||
resourceType: str
|
||||
label: str
|
||||
status: str
|
||||
id: str
|
||||
|
||||
Encounter:
|
||||
is_a: ClinicalEntity
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: Encounter
|
||||
properties:
|
||||
resourceType: str
|
||||
label: str
|
||||
status: str
|
||||
id: str
|
||||
|
||||
identifier:
|
||||
is_a: Attribute
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: identifier
|
||||
properties:
|
||||
label: str
|
||||
value: str
|
||||
system: str
|
||||
|
||||
interpretation: #
|
||||
is_a: named thing
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: interpretation
|
||||
properties:
|
||||
extension.valueCoding_system: str
|
||||
extension_url: str
|
||||
extension.valueCoding_display: str
|
||||
coding_code: str
|
||||
coding_system: str
|
||||
label: str
|
||||
extension.valueCoding_code: str
|
||||
|
||||
maritalStatus:
|
||||
is_a: OrganismAttribute
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: maritalStatus
|
||||
properties:
|
||||
label: str
|
||||
coding_system: str
|
||||
coding_code: str
|
||||
|
||||
Observation:
|
||||
is_a: ClinicalEntity
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: Observation
|
||||
properties:
|
||||
resourceType: str
|
||||
label: str
|
||||
effectiveDateTime: str
|
||||
status: str
|
||||
id: str
|
||||
|
||||
Organization:
|
||||
is_a: AdministrativeEntity
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: Organization
|
||||
properties:
|
||||
label: str
|
||||
id: str
|
||||
name: str
|
||||
resourceType: str
|
||||
|
||||
Patient:
|
||||
is_a: Human
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: Patient
|
||||
properties:
|
||||
resourceType: str
|
||||
label: str
|
||||
gender: str
|
||||
id: str
|
||||
birthDate: str
|
||||
|
||||
Procedure:
|
||||
# is_a: Procedure
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: Procedure
|
||||
properties:
|
||||
label: str
|
||||
performedDateTime: str
|
||||
resourceType: str
|
||||
status: str
|
||||
id: str
|
||||
|
||||
referenceRange: #
|
||||
is_a: named thing
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: referenceRange
|
||||
properties:
|
||||
high_system: str
|
||||
high_value: str
|
||||
high_code: str
|
||||
label: str
|
||||
high_unit: str
|
||||
|
||||
search: #
|
||||
is_a: named thing
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: search
|
||||
properties:
|
||||
label: str
|
||||
mode: str
|
||||
|
||||
type:
|
||||
is_a: Attribute
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: type
|
||||
properties:
|
||||
coding_system: str
|
||||
label: str
|
||||
coding_code: str
|
||||
coding_display: str
|
||||
|
||||
verificationStatus:
|
||||
is_a: Attribute
|
||||
represented_as: node
|
||||
preferred_id: fhir_id
|
||||
label_in_input: verificationStatus
|
||||
properties:
|
||||
coding_system: str
|
||||
label: str
|
||||
coding_code: str
|
||||
coding_display: str
|
||||
|
||||
|
||||
# ---
|
||||
# Associations
|
||||
# ---
|
||||
# Associations are not supposed to be represented in node form as per the
|
||||
# specifications of Biolink. However, in an analytic context, it often makes
|
||||
# sense to represent interactions as nodes in Neo4j, because it enables, for
|
||||
# instance, the annotation of a relationship with a publication as source of
|
||||
# evidence (also known as reification in the knowledge graph world).
|
||||
|
||||
# The Biolink specifications for these types of relationships do
|
||||
# not go into depth; for example, the hierarchy for molecular interactions
|
||||
# (ie, "associations") ends at "PairwiseMolecularInteraction", there are no
|
||||
# explicit terms for protein-protein-interaction, phosphorylation, miRNA-
|
||||
# targeting, etc. Biolink proposes to use interaction identifiers from
|
||||
# ontologies, such as https://www.ebi.ac.uk/ols/ontologies/mi/.
|
||||
|
||||
# association to connect anything to an identifier node
|
||||
# if functional, includes:
|
||||
# IDENTIFIED_BY_Condition_Identifier,
|
||||
# IDENTIFIED_BY_DiagnosticReport_Identifier,
|
||||
# IDENTIFIED_BY_Encounter_Identifier,
|
||||
# IDENTIFIED_BY_Observation_Identifier,
|
||||
# IDENTIFIED_BY_Organization_Identifier
|
||||
# IDENTIFIED_BY_Patient_Identifier,
|
||||
# IDENTIFIED_BY_Procedure_Identifier
|
||||
|
||||
condition to identifier association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: IDENTIFIED_BY_Condition_Identifier
|
||||
|
||||
diagnostic report to identifier association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: IDENTIFIED_BY_DiagnosticReport_Identifier
|
||||
|
||||
observation to identifier association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: IDENTIFIED_BY_Observation_Identifier
|
||||
|
||||
observation derived from observation association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: DERIVED_FROM_Observation_Observation
|
||||
|
||||
observation has member observation association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: HAS_MEMBER_Observation_Observation
|
||||
|
||||
procedure to identifier association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: IDENTIFIED_BY_Procedure_Identifier
|
||||
|
||||
procedure to diagnostic report association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: IDENTIFIED_BY_Procedure_Identifier
|
||||
|
||||
procedure reasoned by observation association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: HAS_REASON_REFERENCE_Procedure_Observation
|
||||
|
||||
procedure performer is practitioner association:
|
||||
is_a: association
|
||||
represented_as: edge
|
||||
label_in_input: HAS_ACTOR_ProcedurePerformer_Practitioner
|
||||
|
||||
#represented_as: edge
|
||||
#label_in_input: DERIVED_FROM_Observation_Observation:
|
||||
#represented_as: edge
|
||||
#label_in_input: DERIVED_FROM_Observation_Observation
|
||||
#protein interaction:
|
||||
# is_a: Pairwise molecular interaction
|
||||
# represented_as: edge
|
||||
# label_in_input: protein_protein_interaction
|
||||
|
||||
#protein to disease association:
|
||||
# is_a: Association
|
||||
# represented_as: edge
|
||||
# label_in_input: protein_disease_association
|
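To make the link between a schema entry and the adapter output concrete: for the "Patient" class above, the node generator (see import_fhir_to_nx_diGraph.py below) yields (id, label, properties) tuples shaped roughly like this sketch. The values are invented examples, not data from any server.

def example_patient_nodes():
    # (preferred_id value, label_in_input, property dict)
    yield (
        "Patient/123",
        "Patient",
        {"label": "Patient", "gender": "female", "birthDate": "1970-01-01", "id": "123"},
    )

for node_id, label, props in example_patient_nodes():
    print(node_id, label, props)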
76
docker-compose.yml
Normal file
@ -0,0 +1,76 
|
||||
services:
|
||||
neo4j:
|
||||
image: neo4j:5.7
|
||||
environment:
|
||||
- NEO4J_AUTH=${NEO4J_AUTH:-neo4j/password}
|
||||
- NEO4J_PLUGINS=["apoc"]
|
||||
- NEO4J_server_config_strict__validation_enabled=false
|
||||
- NEO4J_apoc_export_file_enabled=true
|
||||
- NEO4J_apoc_import_file_enabled=true
|
||||
- NEO4J_apoc_import_file_use__neo4j__config=true
|
||||
- SHARED_PATH=/neo4j_import
|
||||
command: >
|
||||
bash -c '
|
||||
echo "running cmd from docker compose" &&
|
||||
#neo4j start &&
|
||||
while true; do
|
||||
if [ -f /neo4j_import/ready-to-import ]; then
|
||||
echo "Starting import process..."
|
||||
neo4j stop &&
|
||||
bash /neo4j_import/neo4j-admin-import-call.sh &&
|
||||
rm /neo4j_import/ready-to-import &&
|
||||
touch /neo4j_import/import-complete &&
|
||||
chmod 777 /neo4j_import/import-complete
|
||||
neo4j start
|
||||
echo "The container is running. STR+C will end the bash command and thus, the neo4j container"
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
'
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
ports:
|
||||
- "8080:7474"
|
||||
- "8081:7687"
|
||||
volumes:
|
||||
- neo4j_data:/neo4j_data
|
||||
- neo4j_logs:/neo4j_logs
|
||||
- neo4j_import:/neo4j_import
|
||||
- ${INPUT_DATA_PATH:-./data}:/input_data
|
||||
- ./init-scripts:/init-scripts
|
||||
- ./importData:/importData
|
||||
|
||||
|
||||
python_app:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
args:
|
||||
HTTP_PROXY: ${HTTP_PROXY}
|
||||
HTTPS_PROXY: ${HTTPS_PROXY}
|
||||
NO_PROXY: ${NO_PROXY}
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- NEO4J_URI=bolt://neo4j:7687
|
||||
- NEO4J_USER=${NEO4J_USER:-neo4j}
|
||||
- NEO4J_PASSWORD=${NEO4J_PASSWORD:-password}
|
||||
- INPUT_DATA_PATH=/input_data
|
||||
- POETRY_VIRTUALENVS_CREATE=false
|
||||
- NEO4J_dbms_directories_import=/neo4j_import
|
||||
volumes:
|
||||
- neo4j_import:/neo4j_import
|
||||
- ${INPUT_DATA_PATH:-./data}:/input_data
|
||||
- ./importData:/importData # Share the import data directory
|
||||
# depends_on:
|
||||
# neo4j:
|
||||
# condition: service_healthy
|
||||
|
||||
# Define named volumes
|
||||
volumes:
|
||||
neo4j_data:
|
||||
neo4j_logs:
|
||||
neo4j_import:
|
38
entrypoint.sh
Normal file
@ -0,0 +1,38 
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
chmod -R 777 /neo4j_import # make the dir accessible for both the python app and neo4j
|
||||
|
||||
|
||||
#echo "Waiting for Neo4j to be ready... ..."
|
||||
#python wait-for-neo4j.py
|
||||
|
||||
#if [ $? -ne 0 ]; then
|
||||
# echo "Failed to connect to Neo4j"
|
||||
# exit 1
|
||||
#fi
|
||||
|
||||
echo "Running Python data processing script..."
|
||||
poetry run python import_fhir_to_nx_diGraph.py
|
||||
|
||||
echo "Running Neo4j import..."
|
||||
# Wait a bit before attempting database operations
|
||||
sleep 5
|
||||
|
||||
while [ ! -f /neo4j_import/shell-scipt-complete ]; do
|
||||
echo "Waiting for shell-script file"
|
||||
sleep 5
|
||||
done
|
||||
|
||||
# Create a signal file that we've prepared the data
|
||||
touch /neo4j_import/ready-to-import
|
||||
chmod -R 777 /neo4j_import/ready-to-import
|
||||
|
||||
# Wait for import to complete by monitoring a completion file
|
||||
echo "Waiting for Neo4j import to complete..."
|
||||
while [ ! -f /neo4j_import/import-complete ]; do
|
||||
echo "Waiting for import-complete file"
|
||||
sleep 5
|
||||
done
|
||||
|
||||
echo "Database setup complete!"
|
66
fhirImport.py
Normal file
@ -0,0 +1,66 
|
||||
import requests
|
||||
from typing import List, Dict, Any
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
|
||||
def getBundle(url: str, search: str):
|
||||
headers = {
|
||||
'Accept': 'application/fhir+json',
|
||||
'Content-Type': 'application/fhir+json'
|
||||
}
|
||||
|
||||
# Get configuration from environment variables
|
||||
mode = os.getenv('MODE')
|
||||
fhir_server = os.getenv('FHIR_SERVER_URL')
|
||||
|
||||
if mode != 'testserver':
|
||||
username = os.getenv('FHIR_SERVER_USER')
|
||||
password = os.getenv('FHIR_SERVER_PW')
|
||||
|
||||
|
||||
if not fhir_server:
|
||||
raise ValueError("FHIR_SERVER_URL not found in environment variables")
|
||||
if mode != 'testserver' and (not username or not password):
raise ValueError("FHIR_SERVER_USER and FHIR_SERVER_PW must be set in environment variables")
|
||||
|
||||
# Setup basic authentication
|
||||
auth = HTTPBasicAuth(username, password) if mode != 'testserver' else None
|
||||
|
||||
|
||||
if url is not None:
|
||||
link = url + '?_format=json'
|
||||
else:
|
||||
link = fhir_server + search + '&_format=json'
|
||||
|
||||
#print(link)
|
||||
|
||||
if mode != 'testserver':
|
||||
response = requests.get(
|
||||
link,
|
||||
headers=headers,
|
||||
auth=auth
|
||||
)
|
||||
else:
|
||||
response = requests.get(
|
||||
link,
|
||||
headers=headers,
|
||||
)
|
||||
return response
|
||||
|
||||
def getPatientEverything(id: str):
|
||||
search = '/Patient/' + id + '/$everything?'
|
||||
return getBundle(None, search)
|
||||
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
bundles = getBundle(None, '/Patient?')
|
||||
data = bundles.json()
|
||||
|
||||
# Process the bundles
|
||||
for entry in data['entry']:
|
||||
print(f"{entry['fullUrl']}")
|
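An illustrative sketch using the helpers above: fetch one page of patients and check whether the server offers a "next" link for paging (the _count value is made up, and this assumes the environment variables are configured as in the functions above).

bundle = getBundle(None, '/Patient?_count=20').json()
ids = [entry['resource']['id'] for entry in bundle.get('entry', [])]
next_url = next((l['url'] for l in bundle.get('link', []) if l['relation'] == 'next'), None)
print(len(ids), "patient ids on this page; next page:", next_url)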
43
graphCreation/create_graph.py
Normal file
@ -0,0 +1,43 
|
||||
import json
|
||||
import networkx as nx
|
||||
|
||||
def add_nodes_from_dict(graph, parent_node, current_dict):
|
||||
for key, value in current_dict.items():
|
||||
if isinstance(value, dict):
|
||||
# Create a new node for the nested dictionary
|
||||
new_node = f"{parent_node}.{key}"
|
||||
graph.add_node(new_node, label=key)
|
||||
# Add an edge from the parent node to the new node
|
||||
graph.add_edge(parent_node, new_node, edge_type=key)
|
||||
# Recurse into the nested dictionary
|
||||
add_nodes_from_dict(graph, new_node, value)
|
||||
elif isinstance(value, list):
|
||||
|
||||
# if list doesn't contain any nested dictionaries, make it a value in the node
|
||||
if not any(isinstance(item, dict) for item in value):
|
||||
graph.nodes[parent_node][key] = value
|
||||
|
||||
else:
|
||||
|
||||
# Process each dictionary in the list
|
||||
for index, item in enumerate(value):
|
||||
if isinstance(item, dict):
|
||||
if len(value)>1:
|
||||
item_node = f"{parent_node}.{key}[{index}]"
|
||||
else:
|
||||
item_node = f"{parent_node}.{key}"
|
||||
graph.add_node(item_node, label=key)
|
||||
graph.add_edge(parent_node, item_node, edge_type=key)
|
||||
add_nodes_from_dict(graph, item_node, item)
|
||||
|
||||
else:
|
||||
|
||||
# For non-dict and non-list values, add them as attributes to the parent node
|
||||
graph.nodes[parent_node][key] = value
|
||||
|
||||
def add_json_to_networkx(json_data, bundle_name, graph):
|
||||
if not isinstance(graph, nx.DiGraph):
|
||||
raise ValueError("The provided graph must be a networkx.DiGraph")
|
||||
root_node = bundle_name+'_bundle'
|
||||
graph.add_node(root_node, label='root')
|
||||
add_nodes_from_dict(graph, root_node, json_data)
|
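A tiny usage sketch for the helper above: feed it a hand-written FHIR-like dict and inspect the resulting DiGraph. The bundle content is made up.

import networkx as nx
from graphCreation import create_graph

bundle = {"entry": [{"resource": {"resourceType": "Patient", "id": "p1"}}]}
g = nx.DiGraph()
create_graph.add_json_to_networkx(bundle, "demo", g)

# expect a 'demo_bundle' root, an 'entry' node and a 'resource' node carrying the Patient attributes
print(list(g.nodes(data=True)))
print(list(g.edges(data=True)))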
40
graphCreation/node_typing.py
Normal file
@ -0,0 +1,40 
|
||||
import networkx as nx
|
||||
|
||||
class Resource:
|
||||
def __init__(self, resource_type):
|
||||
self.resource_type = resource_type
|
||||
|
||||
def create_resource_class(resource_type):
|
||||
return type(resource_type, (Resource,), {})
|
||||
|
||||
def set_resource_type(graph):
|
||||
for node, data in graph.nodes(data=True):
|
||||
print(node, data)
|
||||
|
||||
print("-----------------------------")
|
||||
|
||||
nodes_to_replace = []
|
||||
for node, data in graph.nodes(data=True):
|
||||
print(isinstance(node, Resource), node, type(node))
|
||||
if isinstance(node, Resource):
|
||||
print("Found a resource: ", node)
|
||||
resource_type = node.resource_type
|
||||
if resource_type:
|
||||
# Dynamically create a new class based on the resource_type
|
||||
NewResourceClass = create_resource_class(resource_type)
|
||||
new_node = NewResourceClass(resource_type)
|
||||
nodes_to_replace.append((node, new_node, data))
|
||||
else:
|
||||
print(f"Warning: Node {node} is a resource but has no resource_type")
|
||||
|
||||
# Replace old nodes with new ones
|
||||
for old_node, new_node, data in nodes_to_replace:
|
||||
graph.add_node(new_node, **data)
|
||||
for pred in graph.predecessors(old_node):
|
||||
graph.add_edge(pred, new_node)
|
||||
for succ in graph.successors(old_node):
|
||||
graph.add_edge(new_node, succ)
|
||||
graph.remove_node(old_node)
|
||||
"""
|
||||
for node, data in graph.nodes(data=True):
|
||||
print(node, data) """
|
102
graphCreation/process_references.py
Normal file
@ -0,0 +1,102 
|
||||
import networkx as nx
|
||||
|
||||
def parse_synthea_reference(ref):
|
||||
if not ref.startswith('#'):
|
||||
#print("reference: ", ref)
|
||||
if '?' in ref and '|' in ref:
|
||||
parsed_ref = ref.split('|')[1]
|
||||
# elif '/' in ref:
|
||||
# parsed_ref = ref.split('/')[1]
|
||||
else:
|
||||
parsed_ref = ref.split(':')[2]
|
||||
else:
|
||||
parsed_ref = 'mock'
|
||||
return(parsed_ref)
|
||||
|
||||
def process_references(graph):
|
||||
|
||||
isSynthea = False
|
||||
|
||||
nodes_with_reference = [[n, attr['reference']] for n, attr in graph.nodes(data=True) if 'reference' in attr]
|
||||
|
||||
directly_referenced_nodes = []
|
||||
|
||||
indirectly_referenced_nodes = []
|
||||
|
||||
dummy_references = []
|
||||
|
||||
if isSynthea:
|
||||
|
||||
nodes_with_mock_reference = []
|
||||
|
||||
for i in range(len(nodes_with_reference)):
|
||||
reference = nodes_with_reference[i][1]
|
||||
parsed_reference = parse_synthea_reference(reference)
|
||||
|
||||
if parsed_reference != 'mock':
|
||||
nodes_with_reference[i].append(parsed_reference)
|
||||
else:
|
||||
nodes_with_mock_reference.append(i)
|
||||
|
||||
for i in sorted(nodes_with_mock_reference, reverse=True):
|
||||
del nodes_with_reference[i]
|
||||
|
||||
id_to_node = {data["id"]: node for node, data in graph.nodes(data=True) if "id" in data}
|
||||
id_to_identifier_node = {data["value"]: node for node, data in graph.nodes(data=True) if ("value" in data and data['label'] == 'identifier')}
|
||||
|
||||
for i in nodes_with_reference:
|
||||
ref_id=i[2]
|
||||
if ref_id in id_to_node.keys():
|
||||
directly_referenced_nodes.append([i[0], id_to_node[ref_id]])
|
||||
elif ref_id in id_to_identifier_node.keys():
|
||||
indirectly_referenced_nodes.append([i[0], id_to_identifier_node[ref_id]])
|
||||
#else:
|
||||
# print("KEY ERROR: Key neither in to_node nor in to_identifier_node", i)
|
||||
|
||||
for i in indirectly_referenced_nodes:
|
||||
node_from=list(graph.predecessors(i[0]))[0]
|
||||
node_to=list(graph.predecessors(i[1]))[0]
|
||||
ref_type=graph.nodes[i[0]]['label']
|
||||
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
|
||||
|
||||
else:
|
||||
|
||||
#for node, data in graph.nodes(data=True):
|
||||
# if "id" in data:
|
||||
# if not "resourceType" in data:
|
||||
# print("FAILS AT: ", data, node)
|
||||
id_to_node = {data["resourceType"]+'/'+data["id"]: node for node, data in graph.nodes(data=True) if ("id" in data and "resourceType" in data)}
|
||||
|
||||
for i in nodes_with_reference:
|
||||
|
||||
ref_id=i[1]
|
||||
if ref_id in id_to_node.keys():
|
||||
directly_referenced_nodes.append([i[0], id_to_node[ref_id]])
|
||||
else:
|
||||
dummy_references.append([i[0], ref_id])
|
||||
|
||||
for i in directly_referenced_nodes:
|
||||
node_from=list(graph.predecessors(i[0]))[0]
|
||||
node_to=i[1]
|
||||
ref_type=graph.nodes[i[0]]['label']
|
||||
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
|
||||
|
||||
for i in dummy_references:
|
||||
#print(i)
|
||||
node_to='dummy_' + i[1]
|
||||
graph.add_node(node_to, label='dummy', unique_id=i[1])
|
||||
node_from=list(graph.predecessors(i[0]))[0]
|
||||
ref_type=graph.nodes[i[0]]['label']
|
||||
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
|
||||
|
||||
#graph.remove_nodes_from([i[0] for i in nodes_with_reference])
|
||||
|
||||
graph.remove_nodes_from([i[0] for i in directly_referenced_nodes])
|
||||
graph.remove_nodes_from([i[0] for i in indirectly_referenced_nodes])
|
||||
graph.remove_nodes_from([i[0] for i in dummy_references])
|
||||
|
||||
nodes_to_remove = [n for n, attr in graph.nodes(data=True) if attr.get('label') in ['root', 'entry', 'request']]
|
||||
|
||||
graph.remove_nodes_from(nodes_to_remove)
|
||||
|
||||
#graph.remove_nodes_from(list(nx.isolates(graph)))
|
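A minimal, hand-constructed example of what process_references does in the non-Synthea branch: the intermediate 'subject' reference node is collapsed into a direct edge between the two resource nodes. The node names and attributes are invented for illustration.

import networkx as nx
from graphCreation.process_references import process_references

g = nx.DiGraph()
g.add_node("obs", label="resource", resourceType="Observation", id="o1")
g.add_node("obs.subject", label="subject", reference="Patient/p1")
g.add_node("pat", label="resource", resourceType="Patient", id="p1")
g.add_edge("obs", "obs.subject", edge_type="subject")

process_references(g)
# expect: ('obs', 'pat', {'edge_type': 'reference', 'reference_type': 'subject'})
print(list(g.edges(data=True)))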
107
graphCreation/property_convolution.py
Normal file
@ -0,0 +1,107 
|
||||
import networkx as nx
|
||||
|
||||
def find_paths(graph, start_node):
|
||||
def is_leaf(node):
|
||||
#Checks if a node is a leaf (no outgoing edges)
|
||||
return graph.out_degree(node) == 0
|
||||
|
||||
def custom_dfs(path, reference_count):
|
||||
#Performs a DFS to find paths for both patterns
|
||||
current_node = path[-1]
|
||||
|
||||
'''if the current node is labeled 'resource', the path length is greater than 3,
|
||||
and we have exactly one 'reference' edge in the path'''
|
||||
if len(path) > 3 and graph.nodes[current_node].get('label') == 'resource' and reference_count == 1:
|
||||
# add path to the list of property paths containing a reference
|
||||
reference_paths.append(list(path))
|
||||
|
||||
'''if the current node is a leaf node (no outgoing edges),
|
||||
the path length is greater than 2, and we have no references in the path'''
|
||||
if len(path) > 2 and is_leaf(current_node) and reference_count == 0:
|
||||
'''add path to the dictionary of property paths ending in leaves,
|
||||
by the corresponding property key'''
|
||||
leaf_paths.setdefault(path[1].split('.')[-1], []).extend(list(path))
|
||||
|
||||
# check neighbors
|
||||
for neighbor in graph.successors(current_node):
|
||||
edge_type = graph.edges[current_node, neighbor].get('edge_type', None)
|
||||
new_reference_count = reference_count + (1 if edge_type == 'reference' else 0)
|
||||
|
||||
# continue the search only if we have at most one 'reference' edge so far
|
||||
if new_reference_count <= 1:
|
||||
custom_dfs(path + [neighbor], new_reference_count)
|
||||
|
||||
reference_paths = []
|
||||
leaf_paths = {}
|
||||
|
||||
custom_dfs([start_node], 0)
|
||||
|
||||
return reference_paths, leaf_paths
|
||||
|
||||
def property_convolution(graph):
|
||||
|
||||
# Find all nodes with label 'resource'
|
||||
resource_nodes = [n for n, attr in graph.nodes(data=True) if attr.get('label') == 'resource']
|
||||
|
||||
#print("Got all nodes with label 'resource'", flush=True)
|
||||
|
||||
'''collect all paths starting with a resource node, that contain one reference edge,
|
||||
end with a resource node and are >3 nodes long'''
|
||||
'''collect all paths starting with a resource node, that do not contain reference edges,
|
||||
end with a leaf node and are >2 nodes long'''
|
||||
|
||||
property_paths_with_reference = []
|
||||
property_paths_with_leaves = {}
|
||||
|
||||
for resource_node in resource_nodes:
|
||||
temp_ref_paths, temp_leaf_paths = find_paths(graph, resource_node)
|
||||
# add paths to the list of property paths containing a reference, for all nodes
|
||||
property_paths_with_reference.extend(temp_ref_paths)
|
||||
# add paths to the dictionary of property paths ending in leaves, by the corresponding resource key
|
||||
property_paths_with_leaves[resource_node] = temp_leaf_paths
|
||||
|
||||
# print("Collected all paths", flush=True)
|
||||
|
||||
# transfer reference edge to first property node for all reference paths
|
||||
for i in property_paths_with_reference:
|
||||
ref_edge_data = graph.get_edge_data(i[-2], i[-1])
|
||||
ref_type = ref_edge_data.get('reference_type')
|
||||
graph.remove_edge(i[-2], i[-1])
|
||||
graph.add_edge(i[1], i[-1], edge_type='reference', reference_type=ref_type)
|
||||
|
||||
'''after transference, add the modified reference path (that now ends in a leaf)
|
||||
to the dictionary of leaf paths, by corresponding resource and property keys'''
|
||||
property_paths_with_leaves[i[0]].setdefault(i[1].split('.')[-1], []).extend(i[:-1])
|
||||
|
||||
#print("Transfered all references edges", flush=True)
|
||||
|
||||
'''create a list of collections of property paths ending in leaves,
|
||||
removing duplicate nodes from each path collection'''
|
||||
list_property_paths_with_leaves = [list(dict.fromkeys(i)) for j in property_paths_with_leaves.values() for i in j.values()]
|
||||
|
||||
nodes_to_remove=[]
|
||||
|
||||
for i in list_property_paths_with_leaves:
|
||||
for j in range(len(i)-1, 1, -1):
|
||||
|
||||
source_attributes = graph.nodes[i[j]]
|
||||
|
||||
marker='|'.join(i[j].split('resource.')[1].split('.')[1:])
|
||||
|
||||
# transfer attributes to first property node
|
||||
for attr, value in source_attributes.items():
|
||||
if attr != 'label':
|
||||
graph.nodes[i[1]][marker+'_'+attr] = value
|
||||
|
||||
nodes_to_remove.append(i[j])
|
||||
|
||||
#print("Transferred attributes for all paths", flush=True)
|
||||
|
||||
graph.remove_nodes_from(nodes_to_remove)
|
||||
|
||||
for i in resource_nodes:
|
||||
unique_resource_id = graph.nodes[i]['resourceType']+'/'+graph.nodes[i]['id']
|
||||
graph.nodes[i]['unique_id'] = unique_resource_id
|
||||
for j in graph.successors(i):
|
||||
if graph[i][j].get('edge_type') != 'reference':
|
||||
graph.nodes[j]['unique_id'] = unique_resource_id+'/'+j.split('.')[-1]
|
276
import_fhir_to_nx_diGraph.py
Normal file
@ -0,0 +1,276 
|
||||
from biocypher import BioCypher
|
||||
import networkx as nx
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import uuid
|
||||
import gc
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
from graphCreation import create_graph
|
||||
|
||||
from graphCreation.process_references import process_references
|
||||
|
||||
from graphCreation.property_convolution import property_convolution
|
||||
|
||||
from schema_config_generation import write_automated_schema
|
||||
|
||||
from fhirImport import getPatientEverything, getBundle
|
||||
|
||||
|
||||
|
||||
def load_multiple_fhir_patients(n):
|
||||
#graph = nx.DiGraph()
|
||||
init = True
|
||||
ids = []
|
||||
#get n ids
|
||||
nextIds = True
|
||||
while len(ids) < n and nextIds:
|
||||
if init:
|
||||
complex = os.getenv('COMPLEX_PATIENTS')
|
||||
if complex and complex.upper() != 'TRUE':
|
||||
bundle = getBundle(None, '/Patient?_count=' + str(n))
|
||||
else:
|
||||
bundle = getBundle(None, '/Patient?_has:Observation:subject:status=final&_count=' + str(n))
|
||||
else:
|
||||
bundle = getBundle(None, nextLink)
|
||||
|
||||
if not 'entry' in bundle.json():
|
||||
print("ERROR -- No data found in the fhir bundle. Check the request and if the server is up and responding")
|
||||
sys.exit(1)
|
||||
|
||||
for entry in bundle.json()['entry']:
|
||||
ids.append(entry['resource']['id'])
|
||||
|
||||
nextIds = False
|
||||
for l in bundle.json()['link']:
|
||||
if l['relation'] == "next":
|
||||
nextLink = l['url']
|
||||
nextIds = True
|
||||
|
||||
if len(ids) < n:
|
||||
n = len(ids)
|
||||
|
||||
batchSize = int(os.getenv('BATCH_SIZE'))
|
||||
|
||||
c = 0
|
||||
|
||||
print(len(ids))
|
||||
|
||||
#get bundle for each ID
|
||||
for id in ids:
|
||||
|
||||
c += 1
|
||||
|
||||
bundle = getPatientEverything(id).json()
|
||||
|
||||
bundle = replace_single_quotes(bundle) ### maybe not needed for German data
|
||||
if init:
|
||||
graph = nx.DiGraph()
|
||||
init = False
|
||||
|
||||
create_graph.add_json_to_networkx(bundle, id + '_bundle', graph)
|
||||
|
||||
if c % 50 == 0:
|
||||
print("---------- ", c, " patients loaded ----------", flush=True)
|
||||
|
||||
if c % batchSize == 0 or c == n:
|
||||
print(c, " patients imported, reducing graph", flush = True)
|
||||
process_references(graph)
|
||||
property_convolution(graph)
|
||||
|
||||
lastChunk = False
|
||||
if n == c:
|
||||
lastChunk = True
|
||||
runBioCypher(graph, lastChunk)
|
||||
init = True
|
||||
print(graph)
|
||||
del graph
|
||||
gc.collect()
|
||||
|
||||
def replace_single_quotes(obj):
|
||||
if isinstance(obj, str): # If it's a string, replace single quotes
|
||||
return obj.replace("'", "''")
|
||||
elif isinstance(obj, dict): # If it's a dictionary, process each key-value pair
|
||||
return {key: replace_single_quotes(value) for key, value in obj.items()}
|
||||
elif isinstance(obj, list): # If it's a list, process each item
|
||||
return [replace_single_quotes(item) for item in obj]
|
||||
else:
|
||||
return obj # Leave other data types unchanged
|
||||
|
||||
def main():
|
||||
## create networkX and run improvement scripts
|
||||
print("Creating the graph...", flush=True)
|
||||
|
||||
nPatients = int(os.getenv('NUMBER_OF_PATIENTS'))
|
||||
load_multiple_fhir_patients(nPatients)
|
||||
|
||||
|
||||
def runBioCypher(nxGraph, final):
|
||||
|
||||
#get lists of node and edge types
|
||||
print("Generate auto schema...", flush=True)
|
||||
write_automated_schema(nxGraph, 'config/automated_schema.yaml', 'config/manual_schema_config.yaml')
|
||||
|
||||
|
||||
# create Biocypher driver
|
||||
bc = BioCypher(
|
||||
biocypher_config_path="config/biocypher_config.yaml",
|
||||
)
|
||||
|
||||
#bc.show_ontology_structure() #very extensive
|
||||
|
||||
#BioCypher preparation
|
||||
def node_generator():
|
||||
for node in nxGraph.nodes():
|
||||
|
||||
label = nxGraph.nodes[node].get('label')
|
||||
|
||||
if label == "resource":
|
||||
label = nxGraph.nodes[node].get('resourceType')
|
||||
nxGraph.nodes[node]['label'] = label.capitalize()
|
||||
|
||||
label = label.capitalize()
|
||||
|
||||
unq_id = nxGraph.nodes[node].get('unique_id', False)
|
||||
|
||||
if(nxGraph.nodes[node].get('label') in ['search', 'meta', 'link']):
|
||||
#print("skipped a node: ", nxGraph.nodes[node].get('label'))
|
||||
continue
|
||||
|
||||
label = nxGraph.nodes[node].get('label')
|
||||
if(label == 'dummy'):
|
||||
#print("SKIPPED dummy node: ", unq_id)
|
||||
continue
|
||||
|
||||
yield(
|
||||
nxGraph.nodes[node].get('unique_id', node), #remark: this returns the node id if this attribute exists. otherwise it returns node which equals the identifier that is used by nx
|
||||
label,
|
||||
nxGraph.nodes[node] # get properties
|
||||
)
|
||||
|
||||
def edge_generator():
|
||||
for edge in nxGraph.edges(data = True):
|
||||
source, target, attributes = edge
|
||||
|
||||
sLabel = nxGraph.nodes[source].get('label')
|
||||
if sLabel == 'resource':
|
||||
sLabel = nxGraph.nodes[source].get('resourceType')
|
||||
tLabel = nxGraph.nodes[target].get('label')
|
||||
if tLabel == 'resource':
|
||||
tLabel = nxGraph.nodes[target].get('resourceType')
|
||||
label = sLabel.capitalize() + '_to_' + tLabel
|
||||
|
||||
yield(
|
||||
attributes.get('id', str(uuid.uuid4())), # Edge ID (use the edge's own id if present, otherwise generate a random UUID)
|
||||
nxGraph.nodes[source].get('unique_id', source),
|
||||
nxGraph.nodes[target].get('unique_id', target),
|
||||
label,
|
||||
attributes # All edge attributes
|
||||
)
|
||||
|
||||
#import nodes
|
||||
bc.write_nodes(node_generator())
|
||||
bc.write_edges(edge_generator())
|
||||
|
||||
#write the import script -- we are creating our own script since BC would only consider the last batch as an input
|
||||
if final:
|
||||
print("CREATING THE SCRIPT")
|
||||
generate_neo4j_import_script()
|
||||
with open('/neo4j_import/shell-scipt-complete', 'w') as f:
|
||||
f.write('Import completed successfully')
|
||||
|
||||
print("FHIR import completed successfully")
|
||||
|
||||
|
||||
def generate_neo4j_import_script(directory_path="/neo4j_import/", output_file="neo4j-admin-import-call.sh"):
|
||||
"""
|
||||
Reads files in a directory and generates a Neo4j import shell script.
|
||||
|
||||
Args:
|
||||
directory_path (str): Path to the directory containing CSV files
|
||||
output_file (str): Name of the output shell script file
|
||||
|
||||
Returns:
|
||||
str: Path to the generated shell script
|
||||
"""
|
||||
# Get all files in the directory
|
||||
all_files = os.listdir(directory_path)
|
||||
|
||||
# Dictionary to store entity types (nodes and relationships)
|
||||
entity_types = {}
|
||||
|
||||
# Find all header files and use them to identify entity types
|
||||
for filename in all_files:
|
||||
if '-header.csv' in filename:
|
||||
entity_name = filename.split('-header.csv')[0]
|
||||
|
||||
# Check if it's a relationship (contains "To" and "Association")
|
||||
is_relationship = "To" in entity_name and "Association" in entity_name
|
||||
|
||||
# Store in entity_types dictionary
|
||||
if is_relationship:
|
||||
entity_type = "relationships"
|
||||
else:
|
||||
entity_type = "nodes"
|
||||
|
||||
# Initialize the entity if not already present
|
||||
if entity_name not in entity_types:
|
||||
entity_types[entity_name] = {
|
||||
"type": entity_type,
|
||||
"header": f"/neo4j_import/{filename}",
|
||||
"has_parts": False
|
||||
}
|
||||
|
||||
# Check for part files for each entity
|
||||
for entity_name in entity_types:
|
||||
# Create pattern to match part files for this entity
|
||||
part_pattern = f"{entity_name}-part"
|
||||
|
||||
# Check if any file matches the pattern
|
||||
for filename in all_files:
|
||||
if part_pattern in filename:
|
||||
entity_types[entity_name]["has_parts"] = True
|
||||
break
|
||||
|
||||
# Generate the import commands
|
||||
nodes_command = ""
|
||||
relationships_command = ""
|
||||
|
||||
for entity_name, info in entity_types.items():
|
||||
if info["has_parts"]:
|
||||
# Create the command string with wildcard for part files
|
||||
command = f" --{info['type']}=\"{info['header']},/neo4j_import/{entity_name}-part.*\""
|
||||
|
||||
# Add to appropriate command string
|
||||
if info['type'] == "nodes":
|
||||
nodes_command += command
|
||||
else: # relationships
|
||||
relationships_command += command
|
||||
|
||||
# Create the shell script content
|
||||
script_content = """#!/bin/bash
|
||||
version=$(bin/neo4j-admin --version | cut -d '.' -f 1)
|
||||
if [[ $version -ge 5 ]]; then
|
||||
\tbin/neo4j-admin database import full neo4j --delimiter="\\t" --array-delimiter="|" --quote="'" --overwrite-destination=true --skip-bad-relationships=true --skip-duplicate-nodes=true{nodes}{relationships}
|
||||
else
|
||||
\tbin/neo4j-admin import --database=neo4j --delimiter="\\t" --array-delimiter="|" --quote="'" --force=true --skip-bad-relationships=true --skip-duplicate-nodes=true{nodes}{relationships}
|
||||
fi
|
||||
""".format(nodes=nodes_command, relationships=relationships_command)
|
||||
|
||||
# Write the script to file
|
||||
script_path = os.path.join(directory_path, output_file)
|
||||
with open(script_path, 'w') as f:
|
||||
f.write(script_content)
|
||||
|
||||
# Make the script executable
|
||||
os.chmod(script_path, 0o755)
|
||||
|
||||
print("Shell import script created", flush=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
199
import_nx_diGraph.py
Normal file
@ -0,0 +1,199 
|
||||
from biocypher import BioCypher
|
||||
import networkx as nx
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
#from networkx_based import create_graph
|
||||
from graphCreation import create_graph
|
||||
#from networkx_based.process_references import process_references
|
||||
from graphCreation.process_references import process_references
|
||||
#from networkx_based.property_convolution import property_convolution
|
||||
from graphCreation.property_convolution import property_convolution
|
||||
|
||||
from schema_config_generation import write_automated_schema
|
||||
#from networkx_based.node_typing import set_ressource_type
|
||||
from graphCreation.node_typing import set_resource_type
|
||||
|
||||
|
||||
|
||||
def load_multiple_fhir_bundles(directory_path):
|
||||
graph = nx.DiGraph()
|
||||
init = True
|
||||
#limit = 2
|
||||
# Iterate over all files in the directory
|
||||
for filename in os.listdir(directory_path):
|
||||
if filename.endswith('.json'): # Assuming FHIR bundles are in JSON format
|
||||
file_path = os.path.join(directory_path, filename)
|
||||
with open(file_path, 'r') as f:
|
||||
bundle_json = json.load(f)
|
||||
|
||||
#fix all strings to enable ' in neo4j
|
||||
fixedQuotes = replace_single_quotes(bundle_json)
|
||||
if init:
|
||||
#print(bundle_json, filename, graph)
|
||||
create_graph.json_to_networkx(fixedQuotes, filename, graph)
|
||||
init = False
|
||||
else:
|
||||
create_graph.add_json_to_networkx(fixedQuotes, filename, graph)
|
||||
print("Imported: ", filename)
|
||||
|
||||
#if limit == 0:
|
||||
# return graph
|
||||
#limit = limit - 1
|
||||
|
||||
|
||||
return graph
|
||||
|
||||
def replace_single_quotes(obj):
|
||||
if isinstance(obj, str): # If it's a string, replace single quotes
|
||||
return obj.replace("'", "''")
|
||||
elif isinstance(obj, dict): # If it's a dictionary, process each key-value pair
|
||||
return {key: replace_single_quotes(value) for key, value in obj.items()}
|
||||
elif isinstance(obj, list): # If it's a list, process each item
|
||||
return [replace_single_quotes(item) for item in obj]
|
||||
else:
|
||||
return obj # Leave other data types unchanged
|
||||
|
||||
def main():
|
||||
#get a list of nodes that should be imported
|
||||
## create networkX and run improvement scripts
|
||||
print("Creating the graph...", flush=True)
|
||||
nxGraph = load_multiple_fhir_bundles('./testData/') # 'mockData' for unit test data, 'testData' for Synthea files
|
||||
print(nxGraph)
|
||||
|
||||
print("Reducing references...", flush=True)
|
||||
process_references(nxGraph)
|
||||
print(nxGraph)
|
||||
|
||||
print("Convolute references...", flush=True)
|
||||
property_convolution(nxGraph)
|
||||
print(nxGraph)
|
||||
|
||||
|
||||
|
||||
#Set types of all resource nodes to resource_type
|
||||
#set_resource_type(nxGraph)
|
||||
|
||||
#get lists of node and edge types
|
||||
""" all_nLabels = set()
|
||||
all_eLabels = set()
|
||||
|
||||
for node, attrs in nxGraph.nodes(data=True):
|
||||
for attr_name, attr_value in attrs.items():
|
||||
if attr_name == "label":
|
||||
all_nLabels.add(attr_value)
|
||||
|
||||
for nt in all_nLabels:
|
||||
print(nt)
|
||||
|
||||
print("-" * 50)
|
||||
|
||||
for u, v, attrs in nxGraph.edges(data=True):
|
||||
u_label = nxGraph.nodes[u]['label']
|
||||
if u_label == "resource":
|
||||
u_label = nxGraph.nodes[u]['resourceType']
|
||||
v_label = nxGraph.nodes[v]['label']
|
||||
if v_label == "resource":
|
||||
v_label = nxGraph.nodes[v]['resourceType']
|
||||
all_eLabels.add(u_label + " to " + v_label)
|
||||
|
||||
for et in all_eLabels:
|
||||
print(et)
|
||||
|
||||
print("-" * 50)
|
||||
|
||||
print("...end")
|
||||
return """
|
||||
|
||||
print("Generate auto schema...")
|
||||
write_automated_schema(nxGraph, 'config/automated_schema.yaml')
|
||||
|
||||
|
||||
# create Biocypher driver
|
||||
bc = BioCypher(
|
||||
biocypher_config_path="config/biocypher_config.yaml",
|
||||
#schema_config_path="/config/manual_schema_config.yaml"
|
||||
)
|
||||
|
||||
bc.show_ontology_structure()
|
||||
|
||||
#BioCypher preparation
|
||||
## node generator: extract id, label and property dictionary
|
||||
def node_generator():
|
||||
for node in nxGraph.nodes():
|
||||
|
||||
""" #single qoutes break neo4j import, e.g. 'CHILDREN'S Hospital'
|
||||
checkDisplay = nxGraph.nodes[node].get('display')
|
||||
if checkDisplay:
|
||||
checkDisplay = checkDisplay.replace("'", "''")
|
||||
nxGraph.nodes[node]['display'] = checkDisplay
|
||||
#print("------->", nxGraph.nodes[node].get('display'))
|
||||
|
||||
checkName = nxGraph.nodes[node].get('name')
|
||||
if checkName:
|
||||
checkName = checkName.replace("'", "''")
|
||||
nxGraph.nodes[node]['name'] = checkName
|
||||
#print("------->", nxGraph.nodes[node].get('name')) """
|
||||
|
||||
label = nxGraph.nodes[node].get('label')
|
||||
|
||||
if label == "resource":
|
||||
label = nxGraph.nodes[node].get('resourceType')
|
||||
'''
|
||||
elif label == 'identifier':
|
||||
label = nxGraph.nodes[node].get('system')
|
||||
print('/' in label)
|
||||
if '/' in label:
|
||||
lastSlash = label.rfind('/') + 1
|
||||
label = label[lastSlash:] + '-ID'
|
||||
elif label == 'telecom':
|
||||
label = nxGraph.nodes[node].get('system')
|
||||
print('/' in label)
|
||||
if '/' in label:
|
||||
lastSlash = label.rfind('/') + 1
|
||||
label = 'telecom-' + label[lastSlash:]
|
||||
elif label == 'address':
|
||||
extension = nxGraph.nodes[node].get('extension_url')
|
||||
print("EX!: ", extension)
|
||||
if extension:
|
||||
lastSlash = extension.rfind('/') + 1
|
||||
label = label + '-' + extension[lastSlash:]
|
||||
'''
|
||||
|
||||
yield(
|
||||
nxGraph.nodes[node].get('id', node), #remark: this returns the node id if this attribute exists. otherwise it returns node which equals the identifier that is used by nx
|
||||
label,
|
||||
nxGraph.nodes[node] # get properties
|
||||
)
|
||||
|
||||
def edge_generator():
|
||||
for edge in nxGraph.edges(data = True):
|
||||
source, target, attributes = edge
|
||||
|
||||
sLabel = nxGraph.nodes[source].get('label')
|
||||
if sLabel == 'resource':
|
||||
sLabel = nxGraph.nodes[source].get('resourceType')
|
||||
tLabel = nxGraph.nodes[target].get('label')
|
||||
if tLabel == 'resource':
|
||||
tLabel = nxGraph.nodes[target].get('resourceType')
|
||||
label = sLabel + '_to_' + tLabel
|
||||
|
||||
yield(
|
||||
attributes.get('id', str(uuid.uuid4())), # Edge ID (use the edge's own id if present, otherwise generate a random UUID)
|
||||
nxGraph.nodes[source].get('id', source),
|
||||
nxGraph.nodes[target].get('id', target),
|
||||
label,
|
||||
attributes # All edge attributes
|
||||
)
|
||||
|
||||
#import nodes
|
||||
bc.write_nodes(node_generator())
|
||||
bc.write_edges(edge_generator())
|
||||
|
||||
#write the import script
|
||||
bc.write_import_call()
|
||||
|
||||
if __name__ == "__main__":
|
||||
#print("Called import script. Should run its main function now...")
|
||||
main()
|
||||
|
8
init-scripts/setup.cypher
Normal file
@ -0,0 +1,8 
|
||||
// Example initialization script - modify according to your schema
|
||||
CREATE CONSTRAINT IF NOT EXISTS FOR (n:YourLabel) REQUIRE n.id IS UNIQUE;
|
||||
CREATE INDEX IF NOT EXISTS FOR (n:YourLabel) ON (n.someProperty);
|
||||
|
||||
// Add any other initialization queries here
|
||||
// For example:
|
||||
// CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE n.email IS UNIQUE;
|
||||
// CREATE INDEX IF NOT EXISTS FOR (n:Product) ON (n.sku);
|
210
mockData/short_Johnny_Schoen.json
Normal file
@ -0,0 +1,210 
|
||||
{
|
||||
"resourceType": "Bundle",
|
||||
"type": "transaction",
|
||||
"entry": [ {
|
||||
"fullUrl": "urn:uuid:a7a285c0-4714-dd3c-4837-8719c9b67873",
|
||||
"resource": {
|
||||
"resourceType": "Patient",
|
||||
"id": "a7a285c0-4714-dd3c-4837-8719c9b67873",
|
||||
"meta": {
|
||||
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient" ]
|
||||
},
|
||||
"text": {
|
||||
"status": "generated",
|
||||
"div": "<div xmlns=\"http://www.w3.org/1999/xhtml\">Generated by <a href=\"https://github.com/synthetichealth/synthea\">Synthea</a>.Version identifier: 3c23908\n . Person seed: -5557164924473669144 Population seed: 1693908535569</div>"
|
||||
},
|
||||
"extension": [ {
|
||||
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race",
|
||||
"extension": [ {
|
||||
"url": "ombCategory",
|
||||
"valueCoding": {
|
||||
"system": "urn:oid:2.16.840.1.113883.6.238",
|
||||
"code": "2106-3",
|
||||
"display": "White"
|
||||
}
|
||||
}, {
|
||||
"url": "text",
|
||||
"valueString": "White"
|
||||
} ]
|
||||
}, {
|
||||
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity",
|
||||
"extension": [ {
|
||||
"url": "ombCategory",
|
||||
"valueCoding": {
|
||||
"system": "urn:oid:2.16.840.1.113883.6.238",
|
||||
"code": "2186-5",
|
||||
"display": "Not Hispanic or Latino"
|
||||
}
|
||||
}, {
|
||||
"url": "text",
|
||||
"valueString": "Not Hispanic or Latino"
|
||||
} ]
|
||||
}, {
|
||||
"url": "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName",
|
||||
"valueString": "Leana211 Sauer652"
|
||||
}, {
|
||||
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex",
|
||||
"valueCode": "M"
|
||||
}, {
|
||||
"url": "http://hl7.org/fhir/StructureDefinition/patient-birthPlace",
|
||||
"valueAddress": {
|
||||
"city": "Quincy",
|
||||
"state": "Massachusetts",
|
||||
"country": "US"
|
||||
}
|
||||
}, {
|
||||
"url": "http://synthetichealth.github.io/synthea/disability-adjusted-life-years",
|
||||
"valueDecimal": 0.0
|
||||
}, {
|
||||
"url": "http://synthetichealth.github.io/synthea/quality-adjusted-life-years",
|
||||
"valueDecimal": 1.0
|
||||
} ],
|
||||
"identifier": [
|
||||
{
|
||||
"system": "https://github.com/synthetichealth/synthea",
|
||||
"value": "a7a285c0-4714-dd3c-4837-8719c9b67873"
|
||||
},
|
||||
{
|
||||
"type": {
|
||||
"coding": [ {
|
||||
"system": "http://terminology.hl7.org/CodeSystem/v2-0203",
|
||||
"code": "MR",
|
||||
"display": "Medical Record Number"
|
||||
} ],
|
||||
"text": "Medical Record Number"
|
||||
},
|
||||
"system": "http://hospital.smarthealthit.org",
|
||||
"value": "a7a285c0-4714-dd3c-4837-8719c9b67873"
|
||||
}, {
|
||||
"type": {
|
||||
"coding": [ {
|
||||
"system": "http://terminology.hl7.org/CodeSystem/v2-0203",
|
||||
"code": "SS",
|
||||
"display": "Social Security Number"
|
||||
} ],
|
||||
"text": "Social Security Number"
|
||||
},
|
||||
"system": "http://hl7.org/fhir/sid/us-ssn",
|
||||
"value": "999-89-9528"
|
||||
} ],
|
||||
"name": [ {
|
||||
"use": "official",
|
||||
"family": "Schoen8",
|
||||
"given": [ "Johnny786", "Vince741" ]
|
||||
} ],
|
||||
"telecom": [ {
|
||||
"system": "phone",
|
||||
"value": "555-753-6560",
|
||||
"use": "home"
|
||||
} ],
|
||||
"gender": "male",
|
||||
"birthDate": "2021-05-22",
|
||||
"address": [ {
|
||||
"extension": [ {
|
||||
"url": "http://hl7.org/fhir/StructureDefinition/geolocation",
|
||||
"extension": [ {
|
||||
"url": "latitude",
|
||||
"valueDecimal": 42.05921178859317
|
||||
}, {
|
||||
"url": "longitude",
|
||||
"valueDecimal": -70.79219595855132
|
||||
} ]
|
||||
} ],
|
||||
"line": [ "463 Rempel Ranch Unit 81" ],
|
||||
"city": "Pembroke",
|
||||
"state": "MA",
|
||||
"postalCode": "00000",
|
||||
"country": "US"
|
||||
} ],
|
||||
"maritalStatus": {
|
||||
"coding": [ {
|
||||
"system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus",
|
||||
"code": "S",
|
||||
"display": "Never Married"
|
||||
} ],
|
||||
"text": "Never Married"
|
||||
},
|
||||
"multipleBirthBoolean": false,
|
||||
"communication": [ {
|
||||
"language": {
|
||||
"coding": [ {
|
||||
"system": "urn:ietf:bcp:47",
|
||||
"code": "en-US",
|
||||
"display": "English (United States)"
|
||||
} ],
|
||||
"text": "English (United States)"
|
||||
}
|
||||
} ]
|
||||
},
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "Patient"
|
||||
}
|
||||
}, {
|
||||
"fullUrl": "urn:uuid:0eb53bda-2881-5e8e-3597-87a9430af96a",
|
||||
"resource": {
|
||||
"resourceType": "Encounter",
|
||||
"id": "0eb53bda-2881-5e8e-3597-87a9430af96a",
|
||||
"meta": {
|
||||
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-encounter" ]
|
||||
},
|
||||
"identifier": [ {
|
||||
"use": "official",
|
||||
"system": "https://github.com/synthetichealth/synthea",
|
||||
"value": "0eb53bda-2881-5e8e-3597-87a9430af96a"
|
||||
} ],
|
||||
"status": "finished",
|
||||
"class": {
|
||||
"system": "http://terminology.hl7.org/CodeSystem/v3-ActCode",
|
||||
"code": "AMB"
|
||||
},
|
||||
"type": [ {
|
||||
"coding": [ {
|
||||
"system": "http://snomed.info/sct",
|
||||
"code": "410620009",
|
||||
"display": "Well child visit (procedure)"
|
||||
} ],
|
||||
"text": "Well child visit (procedure)"
|
||||
} ],
|
||||
"subject": {
|
||||
"reference": "urn:uuid:a7a285c0-4714-dd3c-4837-8719c9b67873",
|
||||
"display": "Johnny786 Vince741 Schoen8"
|
||||
},
|
||||
"participant": [ {
|
||||
"type": [ {
|
||||
"coding": [ {
|
||||
"system": "http://terminology.hl7.org/CodeSystem/v3-ParticipationType",
|
||||
"code": "PPRF",
|
||||
"display": "primary performer"
|
||||
} ],
|
||||
"text": "primary performer"
|
||||
} ],
|
||||
"period": {
|
||||
"start": "2021-05-22T00:13:45+02:00",
|
||||
"end": "2021-05-22T00:28:45+02:00"
|
||||
},
|
||||
"individual": {
|
||||
"reference": "Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999942599",
|
||||
"display": "Dr. Regenia619 Bosco882"
|
||||
}
|
||||
} ],
|
||||
"period": {
|
||||
"start": "2021-05-22T00:13:45+02:00",
|
||||
"end": "2021-05-22T00:28:45+02:00"
|
||||
},
|
||||
"location": [ {
|
||||
"location": {
|
||||
"reference": "Location?identifier=https://github.com/synthetichealth/synthea|6e3d04a3-9064-33e4-b8b5-63bb468d7629",
|
||||
"display": "UNITED MEDICAL CARE LLC"
|
||||
}
|
||||
} ],
|
||||
"serviceProvider": {
|
||||
"reference": "Organization?identifier=https://github.com/synthetichealth/synthea|4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
|
||||
"display": "UNITED MEDICAL CARE LLC"
|
||||
}
|
||||
},
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "Encounter"
|
||||
}
|
||||
}]}
|
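The bundle above is a Synthea-generated transaction Bundle; in FHIR's REST API a transaction is executed by POSTing the whole bundle to the server base URL. Below is a minimal sketch using requests (pinned in requirements.txt); the base URL is a placeholder and acceptance of transaction bundles is an assumption about the target server.

# Illustrative only: POST the mock transaction bundle to a FHIR server.
# base_url is a placeholder; adjust to your own endpoint.
import json
import requests

def upload_bundle(path='mockData/short_Johnny_Schoen.json',
                  base_url='http://localhost:8080/fhir'):
    with open(path) as fh:
        bundle = json.load(fh)
    response = requests.post(
        base_url,
        json=bundle,
        headers={'Content-Type': 'application/fhir+json'},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

if __name__ == '__main__':
    print(upload_bundle()['type'])  # a successful transaction returns 'transaction-response'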
98
mockData/short_hospitalInformation.json
Normal file
98
mockData/short_hospitalInformation.json
Normal file
@ -0,0 +1,98 @@
|
||||
{
|
||||
"resourceType": "Bundle",
|
||||
"type": "batch",
|
||||
"entry": [ {
|
||||
"fullUrl": "urn:uuid:4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
|
||||
"resource": {
|
||||
"resourceType": "Organization",
|
||||
"id": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
|
||||
"meta": {
|
||||
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-organization" ]
|
||||
},
|
||||
"extension": [ {
|
||||
"url": "http://synthetichealth.github.io/synthea/utilization-encounters-extension",
|
||||
"valueInteger": 9
|
||||
}, {
|
||||
"url": "http://synthetichealth.github.io/synthea/utilization-procedures-extension",
|
||||
"valueInteger": 2
|
||||
}, {
|
||||
"url": "http://synthetichealth.github.io/synthea/utilization-labs-extension",
|
||||
"valueInteger": 1
|
||||
}, {
|
||||
"url": "http://synthetichealth.github.io/synthea/utilization-prescriptions-extension",
|
||||
"valueInteger": 3
|
||||
} ],
|
||||
"identifier": [ {
|
||||
"system": "https://github.com/synthetichealth/synthea",
|
||||
"value": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
|
||||
} ],
|
||||
"active": true,
|
||||
"type": [ {
|
||||
"coding": [ {
|
||||
"system": "http://terminology.hl7.org/CodeSystem/organization-type",
|
||||
"code": "prov",
|
||||
"display": "Healthcare Provider"
|
||||
} ],
|
||||
"text": "Healthcare Provider"
|
||||
} ],
|
||||
"name": "UNITED MEDICAL CARE LLC",
|
||||
"telecom": [ {
|
||||
"system": "phone",
|
||||
"value": "5089715500"
|
||||
} ],
|
||||
"address": [ {
|
||||
"line": [ "28 RIVERSIDE DR STE 101" ],
|
||||
"city": "PEMBROKE",
|
||||
"state": "MA",
|
||||
"postalCode": "023594947",
|
||||
"country": "US"
|
||||
} ]
|
||||
},
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "Organization",
|
||||
"ifNoneExist": "identifier=https://github.com/synthetichealth/synthea|4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
|
||||
}
|
||||
}, {
|
||||
"fullUrl": "urn:uuid:6e3d04a3-9064-33e4-b8b5-63bb468d7629",
|
||||
"resource": {
|
||||
"resourceType": "Location",
|
||||
"id": "6e3d04a3-9064-33e4-b8b5-63bb468d7629",
|
||||
"meta": {
|
||||
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-location" ]
|
||||
},
|
||||
"identifier": [ {
|
||||
"system": "https://github.com/synthetichealth/synthea",
|
||||
"value": "6e3d04a3-9064-33e4-b8b5-63bb468d7629"
|
||||
} ],
|
||||
"status": "active",
|
||||
"name": "UNITED MEDICAL CARE LLC",
|
||||
"telecom": [ {
|
||||
"system": "phone",
|
||||
"value": "5089715500"
|
||||
} ],
|
||||
"address": {
|
||||
"line": [ "28 RIVERSIDE DR STE 101" ],
|
||||
"city": "PEMBROKE",
|
||||
"state": "MA",
|
||||
"postalCode": "023594947",
|
||||
"country": "US"
|
||||
},
|
||||
"position": {
|
||||
"longitude": -70.77534154695786,
|
||||
"latitude": 42.11004715
|
||||
},
|
||||
"managingOrganization": {
|
||||
"identifier": {
|
||||
"system": "https://github.com/synthetichealth/synthea",
|
||||
"value": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
|
||||
},
|
||||
"display": "UNITED MEDICAL CARE LLC"
|
||||
}
|
||||
},
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "Location",
|
||||
"ifNoneExist": "identifier=https://github.com/synthetichealth/synthea|6e3d04a3-9064-33e4-b8b5-63bb468d7629"
|
||||
}
|
||||
}]}
|
50
mockData/short_practitionerInformation.json
Normal file
50
mockData/short_practitionerInformation.json
Normal file
@ -0,0 +1,50 @@
|
||||
{
|
||||
"resourceType": "Bundle",
|
||||
"type": "batch",
|
||||
"entry": [ {
|
||||
"fullUrl": "urn:uuid:0368f101-0e65-3251-a809-566ebd6b2c2a",
|
||||
"resource": {
|
||||
"resourceType": "Practitioner",
|
||||
"id": "0368f101-0e65-3251-a809-566ebd6b2c2a",
|
||||
"meta": {
|
||||
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-practitioner" ]
|
||||
},
|
||||
"extension": [ {
|
||||
"url": "http://synthetichealth.github.io/synthea/utilization-encounters-extension",
|
||||
"valueInteger": 9
|
||||
} ],
|
||||
"identifier": [ {
|
||||
"system": "http://hl7.org/fhir/sid/us-npi",
|
||||
"value": "9999942599"
|
||||
} ],
|
||||
"active": true,
|
||||
"name": [ {
|
||||
"family": "Bosco882",
|
||||
"given": [ "Regenia619" ],
|
||||
"prefix": [ "Dr." ]
|
||||
} ],
|
||||
"telecom": [ {
|
||||
"extension": [ {
|
||||
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-direct",
|
||||
"valueBoolean": true
|
||||
} ],
|
||||
"system": "email",
|
||||
"value": "Regenia619.Bosco882@example.com",
|
||||
"use": "work"
|
||||
} ],
|
||||
"address": [ {
|
||||
"line": [ "28 RIVERSIDE DR STE 101" ],
|
||||
"city": "PEMBROKE",
|
||||
"state": "MA",
|
||||
"postalCode": "023594947",
|
||||
"country": "US"
|
||||
} ],
|
||||
"gender": "female"
|
||||
},
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "Practitioner",
|
||||
"ifNoneExist": "identifier=http://hl7.org/fhir/sid/us-npi|9999942599"
|
||||
}
|
||||
}]}
|
||||
|
44
pipeline.puml
Normal file
44
pipeline.puml
Normal file
@ -0,0 +1,44 @@
|
||||
@startuml "MeDaX pipeline"
|
||||
left to right direction
|
||||
actor admin
|
||||
database "fhir server" as fhir
|
||||
node "docker compose" as compose{
|
||||
node "python app" as pyApp {
|
||||
[scripts]
|
||||
[nodeGenerator] as ngen
|
||||
[edgeGenerator] as egen
|
||||
[BioCypher] as BC
|
||||
file "generated Schema" as gSchema
|
||||
file "manual Schema" as mSchema
|
||||
|
||||
mSchema --> scripts : input
|
||||
scripts --> gSchema : generates
|
||||
scripts --> ngen : generates
|
||||
scripts --> egen : generates
|
||||
gSchema --> BC : input
|
||||
ngen--> BC : input
|
||||
egen--> BC : input
|
||||
}
|
||||
node "neo4j app" as neoApp{
|
||||
database "neo4j GDB" as neoDB
|
||||
[web server] as neoServer
|
||||
|
||||
neoDB --> neoServer
|
||||
}
|
||||
folder "admin files" as afiles {
|
||||
file nodes
|
||||
file edges
|
||||
file "import script" as iscript
|
||||
}
|
||||
|
||||
|
||||
admin -[dashed]-> compose : triggers
|
||||
|
||||
BC --> afiles : exports
|
||||
fhir --> scripts : http request
|
||||
afiles --> neoApp : input
|
||||
}
|
||||
actor user
|
||||
user --> neoServer : uses
|
||||
neoApp --> pyApp : kills
|
||||
@enduml
|
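To render the diagram locally, one option is the PlantUML jar; a short sketch, assuming a Java runtime and a downloaded plantuml.jar are available (neither is part of this repository).

# Illustrative only: render pipeline.puml to pipeline.png via the PlantUML CLI.
import subprocess

def render_diagram(puml_file='pipeline.puml', jar='plantuml.jar'):
    subprocess.run(['java', '-jar', jar, '-tpng', puml_file], check=True)

if __name__ == '__main__':
    render_diagram()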
3315
poetry.lock
generated
Normal file
3315
poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
132
pyproject.toml
Normal file
132
pyproject.toml
Normal file
@ -0,0 +1,132 @@
|
||||
[tool.poetry]
|
||||
name = "MeDaX pipeline"
|
||||
version = "1.0.0"
|
||||
description = "A unifying framework for biomedical research knowledge graphs"
|
||||
authors = [
|
||||
"Ilya Mazien",
|
||||
"Tom Gebhardt",
|
||||
"Lea Michaelis",
|
||||
"Ron Henkel",
|
||||
"Benjamin Winter",
|
||||
"Dagmar Waltemath",
|
||||
"Judith Wodke"
|
||||
]
|
||||
license = "MIT"
|
||||
packages = [
|
||||
{ include = "biocypher" }
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python",
|
||||
"Natural Language :: English",
|
||||
"Topic :: Scientific/Engineering :: Bio-Informatics"
|
||||
]
|
||||
repository = "https://github.com/biocypher/biocypher"
|
||||
readme = "README.md"
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://www.medizin.uni-greifswald.de/medizininformatik/research/current-projects/medax/"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
PyYAML = ">=5.0"
|
||||
more_itertools = "*"
|
||||
appdirs = "*"
|
||||
treelib = "1.6.4"
|
||||
rdflib = "^6.2.0"
|
||||
networkx = "^3.0"
|
||||
stringcase = "^1.2.0"
|
||||
neo4j-utils = "0.0.7"
|
||||
pandas = "^2.0.1"
|
||||
pooch = "^1.7.0"
|
||||
tqdm = "^4.65.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
sphinx = ">=5.0.0"
|
||||
sphinx-design = "^0.3.0"
|
||||
sphinx-rtd-theme = ">=1.0.0"
|
||||
sphinx-last-updated-by-git = ">=0.3"
|
||||
sphinx-autodoc-typehints = ">=1.18.0"
|
||||
myst-parser = "^0.18.0"
|
||||
yapf = "^0.32.0"
|
||||
pytest = ">=6.0"
|
||||
tox = ">=3.20.1"
|
||||
pre-commit = ">=2.17.0"
|
||||
bump2version = "*"
|
||||
coverage = ">=6.0"
|
||||
pytest-cov = "^3.0.0"
|
||||
hypothesis = "^6.50.1"
|
||||
isort = "^5.10.1"
|
||||
ipython = "^8.7.0"
|
||||
ipykernel = "^6.23.1"
|
||||
sphinxext-opengraph = "^0.8.2"
|
||||
coverage-badge = "^1.1.0"
|
||||
nbsphinx = "^0.9.2"
|
||||
black = "^23.9.1"
|
||||
flake8 = "^6.1.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core<2.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry.urls]
|
||||
"Bug Tracker" = "https://github.com/biocypher/biocypher/issues"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
log_cli = true
|
||||
log_level = "INFO"
|
||||
markers = [
|
||||
"requires_neo4j: Requires connection to a Neo4j server",
|
||||
"requires_postgresql: Requires connection to a PostgreSQL server",
|
||||
"inject_driver_args(driver_args): Arguments for the Driver",
|
||||
]
|
||||
|
||||
[tool.black]
|
||||
line-length = 80
|
||||
target-version = ['py310']
|
||||
include = '\.pyi?$'
|
||||
exclude = '''
|
||||
(
|
||||
/(
|
||||
\.eggs
|
||||
| \.git
|
||||
| \.hg
|
||||
| \.mypy_cache
|
||||
| \.tox
|
||||
| \.venv
|
||||
| _build
|
||||
| buck-out
|
||||
| build
|
||||
| dist
|
||||
)/
|
||||
)
|
||||
'''
|
||||
|
||||
[tool.isort]
|
||||
from_first = true
|
||||
line_length = 80
|
||||
multi_line_output = 3
|
||||
include_trailing_comma = true
|
||||
use_parentheses = true
|
||||
known_num="numpy,pandas"
|
||||
sections = "FUTURE,STDLIB,THIRDPARTY,NUM,FIRSTPARTY,LOCALFOLDER"
|
||||
no_lines_before="LOCALFOLDER"
|
||||
balanced_wrapping = true
|
||||
force_grid_wrap = 0
|
||||
length_sort = "1"
|
||||
indent = " "
|
||||
profile = "black"
|
||||
|
||||
[tool.flake8]
|
||||
ignore = ["E203", "D200", "D202", "D401", "D105", "W504"]
|
||||
per-file-ignores = [
|
||||
"docs/source/conf.py:D100",
|
||||
"tests/*:D100,D101,D102",
|
||||
"*/__init__.py:F401"
|
||||
]
|
||||
max-line-length = 80
|
||||
count = true
|
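The [tool.pytest.ini_options] block registers two custom markers; tests opt in by decorating themselves, and a run can exclude them with pytest -m "not requires_neo4j". The test bodies below are invented; only the marker names come from the configuration above.

# Illustrative test sketch for the markers registered in pyproject.toml.
import pytest

@pytest.mark.requires_neo4j
def test_nodes_are_written():
    # would exercise the Neo4j instance started for CI; placeholder assertion here
    assert True

@pytest.mark.requires_postgresql
def test_postgres_import():
    assert True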
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
requests==2.31.0
|
||||
python-dotenv==1.0.0
|
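These two runtime dependencies cover configuration and FHIR access: python-dotenv loads settings from a .env file and requests talks to the FHIR REST API. A minimal sketch of paging through a Patient search; the environment variable name and the default URL are assumptions about how the pipeline is configured.

# Illustrative only: read the server URL from the environment and page through
# a FHIR search Bundle. Variable names and defaults are placeholders.
import os
import requests
from dotenv import load_dotenv

def fetch_patients(count=10):
    load_dotenv()
    base_url = os.getenv('FHIR_SERVER_URL', 'http://hapi.fhir.org/baseR4')
    bundle = requests.get(f'{base_url}/Patient', params={'_count': count}, timeout=30).json()
    while bundle:
        for entry in bundle.get('entry', []):
            yield entry['resource']
        next_link = next((l['url'] for l in bundle.get('link', []) if l['relation'] == 'next'), None)
        bundle = requests.get(next_link, timeout=30).json() if next_link else None

if __name__ == '__main__':
    for patient in fetch_patients(5):
        print(patient['resourceType'], patient.get('id'))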
188
schema_config_generation.py
Normal file
188
schema_config_generation.py
Normal file
@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
#import networkx as nx
|
||||
import yaml
|
||||
from collections import defaultdict
|
||||
|
||||
#extract all node types and generate basic yaml config part for nodes
|
||||
|
||||
def write_automated_schema(graph, filePath, mSchemaPath):
|
||||
schemaData = {
|
||||
'nodes': {},
|
||||
'edges': {}
|
||||
}
|
||||
|
||||
|
||||
if Path(filePath).exists():
|
||||
schemaData = loadManualSchema(filePath)
|
||||
elif mSchemaPath:
|
||||
print("using the manual schema")
|
||||
schemaData = loadManualSchema(mSchemaPath)
|
||||
|
||||
|
||||
|
||||
for node in graph.nodes():
|
||||
label = graph.nodes[node].get('label')
|
||||
|
||||
if label == 'resource':
|
||||
label = graph.nodes[node].get('resourceType')
|
||||
|
||||
label = label.capitalize()
|
||||
|
||||
if label not in schemaData['nodes']:
|
||||
schemaData['nodes'][label] = {}
|
||||
|
||||
if 'properties' not in schemaData['nodes'][label]:
|
||||
schemaData['nodes'][label]['properties'] = {}
|
||||
|
||||
for k in graph.nodes[node].keys():
|
||||
#print(k, '----- ', graph.nodes[node][k])
|
||||
#if k != 'label':
|
||||
schemaData['nodes'][label]['properties'][k] = 'str'
|
||||
|
||||
#schemaData['nodes'][label]['properties'].update(graph.nodes[node].keys())
|
||||
|
||||
|
||||
file = open(filePath, 'w')
|
||||
|
||||
for n in schemaData['nodes']:
|
||||
temp = n+':\n'
|
||||
if 'is_a' in schemaData['nodes'][n]:
|
||||
temp += ' is_a: ' + schemaData['nodes'][n]['is_a'] + '\n'
|
||||
else:
|
||||
temp += ' is_a: named thing\n'
|
||||
|
||||
if 'represented_as' in schemaData['nodes'][n]:
|
||||
temp += ' represented_as: ' + schemaData['nodes'][n]['represented_as'] + '\n'
|
||||
else:
|
||||
temp += ' represented_as: node\n'
|
||||
|
||||
if 'label_in_input' in schemaData['nodes'][n]:
|
||||
temp += ' label_in_input: ' + schemaData['nodes'][n]['label_in_input'] + '\n'
|
||||
|
||||
if 'preferred_id' in schemaData['nodes'][n]:
|
||||
temp += ' preferred_id: ' + schemaData['nodes'][n]['preferred_id'] + '\n'
|
||||
else:
|
||||
temp += ' preferred_id: fhir_id\n'
|
||||
|
||||
if 'label_in_input' not in schemaData['nodes'][n]:
temp += ' label_in_input: ' + n + '\n'
|
||||
|
||||
temp += ' properties:\n'
|
||||
# get property values from schemaData if exists
|
||||
|
||||
for pKey in schemaData['nodes'][n]['properties']:
|
||||
temp += ' ' + pKey + ': ' + schemaData['nodes'][n]['properties'][pKey] + '\n'
|
||||
#elif schemaData['nodes']['properties']:
|
||||
#print("----> ", schemaData['nodes']['properties'])
|
||||
""" else:
|
||||
for attr in schemaData['nodes'][n]:
|
||||
temp += ' ' + attr + ': str\n' """
|
||||
|
||||
temp += '\n'
|
||||
|
||||
file.write(temp)
|
||||
|
||||
file.write('\n')
|
||||
|
||||
#extract all relationship types and generate basic yaml config part for relationships
|
||||
#if not edgeTypes: edgeTypes = set()
|
||||
|
||||
for u, v, a in graph.edges(data=True):
|
||||
|
||||
#edge_label = graph[u][v].get('edge_type', '')
|
||||
source_label = graph.nodes[u].get('label')
|
||||
target_label = graph.nodes[v].get('label')
|
||||
|
||||
if source_label == 'resource':
|
||||
source_label = graph.nodes[u].get('resourceType', str(u))
|
||||
|
||||
if target_label == 'resource':
|
||||
target_label = graph.nodes[v].get('resourceType', str(v))
|
||||
|
||||
source_label = source_label.capitalize()
|
||||
#target_label = target_label.capitalize()
|
||||
|
||||
|
||||
if source_label + ' to ' + target_label + ' association' in schemaData['edges']:
|
||||
# add missing attributes
|
||||
continue
|
||||
elif source_label + ' derived from ' + target_label + ' association' in schemaData['edges']:
|
||||
continue
|
||||
elif source_label + ' has member ' + target_label + ' association' in schemaData['edges']:
|
||||
continue
|
||||
elif source_label + ' reasoned by ' + target_label + ' association' in schemaData['edges']:
|
||||
continue
|
||||
elif source_label + ' is ' + target_label + ' association' in schemaData['edges']:
|
||||
continue
|
||||
else:
|
||||
#schemaData['edges'][source_label + ' to ' + target_label + ' association'] = set()
|
||||
schemaData['edges'][source_label + ' to ' + target_label + ' association'] = {
|
||||
'is_a': 'association',
|
||||
'represented_as': 'edge',
|
||||
'label_in_input': source_label + '_to_' + target_label,
|
||||
'properties': {k: 'str' for k in a} # declare each edge attribute as a string type, mirroring the node handling above
|
||||
}
|
||||
|
||||
|
||||
for label in schemaData['edges']:
|
||||
temp = '' + label + ':\n'
|
||||
for key in schemaData['edges'][label]:
|
||||
if key == 'properties':
|
||||
temp += ' properties:\n'
|
||||
for prop in schemaData['edges'][label][key]:
|
||||
temp += ' ' + prop + ': ' + schemaData['edges'][label][key][prop] + '\n'
|
||||
else:
|
||||
temp += ' ' + key + ': ' + schemaData['edges'][label][key] + '\n'
|
||||
|
||||
temp += '\n'
|
||||
file.write(temp)
|
||||
|
||||
|
||||
file.close()
|
||||
|
||||
def loadManualSchema(path):
|
||||
schemaData = {
|
||||
'nodes': {},
|
||||
'edges': {}
|
||||
}
|
||||
edgeTypes = set()
|
||||
|
||||
with open(path, 'r') as file:
|
||||
# load the YAML; comments are ignored by the parser
|
||||
data = yaml.safe_load(file)
|
||||
|
||||
for label, attrs in data.items():
|
||||
cLabel = label.capitalize()
|
||||
if label != 'Title':
|
||||
if attrs["represented_as"] == 'node':
|
||||
if cLabel not in schemaData['nodes']:
|
||||
schemaData['nodes'][cLabel] = set()
|
||||
|
||||
# assuming uniqueness in the schema file here; if the same node type exists twice, it will be overwritten
|
||||
schemaData['nodes'][cLabel] = attrs
|
||||
#for a in attrs:
|
||||
|
||||
#print(v)
|
||||
""" for k, v in attrs:
|
||||
if not k == ''
|
||||
schemaData['nodes'][label][k] = v """
|
||||
else:
|
||||
if cLabel not in schemaData['edges']:
|
||||
schemaData['edges'][cLabel] = set()
|
||||
|
||||
# assuming uniqueness in the schema file here; if the same edge type exists twice, it will be overwritten
|
||||
schemaData['edges'][cLabel] = attrs
|
||||
|
||||
return schemaData
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
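A minimal usage sketch for write_automated_schema: build a tiny FHIR-like networkx graph, generate a schema file, and print the result. Node contents and file names are made up; only the function signature comes from the module above.

# Illustrative only: exercise write_automated_schema on a two-node graph.
import networkx as nx
from schema_config_generation import write_automated_schema

g = nx.MultiDiGraph()
g.add_node('pat-1', label='resource', resourceType='Patient', id='pat-1', gender='male')
g.add_node('name-1', label='name', id='name-1', family='Schoen8')
g.add_edge('pat-1', 'name-1')

# no pre-existing or manual schema: the file is generated from the graph alone
write_automated_schema(g, 'auto_schema.yaml', None)

with open('auto_schema.yaml') as fh:
    print(fh.read())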
32154
testData/Alfonzo975_Medhurst46_cab042ec-9851-e5ed-80c8-0952376f5b08.json
Normal file
32154
testData/Alfonzo975_Medhurst46_cab042ec-9851-e5ed-80c8-0952376f5b08.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
30558
testData/Conrad619_Zemlak964_881bcdd0-ce56-0da9-f297-696da35bd4a3.json
Normal file
30558
testData/Conrad619_Zemlak964_881bcdd0-ce56-0da9-f297-696da35bd4a3.json
Normal file
File diff suppressed because it is too large
Load Diff
56795
testData/Daysi106_Borer986_0096fcc6-e2d6-5aed-4790-beda6322c9be.json
Normal file
56795
testData/Daysi106_Borer986_0096fcc6-e2d6-5aed-4790-beda6322c9be.json
Normal file
File diff suppressed because it is too large
Load Diff
74841
testData/Domingo513_Durgan499_97fcce97-37ab-3fa7-d3d7-0729d60afcb5.json
Normal file
74841
testData/Domingo513_Durgan499_97fcce97-37ab-3fa7-d3d7-0729d60afcb5.json
Normal file
File diff suppressed because one or more lines are too long
35694
testData/Donnell534_Koss676_5fcaaba4-cfdf-43a8-95c6-7d9f2fa6905e.json
Normal file
35694
testData/Donnell534_Koss676_5fcaaba4-cfdf-43a8-95c6-7d9f2fa6905e.json
Normal file
File diff suppressed because it is too large
Load Diff
63091
testData/Dorthy94_Klocko335_50390ac6-8c15-46f2-3b23-767f52a2e80c.json
Normal file
63091
testData/Dorthy94_Klocko335_50390ac6-8c15-46f2-3b23-767f52a2e80c.json
Normal file
File diff suppressed because it is too large
Load Diff
125205
testData/Emmie273_Reinger292_630e4b67-6e16-6bc7-6f28-2544b1a5d4d7.json
Normal file
125205
testData/Emmie273_Reinger292_630e4b67-6e16-6bc7-6f28-2544b1a5d4d7.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
100796
testData/Jimmie93_Pfeffer420_b60cecd8-24c6-7983-3915-3bd6e8ede863.json
Normal file
100796
testData/Jimmie93_Pfeffer420_b60cecd8-24c6-7983-3915-3bd6e8ede863.json
Normal file
File diff suppressed because one or more lines are too long
15349
testData/Johnny786_Schoen8_a7a285c0-4714-dd3c-4837-8719c9b67873.json
Normal file
15349
testData/Johnny786_Schoen8_a7a285c0-4714-dd3c-4837-8719c9b67873.json
Normal file
File diff suppressed because it is too large
Load Diff
1143628
testData/Lala778_Jerilyn993_Cole117_ecf9d885-b712-e318-a6fa-f1ec9ffdbde8.json
Normal file
1143628
testData/Lala778_Jerilyn993_Cole117_ecf9d885-b712-e318-a6fa-f1ec9ffdbde8.json
Normal file
File diff suppressed because one or more lines are too long
43259
testData/Lilia791_Herrera193_8ebd900a-3563-5008-1fa9-9614ef666647.json
Normal file
43259
testData/Lilia791_Herrera193_8ebd900a-3563-5008-1fa9-9614ef666647.json
Normal file
File diff suppressed because it is too large
Load Diff
107501
testData/Lois157_Becker968_258cfb42-3e7c-be88-92fd-f31c94f3e76e.json
Normal file
107501
testData/Lois157_Becker968_258cfb42-3e7c-be88-92fd-f31c94f3e76e.json
Normal file
File diff suppressed because it is too large
Load Diff
25182
testData/Marlana402_Hansen121_8c570871-8d00-8c04-5da3-1ef43ed00f76.json
Normal file
25182
testData/Marlana402_Hansen121_8c570871-8d00-8c04-5da3-1ef43ed00f76.json
Normal file
File diff suppressed because it is too large
Load Diff
29082
testData/Monte325_Prosacco716_2268a882-1b01-f71f-4377-a0dd87a78dd9.json
Normal file
29082
testData/Monte325_Prosacco716_2268a882-1b01-f71f-4377-a0dd87a78dd9.json
Normal file
File diff suppressed because it is too large
Load Diff
67500
testData/Murray856_Lesch175_0759e805-61d1-ba2b-8b14-9e4308784a84.json
Normal file
67500
testData/Murray856_Lesch175_0759e805-61d1-ba2b-8b14-9e4308784a84.json
Normal file
File diff suppressed because it is too large
Load Diff
20506
testData/Niesha86_Anderson154_2e96848a-43c1-78ea-16f2-d9d471d0f9d2.json
Normal file
20506
testData/Niesha86_Anderson154_2e96848a-43c1-78ea-16f2-d9d471d0f9d2.json
Normal file
File diff suppressed because it is too large
Load Diff
24191
testData/Perry780_Gaylord332_c6456550-9c7a-0cf3-c18b-b266ba91ef1b.json
Normal file
24191
testData/Perry780_Gaylord332_c6456550-9c7a-0cf3-c18b-b266ba91ef1b.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
7798
testData/hospitalInformation1693908535569.json
Normal file
7798
testData/hospitalInformation1693908535569.json
Normal file
File diff suppressed because it is too large
Load Diff
8779
testData/practitionerInformation1693908535569.json
Normal file
8779
testData/practitionerInformation1693908535569.json
Normal file
File diff suppressed because it is too large
Load Diff