release commit

Tom Gebhardt 2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions

10
.bumpversion.cfg Normal file

@@ -0,0 +1,10 @@
[bumpversion]
current_version = 0.6.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
serialize = {major}.{minor}.{patch}
[bumpversion:file:pyproject.toml]
[bumpversion:file:biocypher/_metadata.py]

13
.env.example Normal file

@@ -0,0 +1,13 @@
MODE=testserver
COMPLEX_PATIENTS=TRUE
FHIR_SERVER_URL=http://hapi.fhir.org/baseR4
#FHIR_SERVER_USER=
#FHIR_SERVER_PW=
#HTTP_PROXY=
#HTTPS_PROXY=
#NO_PROXY=
NUMBER_OF_PATIENTS=100
BATCH_SIZE=35


@@ -0,0 +1,41 @@
name: "Test and code quality"
description: "Run tests and code quality checks"
inputs:
NEO4J_VERSION:
description: "Neo4j version"
runs:
using: "composite"
steps:
#----------------------------------------------
# setup docker containers for testing
#----------------------------------------------
# currently only running on Linux due to technical limitations
# - name: Install Docker
# uses: douglascamata/setup-docker-macos-action@v1-alpha
# if: ${{ runner.os == 'macOS' }}
- name: Start Neo4j Docker
run: docker run --restart always --publish=7474:7474 --publish=7687:7687 --env NEO4J_AUTH=neo4j/your_password_here --env NEO4J_PLUGINS='["apoc"]' --env=NEO4J_ACCEPT_LICENSE_AGREEMENT=yes -d neo4j:${{ inputs.NEO4J_VERSION }}
shell: bash
if: ${{ runner.os == 'Linux' }}
- name: Start Postgres Docker
run: docker run --restart always --publish=5432:5432 --env POSTGRES_PASSWORD=postgres -d postgres:11.21-bullseye
shell: bash
if: ${{ runner.os == 'Linux' }}
#----------------------------------------------
# run tests and code quality checks
#----------------------------------------------
- name: Run Tests (Windows)
run: |
poetry run pytest --version
poetry run pytest --password=your_password_here
shell: bash
if: runner.os == 'Windows'
- name: Run tests (Linux and MacOS)
run: |
poetry run pytest --version
poetry run pytest --password=your_password_here
shell: bash
if: runner.os != 'Windows'
- name: Check code quality
uses: pre-commit/action@v3.0.0

33
.gitignore vendored Normal file

@@ -0,0 +1,33 @@
*~
*__pycache__
build/
docs/pypath_log/
docs/_build/
docs/biocypher-log/
docs/modules/
docs/notebooks/*.yaml
docs/notebooks/*.py
.DS_Store
.vscode
biocypher.egg-info/
*.egg
dist/
*.prof
*.coverage
*.pickle
out/*
biocypher-log/*
biocypher-out/*
*.log
dist/*
*.pye
*.pyc
*.kate-swp
.hypothesis/
.venv/
.empty
.pytest_cache
*.graphml
.idea/*
.cache
*.iml

3
.gitmodules vendored Normal file

@@ -0,0 +1,3 @@
[submodule "networkx-based"]
path = networkx-based
url = git@git.uni-greifswald.de:MeDaX/networkx-based.git

50
.pre-commit-config.yaml Normal file

@@ -0,0 +1,50 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
fail_fast: false
default_language_version:
python: python3
default_stages:
- commit
- push
minimum_pre_commit_version: 2.7.1
repos:
- repo: https://github.com/ambv/black
rev: 23.7.0
hooks:
- id: black
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
hooks:
- id: isort
additional_dependencies: [toml]
- repo: https://github.com/snok/pep585-upgrade
rev: v1.0
hooks:
- id: upgrade-type-hints
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: check-docstring-first
- id: end-of-file-fixer
- id: check-added-large-files
- id: mixed-line-ending
- id: trailing-whitespace
exclude: ^.bumpversion.cfg$
- id: check-merge-conflict
- id: check-case-conflict
- id: check-symlinks
- id: check-yaml
args: [--unsafe]
- id: check-ast
- id: fix-encoding-pragma
args: [--remove] # for Python3 codebase, it's not necessary
- id: requirements-txt-fixer
- repo: https://github.com/pre-commit/pygrep-hooks
rev: v1.10.0
hooks:
- id: python-no-eval
- id: python-use-type-annotations
- id: python-check-blanket-noqa
- id: rst-backticks
- id: rst-directive-colons
- id: rst-inline-touching-normal

36
Dockerfile Normal file

@@ -0,0 +1,36 @@
FROM python:3.9-slim
WORKDIR /app
# Copy requirements file if you have one
COPY requirements.txt .
RUN pip install --upgrade packaging
RUN pip install -r requirements.txt
# Install poetry
RUN pip install --no-cache-dir "poetry<2.0.0"
# Copy .env file
COPY .env ./
# Copy only pyproject.toml and poetry.lock (if exists) first
COPY pyproject.toml ./
COPY poetry.lock* ./
# Configure poetry to not create a virtual environment inside the container
RUN poetry config virtualenvs.create false
# Install dependencies
RUN poetry install --no-dev --no-interaction --no-ansi
# Copy your project files
COPY . .
# Make the entrypoint script executable
COPY entrypoint.sh .
RUN chmod +x entrypoint.sh
RUN sed -i 's/\r$//' /app/entrypoint.sh
ENTRYPOINT ["/app/entrypoint.sh"]

22
LICENSE Normal file

@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2022 Saez Lab
Copyright (c) 2025 MeDaX research group
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

110
README.md Normal file

@@ -0,0 +1,110 @@
# MeDaX Pipeline
## 📋 Description
The MeDaX pipeline transforms healthcare data from FHIR databases into Neo4j graph databases. This conversion enables efficient searching, querying, and analyses of interconnected health data that would otherwise be complex to retrieve using traditional SQL databases.
## ✨ Features
- Seamless conversion from FHIR to Neo4j graph structure
- Support for patient-centric data retrieval using FHIR's `$everything` operation
- Configurable batch processing for handling large datasets
- Docker-based deployment for easy setup and portability
- Compatible with public FHIR servers (e.g., HAPI FHIR) and private authenticated instances
## ⚙️ Prerequisites
- [Docker](https://docs.docker.com/engine/install/) with the [Docker Compose plugin](https://docs.docker.com/compose/install/linux/)
- A FHIR database with API access and the `$everything` operation enabled for retrieving patient data
- Alternatively: Use a public FHIR server such as [HAPI FHIR](https://hapi.fhir.org/) (default configuration)
## 🚀 Installation
### Setup
1. Clone this repository
2. Create an environment configuration file by copying `.env.example` to `.env` (see the sketch after this list)
3. Configure the environment variables in `.env`:
- For HAPI test server (default): No changes needed
- For custom FHIR server:
- Set `MODE` to any value other than `testserver`
- Uncomment and set the `FHIR_SERVER_URL`, `FHIR_SERVER_USER`, and `FHIR_SERVER_PW` variables
- Adjust `BATCH_SIZE` and `NUMBER_OF_PATIENTS` according to your needs
- Configure any required proxy settings
4. If needed, modify proxy settings in the `Dockerfile`
- Uncomment and set proxy variables
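A minimal sketch of the setup steps above, assuming a POSIX shell; the clone URL, the target directory, and `$EDITOR` are placeholders for your own values:
```bash
# 1. Clone this repository (URL and directory are placeholders)
git clone <repository-url> medax-pipeline
cd medax-pipeline

# 2. Create the environment configuration from the provided template
cp .env.example .env

# 3. Adjust the variables in .env, or keep the defaults for the HAPI test server
$EDITOR .env
```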
### Running the Pipeline
**Start the containers:**
```bash
docker compose up --build
```
**Stop and clean up (between runs):**
```bash
docker compose down --volumes
```
**Complete removal (containers and images):**
```bash
docker compose down --volumes --rmi all
```
> **Note:** Depending on your Docker installation, you might need to use `docker-compose` instead of `docker compose`.
## 🔍 Accessing the Neo4j Database
Once the pipeline has completed processing, you can access the Neo4j database:
1. Open your browser and navigate to `http://localhost:8080/`
2. Connect using the following credentials:
- Username: neo4j
- Password: neo4j
3. Set a new password when prompted and store it in a secure password manager
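As an alternative to the browser UI, you can run queries directly inside the Neo4j container. This is a hedged sketch: the container name and password are placeholders (look the name up with `docker ps`), and it assumes the container image provides `cypher-shell`, as the official Neo4j images do:
```bash
# List running containers to find the Neo4j container name
docker ps
# Open an interactive Cypher shell inside it (use the password you just set)
docker exec -it <neo4j-container-name> cypher-shell -u neo4j -p <your-new-password>
```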
## 📊 Example Queries
Here are some basic Cypher queries to get you started with exploring your health data:
```cypher
// Count all nodes by type
MATCH (n) RETURN labels(n) as NodeType, count(*) as Count;
// Find all records for a specific patient
MATCH (p:Patient {id: 'patient-id'})-[r]-(connected)
RETURN p, r, connected;
// Retrieve all medication prescriptions
MATCH (m:Medication)-[r]-(p:Patient)
RETURN m, r, p;
```
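To get an overview of what the pipeline actually produced for your data, the following additional queries may help; the first is schema-agnostic and works on any Neo4j graph, the second reuses the `Patient` label from the examples above:
```cypher
// Count all relationships by type
MATCH ()-[r]->()
RETURN type(r) AS RelationshipType, count(*) AS Count
ORDER BY Count DESC;

// Inspect which properties a Patient node carries
MATCH (p:Patient)
RETURN keys(p) AS PatientProperties
LIMIT 1;
```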
## ❓ Troubleshooting
**Common Issues:**
- **Connection refused to FHIR server**: Check your network settings and ensure the FHIR server is accessible from within the Docker container.
- **Authentication failures**: Verify your credentials in the `.env` file.
- **Container startup failures**: Ensure all required Docker ports are available and not used by other applications.
- **No data found in FHIR bundle**: Ensure that the FHIR server is up and returning patient data. Try setting the `COMPLEX_PATIENTS` variable to `FALSE` in your `.env` file; some FHIR servers might not support the FHIR search logic.
## 📚 Architecture
The MeDaX pipeline consists of the following components:
1. **FHIR Client**: Connects to the FHIR server and retrieves patient data
2. **Data Transformer**: Converts FHIR resources into graph entities and relationships
3. **Reference Processor**: Converts references to relationships
4. **BioCypher Adapter**: Prepares the transformed data for Neo4j admin import
5. **Neo4j Database**: Stores and serves the graph representation of the health data
## ✍️ Citation
If you use the MeDaX pipeline in your research, please cite DOI [10.5281/zenodo.15229077](https://doi.org/10.5281/zenodo.15229077).
## 🙏 Acknowledgements
- We are leveraging [BioCypher](https://biocypher.org) [![DOI](https://zenodo.org/badge/DOI/10.1038/s41587-023-01848-y.svg)](https://doi.org/10.1038/s41587-023-01848-y) to create the Neo4j admin input.
- Remark: We made slight adjustments to BioCypher's code to support batching.
- We used BioCypher's git template as a starting point for our development:
- Lobentanzer, S., BioCypher Consortium, & Saez-Rodriguez, J. Democratizing knowledge representation with BioCypher [Computer software]. https://github.com/biocypher/biocypher

41
biocypher/__init__.py Normal file

@@ -0,0 +1,41 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher: a unifying framework for biomedical knowledge graphs.
"""
__all__ = [
"__version__",
"__author__",
"module_data",
"config",
"logfile",
"log",
"Driver",
"BioCypher",
"Resource",
]
from ._get import Resource
from ._core import BioCypher
from ._config import config, module_data
from ._logger import log, logger, logfile
from ._metadata import __author__, __version__
class Driver(BioCypher):
# initialise parent class but log a warning
def __init__(self, *args, **kwargs):
logger.warning(
"The class `Driver` is deprecated and will be removed in a future "
"release. Please use `BioCypher` instead."
)
super().__init__(*args, **kwargs)


@@ -0,0 +1,148 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Module data directory, including:
* The BioLink database schema
* The default config files
"""
from typing import Any, Optional
import os
import warnings
import yaml
import appdirs
__all__ = ["module_data", "module_data_path", "read_config", "config", "reset"]
_USER_CONFIG_DIR = appdirs.user_config_dir("biocypher", "saezlab")
_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, "conf.yaml")
class MyLoader(yaml.SafeLoader):
def construct_scalar(self, node):
# Check if the scalar contains double quotes and an escape sequence
value = super().construct_scalar(node)
q = bool(node.style == '"')
b = bool("\\" in value.encode("unicode_escape").decode("utf-8"))
if q and b:
warnings.warn(
(
"Double quotes detected in YAML configuration scalar: "
f"{value.encode('unicode_escape')}. "
"These allow escape sequences and may cause problems, for "
"instance with the Neo4j admin import files (e.g. '\\t'). "
"Make sure you wanted to do this, and use single quotes "
"whenever possible."
),
category=UserWarning,
)
return value
def module_data_path(name: str) -> str:
"""
Absolute path to a YAML file shipped with the module.
"""
here = os.path.dirname(os.path.abspath(__file__))
return os.path.join(here, f"{name}.yaml")
def module_data(name: str) -> Any:
"""
Retrieve the contents of a YAML file shipped with this module.
"""
path = module_data_path(name)
return _read_yaml(path)
def _read_yaml(path: str) -> Optional[dict]:
if os.path.exists(path):
with open(path, "r") as fp:
return yaml.load(fp.read(), Loader=MyLoader)
def read_config() -> dict:
"""
Read the module config.
Read and merge the built-in default, the user-level, and the directory-level
configuration, with the latter taking precedence over the former.
TODO explain path configuration
"""
defaults = module_data("biocypher_config")
user = _read_yaml(_USER_CONFIG_FILE) or {}
# TODO account for .yml?
local = (
_read_yaml("biocypher_config.yaml")
or _read_yaml("config/biocypher_config.yaml")
or {}
)
for key in defaults:
value = (
local[key] if key in local else user[key] if key in user else None
)
if value is not None:
if isinstance(
defaults[key], str
): # first level config (like title)
defaults[key] = value
else:
defaults[key].update(value)
return defaults
def config(*args, **kwargs) -> Optional[Any]:
"""
Set or get module config parameters.
"""
if args and kwargs:
raise ValueError(
"Setting and getting values in the same call is not allowed.",
)
if args:
result = tuple(globals()["_config"].get(key, None) for key in args)
return result[0] if len(result) == 1 else result
for key, value in kwargs.items():
globals()["_config"][key].update(value)
def reset():
"""
Reload configuration from the config files.
"""
globals()["_config"] = read_config()
reset()
def update_from_file(path: str):
"""
Update the module configuration from a YAML file.
"""
config(**_read_yaml(path))


@@ -0,0 +1,141 @@
Title: BioCypher python module configuration file
## Some options are not used by default. Uncomment them to use them.
biocypher:
### Required parameters ###
## DBMS type
dbms: neo4j
## Schema configuration
# schema_config_path: config/schema_config.yaml
## Offline mode: do not connect to a running DBMS instance
## Can be used e.g. for writing batch import files
offline: true
## Strict mode: do not allow creating new nodes or relationships without
## specifying source, version, and license parameters
strict_mode: false
## Ontology configuration
head_ontology:
url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
root_node: entity
# switch_label_and_id: true
### Optional parameters ###
## Logging
# Write log to disk
log_to_disk: true
# Activate more granular logging
debug: true
# Change the log directory
# log_directory: biocypher-log
## Data output directory
# output_directory: biocypher-out
## Resource cache directory
# cache_directory: .cache
## Optional tail ontologies
# tail_ontologies:
# so:
# url: test/ontologies/so.owl
# head_join_node: sequence variant
# tail_join_node: sequence_variant
# switch_label_and_id: true
# mondo:
# url: test/ontologies/mondo.owl
# head_join_node: disease
# tail_join_node: disease
# switch_label_and_id: true
### DBMS configuration ###
neo4j:
### Neo4j configuration ###
## Database name
database_name: neo4j
## Wipe DB before import (offline mode: --force)
wipe: true
## Neo4j authentication
uri: neo4j://localhost:7687
user: neo4j
password: neo4j
## Neo4j admin import batch writer settings
delimiter: ";"
array_delimiter: "|"
quote_character: "'"
## MultiDB functionality
## Set to false for using community edition or older versions of Neo4j
multi_db: true
## Import options
skip_duplicate_nodes: false
skip_bad_relationships: false
## Import call prefixes
# import_call_bin_prefix: bin/
# import_call_file_prefix: path/to/files/
postgresql:
### PostgreSQL configuration ###
# PostgreSQL connection credentials
database_name: postgres # DB name
user: postgres # user name
password: postgres # password
host: localhost # host
port: 5432 # port
# PostgreSQL import batch writer settings
quote_character: '"'
delimiter: '\t'
# import_call_bin_prefix: '' # path to "psql"
# import_call_file_prefix: '/path/to/files'
rdf:
### RDF configuration ###
rdf_format: turtle
sqlite:
### SQLite configuration ###
# SQLite connection credentials
database_name: sqlite.db # DB name
# SQLite import batch writer settings
quote_character: '"'
delimiter: '\t'
# import_call_bin_prefix: '' # path to "sqlite3"
# import_call_file_prefix: '/path/to/files'
csv:
### CSV/Pandas configuration ###
delimiter: ","
networkx:
### NetworkX configuration ###
some_config: some_value # placeholder for technical reasons TODO


@@ -0,0 +1,5 @@
# We test the quote detection
valid: 'This is a valid string'
also_valid: "This is also a valid string"
invalid: "\t"


@@ -0,0 +1,140 @@
Title: BioCypher graph schema configuration file
# ---
# "Named Things"
# ---
protein:
represented_as: node
preferred_id: uniprot
input_label: protein
db_collection_name: proteins
properties:
name: str
score: float
taxon: int
genes: str[]
microRNA:
represented_as: node
preferred_id: mirbase.mature
input_label: mirna
complex:
synonym_for: macromolecular complex
represented_as: node
preferred_id: complexportal
input_label: complex
pathway:
represented_as: node
preferred_id: [reactome, wikipathways]
input_label: [reactome, wikipathways]
gene:
represented_as: node
preferred_id: hgnc
input_label: [hgnc, ensg]
exclude_properties: accession
disease:
represented_as: node
preferred_id: doid
input_label: Disease
side effect:
is_a: phenotypic feature
represented_as: node
preferred_id: sider.effect
input_label: sider
sequence variant:
represented_as: node
preferred_id: [clinically relevant, known, somatic]
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
properties:
source: str
original_source: str
effect: str
biotype: str
snRNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: [intact, rnacentral]
input_label: [intact_snrna, rnacentral_snrna]
properties:
ac: str
fullName: str
shortName: str
preferredName: str
exclude_properties: sequence
DNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: ensembl
input_label: dna
properties:
ac: str
fullName: str
shortName: str
preferredName: str
sequence: str
dsDNA sequence:
is_a: [DNA sequence, nucleic acid entity]
inherit_properties: True
represented_as: node
preferred_id: [intact, uniparc]
input_label: [intact_dsdna, uniprot_archive_dsdna]
# ---
# Associations
# ---
post translational interaction:
is_a: pairwise molecular interaction
represented_as: node
label_as_edge: INTERACTS_POST_TRANSLATIONAL
input_label: post_translational
phosphorylation:
is_a: post translational interaction
represented_as: edge
input_label: phosphorylation
gene to disease association:
represented_as: edge
label_as_edge: PERTURBED_IN_DISEASE
input_label: [protein_disease, gene_disease]
exclude_properties: accession
mutation to tissue association:
is_a: [genotype to tissue association, entity to tissue association, association]
represented_as: edge
label_as_edge: Is_Mutated_In
input_label: Gene_Is_Mutated_In_Cell_Tissue
variant to gene association: # -> Known.... and Somatic....
represented_as: edge
source: [known.sequence variant, somatic.sequence variant]
target: gene
input_label: [
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
]
gene to gene association:
represented_as: edge
input_label: gene_gene
properties:
directional: bool
curated: bool
score: float
id: str # should be removed
gene to variant association: # should be removed
is_a: gene to variant association
represented_as: edge
input_label: gene_variant


@@ -0,0 +1,3 @@
disconnected:
represented_as: node
label_in_input: disconnected


@@ -0,0 +1,152 @@
Title: BioCypher graph schema configuration file
# ---
# "Named Things"
# ---
protein:
represented_as: node
preferred_id: uniprot
input_label: protein
db_collection_name: proteins
properties:
name: str
score: float
taxon: int
genes: str[]
microRNA:
represented_as: node
preferred_id: mirbase.mature
input_label: mirna
complex:
synonym_for: macromolecular complex
represented_as: node
preferred_id: complexportal
input_label: complex
pathway:
represented_as: node
preferred_id: [reactome, wikipathways]
input_label: [reactome, wikipathways]
gene:
represented_as: node
preferred_id: hgnc
input_label: [hgnc, ensg]
exclude_properties: accession
disease:
represented_as: node
preferred_id: doid
input_label: Disease
side effect:
is_a: phenotypic feature
represented_as: node
preferred_id: sider.effect
input_label: sider
sequence variant:
represented_as: node
preferred_id: [clinically relevant, known, somatic]
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
properties:
source: str
original_source: str
effect: str
biotype: str
altered gene product level:
represented_as: node
input_label: agpl
decreased gene product level:
represented_as: node
input_label: agpl_decreased
lethal variant:
represented_as: node
input_label: lethal
snRNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: [intact, rnacentral]
input_label: [intact_snrna, rnacentral_snrna]
properties:
ac: str
fullName: str
shortName: str
preferredName: str
exclude_properties: sequence
DNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: ensembl
input_label: dna
properties:
ac: str
fullName: str
shortName: str
preferredName: str
sequence: str
dsDNA sequence:
is_a: [DNA sequence, nucleic acid entity]
inherit_properties: True
represented_as: node
preferred_id: [intact, uniparc]
input_label: [intact_dsdna, uniprot_archive_dsdna]
# ---
# Associations
# ---
post translational interaction:
is_a: pairwise molecular interaction
represented_as: node
label_as_edge: INTERACTS_POST_TRANSLATIONAL
input_label: post_translational
phosphorylation:
is_a: post translational interaction
represented_as: edge
use_id: false
input_label: phosphorylation
gene to disease association:
represented_as: edge
label_as_edge: PERTURBED_IN_DISEASE
input_label: [protein_disease, gene_disease]
exclude_properties: accession
mutation to tissue association:
is_a: [genotype to tissue association, entity to tissue association, association]
represented_as: edge
label_as_edge: Is_Mutated_In
input_label: Gene_Is_Mutated_In_Cell_Tissue
variant to gene association: # -> Known.... and Somatic....
represented_as: edge
source: [known.sequence variant, somatic.sequence variant]
target: gene
input_label: [
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
]
gene to gene association:
represented_as: edge
input_label: gene_gene
properties:
directional: bool
curated: bool
score: float
gene to variant association:
is_a: gene to variant association
represented_as: edge
input_label: gene_variant

734
biocypher/_core.py Normal file

@@ -0,0 +1,734 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher core module. Interfaces with the user and distributes tasks to
submodules.
"""
from typing import Optional
from datetime import datetime
import os
import json
from more_itertools import peekable
import yaml
import pandas as pd
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from ._get import Downloader
from ._config import config as _config
from ._config import update_from_file as _file_update
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from ._mapping import OntologyMapping
from ._ontology import Ontology
from ._translate import Translator
from ._deduplicate import Deduplicator
from .output.in_memory._pandas import Pandas
from .output.write._get_writer import DBMS_TO_CLASS, get_writer
from .output.connect._neo4j_driver import get_driver
__all__ = ["BioCypher"]
SUPPORTED_DBMS = DBMS_TO_CLASS.keys()
REQUIRED_CONFIG = [
"dbms",
"offline",
"strict_mode",
"head_ontology",
]
class BioCypher:
"""
Orchestration of BioCypher operations. Instantiate this class to interact
with BioCypher.
Args:
dbms (str): The database management system to use. For supported
systems see SUPPORTED_DBMS.
offline (bool): Whether to run in offline mode. If True, no
connection to the database will be made.
strict_mode (bool): Whether to run in strict mode. If True, the
translator will raise an error if a node or edge does not
provide source, version, and licence information.
biocypher_config_path (str): Path to the BioCypher config file.
schema_config_path (str): Path to the user schema config
file.
head_ontology (dict): The head ontology defined by URL ('url') and root
node ('root_node').
tail_ontologies (dict): The tail ontologies defined by URL and
join nodes for both head and tail ontology.
output_directory (str): Path to the output directory. If not
provided, the default value 'biocypher-out' will be used.
"""
def __init__(
self,
dbms: str = None,
offline: bool = None,
strict_mode: bool = None,
biocypher_config_path: str = None,
schema_config_path: str = None,
head_ontology: dict = None,
tail_ontologies: dict = None,
output_directory: str = None,
cache_directory: str = None,
# legacy params
db_name: str = None,
):
# Update configuration if custom path is provided
if biocypher_config_path:
_file_update(biocypher_config_path)
if db_name:
logger.warning(
"The parameter `db_name` is deprecated. Please set the "
"`database_name` setting in the `biocypher_config.yaml` file "
"instead."
)
_config(**{db_name: {"database_name": db_name}})
# Load configuration
self.base_config = _config("biocypher")
# Check for required configuration
for key in REQUIRED_CONFIG:
if key not in self.base_config:
raise ValueError(f"Configuration key {key} is required.")
# Set configuration - mandatory
self._dbms = dbms or self.base_config["dbms"]
if offline is None:
self._offline = self.base_config["offline"]
else:
self._offline = offline
if strict_mode is None:
self._strict_mode = self.base_config["strict_mode"]
else:
self._strict_mode = strict_mode
self._schema_config_path = schema_config_path or self.base_config.get(
"schema_config_path"
)
if not self._schema_config_path:
logger.warning("Running BioCypher without schema configuration.")
else:
logger.info(
f"Running BioCypher with schema configuration from {self._schema_config_path}."
)
self._head_ontology = head_ontology or self.base_config["head_ontology"]
# Set configuration - optional
self._output_directory = output_directory or self.base_config.get(
"output_directory"
)
self._cache_directory = cache_directory or self.base_config.get(
"cache_directory"
)
self._tail_ontologies = tail_ontologies or self.base_config.get(
"tail_ontologies"
)
if self._dbms not in SUPPORTED_DBMS:
raise ValueError(
f"DBMS {self._dbms} not supported. "
f"Please select from {SUPPORTED_DBMS}."
)
# Initialize
self._ontology_mapping = None
self._deduplicator = None
self._translator = None
self._downloader = None
self._ontology = None
self._writer = None
self._pd = None
def _get_deduplicator(self) -> Deduplicator:
"""
Create the deduplicator if it does not exist yet and return it.
"""
if not self._deduplicator:
self._deduplicator = Deduplicator()
return self._deduplicator
def _get_ontology_mapping(self) -> OntologyMapping:
"""
Create the ontology mapping if it does not exist yet and return it.
"""
if not self._schema_config_path:
self._ontology_mapping = OntologyMapping()
if not self._ontology_mapping:
self._ontology_mapping = OntologyMapping(
config_file=self._schema_config_path,
)
return self._ontology_mapping
def _get_ontology(self) -> Ontology:
"""
Create the ontology if it does not exist yet and return it.
"""
if not self._ontology:
self._ontology = Ontology(
ontology_mapping=self._get_ontology_mapping(),
head_ontology=self._head_ontology,
tail_ontologies=self._tail_ontologies,
)
return self._ontology
def _get_translator(self) -> Translator:
"""
Create the translator if it does not exist yet and return it.
"""
if not self._translator:
self._translator = Translator(
ontology=self._get_ontology(),
strict_mode=self._strict_mode,
)
return self._translator
def _get_writer(self):
"""
Create the writer if running in offline mode and set it as instance variable `self._writer`.
"""
if self._offline:
timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
outdir = self._output_directory or os.path.join(
"biocypher-out", timestamp()
)
self._output_directory = os.path.abspath(outdir)
self._writer = get_writer(
dbms=self._dbms,
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
output_directory=self._output_directory,
strict_mode=self._strict_mode,
)
else:
raise NotImplementedError("Cannot get writer in online mode.")
def _get_driver(self):
"""
Create the driver if it does not exist yet and set it as instance variable `self._driver`.
"""
if not self._offline:
self._driver = get_driver(
dbms=self._dbms,
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
)
else:
raise NotImplementedError("Cannot get driver in offline mode.")
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Write nodes to database. Either takes an iterable of tuples (if given,
translates to ``BioCypherNode`` objects) or an iterable of
``BioCypherNode`` objects.
Args:
nodes (iterable): An iterable of nodes to write to the database.
batch_size (int): The batch size to use when writing to disk.
force (bool): Whether to force writing to the output directory even
if the node type is not present in the schema config file.
Returns:
bool: True if successful.
"""
if not self._writer:
self._get_writer()
nodes = peekable(nodes)
if not isinstance(nodes.peek(), BioCypherNode):
tnodes = self._translator.translate_nodes(nodes)
else:
tnodes = nodes
# write node files
return self._writer.write_nodes(
tnodes, batch_size=batch_size, force=force
)
def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
"""
Write edges to database. Either takes an iterable of tuples (if given,
translates to ``BioCypherEdge`` objects) or an iterable of
``BioCypherEdge`` objects.
Args:
edges (iterable): An iterable of edges to write to the database.
Returns:
bool: True if successful.
"""
if not self._writer:
self._get_writer()
edges = peekable(edges)
if not isinstance(edges.peek(), BioCypherEdge):
tedges = self._translator.translate_edges(edges)
else:
tedges = edges
# write edge files
return self._writer.write_edges(tedges, batch_size=batch_size)
def to_df(self) -> list[pd.DataFrame]:
"""
Convert the entities previously added via ``add()`` to one pandas
DataFrame per entity type and return them as a list.
Returns:
list[pd.DataFrame]: One DataFrame per entity type.
"""
if not self._pd:
raise ValueError(
"No pandas instance found. Please call `add()` first."
)
return self._pd.dfs
def add(self, entities) -> None:
"""
Function to add entities to the in-memory database. Accepts an iterable
of tuples (if given, translates to ``BioCypherNode`` or
``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
``BioCypherEdge`` objects.
Args:
entities (iterable): An iterable of entities to add to the database.
Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
4-tuples for edges (deprecated).
Returns:
None
"""
if not self._pd:
self._pd = Pandas(
translator=self._get_translator(),
deduplicator=self._get_deduplicator(),
)
entities = peekable(entities)
if (
isinstance(entities.peek(), BioCypherNode)
or isinstance(entities.peek(), BioCypherEdge)
or isinstance(entities.peek(), BioCypherRelAsNode)
):
tentities = entities
elif len(entities.peek()) < 4:
tentities = self._translator.translate_nodes(entities)
else:
tentities = self._translator.translate_edges(entities)
self._pd.add_tables(tentities)
def add_nodes(self, nodes) -> None:
"""
Wrapper for ``add()`` to add nodes to the in-memory database.
Args:
nodes (iterable): An iterable of node tuples to add to the database.
Returns:
None
"""
self.add(nodes)
def add_edges(self, edges) -> None:
"""
Wrapper for ``add()`` to add edges to the in-memory database.
Args:
edges (iterable): An iterable of edge tuples to add to the database.
Returns:
None
"""
self.add(edges)
def merge_nodes(self, nodes) -> bool:
"""
Merge nodes into database. Either takes an iterable of tuples (if given,
translates to ``BioCypherNode`` objects) or an iterable of
``BioCypherNode`` objects.
Args:
nodes (iterable): An iterable of nodes to merge into the database.
Returns:
bool: True if successful.
"""
if not self._driver:
self._get_driver()
nodes = peekable(nodes)
if not isinstance(nodes.peek(), BioCypherNode):
tnodes = self._translator.translate_nodes(nodes)
else:
tnodes = nodes
# write node files
return self._driver.add_biocypher_nodes(tnodes)
def merge_edges(self, edges) -> bool:
"""
Merge edges into database. Either takes an iterable of tuples (if given,
translates to ``BioCypherEdge`` objects) or an iterable of
``BioCypherEdge`` objects.
Args:
edges (iterable): An iterable of edges to merge into the database.
Returns:
bool: True if successful.
"""
if not self._driver:
self._get_driver()
edges = peekable(edges)
if not isinstance(edges.peek(), BioCypherEdge):
tedges = self._translator.translate_edges(edges)
else:
tedges = edges
# write edge files
return self._driver.add_biocypher_edges(tedges)
# DOWNLOAD AND CACHE MANAGEMENT METHODS ###
def _get_downloader(self, cache_dir: Optional[str] = None):
"""
Create the downloader if it does not exist yet.
"""
if not self._downloader:
self._downloader = Downloader(self._cache_directory)
def download(self, *resources) -> None:
"""
Use the :class:`Downloader` class to download or load from cache the
resources given by the adapter.
"""
self._get_downloader()
return self._downloader.download(*resources)
# OVERVIEW AND CONVENIENCE METHODS ###
def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
"""
Get the set of input labels encountered without an entry in the
`schema_config.yaml` and print them to the logger.
Returns:
Optional[Dict[str, List[str]]]: A dictionary of Biolink types
encountered without an entry in the `schema_config.yaml` file.
"""
mt = self._translator.get_missing_biolink_types()
if mt:
msg = (
"Input entities not accounted for due to them not being "
f"present in the schema configuration file {self._schema_config_path} "
"(this is not necessarily a problem, if you did not intend "
"to include them in the database; see the log for details): \n"
)
for k, v in mt.items():
msg += f" {k}: {v} \n"
logger.info(msg)
return mt
else:
logger.info("No missing labels in input.")
return None
def log_duplicates(self) -> None:
"""
Get the set of duplicate nodes and edges encountered and print them to
the logger.
"""
dn = self._deduplicator.get_duplicate_nodes()
if dn:
ntypes = dn[0]
nids = dn[1]
msg = "Duplicate node types encountered (IDs in log): \n"
for typ in ntypes:
msg += f" {typ}\n"
logger.info(msg)
idmsg = "Duplicate node IDs encountered: \n"
for _id in nids:
idmsg += f" {_id}\n"
logger.debug(idmsg)
else:
logger.info("No duplicate nodes in input.")
de = self._deduplicator.get_duplicate_edges()
if de:
etypes = de[0]
eids = de[1]
msg = "Duplicate edge types encountered (IDs in log): \n"
for typ in etypes:
msg += f" {typ}\n"
logger.info(msg)
idmsg = "Duplicate edge IDs encountered: \n"
for _id in eids:
idmsg += f" {_id}\n"
logger.debug(idmsg)
else:
logger.info("No duplicate edges in input.")
def show_ontology_structure(self, **kwargs) -> None:
"""
Show the ontology structure using treelib or write to GRAPHML file.
Args:
to_disk (str): If specified, the ontology structure will be saved
to disk as a GRAPHML file, to be opened in your favourite
graph visualisation tool.
full (bool): If True, the full ontology structure will be shown,
including all nodes and edges. If False, only the nodes and
edges that are relevant to the extended schema will be shown.
"""
if not self._ontology:
self._get_ontology()
return self._ontology.show_ontology_structure(**kwargs)
def write_import_call(self) -> str:
"""
Write a shell script to import the database depending on the chosen
DBMS.
Returns:
str: path toward the file holding the import call.
"""
if not self._offline:
raise NotImplementedError(
"Cannot write import call in online mode."
)
return self._writer.write_import_call()
def write_schema_info(self, as_node: bool = False) -> None:
"""
Write an extended schema info YAML file that extends the
`schema_config.yaml` with run-time information of the built KG. For
instance, it includes information on whether something is present in the actual
knowledge graph, whether it is a relationship (which is important in the
case of representing relationships as nodes), and the actual sources and
targets of edges. Since this file can be used in place of the original
`schema_config.yaml` file, it indicates that it is the extended schema
by setting `is_schema_info` to `true`.
We start by using the `extended_schema` dictionary from the ontology
class instance, which contains all expanded entities and relationships.
The information of whether something is a relationship can be gathered
from the deduplicator instance, which keeps track of all entities that
have been seen.
"""
if not self._offline:
raise NotImplementedError(
"Cannot write schema info in online mode."
)
ontology = self._get_ontology()
schema = ontology.mapping.extended_schema.copy()
schema["is_schema_info"] = True
deduplicator = self._get_deduplicator()
for node in deduplicator.entity_types:
if node in schema.keys():
schema[node]["present_in_knowledge_graph"] = True
schema[node]["is_relationship"] = False
else:
logger.info(
f"Node {node} not present in extended schema. "
"Skipping schema info."
)
# find 'label_as_edge' cases in schema entries
changed_labels = {}
for k, v in schema.items():
if not isinstance(v, dict):
continue
if "label_as_edge" in v.keys():
if v["label_as_edge"] in deduplicator.seen_relationships.keys():
changed_labels[v["label_as_edge"]] = k
for edge in deduplicator.seen_relationships.keys():
if edge in changed_labels.keys():
edge = changed_labels[edge]
if edge in schema.keys():
schema[edge]["present_in_knowledge_graph"] = True
schema[edge]["is_relationship"] = True
# TODO information about source and target nodes
else:
logger.info(
f"Edge {edge} not present in extended schema. "
"Skipping schema info."
)
# write to output directory as YAML file
path = os.path.join(self._output_directory, "schema_info.yaml")
with open(path, "w") as f:
f.write(yaml.dump(schema))
if as_node:
# write as node
node = BioCypherNode(
node_id="schema_info",
node_label="schema_info",
properties={"schema_info": json.dumps(schema)},
)
self.write_nodes([node], force=True)
# override import call with added schema info node
self.write_import_call()
return schema
# TRANSLATION METHODS ###
def translate_term(self, term: str) -> str:
"""
Translate a term to its BioCypher equivalent.
Args:
term (str): The term to translate.
Returns:
str: The BioCypher equivalent of the term.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.translate_term(term)
def summary(self) -> None:
"""
Wrapper for showing ontology structure and logging duplicates and
missing input types.
"""
self.show_ontology_structure()
self.log_duplicates()
self.log_missing_input_labels()
def reverse_translate_term(self, term: str) -> str:
"""
Reverse translate a term from its BioCypher equivalent.
Args:
term (str): The BioCypher term to reverse translate.
Returns:
str: The original term.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.reverse_translate_term(term)
def translate_query(self, query: str) -> str:
"""
Translate a query to its BioCypher equivalent.
Args:
query (str): The query to translate.
Returns:
str: The BioCypher equivalent of the query.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.translate(query)
def reverse_translate_query(self, query: str) -> str:
"""
Reverse translate a query from its BioCypher equivalent.
Args:
query (str): The BioCypher query to reverse translate.
Returns:
str: The original query.
"""
# instantiate adapter if not exists
self.start_ontology()
return self._translator.reverse_translate(query)

356
biocypher/_create.py Normal file

@@ -0,0 +1,356 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'create' module. Handles the creation of BioCypher node and edge
dataclasses.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Union
from dataclasses import field, dataclass
import os
__all__ = [
"BioCypherEdge",
"BioCypherNode",
"BioCypherRelAsNode",
]
@dataclass(frozen=True)
class BioCypherNode:
"""
Handoff class to represent biomedical entities as Neo4j nodes.
Has id, label, property dict; id and label (in the Neo4j sense of a
label, ie, the entity descriptor after the colon, such as
":Protein") are non-optional and called node_id and node_label to
avoid confusion with "label" properties. Node labels are written in
PascalCase and as nouns, as per Neo4j consensus.
Args:
node_id (string): consensus "best" id for biological entity
node_label (string): primary type of entity, capitalised
**properties (kwargs): collection of all other properties to be
passed to neo4j for the respective node (dict)
Todo:
- check and correct small inconsistencies such as capitalisation
of ID names ("uniprot" vs "UniProt")
- check for correct ID patterns (eg "ENSG" + string of numbers,
uniprot length)
- ID conversion using pypath translation facilities for now
"""
node_id: str
node_label: str
preferred_id: str = "id"
properties: dict = field(default_factory=dict)
def __post_init__(self):
"""
Add id field to properties.
Check for reserved keywords.
Replace unwanted characters in properties.
"""
self.properties["id"] = self.node_id
self.properties["preferred_id"] = self.preferred_id or None
# TODO actually make None possible here; as is, "id" is the default in
# the dataclass as well as in the configuration file
if ":TYPE" in self.properties.keys():
logger.warning(
"Keyword ':TYPE' is reserved for Neo4j. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties[":TYPE"]
for k, v in self.properties.items():
if isinstance(v, str):
self.properties[k] = (
v.replace(
os.linesep,
" ",
)
.replace(
"\n",
" ",
)
.replace(
"\r",
" ",
)
)
elif isinstance(v, list):
# modified from upstream BioCypher, because the data contained integers in lists
self.properties[k] = [
(str(val) if isinstance(val, (int, float)) else val)
.replace(os.linesep, " ")
.replace("\n", " ")
.replace("\r", " ")
for val in v
]
def get_id(self) -> str:
"""
Returns primary node identifier.
Returns:
str: node_id
"""
return self.node_id
def get_label(self) -> str:
"""
Returns primary node label.
Returns:
str: node_label
"""
return self.node_label
def get_type(self) -> str:
"""
Returns primary node label.
Returns:
str: node_label
"""
return self.node_label
def get_preferred_id(self) -> str:
"""
Returns preferred id.
Returns:
str: preferred_id
"""
return self.preferred_id
def get_properties(self) -> dict:
"""
Returns all other node properties apart from primary id and
label as key-value pairs.
Returns:
dict: properties
"""
return self.properties
def get_dict(self) -> dict:
"""
Return dict of id, labels, and properties.
Returns:
dict: node_id and node_label as top-level key-value pairs,
properties as second-level dict.
"""
return {
"node_id": self.node_id,
"node_label": self.node_label,
"properties": self.properties,
}
@dataclass(frozen=True)
class BioCypherEdge:
"""
Handoff class to represent biomedical relationships in Neo4j.
Has source and target ids, label, property dict; ids and label (in
the Neo4j sense of a label, ie, the entity descriptor after the
colon, such as ":TARGETS") are non-optional and called source_id,
target_id, and relationship_label to avoid confusion with properties
called "label", which usually denotes the human-readable form.
Relationship labels are written in UPPERCASE and as verbs, as per
Neo4j consensus.
Args:
source_id (string): consensus "best" id for biological entity
target_id (string): consensus "best" id for biological entity
relationship_label (string): type of interaction, UPPERCASE
properties (dict): collection of all other properties of the
respective edge
"""
source_id: str
target_id: str
relationship_label: str
relationship_id: str = None
properties: dict = field(default_factory=dict)
def __post_init__(self):
"""
Check for reserved keywords.
"""
if ":TYPE" in self.properties.keys():
logger.debug(
"Keyword ':TYPE' is reserved for Neo4j. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties[":TYPE"]
elif "id" in self.properties.keys():
logger.debug(
"Keyword 'id' is reserved for Neo4j. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties["id"]
elif "_ID" in self.properties.keys():
logger.debug(
"Keyword '_ID' is reserved for Postgres. "
"Removing from properties.",
# "Renaming to 'type'."
)
# self.properties["type"] = self.properties[":TYPE"]
del self.properties["_ID"]
def get_id(self) -> Union[str, None]:
"""
Returns primary node identifier or None.
Returns:
str: node_id
"""
return self.relationship_id
def get_source_id(self) -> str:
"""
Returns primary node identifier of relationship source.
Returns:
str: source_id
"""
return self.source_id
def get_target_id(self) -> str:
"""
Returns primary node identifier of relationship target.
Returns:
str: target_id
"""
return self.target_id
def get_label(self) -> str:
"""
Returns relationship label.
Returns:
str: relationship_label
"""
return self.relationship_label
def get_type(self) -> str:
"""
Returns relationship label.
Returns:
str: relationship_label
"""
return self.relationship_label
def get_properties(self) -> dict:
"""
Returns all other relationship properties apart from primary ids
and label as key-value pairs.
Returns:
dict: properties
"""
return self.properties
def get_dict(self) -> dict:
"""
Return dict of ids, label, and properties.
Returns:
dict: source_id, target_id and relationship_label as
top-level key-value pairs, properties as second-level
dict.
"""
return {
"relationship_id": self.relationship_id or None,
"source_id": self.source_id,
"target_id": self.target_id,
"relationship_label": self.relationship_label,
"properties": self.properties,
}
@dataclass(frozen=True)
class BioCypherRelAsNode:
"""
Class to represent relationships as nodes (with in- and outgoing
edges) as a triplet of a BioCypherNode and two BioCypherEdges. Main
usage in type checking (instances where the receiving function needs
to check whether it receives a relationship as a single edge or as
a triplet).
Args:
node (BioCypherNode): node representing the relationship
source_edge (BioCypherEdge): edge representing the source of the
relationship
target_edge (BioCypherEdge): edge representing the target of the
relationship
"""
node: BioCypherNode
source_edge: BioCypherEdge
target_edge: BioCypherEdge
def __post_init__(self):
if not isinstance(self.node, BioCypherNode):
raise TypeError(
f"BioCypherRelAsNode.node must be a BioCypherNode, "
f"not {type(self.node)}.",
)
if not isinstance(self.source_edge, BioCypherEdge):
raise TypeError(
f"BioCypherRelAsNode.source_edge must be a BioCypherEdge, "
f"not {type(self.source_edge)}.",
)
if not isinstance(self.target_edge, BioCypherEdge):
raise TypeError(
f"BioCypherRelAsNode.target_edge must be a BioCypherEdge, "
f"not {type(self.target_edge)}.",
)
def get_node(self) -> BioCypherNode:
return self.node
def get_source_edge(self) -> BioCypherEdge:
return self.source_edge
def get_target_edge(self) -> BioCypherEdge:
return self.target_edge

147
biocypher/_deduplicate.py Normal file

@@ -0,0 +1,147 @@
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
class Deduplicator:
"""
Singleton class responsible for deduplicating BioCypher inputs. Maintains
sets/dictionaries of node and edge types and their unique identifiers.
Node identifiers should be globally unique (represented as a set), while
edge identifiers are only unique per edge type (represented as a dict of
sets, keyed by edge type).
Stores collection of duplicate node and edge identifiers and types for
troubleshooting and to avoid overloading the log.
"""
def __init__(self):
self.seen_entity_ids = set()
self.duplicate_entity_ids = set()
self.entity_types = set()
self.duplicate_entity_types = set()
self.seen_relationships = {}
self.duplicate_relationship_ids = set()
self.duplicate_relationship_types = set()
def node_seen(self, entity: BioCypherNode) -> bool:
"""
Adds a node to the instance and checks if it has been seen before.
Args:
entity: The BioCypherNode to be added.
Returns:
True if the node has been seen before, False otherwise.
"""
if entity.get_label() not in self.entity_types:
self.entity_types.add(entity.get_label())
if entity.get_id() in self.seen_entity_ids:
self.duplicate_entity_ids.add(entity.get_id())
if entity.get_label() not in self.duplicate_entity_types:
logger.warning(
f"Duplicate node type {entity.get_label()} found. "
)
self.duplicate_entity_types.add(entity.get_label())
return True
self.seen_entity_ids.add(entity.get_id())
return False
def edge_seen(self, relationship: BioCypherEdge) -> bool:
"""
Adds an edge to the instance and checks if it has been seen before.
Args:
relationship: The BioCypherEdge to be added.
Returns:
True if the edge has been seen before, False otherwise.
"""
if relationship.get_type() not in self.seen_relationships:
self.seen_relationships[relationship.get_type()] = set()
# concatenate source and target if no id is present
if not relationship.get_id():
_id = (
f"{relationship.get_source_id()}_{relationship.get_target_id()}"
)
else:
_id = relationship.get_id()
if _id in self.seen_relationships[relationship.get_type()]:
self.duplicate_relationship_ids.add(_id)
if relationship.get_type() not in self.duplicate_relationship_types:
logger.warning(
f"Duplicate edge type {relationship.get_type()} found. "
)
self.duplicate_relationship_types.add(relationship.get_type())
return True
self.seen_relationships[relationship.get_type()].add(_id)
return False
def rel_as_node_seen(self, rel_as_node: BioCypherRelAsNode) -> bool:
"""
Adds a rel_as_node to the instance (one entity and two relationships)
and checks if it has been seen before. Only the node is relevant for
identifying the rel_as_node as a duplicate.
Args:
rel_as_node: BioCypherRelAsNode to be added.
Returns:
True if the rel_as_node has been seen before, False otherwise.
"""
node = rel_as_node.get_node()
if node.get_label() not in self.seen_relationships:
self.seen_relationships[node.get_label()] = set()
# rel as node always has an id
_id = node.get_id()
if _id in self.seen_relationships[node.get_type()]:
self.duplicate_relationship_ids.add(_id)
if node.get_type() not in self.duplicate_relationship_types:
logger.warning(f"Duplicate edge type {node.get_type()} found. ")
self.duplicate_relationship_types.add(node.get_type())
return True
self.seen_relationships[node.get_type()].add(_id)
return False
def get_duplicate_nodes(self):
"""
Return the types and IDs of duplicate nodes encountered.
Returns:
tuple: (duplicate node types, duplicate node IDs), or None if no duplicates were found.
"""
if self.duplicate_entity_types:
return (self.duplicate_entity_types, self.duplicate_entity_ids)
else:
return None
def get_duplicate_edges(self):
"""
Return the types and IDs of duplicate edges encountered.
Returns:
tuple: (duplicate edge types, duplicate edge IDs), or None if no duplicates were found.
"""
if self.duplicate_relationship_types:
return (
self.duplicate_relationship_types,
self.duplicate_relationship_ids,
)
else:
return None

443
biocypher/_get.py Normal file

@@ -0,0 +1,443 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher get module. Used to download and cache data from external sources.
"""
from __future__ import annotations
from typing import Optional
import shutil
import requests
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from abc import ABC
from datetime import datetime, timedelta
from tempfile import TemporaryDirectory
import os
import json
import ftplib
import pooch
from ._misc import to_list, is_nested
class Resource(ABC):
def __init__(
self,
name: str,
url_s: str | list[str],
lifetime: int = 0,
):
"""
A Resource is a file, a list of files, an API request, or a list of API
requests, any of which can be downloaded from the given URL(s) and
cached locally. This class implements checks of the minimum requirements
for a resource, to be implemented by a biocypher adapter.
Args:
name (str): The name of the resource.
url_s (str | list[str]): The URL or URLs of the resource.
lifetime (int): The lifetime of the resource in days. If 0, the
resource is considered to be permanent.
"""
self.name = name
self.url_s = url_s
self.lifetime = lifetime
class FileDownload(Resource):
def __init__(
self,
name: str,
url_s: str | list[str],
lifetime: int = 0,
is_dir: bool = False,
):
"""
Represents basic information for a File Download.
Args:
name(str): The name of the File Download.
url_s(str|list[str]): The URL(s) of the File Download.
lifetime(int): The lifetime of the File Download in days. If 0, the
File Download is cached indefinitely.
is_dir (bool): Whether the URL points to a directory or not.
"""
super().__init__(name, url_s, lifetime)
self.is_dir = is_dir
class APIRequest(Resource):
def __init__(self, name: str, url_s: str | list[str], lifetime: int = 0):
"""
Represents basic information for an API Request.
Args:
name(str): The name of the API Request.
url_s(str|list): The URL of the API endpoint.
lifetime(int): The lifetime of the API Request in days. If 0, the
API Request is cached indefinitely.
"""
super().__init__(name, url_s, lifetime)
class Downloader:
def __init__(self, cache_dir: Optional[str] = None) -> None:
"""
The Downloader is a class that manages resources that can be downloaded
and cached locally. It manages the lifetime of downloaded resources by
keeping a JSON record of the download date of each resource.
Args:
cache_dir (str): The directory where the resources are cached. If
not given, a temporary directory is created.
"""
self.cache_dir = cache_dir or TemporaryDirectory().name
self.cache_file = os.path.join(self.cache_dir, "cache.json")
self.cache_dict = self._load_cache_dict()
def download(self, *resources: Resource):
"""
Download one or multiple resources. Load from cache if the resource is
already downloaded and the cache is not expired.
Args:
resources (Resource): The resource(s) to download or load from
cache.
Returns:
list[str]: The path or paths to the resource(s) that were downloaded
or loaded from cache.
"""
paths = []
for resource in resources:
paths.append(self._download_or_cache(resource))
# flatten list if it is nested
if is_nested(paths):
paths = [path for sublist in paths for path in sublist]
return paths
def _download_or_cache(self, resource: Resource, cache: bool = True):
"""
Download a resource if it is not cached or its cache has exceeded its lifetime.
Args:
resource (Resource): The resource to download.
Returns:
list[str]: The path or paths to the downloaded resource(s).
"""
expired = self._is_cache_expired(resource)
if expired or not cache:
self._delete_expired_cache(resource)
if isinstance(resource, FileDownload):
logger.info(f"Asking for download of resource {resource.name}.")
paths = self._download_files(cache, resource)
elif isinstance(resource, APIRequest):
logger.info(
f"Asking for download of api request {resource.name}."
)
paths = self._download_api_request(resource)
else:
raise TypeError(f"Unknown resource type: {type(resource)}")
else:
paths = self.get_cached_version(resource)
self._update_cache_record(resource)
return paths
def _is_cache_expired(self, resource: Resource) -> bool:
"""
Check if resource or API request cache is expired.
Args:
resource (Resource): The resource or API request to download.
Returns:
bool: True if cache is expired, False if not.
"""
cache_record = self._get_cache_record(resource)
if cache_record:
download_time = datetime.strptime(
cache_record.get("date_downloaded"), "%Y-%m-%d %H:%M:%S.%f"
)
lifetime = timedelta(days=resource.lifetime)
expired = download_time + lifetime < datetime.now()
else:
expired = True
return expired
def _delete_expired_cache(self, resource: Resource):
cache_resource_path = self.cache_dir + "/" + resource.name
if os.path.exists(cache_resource_path) and os.path.isdir(
cache_resource_path
):
shutil.rmtree(cache_resource_path)
def _download_files(self, cache, file_download: FileDownload):
"""
Download a resource, which may be a single file, a list of files, or a
directory, and return the resulting path(s).
Args:
cache (bool): Whether to cache the resource or not.
file_download (FileDownload): The resource to download.
Returns:
list[str]: The path or paths to the downloaded resource(s).
"""
if file_download.is_dir:
files = self._get_files(file_download)
file_download.url_s = [
file_download.url_s + "/" + file for file in files
]
file_download.is_dir = False
paths = self._download_or_cache(file_download, cache)
elif isinstance(file_download.url_s, list):
paths = []
for url in file_download.url_s:
fname = url[url.rfind("/") + 1 :].split("?")[0]
paths.append(
self._retrieve(
url=url,
fname=fname,
path=os.path.join(self.cache_dir, file_download.name),
)
)
else:
paths = []
fname = file_download.url_s[
file_download.url_s.rfind("/") + 1 :
].split("?")[0]
results = self._retrieve(
url=file_download.url_s,
fname=fname,
path=os.path.join(self.cache_dir, file_download.name),
)
if isinstance(results, list):
paths.extend(results)
else:
paths.append(results)
# sometimes a compressed file contains multiple files
# TODO ask for a list of files in the archive to be used from the
# adapter
return paths
def _download_api_request(self, api_request: APIRequest):
"""
Download an API request and return the path.
Args:
api_request (APIRequest): The API request to perform and cache.
Returns:
list[str]: The path to the cached API request.
"""
urls = (
api_request.url_s
if isinstance(api_request.url_s, list)
else [api_request.url_s]
)
paths = []
for url in urls:
fname = url[url.rfind("/") + 1 :].rsplit(".", 1)[0]
logger.info(
f"Asking for caching API of {api_request.name} {fname}."
)
response = requests.get(url=url)
if response.status_code != 200:
response.raise_for_status()
response_data = response.json()
api_path = os.path.join(
self.cache_dir, api_request.name, f"{fname}.json"
)
os.makedirs(os.path.dirname(api_path), exist_ok=True)
with open(api_path, "w") as f:
json.dump(response_data, f)
logger.info(f"Caching API request to {api_path}.")
paths.append(api_path)
return paths
def get_cached_version(self, resource: Resource) -> list[str]:
"""Get the cached version of a resource.
Args:
resource(Resource): The resource to get the cached version of.
Returns:
list[str]: The paths to the cached resource(s).
"""
cached_location = os.path.join(self.cache_dir, resource.name)
logger.info(f"Use cached version from {cached_location}.")
paths = []
for file in os.listdir(cached_location):
paths.append(os.path.join(cached_location, file))
return paths
def _retrieve(
self,
url: str,
fname: str,
path: str,
known_hash: str = None,
):
"""
Retrieve a file from a URL using Pooch. Infer type of file from
extension and use appropriate processor.
Args:
url (str): The URL to retrieve the file from.
fname (str): The name of the file.
path (str): The path to store the file under.
known_hash (str): Known hash of the file to verify the download
against; if None, no verification is performed.
"""
if fname.endswith(".zip"):
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
processor=pooch.Unzip(),
progressbar=True,
)
elif fname.endswith(".tar.gz"):
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
processor=pooch.Untar(),
progressbar=True,
)
elif fname.endswith(".gz"):
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
processor=pooch.Decompress(),
progressbar=True,
)
else:
return pooch.retrieve(
url=url,
known_hash=known_hash,
fname=fname,
path=path,
progressbar=True,
)
def _get_files(self, file_download: FileDownload):
"""
Get the files contained in a remote directory.
Args:
file_download (FileDownload): The FileDownload whose URL points to a
directory.
Returns:
list: The files contained in the directory.
"""
if file_download.url_s.startswith("ftp://"):
# remove protocol
url = file_download.url_s[6:]
# get base url
url = url[: url.find("/")]
# get directory (remove initial slash as well)
dir = file_download.url_s[7 + len(url) :]
# get files
ftp = ftplib.FTP(url)
ftp.login()
ftp.cwd(dir)
files = ftp.nlst()
ftp.quit()
else:
raise NotImplementedError(
"Only FTP directories are supported at the moment."
)
return files
def _load_cache_dict(self):
"""
Load the cache dictionary from the cache file. Create an empty cache
file if it does not exist.
"""
if not os.path.exists(self.cache_dir):
logger.info(f"Creating cache directory {self.cache_dir}.")
os.makedirs(self.cache_dir)
if not os.path.exists(self.cache_file):
logger.info(f"Creating cache file {self.cache_file}.")
with open(self.cache_file, "w") as f:
json.dump({}, f)
with open(self.cache_file, "r") as f:
logger.info(f"Loading cache file {self.cache_file}.")
return json.load(f)
def _get_cache_record(self, resource: Resource):
"""
Get the cache record of a resource.
Args:
resource (Resource): The resource to get the cache record of.
Returns:
The cache record of the resource.
"""
return self.cache_dict.get(resource.name, {})
def _update_cache_record(self, resource: Resource):
"""
Update the cache record of a resource.
Args:
resource (Resource): The resource to update the cache record of.
"""
cache_record = {}
cache_record["url"] = to_list(resource.url_s)
cache_record["date_downloaded"] = str(datetime.now())
cache_record["lifetime"] = resource.lifetime
self.cache_dict[resource.name] = cache_record
with open(self.cache_file, "w") as f:
json.dump(self.cache_dict, f, default=str)
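A minimal usage sketch of the resource and downloader classes above. The import path and the URLs are assumptions for illustration (placeholder endpoints), not values taken from this commit; network access is needed when the cache is empty.

# Sketch only; import path and URLs are assumed placeholders.
from biocypher._get import APIRequest, Downloader, FileDownload

downloader = Downloader(cache_dir=".cache")  # omit cache_dir to use a temporary directory

# A compressed file that should be re-downloaded after 7 days.
proteins = FileDownload(
    name="example_proteins",
    url_s="https://example.org/data/proteins.tsv.gz",  # placeholder URL
    lifetime=7,
)

# A JSON API response cached indefinitely (lifetime=0).
items = APIRequest(
    name="example_api",
    url_s="https://example.org/api/items.json",  # placeholder URL
    lifetime=0,
)

paths = downloader.download(proteins, items)
print(paths)  # local paths to the decompressed file and the cached JSON

The download call records the download date in cache.json, so subsequent calls within the lifetime return the cached paths instead of hitting the network again.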

121
biocypher/_logger.py Normal file
View File

@ -0,0 +1,121 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Configuration of the module logger.
"""
__all__ = ["get_logger", "log", "logfile"]
from datetime import datetime
import os
import pydoc
import logging
from biocypher import _config
from biocypher._metadata import __version__
def get_logger(name: str = "biocypher") -> logging.Logger:
"""
Access the module logger, creating a new one if it does not exist yet.
Method providing central logger instance to main module. Is called
only from main submodule, :mod:`biocypher.driver`. In child modules,
the standard Python logging facility is called
(using ``logging.getLogger(__name__)``), automatically inheriting
the handlers from the central logger.
The file handler creates a log file named after the current date and
time. Levels to output to file and console can be set here.
Args:
name:
Name of the logger instance.
Returns:
An instance of the Python :py:mod:`logging.Logger`.
"""
if not logging.getLogger(name).hasHandlers():
# create logger
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)
logger.propagate = True
# formatting
file_formatter = logging.Formatter(
"%(asctime)s\t%(levelname)s\tmodule:%(module)s\n%(message)s",
)
stdout_formatter = logging.Formatter("%(levelname)s -- %(message)s")
# file name and creation
now = datetime.now()
date_time = now.strftime("%Y%m%d-%H%M%S")
log_to_disk = _config.config("biocypher").get("log_to_disk")
if log_to_disk:
logdir = (
_config.config("biocypher").get("log_directory")
or "biocypher-log"
)
os.makedirs(logdir, exist_ok=True)
logfile = os.path.join(logdir, f"biocypher-{date_time}.log")
# file handler
file_handler = logging.FileHandler(logfile)
if _config.config("biocypher").get("debug"):
file_handler.setLevel(logging.DEBUG)
else:
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)
# handlers
# stream handler
stdout_handler = logging.StreamHandler()
stdout_handler.setLevel(logging.INFO)
stdout_handler.setFormatter(stdout_formatter)
# add handlers
logger.addHandler(stdout_handler)
# startup message
logger.info(f"This is BioCypher v{__version__}.")
if log_to_disk:
logger.info(f"Logging into `{logfile}`.")
else:
logger.info("Logging into stdout.")
return logging.getLogger(name)
def logfile() -> str:
"""
Path to the log file.
"""
return get_logger().handlers[0].baseFilename
def log():
"""
Browse the log file.
"""
with open(logfile()) as fp:
pydoc.pager(fp.read())
logger = get_logger()
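As a usage note for the logger above: child modules obtain their own logger via the standard logging facility and inherit the central handlers through propagation, while application code can call get_logger directly. A short sketch (the child module name is illustrative):

# In a hypothetical child module inside the biocypher package (name assumed):
import logging

logger = logging.getLogger(__name__)  # propagates to the central "biocypher" logger's handlers
logger.debug("Loading module %s.", __name__)

# From application code:
from biocypher._logger import get_logger, logfile

bc_logger = get_logger("biocypher")
bc_logger.info("Pipeline started.")
print(logfile())  # path of the current log file; assumes log_to_disk is enabled in the config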

307
biocypher/_mapping.py Normal file
View File

@ -0,0 +1,307 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'mapping' module. Handles the mapping of user-defined schema to the
underlying ontology.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Optional
from urllib.request import urlopen
import yaml
from . import _misc
from ._config import config as _config
class OntologyMapping:
"""
Class to store the ontology mapping and extensions.
"""
def __init__(self, config_file: str = None):
self.schema = self._read_config(config_file)
self.extended_schema = self._extend_schema()
def _read_config(self, config_file: str = None):
"""
Read the configuration file and store the ontology mapping and extensions.
"""
if config_file is None:
schema_config = {}
# load yaml file from web
elif config_file.startswith("http"):
with urlopen(config_file) as f:
schema_config = yaml.safe_load(f)
# get graph state from config (assume file is local)
else:
with open(config_file, "r") as f:
schema_config = yaml.safe_load(f)
return schema_config
def _extend_schema(self, d: Optional[dict] = None) -> dict:
"""
Get leaves of the tree hierarchy from the data structure dict
contained in the `schema_config.yaml`. Creates virtual leaves
(as children) from entries that provide more than one preferred
id type (and corresponding inputs).
Args:
d:
Data structure dict from yaml file.
"""
d = d or self.schema
extended_schema = dict()
# first pass: get parent leaves with direct representation in ontology
for k, v in d.items():
# k is not an entity
if "represented_as" not in v:
continue
# preferred_id optional: if not provided, use `id`
if not v.get("preferred_id"):
v["preferred_id"] = "id"
# k is an entity that is present in the ontology
if "is_a" not in v:
extended_schema[k] = v
# second pass: "vertical" inheritance
d = self._vertical_property_inheritance(d)
for k, v in d.items():
if "is_a" in v:
# prevent loops
if k == v["is_a"]:
logger.warning(
f"Loop detected in ontology mapping: {k} -> {v}. "
"Removing item. Please fix the inheritance if you want "
"to use this item."
)
continue
extended_schema[k] = v
# "horizontal" inheritance: create siblings for multiple identifiers or
# sources -> virtual leaves or implicit children
mi_leaves = {}
ms_leaves = {}
for k, v in d.items():
# k is not an entity
if "represented_as" not in v:
continue
if isinstance(v.get("preferred_id"), list):
mi_leaves = self._horizontal_inheritance_pid(k, v)
extended_schema.update(mi_leaves)
elif isinstance(v.get("source"), list):
ms_leaves = self._horizontal_inheritance_source(k, v)
extended_schema.update(ms_leaves)
return extended_schema
def _vertical_property_inheritance(self, d):
"""
Inherit properties from parents to children and update `d` accordingly.
"""
for k, v in d.items():
# k is not an entity
if "represented_as" not in v:
continue
# k is an entity that is present in the ontology
if "is_a" not in v:
continue
# "vertical" inheritance: inherit properties from parent
if v.get("inherit_properties", False):
# get direct ancestor
if isinstance(v["is_a"], list):
parent = v["is_a"][0]
else:
parent = v["is_a"]
# ensure child has properties and exclude_properties
if "properties" not in v:
v["properties"] = {}
if "exclude_properties" not in v:
v["exclude_properties"] = {}
# update properties of child
parent_props = self.schema[parent].get("properties", {})
if parent_props:
v["properties"].update(parent_props)
parent_excl_props = self.schema[parent].get(
"exclude_properties", {}
)
if parent_excl_props:
v["exclude_properties"].update(parent_excl_props)
# update schema (d)
d[k] = v
return d
def _horizontal_inheritance_pid(self, key, value):
"""
Create virtual leaves for multiple preferred id types.
If virtual leaves are created, input_label/label_in_input always has to
be a list of the same length as the preferred ids.
"""
leaves = {}
preferred_id = value["preferred_id"]
input_label = value.get("input_label") or value["label_in_input"]
represented_as = value["represented_as"]
# adjust lengths
max_l = max(
[
len(_misc.to_list(preferred_id)),
len(_misc.to_list(input_label)),
len(_misc.to_list(represented_as)),
],
)
# adjust pid length if necessary
if isinstance(preferred_id, str):
pids = [preferred_id] * max_l
else:
pids = preferred_id
# adjust rep length if necessary
if isinstance(represented_as, str):
reps = [represented_as] * max_l
else:
reps = represented_as
for pid, lab, rep in zip(pids, input_label, reps):
skey = pid + "." + key
svalue = {
"preferred_id": pid,
"input_label": lab,
"represented_as": rep,
# mark as virtual
"virtual": True,
}
# inherit is_a if exists
if "is_a" in value.keys():
# treat as multiple inheritance
if isinstance(value["is_a"], list):
v = list(value["is_a"])
v.insert(0, key)
svalue["is_a"] = v
else:
svalue["is_a"] = [key, value["is_a"]]
else:
# set parent as is_a
svalue["is_a"] = key
# inherit everything except core attributes
for k, v in value.items():
if k not in [
"is_a",
"preferred_id",
"input_label",
"label_in_input",
"represented_as",
]:
svalue[k] = v
leaves[skey] = svalue
return leaves
def _horizontal_inheritance_source(self, key, value):
"""
Create virtual leaves for multiple sources.
If we create virtual leaves, input_label/label_in_input always has to be
a list.
"""
leaves = {}
source = value["source"]
input_label = value.get("input_label") or value["label_in_input"]
represented_as = value["represented_as"]
# adjust lengths
src_l = len(source)
# adjust label length if necessary
if isinstance(input_label, str):
labels = [input_label] * src_l
else:
labels = input_label
# adjust rep length if necessary
if isinstance(represented_as, str):
reps = [represented_as] * src_l
else:
reps = represented_as
for src, lab, rep in zip(source, labels, reps):
skey = src + "." + key
svalue = {
"source": src,
"input_label": lab,
"represented_as": rep,
# mark as virtual
"virtual": True,
}
# inherit is_a if exists
if "is_a" in value.keys():
# treat as multiple inheritance
if isinstance(value["is_a"], list):
v = list(value["is_a"])
v.insert(0, key)
svalue["is_a"] = v
else:
svalue["is_a"] = [key, value["is_a"]]
else:
# set parent as is_a
svalue["is_a"] = key
# inherit everything except core attributes
for k, v in value.items():
if k not in [
"is_a",
"source",
"input_label",
"label_in_input",
"represented_as",
]:
svalue[k] = v
leaves[skey] = svalue
return leaves
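To make the virtual-leaf expansion above concrete, the following sketch feeds a small schema with two preferred id types through OntologyMapping; the class name and id namespaces are illustrative, not part of this commit.

# Sketch: a list of preferred ids yields one virtual leaf per id type.
import tempfile

import yaml

from biocypher._mapping import OntologyMapping

schema = {
    "protein": {
        "represented_as": "node",
        "preferred_id": ["uniprot", "ensembl"],                  # illustrative id types
        "input_label": ["uniprot_protein", "ensembl_protein"],
    },
}

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(schema, f)
    schema_path = f.name

mapping = OntologyMapping(config_file=schema_path)
print(sorted(mapping.extended_schema))
# expected: ['ensembl.protein', 'protein', 'uniprot.protein']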

71
biocypher/_metadata.py Normal file
View File

@ -0,0 +1,71 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Package metadata (version, authors, etc).
"""
__all__ = ["get_metadata"]
import os
import pathlib
import importlib.metadata
import toml
_VERSION = "0.6.0"
def get_metadata():
"""
Basic package metadata.
Retrieves package metadata from the current project directory or from
the installed package.
"""
here = pathlib.Path(__file__).parent
pyproj_toml = "pyproject.toml"
meta = {}
for project_dir in (here, here.parent):
toml_path = str(project_dir.joinpath(pyproj_toml).absolute())
if os.path.exists(toml_path):
pyproject = toml.load(toml_path)
meta = {
"name": pyproject["tool"]["poetry"]["name"],
"version": pyproject["tool"]["poetry"]["version"],
"author": pyproject["tool"]["poetry"]["authors"],
"license": pyproject["tool"]["poetry"]["license"],
"full_metadata": pyproject,
}
break
if not meta:
try:
meta = {
k.lower(): v
for k, v in importlib.metadata.metadata(here.name).items()
}
except importlib.metadata.PackageNotFoundError:
pass
meta["version"] = meta.get("version", None) or _VERSION
return meta
metadata = get_metadata()
__version__ = metadata.get("version", None)
__author__ = metadata.get("author", None)
__license__ = metadata.get("license", None)
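For completeness, a quick sketch of reading the metadata exposed by this module:

from biocypher._metadata import __version__, get_metadata

meta = get_metadata()
print(__version__)          # falls back to _VERSION when no pyproject.toml is found
print(meta.get("license"))  # taken from pyproject.toml or the installed package metadata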

264
biocypher/_misc.py Normal file
View File

@ -0,0 +1,264 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Handy functions for use in various places.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import (
Any,
Union,
Mapping,
KeysView,
Generator,
ItemsView,
ValuesView,
)
from collections.abc import Iterable
import re
from treelib import Tree
import networkx as nx
import stringcase
__all__ = ["LIST_LIKE", "SIMPLE_TYPES", "ensure_iterable", "to_list"]
SIMPLE_TYPES = (
bytes,
str,
int,
float,
bool,
type(None),
)
LIST_LIKE = (
list,
set,
tuple,
Generator,
ItemsView,
KeysView,
Mapping,
ValuesView,
)
def to_list(value: Any) -> list:
"""
Ensures that ``value`` is a list.
"""
if isinstance(value, LIST_LIKE):
value = list(value)
else:
value = [value]
return value
def ensure_iterable(value: Any) -> Iterable:
"""
Returns iterables, except strings, wraps simple types into tuple.
"""
return value if isinstance(value, LIST_LIKE) else (value,)
def create_tree_visualisation(inheritance_graph: Union[dict, nx.Graph]) -> Tree:
"""
Creates a visualisation of the inheritance tree using treelib.
"""
inheritance_tree = _get_inheritance_tree(inheritance_graph)
classes, root = _find_root_node(inheritance_tree)
tree = Tree()
tree.create_node(root, root)
while classes:
for child in classes:
parent = inheritance_tree[child]
if parent in tree.nodes.keys() or parent == root:
tree.create_node(child, child, parent=parent)
for node in tree.nodes.keys():
if node in classes:
classes.remove(node)
return tree
def _get_inheritance_tree(inheritance_graph: Union[dict, nx.Graph]) -> dict:
"""Transforms an inheritance_graph into an inheritance_tree.
Args:
inheritance_graph: A dict or nx.Graph representing the inheritance graph.
Returns:
A dict representing the inheritance tree.
"""
if isinstance(inheritance_graph, nx.Graph):
inheritance_tree = nx.to_dict_of_lists(inheritance_graph)
multiple_parents_present = _multiple_inheritance_present(
inheritance_tree
)
if multiple_parents_present:
logger.warning(
"The ontology contains multiple inheritance (one child node "
"has multiple parent nodes). This is not visualized in the "
"following hierarchy tree (the child node is only added once). "
"If you wish to browse all relationships of the parsed "
"ontologies, write a graphml file to disk using "
"`to_disk = <directory>` and view this file."
)
# unlist values
inheritance_tree = {k: v[0] for k, v in inheritance_tree.items() if v}
return inheritance_tree
elif not _multiple_inheritance_present(inheritance_graph):
return inheritance_graph
def _multiple_inheritance_present(inheritance_tree: dict) -> bool:
"""Checks if multiple inheritance is present in the inheritance_tree."""
return any(len(value) > 1 for value in inheritance_tree.values())
def _find_root_node(inheritance_tree: dict) -> tuple[set, str]:
classes = set(inheritance_tree.keys())
parents = set(inheritance_tree.values())
root = list(parents - classes)
if len(root) > 1:
if "entity" in root:
root = "entity" # TODO: default: good standard?
else:
raise ValueError(
"Inheritance tree cannot have more than one root node. "
f"Found {len(root)}: {root}."
)
else:
root = root[0]
if not root:
# find key whose value is None
root = list(inheritance_tree.keys())[
list(inheritance_tree.values()).index(None)
]
return classes, root
# string conversion, adapted from Biolink Model Toolkit
lowercase_pattern = re.compile(r"[a-zA-Z]*[a-z][a-zA-Z]*")
underscore_pattern = re.compile(r"(?<!^)(?=[A-Z][a-z])")
def from_pascal(s: str, sep: str = " ") -> str:
underscored = underscore_pattern.sub(sep, s)
lowercased = lowercase_pattern.sub(
lambda match: match.group(0).lower(),
underscored,
)
return lowercased
def pascalcase_to_sentencecase(s: str) -> str:
"""
Convert PascalCase to sentence case.
Args:
s: Input string in PascalCase
Returns:
string in sentence case form
"""
return from_pascal(s, sep=" ")
def snakecase_to_sentencecase(s: str) -> str:
"""
Convert snake_case to sentence case.
Args:
s: Input string in snake_case
Returns:
string in sentence case form
"""
return stringcase.sentencecase(s).lower()
def sentencecase_to_snakecase(s: str) -> str:
"""
Convert sentence case to snake_case.
Args:
s: Input string in sentence case
Returns:
string in snake_case form
"""
return stringcase.snakecase(s).lower()
def sentencecase_to_pascalcase(s: str, sep: str = r"\s") -> str:
"""
Convert sentence case to PascalCase.
Args:
s: Input string in sentence case
Returns:
string in PascalCase form
"""
return re.sub(
r"(?:^|[" + sep + "])([a-zA-Z])",
lambda match: match.group(1).upper(),
s,
)
def to_lower_sentence_case(s: str) -> str:
"""
Convert any string to lower sentence case. Works with snake_case,
PascalCase, and sentence case.
Args:
s: Input string
Returns:
string in lower sentence case form
"""
if "_" in s:
return snakecase_to_sentencecase(s)
elif " " in s:
return s.lower()
elif s[0].isupper():
return pascalcase_to_sentencecase(s)
else:
return s
def is_nested(lst) -> bool:
"""
Check if a list is nested.
Args:
lst (list): The list to check.
Returns:
bool: True if the list is nested, False otherwise.
"""
for item in lst:
if isinstance(item, list):
return True
return False
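A few quick calls exercising the helpers above; the expected values in the comments follow directly from the definitions in this file.

from biocypher import _misc

print(_misc.to_list("protein"))                           # ['protein']
print(_misc.ensure_iterable(42))                          # (42,)
print(_misc.pascalcase_to_sentencecase("GeneProduct"))    # 'gene product'
print(_misc.sentencecase_to_snakecase("gene product"))    # 'gene_product'
print(_misc.sentencecase_to_pascalcase("gene product"))   # 'GeneProduct'
print(_misc.to_lower_sentence_case("gene_product"))       # 'gene product'
print(_misc.is_nested([["a"], ["b"]]))                    # True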

886
biocypher/_ontology.py Normal file
View File

@ -0,0 +1,886 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'ontology' module. Contains classes and functions to handle parsing
and representation of single ontologies as well as their hybridisation and
other advanced operations.
"""
import os
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Optional
from datetime import datetime
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
import rdflib
import networkx as nx
from ._misc import (
to_list,
to_lower_sentence_case,
create_tree_visualisation,
sentencecase_to_pascalcase,
)
from ._mapping import OntologyMapping
class OntologyAdapter:
"""
Class that represents an ontology to be used in the BioCypher framework. Can
read from a variety of formats, including OWL, OBO, and RDF/XML. The
ontology is represented by a networkx.DiGraph object; an RDFlib graph is
also kept. By default, the DiGraph reverses the label and identifier of the
nodes, such that the node name in the graph is the human-readable label. The
edges are oriented from child to parent.
Labels are formatted in lower sentence case and underscores are replaced by spaces.
Identifiers are taken as defined and the prefixes are removed by default.
"""
def __init__(
self,
ontology_file: str,
root_label: str,
ontology_file_format: Optional[str] = None,
head_join_node_label: Optional[str] = None,
merge_nodes: Optional[bool] = True,
switch_label_and_id: bool = True,
remove_prefixes: bool = True,
):
"""
Initialize the OntologyAdapter class.
Args:
ontology_file (str): Path to the ontology file. Can be local or
remote.
root_label (str): The label of the root node in the ontology. In
case of a tail ontology, this is the tail join node.
ontology_file_format (str): The format of the ontology file (e.g. "application/rdf+xml")
If format is not passed, it is determined automatically.
head_join_node_label (str): Optional variable to store the label of the
node in the head ontology that should be used to join to the
root node of the tail ontology. Defaults to None.
merge_nodes (bool): If True, head and tail join nodes will be
merged, using the label of the head join node. If False, the
tail join node will be attached as a child of the head join
node.
switch_label_and_id (bool): If True, the node names in the graph will be
the human-readable labels. If False, the node names will be the
identifiers. Defaults to True.
remove_prefixes (bool): If True, the prefixes of the identifiers will
be removed. Defaults to True.
"""
logger.info(f"Instantiating OntologyAdapter class for {ontology_file}.")
self._ontology_file = ontology_file
self._root_label = root_label
self._format = ontology_file_format
self._merge_nodes = merge_nodes
self._head_join_node = head_join_node_label
self._switch_label_and_id = switch_label_and_id
self._remove_prefixes = remove_prefixes
self._rdf_graph = self._load_rdf_graph(ontology_file)
self._nx_graph = self._rdf_to_nx(
self._rdf_graph, root_label, switch_label_and_id
)
def _rdf_to_nx(
self,
_rdf_graph: rdflib.Graph,
root_label: str,
switch_label_and_id: bool,
rename_nodes: bool = True,
) -> nx.DiGraph:
one_to_one_triples, one_to_many_dict = self._get_relevant_rdf_triples(
_rdf_graph
)
nx_graph = self._convert_to_nx(one_to_one_triples, one_to_many_dict)
nx_graph = self._add_labels_to_nodes(nx_graph, switch_label_and_id)
nx_graph = self._change_nodes_to_biocypher_format(
nx_graph, switch_label_and_id, rename_nodes
)
nx_graph = self._get_all_ancestors(
nx_graph, root_label, switch_label_and_id, rename_nodes
)
return nx.DiGraph(nx_graph)
def _get_relevant_rdf_triples(self, g: rdflib.Graph) -> tuple:
one_to_one_inheritance_graph = self._get_one_to_one_inheritance_triples(
g
)
intersection = self._get_multiple_inheritance_dict(g)
return one_to_one_inheritance_graph, intersection
def _get_one_to_one_inheritance_triples(
self, g: rdflib.Graph
) -> rdflib.Graph:
"""Get the one to one inheritance triples from the RDF graph.
Args:
g (rdflib.Graph): The RDF graph
Returns:
rdflib.Graph: The one to one inheritance graph
"""
one_to_one_inheritance_graph = Graph()
for s, p, o in g.triples((None, rdflib.RDFS.subClassOf, None)):
if self.has_label(s, g):
one_to_one_inheritance_graph.add((s, p, o))
return one_to_one_inheritance_graph
def _get_multiple_inheritance_dict(self, g: rdflib.Graph) -> dict:
"""Get the multiple inheritance dictionary from the RDF graph.
Args:
g (rdflib.Graph): The RDF graph
Returns:
dict: The multiple inheritance dictionary
"""
multiple_inheritance = g.triples(
(None, rdflib.OWL.intersectionOf, None)
)
intersection = {}
for (
node,
has_multiple_parents,
first_node_of_intersection_list,
) in multiple_inheritance:
parents = self._retrieve_rdf_linked_list(
first_node_of_intersection_list
)
child_name = None
for s_, _, _ in g.triples((None, rdflib.RDFS.subClassOf, node)):
child_name = s_
# Handle Snomed CT post coordinated expressions
if not child_name:
for s_, _, _ in g.triples(
(None, rdflib.OWL.equivalentClass, node)
):
child_name = s_
if child_name:
intersection[node] = {
"child_name": child_name,
"parent_node_names": parents,
}
return intersection
def has_label(self, node: rdflib.URIRef, g: rdflib.Graph) -> bool:
"""Does the node have a label in g?
Args:
node (rdflib.URIRef): The node to check
g (rdflib.Graph): The graph to check in
Returns:
bool: True if the node has a label, False otherwise
"""
return (node, rdflib.RDFS.label, None) in g
def _retrieve_rdf_linked_list(self, subject: rdflib.URIRef) -> list:
"""Recursively retrieves a linked list from RDF.
Example RDF list with the items [item1, item2]:
list_node - first -> item1
list_node - rest -> list_node2
list_node2 - first -> item2
list_node2 - rest -> nil
Args:
subject (rdflib.URIRef): One list_node of the RDF list
Returns:
list: The items of the RDF list
"""
g = self._rdf_graph
rdf_list = []
for s, p, o in g.triples((subject, rdflib.RDF.first, None)):
rdf_list.append(o)
for s, p, o in g.triples((subject, rdflib.RDF.rest, None)):
if o != rdflib.RDF.nil:
rdf_list.extend(self._retrieve_rdf_linked_list(o))
return rdf_list
def _convert_to_nx(
self, one_to_one: rdflib.Graph, one_to_many: dict
) -> nx.DiGraph:
"""Convert the one to one and one to many inheritance graphs to networkx.
Args:
one_to_one (rdflib.Graph): The one to one inheritance graph
one_to_many (dict): The one to many inheritance dictionary
Returns:
nx.DiGraph: The networkx graph
"""
nx_graph = rdflib_to_networkx_digraph(
one_to_one, edge_attrs=lambda s, p, o: {}, calc_weights=False
)
for key, value in one_to_many.items():
nx_graph.add_edges_from(
[
(value["child_name"], parent)
for parent in value["parent_node_names"]
]
)
if key in nx_graph.nodes:
nx_graph.remove_node(key)
return nx_graph
def _add_labels_to_nodes(
self, nx_graph: nx.DiGraph, switch_label_and_id: bool
) -> nx.DiGraph:
"""Add labels to the nodes in the networkx graph.
Args:
nx_graph (nx.DiGraph): The networkx graph
switch_label_and_id (bool): If True, id and label are switched
Returns:
nx.DiGraph: The networkx graph with labels
"""
for node in list(nx_graph.nodes):
nx_id, nx_label = self._get_nx_id_and_label(
node, switch_label_and_id
)
if nx_id == "none":
# remove node if it has no id
nx_graph.remove_node(node)
continue
nx_graph.nodes[node]["label"] = nx_label
return nx_graph
def _change_nodes_to_biocypher_format(
self,
nx_graph: nx.DiGraph,
switch_label_and_id: bool,
rename_nodes: bool = True,
) -> nx.DiGraph:
"""Change the nodes in the networkx graph to BioCypher format:
- remove the prefix of the identifier
- switch id and label
- adapt the labels (replace _ with space and convert to lower sentence case)
Args:
nx_graph (nx.DiGraph): The networkx graph
switch_label_and_id (bool): If True, id and label are switched
rename_nodes (bool): If True, the nodes are renamed
Returns:
nx.DiGraph: The networkx ontology graph in BioCypher format
"""
mapping = {
node: self._get_nx_id_and_label(
node, switch_label_and_id, rename_nodes
)[0]
for node in nx_graph.nodes
}
renamed = nx.relabel_nodes(nx_graph, mapping, copy=False)
return renamed
def _get_all_ancestors(
self,
renamed: nx.DiGraph,
root_label: str,
switch_label_and_id: bool,
rename_nodes: bool = True,
) -> nx.DiGraph:
"""Get all ancestors of the root node in the networkx graph.
Args:
renamed (nx.DiGraph): The renamed networkx graph
root_label (str): The label of the root node in the ontology
switch_label_and_id (bool): If True, id and label are switched
rename_nodes (bool): If True, the nodes are renamed
Returns:
nx.DiGraph: The filtered networkx graph
"""
root = self._get_nx_id_and_label(
self._find_root_label(self._rdf_graph, root_label),
switch_label_and_id,
rename_nodes,
)[0]
ancestors = nx.ancestors(renamed, root)
ancestors.add(root)
filtered_graph = renamed.subgraph(ancestors)
return filtered_graph
def _get_nx_id_and_label(
self, node, switch_id_and_label: bool, rename_nodes: bool = True
) -> tuple[str, str]:
"""Rename node id and label for nx graph.
Args:
node (str): The node to rename
switch_id_and_label (bool): If True, switch id and label
Returns:
tuple[str, str]: The renamed node id and label
"""
node_id_str = self._remove_prefix(str(node))
node_label_str = str(self._rdf_graph.value(node, rdflib.RDFS.label))
if rename_nodes:
node_label_str = node_label_str.replace("_", " ")
node_label_str = to_lower_sentence_case(node_label_str)
nx_id = node_label_str if switch_id_and_label else node_id_str
nx_label = node_id_str if switch_id_and_label else node_label_str
return nx_id, nx_label
def _find_root_label(self, g, root_label):
# Loop through all labels in the ontology
for label_subject, _, label_in_ontology in g.triples(
(None, rdflib.RDFS.label, None)
):
# If the label is the root label, set the root node to the label's subject
if str(label_in_ontology) == root_label:
root = label_subject
break
else:
labels_in_ontology = []
for label_subject, _, label_in_ontology in g.triples(
(None, rdflib.RDFS.label, None)
):
labels_in_ontology.append(str(label_in_ontology))
raise ValueError(
f"Could not find root node with label '{root_label}'. "
f"The ontology contains the following labels: {labels_in_ontology}"
)
return root
def _remove_prefix(self, uri: str) -> str:
"""
Remove the prefix of a URI. URIs can contain either "#" or "/" as a
separator between the prefix and the local name. The prefix is
everything before the last separator.
"""
if self._remove_prefixes:
return uri.rsplit("#", 1)[-1].rsplit("/", 1)[-1]
else:
return uri
def _load_rdf_graph(self, ontology_file):
"""
Load the ontology into an RDFlib graph. The ontology file can be in
OWL, OBO, or RDF/XML format.
"""
g = rdflib.Graph()
g.parse(ontology_file, format=self._get_format(ontology_file))
return g
def _get_format(self, ontology_file):
"""
Get the format of the ontology file.
"""
if self._format:
if self._format == "owl":
return "application/rdf+xml"
elif self._format == "obo":
raise NotImplementedError("OBO format not yet supported")
elif self._format == "rdf":
return "application/rdf+xml"
elif self._format == "ttl":
return self._format
else:
raise ValueError(
f"Could not determine format of ontology file {ontology_file}"
)
if ontology_file.endswith(".owl"):
return "application/rdf+xml"
elif ontology_file.endswith(".obo"):
raise NotImplementedError("OBO format not yet supported")
elif ontology_file.endswith(".rdf"):
return "application/rdf+xml"
elif ontology_file.endswith(".ttl"):
return "ttl"
else:
raise ValueError(
f"Could not determine format of ontology file {ontology_file}"
)
def get_nx_graph(self):
"""
Get the networkx graph representing the ontology.
"""
return self._nx_graph
def get_rdf_graph(self):
"""
Get the RDFlib graph representing the ontology.
"""
return self._rdf_graph
def get_root_node(self):
"""
Get root node in the ontology.
Returns:
root_node: If _switch_label_and_id is True, the root node label is returned,
otherwise the root node id is returned.
"""
root_node = None
root_label = self._root_label.replace("_", " ")
if self._switch_label_and_id:
root_node = to_lower_sentence_case(root_label)
elif not self._switch_label_and_id:
for node, data in self.get_nx_graph().nodes(data=True):
if "label" in data and data["label"] == to_lower_sentence_case(
root_label
):
root_node = node
break
return root_node
def get_ancestors(self, node_label):
"""
Get the ancestors of a node in the ontology.
"""
return nx.dfs_preorder_nodes(self._nx_graph, node_label)
def get_head_join_node(self):
"""
Get the head join node of the ontology.
"""
return self._head_join_node
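A usage sketch for the adapter above. The ontology URL is an assumed example of a remote Turtle file (any local .owl/.ttl path works as well), and the node name passed to get_ancestors is assumed to exist in that ontology.

from biocypher._ontology import OntologyAdapter

adapter = OntologyAdapter(
    ontology_file="https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl",  # example URL
    root_label="entity",
)

graph = adapter.get_nx_graph()
print(adapter.get_root_node())              # 'entity' (the label, since switch_label_and_id=True)
print(graph.number_of_nodes())
print(list(adapter.get_ancestors("gene")))  # DFS from 'gene' towards the root (node name assumed)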
class Ontology:
"""
A class that represents the ontological "backbone" of a BioCypher knowledge
graph. The ontology can be built from a single resource, or hybridised from
a combination of resources, with one resource being the "head" ontology,
while an arbitrary number of other resources can become "tail" ontologies at
arbitrary fusion points inside the "head" ontology.
"""
def __init__(
self,
head_ontology: dict,
ontology_mapping: Optional["OntologyMapping"] = None,
tail_ontologies: Optional[dict] = None,
):
"""
Initialize the Ontology class.
Args:
head_ontology (dict): Definition of the head ontology (url, root node,
format, etc.).
ontology_mapping (OntologyMapping): The user-provided schema mapping
used to extend the ontology. Defaults to None.
tail_ontologies (dict): Definitions of the tail ontologies to be joined
to the head ontology. Defaults to None.
"""
self._head_ontology_meta = head_ontology
self.mapping = ontology_mapping
self._tail_ontology_meta = tail_ontologies
self._tail_ontologies = None
self._nx_graph = None
# keep track of nodes that have been extended
self._extended_nodes = set()
self._main()
def _main(self) -> None:
"""
Main method to be run on instantiation. Loads the ontologies, joins
them, and returns the hybrid ontology. Loads only the head ontology
if nothing else is given. Adds user extensions and properties from
the mapping.
"""
self._load_ontologies()
if self._tail_ontologies:
for adapter in self._tail_ontologies.values():
head_join_node = self._get_head_join_node(adapter)
self._join_ontologies(adapter, head_join_node)
else:
self._nx_graph = self._head_ontology.get_nx_graph()
if self.mapping:
self._extend_ontology()
# experimental: add connections of disjoint classes to entity
# self._connect_biolink_classes()
self._add_properties()
def _load_ontologies(self) -> None:
"""
For each ontology, load the OntologyAdapter object and store it as an
instance variable (head) or a dictionary (tail).
"""
logger.info("Loading ontologies...")
self._head_ontology = OntologyAdapter(
ontology_file=self._head_ontology_meta["url"],
root_label=self._head_ontology_meta["root_node"],
ontology_file_format=self._head_ontology_meta.get("format", None),
switch_label_and_id=self._head_ontology_meta.get(
"switch_label_and_id", True
),
)
if self._tail_ontology_meta:
self._tail_ontologies = {}
for key, value in self._tail_ontology_meta.items():
self._tail_ontologies[key] = OntologyAdapter(
ontology_file=value["url"],
root_label=value["tail_join_node"],
head_join_node_label=value["head_join_node"],
ontology_file_format=value.get("format", None),
merge_nodes=value.get("merge_nodes", True),
switch_label_and_id=value.get("switch_label_and_id", True),
)
def _get_head_join_node(self, adapter: OntologyAdapter) -> str:
"""
Tries to find the head join node of the given ontology adapter in the
head ontology. If the join node is not found, the method will raise an
error.
Args:
adapter (OntologyAdapter): The ontology adapter of which to find the
join node in the head ontology.
"""
head_join_node = None
user_defined_head_join_node_label = adapter.get_head_join_node()
head_join_node_label_in_bc_format = to_lower_sentence_case(
user_defined_head_join_node_label.replace("_", " ")
)
if self._head_ontology._switch_label_and_id:
head_join_node = head_join_node_label_in_bc_format
elif not self._head_ontology._switch_label_and_id:
for node_id, data in self._head_ontology.get_nx_graph().nodes(
data=True
):
if (
"label" in data
and data["label"] == head_join_node_label_in_bc_format
):
head_join_node = node_id
break
if head_join_node not in self._head_ontology.get_nx_graph().nodes:
head_ontology = self._head_ontology._rdf_to_nx(
self._head_ontology.get_rdf_graph(),
self._head_ontology._root_label,
self._head_ontology._switch_label_and_id,
rename_nodes=False,
)
raise ValueError(
f"Head join node '{head_join_node}' not found in head ontology. "
f"The head ontology contains the following nodes: {head_ontology.nodes}."
)
return head_join_node
def _join_ontologies(
self, adapter: OntologyAdapter, head_join_node
) -> None:
"""
Joins the ontologies by adding the tail ontology as a subgraph to the
head ontology at the specified join nodes.
Args:
adapter (OntologyAdapter): The ontology adapter of the tail ontology
to be added to the head ontology.
"""
if not self._nx_graph:
self._nx_graph = self._head_ontology.get_nx_graph().copy()
tail_join_node = adapter.get_root_node()
tail_ontology = adapter.get_nx_graph()
# subtree of tail ontology at join node
tail_ontology_subtree = nx.dfs_tree(
tail_ontology.reverse(), tail_join_node
).reverse()
# transfer node attributes from tail ontology to subtree
for node in tail_ontology_subtree.nodes:
tail_ontology_subtree.nodes[node].update(tail_ontology.nodes[node])
# if merge_nodes is False, create parent of tail join node from head
# join node
if not adapter._merge_nodes:
# add head join node from head ontology to tail ontology subtree
# as parent of tail join node
tail_ontology_subtree.add_node(
head_join_node,
**self._head_ontology.get_nx_graph().nodes[head_join_node],
)
tail_ontology_subtree.add_edge(tail_join_node, head_join_node)
# else rename tail join node to match head join node if necessary
elif not tail_join_node == head_join_node:
tail_ontology_subtree = nx.relabel_nodes(
tail_ontology_subtree, {tail_join_node: head_join_node}
)
# combine head ontology and tail subtree
self._nx_graph = nx.compose(self._nx_graph, tail_ontology_subtree)
def _extend_ontology(self) -> None:
"""
Adds the user extensions to the ontology. Tries to find the parent in
the ontology, adds it if necessary, and adds the child and a directed
edge from child to parent. Can handle multiple parents.
"""
if not self._nx_graph:
self._nx_graph = self._head_ontology.get_nx_graph().copy()
for key, value in self.mapping.extended_schema.items():
if not value.get("is_a"):
if self._nx_graph.has_node(value.get("synonym_for")):
continue
if not self._nx_graph.has_node(key):
raise ValueError(
f"Node {key} not found in ontology, but also has no "
"inheritance definition. Please check your schema for "
"spelling errors, first letter not in lower case, use of underscores, a missing `is_a` definition (SubClassOf a root node), or missing labels in class or super-classes."
)
continue
parents = to_list(value.get("is_a"))
child = key
while parents:
parent = parents.pop(0)
if parent not in self._nx_graph.nodes:
self._nx_graph.add_node(parent)
self._nx_graph.nodes[parent][
"label"
] = sentencecase_to_pascalcase(parent)
# mark parent as user extension
self._nx_graph.nodes[parent]["user_extension"] = True
self._extended_nodes.add(parent)
if child not in self._nx_graph.nodes:
self._nx_graph.add_node(child)
self._nx_graph.nodes[child][
"label"
] = sentencecase_to_pascalcase(child)
# mark child as user extension
self._nx_graph.nodes[child]["user_extension"] = True
self._extended_nodes.add(child)
self._nx_graph.add_edge(child, parent)
child = parent
def _connect_biolink_classes(self) -> None:
"""
Experimental: Adds edges from disjoint classes to the entity node.
"""
if not self._nx_graph:
self._nx_graph = self._head_ontology.get_nx_graph().copy()
if "entity" not in self._nx_graph.nodes:
return
# biolink classes that are disjoint from entity
disjoint_classes = [
"frequency qualifier mixin",
"chemical entity to entity association mixin",
"ontology class",
"relationship quantifier",
"physical essence or occurrent",
"gene or gene product",
"subject of investigation",
]
for node in disjoint_classes:
if not self._nx_graph.nodes.get(node):
self._nx_graph.add_node(node)
self._nx_graph.nodes[node][
"label"
] = sentencecase_to_pascalcase(node)
self._nx_graph.add_edge(node, "entity")
def _add_properties(self) -> None:
"""
For each entity in the mapping, update the ontology with the properties
specified in the mapping. Updates synonym information in the graph,
setting the synonym as the primary node label.
"""
for key, value in self.mapping.extended_schema.items():
if key in self._nx_graph.nodes:
self._nx_graph.nodes[key].update(value)
if value.get("synonym_for"):
# change node label to synonym
if value["synonym_for"] not in self._nx_graph.nodes:
raise ValueError(
f'Node {value["synonym_for"]} not found in ontology.'
)
self._nx_graph = nx.relabel_nodes(
self._nx_graph, {value["synonym_for"]: key}
)
def get_ancestors(self, node_label: str) -> list:
"""
Get the ancestors of a node in the ontology.
Args:
node_label (str): The label of the node in the ontology.
Returns:
list: A list of the ancestors of the node.
"""
return nx.dfs_tree(self._nx_graph, node_label)
def show_ontology_structure(self, to_disk: str = None, full: bool = False):
"""
Show the ontology structure using treelib or write to GRAPHML file.
Args:
to_disk (str): If specified, the ontology structure will be saved
to disk as a GRAPHML file at the location (directory) specified
by the `to_disk` string, to be opened in your favourite graph
visualisation tool.
full (bool): If True, the full ontology structure will be shown,
including all nodes and edges. If False, only the nodes and
edges that are relevant to the extended schema will be shown.
"""
if not full and not self.mapping.extended_schema:
raise ValueError(
"You are attempting to visualise a subset of the loaded"
"ontology, but have not provided a schema configuration. "
"To display a partial ontology graph, please provide a schema "
"configuration file; to visualise the full graph, please use "
"the parameter `full = True`."
)
if not self._nx_graph:
raise ValueError("Ontology not loaded.")
if not self._tail_ontologies:
msg = f"Showing ontology structure based on {self._head_ontology._ontology_file}"
else:
msg = f"Showing ontology structure based on {len(self._tail_ontology_meta)+1} ontologies: "
logger.info(msg)
if not full:
# set of leaves and their intermediate parents up to the root
filter_nodes = set(self.mapping.extended_schema.keys())
for node in self.mapping.extended_schema.keys():
filter_nodes.update(self.get_ancestors(node).nodes)
# filter graph
G = self._nx_graph.subgraph(filter_nodes)
else:
G = self._nx_graph
if not to_disk:
# create tree
tree = create_tree_visualisation(G)
# add synonym information
for node in self.mapping.extended_schema:
if not isinstance(self.mapping.extended_schema[node], dict):
continue
if self.mapping.extended_schema[node].get("synonym_for"):
tree.nodes[node].tag = (
f"{node} = "
f"{self.mapping.extended_schema[node].get('synonym_for')}"
)
logger.info(f"\n{tree}")
return tree
else:
# convert lists/dicts to strings for vis only
for node in G.nodes:
# rename node and use former id as label
label = G.nodes[node].get("label")
if not label:
label = node
G = nx.relabel_nodes(G, {node: label})
G.nodes[label]["label"] = node
for attrib in G.nodes[label]:
if type(G.nodes[label][attrib]) in [list, dict]:
G.nodes[label][attrib] = str(G.nodes[label][attrib])
path = os.path.join(to_disk, "ontology_structure.graphml")
logger.info(f"Writing ontology structure to {path}.")
nx.write_graphml(G, path)
return True
def get_dict(self) -> dict:
"""
Returns a dictionary compatible with a BioCypher node for compatibility
with the Neo4j driver.
"""
d = {
"node_id": self._get_current_id(),
"node_label": "BioCypher",
"properties": {
"schema": "self.ontology_mapping.extended_schema",
},
}
return d
def _get_current_id(self):
"""
Instantiate a version ID for the current session. For now does simple
versioning using datetime.
Can later implement incremental versioning, versioning from
config file, or manual specification via argument.
"""
now = datetime.now()
return now.strftime("v%Y%m%d-%H%M%S")
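The Ontology class consumes plain dictionaries describing the head and tail ontologies; the keys below mirror those read in _load_ontologies, while the URLs, file paths, and join nodes are illustrative assumptions.

from biocypher._mapping import OntologyMapping
from biocypher._ontology import Ontology

head = {
    "url": "https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl",  # example URL
    "root_node": "entity",
    "switch_label_and_id": True,
}
tails = {
    "so": {  # arbitrary key
        "url": "test/ontologies/so.owl",       # assumed local file
        "head_join_node": "sequence variant",  # assumed to exist in the head ontology
        "tail_join_node": "sequence_variant",  # assumed root label of the tail ontology
        "merge_nodes": True,
    },
}

ontology = Ontology(
    head_ontology=head,
    ontology_mapping=OntologyMapping("config/schema_config.yaml"),  # path assumed
    tail_ontologies=tails,
)
ontology.show_ontology_structure()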

480
biocypher/_translate.py Normal file
View File

@ -0,0 +1,480 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'translation' module. Responsible for translating between the raw
input data and the BioCypherNode and BioCypherEdge objects.
"""
from ._logger import logger
logger.debug(f"Loading module {__name__}.")
from typing import Any, Union, Optional
from collections.abc import Iterable, Generator
from more_itertools import peekable
from . import _misc
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from ._ontology import Ontology
__all__ = ["BiolinkAdapter", "Translator"]
class Translator:
"""
Class responsible for executing the translation process that is configured in
the schema_config.yaml file. Creates a mapping dictionary from that file,
and, given nodes and edges, translates them into BioCypherNodes and
BioCypherEdges. During this process, can also filter the properties of the
entities if the schema_config.yaml file specifies a property whitelist or
blacklist.
Provides utility functions for translating between input and output labels
and cypher queries.
"""
def __init__(self, ontology: "Ontology", strict_mode: bool = False):
"""
Args:
ontology:
The Ontology object whose extended schema details the leaves of
the hierarchy tree representing the structure of the graph; the
leaves are the entities that will be direct components of the
graph, while the intermediary nodes are additional labels for
filtering purposes.
strict_mode:
If True, the translator will raise an error if input data do not
carry source, licence, and version information.
"""
self.ontology = ontology
self.strict_mode = strict_mode
# record nodes without biolink type configured in schema_config.yaml
self.notype = {}
# mapping functionality for translating terms and queries
self.mappings = {}
self.reverse_mappings = {}
self._update_ontology_types()
def translate_nodes(
self,
node_tuples: Iterable,
) -> Generator[BioCypherNode, None, None]:
"""
Translates input node representation to a representation that
conforms to the schema of the given BioCypher graph. For now
requires explicit statement of node type on pass.
Args:
node_tuples (list of tuples): collection of tuples
representing individual nodes by their unique id and a type
that is translated from the original database notation to
the corresponding BioCypher notation.
"""
self._log_begin_translate(node_tuples, "nodes")
for _id, _type, _props in node_tuples:
# check for strict mode requirements
required_props = ["source", "licence", "version"]
if self.strict_mode:
# rename 'license' to 'licence' in _props
if _props.get("license"):
_props["licence"] = _props.pop("license")
for prop in required_props:
if prop not in _props:
raise ValueError(
f"Property `{prop}` missing from node {_id}. "
"Strict mode is enabled, so this is not allowed."
)
# find the node in leaves that represents ontology node type
_ontology_class = self._get_ontology_mapping(_type)
if _ontology_class:
# filter properties for those specified in schema_config if any
_filtered_props = self._filter_props(_ontology_class, _props)
# preferred id
_preferred_id = self._get_preferred_id(_ontology_class)
yield BioCypherNode(
node_id=_id,
node_label=_ontology_class,
preferred_id=_preferred_id,
properties=_filtered_props,
)
else:
self._record_no_type(_type, _id)
self._log_finish_translate("nodes")
def _get_preferred_id(self, _bl_type: str) -> str:
"""
Returns the preferred id for the given Biolink type.
"""
return (
self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
if "preferred_id"
in self.ontology.mapping.extended_schema.get(_bl_type, {})
else "id"
)
def _filter_props(self, bl_type: str, props: dict) -> dict:
"""
Filters properties for those specified in schema_config if any.
"""
filter_props = self.ontology.mapping.extended_schema[bl_type].get(
"properties", {}
)
# strict mode: add required properties (only if there is a whitelist)
if self.strict_mode and filter_props:
filter_props.update(
{"source": "str", "licence": "str", "version": "str"},
)
exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
"exclude_properties", []
)
if isinstance(exclude_props, str):
exclude_props = [exclude_props]
if filter_props and exclude_props:
filtered_props = {
k: v
for k, v in props.items()
if (k in filter_props.keys() and k not in exclude_props)
}
elif filter_props:
filtered_props = {
k: v for k, v in props.items() if k in filter_props.keys()
}
elif exclude_props:
filtered_props = {
k: v for k, v in props.items() if k not in exclude_props
}
else:
return props
missing_props = [
k for k in filter_props.keys() if k not in filtered_props.keys()
]
# add missing properties with default values
for k in missing_props:
filtered_props[k] = None
return filtered_props
def translate_edges(
self,
edge_tuples: Iterable,
) -> Generator[Union[BioCypherEdge, BioCypherRelAsNode], None, None]:
"""
Translates input edge representation to a representation that
conforms to the schema of the given BioCypher graph. For now
requires explicit statement of edge type on pass.
Args:
edge_tuples (list of tuples):
collection of tuples representing source and target of
an interaction via their unique ids as well as the type
of interaction in the original database notation, which
is translated to BioCypher notation using the `leaves`.
Can optionally possess its own ID.
"""
self._log_begin_translate(edge_tuples, "edges")
# legacy: deal with 4-tuples (no edge id)
# TODO remove for performance reasons once safe
edge_tuples = peekable(edge_tuples)
if len(edge_tuples.peek()) == 4:
edge_tuples = [
(None, src, tar, typ, props)
for src, tar, typ, props in edge_tuples
]
for _id, _src, _tar, _type, _props in edge_tuples:
# check for strict mode requirements
if self.strict_mode:
if not "source" in _props:
raise ValueError(
f"Edge {_id if _id else (_src, _tar)} does not have a `source` property.",
" This is required in strict mode.",
)
if not "licence" in _props:
raise ValueError(
f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property.",
" This is required in strict mode.",
)
# match the input label (_type) to
# a Biolink label from schema_config
bl_type = self._get_ontology_mapping(_type)
if bl_type:
# filter properties for those specified in schema_config if any
_filtered_props = self._filter_props(bl_type, _props)
rep = self.ontology.mapping.extended_schema[bl_type][
"represented_as"
]
if rep == "node":
if _id:
# if it brings its own ID, use it
node_id = _id
else:
# source target concat
node_id = (
str(_src)
+ "_"
+ str(_tar)
+ "_"
+ "_".join(str(v) for v in _filtered_props.values())
)
n = BioCypherNode(
node_id=node_id,
node_label=bl_type,
properties=_filtered_props,
)
# directionality check TODO generalise to account for
# different descriptions of directionality or find a
# more consistent solution for indicating directionality
if _filtered_props.get("directed") == True:
l1 = "IS_SOURCE_OF"
l2 = "IS_TARGET_OF"
elif _filtered_props.get(
"src_role",
) and _filtered_props.get("tar_role"):
l1 = _filtered_props.get("src_role")
l2 = _filtered_props.get("tar_role")
else:
l1 = l2 = "IS_PART_OF"
e_s = BioCypherEdge(
source_id=_src,
target_id=node_id,
relationship_label=l1,
# additional here
)
e_t = BioCypherEdge(
source_id=_tar,
target_id=node_id,
relationship_label=l2,
# additional here
)
yield BioCypherRelAsNode(n, e_s, e_t)
else:
edge_label = self.ontology.mapping.extended_schema[
bl_type
].get("label_as_edge")
if edge_label is None:
edge_label = bl_type
yield BioCypherEdge(
relationship_id=_id,
source_id=_src,
target_id=_tar,
relationship_label=edge_label,
properties=_filtered_props,
)
else:
self._record_no_type(_type, (_src, _tar))
self._log_finish_translate("edges")
def _record_no_type(self, _type: Any, what: Any) -> None:
"""
Records the type of a node or edge that is not represented in the
schema_config.
"""
logger.debug(f"No ontology type defined for `{_type}`: {what}")
if self.notype.get(_type, None):
self.notype[_type] += 1
else:
self.notype[_type] = 1
def get_missing_biolink_types(self) -> dict:
"""
Returns a dictionary of types that were not represented in the
schema_config.
"""
return self.notype
@staticmethod
def _log_begin_translate(_input: Iterable, what: str):
n = f"{len(_input)} " if hasattr(_input, "__len__") else ""
logger.debug(f"Translating {n}{what} to BioCypher")
@staticmethod
def _log_finish_translate(what: str):
logger.debug(f"Finished translating {what} to BioCypher.")
def _update_ontology_types(self):
"""
Creates a dictionary to translate from input labels to ontology labels.
If multiple input labels, creates mapping for each.
"""
self._ontology_mapping = {}
for key, value in self.ontology.mapping.extended_schema.items():
labels = value.get("input_label") or value.get("label_in_input")
if isinstance(labels, str):
self._ontology_mapping[labels] = key
elif isinstance(labels, list):
for label in labels:
self._ontology_mapping[label] = key
if value.get("label_as_edge"):
self._add_translation_mappings(labels, value["label_as_edge"])
else:
self._add_translation_mappings(labels, key)
def _get_ontology_mapping(self, label: str) -> Optional[str]:
"""
For each given input type ("input_label" or "label_in_input"), find the
corresponding ontology class in the leaves dictionary (from the
`schema_config.yaml`).
Args:
label:
The input type to find (`input_label` or `label_in_input` in
`schema_config.yaml`).
"""
return self._ontology_mapping.get(label, None)
def translate_term(self, term):
"""
Translate a single term.
"""
return self.mappings.get(term, None)
def reverse_translate_term(self, term):
"""
Reverse translate a single term.
"""
return self.reverse_mappings.get(term, None)
def translate(self, query):
"""
Translate a cypher query. Only translates labels as of now.
"""
for key in self.mappings:
query = query.replace(":" + key, ":" + self.mappings[key])
return query
def reverse_translate(self, query):
"""
Reverse translate a cypher query. Only translates labels as of
now.
"""
for key in self.reverse_mappings:
a = ":" + key + ")"
b = ":" + key + "]"
# TODO this conditional probably does not cover all cases
if a in query or b in query:
if isinstance(self.reverse_mappings[key], list):
raise NotImplementedError(
"Reverse translation of multiple inputs not "
"implemented yet. Many-to-one mappings are "
"not reversible. "
f"({key} -> {self.reverse_mappings[key]})",
)
else:
query = query.replace(
a,
":" + self.reverse_mappings[key] + ")",
).replace(b, ":" + self.reverse_mappings[key] + "]")
return query
def _add_translation_mappings(self, original_name, biocypher_name):
"""
Add translation mappings for a label and name. We use here the
PascalCase version of the BioCypher name, since sentence case is
not useful for Cypher queries.
"""
if isinstance(original_name, list):
for on in original_name:
self.mappings[on] = self.name_sentence_to_pascal(
biocypher_name,
)
else:
self.mappings[original_name] = self.name_sentence_to_pascal(
biocypher_name,
)
if isinstance(biocypher_name, list):
for bn in biocypher_name:
self.reverse_mappings[
self.name_sentence_to_pascal(
bn,
)
] = original_name
else:
self.reverse_mappings[
self.name_sentence_to_pascal(
biocypher_name,
)
] = original_name
@staticmethod
def name_sentence_to_pascal(name: str) -> str:
"""
Converts a name in sentence case to pascal case.
"""
# split on dots if dot is present
if "." in name:
return ".".join(
[_misc.sentencecase_to_pascalcase(n) for n in name.split(".")],
)
else:
return _misc.sentencecase_to_pascalcase(name)
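For illustration, the label translation and PascalCase conversion above amount to a dictionary lookup plus a case conversion. The following is a minimal standalone sketch of the idea, not the BioCypher API: the mapping and labels are invented, and the real conversion lives in _misc.sentencecase_to_pascalcase.

def sentence_to_pascal(name: str) -> str:
    # "small molecule" -> "SmallMolecule"; dotted names are converted per segment
    return ".".join(
        "".join(word.capitalize() for word in part.split(" "))
        for part in name.split(".")
    )

# hypothetical input-label -> PascalCase ontology-label mapping
mappings = {"uniprot_protein": sentence_to_pascal("protein")}

query = "MATCH (p:uniprot_protein) RETURN p"
for key, value in mappings.items():
    query = query.replace(":" + key, ":" + value)

print(query)  # MATCH (p:Protein) RETURN p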

View File

@ -0,0 +1,422 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
"""
import subprocess
from biocypher._logger import logger
logger.debug(f"Loading module {__name__}.")
from collections.abc import Iterable
import itertools
import neo4j_utils
from biocypher import _misc
from biocypher._config import config as _config
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._translate import Translator
__all__ = ["_Neo4jDriver"]
class _Neo4jDriver:
"""
Manages a BioCypher connection to a Neo4j database using the
``neo4j_utils.Driver`` class.
Args:
database_name (str): The name of the database to connect to.
wipe (bool): Whether to wipe the database before importing.
uri (str): The URI of the database.
user (str): The username to use for authentication.
password (str): The password to use for authentication.
multi_db (bool): Whether to use multi-database mode.
fetch_size (int): The number of records to fetch at a time.
increment_version (bool): Whether to increment the version number.
translator (Translator): The translator to use for mapping.
"""
def __init__(
self,
database_name: str,
uri: str,
user: str,
password: str,
multi_db: bool,
translator: Translator,
wipe: bool = False,
fetch_size: int = 1000,
increment_version: bool = True,
):
self.translator = translator
self._driver = neo4j_utils.Driver(
db_name=database_name,
db_uri=uri,
db_user=user,
db_passwd=password,
fetch_size=fetch_size,
wipe=wipe,
multi_db=multi_db,
raise_errors=True,
)
# check for biocypher config in connected graph
if wipe:
self.init_db()
if increment_version:
# set new current version node
self._update_meta_graph()
def _update_meta_graph(self):
logger.info("Updating Neo4j meta graph.")
# find current version node
db_version = self._driver.query(
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
)
# add version node
self.add_biocypher_nodes(self.translator.ontology)
# connect version node to previous
if db_version[0]:
previous = db_version[0][0]
previous_id = previous["v"]["id"]
e_meta = BioCypherEdge(
previous_id,
self.translator.ontology.get_dict().get("node_id"),
"PRECEDES",
)
self.add_biocypher_edges(e_meta)
def init_db(self):
"""
Used to initialise a property graph database by setting up new
constraints. Wipe has been performed by the ``neo4j_utils.Driver``
class already.
Todo:
- set up constraint creation interactively depending on the
need of the database
"""
logger.info("Initialising database.")
self._create_constraints()
def _create_constraints(self):
"""
Creates constraints on node types in the graph. Used for
initial setup.
Grabs leaves of the ``schema_config.yaml`` file and creates
constraints on the id of all entities represented as nodes.
"""
logger.info("Creating constraints for node types in config.")
major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
# get structure
for leaf in self.translator.ontology.mapping.extended_schema.items():
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
if leaf[1]["represented_as"] == "node":
if major_neo4j_version >= 5:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS FOR (n:`{label}`) "
"REQUIRE n.id IS UNIQUE"
)
self._driver.query(s)
else:
s = (
f"CREATE CONSTRAINT `{label}_id` "
f"IF NOT EXISTS ON (n:`{label}`) "
"ASSERT n.id IS UNIQUE"
)
self._driver.query(s)
def _get_neo4j_version(self):
"""Get neo4j version."""
try:
neo4j_version = self._driver.query(
"""
CALL dbms.components()
YIELD name, versions, edition
UNWIND versions AS version
RETURN version AS version
""",
)[0][0]["version"]
return neo4j_version
except Exception as e:
logger.warning(
f"Error detecting Neo4j version: {e} use default version 4.0.0."
)
return "4.0.0"
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic node adder method to add any kind of input to the graph via the
:class:`biocypher.create.BioCypherNode` class. Employs translation
functionality and calls the :meth:`add_biocypher_nodes()` method.
Args:
id_type_tuples (iterable of 3-tuple): for each node to add to
the biocypher graph, a 3-tuple with the following layout:
first, the (unique if constrained) ID of the node; second, the
type of the node, capitalised or PascalCase and in noun form
(Neo4j primary label, e.g. `:Protein`); and third, a dictionary
of arbitrary properties the node should possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_nodes()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_nodes(id_type_tuples)
return self.add_biocypher_nodes(bn)
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
"""
Generic edge adder method to add any kind of input to the graph
via the :class:`biocypher.create.BioCypherEdge` class. Employs
translation functionality and calls the
:meth:`add_biocypher_edges()` method.
Args:
id_src_tar_type_tuples (iterable of 5-tuple):
for each edge to add to the biocypher graph, a 5-tuple
with the following layout: first, the optional unique ID
of the interaction. This can be `None` if there is no
systematic identifier (which for many interactions is
the case). Second and third, the (unique if constrained)
IDs of the source and target nodes of the relationship;
fourth, the type of the relationship; and fifth, a
dictionary of arbitrary properties the edge should
possess (can be empty).
Returns:
2-tuple: the query result of :meth:`add_biocypher_edges()`
- first entry: data
- second entry: Neo4j summary.
"""
bn = self.translator.translate_edges(id_src_tar_type_tuples)
return self.add_biocypher_edges(bn)
def add_biocypher_nodes(
self,
nodes: Iterable[BioCypherNode],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts a node type handoff class
(:class:`biocypher.create.BioCypherNode`) with id,
label, and a dict of properties (passing on the type of
property, i.e., ``int``, ``str``, ...).
The dict retrieved by the
:meth:`biocypher.create.BioCypherNode.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding node id
and label, and adding all other properties from the 'properties'
key of the dict. The merge is performed via APOC, matching only
on node id to prevent duplicates. The same properties are set on
match and on create, irrespective of the actual event.
Args:
nodes:
An iterable of :class:`biocypher.create.BioCypherNode` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
True for success, False otherwise.
"""
try:
nodes = _misc.to_list(nodes)
entities = [node.get_dict() for node in nodes]
except AttributeError:
msg = "Nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
logger.info(f"Merging {len(entities)} nodes.")
entity_query = (
"UNWIND $entities AS ent "
"CALL apoc.merge.node([ent.node_label], "
"{id: ent.node_id}, ent.properties, ent.properties) "
"YIELD node "
"RETURN node"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
entity_query,
parameters={
"entities": entities,
},
)
logger.info("Finished merging nodes.")
return result
def add_biocypher_edges(
self,
edges: Iterable[BioCypherEdge],
explain: bool = False,
profile: bool = False,
) -> bool:
"""
Accepts an edge type handoff class
(:class:`biocypher.create.BioCypherEdge`) with source
and target ids, label, and a dict of properties (passing on the
type of property, i.e., int, string ...).
The individual edge is either passed as a singleton, in the case
of representation as an edge in the graph, or as a 4-tuple, in
the case of representation as a node (with two edges connecting
to interaction partners).
The dict retrieved by the
:meth:`biocypher.create.BioCypherEdge.get_dict()` method is
passed into Neo4j as a map of maps, explicitly encoding source
and target ids and the relationship label, and adding all edge
properties from the 'properties' key of the dict. The merge is
performed via APOC, matching only on source and target id to
prevent duplicates. The same properties are set on match and on
create, irrespective of the actual event.
Args:
edges:
An iterable of :class:`biocypher.create.BioCypherEdge` objects.
explain:
Call ``EXPLAIN`` on the CYPHER query.
profile:
Do profiling on the CYPHER query.
Returns:
`True` for success, `False` otherwise.
"""
edges = _misc.ensure_iterable(edges)
edges = itertools.chain(*(_misc.ensure_iterable(i) for i in edges))
nodes = []
rels = []
try:
for e in edges:
if hasattr(e, "get_node"):
nodes.append(e.get_node())
rels.append(e.get_source_edge().get_dict())
rels.append(e.get_target_edge().get_dict())
else:
rels.append(e.get_dict())
except AttributeError:
msg = "Edges and nodes must have a `get_dict` method."
logger.error(msg)
raise ValueError(msg)
self.add_biocypher_nodes(nodes)
logger.info(f"Merging {len(rels)} edges.")
# cypher query
# merging only on the ids of the entities, passing the
# properties on match and on create;
# TODO add node labels?
node_query = (
"UNWIND $rels AS r "
"MERGE (src {id: r.source_id}) "
"MERGE (tar {id: r.target_id}) "
)
self._driver.query(node_query, parameters={"rels": rels})
edge_query = (
"UNWIND $rels AS r "
"MATCH (src {id: r.source_id}) "
"MATCH (tar {id: r.target_id}) "
"WITH src, tar, r "
"CALL apoc.merge.relationship"
"(src, r.relationship_label, NULL, "
"r.properties, tar, r.properties) "
"YIELD rel "
"RETURN rel"
)
method = "explain" if explain else "profile" if profile else "query"
result = getattr(self._driver, method)(
edge_query, parameters={"rels": rels}
)
logger.info("Finished merging edges.")
return result
def get_driver(
dbms: str,
translator: "Translator",
):
"""
Function to return the DBMS driver instance.
Returns:
instance: an instance of the selected driver class, or None if the DBMS is not supported.
"""
dbms_config = _config(dbms)
if dbms == "neo4j":
return _Neo4jDriver(
database_name=dbms_config["database_name"],
wipe=dbms_config["wipe"],
uri=dbms_config["uri"],
user=dbms_config["user"],
password=dbms_config["password"],
multi_db=dbms_config["multi_db"],
translator=translator,
)
return None
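The expected shapes of the inputs to add_nodes and add_edges, as described in the docstrings above, can be sketched as plain tuples; the identifiers and properties below are invented for illustration.

# 3-tuples for add_nodes: (node id, node type, properties dict)
node_tuples = [
    ("uniprot:P12345", "Protein", {"name": "example protein"}),
    ("uniprot:P67890", "Protein", {}),
]

# 5-tuples for add_edges: (optional edge id, source id, target id, type, properties dict)
edge_tuples = [
    (None, "uniprot:P12345", "uniprot:P67890", "Interacts_With", {"source": "example"}),
]

# requires a running Neo4j instance and a configured Translator:
# driver = get_driver("neo4j", translator)
# driver.add_nodes(node_tuples)
# driver.add_edges(edge_tuples)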

View File

@ -0,0 +1,90 @@
import pandas as pd
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
class Pandas:
def __init__(self, translator, deduplicator):
self.translator = translator
self.deduplicator = deduplicator
self.dfs = {}
def _separate_entity_types(self, entities):
"""
Given mixed iterable of BioCypher objects, separate them into lists by
type. Also deduplicates using the `Deduplicator` instance.
"""
lists = {}
for entity in entities:
if (
not isinstance(entity, BioCypherNode)
and not isinstance(entity, BioCypherEdge)
and not isinstance(entity, BioCypherRelAsNode)
):
raise TypeError(
"Expected a BioCypherNode / BioCypherEdge / "
f"BioCypherRelAsNode, got {type(entity)}."
)
if isinstance(entity, BioCypherNode):
seen = self.deduplicator.node_seen(entity)
elif isinstance(entity, BioCypherEdge):
seen = self.deduplicator.edge_seen(entity)
elif isinstance(entity, BioCypherRelAsNode):
seen = self.deduplicator.rel_as_node_seen(entity)
if seen:
continue
if isinstance(entity, BioCypherRelAsNode):
node = entity.get_node()
source_edge = entity.get_source_edge()
target_edge = entity.get_target_edge()
_type = node.get_type()
if _type not in lists:
lists[_type] = []
lists[_type].append(node)
_source_type = source_edge.get_type()
if _source_type not in lists:
lists[_source_type] = []
lists[_source_type].append(source_edge)
_target_type = target_edge.get_type()
if _target_type not in lists:
lists[_target_type] = []
lists[_target_type].append(target_edge)
continue
_type = entity.get_type()
if _type not in lists:
lists[_type] = []
lists[_type].append(entity)
return lists
def add_tables(self, entities):
"""
Add Pandas dataframes for each node and edge type in the input.
"""
lists = self._separate_entity_types(entities)
for _type, _entities in lists.items():
self._add_entity_df(_type, _entities)
def _add_entity_df(self, _type, _entities):
df = pd.DataFrame(
pd.json_normalize([node.get_dict() for node in _entities])
)
# replace "properties." with "" in column names
df.columns = [col.replace("properties.", "") for col in df.columns]
if _type not in self.dfs:
self.dfs[_type] = df
else:
self.dfs[_type] = pd.concat(
[self.dfs[_type], df], ignore_index=True
)
return self.dfs[_type]
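The flattening performed by _add_entity_df can be reproduced in isolation. Below is a small standalone pandas sketch using dicts shaped like BioCypherNode.get_dict() output; the ids and properties are invented for illustration.

import pandas as pd

records = [
    {"node_id": "uniprot:P12345", "node_label": "Protein", "properties": {"name": "example"}},
    {"node_id": "uniprot:P67890", "node_label": "Protein", "properties": {"name": "another"}},
]

df = pd.json_normalize(records)
# nested properties arrive as "properties.<key>" columns; strip the prefix
df.columns = [col.replace("properties.", "") for col in df.columns]
print(df.columns.tolist())  # ['node_id', 'node_label', 'name']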

View File

File diff suppressed because it is too large

View File

@ -0,0 +1,113 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# Michael Hartung
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from biocypher._logger import logger
from biocypher.output.write.graph._rdf import _RDFWriter
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
from biocypher.output.write.graph._networkx import _NetworkXWriter
from biocypher.output.write.relational._csv import _PandasCSVWriter
from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
logger.debug(f"Loading module {__name__}.")
from typing import TYPE_CHECKING
from biocypher._config import config as _config
__all__ = ["get_writer", "DBMS_TO_CLASS"]
if TYPE_CHECKING:
from biocypher._translate import Translator
from biocypher._deduplicate import Deduplicator
DBMS_TO_CLASS = {
"neo": _Neo4jBatchWriter,
"neo4j": _Neo4jBatchWriter,
"Neo4j": _Neo4jBatchWriter,
"postgres": _PostgreSQLBatchWriter,
"postgresql": _PostgreSQLBatchWriter,
"PostgreSQL": _PostgreSQLBatchWriter,
"arango": _ArangoDBBatchWriter,
"arangodb": _ArangoDBBatchWriter,
"ArangoDB": _ArangoDBBatchWriter,
"sqlite": _SQLiteBatchWriter,
"sqlite3": _SQLiteBatchWriter,
"rdf": _RDFWriter,
"RDF": _RDFWriter,
"csv": _PandasCSVWriter,
"CSV": _PandasCSVWriter,
"pandas": _PandasCSVWriter,
"Pandas": _PandasCSVWriter,
"networkx": _NetworkXWriter,
"NetworkX": _NetworkXWriter,
}
def get_writer(
dbms: str,
translator: "Translator",
deduplicator: "Deduplicator",
output_directory: str,
strict_mode: bool,
):
"""
Function to return the writer class based on the selection in the config
file.
Args:
dbms: the database management system; for options, see DBMS_TO_CLASS.
translator: the Translator object.
deduplicator: the Deduplicator object.
output_directory: the directory to write the output files to.
strict_mode: whether to use strict mode.
Returns:
instance: an instance of the selected writer class.
"""
dbms_config = _config(dbms)
writer = DBMS_TO_CLASS.get(dbms)
if writer is None:
raise ValueError(f"Unknown dbms: {dbms}")
return writer(
translator=translator,
deduplicator=deduplicator,
delimiter=dbms_config.get("delimiter"),
array_delimiter=dbms_config.get("array_delimiter"),
quote=dbms_config.get("quote_character"),
output_directory=output_directory,
db_name=dbms_config.get("database_name"),
import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
wipe=dbms_config.get("wipe"),
strict_mode=strict_mode,
skip_bad_relationships=dbms_config.get(
"skip_bad_relationships"
), # neo4j
skip_duplicate_nodes=dbms_config.get(
"skip_duplicate_nodes"
), # neo4j
db_user=dbms_config.get("user"), # psql
db_password=dbms_config.get("password"), # psql
db_port=dbms_config.get("port"), # psql
rdf_format=dbms_config.get("rdf_format"), # rdf
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
)
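The writer selection above is a plain, case-sensitive dictionary lookup followed by instantiation with config-derived keyword arguments. A self-contained sketch of the dispatch pattern, with a dummy writer class standing in for the real ones:

class _DummyWriter:
    def __init__(self, **kwargs):
        self.kwargs = kwargs

REGISTRY = {"csv": _DummyWriter}

def pick_writer(dbms: str, **kwargs):
    writer_cls = REGISTRY.get(dbms)
    if writer_cls is None:
        raise ValueError(f"Unknown dbms: {dbms}")
    return writer_cls(**kwargs)

writer = pick_writer("csv", output_directory="biocypher-out")
print(type(writer).__name__)  # _DummyWriter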

View File

@ -0,0 +1,200 @@
from abc import ABC, abstractmethod
from typing import Union, Optional
from collections.abc import Iterable
import os
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
from biocypher._logger import logger
from biocypher._translate import Translator
from biocypher._deduplicate import Deduplicator
__all__ = ["_Writer"]
class _Writer(ABC):
"""Abstract class for writing node and edge representations to disk.
Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
are implemented in the child classes. Any concrete writer needs to
implement at least:
- _write_node_data
- _write_edge_data
- _construct_import_call
- _get_import_script_name
Args:
translator (Translator): Instance of :py:class:`Translator` to enable translation of
nodes and manipulation of properties.
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
of nodes and edges.
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
Raises:
NotImplementedError: Writer implementation must override '_write_node_data'
NotImplementedError: Writer implementation must override '_write_edge_data'
NotImplementedError: Writer implementation must override '_construct_import_call'
NotImplementedError: Writer implementation must override '_get_import_script_name'
"""
def __init__(
self,
translator: Translator,
deduplicator: Deduplicator,
output_directory: Optional[str] = None,
strict_mode: bool = False,
*args,
**kwargs,
):
"""Abstract class for writing node and edge representations to disk.
Args:
translator (Translator): Instance of :py:class:`Translator` to enable translation of
nodes and manipulation of properties.
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
of nodes and edges.
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
"""
self.translator = translator
self.deduplicator = deduplicator
self.strict_mode = strict_mode
self.output_directory = output_directory
if os.path.exists(self.output_directory):
if kwargs.get("write_to_file", True):
logger.warning(
f"Output directory `{self.output_directory}` already exists. "
"If this is not planned, file consistency may be compromised."
)
else:
logger.info(f"Creating output directory `{self.output_directory}`.")
os.makedirs(self.output_directory)
@abstractmethod
def _write_node_data(
self,
nodes: Iterable[
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
],
) -> bool:
"""Implement how to output.write nodes to disk.
Args:
nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: The return value. True for success, False otherwise.
"""
raise NotImplementedError(
"Writer implementation must override 'write_nodes'"
)
@abstractmethod
def _write_edge_data(
self,
edges: Iterable[
Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
],
) -> bool:
"""Implement how to output.write edges to disk.
Args:
edges (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: The return value. True for success, False otherwise.
"""
raise NotImplementedError(
"Writer implementation must override 'write_edges'"
)
@abstractmethod
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: command for importing the output files into a DBMS.
"""
raise NotImplementedError(
"Writer implementation must override '_construct_import_call'"
)
@abstractmethod
def _get_import_script_name(self) -> str:
"""Returns the name of the import script.
Returns:
str: The name of the import script (ending in .sh)
"""
raise NotImplementedError(
"Writer implementation must override '_get_import_script_name'"
)
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
):
"""Wrapper for writing nodes.
Args:
nodes (BioCypherNode): a list or generator of nodes in
:py:class:`BioCypherNode` format
batch_size (int): The batch size for writing nodes.
force (bool): Whether to force writing nodes even if their type is
not present in the schema.
Returns:
bool: The return value. True for success, False otherwise.
"""
passed = self._write_node_data(nodes)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self, edges, batch_size: int = int(1e6), force: bool = False
):
"""Wrapper for writing edges.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The batch size for writing edges.
force (bool): Whether to force writing edges even if their type is
not present in the schema.
Returns:
bool: The return value. True for success, False otherwise.
"""
passed = self._write_edge_data(edges)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def write_import_call(self):
"""
Function to write the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name, to the export folder as txt.
Returns:
str: The path of the file holding the import call.
"""
file_path = os.path.join(
self.output_directory, self._get_import_script_name()
)
logger.info(
f"Writing {self.__class__.__name__} import call to `{file_path}`."
)
with open(file_path, "w", encoding="utf-8") as f:
f.write(self._construct_import_call())
return file_path
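A concrete writer only needs to provide the four abstract methods listed above. The following is a minimal sketch of such a subclass; the class name and trivial bodies are invented for illustration, and instantiating it still requires Translator and Deduplicator instances.

from biocypher.output.write._writer import _Writer

class _NoOpWriter(_Writer):
    """Toy writer that discards all data; shows the required interface only."""

    def _write_node_data(self, nodes) -> bool:
        return True

    def _write_edge_data(self, edges) -> bool:
        return True

    def _construct_import_call(self) -> str:
        return "echo 'nothing to import'"

    def _get_import_script_name(self) -> str:
        return "noop-import-call.sh"

# writer = _NoOpWriter(translator=translator, deduplicator=deduplicator, output_directory="out")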

View File

@ -0,0 +1,241 @@
import os
from biocypher._logger import logger
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
"""
Class for writing node and edge representations to disk using the format
specified by ArangoDB for the use of "arangoimport". Output files are
similar to Neo4j, but with a different header format.
"""
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default prefix for the arangoimport binary (empty by default)
"""
return ""
def _get_import_script_name(self) -> str:
"""
Returns the name of the ArangoDB import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "arangodb-import-call.sh"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
_id = "_key"
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [[_id], props_list]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the arangoimport statement
# do once for each part file
parts = self.parts.get(label, [])
if not parts:
raise ValueError(
f"No parts found for node label {label}. "
f"Check that the data was parsed first.",
)
for part in parts:
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
part,
)
self.import_call_nodes.add(
(
import_call_header_path,
import_call_parts_path,
collection,
)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check for file exists
if os.path.exists(header_path):
logger.warning(
f"Header file {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k in props.keys():
props_list.append(f"{k}")
out_list = ["_from", "_key", *props_list, "_to"]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = None
if not self.translator.ontology.mapping.extended_schema.get(label):
for (
_,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
collection = v.get("db_collection_name", None)
break
else:
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the arangoimport statement (import call path
# may be different from actual output path)
header_import_call_path = os.path.join(
self.import_call_file_prefix,
header,
)
parts_import_call_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(
header_import_call_path,
parts_import_call_path,
collection,
)
)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for arangoimport
"""
import_call = (
f"{self.import_call_bin_prefix}arangoimp "
f"--type csv "
f'--separator="{self.escaped_delim}" '
)
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
node_lines = ""
# node import calls: one line per node type
for header_path, parts_path, collection in self.import_call_nodes:
line = (
f"{import_call} "
f"--headers-file {header_path} "
f"--file= {parts_path} "
)
if collection:
line += f"--create-collection --collection {collection} "
node_lines += f"{line}\n"
edge_lines = ""
# edge import calls: one line per edge type
for header_path, parts_path, collection in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return node_lines + edge_lines
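Each entry of import_call_nodes and import_call_edges is a (header path, parts path, collection) tuple that becomes one import line. Below is a standalone sketch of the string assembly with made-up paths; the flags are taken from the node loop above.

base = "arangoimp --type csv --separator=';'"

def import_line(header_path, parts_path, collection=None):
    line = f"{base} --headers-file {header_path} --file={parts_path}"
    if collection:
        line += f" --create-collection --collection {collection}"
    return line

print(import_line("Protein-header.csv", "Protein-part.*", "proteins"))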

View File

@ -0,0 +1,502 @@
import os
import glob
import pandas as pd
from biocypher._logger import logger
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
class _Neo4jBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by Neo4j for the use of admin import. Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
Neo4j-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
def __init__(self, *args, **kwargs):
"""
Constructor.
Checks the version of Neo4j and adds a command scope if version >= 5.
Returns:
_Neo4jBatchWriter: An instance of the writer.
"""
# Should read the configuration and setup import_call_bin_prefix.
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the neo4j admin import location
"""
return "bin/"
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the neo4j admin-import.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for the neo4j admin import
"""
string = self.adelim.join(string_list)
return f"{self.quote}{string}{self.quote}"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
_id = ":ID"
##MeDaX dev remark:
##From Fhir data we get case sensitive labels. E.g. 'Procedure' and 'procedure' are two distinct node types.
##Because we are converting Resources to more specific node classes using their "resourceType" attribute.
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double", "dbl"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in ["bool", "boolean"]:
# TODO Neo4j boolean support / spelling?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
# create list of lists and flatten
out_list = [[_id], props_list, [":LABEL"]]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write back to file WITHOUT including the header
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.warning(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to neo4j admin import statement (import call file
# path may be different from actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_nodes.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check for file exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in [
"bool",
"boolean",
]: # TODO does Neo4j support bool?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
skip_id = False
schema_label = None
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
skip_id = True
elif not self.translator.ontology.mapping.extended_schema.get(
label
):
# find label in schema by label_as_edge
for (
k,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
schema_label = k
break
else:
schema_label = label
out_list = [":START_ID"]
if schema_label:
if (
self.translator.ontology.mapping.extended_schema.get(
schema_label
).get("use_id")
is False
):
skip_id = True
if not skip_id:
out_list.append("id")
out_list.extend(props_list)
out_list.extend([":END_ID", ":TYPE"])
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
logger.debug(f"Existing header: {existing_header}")
logger.debug(f"New header: {out_list}")
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write back to file WITHOUT including the header
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.warning(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to neo4j admin import statement (import call file
# path may be different from actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _get_import_script_name(self) -> str:
"""
Returns the name of the neo4j admin import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "neo4j-admin-import-call.sh"
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for neo4j-admin import
"""
import_call_neo4j_v4 = self._get_import_call(
"import", "--database=", "--force="
)
import_call_neo4j_v5 = self._get_import_call(
"database import full", "", "--overwrite-destination="
)
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
return import_script
def _get_import_call(
self, import_cmd: str, database_cmd: str, wipe_cmd: str
) -> str:
"""Get parametrized import call for Neo4j 4 or 5+.
Args:
import_cmd (str): The import command to use.
database_cmd (str): The database command to use.
wipe_cmd (str): The wipe command to use.
Returns:
str: The import call.
"""
import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
import_call += f"{database_cmd}{self.db_name} "
import_call += f'--delimiter="{self.escaped_delim}" '
import_call += f'--array-delimiter="{self.escaped_adelim}" '
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
if self.wipe:
import_call += f"{wipe_cmd}true "
if self.skip_bad_relationships:
import_call += "--skip-bad-relationships=true "
if self.skip_duplicate_nodes:
import_call += "--skip-duplicate-nodes=true "
# append node import calls
for header_path, parts_path in self.import_call_nodes:
import_call += f'--nodes="{header_path},{parts_path}" '
# append edge import calls
for header_path, parts_path in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return import_call
def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
"""
Adapt a CSV table to a new header structure, placing new columns in their correct positions.
Parameters:
old_header (list): The original header columns
new_header (list): The new header columns
csv_file_path (str): Path to the CSV file
Returns:
pandas.DataFrame: CSV data with the new header structure
"""
# Step 1: Read the CSV data without headers
df = pd.read_csv(csv_file_path, sep=self.delim, header=None)
# Step 2: If the file is empty, return empty DataFrame with new headers
if df.empty:
return pd.DataFrame(columns=new_header)
# Step 3: If column count doesn't match old_header length, handle the mismatch
if len(df.columns) != len(old_header):
print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})")
# If file has fewer columns than old_header, pad with NaN
if len(df.columns) < len(old_header):
for i in range(len(df.columns), len(old_header)):
df[i] = None
# If file has more columns than old_header, truncate
else:
df = df.iloc[:, :len(old_header)]
# Step 4: Assign old header names to the dataframe
df.columns = old_header
# Step 5: Create a new DataFrame with the correct structure
new_df = pd.DataFrame(columns=new_header)
# Step 6: For each column in the new header, find its position in the old header
for new_col_idx, new_col in enumerate(new_header):
if new_col in old_header:
# If column exists in old header, copy data
new_df[new_col] = df[new_col]
else:
# If new column, add empty column
new_df[new_col] = None
# Step 7: Ensure columns are in the exact order of new_header
new_df = new_df[new_header]
return new_df
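The column realignment done by adapt_csv_to_new_header can be exercised on a small in-memory CSV. The sketch below mirrors the effect (existing columns keep their data, new columns are added empty, order follows the new header) rather than calling the method itself; headers and rows are invented.

import io
import pandas as pd

old_header = [":ID", "name", ":LABEL"]
new_header = [":ID", "name", "species", ":LABEL"]

csv_text = "p1;alpha;Protein\np2;beta;Protein\n"
df = pd.read_csv(io.StringIO(csv_text), sep=";", header=None)
df.columns = old_header

# same effect as the method: keep existing data, add empty new columns, reorder
new_df = df.reindex(columns=new_header)
print(new_df.columns.tolist())  # [':ID', 'name', 'species', ':LABEL']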

View File

@ -0,0 +1,76 @@
import pickle
import networkx as nx
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.write.relational._csv import _PandasCSVWriter
class _NetworkXWriter(_Writer):
"""
Class for writing node and edges to a networkx DiGraph.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
self.G = nx.DiGraph()
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
logger.info(
f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
)
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
pickle.dump(self.G, f)
import_call = "import pickle\n"
import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_networkx.py"
def _write_node_data(self, nodes) -> bool:
passed = self.csv_writer._write_entities_to_file(nodes)
self.add_to_networkx()
return passed
def _write_edge_data(self, edges) -> bool:
passed = self.csv_writer._write_entities_to_file(edges)
self.add_to_networkx()
return passed
def add_to_networkx(self) -> bool:
all_dfs = self.csv_writer.stored_dfs
node_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("node_id").any()
]
edge_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("source_id").any()
and df.columns.str.contains("target_id").any()
]
for df in node_dfs:
nodes = df.set_index("node_id").to_dict(orient="index")
self.G.add_nodes_from(nodes.items())
for df in edge_dfs:
edges = df.set_index(["source_id", "target_id"]).to_dict(
orient="index"
)
self.G.add_edges_from(
(
(source, target, attrs)
for (source, target), attrs in edges.items()
)
)
return True
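The DataFrame-to-graph conversion in add_to_networkx can be reproduced standalone; a short sketch with invented node and edge tables:

import networkx as nx
import pandas as pd

node_df = pd.DataFrame({"node_id": ["p1", "p2"], "name": ["alpha", "beta"]})
edge_df = pd.DataFrame({"source_id": ["p1"], "target_id": ["p2"], "weight": [0.9]})

G = nx.DiGraph()
nodes = node_df.set_index("node_id").to_dict(orient="index")
G.add_nodes_from(nodes.items())
edges = edge_df.set_index(["source_id", "target_id"]).to_dict(orient="index")
G.add_edges_from(
    (source, target, attrs) for (source, target), attrs in edges.items()
)
print(G.number_of_nodes(), G.number_of_edges())  # 2 1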

View File

@ -0,0 +1,515 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Loes van den Biggelaar
# Sebastian Lobentanzer
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from types import GeneratorType
from typing import Union
import os
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
from rdflib.namespace import (
_NAMESPACE_PREFIXES_CORE,
_NAMESPACE_PREFIXES_RDFLIB,
)
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _RDFWriter(_BatchWriter):
"""
Class to write BioCypher's property graph into an RDF format using
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
is done keeping only the minimum information about node and edges,
skipping all properties.
"""
def _get_import_script_name(self) -> str:
"""
Returns the name of the RDF admin import script.
This function is applicable for RDF export.
Returns:
str: The name of the import script (ending in .sh)
"""
return "rdf-import-call.sh"
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the RDF admin import location
"""
return "bin/"
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
"""
Function to check if the specified RDF format is supported.
Args:
rdf_format (str): The RDF format to check.
Returns:
bool: Returns True if rdf format supported, False otherwise.
"""
supported_formats = [
"xml",
"n3",
"turtle",
"nt",
"pretty-xml",
"trix",
"trig",
"nquads",
"json-ld",
]
if rdf_format not in supported_formats:
logger.error(
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
)
return False
else:
# RDF graph does not support 'ttl' format, only 'turtle' format. However, the preferred file extension is always '.ttl'.
if self.rdf_format == "turtle":
self.extension = "ttl"
elif self.rdf_format == "ttl":
self.rdf_format = "turtle"
self.extension = "ttl"
else:
self.extension = self.rdf_format
return True
def _write_single_edge_list_to_file(
self,
edge_list: list,
label: str,
prop_dict: dict,
):
"""
This function takes one list of biocypher edges and writes them
to an RDF file with the given format.
Args:
edge_list (list): list of BioCypherEdges to be written
label (str): the label (type) of the edge
prop_dict (dict): properties of node class passed from parsing
function and their types
Returns:
bool: The return value. True for success, False otherwise.
"""
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
logger.error("Edges must be passed as type BioCypherEdge.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for edge in edge_list:
rdf_subject = edge.get_source_id()
rdf_object = edge.get_target_id()
rdf_predicate = edge.get_id()
rdf_properties = edge.get_properties()
if rdf_predicate is None:
rdf_predicate = rdf_subject + rdf_object
edge_label = self.translator.name_sentence_to_pascal(
edge.get_label()
)
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
graph.add((edge_uri, RDF.type, RDFS.Class))
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
RDF.type,
edge_uri,
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["subject"],
self.subject_to_uri(rdf_subject),
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["object"],
self.subject_to_uri(rdf_object),
)
)
# add properties to the transformed edge --> node
for key, value in rdf_properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_predicate, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def add_property_to_graph(
self,
graph: Graph,
rdf_subject: str,
rdf_object: str,
rdf_predicate: str,
):
"""
Function to add the properties to an RDF node. It takes the graph, the subject, object, and predicate of the RDF triple.
It checks if the property is a list and adds it to the graph accordingly. Otherwise, it checks if the string represents a list.
If it does, it transforms it to a list and adds it to the graph. If not, it adds the property to the graph as a literal.
If the property is neither a list nor a string, it is also added as a literal.
Args:
graph (RDFLib.Graph): The RDF graph to add the nodes to.
rdf_subject (str): The subject of the RDF triple.
rdf_object (str): The object of the RDF triple.
rdf_predicate (str): The predicate of the RDF triple.
Returns:
None
"""
if isinstance(rdf_object, list):
for obj in rdf_object:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(obj),
)
)
elif isinstance(rdf_object, str):
if rdf_object.startswith("[") and rdf_object.endswith("]"):
self.add_property_to_graph(
graph,
rdf_subject,
self.transform_string_to_list(rdf_object),
rdf_predicate,
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
def transform_string_to_list(self, string_list: str) -> list:
"""
Function to transform a string representation of a list into a list.
Args:
string_list (str): The string representation of the list.
Returns:
list: The list representation of the input string.
"""
return (
string_list.replace("[", "")
.replace("]", "")
.replace("'", "")
.split(", ")
)
def _write_single_node_list_to_file(
self,
node_list: list,
label: str,
prop_dict: dict,
labels: str,
):
"""
This function takes a list of BioCypherNodes and writes them
to an RDF file in the specified format.
Args:
node_list (list): A list of BioCypherNodes to be written.
label (str): The label (type) of the nodes.
prop_dict (dict): A dictionary of properties and their types for the node class.
Returns:
bool: True if the writing is successful, False otherwise.
"""
if not all(isinstance(n, BioCypherNode) for n in node_list):
logger.error("Nodes must be passed as type BioCypherNode.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for n in node_list:
rdf_subject = n.get_id()
rdf_object = n.get_label()
properties = n.get_properties()
class_name = self.translator.name_sentence_to_pascal(rdf_object)
graph.add(
(
self.rdf_namespaces["biocypher"][class_name],
RDF.type,
RDFS.Class,
)
)
graph.add(
(
self.subject_to_uri(rdf_subject),
RDF.type,
self.rdf_namespaces["biocypher"][class_name],
)
)
for key, value in properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_subject, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
Args:
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
batch_size (int): The number of nodes to write in each batch.
force (bool): Flag to force the writing even if the output file already exists.
Returns:
bool: True if the writing is successful, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing node data, wrong RDF format")
return False
# write node data using _write_node_data method
passed = self._write_node_data(nodes, batch_size, force)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self,
edges: Union[list, GeneratorType],
batch_size: int = int(1e6),
) -> bool:
"""
Wrapper for writing edges in RDF format. It calls the _write_edge_data()
function, specifying its edge data.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The number of edges to write in each batch.
Returns:
bool: The return value. True for success, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing edge data, wrong RDF format")
return False
# write edge data using _write_edge_data method
passed = self._write_edge_data(edges, batch_size=batch_size)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def _construct_import_call(self) -> str:
"""
Function to write the import call.
This function is not applicable for RDF.
Returns:
str: An empty string, since no import call is needed for RDF.
"""
return ""
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the RDF admin-import.
This function is not applicable for RDF.
Args:
string_list (list): list of ontology strings
Returns:
bool: Always True, since arrays are handled natively in RDF output.
"""
return True
def _write_node_headers(self):
"""
Abstract method that takes care of importing properties of a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def _write_edge_headers(self):
"""
Abstract method to write a database import-file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def subject_to_uri(self, subject: str) -> str:
"""
Converts the subject to a proper URI using the available namespaces.
If the conversion fails, it defaults to the biocypher prefix.
Args:
subject (str): The subject to be converted to a URI.
Returns:
str: The corresponding URI for the subject.
"""
try:
_pref, _id = subject.split(":")
if _pref in self.rdf_namespaces.keys():
return self.rdf_namespaces[_pref][_id]
else:
return self.rdf_namespaces["biocypher"][subject]
except ValueError:
return self.rdf_namespaces["biocypher"][subject]
def property_to_uri(self, property_name: str) -> str:
"""
Converts a property name to its corresponding URI.
This function takes a property name and searches for its corresponding URI in various namespaces.
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
Args:
property_name (str): The property name to be converted to a URI.
Returns:
str: The corresponding URI for the input property name.
"""
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
for namespace in _NAMESPACE_PREFIXES_CORE.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
for namespace in [SKOS, DC, DCTERMS]:
if property_name in namespace:
return namespace[property_name]
# If the property name is still not found, try other namespaces from rdflib.
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is "licence", it recursively calls the function with "license" as the input.
if property_name == "licence":
return self.property_to_uri("license")
# TODO: add an option to search trough manually implemented namespaces
# If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
# TODO: give a warning and try to prevent this option altogether
return self.rdf_namespaces["biocypher"][property_name]
def _init_namespaces(self, graph: Graph):
"""
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.
Args:
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
Returns:
None
"""
# add biocypher standard to self.rdf_namespaces
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
if not self.rdf_namespaces:
self.rdf_namespaces = biocypher_standard
else:
self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
for key, value in self.rdf_namespaces.items():
namespace = Namespace(value)
self.rdf_namespaces[key] = namespace
graph.bind(key, namespace)

View File

@ -0,0 +1,76 @@
from more_itertools import peekable
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.in_memory._pandas import Pandas
class _PandasCSVWriter(_Writer):
"""
Class for writing node and edge representations to a CSV file.
"""
def __init__(self, *args, write_to_file: bool = True, **kwargs):
kwargs["write_to_file"] = write_to_file
super().__init__(*args, **kwargs)
self.in_memory_dfs = {}
self.stored_dfs = {}
self.pandas_in_memory = Pandas(
translator=self.translator,
deduplicator=self.deduplicator,
)
self.delimiter = kwargs.get("delimiter")
if not self.delimiter:
self.delimiter = ","
self.write_to_file = write_to_file
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
import_call = "import pandas as pd\n\n"
for df_name in self.stored_dfs.keys():
import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_pandas_csv.py"
def _write_node_data(self, nodes) -> bool:
passed = self._write_entities_to_file(nodes)
return passed
def _write_edge_data(self, edges) -> bool:
passed = self._write_entities_to_file(edges)
return passed
def _write_entities_to_file(self, entities) -> bool:
"""Function to write the entities to a CSV file.
Args:
entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
"""
entities = peekable(entities)
entity_list = self.pandas_in_memory._separate_entity_types(entities)
for entity_type, entities in entity_list.items():
self.in_memory_dfs[
entity_type
] = self.pandas_in_memory._add_entity_df(entity_type, entities)
for entity_type in self.in_memory_dfs.keys():
entity_df = self.in_memory_dfs[entity_type]
if " " in entity_type or "." in entity_type:
entity_type = entity_type.replace(" ", "_").replace(".", "_")
if self.write_to_file:
logger.info(
f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
)
entity_df.to_csv(
f"{self.output_directory}/{entity_type}.csv",
sep=self.delimiter,
)
self.stored_dfs[entity_type] = entity_df
self.in_memory_dfs = {}
return True

View File

@ -0,0 +1,320 @@
import os
import glob
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _PostgreSQLBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by PostgreSQL for the use of "COPY FROM...". Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
PostgreSQL-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
DATA_TYPE_LOOKUP = {
"str": "VARCHAR", # VARCHAR needs limit
"int": "INTEGER",
"long": "BIGINT",
"float": "NUMERIC",
"double": "NUMERIC",
"dbl": "NUMERIC",
"boolean": "BOOLEAN",
"str[]": "VARCHAR[]",
"string[]": "VARCHAR[]",
}
def __init__(self, *args, **kwargs):
self._copy_from_csv_commands = set()
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the psql command
"""
return ""
def _get_data_type(self, string) -> str:
try:
return self.DATA_TYPE_LOOKUP[string]
except KeyError:
logger.info(
f'Could not determine data type {string}. Using default "VARCHAR"'
)
return "VARCHAR"
def _write_array_string(self, string_list) -> str:
"""
Method to write the string representation of an array into a .csv file
as required by the PostgreSQL COPY command, with '{','}' brackets and ',' separation.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for postgres COPY
"""
string = ",".join(string_list)
string = f'"{{{string}}}"'
return string
def _get_import_script_name(self) -> str:
"""
Returns the name of the psql import script
Returns:
str: The name of the import script (ending in .sh)
"""
return f"{self.db_name}-import-call.sh"
def _adjust_pascal_to_psql(self, string):
string = string.replace(".", "_")
string = string.lower()
return string
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts = f"{pascal_label}-part*.csv"
parts_paths = os.path.join(self.outdir, parts)
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = ["_ID VARCHAR"]
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
columns.append(f"{col_name} {col_type}")
columns.append("_LABEL VARCHAR[]")
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_nodes.add(table_create_command_path)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = []
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
if col_name == "_id":
# should ideally never happen
raise ValueError(
"Column name '_ID' is reserved for internal use, "
"denoting the relationship ID. Please choose a "
"different name for your column."
)
columns.append(f"{col_name} {col_type}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [
"_START_ID VARCHAR",
"_ID VARCHAR",
*columns,
"_END_ID VARCHAR",
"_TYPE VARCHAR",
]
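# resulting column order in the table: _START_ID, _ID, <edge properties>, _END_ID, _TYPE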
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_edges.add(table_create_command_path)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for postgresql import
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
if self.db_password:
# set password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += (
f"{self.import_call_bin_prefix}psql -f {import_file_path}"
)
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
# copy data to tables
for command in self._copy_from_csv_commands:
table_part = command.split(" ")[3]
import_call += f'echo "Importing {table_part}..."\n'
if self.db_password:
# set password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call

View File

@ -0,0 +1,51 @@
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
"""
Class for writing node and edge representations to a SQLite database.
It uses the _PostgreSQLBatchWriter class under the hood, which already
implements the logic to write the nodes/edges to a relational DBMS.
Only the import bash script differs between PostgreSQL and SQLite
and is therefore implemented in this class.
- _construct_import_call
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for sqlite import
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
import_call += '\necho "Done!"\n'
import_call += "\n"
for command in self._copy_from_csv_commands:
table_name = command.split(" ")[1]
table_part = command.split(" ")[3].replace("'", "")
import_call += f'echo "Importing {table_part}..."\n'
separator = self.delim
import_part = f".import {table_part} {table_name}"
import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call

View File

@ -0,0 +1,18 @@
# add your settings here (overriding the defaults)
biocypher:
dbms: neo4j
offline: true
#debug: true
output_directory: /neo4j_import # comment this line out for debugging, so that BioCypher creates a new folder for each run in /biocypher-out
schema_config_path: config/automated_schema.yaml
head_ontology:
url: config/head_ontology/biolink-model.owl.ttl
root_node: entity
neo4j:
delimiter: '\t'
array_delimiter: '|'
skip_duplicate_nodes: true
skip_bad_relationships: true

File diff suppressed because it is too large

View File

@ -0,0 +1,283 @@
Title: BioCypher graph schema configuration file
# This configuration file establishes the hierarchy and connectivity in a newly
# set-up BioCypher property graph database. Naming should adhere to Biolink
# nomenclature (available at https://biolink.github.io/biolink-model/ or via
# the python module 'biolink-model-toolkit').
# The BioCypher YAML file specifies only the leaves of the hierarchy tree of
# the desired graph; the hierarchical structure of entities will be derived
# from the Biolink model + BRO model. Thus, only the immediate constituents
# of the graph need to be specified in the schema config.
# ---
# "Named Things"
# ---
# The implementation of named things is fairly straightforward, since they are
# usually represented in node form, which is also the Biolink recommendation.
# The same is not true for associations.
#
# A little more complex is the representation of aggregates of named things.
clinicalStatus:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: clinicalStatus
properties:
coding_system: str
label: str
coding_code: str
Condition:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Condition
properties:
input_format: HL7 FHIR
data_specification: Medical Informatics Initiative Germany Core Data Set, Basic Modules
diagnosis:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: diagnosis
properties:
type.coding_code: str
sequence: str
label: str
type.coding_system: str
DiagnosticReport:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: DiagnosticReport
properties:
resourceType: str
label: str
status: str
id: str
Encounter:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Encounter
properties:
resourceType: str
label: str
status: str
id: str
identifier:
is_a: Attribute
represented_as: node
preferred_id: fhir_id
label_in_input: identifier
properties:
label: str
value: str
system: str
interpretation: #
is_a: named thing
represented_as: node
preferred_id: fhir_id
label_in_input: interpretation
properties:
extension.valueCoding_system: str
extension_url: str
extension.valueCoding_display: str
coding_code: str
coding_system: str
label: str
extension.valueCoding_code: str
maritalStatus:
is_a: OrganismAttribute
represented_as: node
preferred_id: fhir_id
label_in_input: maritalStatus
properties:
label: str
coding_system: str
coding_code: str
Observation:
is_a: ClinicalEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Observation
properties:
resourceType: str
label: str
effectiveDateTime: str
status: str
id: str
Organization:
is_a: AdministrativeEntity
represented_as: node
preferred_id: fhir_id
label_in_input: Organization
properties:
label: str
id: str
name: str
resourceType: str
Patient:
is_a: Human
represented_as: node
preferred_id: fhir_id
label_in_input: Patient
properties:
resourceType: str
label: str
gender: str
id: str
birthDate: str
Procedure:
# is_a: Procedure
represented_as: node
preferred_id: fhir_id
label_in_input: Procedure
properties:
label: str
performedDateTime: str
resourceType: str
status: str
id: str
referenceRange: #
is_a: named thing
represented_as: node
preferred_id: fhir_id
label_in_input: referenceRange
properties:
high_system: str
high_value: str
high_code: str
label: str
high_unit: str
search: #
is_a: named thing
represented_as: node
preferred_id: fhir_id
label_in_input: search
properties:
label: str
mode: str
type:
is_a: Attribute
represented_as: node
preferred_id: fhir_id
label_in_input: type
properties:
coding_system: str
label: str
coding_code: str
coding_display: str
verificationStatus:
is_a: Attribute
represented_as: node
preferred_id: fhir_id
label_in_input: verificationStatus
properties:
coding_system: str
label: str
coding_code: str
coding_display: str
# ---
# Associations
# ---
# Associations are not supposed to be represented in node form as per the
# specifications of Biolink. However, in an analytic context, it often makes
# sense to represent interactions as nodes in Neo4j, because it enables, for
# instance, the annotation of a relationship with a publication as source of
# evidence (also known as reification in the knowledge graph world).
# The Biolink specifications for these types of relationships do
# not go into depth; for example, the hierarchy for molecular interactions
# (ie, "associations") ends at "PairwiseMolecularInteraction", there are no
# explicit terms for protein-protein-interaction, phosphorylation, miRNA-
# targeting, etc. Biolink proposes to use interaction identifiers from
# ontologies, such as https://www.ebi.ac.uk/ols/ontologies/mi/.
# association to connect anything to an identifier node
# if functional, includes:
# IDENTIFIED_BY_Condition_Identifier,
# IDENTIFIED_BY_DiagnosticReport_Identifier,
# IDENTIFIED_BY_Encounter_Identifier,
# IDENTIFIED_BY_Observation_Identifier,
# IDENTIFIED_BY_Organization_Identifier
# IDENTIFIED_BY_Patient_Identifier,
# IDENTIFIED_BY_Procedure_Identifier
condition to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Condition_Identifier
diagnostic report to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_DiagnosticReport_Identifier
observation to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Observation_Identifier
observation derived from observation association:
is_a: association
represented_as: edge
label_in_input: DERIVED_FROM_Observation_Observation
observation has member observation association:
is_a: association
represented_as: edge
label_in_input: HAS_MEMBER_Observation_Observation
procedure to identifier association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Procedure_Identifier
procedure to diagnostic report association:
is_a: association
represented_as: edge
label_in_input: IDENTIFIED_BY_Procedure_Identifier
procedure reasoned by observation association:
is_a: association
represented_as: edge
label_in_input: HAS_REASON_REFERENCE_Procedure_Observation
procedure performer is practitioner association:
is_a: association
represented_as: edge
label_in_input: HAS_ACTOR_ProcedurePerformer_Practitioner
#represented_as: edge
#label_in_input: DERIVED_FROM_Observation_Observation:
#represented_as: edge
#label_in_input: DERIVED_FROM_Observation_Observation
#protein interaction:
# is_a: Pairwise molecular interaction
# represented_as: edge
# label_in_input: protein_protein_interaction
#protein to disease association:
# is_a: Association
# represented_as: edge
# label_in_input: protein_disease_association

76
docker-compose.yml Normal file
View File

@ -0,0 +1,76 @@
services:
neo4j:
image: neo4j:5.7
environment:
- NEO4J_AUTH=${NEO4J_AUTH:-neo4j/password}
- NEO4J_PLUGINS=["apoc"]
- NEO4J_server_config_strict__validation_enabled=false
- NEO4J_apoc_export_file_enabled=true
- NEO4J_apoc_import_file_enabled=true
- NEO4J_apoc_import_file_use__neo4j__config=true
- SHARED_PATH=/neo4j_import
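# Import handshake with the python_app container (see entrypoint.sh): once the
# marker file /neo4j_import/ready-to-import appears, the loop below stops Neo4j,
# runs the generated neo4j-admin-import-call.sh, writes /neo4j_import/import-complete
# and starts Neo4j again.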
command: >
bash -c '
echo "running cmd from docker compose" &&
#neo4j start &&
while true; do
if [ -f /neo4j_import/ready-to-import ]; then
echo "Starting import process..."
neo4j stop &&
bash /neo4j_import/neo4j-admin-import-call.sh &&
rm /neo4j_import/ready-to-import &&
touch /neo4j_import/import-complete &&
chmod 777 /neo4j_import/import-complete
neo4j start
echo "The container is running. STR+C will end the bash command and thus, the neo4j container"
fi
sleep 10
done
'
healthcheck:
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"]
interval: 10s
timeout: 5s
retries: 5
ports:
- "8080:7474"
- "8081:7687"
volumes:
- neo4j_data:/neo4j_data
- neo4j_logs:/neo4j_logs
- neo4j_import:/neo4j_import
- ${INPUT_DATA_PATH:-./data}:/input_data
- ./init-scripts:/init-scripts
- ./importData:/importData
python_app:
build:
context: .
dockerfile: Dockerfile
args:
HTTP_PROXY: ${HTTP_PROXY}
HTTPS_PROXY: ${HTTPS_PROXY}
NO_PROXY: ${NO_PROXY}
env_file:
- .env
environment:
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=${NEO4J_USER:-neo4j}
- NEO4J_PASSWORD=${NEO4J_PASSWORD:-password}
- INPUT_DATA_PATH=/input_data
- POETRY_VIRTUALENVS_CREATE=false
- NEO4J_dbms_directories_import=/neo4j_import
volumes:
- neo4j_import:/neo4j_import
- ${INPUT_DATA_PATH:-./data}:/input_data
- ./importData:/importData # Share the import data directory
# depends_on:
# neo4j:
# condition: service_healthy
# Define named volumes
volumes:
neo4j_data:
neo4j_logs:
neo4j_import:

38
entrypoint.sh Normal file
View File

@ -0,0 +1,38 @@
#!/bin/bash
set -e
chmod -R 777 /neo4j_import # make the dir accessible for both the python app and neo4j
#echo "Waiting for Neo4j to be ready... ..."
#python wait-for-neo4j.py
#if [ $? -ne 0 ]; then
# echo "Failed to connect to Neo4j"
# exit 1
#fi
echo "Running Python data processing script..."
poetry run python import_fhir_to_nx_diGraph.py
echo "Running Neo4j import..."
# Wait a bit before attempting database operations
sleep 5
while [ ! -f /neo4j_import/shell-scipt-complete ]; do
echo "Waiting for shell-script file"
sleep 5
done
# Create a signal file that we've prepared the data
touch /neo4j_import/ready-to-import
chmod -R 777 /neo4j_import/ready-to-import
# Wait for import to complete by monitoring a completion file
echo "Waiting for Neo4j import to complete..."
while [ ! -f /neo4j_import/import-complete ]; do
echo "Waiting for import-complete file"
sleep 5
done
echo "Database setup complete!"

66
fhirImport.py Normal file
View File

@ -0,0 +1,66 @@
import requests
from typing import List, Dict, Any
from dotenv import load_dotenv
import os
from requests.auth import HTTPBasicAuth
# Load environment variables from .env file
load_dotenv()
def getBundle(url: str, search: str):
headers = {
'Accept': 'application/fhir+json',
'Content-Type': 'application/fhir+json'
}
# Get configuration from environment variables
mode = os.getenv('MODE')
fhir_server = os.getenv('FHIR_SERVER_URL')
username = None
password = None
if mode != 'testserver':
username = os.getenv('FHIR_SERVER_USER')
password = os.getenv('FHIR_SERVER_PW')
if not fhir_server:
raise ValueError("FHIR_SERVER_URL not found in environment variables")
if mode != 'testserver' and (not username or not password):
raise ValueError("FHIR_SERVER_USER and FHIR_SERVER_PW must be set in environment variables")
# Setup basic authentication (only used when not talking to the test server)
auth = HTTPBasicAuth(username, password)
if url is not None:
link = url + '?_format=json'
else:
link = fhir_server + search + '&_format=json'
#print(link)
if mode != 'testserver':
response = requests.get(
link,
headers=headers,
auth=auth
)
else:
response = requests.get(
link,
headers=headers,
)
return response
def getPatientEverything(id: str):
search = '/Patient/' + id + '/$everything?'
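# resulting request (built in getBundle): <FHIR_SERVER_URL>/Patient/<id>/$everything?&_format=json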
return getBundle(None, search)
# Example usage
if __name__ == "__main__":
bundles = getBundle(None, '/Patient?_count=10') # example query; adjust the search as needed
data = bundles.json()
# Process the bundles
for entry in data['entry']:
print(f"{entry['fullUrl']}")

View File

@ -0,0 +1,43 @@
import json
import networkx as nx
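# Example of the produced structure: loading {"resource": {"code": {"text": "x"}}}
# under a root node "abc_bundle" creates the nodes "abc_bundle",
# "abc_bundle.resource" and "abc_bundle.resource.code", connected by edges whose
# edge_type is the JSON key; the leaf value "x" is stored as attribute "text" on
# "abc_bundle.resource.code".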
def add_nodes_from_dict(graph, parent_node, current_dict):
for key, value in current_dict.items():
if isinstance(value, dict):
# Create a new node for the nested dictionary
new_node = f"{parent_node}.{key}"
graph.add_node(new_node, label=key)
# Add an edge from the parent node to the new node
graph.add_edge(parent_node, new_node, edge_type=key)
# Recurse into the nested dictionary
add_nodes_from_dict(graph, new_node, value)
elif isinstance(value, list):
# if list doesn't contain any nested dictionaries, make it a value in the node
if not any(isinstance(item, dict) for item in value):
graph.nodes[parent_node][key] = value
else:
# Process each dictionary in the list
for index, item in enumerate(value):
if isinstance(item, dict):
if len(value)>1:
item_node = f"{parent_node}.{key}[{index}]"
else:
item_node = f"{parent_node}.{key}"
graph.add_node(item_node, label=key)
graph.add_edge(parent_node, item_node, edge_type=key)
add_nodes_from_dict(graph, item_node, item)
else:
# For non-dict and non-list values, add them as attributes to the parent node
graph.nodes[parent_node][key] = value
def add_json_to_networkx(json_data, bundle_name, graph):
if not isinstance(graph, nx.DiGraph):
raise ValueError("The provided graph must be a networkx.DiGraph")
root_node = bundle_name+'_bundle'
graph.add_node(root_node, label='root')
add_nodes_from_dict(graph, root_node, json_data)

View File

@ -0,0 +1,40 @@
import networkx as nx
class Resource:
def __init__(self, resource_type):
self.resource_type = resource_type
def create_resource_class(resource_type):
return type(resource_type, (Resource,), {})
def set_resource_type(graph):
for node, data in graph.nodes(data=True):
print(node, data)
print("-----------------------------")
nodes_to_replace = []
for node, data in graph.nodes(data=True):
print(isinstance(node, Resource), node, type(node))
if isinstance(node, Resource):
print("Found a resource: ", node)
resource_type = node.resource_type
if resource_type:
# Dynamically create a new class based on the resource_type
NewResourceClass = create_resource_class(resource_type)
new_node = NewResourceClass(resource_type)
nodes_to_replace.append((node, new_node, data))
else:
print(f"Warning: Node {node} is a resource but has no resource_type")
# Replace old nodes with new ones
for old_node, new_node, data in nodes_to_replace:
graph.add_node(new_node, **data)
for pred in graph.predecessors(old_node):
graph.add_edge(pred, new_node)
for succ in graph.successors(old_node):
graph.add_edge(new_node, succ)
graph.remove_node(old_node)
"""
for node, data in graph.nodes(data=True):
print(node, data) """

View File

@ -0,0 +1,102 @@
import networkx as nx
def parse_synthea_reference(ref):
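# Synthea references come in three forms (see the bundled test data):
#   "urn:uuid:<id>"                      -> take the id (split on ':')
#   "<Type>?identifier=<system>|<value>" -> take the value (split on '|')
#   "#<contained-id>"                    -> contained resource, treated as 'mock'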
if not ref.startswith('#'):
#print("reference: ", ref)
if '?' in ref and '|' in ref:
parsed_ref = ref.split('|')[1]
# elif '/' in ref:
# parsed_ref = ref.split('/')[1]
else:
parsed_ref = ref.split(':')[2]
else:
parsed_ref = 'mock'
return(parsed_ref)
def process_references(graph):
isSynthea = False
nodes_with_reference = [[n, attr['reference']] for n, attr in graph.nodes(data=True) if 'reference' in attr]
directly_referenced_nodes = []
indirectly_referenced_nodes = []
dummy_references = []
if isSynthea:
nodes_with_mock_reference = []
for i in range(len(nodes_with_reference)):
reference = nodes_with_reference[i][1]
parsed_reference = parse_synthea_reference(reference)
if parsed_reference != 'mock':
nodes_with_reference[i].append(parsed_reference)
else:
nodes_with_mock_reference.append(i)
for i in sorted(nodes_with_mock_reference, reverse=True):
del nodes_with_reference[i]
id_to_node = {data["id"]: node for node, data in graph.nodes(data=True) if "id" in data}
id_to_identifier_node = {data["value"]: node for node, data in graph.nodes(data=True) if ("value" in data and data['label'] == 'identifier')}
for i in nodes_with_reference:
ref_id=i[2]
if ref_id in id_to_node.keys():
directly_referenced_nodes.append([i[0], id_to_node[ref_id]])
elif ref_id in id_to_identifier_node.keys():
indirectly_referenced_nodes.append([i[0], id_to_identifier_node[ref_id]])
#else:
# print("KEY ERROR: Key neither in to_node nor in to_identifier_node", i)
for i in indirectly_referenced_nodes:
node_from=list(graph.predecessors(i[0]))[0]
node_to=list(graph.predecessors(i[1]))[0]
ref_type=graph.nodes[i[0]]['label']
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
else:
#for node, data in graph.nodes(data=True):
# if "id" in data:
# if not "resourceType" in data:
# print("FAILS AT: ", data, node)
id_to_node = {data["resourceType"]+'/'+data["id"]: node for node, data in graph.nodes(data=True) if ("id" in data and "resourceType" in data)}
for i in nodes_with_reference:
ref_id=i[1]
if ref_id in id_to_node.keys():
directly_referenced_nodes.append([i[0], id_to_node[ref_id]])
else:
dummy_references.append([i[0], ref_id])
for i in directly_referenced_nodes:
node_from=list(graph.predecessors(i[0]))[0]
node_to=i[1]
ref_type=graph.nodes[i[0]]['label']
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
for i in dummy_references:
#print(i)
node_to='dummy_' + i[1]
graph.add_node(node_to, label='dummy', unique_id=i[1])
node_from=list(graph.predecessors(i[0]))[0]
ref_type=graph.nodes[i[0]]['label']
graph.add_edge(node_from, node_to, edge_type='reference', reference_type=ref_type)
#graph.remove_nodes_from([i[0] for i in nodes_with_reference])
graph.remove_nodes_from([i[0] for i in directly_referenced_nodes])
graph.remove_nodes_from([i[0] for i in indirectly_referenced_nodes])
graph.remove_nodes_from([i[0] for i in dummy_references])
nodes_to_remove = [n for n, attr in graph.nodes(data=True) if attr.get('label') in ['root', 'entry', 'request']]
graph.remove_nodes_from(nodes_to_remove)
#graph.remove_nodes_from(list(nx.isolates(graph)))

View File

@ -0,0 +1,107 @@
import networkx as nx
def find_paths(graph, start_node):
def is_leaf(node):
#Checks if a node is a leaf (no outgoing edges)
return graph.out_degree(node) == 0
def custom_dfs(path, reference_count):
#Performs a DFS to find paths for both patterns
current_node = path[-1]
'''if the current node is labeled 'resource', the path length is greater than 3,
and we have exactly one 'reference' edge in the path'''
if len(path) > 3 and graph.nodes[current_node].get('label') == 'resource' and reference_count == 1:
# add path to the list of property paths containing a reference
reference_paths.append(list(path))
'''if the current node is a leaf node (no outgoing edges),
the path length is greater than 2, and we have no references in the path'''
if len(path) > 2 and is_leaf(current_node) and reference_count == 0:
'''add path to the dictionary of property paths ending in leaves,
by the corresponding property key'''
leaf_paths.setdefault(path[1].split('.')[-1], []).extend(list(path))
# check neighbors
for neighbor in graph.successors(current_node):
edge_type = graph.edges[current_node, neighbor].get('edge_type', None)
new_reference_count = reference_count + (1 if edge_type == 'reference' else 0)
# continue the search only if we have at most one 'reference' edge so far
if new_reference_count <= 1:
custom_dfs(path + [neighbor], new_reference_count)
reference_paths = []
leaf_paths = {}
custom_dfs([start_node], 0)
return reference_paths, leaf_paths
def property_convolution(graph):
# Find all nodes with label 'resource'
resource_nodes = [n for n, attr in graph.nodes(data=True) if attr.get('label') == 'resource']
#print("Got all nodes with label 'resource'", flush=True)
'''collect all paths starting with a resource node, that contain one reference edge,
end with a resource node and are >3 nodes long'''
'''collect all paths starting with a resource node, that do not contain reference edges,
end with a leaf node and are >2 nodes long'''
property_paths_with_reference = []
property_paths_with_leaves = {}
for resource_node in resource_nodes:
temp_ref_paths, temp_leaf_paths = find_paths(graph, resource_node)
# add paths to the list of property paths containing a reference, for all nodes
property_paths_with_reference.extend(temp_ref_paths)
# add paths to the dictionary of property paths ending in leaves, by the corresponding resource key
property_paths_with_leaves[resource_node] = temp_leaf_paths
# print("Collected all paths", flush=True)
# transfer reference edge to first property node for all reference paths
for i in property_paths_with_reference:
ref_edge_data = graph.get_edge_data(i[-2], i[-1])
ref_type = ref_edge_data.get('reference_type')
graph.remove_edge(i[-2], i[-1])
graph.add_edge(i[1], i[-1], edge_type='reference', reference_type=ref_type)
'''after transference, add the modified reference path (that now ends in a leaf)
to the dictionary of leaf paths, by corresponding resource and property keys'''
property_paths_with_leaves[i[0]].setdefault(i[1].split('.')[-1], []).extend(i[:-1])
#print("Transfered all references edges", flush=True)
'''create a list of collections of property paths ending in leaves,
removing duplicate nodes from each path collection'''
list_property_paths_with_leaves = [list(dict.fromkeys(i)) for j in property_paths_with_leaves.values() for i in j.values()]
nodes_to_remove=[]
for i in list_property_paths_with_leaves:
for j in range(len(i)-1, 1, -1):
source_attributes = graph.nodes[i[j]]
marker='|'.join(i[j].split('resource.')[1].split('.')[1:])
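# e.g. a node "...resource.code.coding" passes its "system" attribute to the
# first property node "...resource.code" as "coding_system"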
# transfer attributes to first property node
for attr, value in source_attributes.items():
if attr != 'label':
graph.nodes[i[1]][marker+'_'+attr] = value
nodes_to_remove.append(i[j])
#print("Transferred attributes for all paths", flush=True)
graph.remove_nodes_from(nodes_to_remove)
for i in resource_nodes:
unique_resource_id = graph.nodes[i]['resourceType']+'/'+graph.nodes[i]['id']
graph.nodes[i]['unique_id'] = unique_resource_id
for j in graph.successors(i):
if graph[i][j].get('edge_type') != 'reference':
graph.nodes[j]['unique_id'] = unique_resource_id+'/'+j.split('.')[-1]

View File

@ -0,0 +1,276 @@
from biocypher import BioCypher
import networkx as nx
import json
import os
import sys
import re
import uuid
import gc
from dotenv import load_dotenv
from graphCreation import create_graph
from graphCreation.process_references import process_references
from graphCreation.property_convolution import property_convolution
from schema_config_generation import write_automated_schema
from fhirImport import getPatientEverything, getBundle
def load_multiple_fhir_patients(n):
#graph = nx.DiGraph()
init = True
ids = []
#get n ids
nextIds = True
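# page through the Patient search results via the bundle's 'next' link until
# n patient ids have been collected or no further page exists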
while len(ids) < n and nextIds:
if init:
complex = os.getenv('COMPLEX_PATIENTS')
if complex and complex.upper() != 'TRUE':
bundle = getBundle(None, '/Patient?_count=' + str(n))
else:
bundle = getBundle(None, '/Patient?_has:Observation:subject:status=final&_count=' + str(n))
else:
bundle = getBundle(None, nextLink)
if not 'entry' in bundle.json():
print("ERROR -- No data found in the fhir bundle. Check the request and if the server is up and responding")
sys.exit(1)
for entry in bundle.json()['entry']:
ids.append(entry['resource']['id'])
nextIds = False
for l in bundle.json()['link']:
if l['relation'] == "next":
nextLink = l['url']
nextIds = True
if len(ids) < n:
n = len(ids)
batchSize = int(os.getenv('BATCH_SIZE'))
c = 0
print(len(ids))
#get bundle for each ID
for id in ids:
c += 1
bundle = getPatientEverything(id).json()
bundle = replace_single_quotes(bundle) ### maybe not needed for German data
if init:
graph = nx.DiGraph()
init = False
create_graph.add_json_to_networkx(bundle, id + '_bundle', graph)
if c % 50 == 0:
print("---------- ", c, " patients loaded ----------", flush=True)
if c % batchSize == 0 or c == n:
print(c, " patients imported, reducing graph", flush = True)
process_references(graph)
property_convolution(graph)
lastChunk = False
if n == c:
lastChunk = True
runBioCypher(graph, lastChunk)
init = True
print(graph)
del graph
gc.collect()
def replace_single_quotes(obj):
if isinstance(obj, str): # If it's a string, replace single quotes
return obj.replace("'", "''")
elif isinstance(obj, dict): # If it's a dictionary, process each key-value pair
return {key: replace_single_quotes(value) for key, value in obj.items()}
elif isinstance(obj, list): # If it's a list, process each item
return [replace_single_quotes(item) for item in obj]
else:
return obj # Leave other data types unchanged
def main():
## create networkX and run improvement scripts
print("Creating the graph...", flush=True)
nPatients = int(os.getenv('NUMBER_OF_PATIENTS'))
load_multiple_fhir_patients(nPatients)
def runBioCypher(nxGraph, final):
#get lists of node and edge types
print("Generate auto schema...", flush=True)
write_automated_schema(nxGraph, 'config/automated_schema.yaml', 'config/manual_schema_config.yaml')
# create Biocypher driver
bc = BioCypher(
biocypher_config_path="config/biocypher_config.yaml",
)
#bc.show_ontology_structure() #very extensive
# BioCypher preparation
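# node_generator and edge_generator yield the tuples expected by
# BioCypher.write_nodes / write_edges as used below:
#   nodes: (node_id, node_label, properties_dict)
#   edges: (edge_id, source_id, target_id, edge_label, properties_dict)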
def node_generator():
for node in nxGraph.nodes():
label = nxGraph.nodes[node].get('label')
if label == "resource":
label = nxGraph.nodes[node].get('resourceType')
nxGraph.nodes[node]['label'] = label.capitalize()
label = label.capitalize()
unq_id = nxGraph.nodes[node].get('unique_id', False)
if(nxGraph.nodes[node].get('label') in ['search', 'meta', 'link']):
#print("skipped a node: ", nxGraph.nodes[node].get('label'))
continue
label = nxGraph.nodes[node].get('label')
if(label == 'dummy'):
#print("SKIPPED dummy node: ", unq_id)
continue
yield(
nxGraph.nodes[node].get('unique_id', node), # remark: returns the 'unique_id' attribute if it exists, otherwise the networkx node identifier
label,
nxGraph.nodes[node] # get properties
)
def edge_generator():
for edge in nxGraph.edges(data = True):
source, target, attributes = edge
sLabel = nxGraph.nodes[source].get('label')
if sLabel == 'resource':
sLabel = nxGraph.nodes[source].get('resourceType')
tLabel = nxGraph.nodes[target].get('label')
if tLabel == 'resource':
tLabel = nxGraph.nodes[target].get('resourceType')
label = sLabel.capitalize() + '_to_' + tLabel
yield(
attributes.get('id', str(uuid.uuid4())), # Edge ID (if exists, otherwise use nx internal id)
nxGraph.nodes[source].get('unique_id', source),
nxGraph.nodes[target].get('unique_id', target),
label,
attributes # All edge attributes
)
#import nodes
bc.write_nodes(node_generator())
bc.write_edges(edge_generator())
#write the import script -- we are creating our own script since BC would only consider the last batch as an input
if final:
print("CREATING THE SCRIPT")
generate_neo4j_import_script()
with open('/neo4j_import/shell-scipt-complete', 'w') as f:
f.write('Import completed successfully')
print("FHIR import completed successfully")
def generate_neo4j_import_script(directory_path="/neo4j_import/", output_file="neo4j-admin-import-call.sh"):
"""
Reads files in a directory and generates a Neo4j import shell script.
Args:
directory_path (str): Path to the directory containing CSV files
output_file (str): Name of the output shell script file
Returns:
str: Path to the generated shell script
"""
# Get all files in the directory
all_files = os.listdir(directory_path)
# Dictionary to store entity types (nodes and relationships)
entity_types = {}
# Find all header files and use them to identify entity types
for filename in all_files:
if '-header.csv' in filename:
entity_name = filename.split('-header.csv')[0]
# Check if it's a relationship (contains "To" and "Association")
is_relationship = "To" in entity_name and "Association" in entity_name
# Store in entity_types dictionary
if is_relationship:
entity_type = "relationships"
else:
entity_type = "nodes"
# Initialize the entity if not already present
if entity_name not in entity_types:
entity_types[entity_name] = {
"type": entity_type,
"header": f"/neo4j_import/{filename}",
"has_parts": False
}
# Check for part files for each entity
for entity_name in entity_types:
# Create pattern to match part files for this entity
part_pattern = f"{entity_name}-part"
# Check if any file matches the pattern
for filename in all_files:
if part_pattern in filename:
entity_types[entity_name]["has_parts"] = True
break
# Generate the import commands
nodes_command = ""
relationships_command = ""
for entity_name, info in entity_types.items():
if info["has_parts"]:
# Create the command string with wildcard for part files
command = f" --{info['type']}=\"{info['header']},/neo4j_import/{entity_name}-part.*\""
# Add to appropriate command string
if info['type'] == "nodes":
nodes_command += command
else: # relationships
relationships_command += command
# Create the shell script content
script_content = """#!/bin/bash
version=$(bin/neo4j-admin --version | cut -d '.' -f 1)
if [[ $version -ge 5 ]]; then
\tbin/neo4j-admin database import full neo4j --delimiter="\\t" --array-delimiter="|" --quote="'" --overwrite-destination=true --skip-bad-relationships=true --skip-duplicate-nodes=true{nodes}{relationships}
else
\tbin/neo4j-admin import --database=neo4j --delimiter="\\t" --array-delimiter="|" --quote="'" --force=true --skip-bad-relationships=true --skip-duplicate-nodes=true{nodes}{relationships}
fi
""".format(nodes=nodes_command, relationships=relationships_command)
# Write the script to file
script_path = os.path.join(directory_path, output_file)
with open(script_path, 'w') as f:
f.write(script_content)
# Make the script executable
os.chmod(script_path, 0o755)
print("Shell import script created", flush=True)
if __name__ == "__main__":
main()

199
import_nx_diGraph.py Normal file
View File

@ -0,0 +1,199 @@
from biocypher import BioCypher
import networkx as nx
import json
import os
import uuid
#from networkx_based import create_graph
from graphCreation import create_graph
#from networkx_based.process_references import process_references
from graphCreation.process_references import process_references
#from networkx_based.property_convolution import property_convolution
from graphCreation.property_convolution import property_convolution
from schema_config_generation import write_automated_schema
#from networkx_based.node_typing import set_ressource_type
from graphCreation.node_typing import set_resource_type
def load_multiple_fhir_bundles(directory_path):
graph = nx.DiGraph()
init = True
#limit = 2
# Iterate over all files in the directory
for filename in os.listdir(directory_path):
if filename.endswith('.json'): # Assuming FHIR bundles are in JSON format
file_path = os.path.join(directory_path, filename)
with open(file_path, 'r') as f:
bundle_json = json.load(f)
#fix all strings to enable ' in neo4j
fixedQuotes = replace_single_quotes(bundle_json)
if init:
#print(bundle_json, filename, graph)
create_graph.add_json_to_networkx(fixedQuotes, filename, graph)
init = False
else:
create_graph.add_json_to_networkx(fixedQuotes, filename, graph)
print("Imported: ", filename)
#if limit == 0:
# return graph
#limit = limit - 1
return graph
def replace_single_quotes(obj):
if isinstance(obj, str): # If it's a string, replace single quotes
return obj.replace("'", "''")
elif isinstance(obj, dict): # If it's a dictionary, process each key-value pair
return {key: replace_single_quotes(value) for key, value in obj.items()}
elif isinstance(obj, list): # If it's a list, process each item
return [replace_single_quotes(item) for item in obj]
else:
return obj # Leave other data types unchanged
def main():
#get a list of nodes that should be imported
## create networkX and run improvement scripts
print("Creating the graph...", flush=True)
nxGraph = load_multiple_fhir_bundles('./testData/') # 'mockData' for unit test data, 'testData' for Synthea files
print(nxGraph)
print("Reducing references...", flush=True)
process_references(nxGraph)
print(nxGraph)
print("Convolute references...", flush=True)
property_convolution(nxGraph)
print(nxGraph)
#Set types of all resource nodes to resource_type
#set_resource_type(nxGraph)
#get lists of node and edge types
""" all_nLabels = set()
all_eLabels = set()
for node, attrs in nxGraph.nodes(data=True):
for attr_name, attr_value in attrs.items():
if attr_name == "label":
all_nLabels.add(attr_value)
for nt in all_nLabels:
print(nt)
print("-" * 50)
for u, v, attrs in nxGraph.edges(data=True):
u_label = nxGraph.nodes[u]['label']
if u_label == "resource":
u_label = nxGraph.nodes[u]['resourceType']
v_label = nxGraph.nodes[v]['label']
if v_label == "resource":
v_label = nxGraph.nodes[v]['resourceType']
all_eLabels.add(u_label + " to " + v_label)
for et in all_eLabels:
print(et)
print("-" * 50)
print("...end")
return """
print("Generate auto schema...")
write_automated_schema(nxGraph, 'config/automated_schema.yaml')
# create Biocypher driver
bc = BioCypher(
biocypher_config_path="config/biocypher_config.yaml",
#schema_config_path="/config/manual_schema_config.yaml"
)
bc.show_ontology_structure()
# BioCypher preparation
## node generator: extract id, label and property dictionary
def node_generator():
for node in nxGraph.nodes():
""" #single qoutes break neo4j import, e.g. 'CHILDREN'S Hospital'
checkDisplay = nxGraph.nodes[node].get('display')
if checkDisplay:
checkDisplay = checkDisplay.replace("'", "''")
nxGraph.nodes[node]['display'] = checkDisplay
#print("------->", nxGraph.nodes[node].get('display'))
checkName = nxGraph.nodes[node].get('name')
if checkName:
checkName = checkName.replace("'", "''")
nxGraph.nodes[node]['name'] = checkName
#print("------->", nxGraph.nodes[node].get('name')) """
label = nxGraph.nodes[node].get('label')
if label == "resource":
label = nxGraph.nodes[node].get('resourceType')
'''
elif label == 'identifier':
label = nxGraph.nodes[node].get('system')
print('/' in label)
if '/' in label:
lastSlash = label.rfind('/') + 1
label = label[lastSlash:] + '-ID'
elif label == 'telecom':
label = nxGraph.nodes[node].get('system')
print('/' in label)
if '/' in label:
lastSlash = label.rfind('/') + 1
label = 'telecom-' + label[lastSlash:]
elif label == 'address':
extension = nxGraph.nodes[node].get('extension_url')
print("EX!: ", extension)
if extension:
lastSlash = extension.rfind('/') + 1
label = label + '-' + extension[lastSlash:]
'''
yield(
nxGraph.nodes[node].get('id', node), # remark: returns the 'id' attribute if it exists, otherwise the networkx node identifier
label,
nxGraph.nodes[node] # get properties
)
def edge_generator():
for edge in nxGraph.edges(data = True):
source, target, attributes = edge
sLabel = nxGraph.nodes[source].get('label')
if sLabel == 'resource':
sLabel = nxGraph.nodes[source].get('resourceType')
tLabel = nxGraph.nodes[target].get('label')
if tLabel == 'resource':
tLabel = nxGraph.nodes[target].get('resourceType')
label = sLabel + '_to_' + tLabel
yield(
attributes.get('id', str(uuid.uuid4())), # Edge ID (if exists, otherwise use nx internal id)
nxGraph.nodes[source].get('id', source),
nxGraph.nodes[target].get('id', target),
label,
attributes # All edge attributes
)
#import nodes
bc.write_nodes(node_generator())
bc.write_edges(edge_generator())
#write the import script
bc.write_import_call()
if __name__ == "__main__":
#print("Called import script. Should run its main function now...")
main()

View File

@ -0,0 +1,8 @@
// Example initialization script - modify according to your schema
CREATE CONSTRAINT IF NOT EXISTS FOR (n:YourLabel) REQUIRE n.id IS UNIQUE;
CREATE INDEX IF NOT EXISTS FOR (n:YourLabel) ON (n.someProperty);
// Add any other initialization queries here
// For example:
// CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE n.email IS UNIQUE;
// CREATE INDEX IF NOT EXISTS FOR (n:Product) ON (n.sku);

View File

@ -0,0 +1,210 @@
{
"resourceType": "Bundle",
"type": "transaction",
"entry": [ {
"fullUrl": "urn:uuid:a7a285c0-4714-dd3c-4837-8719c9b67873",
"resource": {
"resourceType": "Patient",
"id": "a7a285c0-4714-dd3c-4837-8719c9b67873",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient" ]
},
"text": {
"status": "generated",
"div": "<div xmlns=\"http://www.w3.org/1999/xhtml\">Generated by <a href=\"https://github.com/synthetichealth/synthea\">Synthea</a>.Version identifier: 3c23908\n . Person seed: -5557164924473669144 Population seed: 1693908535569</div>"
},
"extension": [ {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race",
"extension": [ {
"url": "ombCategory",
"valueCoding": {
"system": "urn:oid:2.16.840.1.113883.6.238",
"code": "2106-3",
"display": "White"
}
}, {
"url": "text",
"valueString": "White"
} ]
}, {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity",
"extension": [ {
"url": "ombCategory",
"valueCoding": {
"system": "urn:oid:2.16.840.1.113883.6.238",
"code": "2186-5",
"display": "Not Hispanic or Latino"
}
}, {
"url": "text",
"valueString": "Not Hispanic or Latino"
} ]
}, {
"url": "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName",
"valueString": "Leana211 Sauer652"
}, {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex",
"valueCode": "M"
}, {
"url": "http://hl7.org/fhir/StructureDefinition/patient-birthPlace",
"valueAddress": {
"city": "Quincy",
"state": "Massachusetts",
"country": "US"
}
}, {
"url": "http://synthetichealth.github.io/synthea/disability-adjusted-life-years",
"valueDecimal": 0.0
}, {
"url": "http://synthetichealth.github.io/synthea/quality-adjusted-life-years",
"valueDecimal": 1.0
} ],
"identifier": [
{
"system": "https://github.com/synthetichealth/synthea",
"value": "a7a285c0-4714-dd3c-4837-8719c9b67873"
},
{
"type": {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v2-0203",
"code": "MR",
"display": "Medical Record Number"
} ],
"text": "Medical Record Number"
},
"system": "http://hospital.smarthealthit.org",
"value": "a7a285c0-4714-dd3c-4837-8719c9b67873"
}, {
"type": {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v2-0203",
"code": "SS",
"display": "Social Security Number"
} ],
"text": "Social Security Number"
},
"system": "http://hl7.org/fhir/sid/us-ssn",
"value": "999-89-9528"
} ],
"name": [ {
"use": "official",
"family": "Schoen8",
"given": [ "Johnny786", "Vince741" ]
} ],
"telecom": [ {
"system": "phone",
"value": "555-753-6560",
"use": "home"
} ],
"gender": "male",
"birthDate": "2021-05-22",
"address": [ {
"extension": [ {
"url": "http://hl7.org/fhir/StructureDefinition/geolocation",
"extension": [ {
"url": "latitude",
"valueDecimal": 42.05921178859317
}, {
"url": "longitude",
"valueDecimal": -70.79219595855132
} ]
} ],
"line": [ "463 Rempel Ranch Unit 81" ],
"city": "Pembroke",
"state": "MA",
"postalCode": "00000",
"country": "US"
} ],
"maritalStatus": {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus",
"code": "S",
"display": "Never Married"
} ],
"text": "Never Married"
},
"multipleBirthBoolean": false,
"communication": [ {
"language": {
"coding": [ {
"system": "urn:ietf:bcp:47",
"code": "en-US",
"display": "English (United States)"
} ],
"text": "English (United States)"
}
} ]
},
"request": {
"method": "POST",
"url": "Patient"
}
}, {
"fullUrl": "urn:uuid:0eb53bda-2881-5e8e-3597-87a9430af96a",
"resource": {
"resourceType": "Encounter",
"id": "0eb53bda-2881-5e8e-3597-87a9430af96a",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-encounter" ]
},
"identifier": [ {
"use": "official",
"system": "https://github.com/synthetichealth/synthea",
"value": "0eb53bda-2881-5e8e-3597-87a9430af96a"
} ],
"status": "finished",
"class": {
"system": "http://terminology.hl7.org/CodeSystem/v3-ActCode",
"code": "AMB"
},
"type": [ {
"coding": [ {
"system": "http://snomed.info/sct",
"code": "410620009",
"display": "Well child visit (procedure)"
} ],
"text": "Well child visit (procedure)"
} ],
"subject": {
"reference": "urn:uuid:a7a285c0-4714-dd3c-4837-8719c9b67873",
"display": "Johnny786 Vince741 Schoen8"
},
"participant": [ {
"type": [ {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/v3-ParticipationType",
"code": "PPRF",
"display": "primary performer"
} ],
"text": "primary performer"
} ],
"period": {
"start": "2021-05-22T00:13:45+02:00",
"end": "2021-05-22T00:28:45+02:00"
},
"individual": {
"reference": "Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999942599",
"display": "Dr. Regenia619 Bosco882"
}
} ],
"period": {
"start": "2021-05-22T00:13:45+02:00",
"end": "2021-05-22T00:28:45+02:00"
},
"location": [ {
"location": {
"reference": "Location?identifier=https://github.com/synthetichealth/synthea|6e3d04a3-9064-33e4-b8b5-63bb468d7629",
"display": "UNITED MEDICAL CARE LLC"
}
} ],
"serviceProvider": {
"reference": "Organization?identifier=https://github.com/synthetichealth/synthea|4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
"display": "UNITED MEDICAL CARE LLC"
}
},
"request": {
"method": "POST",
"url": "Encounter"
}
}]}

View File

@ -0,0 +1,98 @@
{
"resourceType": "Bundle",
"type": "batch",
"entry": [ {
"fullUrl": "urn:uuid:4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
"resource": {
"resourceType": "Organization",
"id": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-organization" ]
},
"extension": [ {
"url": "http://synthetichealth.github.io/synthea/utilization-encounters-extension",
"valueInteger": 9
}, {
"url": "http://synthetichealth.github.io/synthea/utilization-procedures-extension",
"valueInteger": 2
}, {
"url": "http://synthetichealth.github.io/synthea/utilization-labs-extension",
"valueInteger": 1
}, {
"url": "http://synthetichealth.github.io/synthea/utilization-prescriptions-extension",
"valueInteger": 3
} ],
"identifier": [ {
"system": "https://github.com/synthetichealth/synthea",
"value": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
} ],
"active": true,
"type": [ {
"coding": [ {
"system": "http://terminology.hl7.org/CodeSystem/organization-type",
"code": "prov",
"display": "Healthcare Provider"
} ],
"text": "Healthcare Provider"
} ],
"name": "UNITED MEDICAL CARE LLC",
"telecom": [ {
"system": "phone",
"value": "5089715500"
} ],
"address": [ {
"line": [ "28 RIVERSIDE DR STE 101" ],
"city": "PEMBROKE",
"state": "MA",
"postalCode": "023594947",
"country": "US"
} ]
},
"request": {
"method": "POST",
"url": "Organization",
"ifNoneExist": "identifier=https://github.com/synthetichealth/synthea|4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
}
}, {
"fullUrl": "urn:uuid:6e3d04a3-9064-33e4-b8b5-63bb468d7629",
"resource": {
"resourceType": "Location",
"id": "6e3d04a3-9064-33e4-b8b5-63bb468d7629",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-location" ]
},
"identifier": [ {
"system": "https://github.com/synthetichealth/synthea",
"value": "6e3d04a3-9064-33e4-b8b5-63bb468d7629"
} ],
"status": "active",
"name": "UNITED MEDICAL CARE LLC",
"telecom": [ {
"system": "phone",
"value": "5089715500"
} ],
"address": {
"line": [ "28 RIVERSIDE DR STE 101" ],
"city": "PEMBROKE",
"state": "MA",
"postalCode": "023594947",
"country": "US"
},
"position": {
"longitude": -70.77534154695786,
"latitude": 42.11004715
},
"managingOrganization": {
"identifier": {
"system": "https://github.com/synthetichealth/synthea",
"value": "4e56c7ec-99e5-3023-8e4f-95ad18a03f06"
},
"display": "UNITED MEDICAL CARE LLC"
}
},
"request": {
"method": "POST",
"url": "Location",
"ifNoneExist": "identifier=https://github.com/synthetichealth/synthea|6e3d04a3-9064-33e4-b8b5-63bb468d7629"
}
}]}

View File

@ -0,0 +1,50 @@
{
"resourceType": "Bundle",
"type": "batch",
"entry": [ {
"fullUrl": "urn:uuid:0368f101-0e65-3251-a809-566ebd6b2c2a",
"resource": {
"resourceType": "Practitioner",
"id": "0368f101-0e65-3251-a809-566ebd6b2c2a",
"meta": {
"profile": [ "http://hl7.org/fhir/us/core/StructureDefinition/us-core-practitioner" ]
},
"extension": [ {
"url": "http://synthetichealth.github.io/synthea/utilization-encounters-extension",
"valueInteger": 9
} ],
"identifier": [ {
"system": "http://hl7.org/fhir/sid/us-npi",
"value": "9999942599"
} ],
"active": true,
"name": [ {
"family": "Bosco882",
"given": [ "Regenia619" ],
"prefix": [ "Dr." ]
} ],
"telecom": [ {
"extension": [ {
"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-direct",
"valueBoolean": true
} ],
"system": "email",
"value": "Regenia619.Bosco882@example.com",
"use": "work"
} ],
"address": [ {
"line": [ "28 RIVERSIDE DR STE 101" ],
"city": "PEMBROKE",
"state": "MA",
"postalCode": "023594947",
"country": "US"
} ],
"gender": "female"
},
"request": {
"method": "POST",
"url": "Practitioner",
"ifNoneExist": "identifier=http://hl7.org/fhir/sid/us-npi|9999942599"
}
}]}
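These Synthea fixtures are plain FHIR batch bundles: each entry carries a POST request with an ifNoneExist clause, so loading the same file twice does not duplicate resources. A minimal sketch of pushing one such bundle to a test FHIR server with requests (the server URL and file name are illustrative assumptions, not part of the pipeline):

import json
import requests

# a batch Bundle is POSTed to the server base URL; the per-entry
# request.method/url and ifNoneExist drive the conditional creates
with open("practitioner_bundle.json") as f:   # illustrative file name
    bundle = json.load(f)

resp = requests.post(
    "http://hapi.fhir.org/baseR4",            # illustrative public test server
    json=bundle,
    headers={"Content-Type": "application/fhir+json"},
    timeout=60,
)
resp.raise_for_status()
# each bundle entry gets its own response status, e.g. "201 Created" or "200 OK"
print(resp.json()["entry"][0]["response"]["status"])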

44
pipeline.puml Normal file
View File

@ -0,0 +1,44 @@
@startuml "MeDaX pipeline"
left to right direction
actor admin
database "fhir server" as fhir
node "docker compose" as compose{
node "python app" as pyApp {
[scripts]
[nodeGenerator] as ngen
[edgeGenerator] as egen
[BioCypher] as BC
file "generated Schema" as gSchema
file "manual Schema" as mSchema
mSchema --> scripts : input
scripts --> gSchema : generates
scripts --> ngen : generates
scripts --> egen : generates
gSchema --> BC : input
ngen --> BC : input
egen --> BC : input
}
node "neo4j app" as neoApp{
database "neo4j GDB" as neoDB
[web server] as neoServer
neoDB --> neoServer
}
folder "admin files" as afiles {
file nodes
file edges
file "import script" as iscript
}
admin -[dashed]-> compose : triggers
BC --> afiles : exports
fhir --> scripts : http request
afiles --> neoApp : input
}
actor user
user --> neoServer : uses
neoApp --> pyApp : kills
@enduml
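In the diagram, the node and edge generators feed BioCypher, which exports the admin import files consumed by the Neo4j container. A minimal sketch of that hand-off, assuming BioCypher's documented Python interface (the BioCypher class with write_nodes / write_edges / write_import_call); the generator contents and file names are illustrative, not this repository's actual modules:

from biocypher import BioCypher

def node_generator():
    # BioCypher node tuples: (id, label, properties)
    yield ("patient-1", "Patient", {"gender": "female"})

def edge_generator():
    # BioCypher edge tuples: (id, source id, target id, label, properties)
    yield (None, "encounter-1", "patient-1", "Encounter_to_Patient", {})

bc = BioCypher(schema_config_path="generated_schema.yaml")
bc.write_nodes(node_generator())
bc.write_edges(edge_generator())
bc.write_import_call()  # writes the neo4j-admin import script into the output folder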

3315
poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

132
pyproject.toml Normal file
View File

@ -0,0 +1,132 @@
[tool.poetry]
name = "MeDaX pipeline"
version = "1.0.0"
description = "A unifying framework for biomedical research knowledge graphs"
authors = [
"Ilya Mazien",
"Tom Gebhardt",
"Lea Michaelis",
"Ron Henkel",
"Benjamin Winter",
"Dagmar Waltemath",
"Judith Wodke"
]
license = "MIT"
packages = [
{ include = "biocypher" }
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Natural Language :: English",
"Topic :: Scientific/Engineering :: Bio-Informatics"
]
repository = "https://github.com/biocypher/biocypher"
readme = "README.md"
[project.urls]
Homepage = "https://www.medizin.uni-greifswald.de/medizininformatik/research/current-projects/medax/"
[tool.poetry.dependencies]
python = "^3.9"
PyYAML = ">=5.0"
more_itertools = "*"
appdirs = "*"
treelib = "1.6.4"
rdflib = "^6.2.0"
networkx = "^3.0"
stringcase = "^1.2.0"
neo4j-utils = "0.0.7"
pandas = "^2.0.1"
pooch = "^1.7.0"
tqdm = "^4.65.0"
[tool.poetry.group.dev.dependencies]
sphinx = ">=5.0.0"
sphinx-design = "^0.3.0"
sphinx-rtd-theme = ">=1.0.0"
sphinx-last-updated-by-git = ">=0.3"
sphinx-autodoc-typehints = ">=1.18.0"
myst-parser = "^0.18.0"
yapf = "^0.32.0"
pytest = ">=6.0"
tox = ">=3.20.1"
pre-commit = ">=2.17.0"
bump2version = "*"
coverage = ">=6.0"
pytest-cov = "^3.0.0"
hypothesis = "^6.50.1"
isort = "^5.10.1"
ipython = "^8.7.0"
ipykernel = "^6.23.1"
sphinxext-opengraph = "^0.8.2"
coverage-badge = "^1.1.0"
nbsphinx = "^0.9.2"
black = "^23.9.1"
flake8 = "^6.1.0"
[build-system]
requires = ["poetry-core<2.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.poetry.urls]
"Bug Tracker" = "https://github.com/biocypher/biocypher/issues"
[tool.pytest.ini_options]
log_cli = true
log_level = "INFO"
markers = [
"requires_neo4j: Requires connection to a Neo4j server",
"requires_postgresql: Requires connection to a PostgreSQL server",
"inject_driver_args(driver_args): Arguments for the Driver",
]
[tool.black]
line-length = 80
target-version = ['py310']
include = '\.pyi?$'
exclude = '''
(
/(
\.eggs
| \.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
)/
)
'''
[tool.isort]
from_first = true
line_length = 80
multi_line_output = 3
include_trailing_comma = true
use_parentheses = true
known_num="numpy,pandas"
sections = "FUTURE,STDLIB,THIRDPARTY,NUM,FIRSTPARTY,LOCALFOLDER"
no_lines_before="LOCALFOLDER"
balanced_wrapping = true
force_grid_wrap = 0
length_sort = "1"
indent = " "
profile = "black"
[tool.flake8]
ignore = ["E203", "D200", "D202", "D401", "D105", "W504"]
per-file-ignores = [
"docs/source/conf.py:D100",
"tests/*:D100,D101,D102",
"*/__init__.py:F401"
]
max-line-length = 80
count = true

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
requests==2.31.0
python-dotenv==1.0.0
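These two packages cover the extraction side: python-dotenv reads the server configuration from the environment and requests pulls the resources. A minimal sketch of such a fetch, assuming an illustrative FHIR_SERVER_URL variable and the public R4 test endpoint as fallback:

import os

import requests
from dotenv import load_dotenv

load_dotenv()  # pick up the .env configuration
base = os.getenv("FHIR_SERVER_URL", "http://hapi.fhir.org/baseR4")

# a FHIR search returns a searchset Bundle; _count caps the page size
resp = requests.get(f"{base}/Patient", params={"_count": 10}, timeout=30)
resp.raise_for_status()
bundle = resp.json()
print(bundle["resourceType"], "with", len(bundle.get("entry", [])), "entries")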

188
schema_config_generation.py Normal file
View File

@ -0,0 +1,188 @@
#!/usr/bin/env python
# coding: utf-8
"""Generate a BioCypher schema configuration (YAML) from a FHIR-derived networkx graph."""
from pathlib import Path

import yaml


def write_automated_schema(graph, filePath, mSchemaPath):
    """Extract all node and edge types from the graph and write a basic YAML schema to filePath.

    An already generated schema at filePath or, failing that, the manual schema at
    mSchemaPath is loaded first so that curated entries are kept and only extended.
    """
    schemaData = {
        'nodes': {},
        'edges': {}
    }
    if Path(filePath).exists():
        schemaData = loadManualSchema(filePath)
    elif mSchemaPath:
        print("using the manual schema")
        schemaData = loadManualSchema(mSchemaPath)

    # collect node types and their properties; all property types default to str
    for node in graph.nodes():
        label = graph.nodes[node].get('label')
        if label == 'resource':
            label = graph.nodes[node].get('resourceType')
        label = label.capitalize()
        if label not in schemaData['nodes']:
            schemaData['nodes'][label] = {}
        if 'properties' not in schemaData['nodes'][label]:
            schemaData['nodes'][label]['properties'] = {}
        for k in graph.nodes[node].keys():
            schemaData['nodes'][label]['properties'][k] = 'str'

    with open(filePath, 'w') as file:
        # write node entries, falling back to defaults where the schema gives no value
        for n in schemaData['nodes']:
            temp = n + ':\n'
            temp += '  is_a: ' + schemaData['nodes'][n].get('is_a', 'named thing') + '\n'
            temp += '  represented_as: ' + schemaData['nodes'][n].get('represented_as', 'node') + '\n'
            temp += '  preferred_id: ' + schemaData['nodes'][n].get('preferred_id', 'fhir_id') + '\n'
            temp += '  label_in_input: ' + schemaData['nodes'][n].get('label_in_input', n) + '\n'
            temp += '  properties:\n'
            # take property names and types from schemaData where they exist
            for pKey, pType in schemaData['nodes'][n].get('properties', {}).items():
                temp += '    ' + pKey + ': ' + str(pType) + '\n'
            temp += '\n'
            file.write(temp)
            file.write('\n')

        # extract all relationship types and generate the basic YAML part for edges;
        # skip source/target pairs the schema already covers under one of its phrasings
        # (TODO: merge missing attributes into those existing entries)
        knownPhrasings = [' to ', ' derived from ', ' has member ', ' reasoned by ', ' is ']
        for u, v, a in graph.edges(data=True):
            source_label = graph.nodes[u].get('label')
            target_label = graph.nodes[v].get('label')
            if source_label == 'resource':
                source_label = graph.nodes[u].get('resourceType', str(u))
            if target_label == 'resource':
                target_label = graph.nodes[v].get('resourceType', str(v))
            source_label = source_label.capitalize()
            if any(source_label + phrasing + target_label + ' association' in schemaData['edges']
                   for phrasing in knownPhrasings):
                continue
            schemaData['edges'][source_label + ' to ' + target_label + ' association'] = {
                'is_a': 'association',
                'represented_as': 'edge',
                'label_in_input': source_label + '_to_' + target_label,
                # property types default to str, mirroring the node handling
                'properties': {k: 'str' for k in a},
            }

        for label in schemaData['edges']:
            temp = label + ':\n'
            for key in schemaData['edges'][label]:
                if key == 'properties':
                    temp += '  properties:\n'
                    for prop, pType in schemaData['edges'][label][key].items():
                        temp += '    ' + prop + ': ' + str(pType) + '\n'
                else:
                    temp += '  ' + key + ': ' + str(schemaData['edges'][label][key]) + '\n'
            temp += '\n'
            file.write(temp)
def loadManualSchema(path):
    """Load an existing schema YAML and split its entries into node and edge definitions."""
    schemaData = {
        'nodes': {},
        'edges': {}
    }
    with open(path, 'r') as file:
        # comments in the YAML are dropped by the parser
        data = yaml.safe_load(file)
    for label, attrs in data.items():
        if label == 'Title':
            continue
        cLabel = label.capitalize()
        # assuming uniqueness in the schema file: if the same type exists twice, the last entry wins
        if attrs["represented_as"] == 'node':
            schemaData['nodes'][cLabel] = attrs
        else:
            schemaData['edges'][cLabel] = attrs
    return schemaData
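
A usage sketch for the two functions above, assuming the module is importable as schema_config_generation and that the graph follows the pipeline's conventions (nodes labelled 'resource' carry a resourceType attribute); node ids and file names are illustrative:

import networkx as nx

from schema_config_generation import write_automated_schema

g = nx.DiGraph()
g.add_node("pat-1", label="resource", resourceType="Patient", gender="female")
g.add_node("enc-1", label="resource", resourceType="Encounter", status="finished")
g.add_edge("enc-1", "pat-1", relation="subject")

# writes Patient/Encounter node entries and an
# "Encounter to Patient association" edge entry to the YAML file;
# pass a manual schema path instead of None to seed curated entries
write_automated_schema(g, "generated_schema.yaml", None)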

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff