release commit

2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions
--- a/biocypher/_config/init.py
+++ b/biocypher/_config/init.py
@ -0,0 +1,148 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2021, Heidelberg University Clinic
+#
+# File author(s): Sebastian Lobentanzer
+#                 ...
+#
+# Distributed under MIT licence, see the file `LICENSE`.
+#
+"""
+Module data directory, including:
+
+* The BioLink database schema
+* The default config files
+"""
+
+from typing import Any, Optional
+import os
+import warnings
+
+import yaml
+import appdirs
+
+__all__ = ["module_data", "module_data_path", "read_config", "config", "reset"]
+
+_USER_CONFIG_DIR = appdirs.user_config_dir("biocypher", "saezlab")
+_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, "conf.yaml")
+
+
+class MyLoader(yaml.SafeLoader):
+    def construct_scalar(self, node):
+        # Check if the scalar contains double quotes and an escape sequence
+        value = super().construct_scalar(node)
+        q = bool(node.style == '"')
+        b = bool("\\" in value.encode("unicode_escape").decode("utf-8"))
+        if q and b:
+            warnings.warn(
+                (
+                    "Double quotes detected in YAML configuration scalar: "
+                    f"{value.encode('unicode_escape')}. "
+                    "These allow escape sequences and may cause problems, for "
+                    "instance with the Neo4j admin import files (e.g. '\\t'). "
+                    "Make sure you wanted to do this, and use single quotes "
+                    "whenever possible."
+                ),
+                category=UserWarning,
+            )
+        return value
+
+
+def module_data_path(name: str) -> str:
+    """
+    Absolute path to a YAML file shipped with the module.
+    """
+
+    here = os.path.dirname(os.path.abspath(__file__))
+
+    return os.path.join(here, f"{name}.yaml")
+
+
+def module_data(name: str) -> Any:
+    """
+    Retrieve the contents of a YAML file shipped with this module.
+    """
+
+    path = module_data_path(name)
+
+    return _read_yaml(path)
+
+
+def _read_yaml(path: str) -> Optional[dict]:
+    if os.path.exists(path):
+        with open(path, "r") as fp:
+            return yaml.load(fp.read(), Loader=MyLoader)
+
+
+def read_config() -> dict:
+    """
+    Read the module config.
+
+    Read and merge the built-in default, the user level and directory level
+    configuration, with the later taking precendence over the former.
+
+    TODO explain path configuration
+    """
+
+    defaults = module_data("biocypher_config")
+    user = _read_yaml(_USER_CONFIG_FILE) or {}
+    # TODO account for .yml?
+    local = (
+        _read_yaml("biocypher_config.yaml")
+        or _read_yaml("config/biocypher_config.yaml")
+        or {}
+    )
+
+    for key in defaults:
+        value = (
+            local[key] if key in local else user[key] if key in user else None
+        )
+
+        if value is not None:
+            if isinstance(
+                defaults[key], str
+            ):  # first level config (like title)
+                defaults[key] = value
+            else:
+                defaults[key].update(value)
+
+    return defaults
+
+
+def config(*args, **kwargs) -> Optional[Any]:
+    """
+    Set or get module config parameters.
+    """
+
+    if args and kwargs:
+        raise ValueError(
+            "Setting and getting values in the same call is not allowed.",
+        )
+
+    if args:
+        result = tuple(globals()["_config"].get(key, None) for key in args)
+
+        return result[0] if len(result) == 1 else result
+
+    for key, value in kwargs.items():
+        globals()["_config"][key].update(value)
+
+
+def reset():
+    """
+    Reload configuration from the config files.
+    """
+
+    globals()["_config"] = read_config()
+
+
+reset()
+
+
+def update_from_file(path: str):
+    """
+    Update the module configuration from a YAML file.
+    """
+
+    config(**_read_yaml(path))
--- a/biocypher/_config/biocypher_config.yaml
+++ b/biocypher/_config/biocypher_config.yaml
@ -0,0 +1,141 @@
+Title: BioCypher python module configuration file
+
+## Some options are not used by default. Uncomment them to use them.
+
+biocypher:
+  ### Required parameters ###
+  ## DBMS type
+
+  dbms: neo4j
+
+  ## Schema configuration
+
+  # schema_config_path: config/schema_config.yaml
+
+  ## Offline mode: do not connect to a running DBMS instance
+  ## Can be used e.g. for writing batch import files
+
+  offline: true
+
+  ## Strict mode: do not allow creating new nodes or relationships without
+  ## specifying source, version, and license parameters
+
+  strict_mode: false
+
+  ## Ontology configuration
+
+  head_ontology:
+    url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
+    root_node: entity
+    # switch_label_and_id: true
+
+  ### Optional parameters ###
+
+  ## Logging
+  # Write log to disk
+  log_to_disk: true
+
+  # Activate more granular logging
+  debug: true
+
+  # Change the log directory
+  # log_directory: biocypher-log
+
+  ## Data output directory
+  # output_directory: biocypher-out
+
+  ## Resource cache directory
+  # cache_directory: .cache
+
+  ## Optional tail ontologies
+
+  # tail_ontologies:
+  #   so:
+  #     url: test/ontologies/so.owl
+  #     head_join_node: sequence variant
+  #     tail_join_node: sequence_variant
+  #     switch_label_and_id: true
+  #   mondo:
+  #     url: test/ontologies/mondo.owl
+  #     head_join_node: disease
+  #     tail_join_node: disease
+  #     switch_label_and_id: true
+
+### DBMS configuration ###
+
+neo4j:
+  ### Neo4j configuration ###
+  ## Database name
+
+  database_name: neo4j
+
+  ## Wipe DB before import (offline mode: --force)
+
+  wipe: true
+
+  ## Neo4j authentication
+
+  uri: neo4j://localhost:7687
+  user: neo4j
+  password: neo4j
+
+  ## Neo4j admin import batch writer settings
+
+  delimiter: ";"
+  array_delimiter: "|"
+  quote_character: "'"
+
+  ## MultiDB functionality
+  ## Set to false for using community edition or older versions of Neo4j
+
+  multi_db: true
+
+  ## Import options
+
+  skip_duplicate_nodes: false
+  skip_bad_relationships: false
+
+  ## Import call prefixes
+
+  # import_call_bin_prefix: bin/
+  # import_call_file_prefix: path/to/files/
+
+postgresql:
+  ### PostgreSQL configuration ###
+
+  # PostgreSQL connection credentials
+  database_name: postgres # DB name
+  user: postgres # user name
+  password: postgres # password
+  host: localhost # host
+  port: 5432 # port
+
+  # PostgreSQL import batch writer settings
+  quote_character: '"'
+  delimiter: '\t'
+  # import_call_bin_prefix: '' # path to "psql"
+  # import_call_file_prefix: '/path/to/files'
+
+rdf:
+  ### RDF configuration ###
+  rdf_format: turtle
+
+sqlite:
+  ### SQLite configuration ###
+
+  # SQLite connection credentials
+  database_name: sqlite.db # DB name
+
+  # SQLite import batch writer settings
+  quote_character: '"'
+  delimiter: '\t'
+  # import_call_bin_prefix: '' # path to "sqlite3"
+  # import_call_file_prefix: '/path/to/files'
+
+csv:
+  ### CSV/Pandas configuration ###
+  delimiter: ","
+
+networkx:
+  ### NetworkX configuration ###
+  some_config: some_value # placeholder for technical reasons TODO
--- a/biocypher/_config/test_config.yaml
+++ b/biocypher/_config/test_config.yaml
@ -0,0 +1,5 @@
+# We test the quote detection
+
+valid: 'This is a valid string'
+also_valid: "This is also a valid string"
+invalid: "\t"
--- a/biocypher/_config/test_schema_config.yaml
+++ b/biocypher/_config/test_schema_config.yaml
@ -0,0 +1,140 @@
+Title: BioCypher graph schema configuration file
+
+# ---
+# "Named Things"
+# ---
+
+protein:
+  represented_as: node
+  preferred_id: uniprot
+  input_label: protein
+  db_collection_name: proteins
+  properties:
+    name: str
+    score: float
+    taxon: int
+    genes: str[]
+
+microRNA:
+  represented_as: node
+  preferred_id: mirbase.mature
+  input_label: mirna
+
+complex:
+  synonym_for: macromolecular complex
+  represented_as: node
+  preferred_id: complexportal
+  input_label: complex
+
+pathway:
+  represented_as: node
+  preferred_id: [reactome, wikipathways]
+  input_label: [reactome, wikipathways]
+
+gene:
+  represented_as: node
+  preferred_id: hgnc
+  input_label: [hgnc, ensg]
+  exclude_properties: accession
+
+disease:
+  represented_as: node
+  preferred_id: doid
+  input_label: Disease
+
+side effect:
+  is_a: phenotypic feature
+  represented_as: node
+  preferred_id: sider.effect
+  input_label: sider
+
+sequence variant:
+  represented_as: node
+  preferred_id: [clinically relevant, known, somatic]
+  input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
+  properties:
+    source: str
+    original_source: str
+    effect: str
+    biotype: str
+
+snRNA sequence:
+  is_a: nucleic acid entity
+  represented_as: node
+  preferred_id: [intact, rnacentral]
+  input_label: [intact_snrna, rnacentral_snrna]
+  properties:
+    ac: str
+    fullName: str
+    shortName: str
+    preferredName: str
+  exclude_properties: sequence
+
+DNA sequence:
+  is_a: nucleic acid entity
+  represented_as: node
+  preferred_id: ensembl
+  input_label: dna
+  properties:
+    ac: str
+    fullName: str
+    shortName: str
+    preferredName: str
+    sequence: str
+
+dsDNA sequence:
+  is_a: [DNA sequence, nucleic acid entity]
+  inherit_properties: True
+  represented_as: node
+  preferred_id: [intact, uniparc]
+  input_label: [intact_dsdna, uniprot_archive_dsdna]
+
+# ---
+# Associations
+# ---
+
+post translational interaction:
+  is_a: pairwise molecular interaction
+  represented_as: node
+  label_as_edge: INTERACTS_POST_TRANSLATIONAL
+  input_label: post_translational
+
+phosphorylation:
+  is_a: post translational interaction
+  represented_as: edge
+  input_label: phosphorylation
+
+gene to disease association:
+  represented_as: edge
+  label_as_edge: PERTURBED_IN_DISEASE
+  input_label: [protein_disease, gene_disease]
+  exclude_properties: accession
+
+mutation to tissue association:
+  is_a: [genotype to tissue association, entity to tissue association, association]
+  represented_as: edge
+  label_as_edge: Is_Mutated_In
+  input_label: Gene_Is_Mutated_In_Cell_Tissue
+
+variant to gene association: # -> Known.... and Somatic....
+  represented_as: edge
+  source: [known.sequence variant, somatic.sequence variant]
+  target: gene
+  input_label: [
+    VARIANT_FOUND_IN_GENE_Known_variant_Gene,
+    VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
+  ]
+
+gene to gene association:
+  represented_as: edge
+  input_label: gene_gene
+  properties:
+    directional: bool
+    curated: bool
+    score: float
+    id: str  # should be removed
+
+gene to variant association:  # should be removed
+  is_a: gene to variant association
+  represented_as: edge
+  input_label: gene_variant
--- a/biocypher/_config/test_schema_config_disconnected.yaml
+++ b/biocypher/_config/test_schema_config_disconnected.yaml
@ -0,0 +1,3 @@
+disconnected:
+  represented_as: node
+  label_in_input: disconnected
--- a/biocypher/_config/test_schema_config_extended.yaml
+++ b/biocypher/_config/test_schema_config_extended.yaml
@ -0,0 +1,152 @@
+Title: BioCypher graph schema configuration file
+
+# ---
+# "Named Things"
+# ---
+
+protein:
+  represented_as: node
+  preferred_id: uniprot
+  input_label: protein
+  db_collection_name: proteins
+  properties:
+    name: str
+    score: float
+    taxon: int
+    genes: str[]
+
+microRNA:
+  represented_as: node
+  preferred_id: mirbase.mature
+  input_label: mirna
+
+complex:
+  synonym_for: macromolecular complex
+  represented_as: node
+  preferred_id: complexportal
+  input_label: complex
+
+pathway:
+  represented_as: node
+  preferred_id: [reactome, wikipathways]
+  input_label: [reactome, wikipathways]
+
+gene:
+  represented_as: node
+  preferred_id: hgnc
+  input_label: [hgnc, ensg]
+  exclude_properties: accession
+
+disease:
+  represented_as: node
+  preferred_id: doid
+  input_label: Disease
+
+side effect:
+  is_a: phenotypic feature
+  represented_as: node
+  preferred_id: sider.effect
+  input_label: sider
+
+sequence variant:
+  represented_as: node
+  preferred_id: [clinically relevant, known, somatic]
+  input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
+  properties:
+    source: str
+    original_source: str
+    effect: str
+    biotype: str
+
+altered gene product level:
+  represented_as: node
+  input_label: agpl
+
+decreased gene product level:
+  represented_as: node
+  input_label: agpl_decreased
+
+lethal variant:
+  represented_as: node
+  input_label: lethal
+
+snRNA sequence:
+  is_a: nucleic acid entity
+  represented_as: node
+  preferred_id: [intact, rnacentral]
+  input_label: [intact_snrna, rnacentral_snrna]
+  properties:
+    ac: str
+    fullName: str
+    shortName: str
+    preferredName: str
+  exclude_properties: sequence
+
+DNA sequence:
+  is_a: nucleic acid entity
+  represented_as: node
+  preferred_id: ensembl
+  input_label: dna
+  properties:
+    ac: str
+    fullName: str
+    shortName: str
+    preferredName: str
+    sequence: str
+
+dsDNA sequence:
+  is_a: [DNA sequence, nucleic acid entity]
+  inherit_properties: True
+  represented_as: node
+  preferred_id: [intact, uniparc]
+  input_label: [intact_dsdna, uniprot_archive_dsdna]
+
+# ---
+# Associations
+# ---
+
+post translational interaction:
+  is_a: pairwise molecular interaction
+  represented_as: node
+  label_as_edge: INTERACTS_POST_TRANSLATIONAL
+  input_label: post_translational
+
+phosphorylation:
+  is_a: post translational interaction
+  represented_as: edge
+  use_id: false
+  input_label: phosphorylation
+
+gene to disease association:
+  represented_as: edge
+  label_as_edge: PERTURBED_IN_DISEASE
+  input_label: [protein_disease, gene_disease]
+  exclude_properties: accession
+
+mutation to tissue association:
+  is_a: [genotype to tissue association, entity to tissue association, association]
+  represented_as: edge
+  label_as_edge: Is_Mutated_In
+  input_label: Gene_Is_Mutated_In_Cell_Tissue
+
+variant to gene association: # -> Known.... and Somatic....
+  represented_as: edge
+  source: [known.sequence variant, somatic.sequence variant]
+  target: gene
+  input_label: [
+    VARIANT_FOUND_IN_GENE_Known_variant_Gene,
+    VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
+  ]
+
+gene to gene association:
+  represented_as: edge
+  input_label: gene_gene
+  properties:
+    directional: bool
+    curated: bool
+    score: float
+
+gene to variant association:
+  is_a: gene to variant association
+  represented_as: edge
+  input_label: gene_variant