release commit

This commit is contained in:
2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions

View File

@ -0,0 +1,148 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Sebastian Lobentanzer
# ...
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
Module data directory, including:
* The BioLink database schema
* The default config files
"""
from typing import Any, Optional
import os
import warnings
import yaml
import appdirs
# Names exported by a star-import of this module.
# NOTE(review): `update_from_file` is defined below but not listed here —
# confirm whether that omission is intentional.
__all__ = ["module_data", "module_data_path", "read_config", "config", "reset"]
# Per-user configuration directory, resolved by `appdirs` for the application
# name "biocypher" and the author name "saezlab".
_USER_CONFIG_DIR = appdirs.user_config_dir("biocypher", "saezlab")
# User-level configuration file inside that directory.
_USER_CONFIG_FILE = os.path.join(_USER_CONFIG_DIR, "conf.yaml")
class MyLoader(yaml.SafeLoader):
    """
    Safe YAML loader that warns about double-quoted scalars whose value
    contains escape sequences.
    """

    def construct_scalar(self, node):
        """
        Construct the scalar for ``node`` and emit a ``UserWarning`` when it
        was written in double quotes and its escaped form contains a
        backslash. The constructed value is returned unchanged either way.
        """
        value = super().construct_scalar(node)

        # Escaped byte representation, e.g. a tab character becomes b'\\t'.
        escaped = value.encode("unicode_escape")
        is_double_quoted = node.style == '"'
        has_escape = "\\" in escaped.decode("utf-8")

        if not (is_double_quoted and has_escape):
            return value

        message = (
            "Double quotes detected in YAML configuration scalar: "
            f"{escaped}. "
            "These allow escape sequences and may cause problems, for "
            "instance with the Neo4j admin import files (e.g. '\\t'). "
            "Make sure you wanted to do this, and use single quotes "
            "whenever possible."
        )
        warnings.warn(message, category=UserWarning)
        return value
def module_data_path(name: str) -> str:
    """
    Build the absolute path of the file ``<name>.yaml`` located in the same
    directory as this module.
    """
    module_dir, _ = os.path.split(os.path.abspath(__file__))
    filename = f"{name}.yaml"
    return os.path.join(module_dir, filename)
def module_data(name: str) -> Any:
    """
    Load the parsed contents of the ``<name>.yaml`` file that sits next to
    this module.
    """
    return _read_yaml(module_data_path(name))
def _read_yaml(path: str) -> Optional[dict]:
    """
    Parse the YAML file at ``path`` using ``MyLoader``.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed document, or ``None`` if ``path`` does not exist (an
        empty document also parses to ``None``).
    """
    if not os.path.exists(path):
        return None

    # Decode explicitly as UTF-8: without `encoding`, `open` falls back to
    # the locale's preferred encoding, which corrupts non-ASCII values on
    # platforms where that is not UTF-8.
    with open(path, "r", encoding="utf-8") as fp:
        return yaml.load(fp.read(), Loader=MyLoader)
def read_config() -> dict:
    """
    Read the module config.

    Read and merge the built-in default, the user level and the directory
    level configuration, with the latter taking precedence over the former.

    The directory level configuration is taken from ``biocypher_config.yaml``
    in the current working directory or, if that yields nothing, from
    ``config/biocypher_config.yaml``.

    Only keys that exist in the built-in defaults are considered; keys whose
    value is ``None`` in an overriding file are skipped.

    TODO explain path configuration

    Returns:
        The default configuration dictionary with the user and directory
        level overrides merged into it.
    """
    defaults = module_data("biocypher_config")
    user = _read_yaml(_USER_CONFIG_FILE) or {}
    # TODO account for .yml?
    local = (
        _read_yaml("biocypher_config.yaml")
        or _read_yaml("config/biocypher_config.yaml")
        or {}
    )

    # Apply the layers in order of increasing precedence so that a section
    # present in the directory level file does not discard the user level
    # settings of that same section.
    for layer in (user, local):
        for key, value in layer.items():
            if key not in defaults or value is None:
                continue
            if isinstance(defaults[key], str):
                # first level config (like title): replace the scalar
                defaults[key] = value
            else:
                # section: merge the overriding entries into the defaults
                defaults[key].update(value)

    return defaults
def config(*args, **kwargs) -> Optional[Any]:
    """
    Set or get module config parameters.

    Getting: pass one or more keys as positional arguments. A single key
    returns its value (``None`` if absent); several keys return a tuple of
    values in the same order.

    Setting: pass ``key=value`` keyword arguments. If the current entry for
    ``key`` is a dictionary, ``value`` is merged into it; otherwise (first
    level scalars such as the title) the entry is replaced by ``value``.

    Raises:
        ValueError: If positional and keyword arguments are mixed.
        KeyError: If a key to be set is not present in the configuration.
    """
    if args and kwargs:
        raise ValueError(
            "Setting and getting values in the same call is not allowed.",
        )

    if args:
        current = globals()["_config"]
        result = tuple(current.get(key, None) for key in args)
        return result[0] if len(result) == 1 else result

    if not kwargs:
        return None

    current = globals()["_config"]
    for key, value in kwargs.items():
        existing = current[key]
        if isinstance(existing, dict):
            existing.update(value)
        else:
            # Scalars have no `update`; replace them instead of failing.
            current[key] = value
    return None
def reset():
    """
    Re-read the configuration files and replace the module-level ``_config``
    with the freshly merged result.
    """
    global _config
    _config = read_config()
# Populate the module-level `_config` when the module is imported.
reset()
def update_from_file(path: str):
    """
    Update the module configuration from a YAML file.

    Args:
        path: Location of the YAML file whose top-level keys are passed to
            ``config`` as keyword arguments.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # `_read_yaml` returns None for a missing file, which would otherwise
    # surface as an opaque TypeError from the `**` unpacking below.
    if not os.path.exists(path):
        raise FileNotFoundError(f"Configuration file not found: {path}")

    contents = _read_yaml(path)
    # An empty YAML document parses to None; there is nothing to apply.
    if contents:
        config(**contents)

View File

@ -0,0 +1,141 @@
Title: BioCypher python module configuration file
## Some options are not used by default. Uncomment them to use them.
biocypher:
### Required parameters ###
## DBMS type
dbms: neo4j
## Schema configuration
# schema_config_path: config/schema_config.yaml
## Offline mode: do not connect to a running DBMS instance
## Can be used e.g. for writing batch import files
offline: true
## Strict mode: do not allow creating new nodes or relationships without
## specifying source, version, and license parameters
strict_mode: false
## Ontology configuration
head_ontology:
url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
root_node: entity
# switch_label_and_id: true
### Optional parameters ###
## Logging
# Write log to disk
log_to_disk: true
# Activate more granular logging
debug: true
# Change the log directory
# log_directory: biocypher-log
## Data output directory
# output_directory: biocypher-out
## Resource cache directory
# cache_directory: .cache
## Optional tail ontologies
# tail_ontologies:
# so:
# url: test/ontologies/so.owl
# head_join_node: sequence variant
# tail_join_node: sequence_variant
# switch_label_and_id: true
# mondo:
# url: test/ontologies/mondo.owl
# head_join_node: disease
# tail_join_node: disease
# switch_label_and_id: true
### DBMS configuration ###
neo4j:
### Neo4j configuration ###
## Database name
database_name: neo4j
## Wipe DB before import (offline mode: --force)
wipe: true
## Neo4j authentication
uri: neo4j://localhost:7687
user: neo4j
password: neo4j
## Neo4j admin import batch writer settings
delimiter: ";"
array_delimiter: "|"
quote_character: "'"
## MultiDB functionality
## Set to false when using the community edition or older versions of Neo4j
multi_db: true
## Import options
skip_duplicate_nodes: false
skip_bad_relationships: false
## Import call prefixes
# import_call_bin_prefix: bin/
# import_call_file_prefix: path/to/files/
postgresql:
### PostgreSQL configuration ###
# PostgreSQL connection credentials
database_name: postgres # DB name
user: postgres # user name
password: postgres # password
host: localhost # host
port: 5432 # port
# PostgreSQL import batch writer settings
quote_character: '"'
delimiter: '\t'
# import_call_bin_prefix: '' # path to "psql"
# import_call_file_prefix: '/path/to/files'
rdf:
### RDF configuration ###
rdf_format: turtle
sqlite:
### SQLite configuration ###
# SQLite connection credentials
database_name: sqlite.db # DB name
# SQLite import batch writer settings
quote_character: '"'
delimiter: '\t'
# import_call_bin_prefix: '' # path to "sqlite3"
# import_call_file_prefix: '/path/to/files'
csv:
### CSV/Pandas configuration ###
delimiter: ","
networkx:
### NetworkX configuration ###
some_config: some_value # placeholder for technical reasons TODO

View File

@ -0,0 +1,5 @@
# We test the quote detection
valid: 'This is a valid string'
also_valid: "This is also a valid string"
invalid: "\t"

View File

@ -0,0 +1,140 @@
Title: BioCypher graph schema configuration file
# ---
# "Named Things"
# ---
protein:
represented_as: node
preferred_id: uniprot
input_label: protein
db_collection_name: proteins
properties:
name: str
score: float
taxon: int
genes: str[]
microRNA:
represented_as: node
preferred_id: mirbase.mature
input_label: mirna
complex:
synonym_for: macromolecular complex
represented_as: node
preferred_id: complexportal
input_label: complex
pathway:
represented_as: node
preferred_id: [reactome, wikipathways]
input_label: [reactome, wikipathways]
gene:
represented_as: node
preferred_id: hgnc
input_label: [hgnc, ensg]
exclude_properties: accession
disease:
represented_as: node
preferred_id: doid
input_label: Disease
side effect:
is_a: phenotypic feature
represented_as: node
preferred_id: sider.effect
input_label: sider
sequence variant:
represented_as: node
preferred_id: [clinically relevant, known, somatic]
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
properties:
source: str
original_source: str
effect: str
biotype: str
snRNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: [intact, rnacentral]
input_label: [intact_snrna, rnacentral_snrna]
properties:
ac: str
fullName: str
shortName: str
preferredName: str
exclude_properties: sequence
DNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: ensembl
input_label: dna
properties:
ac: str
fullName: str
shortName: str
preferredName: str
sequence: str
dsDNA sequence:
is_a: [DNA sequence, nucleic acid entity]
inherit_properties: True
represented_as: node
preferred_id: [intact, uniparc]
input_label: [intact_dsdna, uniprot_archive_dsdna]
# ---
# Associations
# ---
post translational interaction:
is_a: pairwise molecular interaction
represented_as: node
label_as_edge: INTERACTS_POST_TRANSLATIONAL
input_label: post_translational
phosphorylation:
is_a: post translational interaction
represented_as: edge
input_label: phosphorylation
gene to disease association:
represented_as: edge
label_as_edge: PERTURBED_IN_DISEASE
input_label: [protein_disease, gene_disease]
exclude_properties: accession
mutation to tissue association:
is_a: [genotype to tissue association, entity to tissue association, association]
represented_as: edge
label_as_edge: Is_Mutated_In
input_label: Gene_Is_Mutated_In_Cell_Tissue
variant to gene association: # -> Known.... and Somatic....
represented_as: edge
source: [known.sequence variant, somatic.sequence variant]
target: gene
input_label: [
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
]
gene to gene association:
represented_as: edge
input_label: gene_gene
properties:
directional: bool
curated: bool
score: float
id: str # should be removed
gene to variant association: # should be removed
is_a: gene to variant association
represented_as: edge
input_label: gene_variant

View File

@ -0,0 +1,3 @@
disconnected:
represented_as: node
label_in_input: disconnected

View File

@ -0,0 +1,152 @@
Title: BioCypher graph schema configuration file
# ---
# "Named Things"
# ---
protein:
represented_as: node
preferred_id: uniprot
input_label: protein
db_collection_name: proteins
properties:
name: str
score: float
taxon: int
genes: str[]
microRNA:
represented_as: node
preferred_id: mirbase.mature
input_label: mirna
complex:
synonym_for: macromolecular complex
represented_as: node
preferred_id: complexportal
input_label: complex
pathway:
represented_as: node
preferred_id: [reactome, wikipathways]
input_label: [reactome, wikipathways]
gene:
represented_as: node
preferred_id: hgnc
input_label: [hgnc, ensg]
exclude_properties: accession
disease:
represented_as: node
preferred_id: doid
input_label: Disease
side effect:
is_a: phenotypic feature
represented_as: node
preferred_id: sider.effect
input_label: sider
sequence variant:
represented_as: node
preferred_id: [clinically relevant, known, somatic]
input_label: [Clinically_relevant_variant, Known_variant, Somatic_mutation]
properties:
source: str
original_source: str
effect: str
biotype: str
altered gene product level:
represented_as: node
input_label: agpl
decreased gene product level:
represented_as: node
input_label: agpl_decreased
lethal variant:
represented_as: node
input_label: lethal
snRNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: [intact, rnacentral]
input_label: [intact_snrna, rnacentral_snrna]
properties:
ac: str
fullName: str
shortName: str
preferredName: str
exclude_properties: sequence
DNA sequence:
is_a: nucleic acid entity
represented_as: node
preferred_id: ensembl
input_label: dna
properties:
ac: str
fullName: str
shortName: str
preferredName: str
sequence: str
dsDNA sequence:
is_a: [DNA sequence, nucleic acid entity]
inherit_properties: True
represented_as: node
preferred_id: [intact, uniparc]
input_label: [intact_dsdna, uniprot_archive_dsdna]
# ---
# Associations
# ---
post translational interaction:
is_a: pairwise molecular interaction
represented_as: node
label_as_edge: INTERACTS_POST_TRANSLATIONAL
input_label: post_translational
phosphorylation:
is_a: post translational interaction
represented_as: edge
use_id: false
input_label: phosphorylation
gene to disease association:
represented_as: edge
label_as_edge: PERTURBED_IN_DISEASE
input_label: [protein_disease, gene_disease]
exclude_properties: accession
mutation to tissue association:
is_a: [genotype to tissue association, entity to tissue association, association]
represented_as: edge
label_as_edge: Is_Mutated_In
input_label: Gene_Is_Mutated_In_Cell_Tissue
variant to gene association: # -> Known.... and Somatic....
represented_as: edge
source: [known.sequence variant, somatic.sequence variant]
target: gene
input_label: [
VARIANT_FOUND_IN_GENE_Known_variant_Gene,
VARIANT_FOUND_IN_GENE_Somatic_mutation_Gene
]
gene to gene association:
represented_as: edge
input_label: gene_gene
properties:
directional: bool
curated: bool
score: float
gene to variant association:
is_a: gene to variant association
represented_as: edge
input_label: gene_variant