release commit

2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions

View File

@ -0,0 +1,241 @@
import os
from biocypher._logger import logger
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
"""
Class for writing node and edge representations to disk using the format
specified by ArangoDB for the use of "arangoimport". Output files are
similar to Neo4j, but with a different header format.
"""
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location prefix for the ArangoDB import call (empty, since the binary is expected to be on the PATH)
"""
return ""
def _get_import_script_name(self) -> str:
"""
Returns the name of the ArangoDB import call script
Returns:
str: The name of the import script (ending in .sh)
"""
return "arangodb-import-call.sh"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
_id = "_key"
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# collect property names for the header
props_list = []
for k in props.keys():
props_list.append(f"{k}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [[_id], props_list]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the ArangoDB import statement
# do this once for each part file
parts = self.parts.get(label, [])
if not parts:
raise ValueError(
f"No parts found for node label {label}. "
f"Check that the data was parsed first.",
)
for part in parts:
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
part,
)
self.import_call_nodes.add(
(
import_call_header_path,
import_call_parts_path,
collection,
)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check if the file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file {header_path} already exists. Overwriting."
)
# collect property names for the header
props_list = []
for k in props.keys():
props_list.append(f"{k}")
out_list = ["_from", "_key", *props_list, "_to"]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
if not self.translator.ontology.mapping.extended_schema.get(label):
for (
_,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
collection = v.get("db_collection_name", None)
break
else:
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the ArangoDB import statement (the import call path
# may differ from the actual output path)
header_import_call_path = os.path.join(
self.import_call_file_prefix,
header,
)
parts_import_call_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(
header_import_call_path,
parts_import_call_path,
collection,
)
)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for the ArangoDB import
"""
import_call = (
f"{self.import_call_bin_prefix}arangoimp "
f"--type csv "
f'--separator="{self.escaped_delim}" '
)
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
node_lines = ""
# node import calls: one line per node type
for header_path, parts_path, collection in self.import_call_nodes:
line = (
f"{import_call} "
f"--headers-file {header_path} "
f"--file= {parts_path} "
)
if collection:
line += f"--create-collection --collection {collection} "
node_lines += f"{line}\n"
edge_lines = ""
# edge import calls: one line per edge type
for header_path, parts_path, collection in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return node_lines + edge_lines
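# Illustrative sketch (not part of the writer): how a single node import line
# is assembled from the pieces above, assuming a "|" delimiter, a single quote
# as quote character, and a hypothetical "proteins" collection.
if __name__ == "__main__":
    import_call = 'arangoimp --type csv --separator="|" --quote="\'" '
    header_path = "/outdir/Protein-header.csv"
    parts_path = "/outdir/Protein-part000.csv"
    collection = "proteins"
    line = f"{import_call} --headers-file {header_path} --file={parts_path} "
    if collection:
        line += f"--create-collection --collection {collection} "
    print(line)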

View File

@ -0,0 +1,502 @@
import os
import glob
import pandas as pd
from biocypher._logger import logger
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
class _Neo4jBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by Neo4j for the use of admin import. Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
Neo4j-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
def __init__(self, *args, **kwargs):
"""
Constructor.
Checks the version of Neo4j and adds a command scope if the version is >= 5.
Returns:
_Neo4jBatchWriter: An instance of the writer.
"""
# Should read the configuration and setup import_call_bin_prefix.
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location prefix for the neo4j-admin binary
"""
return "bin/"
def _write_array_string(self, string_list):
"""
Method to write the string representation of an array into a .csv file
as required by neo4j admin import.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for the neo4j admin import
"""
string = self.adelim.join(string_list)
return f"{self.quote}{string}{self.quote}"
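# Example (illustrative): assuming the array delimiter ";" and the quote
# character '"', _write_array_string(["a", "b", "c"]) returns '"a;b;c"',
# which neo4j-admin import reads as a three-element array value.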
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
_id = ":ID"
## MeDaX dev remark:
## FHIR data yields case-sensitive labels, e.g. 'Procedure' and 'procedure' are two distinct node types,
## because Resources are converted to more specific node classes using their "resourceType" attribute.
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double", "dbl"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in ["bool", "boolean"]:
# TODO Neo4j boolean support / spelling?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
# create list of lists and flatten
out_list = [[_id], props_list, [":LABEL"]]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write the realigned data back WITHOUT the header row
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.error(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to the neo4j-admin import statement (the import call file
# path may differ from the actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_nodes.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check if the file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in [
"bool",
"boolean",
]: # TODO does Neo4j support bool?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
skip_id = False
schema_label = None
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
skip_id = True
elif not self.translator.ontology.mapping.extended_schema.get(
label
):
# find label in schema by label_as_edge
for (
k,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
schema_label = k
break
else:
schema_label = label
out_list = [":START_ID"]
if schema_label:
if (
self.translator.ontology.mapping.extended_schema.get(
schema_label
).get("use_id")
is False
):
skip_id = True
if not skip_id:
out_list.append("id")
out_list.extend(props_list)
out_list.extend([":END_ID", ":TYPE"])
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write the realigned data back WITHOUT the header row
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.error(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to the neo4j-admin import statement (the import call file
# path may differ from the actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _get_import_script_name(self) -> str:
"""
Returns the name of the neo4j admin import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "neo4j-admin-import-call.sh"
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for neo4j-admin import
"""
import_call_neo4j_v4 = self._get_import_call(
"import", "--database=", "--force="
)
import_call_neo4j_v5 = self._get_import_call(
"database import full", "", "--overwrite-destination="
)
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
return import_script
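# The generated script has roughly this shape (illustrative, with the default
# "bin/" prefix and placeholder values):
#   #!/bin/bash
#   version=$(bin/neo4j-admin --version | cut -d '.' -f 1)
#   if [[ $version -ge 5 ]]; then
#       bin/neo4j-admin database import full <db_name> ... --nodes="<header>,<parts>" ...
#   else
#       bin/neo4j-admin import --database=<db_name> ... --nodes="<header>,<parts>" ...
#   fi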
def _get_import_call(
self, import_cmd: str, database_cmd: str, wipe_cmd: str
) -> str:
"""Get parametrized import call for Neo4j 4 or 5+.
Args:
import_cmd (str): The import command to use.
database_cmd (str): The database command to use.
wipe_cmd (str): The wipe command to use.
Returns:
str: The import call.
"""
import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
import_call += f"{database_cmd}{self.db_name} "
import_call += f'--delimiter="{self.escaped_delim}" '
import_call += f'--array-delimiter="{self.escaped_adelim}" '
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
if self.wipe:
import_call += f"{wipe_cmd}true "
if self.skip_bad_relationships:
import_call += "--skip-bad-relationships=true "
if self.skip_duplicate_nodes:
import_call += "--skip-duplicate-nodes=true "
# append node import calls
for header_path, parts_path in self.import_call_nodes:
import_call += f'--nodes="{header_path},{parts_path}" '
# append edge import calls
for header_path, parts_path in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return import_call
def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
"""
Adapt a CSV table to a new header structure, placing new columns in their correct positions.
Args:
old_header (list): The original header columns
new_header (list): The new header columns
csv_file_path (str): Path to the CSV file
Returns:
pandas.DataFrame: CSV data with the new header structure
"""
# Step 1: Read the CSV data without headers
df = pd.read_csv(csv_file_path, sep=self.delim, header=None)
# Step 2: If the file is empty, return empty DataFrame with new headers
if df.empty:
return pd.DataFrame(columns=new_header)
# Step 3: If column count doesn't match old_header length, handle the mismatch
if len(df.columns) != len(old_header):
print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})")
# If file has fewer columns than old_header, pad with NaN
if len(df.columns) < len(old_header):
for i in range(len(df.columns), len(old_header)):
df[i] = None
# If file has more columns than old_header, truncate
else:
df = df.iloc[:, :len(old_header)]
# Step 4: Assign old header names to the dataframe
df.columns = old_header
# Step 5: Create a new DataFrame with the correct structure
new_df = pd.DataFrame(columns=new_header)
# Step 6: For each column in the new header, find its position in the old header
for new_col_idx, new_col in enumerate(new_header):
if new_col in old_header:
# If column exists in old header, copy data
new_df[new_col] = df[new_col]
else:
# If new column, add empty column
new_df[new_col] = None
# Step 7: Ensure columns are in the exact order of new_header
new_df = new_df[new_header]
return new_df
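# Illustrative sketch (not part of the writer): the column-realignment idea
# behind adapt_csv_to_new_header, shown on in-memory data with an assumed ";"
# delimiter. Rows written under an old header are re-ordered and padded with
# empty columns to match a new header.
if __name__ == "__main__":
    import io
    import pandas as pd

    old_header = [":ID", "name", ":LABEL"]
    new_header = [":ID", "name", "taxon:long", ":LABEL"]
    csv_data = io.StringIO("p1;BRCA1;Protein\np2;TP53;Protein\n")
    df = pd.read_csv(csv_data, sep=";", header=None)
    df.columns = old_header
    adapted = pd.DataFrame(
        {col: (df[col] if col in old_header else None) for col in new_header}
    )
    print(adapted.to_csv(sep=";", index=False, header=False))
    # p1;BRCA1;;Protein
    # p2;TP53;;Protein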

View File

@ -0,0 +1,76 @@
import pickle
import networkx as nx
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.write.relational._csv import _PandasCSVWriter
class _NetworkXWriter(_Writer):
"""
Class for writing node and edges to a networkx DiGraph.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
self.G = nx.DiGraph()
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
logger.info(
f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
)
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
pickle.dump(self.G, f)
import_call = "import pickle\n"
import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_networkx.py"
def _write_node_data(self, nodes) -> bool:
passed = self.csv_writer._write_entities_to_file(nodes)
self.add_to_networkx()
return passed
def _write_edge_data(self, edges) -> bool:
passed = self.csv_writer._write_entities_to_file(edges)
self.add_to_networkx()
return passed
def add_to_networkx(self) -> bool:
all_dfs = self.csv_writer.stored_dfs
node_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("node_id").any()
]
edge_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("source_id").any()
and df.columns.str.contains("target_id").any()
]
for df in node_dfs:
nodes = df.set_index("node_id").to_dict(orient="index")
self.G.add_nodes_from(nodes.items())
for df in edge_dfs:
edges = df.set_index(["source_id", "target_id"]).to_dict(
orient="index"
)
self.G.add_edges_from(
(
(source, target, attrs)
for (source, target), attrs in edges.items()
)
)
return True
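# Illustrative sketch (not part of the writer): how add_to_networkx folds
# tabular node and edge data into a DiGraph; the column names mirror the ones
# the CSV writer is assumed to produce ("node_id", "source_id", "target_id").
if __name__ == "__main__":
    import networkx as nx
    import pandas as pd

    node_df = pd.DataFrame({"node_id": ["p1", "p2"], "name": ["BRCA1", "TP53"]})
    edge_df = pd.DataFrame(
        {"source_id": ["p1"], "target_id": ["p2"], "type": ["interacts_with"]}
    )
    G = nx.DiGraph()
    G.add_nodes_from(node_df.set_index("node_id").to_dict(orient="index").items())
    edges = edge_df.set_index(["source_id", "target_id"]).to_dict(orient="index")
    G.add_edges_from(
        (source, target, attrs) for (source, target), attrs in edges.items()
    )
    print(G)  # DiGraph with 2 nodes and 1 edge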

View File

@ -0,0 +1,515 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Loes van den Biggelaar
# Sebastian Lobentanzer
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from types import GeneratorType
from typing import Union
import os
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
from rdflib.namespace import (
_NAMESPACE_PREFIXES_CORE,
_NAMESPACE_PREFIXES_RDFLIB,
)
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _RDFWriter(_BatchWriter):
"""
Class to write BioCypher's property graph into an RDF format using
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
is done keeping only the minimum information about node and edges,
skipping all properties.
"""
def _get_import_script_name(self) -> str:
"""
Returns the name of the RDF import call script.
This function is applicable for RDF export.
Returns:
str: The name of the import script (ending in .sh)
"""
return "rdf-import-call.sh"
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location prefix for the RDF import call
"""
return "bin/"
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
"""
Function to check if the specified RDF format is supported.
Args:
rdf_format (str): The RDF format to check.
Returns:
bool: True if the RDF format is supported, False otherwise.
"""
supported_formats = [
"xml",
"n3",
"turtle",
"nt",
"pretty-xml",
"trix",
"trig",
"nquads",
"json-ld",
]
if rdf_format not in supported_formats:
logger.error(
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
)
return False
else:
# The RDF graph does not support the 'ttl' format name, only 'turtle'. However, the preferred file extension is always '.ttl'
if self.rdf_format == "turtle":
self.extension = "ttl"
elif self.rdf_format == "ttl":
self.rdf_format = "turtle"
self.extension = "ttl"
else:
self.extension = self.rdf_format
return True
def _write_single_edge_list_to_file(
self,
edge_list: list,
label: str,
prop_dict: dict,
):
"""
This function takes one list of biocypher edges and writes them
to an RDF file with the given format.
Args:
edge_list (list): list of BioCypherEdges to be written
label (str): the label (type) of the edge
prop_dict (dict): properties of the edge class passed from the parsing
function and their types
Returns:
bool: The return value. True for success, False otherwise.
"""
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
logger.error("Edges must be passed as type BioCypherEdge.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for edge in edge_list:
rdf_subject = edge.get_source_id()
rdf_object = edge.get_target_id()
rdf_predicate = edge.get_id()
rdf_properties = edge.get_properties()
if rdf_predicate is None:
rdf_predicate = rdf_subject + rdf_object
edge_label = self.translator.name_sentence_to_pascal(
edge.get_label()
)
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
graph.add((edge_uri, RDF.type, RDFS.Class))
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
RDF.type,
edge_uri,
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["subject"],
self.subject_to_uri(rdf_subject),
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["object"],
self.subject_to_uri(rdf_object),
)
)
# add properties to the transformed edge --> node
for key, value in rdf_properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_predicate, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def add_property_to_graph(
self,
graph: Graph,
rdf_subject: str,
rdf_object: str,
rdf_predicate: str,
):
"""
Function to add a property to an RDF node. It takes the graph and the subject, object, and predicate of the RDF triple.
If the property is a list, each element is added to the graph. If it is a string that represents a list, it is
transformed into a list and added element-wise. If it is neither a list nor such a string, it is added as a literal.
Args:
graph (RDFLib.Graph): The RDF graph to add the nodes to.
rdf_subject (str): The subject of the RDF triple.
rdf_object (str): The object of the RDF triple.
rdf_predicate (str): The predicate of the RDF triple.
Returns:
None
"""
if isinstance(rdf_object, list):
for obj in rdf_object:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(obj),
)
)
elif isinstance(rdf_object, str):
if rdf_object.startswith("[") and rdf_object.endswith("]"):
self.add_property_to_graph(
graph,
rdf_subject,
self.transform_string_to_list(rdf_object),
rdf_predicate,
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
def transform_string_to_list(self, string_list: str) -> list:
"""
Function to transform a string representation of a list into a list.
Args:
string_list (str): The string representation of the list.
Returns:
list: The list representation of the input string.
"""
return (
string_list.replace("[", "")
.replace("]", "")
.replace("'", "")
.split(", ")
)
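# Example (illustrative): transform_string_to_list("['alpha', 'beta']") returns
# ['alpha', 'beta']. The conversion is purely string-based, so element values
# that themselves contain ", " would be split as well.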
def _write_single_node_list_to_file(
self,
node_list: list,
label: str,
prop_dict: dict,
labels: str,
):
"""
This function takes a list of BioCypherNodes and writes them
to an RDF file in the specified format.
Args:
node_list (list): A list of BioCypherNodes to be written.
label (str): The label (type) of the nodes.
prop_dict (dict): A dictionary of properties and their types for the node class.
Returns:
bool: True if the writing is successful, False otherwise.
"""
if not all(isinstance(n, BioCypherNode) for n in node_list):
logger.error("Nodes must be passed as type BioCypherNode.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for n in node_list:
rdf_subject = n.get_id()
rdf_object = n.get_label()
properties = n.get_properties()
class_name = self.translator.name_sentence_to_pascal(rdf_object)
graph.add(
(
self.rdf_namespaces["biocypher"][class_name],
RDF.type,
RDFS.Class,
)
)
graph.add(
(
self.subject_to_uri(rdf_subject),
RDF.type,
self.rdf_namespaces["biocypher"][class_name],
)
)
for key, value in properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_subject, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
Args:
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
batch_size (int): The number of nodes to write in each batch.
force (bool): Flag to force the writing even if the output file already exists.
Returns:
bool: True if the writing is successful, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing node data, wrong RDF format")
return False
# write node data using _write_node_data method
passed = self._write_node_data(nodes, batch_size, force)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self,
edges: Union[list, GeneratorType],
batch_size: int = int(1e6),
) -> bool:
"""
Wrapper for writing edges in RDF format. It calls the _write_edge_data()
function, specifying the edge data.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The number of edges to write in each batch.
Returns:
bool: The return value. True for success, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing edge data, wrong RDF format")
return False
# write edge data using _write_edge_data method
passed = self._write_edge_data(edges, batch_size=batch_size)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def _construct_import_call(self) -> str:
"""
Function to write the import call.
This function is not applicable for RDF.
Returns:
str: An empty string, as no import call is needed for RDF.
"""
return ""
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the RDF admin-import.
This function is not applicable for RDF.
Args:
string_list (list): list of ontology strings
Returns:
bool: True (not applicable for RDF).
"""
return True
def _write_node_headers(self):
"""
Abstract method that takes care of importing properties of a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def _write_edge_headers(self):
"""
Abstract method to write a database import-file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def subject_to_uri(self, subject: str) -> str:
"""
Converts the subject to a proper URI using the available namespaces.
If the conversion fails, it defaults to the biocypher prefix.
Args:
subject (str): The subject to be converted to a URI.
Returns:
str: The corresponding URI for the subject.
"""
try:
_pref, _id = subject.split(":")
if _pref in self.rdf_namespaces.keys():
return self.rdf_namespaces[_pref][_id]
else:
return self.rdf_namespaces["biocypher"][subject]
except ValueError:
return self.rdf_namespaces["biocypher"][subject]
def property_to_uri(self, property_name: str) -> str:
"""
Converts a property name to its corresponding URI.
This function takes a property name and searches for its corresponding URI in various namespaces.
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
Args:
property_name (str): The property name to be converted to a URI.
Returns:
str: The corresponding URI for the input property name.
"""
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
for namespace in _NAMESPACE_PREFIXES_CORE.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
for namespace in [SKOS, DC, DCTERMS]:
if property_name in namespace:
return namespace[property_name]
# If the property name is still not found, try other namespaces from rdflib.
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is "licence", it recursively calls the function with "license" as the input.
if property_name == "licence":
return self.property_to_uri("license")
# TODO: add an option to search trough manually implemented namespaces
# If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
# TODO: give a warning and try to prevent this option altogether
return self.rdf_namespaces["biocypher"][property_name]
def _init_namespaces(self, graph: Graph):
"""
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.
Args:
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
Returns:
None
"""
# add biocypher standard to self.rdf_namespaces
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
if not self.rdf_namespaces:
self.rdf_namespaces = biocypher_standard
else:
self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
for key, value in self.rdf_namespaces.items():
namespace = Namespace(value)
self.rdf_namespaces[key] = namespace
graph.bind(key, namespace)
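# Illustrative sketch (not part of the writer): the triple pattern the node
# writer emits, using the same "biocypher" namespace that _init_namespaces
# binds; the identifiers and the property value are assumptions for the example.
if __name__ == "__main__":
    from rdflib import RDF, RDFS, Graph, Literal, Namespace

    biocypher = Namespace("https://biocypher.org/biocypher#")
    g = Graph()
    g.bind("biocypher", biocypher)
    g.add((biocypher["Protein"], RDF.type, RDFS.Class))
    g.add((biocypher["P38398"], RDF.type, biocypher["Protein"]))
    g.add((biocypher["P38398"], biocypher["name"], Literal("BRCA1")))
    print(g.serialize(format="turtle"))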