2025-04-16 22:12:19 +02:00

516 lines
18 KiB
Python

#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Loes van den Biggelaar
# Sebastian Lobentanzer
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from types import GeneratorType
from typing import Union
import os
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
from rdflib.namespace import (
_NAMESPACE_PREFIXES_CORE,
_NAMESPACE_PREFIXES_RDFLIB,
)
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _RDFWriter(_BatchWriter):
    """
    Class to write BioCypher's property graph into an RDF format using
    rdflib and the serialisation formats it supports (RDF/XML, N3,
    NTriples, N-Quads, Turtle, TriX, Trig and JSON-LD).

    Nodes are written as resources typed with a class in the ``biocypher``
    namespace. Edges are written as resources of their own (typed with the
    edge class) that point to their source and target through the
    ``biocypher:subject`` and ``biocypher:object`` predicates. Non-empty
    properties of nodes and edges are attached as literals.

    The methods read ``self.rdf_format``, ``self.rdf_namespaces``,
    ``self.outdir`` and ``self.translator`` and set ``self.extension``;
    these are presumably initialised by ``_BatchWriter`` (not visible here,
    to be confirmed).
    """

    def _get_import_script_name(self) -> str:
        """
        Returns the name of the RDF admin import script.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return "rdf-import-call.sh"

    def _get_default_import_call_bin_prefix(self):
        """
        Method to provide the default string for the import call bin prefix.

        Returns:
            str: The default location for the RDF admin import location
        """
        return "bin/"

    def _is_rdf_format_supported(self, rdf_format: str) -> bool:
        """
        Function to check if the specified RDF format is supported.

        On success, the file extension is derived from ``self.rdf_format``
        and stored in ``self.extension``. The alias ``"ttl"`` is accepted
        and normalised to ``"turtle"`` in ``self.rdf_format``; Turtle files
        always get the ``.ttl`` extension.

        Args:
            rdf_format (str): The RDF format to check.

        Returns:
            bool: Returns True if rdf format supported, False otherwise.
        """
        supported_formats = [
            "xml",
            "n3",
            "turtle",
            "ttl",
            "nt",
            "pretty-xml",
            "trix",
            "trig",
            "nquads",
            "json-ld",
        ]
        if rdf_format not in supported_formats:
            logger.error(
                f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
                + ", ".join(f'"{fmt}"' for fmt in supported_formats),
            )
            return False

        # "turtle" is the serialiser name used here, while the preferred
        # file extension is always ".ttl"; "ttl" is accepted as an alias.
        if self.rdf_format in ("turtle", "ttl"):
            self.rdf_format = "turtle"
            self.extension = "ttl"
        else:
            self.extension = self.rdf_format
        return True

    def _write_single_edge_list_to_file(
        self,
        edge_list: list,
        label: str,
        prop_dict: dict,
    ):
        """
        This function takes one list of biocypher edges and writes them
        to an RDF file with the given format.

        Each edge becomes a resource in the ``biocypher`` namespace named
        after the edge id (or the concatenation of source and target id if
        the edge has no id).

        Args:
            edge_list (list): list of BioCypherEdges to be written
            label (str): the label (type) of the edge; used for the file name
            prop_dict (dict): properties of the edge class passed from
                parsing function and their types (not used by this writer)

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        if not all(isinstance(edge, BioCypherEdge) for edge in edge_list):
            logger.error("Edges must be passed as type BioCypherEdge.")
            return False

        # translate label to PascalCase and derive the output file name
        label_pascal = self.translator.name_sentence_to_pascal(label)
        file_name = os.path.join(
            self.outdir, f"{label_pascal}.{self.extension}"
        )

        # write data in graph
        graph = Graph()
        self._init_namespaces(graph)
        biocypher = self.rdf_namespaces["biocypher"]

        for edge in edge_list:
            rdf_subject = edge.get_source_id()
            rdf_object = edge.get_target_id()
            rdf_predicate = edge.get_id()
            rdf_properties = edge.get_properties()

            # edges without an id are identified by source + target
            if rdf_predicate is None:
                rdf_predicate = rdf_subject + rdf_object

            edge_label = self.translator.name_sentence_to_pascal(
                edge.get_label()
            )
            edge_class_uri = biocypher[edge_label]
            edge_uri = biocypher[rdf_predicate]

            graph.add((edge_class_uri, RDF.type, RDFS.Class))
            graph.add((edge_uri, RDF.type, edge_class_uri))
            graph.add(
                (
                    edge_uri,
                    biocypher["subject"],
                    self.subject_to_uri(rdf_subject),
                )
            )
            graph.add(
                (
                    edge_uri,
                    biocypher["object"],
                    self.subject_to_uri(rdf_object),
                )
            )

            # add properties to the transformed edge --> node
            # NOTE(review): properties are attached via subject_to_uri(),
            # which may resolve a "prefix:id" edge id to another namespace
            # than the biocypher one used above -- confirm this is intended.
            for key, value in rdf_properties.items():
                # only write value if it is truthy (0, False, "" and empty
                # collections are skipped as well)
                if value:
                    self.add_property_to_graph(graph, rdf_predicate, value, key)

        graph.serialize(destination=file_name, format=self.rdf_format)

        logger.info(
            f"Writing {len(edge_list)} entries to {label_pascal}.{self.extension}",
        )

        return True

    def add_property_to_graph(
        self,
        graph: Graph,
        rdf_subject: str,
        rdf_object: str,
        rdf_predicate: str,
    ):
        """
        Function to add a property to an RDF node.

        If the value is a list, one literal per element is added. If the
        value is a string that looks like a list (starts with "[" and ends
        with "]"), it is first converted with `transform_string_to_list`.
        Any other value is added as a single literal.

        Args:
            graph (RDFLib.Graph): The RDF graph to add the triples to.
            rdf_subject (str): The subject of the RDF triple.
            rdf_object (str): The property value (object of the triple);
                may also be a list or any other literal-compatible value.
            rdf_predicate (str): The property name (predicate of the triple).

        Returns:
            None
        """
        if (
            isinstance(rdf_object, str)
            and rdf_object.startswith("[")
            and rdf_object.endswith("]")
        ):
            rdf_object = self.transform_string_to_list(rdf_object)

        values = rdf_object if isinstance(rdf_object, list) else [rdf_object]
        if not values:
            return

        subject_uri = self.subject_to_uri(rdf_subject)
        predicate_uri = self.property_to_uri(rdf_predicate)
        for value in values:
            graph.add((subject_uri, predicate_uri, Literal(value)))

    def transform_string_to_list(self, string_list: str) -> list:
        """
        Function to transform a string representation of a list into a list.

        All square brackets and single quotes are removed and the remainder
        is split on ", ". An empty list representation (e.g. "[]") yields an
        empty list instead of a list holding one empty string.

        Args:
            string_list (str): The string representation of the list.

        Returns:
            list: The list representation of the input string.
        """
        inner = (
            string_list.replace("[", "").replace("]", "").replace("'", "")
        )
        if not inner.strip():
            return []
        return inner.split(", ")

    def _write_single_node_list_to_file(
        self,
        node_list: list,
        label: str,
        prop_dict: dict,
        labels: str,
    ):
        """
        This function takes a list of BioCypherNodes and writes them
        to an RDF file in the specified format.

        Args:
            node_list (list): A list of BioCypherNodes to be written.
            label (str): The label (type) of the nodes; used for the file name.
            prop_dict (dict): A dictionary of properties and their types for
                the node class (not used by this writer).
            labels (str): Additional labels of the node class (not used by
                this writer).

        Returns:
            bool: True if the writing is successful, False otherwise.
        """
        if not all(isinstance(node, BioCypherNode) for node in node_list):
            logger.error("Nodes must be passed as type BioCypherNode.")
            return False

        # translate label to PascalCase and derive the output file name
        label_pascal = self.translator.name_sentence_to_pascal(label)
        file_name = os.path.join(
            self.outdir, f"{label_pascal}.{self.extension}"
        )

        # write data in graph
        graph = Graph()
        self._init_namespaces(graph)
        biocypher = self.rdf_namespaces["biocypher"]

        for node in node_list:
            rdf_subject = node.get_id()
            rdf_object = node.get_label()
            properties = node.get_properties()

            class_name = self.translator.name_sentence_to_pascal(rdf_object)
            class_uri = biocypher[class_name]

            graph.add((class_uri, RDF.type, RDFS.Class))
            graph.add((self.subject_to_uri(rdf_subject), RDF.type, class_uri))

            for key, value in properties.items():
                # only write value if it is truthy (0, False, "" and empty
                # collections are skipped as well)
                if value:
                    self.add_property_to_graph(graph, rdf_subject, value, key)

        graph.serialize(destination=file_name, format=self.rdf_format)

        logger.info(
            f"Writing {len(node_list)} entries to {label_pascal}.{self.extension}",
        )

        return True

    def write_nodes(
        self, nodes, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """
        Wrapper for writing nodes in RDF format. It checks the configured
        RDF format and then calls the _write_node_data() function.

        Args:
            nodes (list or generator): A list or generator of nodes in
                BioCypherNode format.
            batch_size (int): The number of nodes to write in each batch.
            force (bool): Passed through to _write_node_data().

        Returns:
            bool: True if the writing is successful, False otherwise.
        """
        # check if specified output format is correct
        if not self._is_rdf_format_supported(self.rdf_format):
            logger.error("Error while writing node data, wrong RDF format")
            return False

        # write node data using _write_node_data method
        if not self._write_node_data(nodes, batch_size, force):
            logger.error("Error while writing node data.")
            return False

        return True

    def write_edges(
        self,
        edges: Union[list, GeneratorType],
        batch_size: int = int(1e6),
    ) -> bool:
        """
        Wrapper for writing edges in RDF format. It checks the configured
        RDF format and then calls the _write_edge_data() function.

        Args:
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The number of edges to write in each batch.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # check if specified output format is correct
        if not self._is_rdf_format_supported(self.rdf_format):
            logger.error("Error while writing edge data, wrong RDF format")
            return False

        # write edge data using _write_edge_data method
        if not self._write_edge_data(edges, batch_size=batch_size):
            logger.error("Error while writing edge data.")
            return False

        return True

    def _construct_import_call(self) -> str:
        """
        Function to write the import call.
        This function is not applicable for RDF.

        Returns:
            str: Always an empty string, since RDF needs no import call.
        """
        return ""

    def _write_array_string(self, string_list):
        """
        Abstract method to write the string representation of an array.
        This function is not applicable for RDF.

        Args:
            string_list (list): list of ontology strings

        Returns:
            bool: Always True; the argument is ignored.
        """
        return True

    def _write_node_headers(self):
        """
        Abstract method that takes care of importing properties of a graph
        entity that is represented as a node as per the definition in the
        `schema_config.yaml`.
        This function is not applicable for RDF.

        Returns:
            bool: The return value. Always True.
        """
        return True

    def _write_edge_headers(self):
        """
        Abstract method to write a database import-file for a graph entity
        that is represented as an edge as per the definition in the
        `schema_config.yaml`, containing only the header for this type of
        edge.
        This function is not applicable for RDF.

        Returns:
            bool: The return value. Always True.
        """
        return True

    def subject_to_uri(self, subject: str) -> str:
        """
        Converts the subject to a proper URI using the available namespaces.

        A subject of the form "prefix:id" whose prefix is a known namespace
        is resolved in that namespace. In every other case (no colon, more
        than one colon, unknown prefix) the whole subject is placed in the
        biocypher namespace.

        Args:
            subject (str): The subject to be converted to a URI.

        Returns:
            str: The corresponding URI for the subject.
        """
        try:
            # raises ValueError unless there is exactly one ":" in subject
            _pref, _id = subject.split(":")

            if _pref in self.rdf_namespaces:
                return self.rdf_namespaces[_pref][_id]
            return self.rdf_namespaces["biocypher"][subject]
        except ValueError:
            return self.rdf_namespaces["biocypher"][subject]

    def property_to_uri(self, property_name: str) -> str:
        """
        Converts a property name to its corresponding URI.

        The namespaces are searched in this order: the rdflib core
        namespaces (`_NAMESPACE_PREFIXES_CORE`), then SKOS, DC and DCTERMS,
        then the remaining rdflib namespaces (`_NAMESPACE_PREFIXES_RDFLIB`).
        The British spelling "licence" is retried as "license". If nothing
        matches, the property is placed in the biocypher namespace.

        Args:
            property_name (str): The property name to be converted to a URI.

        Returns:
            str: The corresponding URI for the input property name.
        """
        candidate_namespaces = (
            *_NAMESPACE_PREFIXES_CORE.values(),
            SKOS,
            DC,
            DCTERMS,
            *_NAMESPACE_PREFIXES_RDFLIB.values(),
        )
        for namespace in candidate_namespaces:
            if property_name in namespace:
                return namespace[property_name]

        # "licence" is not found above; retry with the spelling "license"
        if property_name == "licence":
            return self.property_to_uri("license")

        # TODO: add an option to search trough manually implemented namespaces

        # Fall back to the biocypher namespace.
        # TODO: give a warning and try to prevent this option altogether
        return self.rdf_namespaces["biocypher"][property_name]

    def _init_namespaces(self, graph: Graph):
        """
        Initializes the namespaces for the RDF graph. These namespaces are
        used to convert nodes to URIs.

        The biocypher standard namespace is merged into
        `self.rdf_namespaces` (overriding an existing "biocypher" entry),
        every entry is converted to an rdflib `Namespace`, and each
        namespace is bound to its prefix on the graph.

        Args:
            graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

        Returns:
            None
        """
        # add biocypher standard to self.rdf_namespaces
        biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}

        merged = dict(self.rdf_namespaces or {})
        merged.update(biocypher_standard)

        self.rdf_namespaces = {
            prefix: Namespace(uri) for prefix, uri in merged.items()
        }
        for prefix, namespace in self.rdf_namespaces.items():
            graph.bind(prefix, namespace)