516 lines
18 KiB
Python
516 lines
18 KiB
Python
#!/usr/bin/env python
|
|
|
|
#
|
|
# Copyright 2021, Heidelberg University Clinic
|
|
#
|
|
# File author(s): Loes van den Biggelaar
|
|
# Sebastian Lobentanzer
|
|
#
|
|
# Distributed under MIT licence, see the file `LICENSE`.
|
|
#
|
|
"""
|
|
BioCypher 'offline' module. Handles the writing of node and edge representations
|
|
suitable for import into a DBMS.
|
|
"""
|
|
from types import GeneratorType
|
|
from typing import Union
|
|
import os
|
|
|
|
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
|
|
from rdflib.namespace import (
|
|
_NAMESPACE_PREFIXES_CORE,
|
|
_NAMESPACE_PREFIXES_RDFLIB,
|
|
)
|
|
|
|
from biocypher._create import BioCypherEdge, BioCypherNode
|
|
from biocypher._logger import logger
|
|
from biocypher.output.write._batch_writer import _BatchWriter
|
|
|
|
|
|
class _RDFWriter(_BatchWriter):
|
|
"""
|
|
Class to write BioCypher's property graph into an RDF format using
|
|
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
|
|
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
|
|
is done keeping only the minimum information about nodes and edges,
|
|
skipping all properties.
|
|
"""
|
|
|
|
def _get_import_script_name(self) -> str:
|
|
"""
|
|
Returns the name of the RDF admin import script.
|
|
This function applicable for RDF export.
|
|
|
|
Returns:
|
|
str: The name of the import script (ending in .sh)
|
|
"""
|
|
return "rdf-import-call.sh"
|
|
|
|
def _get_default_import_call_bin_prefix(self):
|
|
"""
|
|
Method to provide the default string for the import call bin prefix.
|
|
|
|
Returns:
|
|
str: The default location for the RDF admin import location
|
|
"""
|
|
return "bin/"
|
|
|
|
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
|
|
"""
|
|
Function to check if the specified RDF format is supported.
|
|
|
|
Args:
|
|
rdf_format (str): The RDF format to check.
|
|
|
|
Returns:
|
|
bool: Returns True if rdf format supported, False otherwise.
|
|
"""
|
|
supported_formats = [
|
|
"xml",
|
|
"n3",
|
|
"turtle",
|
|
"nt",
|
|
"pretty-xml",
|
|
"trix",
|
|
"trig",
|
|
"nquads",
|
|
"json-ld",
|
|
]
|
|
if rdf_format not in supported_formats:
|
|
logger.error(
|
|
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
|
|
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
|
|
)
|
|
return False
|
|
else:
|
|
# RDF graph does not support 'ttl' format, only 'turtle' format. however, the preferred file extension is always '.ttl'
|
|
if self.rdf_format == "turtle":
|
|
self.extension = "ttl"
|
|
elif self.rdf_format == "ttl":
|
|
self.rdf_format = "turtle"
|
|
self.extension = "ttl"
|
|
else:
|
|
self.extension = self.rdf_format
|
|
return True
|
|
|
|
def _write_single_edge_list_to_file(
    self,
    edge_list: list,
    label: str,
    prop_dict: dict,
):
    """
    Write one list of BioCypher edges to a single RDF file.

    Every edge is represented as a resource of its own in the biocypher
    namespace: the resource is typed with the (PascalCase) edge label and
    linked to its source and target through the ``subject`` and ``object``
    terms of the biocypher namespace. Edge properties are attached to
    that resource.

    Args:
        edge_list (list): list of BioCypherEdges to be written

        label (str): the label (type) of the edge; used, in PascalCase,
            as the output file name

        prop_dict (dict): properties of the edge class and their types;
            not used by this method

    Returns:
        bool: True for success, False if any element of ``edge_list`` is
        not a BioCypherEdge.
    """
    if not all(isinstance(n, BioCypherEdge) for n in edge_list):
        logger.error("Edges must be passed as type BioCypherEdge.")
        return False

    # translate label to PascalCase and derive the output file name
    label_pascal = self.translator.name_sentence_to_pascal(label)
    file_name = os.path.join(
        self.outdir, f"{label_pascal}.{self.extension}"
    )

    # collect all triples in a fresh graph
    graph = Graph()
    self._init_namespaces(graph)
    # must be looked up after _init_namespaces, which (re)builds the mapping
    biocypher_ns = self.rdf_namespaces["biocypher"]

    for edge in edge_list:
        rdf_subject = edge.get_source_id()
        rdf_object = edge.get_target_id()
        rdf_predicate = edge.get_id()
        rdf_properties = edge.get_properties()
        # edges without an id of their own are identified by the
        # concatenation of source and target id
        if rdf_predicate is None:
            rdf_predicate = rdf_subject + rdf_object

        edge_label = self.translator.name_sentence_to_pascal(
            edge.get_label()
        )
        edge_uri = biocypher_ns[edge_label]
        edge_node = biocypher_ns[rdf_predicate]

        graph.add((edge_uri, RDF.type, RDFS.Class))
        graph.add((edge_node, RDF.type, edge_uri))
        graph.add(
            (
                edge_node,
                biocypher_ns["subject"],
                self.subject_to_uri(rdf_subject),
            )
        )
        graph.add(
            (
                edge_node,
                biocypher_ns["object"],
                self.subject_to_uri(rdf_object),
            )
        )

        # add properties to the transformed edge --> node
        # NOTE(review): properties are attached via subject_to_uri(), which
        # may resolve a "prefix:id" edge id to a different namespace than
        # the biocypher one used for `edge_node` above - confirm intended.
        for key, value in rdf_properties.items():
            # only write value if it exists (falsy values such as 0 or
            # False are skipped as well)
            if value:
                self.add_property_to_graph(graph, rdf_predicate, value, key)

    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the file name that was actually written (extension, not format)
    logger.info(
        f"Writing {len(edge_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
|
|
|
def add_property_to_graph(
    self,
    graph: Graph,
    rdf_subject: str,
    rdf_object: str,
    rdf_predicate: str,
):
    """
    Attach a property value to a subject as one or more literal triples.

    The value is handled as follows:

    * a list yields one triple per element;
    * a string that starts with ``[`` and ends with ``]`` is first turned
      into a list via ``transform_string_to_list`` and then treated like
      a list;
    * anything else (plain string or other type) yields a single triple.

    Args:
        graph (RDFLib.Graph): The RDF graph the triples are added to.

        rdf_subject (str): The subject of the RDF triple; converted with
            ``subject_to_uri``.

        rdf_object (str): The property value; stored as ``Literal``. A
            list is accepted as well, see above.

        rdf_predicate (str): The property name; converted with
            ``property_to_uri``.

    Returns:
        None
    """
    looks_like_list = (
        isinstance(rdf_object, str)
        and rdf_object.startswith("[")
        and rdf_object.endswith("]")
    )
    if looks_like_list:
        rdf_object = self.transform_string_to_list(rdf_object)

    # a scalar is treated as a one-element sequence
    values = rdf_object if isinstance(rdf_object, list) else [rdf_object]

    for item in values:
        triple = (
            self.subject_to_uri(rdf_subject),
            self.property_to_uri(rdf_predicate),
            Literal(item),
        )
        graph.add(triple)
|
|
|
|
def transform_string_to_list(self, string_list: str) -> list:
    """
    Transform the string representation of a list into a list of strings.

    All ``[``, ``]`` and ``'`` characters are removed and the remainder is
    split on ``", "``. The elements are always returned as strings; no
    type conversion is attempted. Double quotes are not removed.

    Args:
        string_list (str): The string representation of the list, e.g.
            ``"['a', 'b']"``.

    Returns:
        list: The elements of the list as strings. A representation of an
        empty list (``"[]"``) yields an empty list.
    """
    # An empty list must not turn into [""], which callers would write as
    # a spurious empty literal.
    text = string_list.strip()
    if text.startswith("[") and text.endswith("]") and not text[1:-1].strip():
        return []

    # single pass that drops brackets and single quotes
    cleaned = string_list.translate(str.maketrans("", "", "[]'"))
    return cleaned.split(", ")
|
|
|
|
def _write_single_node_list_to_file(
    self,
    node_list: list,
    label: str,
    prop_dict: dict,
    labels: str,
):
    """
    Write one list of BioCypherNodes to a single RDF file.

    For every node, the (PascalCase) node label is declared as an
    ``rdfs:Class`` in the biocypher namespace, the node is typed with that
    class, and its properties are added as literals.

    Args:
        node_list (list): A list of BioCypherNodes to be written.

        label (str): The label (type) of the nodes; used, in PascalCase,
            as the output file name.

        prop_dict (dict): A dictionary of properties and their types for
            the node class; not used by this method.

        labels (str): Not used by this method.

    Returns:
        bool: True if the writing is successful, False if any element of
        ``node_list`` is not a BioCypherNode.
    """
    if not all(isinstance(n, BioCypherNode) for n in node_list):
        logger.error("Nodes must be passed as type BioCypherNode.")
        return False

    # translate label to PascalCase and derive the output file name
    label_pascal = self.translator.name_sentence_to_pascal(label)
    file_name = os.path.join(
        self.outdir, f"{label_pascal}.{self.extension}"
    )

    # collect all triples in a fresh graph
    graph = Graph()
    self._init_namespaces(graph)
    # must be looked up after _init_namespaces, which (re)builds the mapping
    biocypher_ns = self.rdf_namespaces["biocypher"]

    for n in node_list:
        rdf_subject = n.get_id()
        rdf_object = n.get_label()
        properties = n.get_properties()
        class_name = self.translator.name_sentence_to_pascal(rdf_object)
        class_uri = biocypher_ns[class_name]

        graph.add((class_uri, RDF.type, RDFS.Class))
        graph.add((self.subject_to_uri(rdf_subject), RDF.type, class_uri))

        for key, value in properties.items():
            # only write value if it exists (falsy values such as 0 or
            # False are skipped as well)
            if value:
                self.add_property_to_graph(graph, rdf_subject, value, key)

    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the file name that was actually written (extension, not format)
    logger.info(
        f"Writing {len(node_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
|
|
|
def write_nodes(
    self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
    """
    Entry point for writing nodes in RDF format.

    The configured RDF format is validated first; afterwards the actual
    work is delegated to ``_write_node_data``.

    Args:
        nodes (list or generator): A list or generator of nodes in BioCypherNode format.
        batch_size (int): The number of nodes to write in each batch.
        force (bool): Passed through unchanged to ``_write_node_data``.

    Returns:
        bool: True if the writing is successful, False otherwise (the
        reason is logged).
    """
    # refuse to write anything if the output format is not usable
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing node data, wrong RDF format")
        return False

    # hand the nodes over to the batch writing routine
    if self._write_node_data(nodes, batch_size, force):
        return True

    logger.error("Error while writing node data.")
    return False
|
|
|
|
def write_edges(
    self,
    edges: Union[list, GeneratorType],
    batch_size: int = int(1e6),
) -> bool:
    """
    Entry point for writing edges in RDF format.

    The configured RDF format is validated first; afterwards the actual
    work is delegated to ``_write_edge_data``.

    Args:
        edges (BioCypherEdge): a list or generator of edges in
            :py:class:`BioCypherEdge` format
        batch_size (int): The number of edges to write in each batch.

    Returns:
        bool: True for success, False otherwise (the reason is logged).
    """
    # refuse to write anything if the output format is not usable
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing edge data, wrong RDF format")
        return False

    # hand the edges over to the batch writing routine
    if self._write_edge_data(edges, batch_size=batch_size):
        return True

    logger.error("Error while writing edge data.")
    return False
|
|
|
|
def _construct_import_call(self) -> bool:
|
|
"""
|
|
Function to write the import call.
|
|
This function is not applicable for RDF.
|
|
|
|
Returns:
|
|
bool: The return value. True for success, False otherwise.
|
|
"""
|
|
return ""
|
|
|
|
def _write_array_string(self, string_list):
|
|
"""
|
|
Abstract method to write the string representation of an array into a .csv file
|
|
as required by the RDF admin-import.
|
|
This function is not applicable for RDF.
|
|
|
|
Args:
|
|
string_list (list): list of ontology strings
|
|
|
|
Returns:
|
|
str: The string representation of an array for the neo4j admin import
|
|
"""
|
|
|
|
return True
|
|
|
|
def _write_node_headers(self):
|
|
"""
|
|
Abstract method that takes care of importing properties of a graph entity that is represented
|
|
as a node as per the definition in the `schema_config.yaml`
|
|
This function is not applicable for RDF.
|
|
|
|
Returns:
|
|
bool: The return value. True for success, False otherwise.
|
|
"""
|
|
return True
|
|
|
|
def _write_edge_headers(self):
|
|
"""
|
|
Abstract method to write a database import-file for a graph entity that is represented
|
|
as an edge as per the definition in the `schema_config.yaml`,
|
|
containing only the header for this type of edge.
|
|
This function is not applicable for RDF.
|
|
|
|
Returns:
|
|
bool: The return value. True for success, False otherwise.
|
|
"""
|
|
return True
|
|
|
|
def subject_to_uri(self, subject: str) -> str:
    """
    Turn a subject identifier into a URI using the known namespaces.

    An identifier of the exact form ``prefix:id`` (one single colon) whose
    prefix is a key of ``self.rdf_namespaces`` is resolved in that
    namespace. Every other identifier - no colon, several colons, or an
    unknown prefix - is resolved, unsplit, in the biocypher namespace.

    Args:
        subject (str): The subject to be converted to a URI.

    Returns:
        str: The corresponding URI for the subject.
    """
    pieces = subject.split(":")

    # only a clean "prefix:id" pair with a registered prefix is split up
    if len(pieces) == 2 and pieces[0] in self.rdf_namespaces:
        prefix, local_id = pieces
        return self.rdf_namespaces[prefix][local_id]

    # fallback: keep the full identifier and use the biocypher namespace
    return self.rdf_namespaces["biocypher"][subject]
|
|
|
|
def property_to_uri(self, property_name: str) -> str:
    """
    Converts a property name to its corresponding URI.

    The property name is looked up in a fixed order and the first
    namespace that contains it wins:

    1. the rdflib core namespaces (``_NAMESPACE_PREFIXES_CORE``),
    2. SKOS, DC and DCTERMS,
    3. the remaining rdflib namespaces (``_NAMESPACE_PREFIXES_RDFLIB``).

    If nothing matches, the spelling ``licence`` is retried as
    ``license``; any other name is placed in the biocypher namespace.

    Args:
        property_name (str): The property name to be converted to a URI.

    Returns:
        str: The URI term for the input property name (the previous
        ``dict[str, str]`` annotation was incorrect; a single term is
        returned, never a mapping).
    """
    # Same search order as three consecutive loops, expressed as one sequence.
    search_order = (
        *_NAMESPACE_PREFIXES_CORE.values(),
        SKOS,
        DC,
        DCTERMS,
        *_NAMESPACE_PREFIXES_RDFLIB.values(),
    )
    for namespace in search_order:
        if property_name in namespace:
            return namespace[property_name]

    # British spelling: retry with the spelling used by the vocabularies.
    if property_name == "licence":
        return self.property_to_uri("license")

    # TODO: add an option to search through manually implemented namespaces

    # Not found anywhere: fall back to the biocypher namespace.
    # TODO: give a warning and try to prevent this option altogether
    return self.rdf_namespaces["biocypher"][property_name]
|
|
|
|
def _init_namespaces(self, graph: Graph):
    """
    Prepare the namespace mapping and bind every namespace to the graph.

    The biocypher standard namespace is merged into ``self.rdf_namespaces``
    (it takes precedence over an existing ``biocypher`` entry; an empty or
    unset mapping is replaced by the standard alone). Afterwards every
    value of the mapping is wrapped in a ``Namespace`` and bound to
    ``graph`` under its key.

    Args:
        graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

    Returns:
        None
    """
    # start from the configured namespaces (if any) and force the
    # biocypher standard entry on top
    configured = self.rdf_namespaces or {}
    merged = {**configured, "biocypher": "https://biocypher.org/biocypher#"}
    self.rdf_namespaces = merged

    # replace the plain values by Namespace objects and register them;
    # iterating over a snapshot, so the mapping can be updated in the loop
    for prefix, uri in list(merged.items()):
        wrapped = Namespace(uri)
        merged[prefix] = wrapped
        graph.bind(prefix, wrapped)
|