release commit
This commit is contained in:
515
biocypher/output/write/graph/_rdf.py
Normal file
515
biocypher/output/write/graph/_rdf.py
Normal file
@ -0,0 +1,515 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# Copyright 2021, Heidelberg University Clinic
|
||||
#
|
||||
# File author(s): Loes van den Biggelaar
|
||||
# Sebastian Lobentanzer
|
||||
#
|
||||
# Distributed under MIT licence, see the file `LICENSE`.
|
||||
#
|
||||
"""
|
||||
BioCypher 'offline' module. Handles the writing of node and edge representations
|
||||
suitable for import into a DBMS.
|
||||
"""
|
||||
from types import GeneratorType
|
||||
from typing import Union
|
||||
import os
|
||||
|
||||
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
|
||||
from rdflib.namespace import (
|
||||
_NAMESPACE_PREFIXES_CORE,
|
||||
_NAMESPACE_PREFIXES_RDFLIB,
|
||||
)
|
||||
|
||||
from biocypher._create import BioCypherEdge, BioCypherNode
|
||||
from biocypher._logger import logger
|
||||
from biocypher.output.write._batch_writer import _BatchWriter
|
||||
|
||||
|
||||
class _RDFWriter(_BatchWriter):
|
||||
"""
|
||||
Class to write BioCypher's property graph into an RDF format using
|
||||
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
|
||||
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
|
||||
is done keeping only the minimum information about node and edges,
|
||||
skipping all properties.
|
||||
"""
|
||||
|
||||
def _get_import_script_name(self) -> str:
|
||||
"""
|
||||
Returns the name of the RDF admin import script.
|
||||
This function applicable for RDF export.
|
||||
|
||||
Returns:
|
||||
str: The name of the import script (ending in .sh)
|
||||
"""
|
||||
return "rdf-import-call.sh"
|
||||
|
||||
def _get_default_import_call_bin_prefix(self):
|
||||
"""
|
||||
Method to provide the default string for the import call bin prefix.
|
||||
|
||||
Returns:
|
||||
str: The default location for the RDF admin import location
|
||||
"""
|
||||
return "bin/"
|
||||
|
||||
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
|
||||
"""
|
||||
Function to check if the specified RDF format is supported.
|
||||
|
||||
Args:
|
||||
rdf_format (str): The RDF format to check.
|
||||
|
||||
Returns:
|
||||
bool: Returns True if rdf format supported, False otherwise.
|
||||
"""
|
||||
supported_formats = [
|
||||
"xml",
|
||||
"n3",
|
||||
"turtle",
|
||||
"nt",
|
||||
"pretty-xml",
|
||||
"trix",
|
||||
"trig",
|
||||
"nquads",
|
||||
"json-ld",
|
||||
]
|
||||
if rdf_format not in supported_formats:
|
||||
logger.error(
|
||||
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
|
||||
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
|
||||
)
|
||||
return False
|
||||
else:
|
||||
# RDF graph does not support 'ttl' format, only 'turtle' format. however, the preferred file extension is always '.ttl'
|
||||
if self.rdf_format == "turtle":
|
||||
self.extension = "ttl"
|
||||
elif self.rdf_format == "ttl":
|
||||
self.rdf_format = "turtle"
|
||||
self.extension = "ttl"
|
||||
else:
|
||||
self.extension = self.rdf_format
|
||||
return True
|
||||
|
||||
def _write_single_edge_list_to_file(
    self,
    edge_list: list,
    label: str,
    prop_dict: dict,
):
    """
    Write one list of BioCypher edges to an RDF file in the configured format.

    Each edge is written as a resource of its own: the resource is typed
    with the edge's class and linked to its source and target through the
    ``biocypher:subject`` and ``biocypher:object`` predicates. Edge
    properties are attached to that resource.

    Args:
        edge_list (list): list of BioCypherEdges to be written

        label (str): the label (type) of the edge; used for the file name

        prop_dict (dict): properties of the edge class passed from parsing
            function and their types (not used by this method)

    Returns:
        bool: The return value. True for success, False otherwise.
    """
    if not all(isinstance(edge, BioCypherEdge) for edge in edge_list):
        logger.error("Edges must be passed as type BioCypherEdge.")
        return False

    # translate label to PascalCase
    label_pascal = self.translator.name_sentence_to_pascal(label)

    # create file name
    file_name = os.path.join(
        self.outdir, f"{label_pascal}.{self.extension}"
    )

    # write data in graph
    graph = Graph()
    self._init_namespaces(graph)
    biocypher_ns = self.rdf_namespaces["biocypher"]

    for edge in edge_list:
        rdf_subject = edge.get_source_id()
        rdf_object = edge.get_target_id()
        rdf_predicate = edge.get_id()
        rdf_properties = edge.get_properties()
        # edges without an id of their own are identified by the
        # concatenation of their source and target ids
        if rdf_predicate is None:
            rdf_predicate = rdf_subject + rdf_object

        edge_label = self.translator.name_sentence_to_pascal(
            edge.get_label()
        )
        edge_uri = biocypher_ns[edge_label]
        relation_uri = biocypher_ns[rdf_predicate]

        graph.add((edge_uri, RDF.type, RDFS.Class))
        graph.add((relation_uri, RDF.type, edge_uri))
        graph.add(
            (
                relation_uri,
                biocypher_ns["subject"],
                self.subject_to_uri(rdf_subject),
            )
        )
        graph.add(
            (
                relation_uri,
                biocypher_ns["object"],
                self.subject_to_uri(rdf_object),
            )
        )

        # add properties to the transformed edge --> node
        # NOTE(review): add_property_to_graph resolves the subject through
        # subject_to_uri, which may pick a different namespace than the
        # biocypher one used for relation_uri above when the edge id has a
        # known prefix - confirm this is intended.
        for key, value in rdf_properties.items():
            # only write value if it exists.
            # NOTE(review): this also skips falsy values such as 0 or
            # False - confirm this is intended.
            if value:
                self.add_property_to_graph(graph, rdf_predicate, value, key)

    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the name of the file that was actually written (the file
    # extension can differ from the format name, e.g. 'ttl' vs 'turtle')
    logger.info(
        f"Writing {len(edge_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
||||
|
||||
def add_property_to_graph(
    self,
    graph: Graph,
    rdf_subject: str,
    rdf_object: str,
    rdf_predicate: str,
):
    """
    Attach a property value to a subject as one or more literal triples.

    The value is first normalised to a sequence of items: a list is used
    as is, a string enclosed in square brackets is converted with
    ``transform_string_to_list``, and anything else (a plain string or a
    non-string value) is treated as a single item. One triple with a
    literal object is then added to the graph per item.

    Args:
        graph (RDFLib.Graph): The RDF graph to add the triples to.

        rdf_subject (str): The subject of the RDF triple.

        rdf_object (str): The object of the RDF triple; may also be a list
            or a string representation of a list.

        rdf_predicate (str): The predicate of the RDF triple.

    Returns:
        None
    """
    if isinstance(rdf_object, list):
        items = rdf_object
    elif (
        isinstance(rdf_object, str)
        and rdf_object.startswith("[")
        and rdf_object.endswith("]")
    ):
        items = self.transform_string_to_list(rdf_object)
    else:
        items = [rdf_object]

    for item in items:
        triple = (
            self.subject_to_uri(rdf_subject),
            self.property_to_uri(rdf_predicate),
            Literal(item),
        )
        graph.add(triple)
|
||||
|
||||
def transform_string_to_list(self, string_list: str) -> list:
    """
    Transform a string representation of a list into a list of strings.

    Only the enclosing square brackets and one pair of matching quotes
    (single or double) around each item are removed, so brackets and
    apostrophes that are part of a value are preserved. Items are
    separated on ``", "``. An empty list representation yields an empty
    list.

    Args:
        string_list (str): The string representation of the list.

    Returns:
        list: The list representation of the input string.
    """
    inner = string_list
    if inner.startswith("[") and inner.endswith("]"):
        inner = inner[1:-1]

    # '[]' must not turn into a list holding one empty string
    if not inner.strip():
        return []

    items = []
    for item in inner.split(", "):
        # drop one pair of surrounding quotes, leave inner quotes untouched
        if len(item) >= 2 and item[0] == item[-1] and item[0] in "'\"":
            item = item[1:-1]
        items.append(item)
    return items
|
||||
|
||||
def _write_single_node_list_to_file(
    self,
    node_list: list,
    label: str,
    prop_dict: dict,
    labels: str,
):
    """
    Write a list of BioCypherNodes to an RDF file in the configured format.

    For every node, its class is declared as an ``rdfs:Class`` in the
    biocypher namespace, the node is typed with that class, and its
    properties are added as literals.

    Args:
        node_list (list): A list of BioCypherNodes to be written.

        label (str): The label (type) of the nodes; used for the file name.

        prop_dict (dict): A dictionary of properties and their types for
            the node class (not used by this method).

        labels (str): Not used by this method.

    Returns:
        bool: True if the writing is successful, False otherwise.
    """
    if not all(isinstance(node, BioCypherNode) for node in node_list):
        logger.error("Nodes must be passed as type BioCypherNode.")
        return False

    # translate label to PascalCase
    label_pascal = self.translator.name_sentence_to_pascal(label)

    # create file name
    file_name = os.path.join(
        self.outdir, f"{label_pascal}.{self.extension}"
    )

    # write data in graph
    graph = Graph()
    self._init_namespaces(graph)
    biocypher_ns = self.rdf_namespaces["biocypher"]

    for node in node_list:
        rdf_subject = node.get_id()
        properties = node.get_properties()
        class_name = self.translator.name_sentence_to_pascal(
            node.get_label()
        )
        class_uri = biocypher_ns[class_name]

        graph.add((class_uri, RDF.type, RDFS.Class))
        graph.add((self.subject_to_uri(rdf_subject), RDF.type, class_uri))

        for key, value in properties.items():
            # only write value if it exists.
            # NOTE(review): this also skips falsy values such as 0 or
            # False - confirm this is intended.
            if value:
                self.add_property_to_graph(graph, rdf_subject, value, key)

    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the name of the file that was actually written (the file
    # extension can differ from the format name, e.g. 'ttl' vs 'turtle')
    logger.info(
        f"Writing {len(node_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
||||
|
||||
def write_nodes(
    self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
    """
    Validate the configured RDF format, then delegate node writing.

    The actual writing is done by ``_write_node_data``; this wrapper only
    checks the format first and logs an error if either step fails.

    Args:
        nodes (list or generator): A list or generator of nodes in BioCypherNode format.
        batch_size (int): Passed through to ``_write_node_data``.
        force (bool): Passed through to ``_write_node_data``.

    Returns:
        bool: True if the writing is successful, False otherwise.
    """
    # refuse to write anything when the output format is not usable
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing node data, wrong RDF format")
        return False

    if self._write_node_data(nodes, batch_size, force):
        return True

    logger.error("Error while writing node data.")
    return False
|
||||
|
||||
def write_edges(
    self,
    edges: Union[list, GeneratorType],
    batch_size: int = int(1e6),
) -> bool:
    """
    Validate the configured RDF format, then delegate edge writing.

    The actual writing is done by ``_write_edge_data``; this wrapper only
    checks the format first and logs an error if either step fails.

    Args:
        edges (BioCypherEdge): a list or generator of edges in
            :py:class:`BioCypherEdge` format
        batch_size (int): Passed through to ``_write_edge_data``.

    Returns:
        bool: The return value. True for success, False otherwise.
    """
    # refuse to write anything when the output format is not usable
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing edge data, wrong RDF format")
        return False

    if self._write_edge_data(edges, batch_size=batch_size):
        return True

    logger.error("Error while writing edge data.")
    return False
|
||||
|
||||
def _construct_import_call(self) -> bool:
|
||||
"""
|
||||
Function to write the import call.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
return ""
|
||||
|
||||
def _write_array_string(self, string_list):
|
||||
"""
|
||||
Abstract method to write the string representation of an array into a .csv file
|
||||
as required by the RDF admin-import.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Args:
|
||||
string_list (list): list of ontology strings
|
||||
|
||||
Returns:
|
||||
str: The string representation of an array for the neo4j admin import
|
||||
"""
|
||||
|
||||
return True
|
||||
|
||||
def _write_node_headers(self):
|
||||
"""
|
||||
Abstract method that takes care of importing properties of a graph entity that is represented
|
||||
as a node as per the definition in the `schema_config.yaml`
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
def _write_edge_headers(self):
|
||||
"""
|
||||
Abstract method to write a database import-file for a graph entity that is represented
|
||||
as an edge as per the definition in the `schema_config.yaml`,
|
||||
containing only the header for this type of edge.
|
||||
This function is not applicable for RDF.
|
||||
|
||||
Returns:
|
||||
bool: The return value. True for success, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
def subject_to_uri(self, subject: str) -> str:
    """
    Resolve a subject identifier to a URI using the known namespaces.

    A subject of the form ``prefix:id`` whose prefix is a key of
    ``self.rdf_namespaces`` is looked up in that namespace. Every other
    subject (no colon, more than one colon, or an unknown prefix) is
    looked up, unchanged, in the biocypher namespace.

    Args:
        subject (str): The subject to be converted to a URI.

    Returns:
        str: The corresponding URI for the subject.
    """
    try:
        # unpacking fails with ValueError unless there is exactly one colon
        prefix, local_id = subject.split(":")
        if prefix in self.rdf_namespaces:
            return self.rdf_namespaces[prefix][local_id]
    except ValueError:
        # not a simple 'prefix:id' pair, use the default namespace below
        pass

    return self.rdf_namespaces["biocypher"][subject]
|
||||
|
||||
def property_to_uri(self, property_name: str) -> str:
    """
    Converts a property name to its corresponding URI.

    The property name is searched in several groups of namespaces, in
    this order, and the first namespace containing it provides the URI:

    1. the namespaces in rdflib's ``_NAMESPACE_PREFIXES_CORE``,
    2. SKOS, DC and DCTERMS,
    3. the namespaces in rdflib's ``_NAMESPACE_PREFIXES_RDFLIB``.

    If nothing matches and the name is "licence", the search is repeated
    with the spelling "license". Otherwise the name is placed in the
    biocypher namespace.

    Args:
        property_name (str): The property name to be converted to a URI.

    Returns:
        str: The corresponding URI for the input property name.
    """
    search_order = (
        *_NAMESPACE_PREFIXES_CORE.values(),
        SKOS,
        DC,
        DCTERMS,
        *_NAMESPACE_PREFIXES_RDFLIB.values(),
    )
    for namespace in search_order:
        if property_name in namespace:
            return namespace[property_name]

    # retry the British spelling with the American one
    if property_name == "licence":
        return self.property_to_uri("license")

    # TODO: add an option to search trough manually implemented namespaces

    # If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
    # TODO: give a warning and try to prevent this option altogether
    return self.rdf_namespaces["biocypher"][property_name]
|
||||
|
||||
def _init_namespaces(self, graph: Graph):
    """
    Prepare ``self.rdf_namespaces`` and bind every namespace to the graph.

    The biocypher standard namespace is merged into ``self.rdf_namespaces``
    (or becomes its only entry when the attribute is empty); on a key
    clash the biocypher standard entry wins. Every value is then wrapped
    in an rdflib ``Namespace``, stored back under its prefix and bound to
    ``graph`` under that prefix.

    Args:
        graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

    Returns:
        None
    """
    standard = {"biocypher": "https://biocypher.org/biocypher#"}
    configured = self.rdf_namespaces
    self.rdf_namespaces = (configured | standard) if configured else standard

    # only values of existing keys are replaced, so iterating is safe
    for prefix in self.rdf_namespaces:
        wrapped = Namespace(self.rdf_namespaces[prefix])
        self.rdf_namespaces[prefix] = wrapped
        graph.bind(prefix, wrapped)
|
Reference in New Issue
Block a user