first release

This commit is contained in:
2025-08-22 11:52:43 +02:00
commit ec27c71148
23 changed files with 1543 additions and 0 deletions

40
src/moi/README.md Normal file
View File

@@ -0,0 +1,40 @@
# MOI: MeSH Ontology Importer for Neo4j
MOI imports a MeSH ontology file into a Neo4j graph database and performs postprocessing for consistent labeling and property handling.
## Quickstart
Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `moi.conf`:
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Start the program by providing the location of your configuration file and the folder containing the ontology files:
```sh
python3 src/moi/moi.py --conf ~/moi.conf --files ~/mesh_files
```
## Requirements
* make sure python3 is installed
* install the required libraries with `pip install -r requirements.txt`
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the Neosemantics (n10s) and APOC plugins installed
* ontology files must be provided in one of the supported formats: `.owl`, `.xrdf`, `.ttl`, `.nt`
* create the configuration file as described in the Quickstart section
## General structure of the repository
The repository is built around two scripts:
`methods_moi.py` contains helper functions for:
* creating the Neo4j graph configuration (constraints, n10s settings)
* importing ontology files into Neo4j
* postprocessing imported MeSH nodes (renaming labels, flattening properties)
`moi.py` is the main script. It:
* loads configuration
* iterates over ontology files
* runs the import and postprocessing steps

101
src/moi/methods_moi.py Normal file
View File

@@ -0,0 +1,101 @@
# This script contains methods for moi.py, initial draft from the work of Jana Cyrus
import os
import re
from pathlib import Path
# create_graph_config defines the graph configuration needed for Neosemantics and creates the graph configuration if it does not exist, if a graph configuration already exists it is expected to be the same as defined here
def create_graph_config(graph):
    """Ensure the Neosemantics (n10s) prerequisites exist in the database.

    The unique-URI constraint is (re)asserted on every call: the statement
    uses ``IF NOT EXISTS``, so repeating it is harmless, and it repairs a
    database whose graph configuration is present but whose constraint was
    dropped. The graph configuration itself is only initialised when no
    ``_GraphConfig`` node exists; an existing configuration is left untouched
    and is expected to match the settings defined here.

    Args:
        graph: Object exposing ``run(query)`` whose result offers
            ``single()`` (for example a neo4j session).
    """
    create_constraint = '''
    CREATE CONSTRAINT n10s_unique_uri IF NOT EXISTS
    FOR (r:Resource) REQUIRE r.uri IS UNIQUE;
    '''
    check_config = 'MATCH (n:_GraphConfig) RETURN count(n) AS config_count;'
    create_config = '''
    CALL n10s.graphconfig.init({
        subClassOfRel: "subClassOf",
        handleVocabUris: "MAP",
        handleMultival: "ARRAY"
    });
    '''

    # Idempotent, so it does not need to be guarded by the config check.
    graph.run(create_constraint)

    config_count = graph.run(check_config).single()['config_count']
    if config_count == 0:
        graph.run(create_config)
def import_ontology(path, file, graph):
    """Import a single ontology file into Neo4j via ``n10s.rdf.import.fetch``.

    Args:
        path: Directory that contains the ontology file.
        file: File name inside *path*. The extension selects the RDF
            serialisation: ``.owl``/``.xrdf`` -> ``RDF/XML``,
            ``.ttl``/``.nt`` -> ``Turtle``.
        graph: Object exposing ``run(query, **parameters)`` (for example a
            neo4j session).

    Returns:
        None. Files with an unsupported extension are reported on stdout and
        skipped without touching the database.
    """
    # Detect file format
    if re.match(r'.+\.(owl|xrdf)$', file):
        format_type = "RDF/XML"
    elif re.match(r'.+\.(ttl|nt)$', file):
        format_type = "Turtle"
    else:
        print(f"Unsupported file format for: {file}")
        return

    # Build the URI from the absolute path. Plain string concatenation with
    # "file:///" yields "file:////..." for absolute POSIX paths, points at the
    # filesystem root for relative paths and leaves spaces unencoded.
    file_uri = Path(os.path.abspath(os.path.join(path, file))).as_uri()

    import_query = '''
    CALL n10s.rdf.import.fetch($file_uri, $format_type)
    '''
    print(f"Importing ontology: CALL n10s.rdf.import.fetch('{file_uri}', '{format_type}')")
    graph.run(import_query, file_uri=file_uri, format_type=format_type)
# post-processing for MeSH ontology, labels are altered to identify the nodes as MeSH entries and single value properties are extracted from their arrays
def _quote_identifier(name):
    """Return *name* backtick-quoted for use as a Cypher label or property key."""
    return "`" + str(name).replace("`", "``") + "`"


def postprocess_mesh(graph):
    """Relabel MeSH nodes and flatten their single-value array properties.

    Step 1: every label found on nodes whose ``uri`` contains ``/MESH/`` is
    replaced by the same label prefixed with ``Mesh``. The ``Resource`` label
    is always kept as is, and labels that already start with ``Mesh`` are
    skipped so that running the function again does not stack prefixes.

    Step 2: for every property key found on the relabelled nodes (except
    ``uri``), values stored as a one-element array are replaced by that
    single element.

    Args:
        graph: Object exposing ``run(query)`` whose result offers
            ``single()`` (for example a neo4j session).
    """
    # Collect every label used by MeSH nodes. Filtering happens in Python
    # below instead of relying on "Resource" being the first label of a node.
    get_onto_labels = '''
    MATCH (n:Resource) WHERE n.uri CONTAINS "/MESH/"
    UNWIND labels(n) AS label
    RETURN COLLECT(DISTINCT label) AS labels
    '''
    onto_labels = graph.run(get_onto_labels).single()['labels']

    # change label
    for label in onto_labels:
        # "Resource" is matched on by the queries below and must survive;
        # "Mesh*" labels are the output of a previous run.
        if label == "Resource" or label.startswith("Mesh"):
            continue
        old_label = _quote_identifier(label)
        new_label = _quote_identifier(f"Mesh{label}")
        change_label = f'''
        CALL apoc.periodic.iterate(
            "MATCH (n:{old_label}) WHERE n.uri CONTAINS '/MESH/' RETURN n",
            "SET n:{new_label} REMOVE n:{old_label}",
            {{batchSize: 10000, parallel: true}}
        )
        '''
        print(f"Updating label: {change_label}")
        graph.run(change_label)

    # return property keys as list to extract properties from array
    get_property_keys = '''
    MATCH (n:Resource)
    WHERE any(label IN labels(n) WHERE label STARTS WITH "Mesh")
    WITH KEYS(n) AS keys
    UNWIND keys AS key
    RETURN COLLECT(DISTINCT key) AS property_keys
    '''
    property_keys = graph.run(get_property_keys).single()['property_keys']

    # extract single value properties from array
    for key in property_keys:
        if key == "uri":
            continue
        prop = _quote_identifier(key)
        extract_properties = f'''
        CALL apoc.periodic.iterate(
            "MATCH (n:Resource) WHERE any(label IN labels(n) WHERE label STARTS WITH 'Mesh') AND size(n.{prop}) = 1 RETURN n",
            "WITH n UNWIND n.{prop} AS prop SET n.{prop} = prop",
            {{batchSize: 10000, parallel: true}}
        )
        '''
        print(f"Extracting property: {extract_properties}")
        graph.run(extract_properties)

38
src/moi/moi.py Normal file
View File

@@ -0,0 +1,38 @@
# This integrates MeSH into Neo4j, initial draft adapted from the work of Jana Cyrus
import argparse
import configparser
import os

from neo4j import GraphDatabase

# The helper module that ships next to this script is methods_moi.py.
from methods_moi import create_graph_config, import_ontology, postprocess_mesh

# File extensions that import_ontology knows how to load.
SUPPORTED_EXTENSIONS = ('.owl', '.xrdf', '.ttl', '.nt')


def _parse_args(argv=None):
    """Parse the command line: ontology file directory and database config file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    parser.add_argument('-f', '--files', required=True, type=str, help='Directory with ontology files')
    return parser.parse_args(argv)


def _load_connection_settings(conf_path):
    """Read ``uri``, ``username`` and ``password`` from the ``[neo4j]`` section.

    Exits with a readable message when the file cannot be read or an entry is
    missing; ``ConfigParser.read`` silently skips unreadable files, which
    would otherwise surface as a bare ``KeyError``.
    """
    conf_path = os.path.expanduser(conf_path)
    conf_file = configparser.ConfigParser()
    if not conf_file.read(conf_path):
        raise SystemExit(f"Could not read configuration file: {conf_path}")
    try:
        section = conf_file['neo4j']
        return section['uri'], section['username'], section['password']
    except KeyError as err:
        raise SystemExit(f"Missing entry {err} in the [neo4j] section of {conf_path}") from err


def main(argv=None):
    """Import every supported ontology file of a directory and post-process the result.

    Post-processing runs once, after all files have been imported, and is
    skipped when the directory holds no supported file.
    """
    args = _parse_args(argv)
    uri, username, password = _load_connection_settings(args.conf)
    mesh_file_path = args.files

    # List the files before connecting so a wrong directory fails fast.
    ontology_files = sorted(
        file for file in os.listdir(mesh_file_path)
        if file.endswith(SUPPORTED_EXTENSIONS)
    )

    # Context managers close session and driver even when an import fails.
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        with driver.session() as session:
            create_graph_config(session)
            # Process all ontology files in the specified directory
            for file in ontology_files:
                import_ontology(mesh_file_path, file, session)
            if ontology_files:
                postprocess_mesh(session)
            else:
                print(f"No supported ontology files found in: {mesh_file_path}")


if __name__ == "__main__":
    main()