first release
This commit is contained in:
40
src/moi/README.md
Normal file
40
src/moi/README.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# MOI: MeSH Ontology Importer for Neo4j
|
||||
MOI imports a MeSH ontology file into a Neo4j graph database and performs postprocessing for consistent labeling and property handling.
|
||||
|
||||
## Quickstart
|
||||
Create a configuration file with your Neo4j connection details.
|
||||
For example, save it in your home directory as `moi.conf`:
|
||||
|
||||
```ini
|
||||
[neo4j]
|
||||
uri = bolt://localhost:7687
|
||||
username = neo4j
|
||||
password = myfancypassword
|
||||
```
|
||||
|
||||
Start the program by providing the location of your configuration file and the folder containing the ontology files:
|
||||
```sh
|
||||
python3 src/moi/moi.py --conf ~/moi.conf --files ~/mesh_files
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
* make sure python3 is installed
|
||||
* install the required libraries with `pip install -r requirements.txt`
|
||||
* have a running Neo4j instance (version 5)
|
||||
* the Neo4j instance must have the Neosemantics (n10s) and APOC plugins installed
|
||||
* ontology files must be provided in one of the supported formats: `.owl`, `.xrdf`, `.ttl`, `.nt`
|
||||
* create the configuration file as described in the Quickstart section
|
||||
|
||||
## General structure of the repository
|
||||
The repository consists of two Python scripts:
|
||||
|
||||
`methods_moi.py` – contains helper functions for:
|
||||
* creating the Neo4j graph configuration (constraints, n10s settings)
|
||||
* importing ontology files into Neo4j
|
||||
* postprocessing imported MeSH nodes (renaming labels, flattening properties)
|
||||
|
||||
`moi.py` – the main script, which:
|
||||
* loads configuration
|
||||
* iterates over ontology files
|
||||
* runs the import and postprocessing steps
|
101
src/moi/methods_moi.py
Normal file
101
src/moi/methods_moi.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# This script contains methods for moi.py, initial draft from the work of Jana Cyrus
|
||||
import re
|
||||
import os
|
||||
|
||||
|
||||
# create_graph_config defines the graph configuration needed for Neosemantics and creates it if it does not exist. If a graph configuration already exists, it is expected to be the same as the one defined here.
|
||||
def create_graph_config(graph):
    """Prepare the database for Neosemantics (n10s) imports.

    Creates the uniqueness constraint on ``Resource.uri`` and, when no
    ``_GraphConfig`` node exists yet, initialises the n10s graph
    configuration. An existing graph configuration is left untouched and is
    expected to match the settings defined here.

    Args:
        graph: Object exposing ``run(query)`` whose result offers
            ``single()`` (moi.py passes a Neo4j driver session).
    """
    create_constraint = '''
        CREATE CONSTRAINT n10s_unique_uri IF NOT EXISTS
        FOR (r:Resource) REQUIRE r.uri IS UNIQUE;
    '''
    check_config = 'MATCH (n:_GraphConfig) RETURN count(n) AS config_count;'
    create_config = '''
        CALL n10s.graphconfig.init({
            subClassOfRel: "subClassOf",
            handleVocabUris: "MAP",
            handleMultival: "ARRAY"
        });
    '''

    # The statement is idempotent (IF NOT EXISTS), so run it on every call:
    # a constraint that went missing after the first setup is restored
    # instead of being skipped because a graph configuration already exists.
    graph.run(create_constraint)

    config_count = graph.run(check_config).single()['config_count']
    if config_count == 0:
        graph.run(create_config)
|
||||
|
||||
|
||||
def import_ontology(path, file, graph):
    """Import one ontology file into Neo4j via ``n10s.rdf.import.fetch``.

    The serialisation format is derived from the file extension:
    ``.owl``/``.xrdf`` are imported as "RDF/XML", ``.ttl``/``.nt`` as
    "Turtle". Files with any other extension are reported and skipped.

    Args:
        path: Directory containing the ontology file.
        file: File name inside ``path``.
        graph: Object exposing ``run(query, **params)`` (moi.py passes a
            Neo4j driver session).

    Returns:
        None.
    """
    # Detect file format
    if re.match(r'.+\.(owl|xrdf)$', file):
        format_type = "RDF/XML"
    elif re.match(r'.+\.(ttl|nt)$', file):
        format_type = "Turtle"
    else:
        print(f"Unsupported file format for: {file}")
        return

    # Build a well-formed file URI: make the path absolute (relative paths
    # cannot be expressed as file:/// URIs) and drop the leading slash of
    # POSIX paths so the result is "file:///home/..." rather than
    # "file:////home/...". Windows paths become "file:///C:/...".
    absolute_path = os.path.abspath(os.path.expanduser(os.path.join(path, file)))
    full_path = absolute_path.replace(os.sep, '/')
    file_uri = f"file:///{full_path.lstrip('/')}"

    import_query = '''
        CALL n10s.rdf.import.fetch($file_uri, $format_type)
    '''
    print(f"Importing ontology: CALL n10s.rdf.import.fetch('{file_uri}', '{format_type}')")
    result = graph.run(import_query, file_uri=file_uri, format_type=format_type)

    # NOTE(review): per the n10s documentation the procedure returns a single
    # row whose terminationStatus is "KO" (with details in extraInfo) when the
    # import fails instead of raising - confirm against the installed version.
    record = result.single()
    if record is not None and record.get('terminationStatus') == 'KO':
        print(f"Import failed for {file}: {record.get('extraInfo')}")
|
||||
|
||||
|
||||
# Post-processing for the MeSH ontology: labels are altered to identify the nodes as MeSH entries, and single-value properties are extracted from their arrays.
|
||||
def _quote_identifier(name):
    """Return ``name`` as a backtick-quoted Cypher identifier."""
    return "`" + name.replace("`", "``") + "`"


def postprocess_mesh(graph):
    """Relabel imported MeSH nodes and flatten single-element array properties.

    Step 1: every label found on ``Resource`` nodes whose ``uri`` contains
    "/MESH/" is replaced by the same label prefixed with "Mesh". The
    ``Resource`` label itself and labels already carrying the prefix are left
    alone, so the function can be re-run without producing "MeshMesh..."
    labels.

    Step 2: for every property key (except ``uri``) found on nodes with a
    "Mesh"-prefixed label, values for which ``size(...) = 1`` are replaced by
    their single element.

    Args:
        graph: Object exposing ``run(query, **params)`` whose result offers
            ``single()`` (moi.py passes a Neo4j driver session).
    """
    batch_query = '''
        CALL apoc.periodic.iterate(
            $match_query,
            $update_query,
            {batchSize: 10000, parallel: true}
        )
    '''

    # match all labels from MeSH; filtering happens below instead of relying
    # on Resource being the first entry of labels(n)
    get_onto_labels = '''
        MATCH (n:Resource) WHERE n.uri CONTAINS "/MESH/"
        UNWIND labels(n) AS label
        RETURN COLLECT(DISTINCT label) AS labels
    '''
    onto_labels = graph.run(get_onto_labels).single()['labels']

    # change labels by adding the ontology name in front of the original
    # label and removing the original label
    for label in onto_labels:
        if label == "Resource" or label.startswith("Mesh"):
            continue
        old_label = _quote_identifier(label)
        new_label = _quote_identifier(f"Mesh{label}")
        match_query = f"MATCH (n:{old_label}) WHERE n.uri CONTAINS '/MESH/' RETURN n"
        update_query = f"SET n:{new_label} REMOVE n:{old_label}"
        print(f"Updating label: {update_query}")
        # The inner statements travel as parameters so they need no escaping
        # for the surrounding Cypher string literal.
        graph.run(batch_query, match_query=match_query, update_query=update_query)

    # return property keys as list to extract properties from array
    get_property_keys = '''
        MATCH (n:Resource)
        WHERE any(label IN labels(n) WHERE label STARTS WITH "Mesh")
        WITH KEYS(n) AS keys
        UNWIND keys AS key
        RETURN COLLECT(DISTINCT key) AS property_keys
    '''
    property_keys = graph.run(get_property_keys).single()['property_keys']

    # extract single value properties from array
    for key in property_keys:
        if key == "uri":
            continue
        prop = f"n.{_quote_identifier(key)}"
        match_query = (
            "MATCH (n:Resource) "
            "WHERE any(label IN labels(n) WHERE label STARTS WITH 'Mesh') "
            f"AND size({prop}) = 1 "
            "RETURN n"
        )
        update_query = f"WITH n UNWIND {prop} AS prop SET {prop} = prop"
        print(f"Extracting property: {update_query}")
        graph.run(batch_query, match_query=match_query, update_query=update_query)
|
38
src/moi/moi.py
Normal file
38
src/moi/moi.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# This integrates MeSH into Neo4j, initial draft adapted from the work of Jana Cyrus
|
||||
import argparse
import configparser
import os

from neo4j import GraphDatabase

# The helper module committed alongside this script is methods_moi.py.
from methods_moi import create_graph_config, import_ontology, postprocess_mesh

# File extensions that are handed to import_ontology.
ONTOLOGY_EXTENSIONS = ('.owl', '.xrdf', '.ttl', '.nt')


def parse_args():
    """Parse the command line: database config file and ontology directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    parser.add_argument('-f', '--files', required=True, type=str,
                        help='Directory with ontology files')
    return parser.parse_args()


def load_connection_settings(conf_path):
    """Return ``(uri, username, password)`` from the ``[neo4j]`` section.

    Exits with a readable message when the file cannot be read or a required
    entry is missing.
    """
    # interpolation=None keeps "%" characters in passwords literal; the
    # default interpolation would raise on them.
    conf_file = configparser.ConfigParser(interpolation=None)
    # ConfigParser.read() silently skips unreadable files and returns the
    # list of files it actually parsed.
    if not conf_file.read(conf_path):
        raise SystemExit(f"Cannot read configuration file: {conf_path}")
    try:
        section = conf_file['neo4j']
        return section['uri'], section['username'], section['password']
    except KeyError as err:
        raise SystemExit(
            f"Missing entry {err} in section [neo4j] of {conf_path}"
        ) from err


def main():
    """Import all ontology files from the given directory and postprocess them."""
    args = parse_args()
    mesh_file_path = args.files
    uri, username, password = load_connection_settings(args.conf)

    driver = GraphDatabase.driver(uri, auth=(username, password))
    try:
        with driver.session() as session:
            create_graph_config(session)

            # Process all ontology files in the specified directory
            # (sorted so the import order is reproducible).
            for file in sorted(os.listdir(mesh_file_path)):
                if file.endswith(ONTOLOGY_EXTENSIONS):
                    import_ontology(mesh_file_path, file, session)

            postprocess_mesh(session)
    finally:
        # Release the connection pool even when an import step fails.
        driver.close()


if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user