first release
40 src/moi/README.md Normal file
@@ -0,0 +1,40 @@
# MOI: MeSH Ontology Importer for Neo4j

MOI imports a MeSH ontology file into a Neo4j graph database and performs postprocessing for consistent labeling and property handling.

## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `moi.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Start the program by providing the location of your configuration file and the folder containing the ontology files:

```sh
python3 src/moi/moi.py --conf ~/moi.conf --files ~/mesh_files
```

## Requirements

* make sure `python3` is installed
* install the required libraries with `pip install -r requirements.txt`
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the Neosemantics (n10s) and APOC plugins installed
* ontology files must be provided in one of the supported formats: `.owl`, `.xrdf`, `.ttl`, `.nt`
* create the configuration file as described in the Quickstart section

## General structure of the repository

The two main scripts are:

`methods_moi.py` – contains helper functions for:
* creating the Neo4j graph configuration (constraints, n10s settings)
* importing ontology files into Neo4j
* postprocessing imported MeSH nodes (renaming labels, flattening properties)

`moi.py` – the main script (see the sketch below), which:
* loads the configuration
* iterates over the ontology files
* runs the import and postprocessing steps
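For orientation, here is a minimal sketch of how `moi.py` wires these helpers together (connection values and the file name are placeholders):

```python
# minimal sketch of the wiring in moi.py; connection values and file name are placeholders
from neo4j import GraphDatabase
from methods_moi import create_graph_config, import_ontology, postprocess_mesh

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "myfancypassword"))
with driver.session() as session:
    create_graph_config(session)                                 # ensure constraint + n10s config
    import_ontology("/path/to/mesh_files", "mesh.nt", session)   # import one ontology file
    postprocess_mesh(session)                                    # relabel nodes, flatten properties
driver.close()
```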
101 src/moi/methods_moi.py Normal file
@@ -0,0 +1,101 @@
# This script contains methods for moi.py, initial draft adapted from the work of Jana Cyrus
import re
import os


# create_graph_config defines the graph configuration needed for Neosemantics and
# creates it if it does not exist. If a graph configuration already exists, it is
# expected to match the one defined here.
def create_graph_config(graph):
    create_constraint = '''
        CREATE CONSTRAINT n10s_unique_uri IF NOT EXISTS
        FOR (r:Resource) REQUIRE r.uri IS UNIQUE;
    '''
    check_config = 'MATCH (n:_GraphConfig) RETURN count(n) AS config_count;'
    create_config = '''
        CALL n10s.graphconfig.init({
            subClassOfRel: "subClassOf",
            handleVocabUris: "MAP",
            handleMultival: "ARRAY"
        });
    '''

    result = graph.run(check_config)
    config_exists = result.single()['config_count']

    if config_exists == 0:
        graph.run(create_constraint)
        graph.run(create_config)


def import_ontology(path, file, graph):
    full_path = os.path.join(path, file).replace(os.sep, '/')
    file_uri = f"file:///{full_path}"

    # Detect the serialisation format from the file extension
    # (.nt is N-Triples, a subset of Turtle, so the Turtle parser handles it)
    if re.match(r'.+\.(owl|xrdf)$', file):
        format_type = "RDF/XML"
    elif re.match(r'.+\.(ttl|nt)$', file):
        format_type = "Turtle"
    else:
        print(f"Unsupported file format for: {file}")
        return

    import_query = '''
        CALL n10s.rdf.import.fetch($file_uri, $format_type)
    '''
    print(f"Importing ontology: CALL n10s.rdf.import.fetch('{file_uri}', '{format_type}')")
    graph.run(import_query, file_uri=file_uri, format_type=format_type)


# Post-processing for the MeSH ontology: labels are altered to identify the nodes
# as MeSH entries, and single-value properties are extracted from their arrays.
def postprocess_mesh(graph):
    # match all labels from MeSH; each label is then changed by adding the
    # ontology name in front of the original label and removing the original label
    get_onto_labels = '''
        MATCH (n:Resource) WHERE n.uri CONTAINS "/MESH/"
        UNWIND labels(n)[1..] AS label
        RETURN COLLECT(DISTINCT label) AS labels
    '''

    labels_result = graph.run(get_onto_labels)
    onto_labels = labels_result.single()['labels']

    # change labels
    for label in onto_labels:
        change_label = f'''
            CALL apoc.periodic.iterate(
                "MATCH (n:{label}) WHERE n.uri CONTAINS '/MESH/' RETURN n",
                "SET n:Mesh{label} REMOVE n:{label}",
                {{batchSize: 10000, parallel: true}}
            )
        '''
        print(f"Updating label: {change_label}")
        graph.run(change_label)

    # collect all property keys so single-value properties can be extracted from their arrays
    get_property_keys = '''
        MATCH (n:Resource)
        WHERE any(label IN labels(n) WHERE label STARTS WITH "Mesh")
        WITH KEYS(n) AS keys
        UNWIND keys AS key
        RETURN COLLECT(DISTINCT key) AS property_keys
    '''
    keys_result = graph.run(get_property_keys)
    property_keys = keys_result.single()['property_keys']

    # extract single-value properties from their arrays
    for key in property_keys:
        if key == "uri":
            continue
        extract_properties = f'''
            CALL apoc.periodic.iterate(
                "MATCH (n:Resource)
                 WHERE any(label IN labels(n) WHERE label STARTS WITH 'Mesh')
                 AND size(n.{key}) = 1
                 RETURN n",
                "WITH n UNWIND n.{key} AS prop SET n.{key} = prop",
                {{batchSize: 10000, parallel: true}}
            )
        '''

        print(f"Extracting property: {extract_properties}")
        graph.run(extract_properties)
38 src/moi/moi.py Normal file
@@ -0,0 +1,38 @@
# This integrates MeSH into Neo4j, initial draft adapted from the work of Jana Cyrus
from neo4j import GraphDatabase
import argparse
import configparser
import os
from methods_moi import create_graph_config, import_ontology, postprocess_mesh


# define parameters - pass ontology file dir and db conf as arguments when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with ontology files')

# parse parameters
args = parser.parse_args()
mesh_file_path = args.files

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']


if __name__ == "__main__":
    driver = GraphDatabase.driver(uri, auth=(username, password))
    with driver.session() as session:
        create_graph_config(session)

        # Process all ontology files in the specified directory
        for file in os.listdir(mesh_file_path):
            if file.endswith(('.owl', '.xrdf', '.ttl', '.nt')):
                import_ontology(mesh_file_path, file, session)

        postprocess_mesh(session)

    driver.close()
32 src/postprocessing/README.md Normal file
@@ -0,0 +1,32 @@
# postprocessing

`postprocess.py` executes Cypher queries to create mappings between MeSH, UMLS, MDM, and ClinicalTrials.gov data in a Neo4j graph database.

It runs a set of predefined Cypher queries (one is sketched after this list) for:
* mapping MeSH terms to UMLS concepts
* mapping MDM Portal aliases to UMLS concepts
* mapping ClinicalTrials.gov studies to MeSH terms
* mapping ClinicalTrials.gov studies to MDM Portal entries
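As an illustration, the MeSH-to-UMLS mapping can be run on its own with a few lines of Python; connection values are placeholders, and the query is the batched variant defined in `postprocess.py`:

```python
# minimal sketch: run the MeSH-to-UMLS mapping query directly
from neo4j import GraphDatabase

query = """
CALL apoc.periodic.iterate(
  'MATCH (m:MeshClass) RETURN m',
  'UNWIND m.cui AS mesh_cui
   MATCH (a:UMLSconcept {cui: mesh_cui})
   MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)',
  {batchSize: 1000, parallel: false}
);
"""

with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "myfancypassword")) as driver:
    with driver.session() as session:
        session.run(query)
```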
## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it as `postprocess.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Run the postprocessing by providing the configuration file:

```sh
python3 src/postprocessing/postprocess.py --conf ~/postprocess.conf
```

## Requirements

* make sure `python3` is installed
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the APOC plugin installed
* create the configuration file as described in the [Quickstart](#quickstart) section
101 src/postprocessing/postprocess.py Normal file
@@ -0,0 +1,101 @@
from neo4j import GraphDatabase
import logging
import argparse
import configparser

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# define a function for running Cypher queries against a Neo4j database
def run_cypher_query_for_postprocessing(uri, user, password, cypher_query):
    try:
        with GraphDatabase.driver(uri, auth=(user, password)) as driver:
            with driver.session() as session:
                session.run(cypher_query)
                logger.info("Postprocessing query executed successfully.")
    except Exception as e:
        logger.error(f"Failed to execute query: {e}")
        raise


# cypher queries for postprocessing

# mapping MeSH terms to UMLS concepts
# (simple unbatched variant, kept for reference; the optimised query below is the one that is run)
cypher_mesh_to_umls = """
MATCH (m:MeshClass)
UNWIND m.cui AS mesh_cui
MATCH (a:UMLSconcept {cui: mesh_cui})
MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)
"""

# mapping MeSH terms to UMLS concepts, optimised for larger datasets
cypher_mesh_to_umls_optimised = """
CALL apoc.periodic.iterate(
    "MATCH (m:MeshClass) RETURN m",
    "UNWIND m.cui AS mesh_cui
     MATCH (a:UMLSconcept {cui: mesh_cui})
     MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)",
    {batchSize:1000, parallel:false}
);
"""

# mapping MDM Portal aliases to UMLS concepts
cypher_mdm_to_umls = """
MATCH (a:Alias)
WITH a, a.Name AS aliasname
MATCH (u:UMLSconcept)
WHERE u.cui = aliasname
MERGE (a)-[:MAPS_TO_UMLS_ENTRY]->(u)
"""

# mapping ClinicalTrials.gov studies to MeSH terms
cypher_ct_to_mesh = """
MATCH (c:MeshClass)
WITH c, c.notation AS cnote
MATCH (m:meshes)
WHERE m.id = cnote
MERGE (m)-[:MAPS_TO_MESH_ENTRY]->(c)
"""

# mapping ClinicalTrials.gov studies to MDM Portal entries
# (two separate statements, since session.run() executes one statement at a time:
# match first by OID, then by NCT_ID)
cypher_ct_to_mdm_by_oid = """
MATCH (s:Study)
MATCH (c:ClinicalTrialsEntry)
WHERE c.str_id = s.OID
MERGE (s)-[:BELONGS_TO]->(c)
"""

cypher_ct_to_mdm_by_nct_id = """
MATCH (s:Study)
MATCH (c:ClinicalTrialsEntry)
WHERE c.str_id = s.NCT_ID
MERGE (s)-[:BELONGS_TO]->(c)
"""


# define parameters - pass the db conf as an argument when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')

# parse parameters
args = parser.parse_args()

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']


if __name__ == "__main__":
    # postprocess mesh to umls
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_mesh_to_umls_optimised)
    # postprocess clinicaltrials.gov to mesh
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_ct_to_mesh)
    # postprocess mdm to umls
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_mdm_to_umls)
    # postprocess clinicaltrials.gov to mdm (by OID, then by NCT_ID)
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_ct_to_mdm_by_oid)
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_ct_to_mdm_by_nct_id)
38 src/study2neo4j/README.md Normal file
@@ -0,0 +1,38 @@
# study2neo4j

This repo is for integrating data from ClinicalTrials.gov into Neo4j.

## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `study.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Start the program by providing the location of your configuration file and the location of the json files:

```sh
python3 src/study2neo4j/run.py --conf ~/study.conf --files ~/Desktop/datasource
```

## Requirements

- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
- download json files from [ClinicalTrials.gov](https://clinicaltrials.gov/) and place all json files you want to load in one folder (see the sketch of the expected record shape below)
- have a running Neo4j DB (Neo4j version 5)
- create the configuration file as described in the [Quickstart section](#quickstart)
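For reference, a minimal sketch of the record shape `ct2neo4j.py` consumes. Only `protocolSection.identificationModule.nctId` is read explicitly (it becomes the `str_id` of the root `ClinicalTrialsEntry` node); the other fields shown are illustrative assumptions:

```python
# abridged, illustrative ClinicalTrials.gov record; field names other than
# protocolSection.identificationModule.nctId are assumptions for illustration
record = {
    "protocolSection": {
        "identificationModule": {"nctId": "NCT01234567"},  # hypothetical ID
        "conditionsModule": {
            "conditions": ["Diabetes Mellitus"],  # each entry becomes a condition node
            "keywords": ["insulin"],              # each entry becomes a keyword node
        },
    }
}
```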
## Acknowledgements

`ct2neo4j.py` is adapted from the graphCreation step of the MeDaX pipeline (see [1], [2]), with permission from the authors.

[1] Gebhardt, T., Mazein, I., Michaelis, L., Henkel, R., Lobentanzer, S., Waltemath, D., & Wodke, J. (2025). MeDaX pipeline (1.0.0). Zenodo. https://doi.org/10.5281/zenodo.15229077

[2] Mazein, I., Gebhardt, T., Zinkewitz, F., Michaelis, L., Braun, S., Waltemath, D., Henkel, R., & Wodke, J. A. (2024). MeDaX: A Knowledge Graph on FHIR. In Digital Health and Informatics Innovations for Sustainable Health Care Systems (pp. 367-371). IOS Press.
125 src/study2neo4j/ct2neo4j.py Normal file
@@ -0,0 +1,125 @@
import json
import os
from neo4j import GraphDatabase


# Add nodes and relationships recursively from a (nested) dictionary
def add_nodes_from_dict(tx, parent_node_label, parent_node_str_id, current_dict):
    for key, value in current_dict.items():  # iterate over each key-value pair in the dictionary
        if key == "phases":
            # Create a node for each phase
            for index, phase in enumerate(value):
                phase_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                tx.run("MERGE (n:phase {str_id: $str_id, name: $phase_name})",
                       str_id=phase_node_str_id, phase_name=phase)
                tx.run(
                    f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:phase {{str_id: $child_str_id}}) "
                    f"MERGE (a)-[:{key}]->(b)",
                    parent_str_id=parent_node_str_id,
                    child_str_id=phase_node_str_id
                )
        if isinstance(value, dict):  # if the value is a dict, create a new node
            # Create a new node for the nested dictionary
            new_node_str_id = f"{parent_node_str_id}_{key}"  # concatenate parent_node_str_id and key to a new id
            tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=new_node_str_id)  # create node with key as label

            # Create a relationship from the parent node to the new node
            tx.run(f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                   f"MERGE (a)-[:{key}]->(b)",
                   parent_str_id=parent_node_str_id,
                   child_str_id=new_node_str_id)  # create relationship between parent and new node

            # Recurse into the nested dictionary
            add_nodes_from_dict(tx, key, new_node_str_id, value)

        elif isinstance(value, list):  # if the value is a list, check whether it contains dictionaries

            if key == "conditions":
                # Create a node for each condition
                for index, condition in enumerate(value):
                    condition_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                    tx.run("MERGE (n:condition {str_id: $str_id, name: $condition_name})",
                           str_id=condition_node_str_id, condition_name=condition)
                    tx.run(
                        f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:condition {{str_id: $child_str_id}}) "
                        f"MERGE (a)-[:{key}]->(b)",
                        parent_str_id=parent_node_str_id,
                        child_str_id=condition_node_str_id
                    )
            elif key == "keywords":
                # Create a node for each keyword
                for index, keyword in enumerate(value):
                    keyword_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                    tx.run("MERGE (n:keyword {str_id: $str_id, name: $keyword_name})",
                           str_id=keyword_node_str_id, keyword_name=keyword)
                    tx.run(
                        f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:keyword {{str_id: $child_str_id}}) "
                        f"MERGE (a)-[:{key}]->(b)",
                        parent_str_id=parent_node_str_id,
                        child_str_id=keyword_node_str_id
                    )

            # If the list contains only primitive values (like strings or numbers),
            # they are set as an array property on the parent node. Note that this
            # also applies to phases/conditions/keywords, which therefore end up
            # both as nodes and as a property of the parent.
            if not any(isinstance(item, dict) for item in value):
                tx.run(f"MATCH (n:{parent_node_label} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=parent_node_str_id, value=value)

            else:  # if the list contains dictionaries, a new node is created for each dict

                # Process each dictionary in the list
                for index, item in enumerate(value):
                    if isinstance(item, dict):
                        item_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                        tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=item_node_str_id)

                        tx.run(
                            f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                            f"MERGE (a)-[:{key}]->(b)", parent_str_id=parent_node_str_id,
                            child_str_id=item_node_str_id)
                        add_nodes_from_dict(tx, key, item_node_str_id, item)

        else:

            # if the value is a reference, store it on a dedicated node
            if key == 'reference':
                new_node_str_id = f"{parent_node_str_id}_{key}"
                tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=new_node_str_id)

                tx.run(f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                       f"MERGE (a)-[:{key}]->(b)", parent_str_id=parent_node_str_id, child_str_id=new_node_str_id)

                tx.run(f"MATCH (n:{key} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=new_node_str_id, value=value)

            else:

                # For non-dict, non-list values, add them as properties of the parent node
                tx.run(f"MATCH (n:{parent_node_label} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=parent_node_str_id, value=value)


# Connect to Neo4j and create the graph from all json files in a directory
def create_graph_from_directory(uri, user, password, directory_path):
    driver = GraphDatabase.driver(uri, auth=(user, password))

    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            try:
                with open(file_path, 'r') as file:
                    json_data = json.load(file)

                with driver.session() as session:
                    root_node_label = 'ClinicalTrialsEntry'
                    root_node_str_id = json_data['protocolSection']['identificationModule']['nctId']
                    session.execute_write(
                        lambda tx: tx.run(f"MERGE (n:{root_node_label} {{str_id: $str_id}})", str_id=root_node_str_id))
                    session.execute_write(add_nodes_from_dict, root_node_label, root_node_str_id, json_data)

                print(f"Successfully imported: {filename}")
            except Exception as e:
                print(f"Failed to import {filename}: {e}")

    driver.close()
32 src/study2neo4j/run.py Normal file
@@ -0,0 +1,32 @@
import argparse
import logging
import configparser
from ct2neo4j import create_graph_from_directory

STUDY2NEO4J_VERSION: str = "0.1"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info(f'study2neo4j v{STUDY2NEO4J_VERSION}')

# define parameters - pass json files dir and db conf as arguments when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with json files')

# parse parameters
args = parser.parse_args()
json_file_path = args.files

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']


# start study2neo4j
if __name__ == "__main__":
    create_graph_from_directory(uri=uri, user=username, password=password, directory_path=json_file_path)
52 src/umls2neo4j/README.md Normal file
@@ -0,0 +1,52 @@
# umls2neo4j: UMLS to Neo4j Importer

This Python script parses selected relationships from the UMLS Metathesaurus (`MRREL.RRF` and `MRCONSO.RRF`) and loads them into a Neo4j graph database.

> [!IMPORTANT]
> Requires a UMLS licence!

## Features

- Filters and loads `PAR` (parent) and `CHD` (child) relationships from `MRREL.RRF`
- Loads only preferred English concept names from `MRCONSO.RRF`

## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `umls.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Start the program by providing the location of your configuration file and the locations of the UMLS files:

```sh
python3 src/umls2neo4j/umls2neo4j.py --conf ~/umls.conf --mrconsofiles ~/umls/MRCONSO.RRF --mrrelfiles ~/umls/MRREL.RRF
```

## Requirements

- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
- download the UMLS Metathesaurus files (`MRREL.RRF`, `MRCONSO.RRF`) from the [NLM UMLS site](https://www.nlm.nih.gov/research/umls/) → requires a UMLS licence
- have a running Neo4j DB (Neo4j version 5), with APOC installed
- create the configuration file as described in the [Quickstart section](#quickstart)

## Detailed information

The script will (see the sketch below):

1. Load preferred English concept names from `MRCONSO.RRF`
2. Parse allowed relationships from `MRREL.RRF`
3. Insert nodes and relationships into Neo4j using chunked batches
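A minimal sketch of these three steps, mirroring `umls2neo4j.py` (file paths and connection values are placeholders):

```python
# minimal pipeline sketch; file paths and connection values are placeholders
from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j

names = load_cui_names(file_path="MRCONSO.RRF")
rels = parse_mrrel(file_path="MRREL.RRF", cui_infos=names, allowed_rels={"PAR", "CHD"})
load_into_neo4j(rels=rels, uri="bolt://localhost:7687", user="neo4j",
                password="myfancypassword", batch_chunk_size=5000, apoc_batch_size=1000)
```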
## Customisation

- Adjust `ALLOWED_RELS` in the script to include more relationship types (see the example below)
- Tune `batch_chunk_size` and `apoc_batch_size` for better performance
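For example, a hypothetical customisation that also imports UMLS broader/narrower relationships:

```python
# hypothetical: import broader (RB) and narrower (RN) relationships
# in addition to the parent/child (PAR/CHD) pairs loaded by default
ALLOWED_RELS = {"PAR", "CHD", "RB", "RN"}
```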
114 src/umls2neo4j/methods_umls2neo4j.py Normal file
@@ -0,0 +1,114 @@
import csv
from neo4j import GraphDatabase


# Load concept information from MRCONSO.RRF
# Join CUI2 (MRREL) to CUI (MRCONSO) where TS=P, STT=PF, ISPREF=Y, LAT=ENG
# Stores name, source vocabulary, and source vocabulary code per CUI
def load_cui_names(file_path):
    cui_infos = {}
    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="|")
        for row in reader:
            cui = row[0]
            language = row[1]  # LAT=ENG
            term_status = row[2]  # TS=P (Preferred)
            stt = row[4]  # STT=PF (Preferred form of term)
            ispref = row[6]  # ISPREF=Y
            source_vocab = row[11]
            source_vocab_code = row[13]
            name = row[14]
            srl_license = row[15]  # should be 0

            if language != "ENG":
                continue

            if term_status == 'P' and stt == 'PF' and ispref == 'Y':
                if srl_license == '0':
                    cui_infos[cui] = {
                        "name": name,
                        "source_vocab": source_vocab,
                        "source_vocab_code": source_vocab_code
                    }
    return cui_infos


# Load MRREL.RRF and return filtered concept relationships
def parse_mrrel(file_path, cui_infos, allowed_rels):
    mrrelationships = []
    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="|")
        for row in reader:
            cui1 = row[0]
            concept_type1 = row[2]  # should be CUI
            rel = row[3]  # relationship, e.g. PAR, CHD
            cui2 = row[4]
            concept_type2 = row[6]  # should be CUI
            rel_attribute = row[7]
            source_vocab = row[10]  # source vocabulary

            if rel in allowed_rels:
                # note: concept_type1 and concept_type2 could additionally be checked to equal 'CUI'
                from_info = cui_infos.get(cui1)
                to_info = cui_infos.get(cui2)

                if from_info and to_info:
                    mrrelationships.append({
                        "from": cui1,
                        "from_name": from_info["name"],
                        "from_source_vocab": from_info["source_vocab"],
                        "from_source_vocab_code": from_info["source_vocab_code"],
                        "to": cui2,
                        "to_name": to_info["name"],
                        "to_source_vocab": to_info["source_vocab"],
                        "to_source_vocab_code": to_info["source_vocab_code"],
                        "type": rel,
                        "attribute": rel_attribute,
                        "source": source_vocab
                    })
    return mrrelationships


# Split a list into consecutive chunks of at most `size` items
def chunked(data, size):
    for i in range(0, len(data), size):
        yield data[i:i + size]


# Load relationships into Neo4j
def load_into_neo4j(rels, uri, user, password, batch_chunk_size, apoc_batch_size):
    driver = GraphDatabase.driver(uri, auth=(user, password), connection_timeout=60, max_connection_lifetime=3600)

    with driver.session(database="neo4j") as session:
        # optionally create a uniqueness constraint on cui:
        # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:UMLSconcept) REQUIRE u.cui IS UNIQUE")

        chunk_count = 0

        for chunk in chunked(rels, batch_chunk_size):
            chunk_count += 1
            print(f"Inserting chunk {chunk_count} with {len(chunk)} relationships...")

            session.run("""
                CALL apoc.periodic.iterate(
                    'UNWIND $chunk AS rel RETURN rel',
                    '
                    MERGE (c1:UMLSconcept {cui: rel.from})
                    SET c1.name = rel.from_name, c1.source = rel.from_source_vocab, c1.sourcecode = rel.from_source_vocab_code
                    MERGE (c2:UMLSconcept {cui: rel.to})
                    SET c2.name = rel.to_name, c2.source = rel.to_source_vocab, c2.sourcecode = rel.to_source_vocab_code
                    MERGE (c1)-[r:UMLS_RELATION {type: rel.type}]->(c2)
                    SET r.source = rel.source, r.attribute = rel.attribute
                    ',
                    {
                        batchSize: $apoc_batch_size,
                        parallel: false,
                        params: {chunk: $chunk}
                    }
                )
            """, {
                "chunk": chunk,
                "apoc_batch_size": apoc_batch_size
            })

    print(f"All {chunk_count} batches inserted")
    driver.close()
42 src/umls2neo4j/umls2neo4j.py Normal file
@@ -0,0 +1,42 @@
import argparse
import configparser
from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j

# define parameters - pass db conf and UMLS file paths as arguments when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
# Path to MRCONSO.RRF file
parser.add_argument('-fc', '--mrconsofiles', required=True, type=str, help='Path to UMLS MRCONSO.RRF file')
# Path to MRREL.RRF file
parser.add_argument('-fr', '--mrrelfiles', required=True, type=str, help='Path to UMLS MRREL.RRF file')

# parse parameters
args = parser.parse_args()
mrconso_file = args.mrconsofiles
mrrel_file = args.mrrelfiles

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']

# Define relationships of interest
ALLOWED_RELS = {"PAR", "CHD"}

# Run everything
if __name__ == "__main__":
    print("Loading CUI information...")
    name_map = load_cui_names(file_path=mrconso_file)
    print(f"Loaded information about {len(name_map)} CUIs")

    print("Parsing relationships...")
    relationships = parse_mrrel(file_path=mrrel_file, cui_infos=name_map, allowed_rels=ALLOWED_RELS)
    print(f"Parsed {len(relationships)} relationships")

    print("Loading into Neo4j...")
    load_into_neo4j(rels=relationships, uri=uri, user=username, password=password, batch_chunk_size=5000, apoc_batch_size=1000)
    print("Finished loading into Neo4j")