first release

This commit is contained in:
2025-08-22 11:52:43 +02:00
commit ec27c71148
23 changed files with 1543 additions and 0 deletions

40
src/moi/README.md Normal file
View File

@@ -0,0 +1,40 @@
# MOI: MeSH Ontology Importer for Neo4j
MOI imports a MeSH ontology file into a Neo4j graph database and performs postprocessing for consistent labeling and property handling.
## Quickstart
Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `moi.conf`:
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Start the program by providing the location of your configuration file and the folder containing the ontology files:
```sh
python3 src/moi.py --conf ~/moi.conf --files ~/mesh_files
```
## Requirements
* make sure python3 is installed
* install the required libraries with `pip install -r requirements.txt`
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the Neosemantics and APOC plugin installed
* ontology files must be provided in one of the supported formats: `.owl`, `.xrdf`, `.ttl`, `.nt`
* create the configuration file as described in the Quickstart section
## General structure of the repository
The two main scripts are:
`methods_moi.py` contains helper functions for:
* creating the Neo4j graph configuration (constraints, n10s settings)
* importing ontology files into Neo4j
* postprocessing imported MeSH nodes (renaming labels, flattening properties)
`moi.py` is the main script, which:
* loads configuration
* iterates over ontology files
* runs the import and postprocessing steps

101
src/moi/methods_moi.py Normal file
View File

@@ -0,0 +1,101 @@
# This script contains methods for moi.py, initial draft from the work of Jana Cyrus
import re
import os
# create_graph_config defines the graph configuration needed for Neosemantics; it creates the
# configuration only when none exists yet — an already-existing graph configuration is
# expected to match the settings defined here.
def create_graph_config(graph):
    """Ensure the n10s uri uniqueness constraint and the _GraphConfig node exist.

    graph: an open Neo4j session (any object exposing a ``run`` method).
    """
    constraint_stmt = '''
    CREATE CONSTRAINT n10s_unique_uri IF NOT EXISTS
    FOR (r:Resource) REQUIRE r.uri IS UNIQUE;
    '''
    config_probe = 'MATCH (n:_GraphConfig) RETURN count(n) AS config_count;'
    init_stmt = '''
    CALL n10s.graphconfig.init({
      subClassOfRel: "subClassOf",
      handleVocabUris: "MAP",
      handleMultival: "ARRAY"
    });
    '''
    # only initialise when no _GraphConfig node is present yet
    existing = graph.run(config_probe).single()['config_count']
    if not existing:
        graph.run(constraint_stmt)
        graph.run(init_stmt)
def import_ontology(path, file, graph):
    """Import a single ontology file into Neo4j via n10s.rdf.import.fetch.

    path:  directory containing the ontology file
    file:  file name; the extension decides the RDF serialisation
    graph: an open Neo4j session
    """
    # n10s expects a file:// URI with forward slashes, also on Windows
    file_uri = "file:///" + os.path.join(path, file).replace(os.sep, '/')
    # map the file extension to the serialisation name n10s understands
    for pattern, rdf_format in ((r'.+\.(owl|xrdf)$', "RDF/XML"),
                                (r'.+\.(ttl|nt)$', "Turtle")):
        if re.match(pattern, file):
            format_type = rdf_format
            break
    else:
        print(f"Unsupported file format for: {file}")
        return
    import_query = '''
    CALL n10s.rdf.import.fetch($file_uri, $format_type)
    '''
    print(f"Importing ontology: CALL n10s.rdf.import.fetch('{file_uri}', '{format_type}')")
    graph.run(import_query, file_uri=file_uri, format_type=format_type)
# post-processing for the MeSH ontology: labels are altered to identify the nodes as MeSH
# entries and single-value properties are extracted from their arrays
def postprocess_mesh(graph):
    """Rename MeSH labels to 'Mesh<label>' and flatten single-element array properties.

    graph: an open Neo4j session.

    Labels and property keys are interpolated into generated Cypher; they are wrapped
    in backticks so names containing special characters cannot break (or inject into)
    the generated statements.
    """
    # collect all labels carried by MeSH resources (skipping the leading Resource label)
    get_onto_labels = '''
    MATCH (n:Resource) WHERE n.uri CONTAINS "/MESH/"
    UNWIND labels(n)[1..] AS label
    RETURN COLLECT(DISTINCT label) AS labels
    '''
    onto_labels = graph.run(get_onto_labels).single()['labels']
    # rename each label to Mesh<label> in batches via APOC
    for label in onto_labels:
        change_label = f'''
        CALL apoc.periodic.iterate(
          "MATCH (n:`{label}`) WHERE n.uri CONTAINS '/MESH/' RETURN n",
          "SET n:`Mesh{label}` REMOVE n:`{label}`",
          {{batchSize: 10000, parallel: true}}
        )
        '''
        print(f"Updating label: {change_label}")
        graph.run(change_label)
    # collect the distinct property keys present on the renamed Mesh* nodes
    get_property_keys = '''
    MATCH (n:Resource)
    WHERE any(label IN labels(n) WHERE label STARTS WITH "Mesh")
    WITH KEYS(n) AS keys
    UNWIND keys AS key
    RETURN COLLECT(DISTINCT key) AS property_keys
    '''
    property_keys = graph.run(get_property_keys).single()['property_keys']
    # n10s (handleMultival: ARRAY) stores properties as arrays; unwrap those holding
    # exactly one value so they become plain scalar properties
    for key in property_keys:
        if key == "uri":
            # uri is the unique identifier and never array-valued — leave it untouched
            continue
        extract_properties = f'''
        CALL apoc.periodic.iterate(
          "MATCH (n:Resource)
           WHERE any(label IN labels(n) WHERE label STARTS WITH 'Mesh')
           AND size(n.`{key}`) = 1
           RETURN n",
          "WITH n UNWIND n.`{key}` AS prop SET n.`{key}` = prop",
          {{batchSize: 10000, parallel: true}}
        )
        '''
        print(f"Extracting property: {extract_properties}")
        graph.run(extract_properties)

38
src/moi/moi.py Normal file
View File

@@ -0,0 +1,38 @@
# This integrates MeSH into Neo4j, initial draft adapted from the work of Jana Cyrus
from neo4j import GraphDatabase
import argparse
import configparser
import os
# BUG FIX: the helper module is named methods_moi.py (see the README), not methods_mesh —
# the original import raised ModuleNotFoundError on startup.
from methods_moi import create_graph_config, import_ontology, postprocess_mesh

# define parameters - pass ontology file dir and db conf as arguments when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with ontology files')
# parse parameters
args = parser.parse_args()
mesh_file_path = args.files
conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']

if __name__ == "__main__":
    driver = GraphDatabase.driver(uri, auth=(username, password))
    try:
        with driver.session() as session:
            # ensure constraint and n10s graph config are in place before importing
            create_graph_config(session)
            # Process all ontology files in the specified directory
            for file in os.listdir(mesh_file_path):
                if file.endswith(('.owl', '.xrdf', '.ttl', '.nt')):
                    import_ontology(mesh_file_path, file, session)
            # relabel MeSH nodes and flatten single-value array properties
            postprocess_mesh(session)
    finally:
        # close the driver even if an import step fails
        driver.close()

View File

@@ -0,0 +1,32 @@
# postprocessing
`postprocess.py` executes Cypher queries to create mappings between MeSH, UMLS, MDM, and ClinicalTrials.gov data in a Neo4j graph database.
The main script `postprocess.py` runs a set of predefined Cypher queries for:
* mapping MeSH terms to UMLS concepts
* mapping MDM Portal aliases to UMLS concepts
* mapping ClinicalTrials.gov studies to MeSH terms
* mapping ClinicalTrials.gov studies to MDM Portal entries
## Quickstart
Create a configuration file with your Neo4j connection details.
For example, save it as `postprocess.conf`:
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Run the postprocessing by providing the configuration file:
```sh
python3 src/postprocess.py --conf ~/postprocess.conf
```
## Requirements
* make sure `python3` is installed
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the APOC plugin installed
* create the configuration file as described in the [Quickstart](#quickstart) section

View File

@@ -0,0 +1,101 @@
from neo4j import GraphDatabase
import logging
import argparse
import configparser
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# helper for running a single Cypher postprocessing query against a Neo4j instance
def run_cypher_query_for_postprocessing(uri, user, password, cypher_query):
    """Open a short-lived driver and session, execute *cypher_query*, log the outcome.

    Re-raises whatever the Neo4j driver raised after logging the failure.
    """
    try:
        with GraphDatabase.driver(uri, auth=(user, password)) as driver, driver.session() as session:
            session.run(cypher_query)
            logger.info("Postprocessing query executed successfully.")
    except Exception as e:
        logger.error(f"Failed to execute query: {e}")
        raise
# cypher queries for postprocessing
# All constants below are executed one at a time through run_cypher_query_for_postprocessing().

# mapping MeSH terms to UMLS concepts (simple variant)
# BUG FIX: the original query aggregated the cui values with WITH/collect, which dropped the
# binding of m — the final MERGE then referenced an undefined variable and the query could
# never run. UNWINDing the cui array per MeshClass node keeps m in scope.
cypher_mesh_to_umls = """
MATCH (m:MeshClass)
UNWIND m.cui AS mesh_cui
MATCH (a:UMLSconcept {cui: mesh_cui})
MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)
"""

# mapping MeSH terms to UMLS concepts, optimised (batched via APOC) for larger datasets
cypher_mesh_to_umls_optimised = """
CALL apoc.periodic.iterate(
    "MATCH (m:MeshClass) RETURN m",
    "UNWIND m.cui AS mesh_cui
     MATCH (a:UMLSconcept {cui: mesh_cui})
     MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)",
    {batchSize:1000, parallel:false}
)
"""

# mapping MDM Portal aliases to UMLS concepts
cypher_mdm_to_umls = """
MATCH (a:Alias)
WITH a, a.Name as aliasname
MATCH (u:UMLSconcept)
WHERE u.cui = aliasname
MERGE (a)-[:MAPS_TO_UMLS_ENTRY]->(u)
"""

# mapping ClinicalTrials.gov studies to MeSH terms
cypher_ct_to_mesh = """
MATCH (c:MeshClass)
WITH c, c.notation as cnote
MATCH (m:meshes)
WHERE m.id = cnote
MERGE (m)-[:MAPS_TO_MESH_ENTRY]->(c)
"""

# mapping ClinicalTrials.gov studies to MDM Portal entries
# BUG FIX: the original constant contained two statements separated by ';'. A single
# session.run() call only accepts one statement, so the query failed at runtime. Both
# MERGE passes are now wrapped in CALL {} unit subqueries (supported by Neo4j 5),
# keeping everything in one statement.
cypher_ct_to_mdm = """
CALL {
    MATCH (s:Study)
    MATCH (c:ClinicalTrialsEntry)
    WHERE c.str_id = s.OID
    MERGE (s)-[:BELONGS_TO]->(c)
}
CALL {
    MATCH (s:Study)
    MATCH (c:ClinicalTrialsEntry)
    WHERE c.str_id = s.NCT_ID
    MERGE (s)-[:BELONGS_TO]->(c)
}
"""
# define parameters - pass the db conf file as an argument when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
args = parser.parse_args()

# read the Neo4j connection settings from the ini-style configuration file
conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
neo4j_conf = conf_file['neo4j']
uri = neo4j_conf['uri']
username = neo4j_conf['username']
password = neo4j_conf['password']

if __name__ == "__main__":
    # run the postprocessing queries one after another; each call raises on failure
    for query in (cypher_mesh_to_umls_optimised,  # MeSH -> UMLS
                  cypher_ct_to_mesh,              # ClinicalTrials.gov -> MeSH
                  cypher_mdm_to_umls,             # MDM -> UMLS
                  cypher_ct_to_mdm):              # ClinicalTrials.gov -> MDM
        run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=query)

38
src/study2neo4j/README.md Normal file
View File

@@ -0,0 +1,38 @@
# study2neo4j
This repo is for integrating data from ClinicalTrials.gov into Neo4j.
## Quickstart
Create a configuration file, storing your details for the database-connection.
E.g. in your home-directory with the name `study.conf`.
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Start the program by providing the location of your configuration-file and the location of the json files.
```sh
python3 src/run.py --conf ~/study.conf --files ~/Desktop/datasource
```
## Requirements
- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
- download all json files from [ClinicalTrials.gov]( https://clinicaltrials.gov/ ) and place all json files you want to load in one folder
- have a running Neo4j DB (Neo4j version 5)
- create the configuration-file as described in the [Quickstart section](#quickstart)
## Acknowledgements
`ct2neo4j.py` is adapted from the MeDaX pipeline (see [1], [2]) graphCreation, with permission from the authors.
[1] Gebhardt, T., Mazein, I., Michaelis, L., Henkel, R., Lobentanzer, S., Waltemath, D., & Wodke, J. (2025). MeDaX pipeline (1.0.0). Zenodo. https://doi.org/10.5281/zenodo.15229077
[2] Mazein, I., Gebhardt, T., Zinkewitz, F., Michaelis, L., Braun, S., Waltemath, D., Henkel, R., & Wodke, J. A. (2024). MeDaX: A Knowledge Graph on FHIR. In Digital Health and Informatics Innovations for Sustainable Health Care Systems (pp. 367-371). IOS Press.

125
src/study2neo4j/ct2neo4j.py Normal file
View File

@@ -0,0 +1,125 @@
import json
import os
from neo4j import GraphDatabase
# Define a function to add nodes and relationships recursively
def add_nodes_from_dict(tx, parent_node_label, parent_node_str_id, current_dict):
    """Recursively mirror a (nested) ClinicalTrials.gov JSON dict into the graph.

    tx:                  active Neo4j transaction (object exposing ``run``)
    parent_node_label:   label of the node the current dict hangs off
    parent_node_str_id:  synthetic string id of that node (path-like, '_'-joined)
    current_dict:        the JSON (sub-)dictionary to translate

    Mapping rules (as implemented below):
    * dict value                     -> new node labelled after the key, linked via a
                                        same-named relationship, then recursed into
    * "phases"/"conditions"/"keywords" list entries -> one node per entry
    * list of dicts                  -> one node per dict, recursed into
    * list of scalars                -> stored as an array property on the parent node
    * scalar value                   -> property on the parent node
      (except key 'reference', which becomes its own node)

    NOTE(review): node labels, relationship types and property names come straight
    from JSON keys via f-strings — safe only while the keys are well-behaved
    identifiers; verify against the ClinicalTrials.gov schema.
    """
    for key, value in current_dict.items():  # iterate over each key-value pair in dictionary
        if key == "phases":
            # Create a node for each phase
            # NOTE(review): no `continue` here — a "phases" list (of strings) also falls
            # through to the generic list branch below and is additionally stored as an
            # array property on the parent node; confirm this duplication is intended.
            for index, phase in enumerate(value):
                phase_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                tx.run(f"MERGE (n:phase {{str_id: $str_id, name: $phase_name}})",
                       str_id=phase_node_str_id, phase_name=phase)
                tx.run(
                    f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:phase {{str_id: $child_str_id}}) "
                    f"MERGE (a)-[:{key}]->(b)",
                    parent_str_id=parent_node_str_id,
                    child_str_id=phase_node_str_id
                )
        if isinstance(value, dict):  # if value of key is a dict, then create new node:
            # Create a new node for the nested dictionary
            new_node_str_id = f"{parent_node_str_id}_{key}"  # concatenate the parent_node_str_id and key to a new id
            tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=new_node_str_id)  # create node with key as label
            # Create a relationship from the parent node to the new node
            tx.run(f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                   f"MERGE (a)-[:{key}]->(b)",
                   parent_str_id=parent_node_str_id,
                   child_str_id=new_node_str_id)  # create rel betw parent and newly created node
            # Recurse into the nested dictionary
            add_nodes_from_dict(tx, key, new_node_str_id, value)
        elif isinstance(value, list):  # if value of key is a list, then check if list contains dictionary
            if key == "conditions":
                # Create a node for each condition
                for index, condition in enumerate(value):
                    condition_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                    tx.run(f"MERGE (n:condition {{str_id: $str_id, name: $condition_name}})",
                           str_id=condition_node_str_id, condition_name=condition)
                    tx.run(
                        f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:condition {{str_id: $child_str_id}}) "
                        f"MERGE (a)-[:{key}]->(b)",
                        parent_str_id=parent_node_str_id,
                        child_str_id=condition_node_str_id
                    )
            elif key == "keywords":
                # Create a node for each keyword
                for index, keyword in enumerate(value):
                    keyword_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                    tx.run(f"MERGE (n:keyword {{str_id: $str_id, name: $keyword_name}})", str_id=keyword_node_str_id,
                           keyword_name=keyword)
                    tx.run(
                        f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:keyword {{str_id: $child_str_id}}) "
                        f"MERGE (a)-[:{key}]->(b)",
                        parent_str_id=parent_node_str_id,
                        child_str_id=keyword_node_str_id
                    )
            # if list doesn't contain any nested dictionaries, make it a value in the node
            # (note: string lists handled above therefore also end up as an array
            # property on the parent, in addition to their per-entry nodes)
            if not any(isinstance(item, dict) for item in value):
                # If the list contains only primitive values (like strings or numbers),
                # these values are set as properties of the parent node.
                tx.run(f"MATCH (n:{parent_node_label} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=parent_node_str_id, value=value)
            else:  # if list contains dictionaries, then a new node is created for each dict
                # Process each dictionary in the list
                for index, item in enumerate(value):
                    if isinstance(item, dict):
                        item_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                        tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=item_node_str_id)
                        tx.run(
                            f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                            f"MERGE (a)-[:{key}]->(b)", parent_str_id=parent_node_str_id,
                            child_str_id=item_node_str_id)
                        add_nodes_from_dict(tx, key, item_node_str_id, item)
        else:
            # if value is a reference
            if key == 'reference':
                new_node_str_id = f"{parent_node_str_id}_{key}"
                tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=new_node_str_id)
                tx.run(f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                       f"MERGE (a)-[:{key}]->(b)", parent_str_id=parent_node_str_id, child_str_id=new_node_str_id)
                tx.run(f"MATCH (n:{key} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=new_node_str_id, value=value)
            else:
                # For non-dict and non-list values, add them as attributes to the parent node
                tx.run(f"MATCH (n:{parent_node_label} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=parent_node_str_id, value=value)
# Connect to Neo4j and import every ClinicalTrials.gov JSON file in a directory
def create_graph_from_directory(uri, user, password, directory_path):
    """Create one ClinicalTrialsEntry root node per JSON file and load its content.

    uri/user/password: Neo4j connection parameters
    directory_path:    folder containing the downloaded ClinicalTrials.gov *.json files

    Files that fail to parse or import are reported and skipped; the driver is always
    closed, even when an unexpected error escapes the per-file handling.
    """
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        for filename in os.listdir(directory_path):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(directory_path, filename)
            try:
                with open(file_path, 'r') as file:
                    json_data = json.load(file)
                with driver.session() as session:
                    root_node_label = 'ClinicalTrialsEntry'
                    # the NCT id uniquely identifies a study and doubles as the root str_id
                    root_node_str_id = json_data['protocolSection']['identificationModule']['nctId']
                    session.execute_write(
                        lambda tx: tx.run(f"MERGE (n:{root_node_label} {{str_id: $str_id}})", str_id=root_node_str_id))
                    session.execute_write(add_nodes_from_dict, root_node_label, root_node_str_id, json_data)
                # BUG FIX: the messages printed a literal "(unknown)" instead of the file name
                print(f"Successfully imported: {filename}")
            except Exception as e:
                print(f"Failed to import {filename}: {e}")
    finally:
        driver.close()

32
src/study2neo4j/run.py Normal file
View File

@@ -0,0 +1,32 @@
import argparse
import logging
import configparser
from ct2neo4j import create_graph_from_directory

# tool version, logged on startup
STUDY2NEO4J_VERSION: str = "0.1"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info('study2neo4j v' + STUDY2NEO4J_VERSION)

# command line interface: database configuration file and the folder with the JSON input
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with json files')
args = parser.parse_args()
json_file_path = args.files

# read the Neo4j connection details from the ini-style configuration file
conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
neo4j_section = conf_file['neo4j']
uri = neo4j_section['uri']
username = neo4j_section['username']
password = neo4j_section['password']

# start study2neo4j
if __name__ == "__main__":
    create_graph_from_directory(uri=uri, user=username, password=password, directory_path=json_file_path)

52
src/umls2neo4j/README.md Normal file
View File

@@ -0,0 +1,52 @@
# umls2neo4j: UMLS to Neo4j Importer
This Python script parses selected relationships from the UMLS Metathesaurus (`MRREL.RRF` and `MRCONSO.RRF`) and loads them into a Neo4j graph database.
> [!IMPORTANT]
> Requires a UMLS licence!
## Features
- Filters and loads `PAR` (parent) and `CHD` (child) relationships from `MRREL.RRF`
- Loads only preferred English concept names from `MRCONSO.RRF`
## Quickstart
Create a configuration file, storing your details for the database-connection.
E.g. in your home-directory with the name `umls.conf`.
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Start the program by providing the location of your configuration-file and the location of the UMLS-files.
```sh
python3 src/umls2neo4j.py --conf ~/umls.conf --mrconsofiles ~/umls/MRCONSO.RRF --mrrelfiles ~/umls/MRREL.RRF
```
## Requirements
- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
- download the UMLS Metathesaurus files (`MRREL.RRF`, `MRCONSO.RRF`) from the [NLM UMLS download page](https://www.nlm.nih.gov/research/umls/index.html) → requires a UMLS licence
- have a running Neo4j DB (Neo4j version 5), with APOC installed
- create the configuration-file as described in the [Quickstart section](#quickstart)
## Detailed Information
The script will:
1. Load preferred English concept names from `MRCONSO.RRF`
2. Parse allowed relationships from `MRREL.RRF`
3. Insert nodes and relationships into Neo4j using chunked batches
## Customisation
- Adjust `ALLOWED_RELS` in the script to include more relationship types
- Tune `batch_chunk_size` and `apoc_batch_size` for better performance

View File

@@ -0,0 +1,114 @@
import csv
from neo4j import GraphDatabase
# Load concept names from MRCONSO.RRF
# Join CUI2 (MRREL) to CUI (MRCONSO) where TS=P, STT=PF, ISPREF=Y, LAT=ENG
def load_cui_names(file_path):
    """Parse MRCONSO.RRF and return {cui: {name, source_vocab, source_vocab_code}}.

    Only English (LAT=ENG) preferred atoms (TS=P, STT=PF, ISPREF=Y) with an
    unrestricted licence level (SRL=0) are kept; a later qualifying row for the
    same CUI overwrites an earlier one.
    """
    cui_infos = {}
    with open(file_path, "r", encoding="utf-8") as f:
        # BUG FIX: RRF is plain pipe-delimited text, not quoted CSV — disable quote
        # handling so terms containing '"' are not mangled by the csv module.
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            cui = row[0]
            language = row[1]            # LAT: keep ENG only
            term_status = row[2]         # TS=P (preferred)
            stt = row[4]                 # STT=PF (preferred form of term)
            ispref = row[6]              # ISPREF=Y
            source_vocab = row[11]       # SAB: source vocabulary
            source_vocab_code = row[13]  # CODE in the source vocabulary
            name = row[14]               # STR: the concept name
            srl_license = row[15]        # SRL: licence restriction level, keep 0 only
            if language != "ENG":
                continue
            if term_status == 'P' and stt == 'PF' and ispref == 'Y':
                if srl_license == '0':
                    cui_infos[cui] = {
                        "name": name,
                        "source_vocab": source_vocab,
                        "source_vocab_code": source_vocab_code
                    }
    return cui_infos
# Load MRREL.RRF and return filtered concept relationships
def parse_mrrel(file_path, cui_infos, allowed_rels):
    """Parse MRREL.RRF and return relationship dicts for the rels in *allowed_rels*.

    Only rows whose REL is in allowed_rels AND whose two CUIs are both present in
    *cui_infos* (i.e. have a preferred English name) are returned.

    file_path:    path to MRREL.RRF
    cui_infos:    mapping produced by load_cui_names()
    allowed_rels: set of REL codes to keep, e.g. {"PAR", "CHD"}
    """
    mrrelationships = []
    with open(file_path, "r", encoding="utf-8") as f:
        # RRF is plain pipe-delimited text, not quoted CSV (same fix as load_cui_names)
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            cui1 = row[0]
            rel = row[3]            # relationship, e.g. PAR, CHD
            cui2 = row[4]
            rel_attribute = row[7]  # RELA: more specific relationship attribute
            source_vocab = row[10]  # SAB: source vocabulary asserting the relationship
            if rel not in allowed_rels:
                continue
            from_info = cui_infos.get(cui1)
            to_info = cui_infos.get(cui2)
            if from_info and to_info:
                mrrelationships.append({
                    "from": cui1,
                    "from_name": from_info["name"],
                    "from_source_vocab": from_info["source_vocab"],
                    "from_source_vocab_code": from_info["source_vocab_code"],
                    "to": cui2,
                    "to_name": to_info["name"],
                    "to_source_vocab": to_info["source_vocab"],
                    "to_source_vocab_code": to_info["source_vocab_code"],
                    "type": rel,
                    "attribute": rel_attribute,
                    "source": source_vocab
                })
    return mrrelationships
def chunked(data, size):
    """Yield successive slices of *data* with at most *size* elements each."""
    start = 0
    while start < len(data):
        yield data[start:start + size]
        start += size
# Load relationships into Neo4j
def load_into_neo4j(rels, uri, user, password, batch_chunk_size, apoc_batch_size):
    """Write the parsed UMLS relationships into Neo4j in chunked APOC batches.

    rels:             list of relationship dicts from parse_mrrel()
    batch_chunk_size: number of relationships handed to the server per run() call
    apoc_batch_size:  transaction batch size used by apoc.periodic.iterate
    """
    driver = GraphDatabase.driver(uri, auth=(user, password), connection_timeout=60, max_connection_lifetime=3600)
    with driver.session(database="neo4j") as session:
        # create constraint on cui
        # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:UMLSconcept) REQUIRE u.cui IS UNIQUE")
        chunk_count = 0
        for chunk_count, chunk in enumerate(chunked(rels, batch_chunk_size), start=1):
            print(f"Inserting chunk {chunk_count} with {len(chunk)} relationships...")
            session.run("""
            CALL apoc.periodic.iterate(
                'UNWIND $chunk AS rel RETURN rel',
                '
                MERGE (c1:UMLSconcept {cui: rel.from})
                SET c1.name = rel.from_name, c1.source = rel.from_source_vocab, c1.sourcecode = rel.from_source_vocab_code
                MERGE (c2:UMLSconcept {cui: rel.to})
                SET c2.name = rel.to_name, c2.source = rel.to_source_vocab, c2.sourcecode = rel.to_source_vocab_code
                MERGE (c1)-[r:UMLS_RELATION {type: rel.type}]->(c2)
                SET r.source = rel.source, r.attribute = rel.attribute
                ',
                {
                    batchSize: $apoc_batch_size,
                    parallel: false,
                    params: {chunk: $chunk}
                }
            )
            """, {
                "chunk": chunk,
                "apoc_batch_size": apoc_batch_size
            })
        print(f"All {chunk_count} batches inserted")
    driver.close()

View File

@@ -0,0 +1,42 @@
import csv
from neo4j import GraphDatabase
import argparse
import configparser
from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j

# command line interface: database configuration and the two UMLS input files
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
# Path to MRCONSO.RRF file
parser.add_argument('-fc', '--mrconsofiles', required=True, type=str, help='Path to UMLS MRCONSO.RRF file')
# Path to MRREL.RRF file
parser.add_argument('-fr', '--mrrelfiles', required=True, type=str, help='Path to UMLS MRREL.RRF file')
args = parser.parse_args()
mrconso_file = args.mrconsofiles
mrrel_file = args.mrrelfiles

# Neo4j connection details from the ini-style configuration file
conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
neo4j_section = conf_file['neo4j']
uri = neo4j_section['uri']
username = neo4j_section['username']
password = neo4j_section['password']

# Define relationships of interest
ALLOWED_RELS = {"PAR", "CHD"}

# Run everything
if __name__ == "__main__":
    print("Loading CUI information...")
    name_map = load_cui_names(file_path=mrconso_file)
    print(f"Loaded information about {len(name_map)} CUIs")
    print("Parsing relationships...")
    relationships = parse_mrrel(file_path=mrrel_file, cui_infos=name_map, allowed_rels=ALLOWED_RELS)
    print(f"Parsed {len(relationships)} relationships")
    print("Loading into Neo4j...")
    load_into_neo4j(rels=relationships, uri=uri, user=username, password=password, batch_chunk_size=5000, apoc_batch_size=1000)
    print("Finished loading into Neo4j")