first release
40 src/moi/README.md Normal file
@@ -0,0 +1,40 @@
# MOI: MeSH Ontology Importer for Neo4j

MOI imports a MeSH ontology file into a Neo4j graph database and performs postprocessing for consistent labeling and property handling.

## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `moi.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Start the program by providing the location of your configuration file and the folder containing the ontology files:

```sh
python3 src/moi/moi.py --conf ~/moi.conf --files ~/mesh_files
```

## Requirements

* make sure `python3` is installed
* install the required libraries with `pip install -r requirements.txt`
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the Neosemantics (n10s) and APOC plugins installed
* ontology files must be provided in one of the supported formats: `.owl`, `.xrdf`, `.ttl`, `.nt`
* create the configuration file as described in the Quickstart section

## General structure of the repository

The two main scripts are:

`methods_moi.py` – contains helper functions for:
* creating the Neo4j graph configuration (constraints, n10s settings)
* importing ontology files into Neo4j
* postprocessing imported MeSH nodes (renaming labels, flattening properties)

`moi.py` – the main script (see the sketch below), which:
* loads the configuration
* iterates over the ontology files
* runs the import and postprocessing steps
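For orientation, here is a minimal sketch of how `moi.py` wires these helpers together (connection values and the file name are placeholders):

```python
# minimal sketch of the wiring in moi.py; connection values and file name are placeholders
from neo4j import GraphDatabase
from methods_moi import create_graph_config, import_ontology, postprocess_mesh

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "myfancypassword"))
with driver.session() as session:
    create_graph_config(session)                                 # ensure constraint + n10s config
    import_ontology("/path/to/mesh_files", "mesh.nt", session)   # import one ontology file
    postprocess_mesh(session)                                    # relabel nodes, flatten properties
driver.close()
```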
101 src/moi/methods_moi.py Normal file
@@ -0,0 +1,101 @@
# This script contains methods for moi.py, initial draft adapted from the work of Jana Cyrus
import re
import os


# create_graph_config defines the graph configuration needed for Neosemantics and
# creates it if it does not exist. If a graph configuration already exists, it is
# expected to match the one defined here.
def create_graph_config(graph):
    create_constraint = '''
        CREATE CONSTRAINT n10s_unique_uri IF NOT EXISTS
        FOR (r:Resource) REQUIRE r.uri IS UNIQUE;
    '''
    check_config = 'MATCH (n:_GraphConfig) RETURN count(n) AS config_count;'
    create_config = '''
        CALL n10s.graphconfig.init({
            subClassOfRel: "subClassOf",
            handleVocabUris: "MAP",
            handleMultival: "ARRAY"
        });
    '''

    result = graph.run(check_config)
    config_exists = result.single()['config_count']

    if config_exists == 0:
        graph.run(create_constraint)
        graph.run(create_config)


def import_ontology(path, file, graph):
    full_path = os.path.join(path, file).replace(os.sep, '/')
    file_uri = f"file:///{full_path}"

    # Detect the serialisation format from the file extension
    # (.nt is N-Triples, a subset of Turtle, so the Turtle parser handles it)
    if re.match(r'.+\.(owl|xrdf)$', file):
        format_type = "RDF/XML"
    elif re.match(r'.+\.(ttl|nt)$', file):
        format_type = "Turtle"
    else:
        print(f"Unsupported file format for: {file}")
        return

    import_query = '''
        CALL n10s.rdf.import.fetch($file_uri, $format_type)
    '''
    print(f"Importing ontology: CALL n10s.rdf.import.fetch('{file_uri}', '{format_type}')")
    graph.run(import_query, file_uri=file_uri, format_type=format_type)


# Post-processing for the MeSH ontology: labels are altered to identify the nodes
# as MeSH entries, and single-value properties are extracted from their arrays.
def postprocess_mesh(graph):
    # match all labels from MeSH; each label is then changed by adding the
    # ontology name in front of the original label and removing the original label
    get_onto_labels = '''
        MATCH (n:Resource) WHERE n.uri CONTAINS "/MESH/"
        UNWIND labels(n)[1..] AS label
        RETURN COLLECT(DISTINCT label) AS labels
    '''

    labels_result = graph.run(get_onto_labels)
    onto_labels = labels_result.single()['labels']

    # change labels
    for label in onto_labels:
        change_label = f'''
            CALL apoc.periodic.iterate(
                "MATCH (n:{label}) WHERE n.uri CONTAINS '/MESH/' RETURN n",
                "SET n:Mesh{label} REMOVE n:{label}",
                {{batchSize: 10000, parallel: true}}
            )
        '''
        print(f"Updating label: {change_label}")
        graph.run(change_label)

    # collect all property keys so single-value properties can be extracted from their arrays
    get_property_keys = '''
        MATCH (n:Resource)
        WHERE any(label IN labels(n) WHERE label STARTS WITH "Mesh")
        WITH KEYS(n) AS keys
        UNWIND keys AS key
        RETURN COLLECT(DISTINCT key) AS property_keys
    '''
    keys_result = graph.run(get_property_keys)
    property_keys = keys_result.single()['property_keys']

    # extract single-value properties from their arrays
    for key in property_keys:
        if key == "uri":
            continue
        extract_properties = f'''
            CALL apoc.periodic.iterate(
                "MATCH (n:Resource)
                 WHERE any(label IN labels(n) WHERE label STARTS WITH 'Mesh')
                 AND size(n.{key}) = 1
                 RETURN n",
                "WITH n UNWIND n.{key} AS prop SET n.{key} = prop",
                {{batchSize: 10000, parallel: true}}
            )
        '''

        print(f"Extracting property: {extract_properties}")
        graph.run(extract_properties)
38 src/moi/moi.py Normal file
@@ -0,0 +1,38 @@
# This integrates MeSH into Neo4j, initial draft adapted from the work of Jana Cyrus
from neo4j import GraphDatabase
import argparse
import configparser
import os
from methods_moi import create_graph_config, import_ontology, postprocess_mesh


# define parameters - pass ontology file dir and db conf as arguments when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with ontology files')

# parse parameters
args = parser.parse_args()
mesh_file_path = args.files

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']


if __name__ == "__main__":
    driver = GraphDatabase.driver(uri, auth=(username, password))
    with driver.session() as session:
        create_graph_config(session)

        # Process all ontology files in the specified directory
        for file in os.listdir(mesh_file_path):
            if file.endswith(('.owl', '.xrdf', '.ttl', '.nt')):
                import_ontology(mesh_file_path, file, session)

        postprocess_mesh(session)

    driver.close()
32 src/postprocessing/README.md Normal file
@@ -0,0 +1,32 @@
# postprocessing

`postprocess.py` executes Cypher queries to create mappings between MeSH, UMLS, MDM, and ClinicalTrials.gov data in a Neo4j graph database.

It runs a set of predefined Cypher queries (one is sketched after this list) for:
* mapping MeSH terms to UMLS concepts
* mapping MDM Portal aliases to UMLS concepts
* mapping ClinicalTrials.gov studies to MeSH terms
* mapping ClinicalTrials.gov studies to MDM Portal entries
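As an illustration, the MeSH-to-UMLS mapping can be run on its own with a few lines of Python; connection values are placeholders, and the query is the batched variant defined in `postprocess.py`:

```python
# minimal sketch: run the MeSH-to-UMLS mapping query directly
from neo4j import GraphDatabase

query = """
CALL apoc.periodic.iterate(
  'MATCH (m:MeshClass) RETURN m',
  'UNWIND m.cui AS mesh_cui
   MATCH (a:UMLSconcept {cui: mesh_cui})
   MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)',
  {batchSize: 1000, parallel: false}
);
"""

with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "myfancypassword")) as driver:
    with driver.session() as session:
        session.run(query)
```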
## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it as `postprocess.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Run the postprocessing by providing the configuration file:

```sh
python3 src/postprocessing/postprocess.py --conf ~/postprocess.conf
```

## Requirements

* make sure `python3` is installed
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the APOC plugin installed
* create the configuration file as described in the [Quickstart](#quickstart) section
101 src/postprocessing/postprocess.py Normal file
@@ -0,0 +1,101 @@
from neo4j import GraphDatabase
import logging
import argparse
import configparser

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# define a function for running Cypher queries against a Neo4j database
def run_cypher_query_for_postprocessing(uri, user, password, cypher_query):
    try:
        with GraphDatabase.driver(uri, auth=(user, password)) as driver:
            with driver.session() as session:
                session.run(cypher_query)
                logger.info("Postprocessing query executed successfully.")
    except Exception as e:
        logger.error(f"Failed to execute query: {e}")
        raise


# cypher queries for postprocessing

# mapping MeSH terms to UMLS concepts
# (simple unbatched variant, kept for reference; the optimised query below is the one that is run)
cypher_mesh_to_umls = """
MATCH (m:MeshClass)
UNWIND m.cui AS mesh_cui
MATCH (a:UMLSconcept {cui: mesh_cui})
MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)
"""

# mapping MeSH terms to UMLS concepts, optimised for larger datasets
cypher_mesh_to_umls_optimised = """
CALL apoc.periodic.iterate(
    "MATCH (m:MeshClass) RETURN m",
    "UNWIND m.cui AS mesh_cui
     MATCH (a:UMLSconcept {cui: mesh_cui})
     MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)",
    {batchSize:1000, parallel:false}
);
"""

# mapping MDM Portal aliases to UMLS concepts
cypher_mdm_to_umls = """
MATCH (a:Alias)
WITH a, a.Name AS aliasname
MATCH (u:UMLSconcept)
WHERE u.cui = aliasname
MERGE (a)-[:MAPS_TO_UMLS_ENTRY]->(u)
"""

# mapping ClinicalTrials.gov studies to MeSH terms
cypher_ct_to_mesh = """
MATCH (c:MeshClass)
WITH c, c.notation AS cnote
MATCH (m:meshes)
WHERE m.id = cnote
MERGE (m)-[:MAPS_TO_MESH_ENTRY]->(c)
"""

# mapping ClinicalTrials.gov studies to MDM Portal entries
# (two separate statements, since session.run() executes one statement at a time:
# match first by OID, then by NCT_ID)
cypher_ct_to_mdm_by_oid = """
MATCH (s:Study)
MATCH (c:ClinicalTrialsEntry)
WHERE c.str_id = s.OID
MERGE (s)-[:BELONGS_TO]->(c)
"""

cypher_ct_to_mdm_by_nct_id = """
MATCH (s:Study)
MATCH (c:ClinicalTrialsEntry)
WHERE c.str_id = s.NCT_ID
MERGE (s)-[:BELONGS_TO]->(c)
"""


# define parameters - pass the db conf as an argument when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')

# parse parameters
args = parser.parse_args()

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']


if __name__ == "__main__":
    # postprocess mesh to umls
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_mesh_to_umls_optimised)
    # postprocess clinicaltrials.gov to mesh
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_ct_to_mesh)
    # postprocess mdm to umls
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_mdm_to_umls)
    # postprocess clinicaltrials.gov to mdm (by OID, then by NCT_ID)
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_ct_to_mdm_by_oid)
    run_cypher_query_for_postprocessing(uri=uri, user=username, password=password, cypher_query=cypher_ct_to_mdm_by_nct_id)
38 src/study2neo4j/README.md Normal file
@@ -0,0 +1,38 @@
# study2neo4j

This repo is for integrating data from ClinicalTrials.gov into Neo4j.

## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `study.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Start the program by providing the location of your configuration file and the location of the json files:

```sh
python3 src/study2neo4j/run.py --conf ~/study.conf --files ~/Desktop/datasource
```

## Requirements

- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
- download json files from [ClinicalTrials.gov](https://clinicaltrials.gov/) and place all json files you want to load in one folder (see the sketch of the expected record shape below)
- have a running Neo4j DB (Neo4j version 5)
- create the configuration file as described in the [Quickstart section](#quickstart)
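For reference, a minimal sketch of the record shape `ct2neo4j.py` consumes. Only `protocolSection.identificationModule.nctId` is read explicitly (it becomes the `str_id` of the root `ClinicalTrialsEntry` node); the other fields shown are illustrative assumptions:

```python
# abridged, illustrative ClinicalTrials.gov record; field names other than
# protocolSection.identificationModule.nctId are assumptions for illustration
record = {
    "protocolSection": {
        "identificationModule": {"nctId": "NCT01234567"},  # hypothetical ID
        "conditionsModule": {
            "conditions": ["Diabetes Mellitus"],  # each entry becomes a condition node
            "keywords": ["insulin"],              # each entry becomes a keyword node
        },
    }
}
```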
## Acknowledgements

`ct2neo4j.py` is adapted from the graphCreation step of the MeDaX pipeline (see [1], [2]), with permission from the authors.

[1] Gebhardt, T., Mazein, I., Michaelis, L., Henkel, R., Lobentanzer, S., Waltemath, D., & Wodke, J. (2025). MeDaX pipeline (1.0.0). Zenodo. https://doi.org/10.5281/zenodo.15229077

[2] Mazein, I., Gebhardt, T., Zinkewitz, F., Michaelis, L., Braun, S., Waltemath, D., Henkel, R., & Wodke, J. A. (2024). MeDaX: A Knowledge Graph on FHIR. In Digital Health and Informatics Innovations for Sustainable Health Care Systems (pp. 367-371). IOS Press.
125 src/study2neo4j/ct2neo4j.py Normal file
@@ -0,0 +1,125 @@
import json
import os
from neo4j import GraphDatabase


# Add nodes and relationships recursively from a (nested) dictionary
def add_nodes_from_dict(tx, parent_node_label, parent_node_str_id, current_dict):
    for key, value in current_dict.items():  # iterate over each key-value pair in the dictionary
        if key == "phases":
            # Create a node for each phase
            for index, phase in enumerate(value):
                phase_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                tx.run("MERGE (n:phase {str_id: $str_id, name: $phase_name})",
                       str_id=phase_node_str_id, phase_name=phase)
                tx.run(
                    f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:phase {{str_id: $child_str_id}}) "
                    f"MERGE (a)-[:{key}]->(b)",
                    parent_str_id=parent_node_str_id,
                    child_str_id=phase_node_str_id
                )
        if isinstance(value, dict):  # if the value is a dict, create a new node
            # Create a new node for the nested dictionary
            new_node_str_id = f"{parent_node_str_id}_{key}"  # concatenate parent_node_str_id and key to a new id
            tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=new_node_str_id)  # create node with key as label

            # Create a relationship from the parent node to the new node
            tx.run(f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                   f"MERGE (a)-[:{key}]->(b)",
                   parent_str_id=parent_node_str_id,
                   child_str_id=new_node_str_id)  # create relationship between parent and new node

            # Recurse into the nested dictionary
            add_nodes_from_dict(tx, key, new_node_str_id, value)

        elif isinstance(value, list):  # if the value is a list, check whether it contains dictionaries

            if key == "conditions":
                # Create a node for each condition
                for index, condition in enumerate(value):
                    condition_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                    tx.run("MERGE (n:condition {str_id: $str_id, name: $condition_name})",
                           str_id=condition_node_str_id, condition_name=condition)
                    tx.run(
                        f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:condition {{str_id: $child_str_id}}) "
                        f"MERGE (a)-[:{key}]->(b)",
                        parent_str_id=parent_node_str_id,
                        child_str_id=condition_node_str_id
                    )
            elif key == "keywords":
                # Create a node for each keyword
                for index, keyword in enumerate(value):
                    keyword_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                    tx.run("MERGE (n:keyword {str_id: $str_id, name: $keyword_name})",
                           str_id=keyword_node_str_id, keyword_name=keyword)
                    tx.run(
                        f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:keyword {{str_id: $child_str_id}}) "
                        f"MERGE (a)-[:{key}]->(b)",
                        parent_str_id=parent_node_str_id,
                        child_str_id=keyword_node_str_id
                    )

            # If the list contains only primitive values (like strings or numbers),
            # they are set as an array property on the parent node. Note that this
            # also applies to phases/conditions/keywords, which therefore end up
            # both as nodes and as a property of the parent.
            if not any(isinstance(item, dict) for item in value):
                tx.run(f"MATCH (n:{parent_node_label} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=parent_node_str_id, value=value)

            else:  # if the list contains dictionaries, a new node is created for each dict

                # Process each dictionary in the list
                for index, item in enumerate(value):
                    if isinstance(item, dict):
                        item_node_str_id = f"{parent_node_str_id}_{key}_{index}"
                        tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=item_node_str_id)

                        tx.run(
                            f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                            f"MERGE (a)-[:{key}]->(b)", parent_str_id=parent_node_str_id,
                            child_str_id=item_node_str_id)
                        add_nodes_from_dict(tx, key, item_node_str_id, item)

        else:

            # if the value is a reference, store it on a dedicated node
            if key == 'reference':
                new_node_str_id = f"{parent_node_str_id}_{key}"
                tx.run(f"MERGE (n:{key} {{str_id: $str_id}})", str_id=new_node_str_id)

                tx.run(f"MATCH (a:{parent_node_label} {{str_id: $parent_str_id}}), (b:{key} {{str_id: $child_str_id}}) "
                       f"MERGE (a)-[:{key}]->(b)", parent_str_id=parent_node_str_id, child_str_id=new_node_str_id)

                tx.run(f"MATCH (n:{key} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=new_node_str_id, value=value)

            else:

                # For non-dict, non-list values, add them as properties of the parent node
                tx.run(f"MATCH (n:{parent_node_label} {{str_id: $str_id}}) SET n.{key} = $value",
                       str_id=parent_node_str_id, value=value)


# Connect to Neo4j and create the graph from all json files in a directory
def create_graph_from_directory(uri, user, password, directory_path):
    driver = GraphDatabase.driver(uri, auth=(user, password))

    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            try:
                with open(file_path, 'r') as file:
                    json_data = json.load(file)

                with driver.session() as session:
                    root_node_label = 'ClinicalTrialsEntry'
                    root_node_str_id = json_data['protocolSection']['identificationModule']['nctId']
                    session.execute_write(
                        lambda tx: tx.run(f"MERGE (n:{root_node_label} {{str_id: $str_id}})", str_id=root_node_str_id))
                    session.execute_write(add_nodes_from_dict, root_node_label, root_node_str_id, json_data)

                print(f"Successfully imported: {filename}")
            except Exception as e:
                print(f"Failed to import {filename}: {e}")

    driver.close()
32 src/study2neo4j/run.py Normal file
@@ -0,0 +1,32 @@
import argparse
import logging
import configparser
from ct2neo4j import create_graph_from_directory

STUDY2NEO4J_VERSION: str = "0.1"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info(f'study2neo4j v{STUDY2NEO4J_VERSION}')

# define parameters - pass json files dir and db conf as arguments when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with json files')

# parse parameters
args = parser.parse_args()
json_file_path = args.files

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']


# start study2neo4j
if __name__ == "__main__":
    create_graph_from_directory(uri=uri, user=username, password=password, directory_path=json_file_path)
52 src/umls2neo4j/README.md Normal file
@@ -0,0 +1,52 @@
# umls2neo4j: UMLS to Neo4j Importer

This Python script parses selected relationships from the UMLS Metathesaurus (`MRREL.RRF` and `MRCONSO.RRF`) and loads them into a Neo4j graph database.

> [!IMPORTANT]
> Requires a UMLS licence!

## Features

- Filters and loads `PAR` (parent) and `CHD` (child) relationships from `MRREL.RRF`
- Loads only preferred English concept names from `MRCONSO.RRF`

## Quickstart

Create a configuration file with your Neo4j connection details.
For example, save it in your home directory as `umls.conf`:

```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```

Start the program by providing the location of your configuration file and the locations of the UMLS files:

```sh
python3 src/umls2neo4j/umls2neo4j.py --conf ~/umls.conf --mrconsofiles ~/umls/MRCONSO.RRF --mrrelfiles ~/umls/MRREL.RRF
```

## Requirements

- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
- download the UMLS Metathesaurus files (`MRREL.RRF`, `MRCONSO.RRF`) from the [NLM UMLS site](https://www.nlm.nih.gov/research/umls/) → requires a UMLS licence
- have a running Neo4j DB (Neo4j version 5), with APOC installed
- create the configuration file as described in the [Quickstart section](#quickstart)

## Detailed information

The script will (see the sketch below):

1. Load preferred English concept names from `MRCONSO.RRF`
2. Parse allowed relationships from `MRREL.RRF`
3. Insert nodes and relationships into Neo4j using chunked batches
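A minimal sketch of these three steps, mirroring `umls2neo4j.py` (file paths and connection values are placeholders):

```python
# minimal pipeline sketch; file paths and connection values are placeholders
from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j

names = load_cui_names(file_path="MRCONSO.RRF")
rels = parse_mrrel(file_path="MRREL.RRF", cui_infos=names, allowed_rels={"PAR", "CHD"})
load_into_neo4j(rels=rels, uri="bolt://localhost:7687", user="neo4j",
                password="myfancypassword", batch_chunk_size=5000, apoc_batch_size=1000)
```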
## Customisation

- Adjust `ALLOWED_RELS` in the script to include more relationship types (see the example below)
- Tune `batch_chunk_size` and `apoc_batch_size` for better performance
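For example, a hypothetical customisation that also imports UMLS broader/narrower relationships:

```python
# hypothetical: import broader (RB) and narrower (RN) relationships
# in addition to the parent/child (PAR/CHD) pairs loaded by default
ALLOWED_RELS = {"PAR", "CHD", "RB", "RN"}
```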
114 src/umls2neo4j/methods_umls2neo4j.py Normal file
@@ -0,0 +1,114 @@
import csv
from neo4j import GraphDatabase


# Load concept information from MRCONSO.RRF
# Join CUI2 (MRREL) to CUI (MRCONSO) where TS=P, STT=PF, ISPREF=Y, LAT=ENG
# Stores name, source vocabulary, and source vocabulary code per CUI
def load_cui_names(file_path):
    cui_infos = {}
    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="|")
        for row in reader:
            cui = row[0]
            language = row[1]  # LAT=ENG
            term_status = row[2]  # TS=P (Preferred)
            stt = row[4]  # STT=PF (Preferred form of term)
            ispref = row[6]  # ISPREF=Y
            source_vocab = row[11]
            source_vocab_code = row[13]
            name = row[14]
            srl_license = row[15]  # should be 0

            if language != "ENG":
                continue

            if term_status == 'P' and stt == 'PF' and ispref == 'Y':
                if srl_license == '0':
                    cui_infos[cui] = {
                        "name": name,
                        "source_vocab": source_vocab,
                        "source_vocab_code": source_vocab_code
                    }
    return cui_infos


# Load MRREL.RRF and return filtered concept relationships
def parse_mrrel(file_path, cui_infos, allowed_rels):
    mrrelationships = []
    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="|")
        for row in reader:
            cui1 = row[0]
            concept_type1 = row[2]  # should be CUI
            rel = row[3]  # relationship, e.g. PAR, CHD
            cui2 = row[4]
            concept_type2 = row[6]  # should be CUI
            rel_attribute = row[7]
            source_vocab = row[10]  # source vocabulary

            if rel in allowed_rels:
                # note: concept_type1 and concept_type2 could additionally be checked to equal 'CUI'
                from_info = cui_infos.get(cui1)
                to_info = cui_infos.get(cui2)

                if from_info and to_info:
                    mrrelationships.append({
                        "from": cui1,
                        "from_name": from_info["name"],
                        "from_source_vocab": from_info["source_vocab"],
                        "from_source_vocab_code": from_info["source_vocab_code"],
                        "to": cui2,
                        "to_name": to_info["name"],
                        "to_source_vocab": to_info["source_vocab"],
                        "to_source_vocab_code": to_info["source_vocab_code"],
                        "type": rel,
                        "attribute": rel_attribute,
                        "source": source_vocab
                    })
    return mrrelationships


# Split a list into consecutive chunks of at most `size` items
def chunked(data, size):
    for i in range(0, len(data), size):
        yield data[i:i + size]


# Load relationships into Neo4j
def load_into_neo4j(rels, uri, user, password, batch_chunk_size, apoc_batch_size):
    driver = GraphDatabase.driver(uri, auth=(user, password), connection_timeout=60, max_connection_lifetime=3600)

    with driver.session(database="neo4j") as session:
        # optionally create a uniqueness constraint on cui:
        # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:UMLSconcept) REQUIRE u.cui IS UNIQUE")

        chunk_count = 0

        for chunk in chunked(rels, batch_chunk_size):
            chunk_count += 1
            print(f"Inserting chunk {chunk_count} with {len(chunk)} relationships...")

            session.run("""
                CALL apoc.periodic.iterate(
                    'UNWIND $chunk AS rel RETURN rel',
                    '
                    MERGE (c1:UMLSconcept {cui: rel.from})
                    SET c1.name = rel.from_name, c1.source = rel.from_source_vocab, c1.sourcecode = rel.from_source_vocab_code
                    MERGE (c2:UMLSconcept {cui: rel.to})
                    SET c2.name = rel.to_name, c2.source = rel.to_source_vocab, c2.sourcecode = rel.to_source_vocab_code
                    MERGE (c1)-[r:UMLS_RELATION {type: rel.type}]->(c2)
                    SET r.source = rel.source, r.attribute = rel.attribute
                    ',
                    {
                        batchSize: $apoc_batch_size,
                        parallel: false,
                        params: {chunk: $chunk}
                    }
                )
            """, {
                "chunk": chunk,
                "apoc_batch_size": apoc_batch_size
            })

    print(f"All {chunk_count} batches inserted")
    driver.close()
42 src/umls2neo4j/umls2neo4j.py Normal file
@@ -0,0 +1,42 @@
import argparse
import configparser
from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j

# define parameters - pass db conf and UMLS file paths as arguments when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')
# Path to MRCONSO.RRF file
parser.add_argument('-fc', '--mrconsofiles', required=True, type=str, help='Path to UMLS MRCONSO.RRF file')
# Path to MRREL.RRF file
parser.add_argument('-fr', '--mrrelfiles', required=True, type=str, help='Path to UMLS MRREL.RRF file')

# parse parameters
args = parser.parse_args()
mrconso_file = args.mrconsofiles
mrrel_file = args.mrrelfiles

conf_file = configparser.ConfigParser()
conf_file.read(args.conf)
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']

# Define relationships of interest
ALLOWED_RELS = {"PAR", "CHD"}

# Run everything
if __name__ == "__main__":
    print("Loading CUI information...")
    name_map = load_cui_names(file_path=mrconso_file)
    print(f"Loaded information about {len(name_map)} CUIs")

    print("Parsing relationships...")
    relationships = parse_mrrel(file_path=mrrel_file, cui_infos=name_map, allowed_rels=ALLOWED_RELS)
    print(f"Parsed {len(relationships)} relationships")

    print("Loading into Neo4j...")
    load_into_neo4j(rels=relationships, uri=uri, user=username, password=password, batch_chunk_size=5000, apoc_batch_size=1000)
    print("Finished loading into Neo4j")