first release

2025-08-22 11:52:43 +02:00
commit ec27c71148
23 changed files with 1543 additions and 0 deletions
--- a/src/study2neo4j/README.md
+++ b/src/study2neo4j/README.md
@@ -0,0 +1,38 @@
+# study2neo4j
+
+This repo is for integrating data from ClinicalTrials.gov into Neo4j.
+
+## Quickstart
+
+Create a configuration file that stores your database connection details,
+e.g. `study.conf` in your home directory.
+
+```ini
+[neo4j]
+uri = bolt://localhost:7687
+username = neo4j
+password = myfancypassword
+```
+
+Start the program by passing the location of your configuration file and the directory containing the JSON files.
+
+```sh
+python3 src/run.py --conf ~/study.conf --files ~/Desktop/datasource
+```
+
+## Requirements
+
+- make sure `python3` is installed
+- install the required libraries with `pip install -r requirements.txt`
+- download the JSON files you want to load from [ClinicalTrials.gov](https://clinicaltrials.gov/) and place them all in one folder
+- have a running Neo4j DB (Neo4j version 5)
+- create the configuration-file as described in the [Quickstart section](#quickstart)
+
+
+## Acknowledgements
+
+`ct2neo4j.py` is adapted from the graphCreation part of the MeDaX pipeline (see [1], [2]), with permission from the authors.
+
+[1] Gebhardt, T., Mazein, I., Michaelis, L., Henkel, R., Lobentanzer, S., Waltemath, D., & Wodke, J. (2025). MeDaX pipeline (1.0.0). Zenodo. https://doi.org/10.5281/zenodo.15229077
+
+[2] Mazein, I., Gebhardt, T., Zinkewitz, F., Michaelis, L., Braun, S., Waltemath, D., Henkel, R., & Wodke, J. A. (2024). MeDaX: A Knowledge Graph on FHIR. In Digital Health and Informatics Innovations for Sustainable Health Care Systems (pp. 367-371). IOS Press.
--- a/src/study2neo4j/ct2neo4j.py
+++ b/src/study2neo4j/ct2neo4j.py
@@ -0,0 +1,125 @@
+import json
+import os
+from neo4j import GraphDatabase
+
+
+# Keys whose list items become individual named nodes (key -> node label).
+# The relationship type stays the plural key, e.g. (parent)-[:phases]->(:phase).
+_NAMED_LIST_LABELS = {"phases": "phase", "conditions": "condition", "keywords": "keyword"}
+
+
+def _cypher_name(name):
+    """Return ``name`` backtick-quoted for use as a Cypher label, relationship type or property key.
+
+    JSON keys are interpolated into the query text as labels, relationship types and
+    property keys. Quoting keeps keys that are not plain identifiers (spaces, dashes,
+    ...) from breaking or altering the query.
+    """
+    return "`" + str(name).replace("`", "``") + "`"
+
+
+def _set_property(tx, node_label, node_str_id, key, value):
+    """Set property ``key`` to ``value`` on the node with the given label and ``str_id``."""
+    tx.run(f"MATCH (n:{_cypher_name(node_label)} {{str_id: $str_id}}) SET n.{_cypher_name(key)} = $value",
+           str_id=node_str_id, value=value)
+
+
+def _merge_child(tx, parent_label, parent_str_id, rel_type, child_label, child_str_id, extra_props=None):
+    """MERGE a child node and a ``rel_type`` relationship from its parent to it.
+
+    The child is merged on ``str_id`` plus any ``extra_props`` (e.g. ``{"name": ...}``).
+    """
+    child = _cypher_name(child_label)
+    props = {"str_id": child_str_id, **(extra_props or {})}
+    pattern = ", ".join(f"{_cypher_name(prop)}: ${prop}" for prop in props)
+    tx.run(f"MERGE (n:{child} {{{pattern}}})", props)
+    tx.run(
+        f"MATCH (a:{_cypher_name(parent_label)} {{str_id: $parent_str_id}}), (b:{child} {{str_id: $child_str_id}}) "
+        f"MERGE (a)-[:{_cypher_name(rel_type)}]->(b)",
+        parent_str_id=parent_str_id,
+        child_str_id=child_str_id,
+    )
+
+
+def add_nodes_from_dict(tx, parent_node_label, parent_node_str_id, current_dict):
+    """Recursively write ``current_dict`` into the graph below an existing parent node.
+
+    Mapping applied to each key/value pair:
+
+    * dict: a child node labelled ``key`` (str_id ``<parent>_<key>``), then recursion;
+    * list under ``phases``/``conditions``/``keywords``: one ``phase``/``condition``/
+      ``keyword`` node per item, carrying the item as ``name``;
+    * list without dicts: stored as a list property on the parent (this also applies
+      to the three keys above, so their values stay available on the parent);
+    * list with dicts: one child node per dict (str_id ``<parent>_<key>_<index>``),
+      then recursion; non-dict items of such a list are skipped;
+    * scalar under ``reference``: a separate ``reference`` node holding the value;
+    * any other scalar: a property on the parent.
+
+    Nodes and relationships are written with MERGE rather than CREATE.
+
+    Args:
+        tx: transaction-like object exposing ``run(query, ...)``.
+        parent_node_label: label of the already existing parent node.
+        parent_node_str_id: ``str_id`` of that parent node.
+        current_dict: the (nested) dictionary to import.
+    """
+    for key, value in current_dict.items():
+        if isinstance(value, dict):
+            # Nested object: child node, link, then descend.
+            new_node_str_id = f"{parent_node_str_id}_{key}"
+            _merge_child(tx, parent_node_label, parent_node_str_id, key, key, new_node_str_id)
+            add_nodes_from_dict(tx, key, new_node_str_id, value)
+
+        elif isinstance(value, list):
+            if key in _NAMED_LIST_LABELS:
+                # One named node per phase / condition / keyword.
+                for index, name in enumerate(value):
+                    _merge_child(tx, parent_node_label, parent_node_str_id, key, _NAMED_LIST_LABELS[key],
+                                 f"{parent_node_str_id}_{key}_{index}", {"name": name})
+
+            if not any(isinstance(item, dict) for item in value):
+                # Only primitive items: keep the list as a property of the parent node.
+                _set_property(tx, parent_node_label, parent_node_str_id, key, value)
+            else:
+                # One child node per dictionary in the list.
+                for index, item in enumerate(value):
+                    if isinstance(item, dict):
+                        item_node_str_id = f"{parent_node_str_id}_{key}_{index}"
+                        _merge_child(tx, parent_node_label, parent_node_str_id, key, key, item_node_str_id)
+                        add_nodes_from_dict(tx, key, item_node_str_id, item)
+
+        elif key == 'reference':
+            # A reference gets its own node that stores the value.
+            new_node_str_id = f"{parent_node_str_id}_{key}"
+            _merge_child(tx, parent_node_label, parent_node_str_id, key, key, new_node_str_id)
+            _set_property(tx, key, new_node_str_id, key, value)
+
+        else:
+            # Remaining scalar values become properties of the parent node.
+            _set_property(tx, parent_node_label, parent_node_str_id, key, value)
+
+
+# Connect to Neo4j and create the graph
+def create_graph_from_directory(uri, user, password, directory_path):
+    """Import every ``*.json`` file in ``directory_path`` into Neo4j.
+
+    Files that fail to load or import are reported and skipped (best effort)."""
+    root_node_label = 'ClinicalTrialsEntry'
+    # The context manager closes the driver even if listing the directory raises.
+    with GraphDatabase.driver(uri, auth=(user, password)) as driver:
+        for filename in sorted(os.listdir(directory_path)):  # sorted: reproducible import order
+            if not filename.endswith('.json'):
+                continue
+            try:
+                # JSON is UTF-8 by specification; do not rely on the platform default encoding.
+                with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as file:
+                    json_data = json.load(file)
+                root_node_str_id = json_data['protocolSection']['identificationModule']['nctId']
+                with driver.session() as session:
+                    session.execute_write(
+                        lambda tx: tx.run(f"MERGE (n:{root_node_label} {{str_id: $str_id}})", str_id=root_node_str_id))
+                    session.execute_write(add_nodes_from_dict, root_node_label, root_node_str_id, json_data)
+                print(f"Successfully imported: {filename}")
+            except Exception as e:  # per-file boundary: keep going with the remaining files
+                print(f"Failed to import {filename}: {e}")
--- a/src/study2neo4j/run.py
+++ b/src/study2neo4j/run.py
@@ -0,0 +1,32 @@
+import argparse
+import logging
+import configparser
+from ct2neo4j import create_graph_from_directory
+
+STUDY2NEO4J_VERSION: str = "0.1"
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+logger.info('study2neo4j v%s', STUDY2NEO4J_VERSION)
+
+# define parameters - pass json files dir and db conf as arguments when running the script
+parser = argparse.ArgumentParser()
+parser.add_argument('-c', '--conf', required=True, type=str,
+                    help='Configuration file with database connection parameters')
+parser.add_argument('-f', '--files', required=True, type=str, help='Directory with json files')
+
+# parse parameters
+args = parser.parse_args()
+json_file_path = args.files
+
+conf_file = configparser.ConfigParser()
+if not conf_file.read(args.conf):  # read() silently skips files it cannot open
+    parser.error(f'cannot read configuration file: {args.conf}')
+uri = conf_file['neo4j']['uri']
+username = conf_file['neo4j']['username']
+password = conf_file['neo4j']['password']
+
+# start study2neo4j
+if __name__ == "__main__":
+    create_graph_from_directory(uri=uri, user=username, password=password, directory_path=json_file_path)