first release

This commit is contained in:
2025-08-22 11:52:43 +02:00
commit ec27c71148
23 changed files with 1543 additions and 0 deletions

38
src/study2neo4j/README.md Normal file
View File

@@ -0,0 +1,38 @@
# study2neo4j
This repo is for integrating data from ClinicalTrials.gov into Neo4j.
## Quickstart
Create a configuration file that stores your database connection details,
e.g. a file named `study.conf` in your home directory.
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Start the program by passing the location of your configuration file and the directory containing the JSON files.
```sh
python3 src/study2neo4j/run.py --conf ~/study.conf --files ~/Desktop/datasource
```
## Requirements
- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
- download the JSON files from [ClinicalTrials.gov](https://clinicaltrials.gov/) and place the ones you want to load in a single folder
- have a running Neo4j DB (Neo4j version 5)
- create the configuration-file as described in the [Quickstart section](#quickstart)
## Acknowledgements
`ct2neo4j.py` is adapted from the `graphCreation` part of the MeDaX pipeline (see [1], [2]), with permission from the authors.
[1] Gebhardt, T., Mazein, I., Michaelis, L., Henkel, R., Lobentanzer, S., Waltemath, D., & Wodke, J. (2025). MeDaX pipeline (1.0.0). Zenodo. https://doi.org/10.5281/zenodo.15229077
[2] Mazein, I., Gebhardt, T., Zinkewitz, F., Michaelis, L., Braun, S., Waltemath, D., Henkel, R., & Wodke, J. A. (2024). MeDaX: A Knowledge Graph on FHIR. In Digital Health and Informatics Innovations for Sustainable Health Care Systems (pp. 367-371). IOS Press.

125
src/study2neo4j/ct2neo4j.py Normal file
View File

@@ -0,0 +1,125 @@
import json
import os
from neo4j import GraphDatabase
# Keys whose list entries additionally become individual named nodes,
# mapped to the label used for those nodes.
_NAMED_LIST_LABELS = {"conditions": "condition", "keywords": "keyword"}


def _cypher_identifier(name):
    """Return ``name`` backtick-quoted for use as a label, relationship type or property key.

    These identifiers are spliced into the query text (the code below never
    passes them as parameters), so a JSON key containing spaces, dashes or
    backticks would otherwise yield invalid or altered Cypher.
    """
    # NOTE(review): Neo4j's injection guidance says a literal "\u0060" can be
    # decoded to a backtick by the parser, so it is normalised before escaping.
    text = str(name).replace("\\u0060", "`")
    return "`" + text.replace("`", "``") + "`"


def _link_to_parent(tx, parent_label, parent_str_id, rel_type, child_label, child_str_id):
    """MERGE a ``rel_type`` relationship from the parent node to the child node."""
    tx.run(
        f"MATCH (a:{_cypher_identifier(parent_label)} {{str_id: $parent_str_id}}), "
        f"(b:{_cypher_identifier(child_label)} {{str_id: $child_str_id}}) "
        f"MERGE (a)-[:{_cypher_identifier(rel_type)}]->(b)",
        parent_str_id=parent_str_id,
        child_str_id=child_str_id,
    )


def _add_child_node(tx, parent_label, parent_str_id, key, child_str_id):
    """MERGE a node labelled ``key`` and connect it to its parent via a ``key`` relationship."""
    tx.run(f"MERGE (n:{_cypher_identifier(key)} {{str_id: $str_id}})", str_id=child_str_id)
    _link_to_parent(tx, parent_label, parent_str_id, key, key, child_str_id)


def _add_named_children(tx, parent_label, parent_str_id, key, child_label, names):
    """MERGE one ``child_label`` node (with a ``name`` property) per entry and link each to the parent."""
    for index, name in enumerate(names):
        child_str_id = f"{parent_str_id}_{key}_{index}"
        tx.run(
            f"MERGE (n:{_cypher_identifier(child_label)} {{str_id: $str_id, name: $name}})",
            str_id=child_str_id,
            name=name,
        )
        _link_to_parent(tx, parent_label, parent_str_id, key, child_label, child_str_id)


def _set_property(tx, label, str_id, key, value):
    """Set property ``key`` to ``value`` on the node identified by ``label`` and ``str_id``."""
    tx.run(
        f"MATCH (n:{_cypher_identifier(label)} {{str_id: $str_id}}) "
        f"SET n.{_cypher_identifier(key)} = $value",
        str_id=str_id,
        value=value,
    )


# Define a function to add nodes and relationships recursively
def add_nodes_from_dict(tx, parent_node_label, parent_node_str_id, current_dict):
    """Recursively write ``current_dict`` into the graph below an existing parent node.

    For every key/value pair:

    * ``phases``: one ``phase`` node per entry is created and linked to the
      parent; the value is then still processed by the generic rules below.
    * dict value: a child node labelled with the key is created, linked to the
      parent, and the dict is processed recursively with that node as parent.
    * list value: for ``conditions`` / ``keywords`` one named node per entry is
      created and linked. Independently of that, a list without any dict is
      stored as a property on the parent, while in a list containing dicts
      every dict becomes its own child node (non-dict entries of such a list
      are skipped).
    * any other value: stored as a property on the parent, except for the key
      ``reference``, which gets its own node carrying the value.

    Child ids are derived from the parent id, the key and (for list entries)
    the index, e.g. ``<parent>_<key>_<index>``.

    Args:
        tx: object exposing ``run(query, **parameters)``; presumably a Neo4j
            transaction supplied by the caller.
        parent_node_label: label of the node the values are attached to.
        parent_node_str_id: ``str_id`` property of that node.
        current_dict: mapping to import.
    """
    for key, value in current_dict.items():
        if key == "phases":
            # Deliberately a plain `if`: the value also falls through to the
            # type dispatch below, exactly as for any other key.
            _add_named_children(tx, parent_node_label, parent_node_str_id, key, "phase", value)

        if isinstance(value, dict):
            child_str_id = f"{parent_node_str_id}_{key}"
            _add_child_node(tx, parent_node_label, parent_node_str_id, key, child_str_id)
            # The new node becomes the parent for the nested dictionary.
            add_nodes_from_dict(tx, key, child_str_id, value)
        elif isinstance(value, list):
            if key in _NAMED_LIST_LABELS:
                _add_named_children(
                    tx, parent_node_label, parent_node_str_id, key, _NAMED_LIST_LABELS[key], value
                )
            if not any(isinstance(item, dict) for item in value):
                # Only primitive entries: keep the list as a property of the parent.
                _set_property(tx, parent_node_label, parent_node_str_id, key, value)
            else:
                for index, item in enumerate(value):
                    if isinstance(item, dict):
                        item_str_id = f"{parent_node_str_id}_{key}_{index}"
                        _add_child_node(tx, parent_node_label, parent_node_str_id, key, item_str_id)
                        add_nodes_from_dict(tx, key, item_str_id, item)
        elif key == 'reference':
            # A reference gets its own node, which stores the value itself.
            child_str_id = f"{parent_node_str_id}_{key}"
            _add_child_node(tx, parent_node_label, parent_node_str_id, key, child_str_id)
            _set_property(tx, key, child_str_id, key, value)
        else:
            # Non-dict, non-list values become attributes of the parent node.
            _set_property(tx, parent_node_label, parent_node_str_id, key, value)
# Connect to Neo4j and create the graph
def create_graph_from_directory(uri, user, password, directory_path):
    """Import every ``*.json`` file found directly in ``directory_path`` into Neo4j.

    Each file is loaded as JSON. A ``ClinicalTrialsEntry`` root node is merged
    with the value at ``protocolSection.identificationModule.nctId`` as its
    ``str_id``, and the document is written below it with
    ``add_nodes_from_dict``. Import is best effort: a file that fails is
    reported on stdout and the remaining files are still processed.

    Args:
        uri: connection URI passed to ``GraphDatabase.driver``.
        user: user name for authentication.
        password: password for authentication.
        directory_path: directory that is scanned (non-recursively) for
            ``.json`` files.
    """
    root_node_label = 'ClinicalTrialsEntry'

    def import_study(tx, root_node_str_id, json_data):
        # Root node and its children share one transaction, so a failing file
        # does not leave a root node without content behind.
        tx.run(f"MERGE (n:{root_node_label} {{str_id: $str_id}})", str_id=root_node_str_id)
        add_nodes_from_dict(tx, root_node_label, root_node_str_id, json_data)

    # The context manager closes the driver even if listing the directory
    # (or anything else outside the per-file handler) raises.
    with GraphDatabase.driver(uri, auth=(user, password)) as driver:
        # Sorted so that the import order is reproducible between runs.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(directory_path, filename)
            try:
                # Read as UTF-8 explicitly instead of relying on the platform
                # default encoding.
                with open(file_path, 'r', encoding='utf-8') as file:
                    json_data = json.load(file)
                root_node_str_id = json_data['protocolSection']['identificationModule']['nctId']
                with driver.session() as session:
                    session.execute_write(import_study, root_node_str_id, json_data)
                print(f"Successfully imported: {filename}")
            except Exception as e:
                # Deliberate per-file boundary: report and continue with the next file.
                print(f"Failed to import {filename}: {e}")

32
src/study2neo4j/run.py Normal file
View File

@@ -0,0 +1,32 @@
import argparse
import logging
import configparser
from ct2neo4j import create_graph_from_directory
STUDY2NEO4J_VERSION: str = "0.1"
logger = logging.getLogger(__name__)


def main(argv=None):
    """Parse the command line, read the database configuration and start the import.

    Args:
        argv: argument list to parse; ``None`` makes argparse use ``sys.argv[1:]``.

    Exits with an argparse usage error (status 2) when the configuration file
    cannot be read or lacks the ``[neo4j]`` section or one of its ``uri``,
    ``username`` or ``password`` entries.
    """
    logging.basicConfig(level=logging.INFO)
    logger.info('study2neo4j v%s', STUDY2NEO4J_VERSION)

    # define parameters - pass json files dir and db conf as arguments when running the script
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    parser.add_argument('-f', '--files', required=True, type=str, help='Directory with json files')

    # parse parameters
    args = parser.parse_args(argv)
    json_file_path = args.files

    conf_file = configparser.ConfigParser()
    try:
        # read() skips files it cannot open and returns the list of files it
        # actually parsed, so an empty result means the file was not readable.
        files_read = conf_file.read(args.conf)
    except configparser.Error as err:
        parser.error(f"invalid configuration file {args.conf}: {err}")
    if not files_read:
        parser.error(f"cannot read configuration file: {args.conf}")

    try:
        section = conf_file['neo4j']
        uri = section['uri']
        username = section['username']
        password = section['password']
    except (KeyError, configparser.Error) as err:
        parser.error(f"configuration file {args.conf} is missing or has an invalid entry: {err}")

    create_graph_from_directory(uri=uri, user=username, password=password, directory_path=json_file_path)


# start study2neo4j
if __name__ == "__main__":
    main()