first release
This commit is contained in:
38
src/study2neo4j/README.md
Normal file
38
src/study2neo4j/README.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# study2neo4j
|
||||
|
||||
This repo is for integrating data from ClinicalTrials.gov into Neo4j.
|
||||
|
||||
## Quickstart
|
||||
|
||||
Create a configuration file that stores the connection details for your database.
|
||||
E.g. in your home-directory with the name `study.conf`.
|
||||
|
||||
```ini
|
||||
[neo4j]
|
||||
uri = bolt://localhost:7687
|
||||
username = neo4j
|
||||
password = myfancypassword
|
||||
```
|
||||
|
||||
Start the program by providing the location of your configuration-file and the location of the json files.
|
||||
|
||||
```sh
|
||||
python3 src/run.py --conf ~/study.conf --files ~/Desktop/datasource
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- make sure `python3` is installed
|
||||
- install the required libraries with `pip install -r requirements.txt`
|
||||
- download the JSON files from [ClinicalTrials.gov](https://clinicaltrials.gov/) and place all the files you want to load in one folder
|
||||
- have a running Neo4j DB (Neo4j version 5)
|
||||
- create the configuration-file as described in the [Quickstart section](#quickstart)
|
||||
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
`ct2neo4j.py` is adapted, with permission from the authors, from the graphCreation code of the MeDaX pipeline (see [1], [2]).
|
||||
|
||||
[1] Gebhardt, T., Mazein, I., Michaelis, L., Henkel, R., Lobentanzer, S., Waltemath, D., & Wodke, J. (2025). MeDaX pipeline (1.0.0). Zenodo. https://doi.org/10.5281/zenodo.15229077
|
||||
|
||||
[2] Mazein, I., Gebhardt, T., Zinkewitz, F., Michaelis, L., Braun, S., Waltemath, D., Henkel, R., & Wodke, J. A. (2024). MeDaX: A Knowledge Graph on FHIR. In Digital Health and Informatics Innovations for Sustainable Health Care Systems (pp. 367-371). IOS Press.
|
125
src/study2neo4j/ct2neo4j.py
Normal file
125
src/study2neo4j/ct2neo4j.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import json
|
||||
import os
|
||||
from neo4j import GraphDatabase
|
||||
|
||||
|
||||
# JSON keys whose list entries are, in addition to being stored as a list
# property on the parent node, materialised as one child node per entry.
# Maps the JSON key to the label given to those child nodes.
_NAMED_LIST_LABELS = {
    "phases": "phase",
    "conditions": "condition",
    "keywords": "keyword",
}


def _quote_identifier(name):
    """Return ``name`` as a backtick-quoted Cypher identifier.

    Labels, relationship types and property keys cannot be passed as query
    parameters, so they have to be spliced into the query text. Quoting keeps
    keys with spaces, dashes or leading digits valid and stops a key from
    changing the structure of the query; embedded backticks are doubled.
    """
    return "`" + str(name).replace("`", "``") + "`"


def _link_to_parent(tx, parent_label, parent_str_id, child_label, child_str_id, rel_type):
    """MERGE a ``rel_type`` relationship from the parent node to the child node."""
    tx.run(
        f"MATCH (a:{_quote_identifier(parent_label)} {{str_id: $parent_str_id}}), "
        f"(b:{_quote_identifier(child_label)} {{str_id: $child_str_id}}) "
        f"MERGE (a)-[:{_quote_identifier(rel_type)}]->(b)",
        parent_str_id=parent_str_id,
        child_str_id=child_str_id,
    )


def _merge_child(tx, parent_label, parent_str_id, child_label, child_str_id, rel_type):
    """MERGE a child node identified by ``str_id`` and link it to its parent."""
    tx.run(
        f"MERGE (n:{_quote_identifier(child_label)} {{str_id: $str_id}})",
        str_id=child_str_id,
    )
    _link_to_parent(tx, parent_label, parent_str_id, child_label, child_str_id, rel_type)


def _set_property(tx, label, str_id, prop, value):
    """Set property ``prop`` to ``value`` on the node ``(label, str_id)``."""
    tx.run(
        f"MATCH (n:{_quote_identifier(label)} {{str_id: $str_id}}) "
        f"SET n.{_quote_identifier(prop)} = $value",
        str_id=str_id,
        value=value,
    )


def add_nodes_from_dict(tx, parent_node_label, parent_node_str_id, current_dict):
    """Recursively write ``current_dict`` into the graph below an existing node.

    The parent node ``(parent_node_label {str_id: parent_node_str_id})`` must
    already exist. Each key of ``current_dict`` is handled by value type:

    * dict -- a child node labelled with the key is merged, linked to the
      parent by a relationship named after the key, and recursed into.
    * list -- for the keys in ``_NAMED_LIST_LABELS`` one child node per entry
      (with a ``name`` property) is merged and linked. Then, if the list holds
      no dicts it is stored as a property on the parent; otherwise every dict
      entry becomes a child node that is recursed into (non-dict entries of
      such a mixed list are skipped).
    * anything else -- stored as a property on the parent, except for the key
      ``reference``, which gets its own child node holding the value.

    Child ``str_id`` values are built as ``<parent id>_<key>`` (plus
    ``_<index>`` for list entries).

    Args:
        tx: Object exposing ``run(query, **parameters)``; the caller in this
            file passes it in through ``session.execute_write``.
        parent_node_label: Label of the node the data is attached to.
        parent_node_str_id: ``str_id`` property of that node.
        current_dict: Mapping to import.
    """
    for key, value in current_dict.items():
        child_str_id = f"{parent_node_str_id}_{key}"

        if isinstance(value, dict):
            _merge_child(tx, parent_node_label, parent_node_str_id, key, child_str_id, key)
            add_nodes_from_dict(tx, key, child_str_id, value)
            continue

        if isinstance(value, list):
            named_label = _NAMED_LIST_LABELS.get(key)
            if named_label is not None:
                # One node per entry, in addition to the list property set below.
                for index, entry in enumerate(value):
                    entry_str_id = f"{child_str_id}_{index}"
                    tx.run(
                        f"MERGE (n:{_quote_identifier(named_label)} "
                        f"{{str_id: $str_id, name: $name}})",
                        str_id=entry_str_id,
                        name=entry,
                    )
                    _link_to_parent(tx, parent_node_label, parent_node_str_id,
                                    named_label, entry_str_id, key)

            if not any(isinstance(item, dict) for item in value):
                # Only primitive entries: keep the list as a property of the parent.
                _set_property(tx, parent_node_label, parent_node_str_id, key, value)
            else:
                for index, item in enumerate(value):
                    if not isinstance(item, dict):
                        continue  # primitives inside a mixed list are not imported
                    item_str_id = f"{child_str_id}_{index}"
                    _merge_child(tx, parent_node_label, parent_node_str_id, key, item_str_id, key)
                    add_nodes_from_dict(tx, key, item_str_id, item)
            continue

        if key == 'reference':
            # A reference is stored on a dedicated node instead of on the parent.
            _merge_child(tx, parent_node_label, parent_node_str_id, key, child_str_id, key)
            _set_property(tx, key, child_str_id, key, value)
        else:
            # Plain scalar: attribute of the parent node.
            _set_property(tx, parent_node_label, parent_node_str_id, key, value)
|
||||
|
||||
|
||||
def _import_study(tx, root_node_label, root_node_str_id, json_data):
    """Create the root node of one study and attach its whole JSON document."""
    tx.run(f"MERGE (n:{root_node_label} {{str_id: $str_id}})", str_id=root_node_str_id)
    add_nodes_from_dict(tx, root_node_label, root_node_str_id, json_data)


def create_graph_from_directory(uri, user, password, directory_path):
    """Import every ``*.json`` file found in ``directory_path`` into Neo4j.

    Each file is loaded as one JSON document. A ``ClinicalTrialsEntry`` root
    node is merged with ``str_id`` taken from
    ``protocolSection.identificationModule.nctId`` and the document is written
    beneath it with ``add_nodes_from_dict``. Both steps run in a single write
    transaction, so a file that fails to import does not leave a lone root
    node behind.

    A failure in one file is reported on stdout and the import continues with
    the next file. The driver is closed on every exit path.

    Args:
        uri: Connection URI handed to ``GraphDatabase.driver``.
        user: Database user name.
        password: Database password.
        directory_path: Directory that is scanned (non-recursively) for files
            whose name ends in ``.json``.
    """
    root_node_label = 'ClinicalTrialsEntry'
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        # Sorted so the import order is the same on every run and platform.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(directory_path, filename)
            try:
                # Explicit encoding: do not depend on the platform default.
                with open(file_path, 'r', encoding='utf-8') as file:
                    json_data = json.load(file)

                root_node_str_id = json_data['protocolSection']['identificationModule']['nctId']
                with driver.session() as session:
                    session.execute_write(_import_study, root_node_label, root_node_str_id, json_data)

                print(f"Successfully imported: {filename}")
            except Exception as e:
                # Deliberate best effort: one bad file must not stop the batch.
                print(f"Failed to import {filename}: {e}")
    finally:
        driver.close()
|
32
src/study2neo4j/run.py
Normal file
32
src/study2neo4j/run.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import argparse
|
||||
import logging
|
||||
import configparser
|
||||
from ct2neo4j import create_graph_from_directory
|
||||
|
||||
STUDY2NEO4J_VERSION: str = "0.1"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _build_parser():
    """Return the command-line parser: config file and JSON directory are required."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    parser.add_argument('-f', '--files', required=True, type=str, help='Directory with json files')
    return parser


def _read_neo4j_settings(conf_path):
    """Return ``(uri, username, password)`` from the ``[neo4j]`` section of ``conf_path``.

    Raises:
        ValueError: if the file cannot be read, has no ``[neo4j]`` section, or
            lacks one of the three options.
    """
    conf_file = configparser.ConfigParser()
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it did parse, so an empty result means the path was unusable.
    if not conf_file.read(conf_path):
        raise ValueError(f"cannot read configuration file: {conf_path}")
    if not conf_file.has_section('neo4j'):
        raise ValueError(f"configuration file {conf_path} has no [neo4j] section")

    section = conf_file['neo4j']
    missing = [option for option in ('uri', 'username', 'password') if option not in section]
    if missing:
        raise ValueError(
            f"configuration file {conf_path} is missing in [neo4j]: {', '.join(missing)}")
    return section['uri'], section['username'], section['password']


def main(argv=None):
    """Parse the command line, load the database settings and run the import.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv[1:]``.
    """
    logger.info('study2neo4j v%s', STUDY2NEO4J_VERSION)

    # json files dir and db conf are passed as arguments when running the script
    parser = _build_parser()
    args = parser.parse_args(argv)

    try:
        uri, username, password = _read_neo4j_settings(args.conf)
    except ValueError as err:
        # Prints usage plus the message and exits with status 2.
        parser.error(str(err))

    create_graph_from_directory(uri=uri, user=username, password=password,
                                directory_path=args.files)


# start study2neo4j
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user