first release

This commit is contained in:
2025-08-22 11:52:43 +02:00
commit ec27c71148
23 changed files with 1543 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
# postprocessing
`postprocess.py` executes Cypher queries to create mappings between MeSH, UMLS, MDM, and ClinicalTrials.gov data in a Neo4j graph database.
The main script `postprocess.py` runs a set of predefined Cypher queries for:
* mapping MeSH terms to UMLS concepts
* mapping MDM Portal aliases to UMLS concepts
* mapping ClinicalTrials.gov studies to MeSH terms
* mapping ClinicalTrials.gov studies to MDM Portal entries
## Quickstart
Create a configuration file with your Neo4j connection details.
For example, save it as `postprocess.conf`:
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Run the postprocessing by providing the configuration file:
```sh
python3 src/postprocess.py --conf ~/postprocess.conf
```
## Requirements
* make sure `python3` and the `neo4j` Python driver (`pip install neo4j`) are installed
* have a running Neo4j instance (version 5)
* the Neo4j instance must have the APOC plugin installed
* create the configuration file as described in the [Quickstart](#quickstart) section

View File

@@ -0,0 +1,101 @@
from neo4j import GraphDatabase
import logging
import argparse
import configparser
# Configure the root logger at INFO level so the progress messages logged
# by this script are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used for the success/failure messages of each query.
logger = logging.getLogger(__name__)
def run_cypher_query_for_postprocessing(uri, user, password, cypher_query):
    """Run a single Cypher statement against a Neo4j instance.

    A driver and a session are opened for the duration of the call and
    closed again afterwards, whether or not the statement succeeds.

    Args:
        uri: Connection URI of the Neo4j instance.
        user: User name used for authentication.
        password: Password used for authentication.
        cypher_query: The Cypher statement to execute.

    Raises:
        Exception: Any error raised while connecting or executing the
            statement is logged and then re-raised unchanged.
    """
    try:
        with GraphDatabase.driver(uri, auth=(user, password)) as driver:
            with driver.session() as session:
                # session.run() returns a lazy Result. Consuming it drains
                # the stream so that errors raised while the server executes
                # the statement surface here, before success is logged.
                session.run(cypher_query).consume()
        logger.info("Postprocessing query executed successfully.")
    except Exception as e:
        logger.error("Failed to execute query: %s", e)
        raise
# cypher queries for postprocessing
# Mapping MeSH terms to UMLS concepts (single-transaction variant).
# For every MeshClass node, each value of its `cui` property is matched
# against UMLSconcept nodes with the same `cui`, and a MESH_TO_UMLS_MAPPING
# relationship is merged from the MeshClass node to the concept.
# `m` must stay bound up to the MERGE: aggregating the cuis with
# `WITH collect(...)` would drop `m` from scope, and MERGE would then create
# new anonymous nodes instead of linking the existing MeshClass nodes.
cypher_mesh_to_umls = """
MATCH (m:MeshClass)
UNWIND m.cui AS mesh_cui
MATCH (a:UMLSconcept {cui: mesh_cui})
MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)
"""
# Mapping MeSH terms to UMLS concepts, optimised for larger datasets.
# Uses apoc.periodic.iterate (requires the APOC plugin): the first inner
# statement returns every MeshClass node, the second is applied to those
# nodes with batchSize 1000 and parallel execution disabled. For each value
# in `m.cui` it matches the UMLSconcept with that `cui` and merges a
# MESH_TO_UMLS_MAPPING relationship from the MeshClass node to the concept.
cypher_mesh_to_umls_optimised = """
CALL apoc.periodic.iterate(
"MATCH (m:MeshClass) RETURN m",
"UNWIND m.cui AS mesh_cui
MATCH (a:UMLSconcept {cui: mesh_cui})
MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)",
{batchSize:1000, parallel:false}
);
"""
# Mapping MDM Portal aliases to UMLS concepts.
# For every Alias node, finds the UMLSconcept nodes whose `cui` equals the
# alias's `Name` property and merges a MAPS_TO_UMLS_ENTRY relationship from
# the alias to the concept.
cypher_mdm_to_umls = """
MATCH (a:Alias)
WITH a, a.Name as aliasname
MATCH (u:UMLSconcept)
WHERE u.cui = aliasname
MERGE (a)-[:MAPS_TO_UMLS_ENTRY]->(u)
"""
# Mapping ClinicalTrials.gov studies to MeSH terms.
# For every MeshClass node, finds the `meshes` nodes whose `id` equals the
# MeshClass `notation` property and merges a MAPS_TO_MESH_ENTRY relationship
# from the `meshes` node to the MeshClass node.
# NOTE(review): `meshes` nodes presumably come from the ClinicalTrials.gov
# import - confirm against the loader.
cypher_ct_to_mesh = """
MATCH (c:MeshClass)
WITH c, c.notation as cnote
MATCH (m:meshes)
WHERE m.id = cnote
MERGE (m)-[:MAPS_TO_MESH_ENTRY]->(c)
"""
# Mapping ClinicalTrials.gov studies to MDM Portal entries.
# A Study node is linked (BELONGS_TO) to every ClinicalTrialsEntry whose
# `str_id` equals either the study's `OID` or its `NCT_ID`.
# Both lookups are expressed as ONE statement: the driver's session.run()
# executes exactly one Cypher statement per call, so two statements
# separated by ';' in a single string are rejected by the server.
# Missing (null) identifiers are filtered out explicitly; MERGE keeps the
# result idempotent when OID and NCT_ID point to the same entry.
cypher_ct_to_mdm = """
MATCH (s:Study)
UNWIND [s.OID, s.NCT_ID] AS study_id
WITH s, study_id
WHERE study_id IS NOT NULL
MATCH (c:ClinicalTrialsEntry {str_id: study_id})
MERGE (s)-[:BELONGS_TO]->(c)
"""
def _load_neo4j_settings(parser, conf_path):
    """Return ``(uri, username, password)`` from the ``[neo4j]`` config section.

    Problems with the configuration file are reported through
    ``parser.error`` (usage message on stderr, exit status 2) instead of
    surfacing as a bare ``KeyError``.
    """
    # NOTE(review): ConfigParser uses '%' interpolation by default, so a
    # literal '%' in the password must be written as '%%' in the file.
    conf_file = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns
    # the list of files it actually parsed; an empty list means the given
    # path was missing or unreadable.
    if not conf_file.read(conf_path):
        parser.error(f"cannot read configuration file: {conf_path}")
    try:
        section = conf_file['neo4j']
        return section['uri'], section['username'], section['password']
    except KeyError as missing:
        parser.error(
            f"configuration file {conf_path} is missing section/key {missing}"
        )


def main(argv=None):
    """Parse the command line and run all postprocessing queries in order.

    Args:
        argv: Optional list of command-line arguments; when ``None`` the
            arguments are taken from ``sys.argv`` by ``argparse``.
    """
    # The database configuration file is passed as an argument when
    # running the script.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    args = parser.parse_args(argv)
    uri, username, password = _load_neo4j_settings(parser, args.conf)

    postprocessing_queries = (
        cypher_mesh_to_umls_optimised,  # postprocess mesh to umls
        cypher_ct_to_mesh,              # postprocess clinicaltrials.gov to mesh
        cypher_mdm_to_umls,             # postprocess mdm to umls
        cypher_ct_to_mdm,               # postprocess clinicaltrials.gov to mdm
    )
    for cypher_query in postprocessing_queries:
        run_cypher_query_for_postprocessing(
            uri=uri, user=username, password=password, cypher_query=cypher_query
        )


# Parsing arguments and touching the database only happens when the file is
# executed as a script, so importing the module has no side effects beyond
# definitions and logging setup.
if __name__ == "__main__":
    main()