first release
This commit is contained in:
32
src/postprocessing/README.md
Normal file
32
src/postprocessing/README.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# postprocessing
|
||||
`postprocess.py` executes Cypher queries to create mappings between MeSH, UMLS, MDM, and ClinicalTrials.gov data in a Neo4j graph database.
|
||||
|
||||
The main script `postprocess.py` runs a set of predefined Cypher queries for:
|
||||
* mapping MeSH terms to UMLS concepts
|
||||
* mapping MDM Portal aliases to UMLS concepts
|
||||
* mapping ClinicalTrials.gov studies to MeSH terms
|
||||
* mapping ClinicalTrials.gov studies to MDM Portal entries
|
||||
|
||||
## Quickstart
|
||||
|
||||
Create a configuration file with your Neo4j connection details.
|
||||
For example, save it as `postprocess.conf`:
|
||||
|
||||
```ini
|
||||
[neo4j]
|
||||
uri = bolt://localhost:7687
|
||||
username = neo4j
|
||||
password = myfancypassword
|
||||
```
|
||||
|
||||
Run the postprocessing by providing the configuration file:
|
||||
```sh
|
||||
python3 src/postprocessing/postprocess.py --conf ~/postprocess.conf
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
* make sure `python3` and the `neo4j` Python driver package are installed (e.g. `pip install neo4j`)
|
||||
* have a running Neo4j instance (version 5)
|
||||
* the Neo4j instance must have the APOC plugin installed
|
||||
* create the configuration file as described in the [Quickstart](#quickstart) section
|
101
src/postprocessing/postprocess.py
Normal file
101
src/postprocessing/postprocess.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from neo4j import GraphDatabase
|
||||
import logging
|
||||
import argparse
|
||||
import configparser
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# define function for running cypher queries against a neo4j
def run_cypher_query_for_postprocessing(uri, user, password, cypher_query):
    """Run one Cypher statement against Neo4j as an auto-commit query.

    A driver and a session are opened for this call only and are closed
    again before the function returns.

    Args:
        uri: Connection URI handed to ``GraphDatabase.driver``.
        user: User name used in the auth tuple.
        password: Password used in the auth tuple.
        cypher_query: Cypher text handed to ``session.run``.

    Raises:
        Exception: Any error raised while connecting or while running the
            query is logged and then re-raised unchanged.
    """
    try:
        with GraphDatabase.driver(uri, auth=(user, password)) as driver:
            with driver.session() as session:
                # Drain the result inside the session so that errors reported
                # while the result is streamed surface here, before success
                # is logged.
                session.run(cypher_query).consume()
        logger.info("Postprocessing query executed successfully.")
    except Exception as e:
        logger.error(f"Failed to execute query: {e}")
        raise
|
||||
|
||||
|
||||
# cypher queries for postprocessing
# NOTE: every constant below must hold exactly ONE Cypher statement, because
# each one is sent to the database with a single session.run() call.

# mapping MeSH terms to UMLS concepts
# `m` has to stay in scope up to the MERGE; aggregating the CUIs with
# collect() first would drop it and the MERGE would no longer refer to the
# matched MeshClass nodes. Unwinding m.cui per node keeps the binding.
cypher_mesh_to_umls = """
MATCH (m:MeshClass)
UNWIND m.cui AS mesh_cui
MATCH (a:UMLSconcept {cui: mesh_cui})
MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)
"""

# mapping MeSH terms to UMLS concepts, optimised for larger dataset
# same mapping as above, but executed through apoc.periodic.iterate in
# batches of 1000 MeshClass nodes
cypher_mesh_to_umls_optimised = """
CALL apoc.periodic.iterate(
  "MATCH (m:MeshClass) RETURN m",
  "UNWIND m.cui AS mesh_cui
   MATCH (a:UMLSconcept {cui: mesh_cui})
   MERGE (m)-[:MESH_TO_UMLS_MAPPING]->(a)",
  {batchSize:1000, parallel:false}
);
"""

# mapping MDM Portal Alias to UMLS concepts
# an Alias is linked to the UMLSconcept whose cui equals the alias' Name
cypher_mdm_to_umls = """
MATCH (a:Alias)
WITH a, a.Name as aliasname
MATCH (u:UMLSconcept)
WHERE u.cui = aliasname
MERGE (a)-[:MAPS_TO_UMLS_ENTRY]->(u)
"""

# mapping ClinicalTrials.gov studies to MeSH terms
# a `meshes` node is linked to the MeshClass whose notation equals its id
cypher_ct_to_mesh = """
MATCH (c:MeshClass)
WITH c, c.notation as cnote
MATCH (m:meshes)
WHERE m.id = cnote
MERGE (m)-[:MAPS_TO_MESH_ENTRY]->(c)
"""

# mapping ClinicalTrials.gov studies to MDM Portal entries
# a Study is linked to a ClinicalTrialsEntry when the entry's str_id equals
# either the study's OID or its NCT_ID. Both lookups are folded into one
# statement (two ';'-separated statements cannot be sent in one run() call);
# a missing (null) OID / NCT_ID simply matches nothing.
cypher_ct_to_mdm = """
MATCH (s:Study)
UNWIND [s.OID, s.NCT_ID] AS study_id
MATCH (c:ClinicalTrialsEntry {str_id: study_id})
MERGE (s)-[r:BELONGS_TO]->(c)
"""
|
||||
|
||||
|
||||
# define parameters - pass the db connection conf file as argument when running the script
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--conf', required=True, type=str,
                    help='Configuration file with database connection parameters')

# parse parameters
args = parser.parse_args()

conf_file = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the given path was not
# readable. Report that directly instead of failing later on the section lookup.
if not conf_file.read(args.conf):
    parser.error(f"cannot read configuration file: {args.conf}")

# connection details are taken from the [neo4j] section of the conf file
uri = conf_file['neo4j']['uri']
username = conf_file['neo4j']['username']
password = conf_file['neo4j']['password']
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Postprocessing steps, executed one after another in this order:
    #   1. mesh to umls (batched variant)
    #   2. clinicaltrials.gov to mesh
    #   3. mdm to umls
    #   4. clinicaltrials.gov to mdm
    postprocessing_steps = (
        cypher_mesh_to_umls_optimised,
        cypher_ct_to_mesh,
        cypher_mdm_to_umls,
        cypher_ct_to_mdm,
    )
    for step_query in postprocessing_steps:
        run_cypher_query_for_postprocessing(
            uri=uri,
            user=username,
            password=password,
            cypher_query=step_query,
        )
|
Reference in New Issue
Block a user