first release
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
# umls2neo4j: UMLS to Neo4j Importer
|
||||
|
||||
This Python script parses selected relationships from the UMLS Metathesaurus (`MRREL.RRF` and `MRCONSO.RRF`) and loads them into a Neo4j graph database.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Requires a UMLS licence!
|
||||
|
||||
## Features
|
||||
|
||||
- Filters and loads `PAR` (parent) and `CHD` (child) relationships from `MRREL.RRF`
|
||||
- Loads only preferred English concept names from `MRCONSO.RRF`
|
||||
|
||||
|
||||
## Quickstart
|
||||
|
||||
Create a configuration file that stores the connection details for your Neo4j database.
|
||||
For example, save it in your home directory as `umls.conf`.
|
||||
|
||||
```ini
|
||||
[neo4j]
|
||||
uri = bolt://localhost:7687
|
||||
username = neo4j
|
||||
password = myfancypassword
|
||||
```
|
||||
|
||||
Start the program by passing the path to your configuration file and the paths to the UMLS files.
|
||||
|
||||
```sh
|
||||
python3 src/umls2neo4j.py --conf ~/umls.conf --mrconsofiles ~/umls/MRCONSO.RRF --mrrelfiles ~/umls/MRREL.RRF
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- make sure `python3` is installed
|
||||
- install the required libraries with `pip install -r requirements.txt`
|
||||
- download the UMLS Metathesaurus files (`MRREL.RRF`, `MRCONSO.RRF`) from the [NLM UMLS website](https://www.nlm.nih.gov/research/umls/) → requires a UMLS licence
|
||||
- have a running Neo4j DB (Neo4j version 5), with APOC installed
|
||||
- create the configuration-file as described in the [Quickstart section](#quickstart)
|
||||
|
||||
|
||||
## Detailed Information
|
||||
|
||||
The script will:
|
||||
|
||||
1. Load preferred English concept names from `MRCONSO.RRF`
|
||||
2. Parse allowed relationships from `MRREL.RRF`
|
||||
3. Insert nodes and relationships into Neo4j using chunked batches
|
||||
|
||||
## Customisation
|
||||
|
||||
- Adjust `ALLOWED_RELS` in the script to include more relationship types
|
||||
- Tune `batch_chunk_size` and `apoc_batch_size` for better performance
|
||||
@@ -0,0 +1,114 @@
|
||||
import csv
|
||||
from neo4j import GraphDatabase
|
||||
|
||||
|
||||
# Load concept names from MRCONSO.RRF
|
||||
# Join CUI2 (MRREL) to CUI (MRCONSO) where TS=P, STT=PF, ISPREF=Y, LAT=ENG
|
||||
def load_cui_names(file_path):
    """Load the preferred English name and source details per CUI from MRCONSO.RRF.

    A row is kept only when its language is ``ENG``, its term status is ``P``,
    its string type is ``PF``, its ISPREF flag is ``Y`` and its SRL column
    is ``0``.

    Args:
        file_path: Path to the pipe-delimited MRCONSO.RRF file (UTF-8).

    Returns:
        A dict mapping each CUI to a dict with the keys ``"name"``,
        ``"source_vocab"`` and ``"source_vocab_code"``. If several rows of
        the same CUI pass the filter, the last one read wins.
    """
    cui_infos = {}
    # newline="" is what the csv module expects for files it reads itself.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        # RRF is a raw pipe-delimited format without any quoting convention.
        # With the csv default (QUOTE_MINIMAL) a term that starts with '"'
        # loses its quotes, and an unbalanced quote swallows the following
        # rows, so quote handling is switched off entirely.
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            # Blank or truncated lines cannot be indexed up to column 15.
            if len(row) < 16:
                continue

            language = row[1]       # LAT=ENG
            if language != "ENG":
                continue

            term_status = row[2]    # TS=P (Preferred)
            stt = row[4]            # STT=PF (Preferred form of term)
            ispref = row[6]         # ISPREF=Y
            srl_license = row[15]   # should be 0
            if term_status != 'P' or stt != 'PF' or ispref != 'Y':
                continue
            if srl_license != '0':
                continue

            cui_infos[row[0]] = {
                "name": row[14],
                "source_vocab": row[11],
                "source_vocab_code": row[13],
            }
    return cui_infos
|
||||
|
||||
|
||||
# Load MRREL.RRF and return filtered concept relationships
|
||||
def parse_mrrel(file_path, cui_infos, allowed_rels):
    """Read MRREL.RRF and return the relationships that pass the filters.

    A row is kept when its relationship (column 3) is in ``allowed_rels`` and
    both of its CUIs (columns 0 and 4) have a non-empty entry in ``cui_infos``.

    Args:
        file_path: Path to the pipe-delimited MRREL.RRF file (UTF-8).
        cui_infos: Mapping CUI -> dict with the keys ``"name"``,
            ``"source_vocab"`` and ``"source_vocab_code"``, as returned by
            ``load_cui_names``.
        allowed_rels: Container of relationship codes to keep, e.g.
            ``{"PAR", "CHD"}``.

    Returns:
        A list of dicts, one per kept row, in file order. ``"from"`` is the
        row's first CUI and ``"to"`` its second; the ``from_*``/``to_*`` keys
        carry the matching ``cui_infos`` values, and ``"type"``,
        ``"attribute"`` and ``"source"`` come from columns 3, 7 and 10.
    """
    mrrelationships = []
    # newline="" is what the csv module expects for files it reads itself.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        # RRF has no quoting convention; with the csv default a stray '"'
        # at the start of a field would merge the following fields and rows.
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            # Blank or truncated lines cannot be indexed up to column 10.
            if len(row) < 11:
                continue

            rel = row[3]  # relationship, e.g. PAR, CHD
            if rel not in allowed_rels:
                continue

            cui1 = row[0]
            cui2 = row[4]
            from_info = cui_infos.get(cui1)
            to_info = cui_infos.get(cui2)
            # Skip relationships whose endpoints did not survive the
            # MRCONSO filtering.
            if not (from_info and to_info):
                continue

            mrrelationships.append({
                "from": cui1,
                "from_name": from_info["name"],
                "from_source_vocab": from_info["source_vocab"],
                "from_source_vocab_code": from_info["source_vocab_code"],
                "to": cui2,
                "to_name": to_info["name"],
                "to_source_vocab": to_info["source_vocab"],
                "to_source_vocab_code": to_info["source_vocab_code"],
                "type": rel,
                "attribute": row[7],
                "source": row[10],  # source vocabulary
            })
    return mrrelationships
|
||||
|
||||
|
||||
def chunked(data, size):
    """Yield consecutive slices of ``data``, each holding at most ``size`` items.

    The final slice is shorter when ``len(data)`` is not a multiple of
    ``size``; an empty ``data`` yields nothing.
    """
    total = len(data)
    yield from (data[start:start + size] for start in range(0, total, size))
|
||||
|
||||
|
||||
# Load relationships into Neo4j
|
||||
def load_into_neo4j(rels, uri, user, password, batch_chunk_size, apoc_batch_size):
    """Insert the given relationships into the ``neo4j`` database.

    ``rels`` is split into chunks of ``batch_chunk_size``. Each chunk is sent
    as one query that calls ``apoc.periodic.iterate`` (so APOC must be
    installed on the server). For every relationship the query merges both
    ``UMLSconcept`` nodes by ``cui``, sets their name/source properties,
    merges a ``UMLS_RELATION`` between them keyed by ``type`` and sets its
    ``source`` and ``attribute``.

    Args:
        rels: List of relationship dicts as produced by ``parse_mrrel``.
        uri: Connection URI of the Neo4j server.
        user: Database user name.
        password: Database password.
        batch_chunk_size: Number of relationships sent per query.
        apoc_batch_size: ``batchSize`` handed to ``apoc.periodic.iterate``.
    """
    driver = GraphDatabase.driver(
        uri,
        auth=(user, password),
        connection_timeout=60,
        max_connection_lifetime=3600,
    )

    # try/finally guarantees the driver's connections are released even when
    # a query fails part-way through the import.
    try:
        with driver.session(database="neo4j") as session:
            # create constraint on cui
            # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:UMLSconcept) REQUIRE u.cui IS UNIQUE")

            chunk_count = 0

            for chunk_count, chunk in enumerate(chunked(rels, batch_chunk_size), start=1):
                print(f"Inserting chunk {chunk_count} with {len(chunk)} relationships...")

                result = session.run("""
                    CALL apoc.periodic.iterate(
                    'UNWIND $chunk AS rel RETURN rel',
                    '
                    MERGE (c1:UMLSconcept {cui: rel.from})
                    SET c1.name = rel.from_name, c1.source = rel.from_source_vocab, c1.sourcecode = rel.from_source_vocab_code
                    MERGE (c2:UMLSconcept {cui: rel.to})
                    SET c2.name = rel.to_name, c2.source = rel.to_source_vocab, c2.sourcecode = rel.to_source_vocab_code
                    MERGE (c1)-[r:UMLS_RELATION {type: rel.type}]->(c2)
                    SET r.source = rel.source, r.attribute = rel.attribute
                    ',
                    {
                    batchSize: $apoc_batch_size,
                    parallel: false,
                    params: {chunk: $chunk}
                    }
                    )
                    """, {
                    "chunk": chunk,
                    "apoc_batch_size": apoc_batch_size
                })

                # apoc.periodic.iterate does not raise when an inner batch
                # fails; it reports the failure in its single result row.
                # Reading that row also surfaces query errors for this chunk
                # immediately instead of at some later point.
                outcome = result.single()
                if outcome is not None and outcome["failedOperations"]:
                    print(
                        f"Warning: chunk {chunk_count} had "
                        f"{outcome['failedOperations']} failed operations: "
                        f"{outcome['errorMessages']}"
                    )

        print(f"All {chunk_count} batches inserted")
    finally:
        driver.close()
|
||||
@@ -0,0 +1,42 @@
|
||||
import csv
|
||||
from neo4j import GraphDatabase
|
||||
import argparse
|
||||
import configparser
|
||||
from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j
|
||||
|
||||
# Define relationships of interest (MRREL relationship codes that get imported)
ALLOWED_RELS = {"PAR", "CHD"}


def _build_parser():
    """Create the command-line parser: db conf and UMLS file paths are passed as arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    # Path to MRCONSO.RRF file
    parser.add_argument('-fc', '--mrconsofiles', required=True, type=str,
                        help='Path to UMLS MRCONSO.RRF file')
    # Path to MRREL.RRF file
    parser.add_argument('-fr', '--mrrelfiles', required=True, type=str,
                        help='Path to UMLS MRREL.RRF file')
    return parser


def _read_db_settings(parser, conf_path):
    """Return ``(uri, username, password)`` from the ``[neo4j]`` section of ``conf_path``.

    Exits through ``parser.error`` with a readable message when the file
    cannot be read or a required entry is missing.
    """
    conf_file = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did read, so an empty list means the path is wrong.
    if not conf_file.read(conf_path):
        parser.error(f"Cannot read configuration file: {conf_path}")
    try:
        section = conf_file['neo4j']
        return section['uri'], section['username'], section['password']
    except KeyError as err:
        parser.error(
            f"Configuration file {conf_path} is missing {err}; expected a "
            "[neo4j] section with uri, username and password"
        )


def main():
    """Parse the arguments, load both UMLS files and import the result into Neo4j."""
    parser = _build_parser()
    args = parser.parse_args()
    mrconso_file = args.mrconsofiles
    mrrel_file = args.mrrelfiles
    uri, username, password = _read_db_settings(parser, args.conf)

    print("Loading CUI information...")
    name_map = load_cui_names(file_path=mrconso_file)
    print(f"Loaded information about {len(name_map)} CUIs")

    print("Parsing relationships...")
    relationships = parse_mrrel(file_path=mrrel_file, cui_infos=name_map, allowed_rels=ALLOWED_RELS)
    print(f"Parsed {len(relationships)} relationships")

    print("Loading into Neo4j...")
    load_into_neo4j(rels=relationships, uri=uri, user=username, password=password,
                    batch_chunk_size=5000, apoc_batch_size=1000)
    print("Finished loading into Neo4j")


# Run everything
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user