first release

This commit is contained in:
2025-08-22 11:52:43 +02:00
commit ec27c71148
23 changed files with 1543 additions and 0 deletions
+52
View File
@@ -0,0 +1,52 @@
# umls2neo4j: UMLS to Neo4j Importer
This Python script parses selected relationships from the UMLS Metathesaurus (`MRREL.RRF` and `MRCONSO.RRF`) and loads them into a Neo4j graph database.
> [!IMPORTANT]
> Requires a UMLS licence!
## Features
- Filters and loads `PAR` (parent) and `CHD` (child) relationships from `MRREL.RRF`
- Loads only preferred English concept names from `MRCONSO.RRF`
## Quickstart
Create a configuration file that stores your database connection details,
for example `umls.conf` in your home directory.
```ini
[neo4j]
uri = bolt://localhost:7687
username = neo4j
password = myfancypassword
```
Start the program by passing the path to your configuration file and the paths to the UMLS files.
```sh
python3 src/umls2neo4j.py --conf ~/umls.conf --mrconsofiles ~/umls/MRCONSO.RRF --mrrelfiles ~/umls/MRREL.RRF
```
## Requirements
- make sure `python3` is installed
- install the required libraries with `pip install -r requirements.txt`
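- download the UMLS Metathesaurus files (`MRREL.RRF`, `MRCONSO.RRF`) from the [NLM UMLS Knowledge Sources download page](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) → requires a UMLS licence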
- have a running Neo4j DB (Neo4j version 5), with APOC installed
- create the configuration file as described in the [Quickstart section](#quickstart)
## Detailed Information
The script will:
1. Load preferred English concept names from `MRCONSO.RRF`
2. Parse allowed relationships from `MRREL.RRF`
3. Insert nodes and relationships into Neo4j using chunked batches
## Customisation
- Adjust `ALLOWED_RELS` in the script to include more relationship types
- Tune `batch_chunk_size` and `apoc_batch_size` for better performance
+114
View File
@@ -0,0 +1,114 @@
import csv
from neo4j import GraphDatabase
# Load concept names from MRCONSO.RRF
# Join CUI2 (MRREL) to CUI (MRCONSO) where TS=P, STT=PF, ISPREF=Y, LAT=ENG
def load_cui_names(file_path):
    """Load preferred English concept information from ``MRCONSO.RRF``.

    A row is kept only when all of the following hold: ``LAT`` is ``ENG``,
    ``TS`` is ``P``, ``STT`` is ``PF``, ``ISPREF`` is ``Y`` and the source
    restriction level ``SRL`` is ``0``.

    Args:
        file_path: Path to the pipe-delimited ``MRCONSO.RRF`` file.

    Returns:
        A dict mapping each CUI to a dict with the keys ``"name"``,
        ``"source_vocab"`` and ``"source_vocab_code"``. If several rows
        qualify for the same CUI, the last one in the file wins.
    """
    cui_infos = {}
    # newline="" is what the csv module expects from its input file object.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        # The file is plain pipe-delimited text: a double quote is ordinary
        # data. With the csv default (quotechar='"') a name that starts with
        # a quote loses its quotes, and an unbalanced quote swallows the
        # following rows, so quoting is switched off entirely.
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            # Skip blank or truncated lines instead of failing with IndexError.
            if len(row) < 16:
                continue

            language = row[1]      # LAT
            term_status = row[2]   # TS
            stt = row[4]           # STT
            ispref = row[6]        # ISPREF
            srl_license = row[15]  # SRL

            if language != "ENG":
                continue
            if term_status != "P" or stt != "PF" or ispref != "Y":
                continue
            if srl_license != "0":
                continue

            cui_infos[row[0]] = {
                "name": row[14],
                "source_vocab": row[11],
                "source_vocab_code": row[13],
            }
    return cui_infos
# Load MRREL.RRF and return filtered concept relationships
def parse_mrrel(file_path, cui_infos, allowed_rels):
    """Parse ``MRREL.RRF`` and return the relationships of interest.

    A row is kept when its ``REL`` value is in *allowed_rels* and both of its
    CUIs have an entry in *cui_infos*.

    NOTE(review): the UMLS reference manual describes ``REL`` as the
    relationship of the second concept to the first - confirm that consumers
    of the ``"from"``/``"to"`` keys read the direction accordingly.

    Args:
        file_path: Path to the pipe-delimited ``MRREL.RRF`` file.
        cui_infos: Mapping of CUI to a dict with the keys ``"name"``,
            ``"source_vocab"`` and ``"source_vocab_code"``.
        allowed_rels: Container of ``REL`` values to keep (e.g. ``{"PAR"}``).

    Returns:
        A list of dicts, one per kept row, in file order. ``"from"`` is
        ``CUI1`` and ``"to"`` is ``CUI2`` of the row.
    """
    mrrelationships = []
    # newline="" is what the csv module expects from its input file object.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        # The file is plain pipe-delimited text: a double quote is ordinary
        # data, so csv quoting is disabled. Otherwise a field starting with
        # a quote would swallow the following columns and rows.
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            # Skip blank or truncated lines instead of failing with IndexError.
            if len(row) < 11:
                continue

            rel = row[3]  # relationship, e.g. PAR, CHD
            if rel not in allowed_rels:
                continue

            cui1 = row[0]
            cui2 = row[4]
            from_info = cui_infos.get(cui1)
            to_info = cui_infos.get(cui2)
            # Both ends need concept information to be usable downstream.
            if not (from_info and to_info):
                continue

            mrrelationships.append({
                "from": cui1,
                "from_name": from_info["name"],
                "from_source_vocab": from_info["source_vocab"],
                "from_source_vocab_code": from_info["source_vocab_code"],
                "to": cui2,
                "to_name": to_info["name"],
                "to_source_vocab": to_info["source_vocab"],
                "to_source_vocab_code": to_info["source_vocab_code"],
                "type": rel,
                "attribute": row[7],  # RELA
                "source": row[10],    # SAB (source vocabulary)
            })
    return mrrelationships
def chunked(data, size):
    """Yield consecutive slices of *data*, each holding at most *size* items.

    The final slice is shorter when ``len(data)`` is not a multiple of *size*.
    """
    offsets = range(0, len(data), size)
    for begin in offsets:
        end = begin + size
        yield data[begin:end]
# Load relationships into Neo4j
def load_into_neo4j(rels, uri, user, password, batch_chunk_size, apoc_batch_size):
    """Insert concept nodes and their relationships into Neo4j.

    *rels* is sent to the database in slices of *batch_chunk_size* items.
    Each slice is processed server-side by ``apoc.periodic.iterate``, which
    merges both ``UMLSconcept`` nodes and the ``UMLS_RELATION`` between them
    in batches of *apoc_batch_size*.

    Args:
        rels: List of relationship dicts with the keys read by the Cypher
            statement below (``from``, ``from_name``, ``to``, ``type``, ...).
        uri: Connection URI of the Neo4j server.
        user: Neo4j user name.
        password: Neo4j password.
        batch_chunk_size: Number of relationships sent per query.
        apoc_batch_size: ``batchSize`` passed to ``apoc.periodic.iterate``.
    """
    # Loop-invariant, so it is built once instead of once per chunk.
    query = """
        CALL apoc.periodic.iterate(
            'UNWIND $chunk AS rel RETURN rel',
            '
            MERGE (c1:UMLSconcept {cui: rel.from})
            SET c1.name = rel.from_name, c1.source = rel.from_source_vocab, c1.sourcecode = rel.from_source_vocab_code
            MERGE (c2:UMLSconcept {cui: rel.to})
            SET c2.name = rel.to_name, c2.source = rel.to_source_vocab, c2.sourcecode = rel.to_source_vocab_code
            MERGE (c1)-[r:UMLS_RELATION {type: rel.type}]->(c2)
            SET r.source = rel.source, r.attribute = rel.attribute
            ',
            {
                batchSize: $apoc_batch_size,
                parallel: false,
                params: {chunk: $chunk}
            }
        )
    """

    driver = GraphDatabase.driver(uri, auth=(user, password), connection_timeout=60, max_connection_lifetime=3600)
    try:
        with driver.session(database="neo4j") as session:
            # NOTE(review): the uniqueness constraint on cui is left disabled,
            # as before. Without an index the MERGE lookups on cui presumably
            # get slower as the graph grows - consider enabling it:
            # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:UMLSconcept) REQUIRE u.cui IS UNIQUE")

            chunk_count = 0
            failed_chunks = 0
            for chunk_count, chunk in enumerate(chunked(rels, batch_chunk_size), start=1):
                print(f"Inserting chunk {chunk_count} with {len(chunk)} relationships...")
                result = session.run(query, {
                    "chunk": chunk,
                    "apoc_batch_size": apoc_batch_size,
                })
                # apoc.periodic.iterate reports failed batches in its result
                # row rather than raising, so the row has to be inspected.
                summary = result.single()
                if summary is not None and summary.get("failedBatches"):
                    failed_chunks += 1
                    print(
                        f"WARNING: chunk {chunk_count}: "
                        f"{summary.get('failedBatches')} APOC batch(es) failed: "
                        f"{summary.get('errorMessages')}"
                    )

            if failed_chunks:
                print(f"WARNING: {failed_chunks} of {chunk_count} batches reported failures")
            else:
                print(f"All {chunk_count} batches inserted")
    finally:
        # Release the connection pool even when a query raised.
        driver.close()
+42
View File
@@ -0,0 +1,42 @@
import csv
from neo4j import GraphDatabase
import argparse
import configparser
from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j
# Define relationships of interest
ALLOWED_RELS = {"PAR", "CHD"}


def parse_cli_args(argv=None):
    """Parse the command line: database configuration and UMLS file paths.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` with the attributes ``conf``,
        ``mrconsofiles`` and ``mrrelfiles``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    # Path to MRCONSO.RRF file
    parser.add_argument('-fc', '--mrconsofiles', required=True, type=str, help='Path to UMLS MRCONSO.RRF file')
    # Path to MRREL.RRF file
    parser.add_argument('-fr', '--mrrelfiles', required=True, type=str, help='Path to UMLS MRREL.RRF file')
    return parser.parse_args(argv)


def read_neo4j_settings(conf_path):
    """Read the ``[neo4j]`` connection settings from an INI file.

    Args:
        conf_path: Path to the configuration file.

    Returns:
        A ``(uri, username, password)`` tuple.

    Raises:
        FileNotFoundError: If the file is missing or unreadable.
        KeyError: If the ``[neo4j]`` section or one of its options is missing.
    """
    # interpolation=None: a '%' in the password is a literal character, not
    # the start of a configparser interpolation expression.
    conf_file = configparser.ConfigParser(interpolation=None)
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did read, so an empty list means nothing was loaded.
    if not conf_file.read(conf_path):
        raise FileNotFoundError(f"Configuration file not found or unreadable: {conf_path}")
    if not conf_file.has_section('neo4j'):
        raise KeyError(f"Missing [neo4j] section in {conf_path}")

    section = conf_file['neo4j']
    missing = [key for key in ('uri', 'username', 'password') if key not in section]
    if missing:
        raise KeyError(f"Missing option(s) {', '.join(missing)} in [neo4j] section of {conf_path}")
    return section['uri'], section['username'], section['password']


def main(argv=None):
    """Run the import: read the UMLS files and load them into Neo4j."""
    args = parse_cli_args(argv)
    uri, username, password = read_neo4j_settings(args.conf)

    print("Loading CUI information...")
    name_map = load_cui_names(file_path=args.mrconsofiles)
    print(f"Loaded information about {len(name_map)} CUIs")

    print("Parsing relationships...")
    relationships = parse_mrrel(file_path=args.mrrelfiles, cui_infos=name_map, allowed_rels=ALLOWED_RELS)
    print(f"Parsed {len(relationships)} relationships")

    print("Loading into Neo4j...")
    load_into_neo4j(rels=relationships, uri=uri, user=username, password=password, batch_chunk_size=5000, apoc_batch_size=1000)
    print("Finished loading into Neo4j")


# Run everything
if __name__ == "__main__":
    main()