first release

2025-08-22 11:52:43 +02:00
commit ec27c71148
23 changed files with 1543 additions and 0 deletions
@@ -0,0 +1,52 @@
+# umls2neo4j: UMLS to Neo4j Importer
+
+This Python script parses selected relationships from the UMLS Metathesaurus (`MRREL.RRF` and `MRCONSO.RRF`) and loads them into a Neo4j graph database.
+
+> [!IMPORTANT]
+> Requires a UMLS licence!
+
+## Features
+
+- Filters and loads `PAR` (parent) and `CHD` (child) relationships from `MRREL.RRF`
+- Loads only preferred English concept names from `MRCONSO.RRF`
+
+
+## Quickstart
+
+Create a configuration file that stores the connection details for your database,
+e.g. a file named `umls.conf` in your home directory.
+
+```ini
+[neo4j]
+uri = bolt://localhost:7687
+username = neo4j
+password = myfancypassword
+```
+
+Start the program by providing the location of your configuration-file and the location of the UMLS-files.
+
+```sh
+python3 src/umls2neo4j.py --conf ~/umls.conf --mrconsofiles ~/umls/MRCONSO.RRF --mrrelfiles ~/umls/MRREL.RRF
+```
+
+## Requirements
+
+- make sure `python3` is installed
+- install the required libraries with `pip install -r requirements.txt`
+- download the UMLS Metathesaurus files (`MRREL.RRF`, `MRCONSO.RRF`) from the U.S. National Library of Medicine (NLM) UMLS download page → requires a UMLS licence
+- have a running Neo4j DB (Neo4j version 5), with APOC installed
+- create the configuration-file as described in the [Quickstart section](#quickstart)
+
+
+## Detailed Information
+
+The script will:
+
+1. Load preferred English concept names from `MRCONSO.RRF`
+2. Parse allowed relationships from `MRREL.RRF`
+3. Insert nodes and relationships into Neo4j using chunked batches
+
+## Customisation
+
+- Adjust `ALLOWED_RELS` in the script to include more relationship types
+- Tune `batch_chunk_size` and `apoc_batch_size` for better performance
@@ -0,0 +1,114 @@
+import csv
+from neo4j import GraphDatabase
+
+
+# Load concept names from MRCONSO.RRF
+# Join CUI2 (MRREL) to CUI (MRCONSO) where TS=P, STT=PF, ISPREF=Y, LAT=ENG
def load_cui_names(file_path):
    """Load the preferred English name and source details per CUI from MRCONSO.RRF.

    A row is kept only when LAT=ENG, TS=P, STT=PF, ISPREF=Y and SRL=0. When
    several rows of the same CUI pass the filter, the last one in the file wins.

    Args:
        file_path: Path to the pipe-delimited MRCONSO.RRF file (read as UTF-8).

    Returns:
        dict mapping each CUI to a dict with the keys ``"name"``,
        ``"source_vocab"`` and ``"source_vocab_code"``.

    Raises:
        IndexError: If a non-empty row has fewer than 16 fields.
    """
    cui_infos = {}
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        # RRF files are NOT quoted: a '"' is an ordinary character of the term.
        # With the csv default (quotechar='"') a field starting with '"' would
        # swallow the following delimiters and lines, so quoting is disabled.
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue

            cui = row[0]
            language = row[1]  # LAT
            term_status = row[2]  # TS, 'P' = preferred
            stt = row[4]  # STT, 'PF' = preferred form of term
            ispref = row[6]  # ISPREF
            source_vocab = row[11]
            source_vocab_code = row[13]
            name = row[14]
            srl_license = row[15]  # SRL, only '0' is accepted

            if language != "ENG":
                continue
            if term_status != "P" or stt != "PF" or ispref != "Y":
                continue
            if srl_license != "0":
                continue

            cui_infos[cui] = {
                "name": name,
                "source_vocab": source_vocab,
                "source_vocab_code": source_vocab_code,
            }
    return cui_infos
+
+
+# Load MRREL.RRF and return filtered concept relationships
def parse_mrrel(file_path, cui_infos, allowed_rels):
    """Read MRREL.RRF and return the relationships that should be imported.

    A row is kept when its REL value is in *allowed_rels* and both of its CUIs
    have a non-empty entry in *cui_infos*.

    Args:
        file_path: Path to the pipe-delimited MRREL.RRF file (read as UTF-8).
        cui_infos: Mapping CUI -> dict with the keys ``"name"``,
            ``"source_vocab"`` and ``"source_vocab_code"`` (the shape returned
            by ``load_cui_names``).
        allowed_rels: Container of REL values to keep, e.g. ``{"PAR", "CHD"}``.

    Returns:
        list of dicts with the keys ``from``, ``from_name``,
        ``from_source_vocab``, ``from_source_vocab_code``, ``to``, ``to_name``,
        ``to_source_vocab``, ``to_source_vocab_code``, ``type``, ``attribute``
        and ``source``, in file order.

    Raises:
        IndexError: If a non-empty row has fewer than 11 fields.
    """
    mrrelationships = []
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        # RRF files are not quoted; treat '"' as an ordinary character so a
        # field starting with it cannot swallow the following fields/lines.
        reader = csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines
                continue

            cui1 = row[0]
            rel = row[3]  # relationship, e.g. PAR, CHD
            cui2 = row[4]
            rel_attribute = row[7]
            source_vocab = row[10]  # source vocabulary
            # NOTE: row[2] / row[6] hold the type of each side; they are
            # deliberately not filtered on (the original check was disabled).

            if rel not in allowed_rels:
                continue

            from_info = cui_infos.get(cui1)
            to_info = cui_infos.get(cui2)
            if not from_info or not to_info:
                # at least one side has no preferred English name -> skip
                continue

            mrrelationships.append({
                "from": cui1,
                "from_name": from_info["name"],
                "from_source_vocab": from_info["source_vocab"],
                "from_source_vocab_code": from_info["source_vocab_code"],
                "to": cui2,
                "to_name": to_info["name"],
                "to_source_vocab": to_info["source_vocab"],
                "to_source_vocab_code": to_info["source_vocab_code"],
                "type": rel,
                "attribute": rel_attribute,
                "source": source_vocab,
            })
    return mrrelationships
+
+
def chunked(data, size):
    """Yield consecutive slices of *data* holding at most *size* items each.

    Args:
        data: A sequence supporting ``len()`` and slicing (e.g. a list).
        size: Maximum number of items per slice; must be positive.

    Yields:
        Slices of *data* in order; the last one may be shorter than *size*.

    Raises:
        ValueError: If *size* is not positive. As this is a generator, the
            error is raised when iteration starts.
    """
    if size <= 0:
        # A negative step would make range() empty and silently drop all data.
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(data), size):
        yield data[start:start + size]
+
+
+# Load relationships into Neo4j
def load_into_neo4j(rels, uri, user, password, batch_chunk_size, apoc_batch_size, database="neo4j"):
    """Merge concepts and their relationships into Neo4j via APOC.

    *rels* is sent to the server in chunks of *batch_chunk_size*; each chunk is
    processed by ``apoc.periodic.iterate`` in transactions of
    *apoc_batch_size* rows. Nodes are merged on ``UMLSconcept.cui`` and
    relationships on ``UMLS_RELATION.type``.

    Args:
        rels: List of relationship dicts as produced by ``parse_mrrel``.
        uri: Connection URI of the Neo4j server.
        user: Neo4j user name.
        password: Neo4j password.
        batch_chunk_size: Number of relationships sent per query.
        apoc_batch_size: ``batchSize`` handed to ``apoc.periodic.iterate``.
        database: Name of the target database (defaults to ``"neo4j"``).

    Raises:
        RuntimeError: If APOC reports failed operations for a chunk.
    """
    query = """
        CALL apoc.periodic.iterate(
          'UNWIND $chunk AS rel RETURN rel',
          '
            MERGE (c1:UMLSconcept {cui: rel.from})
              SET c1.name = rel.from_name, c1.source = rel.from_source_vocab, c1.sourcecode = rel.from_source_vocab_code
            MERGE (c2:UMLSconcept {cui: rel.to})
              SET c2.name = rel.to_name, c2.source = rel.to_source_vocab, c2.sourcecode = rel.to_source_vocab_code
            MERGE (c1)-[r:UMLS_RELATION {type: rel.type}]->(c2)
              SET r.source = rel.source, r.attribute = rel.attribute
          ',
          {
            batchSize: $apoc_batch_size,
            parallel: false,
            params: {chunk: $chunk}
          }
        )
    """

    # Context managers close session and driver even when a chunk fails.
    with GraphDatabase.driver(uri, auth=(user, password), connection_timeout=60,
                              max_connection_lifetime=3600) as driver:
        with driver.session(database=database) as session:
            # NOTE(review): the uniqueness constraint below is disabled in the
            # original; without an index on cui the MERGEs may be slow - confirm.
            # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:UMLSconcept) REQUIRE u.cui IS UNIQUE")

            chunk_count = 0
            for chunk_count, chunk in enumerate(chunked(rels, batch_chunk_size), start=1):
                print(f"Inserting chunk {chunk_count} with {len(chunk)} relationships...")

                result = session.run(query, {
                    "chunk": chunk,
                    "apoc_batch_size": apoc_batch_size,
                })
                # apoc.periodic.iterate does not raise when a batch fails; it
                # reports failures in its single result row, so inspect it.
                record = result.single()
                if record is not None and record["failedOperations"]:
                    raise RuntimeError(
                        f"Chunk {chunk_count}: {record['failedOperations']} operations failed: "
                        f"{record['errorMessages']}"
                    )

            print(f"All {chunk_count} batches inserted")
@@ -0,0 +1,42 @@
+import csv
+from neo4j import GraphDatabase
+import argparse
+import configparser
+from methods_umls2neo4j import load_cui_names, parse_mrrel, load_into_neo4j
+
# Relationship types (MRREL.RRF column REL) that are imported into Neo4j
ALLOWED_RELS = {"PAR", "CHD"}


def build_arg_parser():
    """Return the command-line parser of the importer.

    The database configuration and the two UMLS files are mandatory; the batch
    sizes are optional and default to the values used so far (5000 / 1000).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conf', required=True, type=str,
                        help='Configuration file with database connection parameters')
    # Path to MRCONSO.RRF file
    parser.add_argument('-fc', '--mrconsofiles', required=True, type=str, help='Path to UMLS MRCONSO.RRF file')
    # Path to MRREL.RRF file
    parser.add_argument('-fr', '--mrrelfiles', required=True, type=str, help='Path to UMLS MRREL.RRF file')
    parser.add_argument('--batch-chunk-size', type=int, default=5000,
                        help='Number of relationships sent to Neo4j per query (default: 5000)')
    parser.add_argument('--apoc-batch-size', type=int, default=1000,
                        help='batchSize used by apoc.periodic.iterate (default: 1000)')
    return parser


def read_neo4j_settings(conf_path):
    """Read the ``[neo4j]`` section of *conf_path*.

    Returns:
        Tuple ``(uri, username, password)``.

    Raises:
        FileNotFoundError: If the file cannot be read. ``ConfigParser.read``
            silently skips such files, so this is checked explicitly.
        KeyError: If the ``neo4j`` section or one of its keys is missing.
    """
    conf_file = configparser.ConfigParser()
    if not conf_file.read(conf_path):
        raise FileNotFoundError(f"Cannot read configuration file: {conf_path}")
    section = conf_file['neo4j']
    return section['uri'], section['username'], section['password']


def main(argv=None):
    """Parse the arguments, load both UMLS files and import them into Neo4j.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = build_arg_parser()
    args = parser.parse_args(argv)
    try:
        uri, username, password = read_neo4j_settings(args.conf)
    except FileNotFoundError as err:
        parser.error(str(err))

    print("Loading CUI information...")
    name_map = load_cui_names(file_path=args.mrconsofiles)
    print(f"Loaded information about {len(name_map)} CUIs")

    print("Parsing relationships...")
    relationships = parse_mrrel(file_path=args.mrrelfiles, cui_infos=name_map, allowed_rels=ALLOWED_RELS)
    print(f"Parsed {len(relationships)} relationships")

    print("Loading into Neo4j...")
    load_into_neo4j(rels=relationships, uri=uri, user=username, password=password,
                    batch_chunk_size=args.batch_chunk_size, apoc_batch_size=args.apoc_batch_size)
    print("Finished loading into Neo4j")


# Run everything only when executed as a script, never on import
if __name__ == "__main__":
    main()