added batch processing
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,2 +1,3 @@
|
|||||||
**__pycache__/
|
**__pycache__/
|
||||||
datasource/
|
datasource/
|
||||||
|
old-src/
|
||||||
|
|||||||
7
.idea/mdm-to-neo4j.iml
generated
7
.idea/mdm-to-neo4j.iml
generated
@@ -1,11 +1,14 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<module type="PYTHON_MODULE" version="4">
|
<module type="PYTHON_MODULE" version="4">
|
||||||
<component name="NewModuleRootManager">
|
<component name="NewModuleRootManager">
|
||||||
<content url="file://$MODULE_DIR$" />
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||||
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="Python 3.11 (mdm-to-neo4j) (2)" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Python 3.11 (mdm-to-neo4j) (2)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
</component>
|
</component>
|
||||||
<component name="PackageRequirementsSettings">
|
<component name="PackageRequirementsSettings">
|
||||||
<option name="requirementsPath" value="$USER_HOME$/PycharmProjects/mdm-to-neo4j/requirements.txt" />
|
<option name="requirementsPath" value="$MODULE_DIR$/requirements.txt" />
|
||||||
</component>
|
</component>
|
||||||
</module>
|
</module>
|
||||||
@@ -4,6 +4,7 @@ import logging
|
|||||||
from neo4j import GraphDatabase, Driver
|
from neo4j import GraphDatabase, Driver
|
||||||
from pandas import DataFrame
|
from pandas import DataFrame
|
||||||
from run import args
|
from run import args
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -62,147 +63,121 @@ class Neo4jConnection:
|
|||||||
logger.debug("Connection to Neo4j database closed")
|
logger.debug("Connection to Neo4j database closed")
|
||||||
self.__connection = None
|
self.__connection = None
|
||||||
|
|
||||||
def create_study_node(self, data: DataFrame, modelid):
|
def execute_batch_query(self, query: str, data: List[Dict], **params) -> None:
|
||||||
|
"""Execute batch query for better performance"""
|
||||||
|
try:
|
||||||
|
if isinstance(data, list) and len(data) > 0: # Only execute if there's data, change if data to ...
|
||||||
|
self.__connection.execute_query(query, data=data, **params)
|
||||||
|
logger.debug(f"Executed batch query with {len(data)} records")
|
||||||
|
else:
|
||||||
|
self.__connection.execute_query(query, data=data, **params)
|
||||||
|
logger.debug("Executed batch query with no additional data")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to execute batch query: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def create_study_node(self, data: List[Dict], modelid: str) -> None:
|
||||||
|
query = """
|
||||||
|
UNWIND $data AS row
|
||||||
|
MERGE (st:Study {OID: row.oid, ModelID: $modelid})
|
||||||
|
SET st.Name = row.name,
|
||||||
|
st.Description = row.descr,
|
||||||
|
st.Protocol = row.protocol
|
||||||
"""
|
"""
|
||||||
based on https://neo4j.com/docs/python-manual/current/query-simple/
|
self.execute_batch_query(query, data, modelid=modelid)
|
||||||
|
|
||||||
|
def create_studyevent_node(self, data: List[Dict], studyoid: str, modelid: str):
|
||||||
|
query = """
|
||||||
|
UNWIND $data AS row
|
||||||
|
MERGE (se:StudyEvent {OID: row.OID, Name: row.Name, Type: row.Type, StudyOID: $study_oid, ModelID: $modelid})
|
||||||
"""
|
"""
|
||||||
conn = self.__connection
|
self.execute_batch_query(query, data, study_oid=studyoid, modelid=modelid)
|
||||||
try:
|
|
||||||
for index, row in data.iterrows():
|
|
||||||
conn.execute_query(
|
|
||||||
"MERGE (st:Study {OID: $studyoid, Name: $studyname, Description: $studydescr, Protocol: $studyprotocol, ModelID: $modelid})",
|
|
||||||
studyoid=row['oid'], studyname=row['name'], studydescr=row['descr'], studyprotocol=row["protocol"], modelid=modelid
|
|
||||||
)
|
|
||||||
logger.debug(f"Inserted Study node with OID {row['oid']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_studyevent_node(self, data: DataFrame, studyoid: str, modelid):
|
def create_form_node(self, data: List[Dict], studyoid: str, modelid: str):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MERGE (f:Form {OID: row.OID, Name: row.Name, StudyOID: $study_oid, ModelID: $modelid})
|
||||||
conn.execute_query("MERGE (:StudyEvent {OID: $oid, Name: $name, Type: $type, StudyOID: $study_oid, ModelID: $modelid})",
|
"""
|
||||||
oid=row['OID'], name=row['Name'], type=row['Type'], study_oid=studyoid, modelid=modelid
|
self.execute_batch_query(query, data, study_oid=studyoid, modelid=modelid)
|
||||||
)
|
|
||||||
logger.debug(f"Inserted StudyEvent node with OID {row['OID']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_form_node(self, data: DataFrame, studyoid: str, modelid):
|
def create_itemgroup_node(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MERGE (ig:ItemGroup {OID: row.OID, Name: row.Name, StudyOID: $study_oid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
"""
|
||||||
"MERGE (:Form {OID: $oid, Name: $name, StudyOID: $studyoid, ModelID: $modelid})",
|
self.execute_batch_query(query, data, study_oid=studyoid, modelid=modelid)
|
||||||
oid=row['OID'], name=row['Name'], studyoid=studyoid, modelid=modelid)
|
|
||||||
logger.debug(f"Inserted Form node with OID {row['OID']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_itemgroup_node(self, data: DataFrame, studyoid: str, modelid):
|
def create_item_node(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MERGE (i:Item {OID: row.itemoid, Name: row.name, DataType: row.datatype, Question: row.question, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
"""
|
||||||
"MERGE (:ItemGroup {OID: $oid, Name: $name, StudyOID: $studyoid, ModelID: $modelid})",
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
oid=row['OID'], name=row['Name'], studyoid=studyoid, modelid=modelid)
|
|
||||||
logger.debug(f"Inserted ItemGroup node with OID {row['OID']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_item_node(self, data: DataFrame, studyoid: str, modelid):
|
def create_alias_node(self, data: List[Dict], studyoid: str, modelid: str) -> None:
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MERGE (a:Alias {Context: row.end, Name: row.end_two, ItemOID: row.start, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
"""
|
||||||
"MERGE (:Item {OID: $oid, Name: $name, DataType: $datatype, Question: $question, StudyOID: $studyoid, ModelID: $modelid})",
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
oid=row['itemoid'], name=row['name'], datatype=row['datatype'], question=row['question'],
|
|
||||||
studyoid=studyoid, modelid=modelid)
|
|
||||||
logger.debug(f"Inserted Item node with OID {row['itemoid']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_alias_node(self, data: DataFrame,
|
def create_measurementunit_node(self, data: List[Dict]):
|
||||||
studyoid: str, modelid):
|
query = """
|
||||||
conn = self.__connection
|
UNWIND $data AS row
|
||||||
try:
|
MERGE (m:MeasurementUnit {OID: row.oid, Name: row.name, Symbol: row.symbol})
|
||||||
for index, row in data.iterrows():
|
"""
|
||||||
conn.execute_query(
|
self.execute_batch_query(query, data)
|
||||||
"MERGE (:Alias {Context: $context, Name: $name, ItemOID: $item, StudyOID: $study, ModelID: $modelid})",
|
|
||||||
context=row['end'], name=row['end_two'], item=row['start'], study=studyoid, modelid=modelid)
|
|
||||||
logger.debug(f"Inserted Alias node with name {row['end_two']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_measurementunit_node(self, data: DataFrame):
|
|
||||||
conn = self.__connection
|
|
||||||
try:
|
|
||||||
for index, row in data.iterrows():
|
|
||||||
conn.execute_query(
|
|
||||||
"MERGE (:MeasurementUnit {OID: $oid, Name: $name, Symbol: $symbol})",
|
|
||||||
oid=row['oid'], name=row['name'], symbol=row['symbol'])
|
|
||||||
logger.debug(f"Inserted MeasurementUnit node with name {row['name']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_basedef_node(self, studyoid: str, modelid):
|
def create_basedef_node(self, studyoid: str, modelid):
|
||||||
|
query = """
|
||||||
|
MERGE (b:BasicDefinitions {StudyOID: $studyoid, ModelID: $modelid})
|
||||||
|
"""
|
||||||
|
self.execute_batch_query(query, data=[], studyoid=studyoid, modelid=modelid)
|
||||||
|
|
||||||
|
def create_rangecheck_node(self, data: List[Dict]):
|
||||||
|
query = """
|
||||||
|
UNWIND $data AS row
|
||||||
|
MERGE (rc:RangeCheck {Comparator: row.comparator, Constraint: row.constraint, CheckValue: row.value})
|
||||||
|
"""
|
||||||
|
self.execute_batch_query(query, data)
|
||||||
|
|
||||||
|
def create_codelist_node(self, data: List[Dict], studyoid, modelid):
|
||||||
|
query = """
|
||||||
|
UNWIND $data AS row
|
||||||
|
MERGE (cl:CodeList {OID: row.OID, Name: row.Name, DataType: row.DataType, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
|
"""
|
||||||
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
|
|
||||||
|
def create_clitem_node(self, data: List[Dict]):
|
||||||
|
query = """
|
||||||
|
UNWIND $data AS row
|
||||||
|
MERGE (cli:CodeListItem {CodedValue: row.codedvalue, Decode: row.decode})
|
||||||
|
"""
|
||||||
|
self.execute_batch_query(query, data)
|
||||||
|
|
||||||
|
# Batch relationship creation methods
|
||||||
|
def create_relationships_batch(self, query: str, data: List[Dict], **params) -> None: # todo: remove it not needed
|
||||||
|
"""Generic batch relationship creation"""
|
||||||
|
self.execute_batch_query(query, data, **params)
|
||||||
|
|
||||||
|
def create_study_to_studyevent_rel(self, studyoid: str, modelid: str) -> None:
|
||||||
|
query = """
|
||||||
|
MATCH (start:Study {OID: $studyoid, ModelID: $modelid})
|
||||||
|
WITH start, start.OID as oid, start.ModelID as mid
|
||||||
|
MATCH (end:StudyEvent)
|
||||||
|
WHERE end.StudyOID = oid AND end.ModelID = mid
|
||||||
|
WITH start, end
|
||||||
|
MERGE (start)-[:STUDY_HAS_STUDYEVENT]->(end)-[:BACKTRACK]->(start)
|
||||||
|
"""
|
||||||
|
self.execute_batch_query(query, data=[], studyoid=studyoid, modelid=modelid)
|
||||||
|
|
||||||
|
def create_study_to_studyevent_rel_old(self, studyoid: str, modelid):
|
||||||
conn = self.__connection
|
conn = self.__connection
|
||||||
try:
|
try:
|
||||||
conn.execute_query(
|
conn.execute_query(
|
||||||
"MERGE (:BasicDefinitions {StudyOID: $studyoid, ModelID: $modelid})", studyoid=studyoid, modelid=modelid)
|
"MATCH (start:Study {OID: $studyoid, ModelID: $modelid}),"
|
||||||
logger.debug(f"Inserted BasicDefinitions node for study {studyoid}")
|
" (end:StudyEvent {StudyOID: $studyoid, ModelID: $modelid})"
|
||||||
except Exception as e:
|
"MERGE (start)-[:STUDY_HAS_STUDYEVENT]->(end)-[:BACKTRACK]->(start)",
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_rangecheck_node(self, data: DataFrame):
|
|
||||||
conn = self.__connection
|
|
||||||
try:
|
|
||||||
for index, row in data.iterrows():
|
|
||||||
conn.execute_query(
|
|
||||||
"MERGE (:RangeCheck {Comparator: $comp, Constraint: $cons, CheckValue: $val})",
|
|
||||||
comp=row['comparator'], cons=row['constraint'], val=row['value'])
|
|
||||||
logger.debug(f"Inserted RangeCheck node")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_codelist_node(self, data: DataFrame, studyoid, modelid):
|
|
||||||
conn = self.__connection
|
|
||||||
try:
|
|
||||||
for index, row in data.iterrows():
|
|
||||||
conn.execute_query(
|
|
||||||
"MERGE (:CodeList {OID: $oid, Name: $name, DataType: $type, StudyOID: $studyoid, ModelID: $modelid})",
|
|
||||||
oid=row['OID'], name=row['Name'], type=row['DataType'], studyoid=studyoid, modelid=modelid)
|
|
||||||
logger.debug(f"Inserted CodeList node")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_clitem_node(self, data: DataFrame):
|
|
||||||
conn = self.__connection
|
|
||||||
try:
|
|
||||||
for index, row in data.iterrows():
|
|
||||||
conn.execute_query(
|
|
||||||
"MERGE (:CodeListItem {CodedValue: $cv, Decode: $dc})",
|
|
||||||
cv=row['codedvalue'], dc=row['decode'])
|
|
||||||
logger.debug(f"Inserted CodeListItem node")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_study_to_studyevent_rel(self, studyoid: str, modelid):
|
|
||||||
conn = self.__connection
|
|
||||||
try:
|
|
||||||
conn.execute_query(
|
|
||||||
"MATCH (start:Study {OID: $studyoid, ModelID: $modelid}), (end:StudyEvent {StudyOID: $studyoid, ModelID: $modelid}) MERGE (start)-[:STUDY_HAS_STUDYEVENT]->(end)-[:BACKTRACK]->(start)",
|
|
||||||
studyoid=studyoid, modelid=modelid
|
studyoid=studyoid, modelid=modelid
|
||||||
)
|
)
|
||||||
logger.debug(f"Inserted rels betw Study and StudyEvent")
|
logger.debug(f"Inserted rels betw Study and StudyEvent")
|
||||||
@@ -210,175 +185,122 @@ class Neo4jConnection:
|
|||||||
logger.error(f"Failed to populate Neo4j database: {e}")
|
logger.error(f"Failed to populate Neo4j database: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def create_itemgroup_to_item_rel(self, data: DataFrame, studyoid: str, modelid):
|
def create_itemgroup_to_item_rel(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:ItemGroup {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:Item {OID: row.end, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
MERGE (start)-[:ITEMGROUP_HAS_ITEM]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:ItemGroup {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:Item {OID: $endoid, StudyOID: $studyoid, ModelID: $modelid}) MERGE (start)-[:ITEMGROUP_HAS_ITEM]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], endoid=row['end'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw ItemGroup {row['start']} and Item {row['end']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_form_to_itemgroup_rel(self, data: DataFrame, studyoid: str, modelid):
|
def create_form_to_itemgroup_rel(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:Form {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:ItemGroup {OID: row.end, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
MERGE (start)-[:FORM_HAS_ITEMGROUP]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:Form {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:ItemGroup {OID: $endoid, StudyOID: $studyoid, ModelID: $modelid}) MERGE (start)-[:FORM_HAS_ITEMGROUP]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], endoid=row['end'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw Form {row['start']} and ItemGroup {row['end']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_studyevent_to_form_rel(self, data: DataFrame, studyoid: str, modelid):
|
def create_studyevent_to_form_rel(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:StudyEvent {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:Form {OID: row.end, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
MERGE (start)-[:STUDYEVENT_HAS_FORM]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:StudyEvent {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:Form {OID: $endoid, StudyOID: $studyoid, ModelID: $modelid}) MERGE (start)-[:STUDYEVENT_HAS_FORM]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], endoid=row['end'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw StudyEvent {row['start']} and Form {row['end']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_item_to_alias_rel(self, data: DataFrame, studyoid: str, modelid):
|
def create_item_to_alias_rel(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:Item {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:Alias {Context: row.end, Name: row.end_two, ItemOID: row.start, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
MERGE (start)-[:ITEM_HAS_ALIAS]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:Item {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:Alias {Context: $endcontext, Name: $endname, ItemOID: $startoid, StudyOID: $studyoid, ModelID: $modelid}) MERGE (start)-[:ITEM_HAS_ALIAS]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], endcontext=row['end'], endname=row['end_two'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw Item {row['start']} and Alias {row['end']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_itemgroup_to_alias_rel(self, data: DataFrame, studyoid: str, modelid):
|
def create_itemgroup_to_alias_rel(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:ItemGroup {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:Alias {Context: row.end, Name: row.end_two})
|
||||||
conn.execute_query(
|
MERGE (start)-[:ITEMGROUP_HAS_ALIAS]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:ItemGroup {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:Alias {Context: $endcontext, Name: $endname}) MERGE (start)-[:ITEMGROUP_HAS_ALIAS]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], endcontext=row['end'], endname=row['end_two'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw ItemGroup {row['start']} and Alias {row['end']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_item_to_measurement_rel(self, data: DataFrame, studyoid: str, modelid):
|
def create_item_to_measurement_rel(self, data: List[Dict], studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:Item {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:MeasurementUnit {OID: row.endoid, Name: row.endname})
|
||||||
conn.execute_query(
|
MERGE (start)-[:ITEM_HAS_MEASUREMENTUNIT]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:Item {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:MeasurementUnit {OID: $endoid, Name: $endname}) MERGE (start)-[:ITEM_HAS_MEASUREMENTUNIT]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], endoid=row['endoid'], endname=row['endname'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw Item {row['start']} and MeasurementUnit {row['endname']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_basedef_to_study_rel(self, studyoid: str, modelid):
|
def create_basedef_to_study_rel(self, studyoid: str, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
MATCH (start:Study {OID: $studyoid, ModelID: $modelid}), (end:BasicDefinitions {StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
MERGE (start)-[:HAS_BASEDEF]->(end)
|
||||||
"MATCH (start:Study {OID: $studyoid, ModelID: $modelid}), (end:BasicDefinitions {StudyOID: $studyoid, ModelID: $modelid}) MERGE (start)-[:HAS_BASEDEF]->(end)",
|
"""
|
||||||
studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data=[], studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw Study {studyoid} and BasicDefinitions")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_basedef_to_measurement_rel(self, data: DataFrame,
|
def create_basedef_to_measurement_rel(self, data: List[Dict], studyoid: str, modelid):
|
||||||
studyoid: str, modelid):
|
query = """
|
||||||
conn = self.__connection
|
UNWIND $data AS row
|
||||||
try:
|
MATCH (start:BasicDefinitions {StudyOID: $studyoid, ModelID: $modelid}), (end:MeasurementUnit {OID: row.oid, Name: row.name, Symbol: row.symbol})
|
||||||
for index, row in data.iterrows():
|
MERGE (start)-[:BASEDEF_HAS_MEASUREMENTUNIT]->(end)
|
||||||
conn.execute_query(
|
"""
|
||||||
"MATCH (start:BasicDefinitions {StudyOID: $studyoid, ModelID: $modelid}), (end:MeasurementUnit {OID: $endoid, Name: $endname, Symbol: $endsymbol}) MERGE (start)-[:BASEDEF_HAS_MEASUREMENTUNIT]->(end)",
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
endoid=row['oid'], endname=row['name'], endsymbol=row['symbol'], studyoid=studyoid, modelid=modelid)
|
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw BasicDefinitions and MeasurementUnit {row['name']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_item_to_rangecheck_rel(self, data: DataFrame): #TODO check for Item properties (missing StudyOID?)
|
def create_item_to_rangecheck_rel(self, data: List[Dict]): # TODO check for Item properties (missing StudyOID?)
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:Item {OID: row.start}), (end:RangeCheck {Comparator: row.endcomparator, Constraint: row.endconstraint, CheckValue: row.endvalue})
|
||||||
conn.execute_query(
|
MERGE (start)-[:ITEM_HAS_RANGECHECK]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:Item {OID: $itemoid}), (end:RangeCheck {Comparator: $comp, Constraint: $cons, CheckValue: $cvalue}) MERGE (start)-[:ITEM_HAS_RANGECHECK]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
itemoid=row['start'], comp=row['endcomparator'], cons=row['endconstraint'], cvalue=row['endvalue'])
|
self.execute_batch_query(query, data)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw Item and RangeCheck")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_item_to_codelist_rel(self, data: DataFrame, studyoid, modelid):
|
def create_item_to_codelist_rel(self, data: List[Dict], studyoid, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:Item {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:CodeList {OID: row.end, StudyOID: $studyoid, ModelID: $modelid})
|
||||||
conn.execute_query(
|
MERGE (start)-[:ITEM_HAS_CODELIST]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:Item {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:CodeList {OID: $endoid, StudyOID: $studyoid, ModelID: $modelid}) MERGE (start)-[:ITEM_HAS_CODELIST]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], endoid=row['end'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw Item {row['start']} and CodeList {row['end']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def create_codelist_to_codelistitem_rel(self, data: DataFrame, studyoid, modelid):
|
def create_codelist_to_codelistitem_rel(self, data: List[Dict], studyoid, modelid):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
for index, row in data.iterrows():
|
MATCH (start:CodeList {OID: row.start, StudyOID: $studyoid, ModelID: $modelid}), (end:CodeListItem {CodedValue: row.endcodedvalue, Decode: row.enddecode})
|
||||||
conn.execute_query(
|
MERGE (start)-[:CODELIST_HAS_CODELISTITEM]->(end)-[:BACKTRACK]->(start)
|
||||||
"MATCH (start:CodeList {OID: $startoid, StudyOID: $studyoid, ModelID: $modelid}), (end:CodeListItem {CodedValue: $cv, Decode: $dc}) MERGE (start)-[:CODELIST_HAS_CODELISTITEM]->(end)-[:BACKTRACK]->(start)",
|
"""
|
||||||
startoid=row['start'], cv=row['endcodedvalue'], dc=row['enddecode'], studyoid=studyoid, modelid=modelid)
|
self.execute_batch_query(query, data, studyoid=studyoid, modelid=modelid)
|
||||||
logger.debug(
|
|
||||||
f"Inserted rels betw CodeList {row['start']} and CodeListItem {row['endcodedvalue']}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
# postprocessing with cypher queries
|
# postprocessing with cypher queries
|
||||||
def postprocess_set_label_post(self):
|
def postprocess_set_label_post(self):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
MATCH (n:Alias)
|
||||||
conn.execute_query(
|
WHERE n.Context CONTAINS '[' AND n.Context CONTAINS ',' AND n.Context CONTAINS ']'
|
||||||
"MATCH (n:Alias) WHERE n.Context contains '[' and n.Context contains ',' and n.Context contains ']' set n :Post"
|
SET n:Post
|
||||||
)
|
"""
|
||||||
logger.debug(f"Inserted post property in alias nodes")
|
self.execute_batch_query(query, data=[])
|
||||||
except Exception as e:
|
logger.debug(f"Inserted post property in alias nodes")
|
||||||
logger.error(f"Failed to execute query: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def postprocess_connect_postcoord_aliases(self, tuples):
|
def postprocess_connect_postcoord_aliases(self, data):
|
||||||
conn = self.__connection
|
query = """
|
||||||
try:
|
UNWIND $data AS row
|
||||||
conn.execute_query(
|
MATCH (i:Item {OID: row.item_oid})
|
||||||
"""
|
WITH row, i
|
||||||
UNWIND $data AS row
|
MATCH (a1:Post {StudyOID: row.study_oid, Context: row.ontology + '[' + row.x + ',' + 1 + ']'})--(i)
|
||||||
MATCH (i:Item {OID: row.item_oid})
|
MATCH (a2:Post {StudyOID: row.study_oid, Context: row.ontology + '[' + row.x + ',' + row.y + ']'})--(i)
|
||||||
WITH row, i
|
WHERE a1 <> a2
|
||||||
MATCH (a1:Post {StudyOID: row.study_oid, Context: row.ontology + '[' + row.x + ',' + 1 + ']'})--(i)
|
MERGE (a1)-[rel:COMPOSITE]->(a2)
|
||||||
MATCH (a2:Post {StudyOID: row.study_oid, Context: row.ontology + '[' + row.x + ',' + row.y + ']'})--(i)
|
"""
|
||||||
WHERE a1 <> a2
|
self.execute_batch_query(query, data)
|
||||||
MERGE (a1)-[rel:COMPOSITE]->(a2)
|
logger.debug(f"Inserted rels btw post-coordinated aliases")
|
||||||
""", data=tuples)
|
|
||||||
logger.debug(f"Inserted rels btw post-coordinated aliases")
|
def postprocess_add_nctid_to_study_nodes(self):
|
||||||
except Exception as e:
|
query = """
|
||||||
logger.error(f"Failed to execute query: {e}")
|
MATCH (n:Study) WHERE NOT n.OID =~ '.*(NCT\\d{8}).*'
|
||||||
raise
|
WITH n, apoc.text.regexGroups(n.Name, 'NCT\\d{8}') AS matches
|
||||||
|
WHERE size(matches) > 0
|
||||||
|
SET n.NCT_ID = matches[0][0]
|
||||||
|
"""
|
||||||
|
self.execute_batch_query(query, data=[])
|
||||||
|
logger.debug(f"Inserted NCT identifier to Study nodes")
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import atexit
|
|||||||
import signal
|
import signal
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
MDM2NEO4J_VERSION: str = "0.1"
|
MDM2NEO4J_VERSION: str = "0.2"
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -17,6 +17,7 @@ parser = argparse.ArgumentParser()
|
|||||||
parser.add_argument('-c', '--conf', required=True, type=str,
|
parser.add_argument('-c', '--conf', required=True, type=str,
|
||||||
help='Configuration file with database connection parameters')
|
help='Configuration file with database connection parameters')
|
||||||
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with xml files')
|
parser.add_argument('-f', '--files', required=True, type=str, help='Directory with xml files')
|
||||||
|
parser.add_argument('-b', '--batch-size', required=False, type=int, default=500, help='Number of files per batch')
|
||||||
|
|
||||||
# parse parameters
|
# parse parameters
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|||||||
@@ -517,104 +517,215 @@ def dict_to_list_of_tuples(mydictionary):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def parse_xml(
|
def parse_xml(path_to_datafiles: str, batch_size: int = 500): # creates XmlProcessor object for each xml file
|
||||||
path_to_datafiles): # creates XmlProcessor object for each xml file
|
|
||||||
neo4jconnection_object = db.Neo4jConnection()
|
neo4jconnection_object = db.Neo4jConnection()
|
||||||
for file in os.listdir(path_to_datafiles):
|
|
||||||
if file.endswith(".xml"):
|
xml_files = [file for file in os.listdir(path_to_datafiles) if file.endswith(".xml")]
|
||||||
|
total_files = len(xml_files)
|
||||||
|
|
||||||
|
logger.info(F"Found {total_files} xml files to process")
|
||||||
|
|
||||||
|
for batch_start in range(0, total_files, batch_size):
|
||||||
|
batch_end = min(batch_start + batch_size, total_files)
|
||||||
|
batch_files = xml_files[batch_start:batch_end]
|
||||||
|
|
||||||
|
logger.info(f"Processing batch {batch_start // batch_end + 1}: " f"{batch_start + 1}-{batch_end} of {total_files} files")
|
||||||
|
|
||||||
|
processed_files = 0
|
||||||
|
failed_files = 0
|
||||||
|
|
||||||
|
for file in batch_files:
|
||||||
filename = os.path.splitext(file)[0]
|
filename = os.path.splitext(file)[0]
|
||||||
filepath = os.path.join(path_to_datafiles, file)
|
filepath = os.path.join(path_to_datafiles, file)
|
||||||
with open(filepath) as fp:
|
|
||||||
parsed_xml = BeautifulSoup(fp, "xml")
|
try:
|
||||||
|
logger.info(f"Processing file: {filename}.xml")
|
||||||
|
|
||||||
|
with open(filepath) as fp:
|
||||||
|
parsed_xml = BeautifulSoup(fp, "xml")
|
||||||
|
|
||||||
# create XmlProcessor object for parsed_xml files
|
# create XmlProcessor object for parsed_xml files
|
||||||
xmlprocessor_object = XmlProcessor(parsed_xml)
|
xmlprocessor_object = XmlProcessor(parsed_xml)
|
||||||
# do stuff with the object
|
# do stuff with the object
|
||||||
study_oid = xmlprocessor_object.get_studyoid()
|
study_oid = xmlprocessor_object.get_studyoid()
|
||||||
|
|
||||||
|
# extract data in batches
|
||||||
|
study_data = xmlprocessor_object.extract_study_data()
|
||||||
|
itemgroup_data = xmlprocessor_object.extract_itemgroup_data()
|
||||||
|
item_data = xmlprocessor_object.extract_item_data()
|
||||||
|
studyevent_data = xmlprocessor_object.extract_studyevent_data()
|
||||||
|
form_data = xmlprocessor_object.extract_form_data()
|
||||||
|
alias_data = xmlprocessor_object.extract_item_to_alias_data()
|
||||||
|
measurement_data = xmlprocessor_object.extract_measurement_data()
|
||||||
|
rangecheck_data = xmlprocessor_object.extract_rangecheck_data()
|
||||||
|
codelist_data = xmlprocessor_object.extract_codelist_data()
|
||||||
|
codelistitem_data = xmlprocessor_object.extract_codelistitem_data()
|
||||||
|
|
||||||
|
# create nodes in batches
|
||||||
|
if not study_data.empty:
|
||||||
|
neo4jconnection_object.create_study_node(study_data.to_dict('records'), filename)
|
||||||
|
|
||||||
|
if not itemgroup_data.empty:
|
||||||
|
neo4jconnection_object.create_itemgroup_node(itemgroup_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not item_data.empty:
|
||||||
|
neo4jconnection_object.create_item_node(item_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not studyevent_data.empty:
|
||||||
|
neo4jconnection_object.create_studyevent_node(studyevent_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not form_data.empty:
|
||||||
|
neo4jconnection_object.create_form_node(form_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not alias_data.empty:
|
||||||
|
neo4jconnection_object.create_alias_node(alias_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not measurement_data.empty:
|
||||||
|
neo4jconnection_object.create_measurementunit_node(measurement_data.to_dict('records'))
|
||||||
|
|
||||||
|
if not rangecheck_data.empty:
|
||||||
|
neo4jconnection_object.create_rangecheck_node(rangecheck_data.to_dict('records'))
|
||||||
|
|
||||||
|
if not codelist_data.empty:
|
||||||
|
neo4jconnection_object.create_codelist_node(codelist_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not codelistitem_data.empty:
|
||||||
|
neo4jconnection_object.create_clitem_node(codelistitem_data.to_dict('records'))
|
||||||
|
|
||||||
|
if parsed_xml.findAll('BasicDefinitions'):
|
||||||
|
neo4jconnection_object.create_basedef_node(study_oid, filename)
|
||||||
|
|
||||||
|
logger.info(f"Inserted all nodes for file {filename}.xml")
|
||||||
|
|
||||||
|
# Create relationships in batches
|
||||||
|
|
||||||
|
### previous version ###
|
||||||
|
#for file in os.listdir(path_to_datafiles):
|
||||||
|
# if file.endswith(".xml"):
|
||||||
|
# filename = os.path.splitext(file)[0]
|
||||||
|
# filepath = os.path.join(path_to_datafiles, file)
|
||||||
|
# with open(filepath) as fp:
|
||||||
|
# parsed_xml = BeautifulSoup(fp, "xml")
|
||||||
|
# create XmlProcessor object for parsed_xml files
|
||||||
|
# xmlprocessor_object = XmlProcessor(parsed_xml)
|
||||||
|
# do stuff with the object
|
||||||
|
# study_oid = xmlprocessor_object.get_studyoid()
|
||||||
|
|
||||||
# create nodes
|
# create nodes
|
||||||
try:
|
# try:
|
||||||
study_data = xmlprocessor_object.extract_study_data()
|
# study_data = xmlprocessor_object.extract_study_data()
|
||||||
neo4jconnection_object.create_study_node(study_data, filename)
|
# neo4jconnection_object.create_study_node(study_data, filename)
|
||||||
|
|
||||||
itemgroup_data = xmlprocessor_object.extract_itemgroup_data()
|
# itemgroup_data = xmlprocessor_object.extract_itemgroup_data()
|
||||||
neo4jconnection_object.create_itemgroup_node(itemgroup_data, study_oid, filename)
|
# neo4jconnection_object.create_itemgroup_node(itemgroup_data, study_oid, filename)
|
||||||
|
|
||||||
item_data = xmlprocessor_object.extract_item_data()
|
# item_data = xmlprocessor_object.extract_item_data()
|
||||||
neo4jconnection_object.create_item_node(item_data, study_oid, filename)
|
# neo4jconnection_object.create_item_node(item_data, study_oid, filename)
|
||||||
|
|
||||||
studyevent_data = xmlprocessor_object.extract_studyevent_data()
|
# studyevent_data = xmlprocessor_object.extract_studyevent_data()
|
||||||
neo4jconnection_object.create_studyevent_node(studyevent_data, study_oid, filename)
|
# neo4jconnection_object.create_studyevent_node(studyevent_data, study_oid, filename)
|
||||||
|
|
||||||
form_data = xmlprocessor_object.extract_form_data()
|
# form_data = xmlprocessor_object.extract_form_data()
|
||||||
neo4jconnection_object.create_form_node(form_data, study_oid, filename)
|
# neo4jconnection_object.create_form_node(form_data, study_oid, filename)
|
||||||
|
|
||||||
alias_data = xmlprocessor_object.extract_item_to_alias_data()
|
# alias_data = xmlprocessor_object.extract_item_to_alias_data()
|
||||||
neo4jconnection_object.create_alias_node(alias_data, study_oid, filename)
|
# neo4jconnection_object.create_alias_node(alias_data, study_oid, filename)
|
||||||
|
|
||||||
measurement_data = xmlprocessor_object.extract_measurement_data()
|
# measurement_data = xmlprocessor_object.extract_measurement_data()
|
||||||
neo4jconnection_object.create_measurementunit_node(measurement_data)
|
# neo4jconnection_object.create_measurementunit_node(measurement_data)
|
||||||
|
|
||||||
rangecheck_data = xmlprocessor_object.extract_rangecheck_data()
|
# rangecheck_data = xmlprocessor_object.extract_rangecheck_data()
|
||||||
neo4jconnection_object.create_rangecheck_node(rangecheck_data)
|
# neo4jconnection_object.create_rangecheck_node(rangecheck_data)
|
||||||
|
|
||||||
codelist_data = xmlprocessor_object.extract_codelist_data()
|
# codelist_data = xmlprocessor_object.extract_codelist_data()
|
||||||
neo4jconnection_object.create_codelist_node(codelist_data, study_oid, filename)
|
# neo4jconnection_object.create_codelist_node(codelist_data, study_oid, filename)
|
||||||
|
|
||||||
codelistitem_data = xmlprocessor_object.extract_codelistitem_data()
|
# codelistitem_data = xmlprocessor_object.extract_codelistitem_data()
|
||||||
neo4jconnection_object.create_clitem_node(codelistitem_data)
|
# neo4jconnection_object.create_clitem_node(codelistitem_data)
|
||||||
|
|
||||||
if parsed_xml.findAll('BasicDefinitions'):
|
# if parsed_xml.findAll('BasicDefinitions'):
|
||||||
neo4jconnection_object.create_basedef_node(study_oid, filename)
|
# neo4jconnection_object.create_basedef_node(study_oid, filename)
|
||||||
|
|
||||||
logger.info(f"Inserted all nodes for file {filename}.xml")
|
# logger.info(f"Inserted all nodes for file {filename}.xml")
|
||||||
|
|
||||||
except:
|
# except:
|
||||||
print(f"Nodes for file {filename}.xml could not be inserted")
|
# print(f"Nodes for file {filename}.xml could not be inserted")
|
||||||
|
|
||||||
# create rels
|
# create rels
|
||||||
try:
|
|
||||||
itemgroup_to_item_data = xmlprocessor_object.extract_itemgroup_to_item_data
|
|
||||||
neo4jconnection_object.create_itemgroup_to_item_rel(itemgroup_to_item_data, study_oid, filename)
|
|
||||||
|
|
||||||
item_to_alias_data = xmlprocessor_object.extract_item_to_alias_data()
|
# Create relationships in batches
|
||||||
neo4jconnection_object.create_item_to_alias_rel(item_to_alias_data, study_oid, filename)
|
logger.info(f"Creating relationships for {filename}.xml...")
|
||||||
|
|
||||||
itemgroup_to_alias_data = xmlprocessor_object.extract_itemgroup_to_alias_data()
|
# Get data for rels
|
||||||
neo4jconnection_object.create_itemgroup_to_alias_rel(itemgroup_to_alias_data, study_oid, filename)
|
itemgroup_to_item_data = xmlprocessor_object.extract_itemgroup_to_item_data
|
||||||
|
item_to_alias_data = xmlprocessor_object.extract_item_to_alias_data()
|
||||||
|
itemgroup_to_alias_data = xmlprocessor_object.extract_itemgroup_to_alias_data()
|
||||||
|
form_to_itemgroup_data = xmlprocessor_object.extract_form_to_itemgroup_data()
|
||||||
|
studyevent_to_form_data = xmlprocessor_object.extract_studyevent_to_form_data()
|
||||||
|
item_to_measurement_data = xmlprocessor_object.extract_item_to_measurement_data()
|
||||||
|
item_to_rangecheck_data = xmlprocessor_object.extract_item_to_rangecheck_data()
|
||||||
|
item_to_cl_data = xmlprocessor_object.extract_item_to_codelist_data()
|
||||||
|
cl_to_clitem_data = xmlprocessor_object.extract_codelist_to_codelistitem_data()
|
||||||
|
|
||||||
form_to_itemgroup_data = xmlprocessor_object.extract_form_to_itemgroup_data()
|
# Batch creation of relationships
|
||||||
neo4jconnection_object.create_form_to_itemgroup_rel(form_to_itemgroup_data, study_oid, filename)
|
|
||||||
|
|
||||||
studyevent_to_form_data = xmlprocessor_object.extract_studyevent_to_form_data()
|
if not itemgroup_to_item_data.empty:
|
||||||
neo4jconnection_object.create_studyevent_to_form_rel(studyevent_to_form_data, study_oid, filename)
|
neo4jconnection_object.create_itemgroup_to_item_rel(itemgroup_to_item_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not item_to_alias_data.empty:
|
||||||
|
neo4jconnection_object.create_item_to_alias_rel(item_to_alias_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not itemgroup_to_alias_data.empty:
|
||||||
|
neo4jconnection_object.create_itemgroup_to_alias_rel(itemgroup_to_alias_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not form_to_itemgroup_data.empty:
|
||||||
|
neo4jconnection_object.create_form_to_itemgroup_rel(form_to_itemgroup_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if not studyevent_to_form_data.empty:
|
||||||
|
neo4jconnection_object.create_studyevent_to_form_rel(studyevent_to_form_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
|
if study_oid:
|
||||||
neo4jconnection_object.create_study_to_studyevent_rel(study_oid, filename)
|
neo4jconnection_object.create_study_to_studyevent_rel(study_oid, filename)
|
||||||
|
|
||||||
item_to_measurement_data = xmlprocessor_object.extract_item_to_measurement_data()
|
if not item_to_measurement_data.empty:
|
||||||
neo4jconnection_object.create_item_to_measurement_rel(item_to_measurement_data, study_oid, filename)
|
neo4jconnection_object.create_item_to_measurement_rel(item_to_measurement_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
item_to_rangecheck_data = xmlprocessor_object.extract_item_to_rangecheck_data()
|
if not item_to_rangecheck_data.empty:
|
||||||
neo4jconnection_object.create_item_to_rangecheck_rel(item_to_rangecheck_data)
|
neo4jconnection_object.create_item_to_rangecheck_rel(item_to_rangecheck_data.to_dict('records'))
|
||||||
|
|
||||||
item_to_cl_data = xmlprocessor_object.extract_item_to_codelist_data()
|
if not item_to_cl_data.empty:
|
||||||
neo4jconnection_object.create_item_to_codelist_rel(item_to_cl_data, study_oid, filename)
|
neo4jconnection_object.create_item_to_codelist_rel(item_to_cl_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
cl_to_clitem_data = xmlprocessor_object.extract_codelist_to_codelistitem_data()
|
if not cl_to_clitem_data.empty:
|
||||||
neo4jconnection_object.create_codelist_to_codelistitem_rel(cl_to_clitem_data, study_oid, filename)
|
neo4jconnection_object.create_codelist_to_codelistitem_rel(cl_to_clitem_data.to_dict('records'), study_oid, filename)
|
||||||
|
|
||||||
# rels to and from BasicDefinitions node
|
# rels to and from BasicDefinitions node
|
||||||
neo4jconnection_object.create_basedef_to_study_rel(study_oid, filename)
|
neo4jconnection_object.create_basedef_to_study_rel(study_oid, filename)
|
||||||
neo4jconnection_object.create_basedef_to_measurement_rel(measurement_data, study_oid, filename)
|
if not measurement_data.empty:
|
||||||
|
neo4jconnection_object.create_basedef_to_measurement_rel(measurement_data.to_dict('records'), study_oid, filename)
|
||||||
except:
|
|
||||||
print(f"Relationships for file {filename}.xml could not be inserted")
|
|
||||||
|
|
||||||
logger.info(f"Inserted all rels for file {filename}.xml")
|
logger.info(f"Inserted all rels for file {filename}.xml")
|
||||||
|
|
||||||
# postprocess
|
# postprocessing
|
||||||
|
logger.info(f"Post-processing for file {filename}.xml")
|
||||||
neo4jconnection_object.postprocess_set_label_post()
|
neo4jconnection_object.postprocess_set_label_post()
|
||||||
dictionary = xmlprocessor_object.prepare_tuple_for_postcoord_rels()
|
dictionary = xmlprocessor_object.prepare_tuple_for_postcoord_rels()
|
||||||
datatuple = dict_to_list_of_tuples(dictionary)
|
datatuple = dict_to_list_of_tuples(dictionary)
|
||||||
neo4jconnection_object.postprocess_connect_postcoord_aliases(datatuple)
|
if datatuple:
|
||||||
|
neo4jconnection_object.postprocess_connect_postcoord_aliases(datatuple)
|
||||||
|
|
||||||
logger.info(f"Post-processing done for file {filename}.xml")
|
logger.info(f"Post-processing done for file {filename}.xml")
|
||||||
|
processed_files += 1
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing file {filename}.xml: {e}")
|
||||||
|
failed_files += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Log batch completion
|
||||||
|
logger.info(f"Batch completed: {processed_files} successful, {failed_files} failed")
|
||||||
|
|
||||||
|
# postprocess all Study nodes
|
||||||
|
logger.info(f"Final post-processing step")
|
||||||
|
neo4jconnection_object.postprocess_add_nctid_to_study_nodes()
|
||||||
|
logger.info(f"Batches completed")
|
||||||
|
|||||||
Reference in New Issue
Block a user