148 lines
4.8 KiB
Python
148 lines
4.8 KiB
Python
from ._logger import logger
|
|
|
|
logger.debug(f"Loading module {__name__}.")
|
|
|
|
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
|
|
|
|
class Deduplicator:
    """
    Keeps track of the BioCypher nodes and edges that have already been seen
    so that duplicates in the input can be detected and reported.

    Node identifiers are treated as globally unique and are stored in a
    single set. Edge identifiers are only unique per edge type and are stored
    in a dict of sets, keyed by edge type.

    Identifiers and types of detected duplicates are collected for
    troubleshooting. A warning is logged only the first time a given type
    yields a duplicate, to avoid overloading the log.

    NOTE(review): the original documentation calls this a singleton; nothing
    in this class enforces a single instance, so that is up to the caller.
    """

    def __init__(self):
        # ids of all nodes registered so far (shared across all node labels)
        self.seen_entity_ids = set()
        # ids of nodes that were registered more than once
        self.duplicate_entity_ids = set()

        # labels of all nodes registered so far
        self.entity_types = set()
        # labels for which at least one duplicate node was found
        self.duplicate_entity_types = set()

        # edge type -> set of edge ids registered for that type
        self.seen_relationships = {}
        # ids of edges that were registered more than once
        self.duplicate_relationship_ids = set()
        # edge types for which at least one duplicate edge was found
        self.duplicate_relationship_types = set()

    def node_seen(self, entity: BioCypherNode) -> bool:
        """
        Registers a node and checks whether its id has been seen before.

        Args:
            entity: BioCypherNode to be registered.

        Returns:
            True if the node id has been seen before, False otherwise.
        """
        label = entity.get_label()
        node_id = entity.get_id()

        # set.add is idempotent, no membership test needed
        self.entity_types.add(label)

        if node_id in self.seen_entity_ids:
            self.duplicate_entity_ids.add(node_id)
            # warn only once per node label
            if label not in self.duplicate_entity_types:
                logger.warning(f"Duplicate node type {label} found. ")
                self.duplicate_entity_types.add(label)
            return True

        self.seen_entity_ids.add(node_id)
        return False

    def _relationship_seen(self, rel_type, rel_id) -> bool:
        """
        Registers ``rel_id`` under ``rel_type`` and checks whether that
        combination has been seen before. Shared by edge_seen and
        rel_as_node_seen.

        Args:
            rel_type: key under which the id is tracked.

            rel_id: identifier to register for that key.

        Returns:
            True if the id was already registered for the type, False
            otherwise.
        """
        # the same key is used for creating, querying and updating the set
        seen_ids = self.seen_relationships.setdefault(rel_type, set())

        if rel_id in seen_ids:
            self.duplicate_relationship_ids.add(rel_id)
            # warn only once per edge type
            if rel_type not in self.duplicate_relationship_types:
                logger.warning(f"Duplicate edge type {rel_type} found. ")
                self.duplicate_relationship_types.add(rel_type)
            return True

        seen_ids.add(rel_id)
        return False

    def edge_seen(self, relationship: BioCypherEdge) -> bool:
        """
        Registers an edge and checks whether it has been seen before within
        its edge type.

        Args:
            relationship: BioCypherEdge to be registered.

        Returns:
            True if the edge has been seen before, False otherwise.
        """
        # concatenate source and target if no (truthy) id is present
        edge_id = relationship.get_id() or (
            f"{relationship.get_source_id()}_{relationship.get_target_id()}"
        )

        return self._relationship_seen(relationship.get_type(), edge_id)

    def rel_as_node_seen(self, rel_as_node: BioCypherRelAsNode) -> bool:
        """
        Registers a rel_as_node (one entity and two relationships) and checks
        whether it has been seen before. Only the node is relevant for
        identifying the rel_as_node as a duplicate; it is tracked among the
        relationships, keyed by the label of the node.

        Args:
            rel_as_node: BioCypherRelAsNode to be registered.

        Returns:
            True if the rel_as_node has been seen before, False otherwise.
        """
        node = rel_as_node.get_node()

        # The node label is used as the key throughout. Creating the set
        # under get_label() but looking it up under a different accessor
        # would fail as soon as the two disagree.
        # The id is used as is (no source/target fallback as in edge_seen).
        return self._relationship_seen(node.get_label(), node.get_id())

    def get_duplicate_nodes(self):
        """
        Function to return the duplicate nodes collected so far.

        Returns:
            tuple: ``(duplicate_entity_types, duplicate_entity_ids)``, both
            sets, if at least one duplicate node has been recorded; None
            otherwise.
        """
        if self.duplicate_entity_types:
            return (self.duplicate_entity_types, self.duplicate_entity_ids)

        return None

    def get_duplicate_edges(self):
        """
        Function to return the duplicate edges collected so far.

        Returns:
            tuple: ``(duplicate_relationship_types,
            duplicate_relationship_ids)``, both sets, if at least one
            duplicate edge has been recorded; None otherwise.
        """
        if self.duplicate_relationship_types:
            return (
                self.duplicate_relationship_types,
                self.duplicate_relationship_ids,
            )

        return None
|