medax_pipeline/graphCreation/property_convolution.py
2025-04-16 22:12:19 +02:00

108 lines
4.6 KiB
Python

import networkx as nx
def find_paths(graph, start_node):
    """Collect the property paths that start at ``start_node``.

    A depth-first search follows ``graph.successors`` and records two kinds
    of paths (each path is a list of nodes beginning with ``start_node``):

    * reference paths: more than 3 nodes long, containing exactly one edge
      whose ``edge_type`` attribute is ``'reference'``, and ending on a node
      whose ``label`` attribute is ``'resource'``;
    * leaf paths: more than 2 nodes long, containing no reference edge, and
      ending on a node with no outgoing edges.

    Args:
        graph: directed graph exposing ``nodes[n]``, ``edges[u, v]``,
            ``successors(n)`` and ``out_degree(n)``.
        start_node: node at which every returned path begins.

    Returns:
        A tuple ``(reference_paths, leaf_paths)``. ``reference_paths`` is a
        list of node lists. ``leaf_paths`` maps the last dot-separated
        segment of a path's second node to one flat list holding the nodes
        of every leaf path under that key, concatenated in discovery order
        (duplicates are not removed here).

    Note:
        Node identifiers on position 1 of a leaf path must be strings, since
        the key is derived with ``str.split('.')``.
    """
    reference_paths = []
    leaf_paths = {}

    def walk(path, reference_count, segment_start):
        # ``segment_start`` is the index in ``path`` of the first node that
        # was reached with the current ``reference_count``.
        current_node = path[-1]

        # Pattern 1: long enough, exactly one reference edge, ends on a resource.
        if (
            len(path) > 3
            and reference_count == 1
            and graph.nodes[current_node].get('label') == 'resource'
        ):
            reference_paths.append(list(path))

        # Pattern 2: long enough, no reference edge, ends on a leaf node.
        if len(path) > 2 and reference_count == 0 and graph.out_degree(current_node) == 0:
            property_key = path[1].split('.')[-1]
            leaf_paths.setdefault(property_key, []).extend(path)

        for neighbor in graph.successors(current_node):
            edge_type = graph.edges[current_node, neighbor].get('edge_type')
            if edge_type == 'reference':
                # Only one reference edge is allowed per path.
                if reference_count == 0:
                    walk(path + [neighbor], 1, len(path))
            elif neighbor not in path[segment_start:]:
                # A non-reference edge back to a node already visited with the
                # same reference count closes a cycle that would otherwise
                # recurse forever; every other edge is followed as before.
                walk(path + [neighbor], reference_count, segment_start)

    walk([start_node], 0, 0)
    return reference_paths, leaf_paths
def property_convolution(graph):
    """Fold nested property nodes into each resource's first-level property nodes.

    The graph is modified in place in four steps:

    1. For every node labelled ``'resource'``, ``find_paths`` collects the
       paths that end on another resource through one reference edge and the
       reference-free paths that end on a leaf.
    2. The final edge of each reference path is removed and re-created from
       the path's first property node (position 1) to the same target, with
       ``edge_type='reference'`` and the original ``reference_type``. The
       shortened path is then added to the resource's leaf paths.
    3. For every collection of leaf paths, each node after position 1 has its
       attributes (except ``'label'``) copied onto the node at position 1
       under the key ``<marker>_<attr>``; those nodes are then removed.
    4. Each resource gets ``unique_id = resourceType + '/' + id`` and every
       successor not linked by a reference edge gets that id plus ``'/'`` and
       the last dot-separated segment of its own name.

    Args:
        graph: directed graph whose node names are strings; nested property
            node names must contain ``'resource.'`` (used to build the
            attribute marker).

    Returns:
        None. ``graph`` is mutated.
    """
    # Every node carrying the label 'resource'.
    resource_nodes = []
    for node, data in graph.nodes(data=True):
        if data.get('label') == 'resource':
            resource_nodes.append(node)

    # Step 1: gather both kinds of property paths for every resource.
    reference_paths = []
    leaf_paths_by_resource = {}
    for resource in resource_nodes:
        found_reference_paths, found_leaf_paths = find_paths(graph, resource)
        reference_paths += found_reference_paths
        # Leaf paths are kept per resource, then per property key.
        leaf_paths_by_resource[resource] = found_leaf_paths

    # Step 2: move each reference edge up to the first property node.
    for ref_path in reference_paths:
        owner, first_property = ref_path[0], ref_path[1]
        last_property, target = ref_path[-2], ref_path[-1]

        reference_type = graph.get_edge_data(last_property, target).get('reference_type')
        graph.remove_edge(last_property, target)
        graph.add_edge(
            first_property,
            target,
            edge_type='reference',
            reference_type=reference_type,
        )

        # Without its final node the path now ends in a leaf, so it joins the
        # leaf paths of the owning resource under its property key.
        property_key = first_property.split('.')[-1]
        leaf_paths_by_resource[owner].setdefault(property_key, []).extend(ref_path[:-1])

    # Step 3: one de-duplicated, order-preserving node list per property key.
    merged_paths = []
    for paths_by_property in leaf_paths_by_resource.values():
        for path_nodes in paths_by_property.values():
            merged_paths.append(list(dict.fromkeys(path_nodes)))

    nodes_to_remove = []
    for merged in merged_paths:
        # Walk from the deepest node back to position 2; positions 0 and 1
        # (the resource and its first property node) are kept.
        for nested in reversed(merged[2:]):
            nested_attributes = graph.nodes[nested]
            marker = '|'.join(nested.split('resource.')[1].split('.')[1:])
            prefix = marker + '_'
            for attr, value in nested_attributes.items():
                if attr == 'label':
                    continue
                graph.nodes[merged[1]][prefix + attr] = value
            nodes_to_remove.append(nested)

    graph.remove_nodes_from(nodes_to_remove)

    # Step 4: assign unique ids to resources and their remaining property nodes.
    for resource in resource_nodes:
        resource_attributes = graph.nodes[resource]
        unique_resource_id = resource_attributes['resourceType'] + '/' + resource_attributes['id']
        resource_attributes['unique_id'] = unique_resource_id
        for child in graph.successors(resource):
            if graph[resource][child].get('edge_type') == 'reference':
                continue
            graph.nodes[child]['unique_id'] = unique_resource_id + '/' + child.split('.')[-1]