import networkx as nx


def find_paths(graph: nx.DiGraph, start_node) -> tuple:
    """Collect the property paths that start at ``start_node``.

    A depth-first walk follows successor edges from ``start_node`` and never
    crosses more than one edge whose ``edge_type`` attribute is
    ``'reference'``. Two kinds of path are recorded along the way:

    * reference paths: longer than 3 nodes, containing exactly one reference
      edge, and currently standing on a node whose ``label`` is
      ``'resource'``;
    * leaf paths: longer than 2 nodes, containing no reference edge, and
      ending on a node without outgoing edges.

    Args:
        graph: Directed graph to walk. It is only read, never modified.
        start_node: Node the walk starts from.

    Returns:
        A ``(reference_paths, leaf_paths)`` tuple. ``reference_paths`` is a
        list of node lists. ``leaf_paths`` maps a property key (the last
        dot-separated segment of the path's second node) to one flat list in
        which the nodes of every matching leaf path are concatenated.

    Note:
        Visited nodes are not tracked, so a cycle consisting only of
        non-reference edges would recurse without bound. Node ids are
        assumed to be dotted strings (``str.split('.')`` is applied to them).
    """
    reference_paths = []
    leaf_paths = {}

    def is_leaf(node) -> bool:
        """Return True when ``node`` has no outgoing edges."""
        return graph.out_degree(node) == 0

    def custom_dfs(path, reference_count):
        """Extend ``path`` depth-first, recording paths of both patterns."""
        current_node = path[-1]

        # Pattern 1: the path reached a 'resource' node, is longer than 3
        # nodes and crossed exactly one 'reference' edge.
        if (
            len(path) > 3
            and graph.nodes[current_node].get('label') == 'resource'
            and reference_count == 1
        ):
            reference_paths.append(path)

        # Pattern 2: the path ends on a leaf, is longer than 2 nodes and
        # crossed no 'reference' edge. Paths are grouped by property key.
        if len(path) > 2 and is_leaf(current_node) and reference_count == 0:
            property_key = path[1].split('.')[-1]
            leaf_paths.setdefault(property_key, []).extend(path)

        for neighbor in graph.successors(current_node):
            edge_type = graph.edges[current_node, neighbor].get('edge_type')
            new_reference_count = reference_count + (1 if edge_type == 'reference' else 0)
            # Continue only while at most one 'reference' edge was crossed.
            if new_reference_count <= 1:
                # A fresh list per call keeps every recorded path independent.
                custom_dfs(path + [neighbor], new_reference_count)

    custom_dfs([start_node], 0)
    return reference_paths, leaf_paths


def property_convolution(graph: nx.DiGraph) -> None:
    """Fold nested property nodes into each resource's first-level properties.

    The graph is modified in place, in four steps:

    1. For every node labelled ``'resource'`` the reference paths and leaf
       paths are collected with ``find_paths``.
    2. For every reference path the final reference edge is moved so that it
       starts at the path's first property node (``path[1]``), keeping its
       ``reference_type``. The shortened path is then treated like a leaf
       path of that property.
    3. For every property, the attributes of all deeper nodes (path index
       2 and beyond) are copied onto the first property node under the key
       ``'<marker>_<attr>'``; ``label`` is not copied. The deeper nodes are
       removed afterwards.
    4. Every resource node gets ``unique_id = '<resourceType>/<id>'``, and
       each successor reached through a non-reference edge gets
       ``'<resource unique_id>/<last dotted segment of its node id>'``.

    Args:
        graph: Directed graph to transform. Node ids are assumed to be
            dotted strings that contain ``'resource.'``; resource nodes must
            carry string ``resourceType`` and ``id`` attributes.
    """
    resource_nodes = [
        node for node, attrs in graph.nodes(data=True)
        if attrs.get('label') == 'resource'
    ]

    # Step 1: collect all paths before the graph is mutated.
    reference_paths = []
    leaf_paths_by_resource = {}
    for resource_node in resource_nodes:
        found_reference_paths, found_leaf_paths = find_paths(graph, resource_node)
        reference_paths.extend(found_reference_paths)
        leaf_paths_by_resource[resource_node] = found_leaf_paths

    # Step 2: move each reference edge up to the first property node.
    # Several paths may end on the same edge; once it has been moved it is no
    # longer in the graph, so its reference_type is remembered here.
    moved_reference_types = {}
    for path in reference_paths:
        resource_node, first_property, target = path[0], path[1], path[-1]
        old_edge = (path[-2], target)
        edge_data = graph.get_edge_data(*old_edge)
        if edge_data is not None:
            reference_type = edge_data.get('reference_type')
            graph.remove_edge(*old_edge)
            moved_reference_types[old_edge] = reference_type
        else:
            reference_type = moved_reference_types[old_edge]
        graph.add_edge(
            first_property, target,
            edge_type='reference', reference_type=reference_type,
        )
        # Without its target the path now ends where the reference used to
        # start, so it is folded like any other leaf path of this property.
        property_key = first_property.split('.')[-1]
        leaf_paths_by_resource[resource_node].setdefault(property_key, []).extend(path[:-1])

    # One de-duplicated, order-preserving node list per (resource, property).
    merged_paths = [
        list(dict.fromkeys(nodes))
        for leaf_paths in leaf_paths_by_resource.values()
        for nodes in leaf_paths.values()
    ]

    # Step 3: copy attributes of nested nodes onto the first property node.
    nodes_to_remove = []
    for nodes in merged_paths:
        # Index 0 is the resource and index 1 the first property node; every
        # node after that is folded, deepest-listed first.
        for nested_node in reversed(nodes[2:]):
            nested_attrs = graph.nodes[nested_node]
            # Marker: the dotted segments after 'resource.', minus the first
            # one, joined with '|'.
            marker = '|'.join(nested_node.split('resource.')[1].split('.')[1:])
            first_property_attrs = graph.nodes[nodes[1]]
            for attr, value in nested_attrs.items():
                if attr != 'label':
                    first_property_attrs[marker + '_' + attr] = value
            nodes_to_remove.append(nested_node)
    graph.remove_nodes_from(nodes_to_remove)

    # Step 4: assign unique ids to resources and their remaining properties.
    for resource_node in resource_nodes:
        resource_attrs = graph.nodes[resource_node]
        unique_resource_id = resource_attrs['resourceType'] + '/' + resource_attrs['id']
        resource_attrs['unique_id'] = unique_resource_id
        for successor in graph.successors(resource_node):
            if graph[resource_node][successor].get('edge_type') != 'reference':
                graph.nodes[successor]['unique_id'] = (
                    unique_resource_id + '/' + successor.split('.')[-1]
                )