import os import glob import pandas as pd from biocypher._logger import logger from biocypher.output.write._batch_writer import parse_label, _BatchWriter class _Neo4jBatchWriter(_BatchWriter): """ Class for writing node and edge representations to disk using the format specified by Neo4j for the use of admin import. Each batch writer instance has a fixed representation that needs to be passed at instantiation via the :py:attr:`schema` argument. The instance also expects an ontology adapter via :py:attr:`ontology_adapter` to be able to convert and extend the hierarchy. This class inherits from the abstract class "_BatchWriter" and implements the Neo4j-specific methods: - _write_node_headers - _write_edge_headers - _construct_import_call - _write_array_string """ def __init__(self, *args, **kwargs): """ Constructor. Check the version of Neo4j and adds a command scope if version >= 5. Returns: _Neo4jBatchWriter: An instance of the writer. """ # Should read the configuration and setup import_call_bin_prefix. super().__init__(*args, **kwargs) def _get_default_import_call_bin_prefix(self): """ Method to provide the default string for the import call bin prefix. Returns: str: The default location for the neo4j admin import location """ return "bin/" def _write_array_string(self, string_list): """ Abstract method to output.write the string representation of an array into a .csv file as required by the neo4j admin-import. Args: string_list (list): list of ontology strings Returns: str: The string representation of an array for the neo4j admin import """ string = self.adelim.join(string_list) return f"{self.quote}{string}{self.quote}" def _write_node_headers(self): """ Writes single CSV file for a graph entity that is represented as a node as per the definition in the `schema_config.yaml`, containing only the header for this type of node. Returns: bool: The return value. True for success, False otherwise. """ # load headers from data parse if not self.node_property_dict: logger.error( "Header information not found. Was the data parsed first?", ) return False for label, props in self.node_property_dict.items(): _id = ":ID" ##MeDaX dev remark: ##From Fhir data we get case sensitive labels. E.g. 'Procedure' and 'procedure' are two distinct node types. ##Because we are converting Resources to more specific node classes using their "resourceType" attribute. # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal( parse_label(label) ) header = f"{pascal_label}-header.csv" header_path = os.path.join( self.outdir, header, ) parts = f"{pascal_label}-part.*" existing_header = False # check if file already exists if os.path.exists(header_path): logger.warning( f"Header file `{header_path}` already exists. Overwriting.", ) with open(header_path, "r", encoding="utf-8") as existing: existing_header = existing.read().strip().split(self.delim) # concatenate key:value in props props_list = [] for k, v in props.items(): if v in ["int", "long", "integer"]: props_list.append(f"{k}:long") elif v in ["int[]", "long[]", "integer[]"]: props_list.append(f"{k}:long[]") elif v in ["float", "double", "dbl"]: props_list.append(f"{k}:double") elif v in ["float[]", "double[]"]: props_list.append(f"{k}:double[]") elif v in ["bool", "boolean"]: # TODO Neo4j boolean support / spelling? props_list.append(f"{k}:boolean") elif v in ["bool[]", "boolean[]"]: props_list.append(f"{k}:boolean[]") elif v in ["str[]", "string[]"]: props_list.append(f"{k}:string[]") else: props_list.append(f"{k}") # create list of lists and flatten out_list = [[_id], props_list, [":LABEL"]] out_list = [val for sublist in out_list for val in sublist] with open(header_path, "w", encoding="utf-8") as f: # Check if header file already exists and has different columns if os.path.exists(header_path): if existing_header: #existing_header = existing.read().strip().split(self.delim) # Compare existing and new headers if set(existing_header) != set(out_list): # Get part files associated with this header base_name = os.path.basename(header_path).replace("-header.csv", "") part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv")) # Find the highest numbered part file without full sorting highest_part = None highest_number = -1 for part_file in part_files: try: # Extract number from filename (assuming format like "part123.csv") file_name = os.path.basename(part_file) number_part = file_name.split("part")[1].split(".")[0] number = int(number_part) if number > highest_number: highest_number = number highest_part = part_file except (IndexError, ValueError): # Skip files that don't match the expected pattern continue # Update each part file with the new columns for part_file in part_files: if part_file == highest_part: print(f"Skipping the highest part file: {highest_part}") continue try: #print("exi: ", existing_header) #print("out: ", out_list) df = self.adapt_csv_to_new_header(existing_header, out_list, part_file) # Read the file without headers # Write back to file WITHOUT including the header df.to_csv(part_file, sep=self.delim, index=False, header=False) print(f"Updated {part_file} with new columns in correct positions") except Exception as e: print(f"Error updating {part_file}: {e}") # Write the new header row = self.delim.join(out_list) f.write(row) # add file path to neo4 admin import statement (import call file # path may be different from actual file path) import_call_header_path = os.path.join( self.import_call_file_prefix, header, ) import_call_parts_path = os.path.join( self.import_call_file_prefix, parts, ) self.import_call_nodes.add( (import_call_header_path, import_call_parts_path) ) return True def _write_edge_headers(self): """ Writes single CSV file for a graph entity that is represented as an edge as per the definition in the `schema_config.yaml`, containing only the header for this type of edge. Returns: bool: The return value. True for success, False otherwise. """ # load headers from data parse if not self.edge_property_dict: logger.error( "Header information not found. Was the data parsed first?", ) return False for label, props in self.edge_property_dict.items(): # translate label to PascalCase pascal_label = self.translator.name_sentence_to_pascal( parse_label(label) ) # paths header = f"{pascal_label}-header.csv" header_path = os.path.join( self.outdir, header, ) parts = f"{pascal_label}-part.*" # check for file exists if os.path.exists(header_path): logger.warning( f"File {header_path} already exists. Overwriting." ) # concatenate key:value in props props_list = [] for k, v in props.items(): if v in ["int", "long", "integer"]: props_list.append(f"{k}:long") elif v in ["int[]", "long[]", "integer[]"]: props_list.append(f"{k}:long[]") elif v in ["float", "double"]: props_list.append(f"{k}:double") elif v in ["float[]", "double[]"]: props_list.append(f"{k}:double[]") elif v in [ "bool", "boolean", ]: # TODO does Neo4j support bool? props_list.append(f"{k}:boolean") elif v in ["bool[]", "boolean[]"]: props_list.append(f"{k}:boolean[]") elif v in ["str[]", "string[]"]: props_list.append(f"{k}:string[]") else: props_list.append(f"{k}") skip_id = False schema_label = None if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]: skip_id = True elif not self.translator.ontology.mapping.extended_schema.get( label ): # find label in schema by label_as_edge for ( k, v, ) in self.translator.ontology.mapping.extended_schema.items(): if v.get("label_as_edge") == label: schema_label = k break else: schema_label = label out_list = [":START_ID"] if schema_label: if ( self.translator.ontology.mapping.extended_schema.get( schema_label ).get("use_id") == False ): skip_id = True if not skip_id: out_list.append("id") out_list.extend(props_list) out_list.extend([":END_ID", ":TYPE"]) existing_header = False # check if file already exists if os.path.exists(header_path): logger.warning( f"Header file `{header_path}` already exists. Overwriting.", ) with open(header_path, "r", encoding="utf-8") as existing: existing_header = existing.read().strip().split(self.delim) with open(header_path, "w", encoding="utf-8") as f: # Check if header file already exists and has different columns if os.path.exists(header_path): if existing_header: #existing_header = existing.read().strip().split(self.delim) # Compare existing and new headers if set(existing_header) != set(out_list): # Get part files associated with this header base_name = os.path.basename(header_path).replace("-header.csv", "") part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv")) # Find the highest numbered part file without full sorting highest_part = None highest_number = -1 for part_file in part_files: try: # Extract number from filename (assuming format like "part123.csv") file_name = os.path.basename(part_file) number_part = file_name.split("part")[1].split(".")[0] number = int(number_part) if number > highest_number: highest_number = number highest_part = part_file except (IndexError, ValueError): # Skip files that don't match the expected pattern continue # Update each part file with the new columns for part_file in part_files: if part_file == highest_part: print(f"Skipping the highest part file: {highest_part}") continue try: print("exi: ", existing_header) print("out: ", out_list) df = self.adapt_csv_to_new_header(existing_header, out_list, part_file) # Read the file without headers # Write back to file WITHOUT including the header df.to_csv(part_file, sep=self.delim, index=False, header=False) print(f"Updated {part_file} with new columns in correct positions") except Exception as e: print(f"Error updating {part_file}: {e}") # Write the new header row = self.delim.join(out_list) f.write(row) # add file path to neo4 admin import statement (import call file # path may be different from actual file path) import_call_header_path = os.path.join( self.import_call_file_prefix, header, ) import_call_parts_path = os.path.join( self.import_call_file_prefix, parts, ) self.import_call_edges.add( (import_call_header_path, import_call_parts_path) ) return True def _get_import_script_name(self) -> str: """ Returns the name of the neo4j admin import script Returns: str: The name of the import script (ending in .sh) """ return "neo4j-admin-import-call.sh" def _construct_import_call(self) -> str: """ Function to construct the import call detailing folder and individual node and edge headers and data files, as well as delimiters and database name. Built after all data has been processed to ensure that nodes are called before any edges. Returns: str: a bash command for neo4j-admin import """ import_call_neo4j_v4 = self._get_import_call( "import", "--database=", "--force=" ) import_call_neo4j_v5 = self._get_import_call( "database import full", "", "--overwrite-destination=" ) neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)" import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi" return import_script def _get_import_call( self, import_cmd: str, database_cmd: str, wipe_cmd: str ) -> str: """Get parametrized import call for Neo4j 4 or 5+. Args: import_cmd (str): The import command to use. database_cmd (str): The database command to use. wipe_cmd (str): The wipe command to use. Returns: str: The import call. """ import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} " import_call += f"{database_cmd}{self.db_name} " import_call += f'--delimiter="{self.escaped_delim}" ' import_call += f'--array-delimiter="{self.escaped_adelim}" ' if self.quote == "'": import_call += f'--quote="{self.quote}" ' else: import_call += f"--quote='{self.quote}' " if self.wipe: import_call += f"{wipe_cmd}true " if self.skip_bad_relationships: import_call += "--skip-bad-relationships=true " if self.skip_duplicate_nodes: import_call += "--skip-duplicate-nodes=true " # append node import calls for header_path, parts_path in self.import_call_nodes: import_call += f'--nodes="{header_path},{parts_path}" ' # append edge import calls for header_path, parts_path in self.import_call_edges: import_call += f'--relationships="{header_path},{parts_path}" ' return import_call def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path): """ Adapt a CSV table to a new header structure, placing new columns in their correct positions. Parameters: old_header (list): The original header columns new_header (list): The new header columns csv_file_path (str): Path to the CSV file Returns: pandas.DataFrame: CSV data with the new header structure """ # Step 1: Read the CSV data without headers df = pd.read_csv(csv_file_path, sep=self.delim, header=None) # Step 2: If the file is empty, return empty DataFrame with new headers if df.empty: return pd.DataFrame(columns=new_header) # Step 3: If column count doesn't match old_header length, handle the mismatch if len(df.columns) != len(old_header): print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})") # If file has fewer columns than old_header, pad with NaN if len(df.columns) < len(old_header): for i in range(len(df.columns), len(old_header)): df[i] = None # If file has more columns than old_header, truncate else: df = df.iloc[:, :len(old_header)] # Step 4: Assign old header names to the dataframe df.columns = old_header # Step 5: Create a new DataFrame with the correct structure new_df = pd.DataFrame(columns=new_header) # Step 6: For each column in the new header, find its position in the old header for new_col_idx, new_col in enumerate(new_header): if new_col in old_header: # If column exists in old header, copy data new_df[new_col] = df[new_col] else: # If new column, add empty column new_df[new_col] = None # Step 7: Ensure columns are in the exact order of new_header new_df = new_df[new_header] return new_df