release commit

commit a9db0be88a
2025-04-16 22:12:19 +02:00
89 changed files with 2336827 additions and 0 deletions

View File

@@ -0,0 +1,76 @@
from collections.abc import Iterable

from more_itertools import peekable
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.in_memory._pandas import Pandas
class _PandasCSVWriter(_Writer):
"""
Class for writing node and edge representations to CSV files (one file per entity type).
"""
def __init__(self, *args, write_to_file: bool = True, **kwargs):
kwargs["write_to_file"] = write_to_file
super().__init__(*args, **kwargs)
self.in_memory_dfs = {}
self.stored_dfs = {}
self.pandas_in_memory = Pandas(
translator=self.translator,
deduplicator=self.deduplicator,
)
self.delimiter = kwargs.get("delimiter")
if not self.delimiter:
self.delimiter = ","
self.write_to_file = write_to_file
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
import_call = "import pandas as pd\n\n"
for df_name in self.stored_dfs.keys():
import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_pandas_csv.py"
def _write_node_data(self, nodes) -> bool:
"""Write the given nodes to a CSV file."""
passed = self._write_entities_to_file(nodes)
return passed
def _write_edge_data(self, edges) -> bool:
"""Write the given edges to a CSV file."""
passed = self._write_entities_to_file(edges)
return passed
def _write_entities_to_file(self, entities: Iterable) -> bool:
"""Write the entities to a CSV file.
Args:
entities (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
Returns:
bool: True if writing was successful.
"""
entities = peekable(entities)
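# peekable lets downstream code inspect the first entity without consuming it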
entity_list = self.pandas_in_memory._separate_entity_types(entities)
for entity_type, entities in entity_list.items():
self.in_memory_dfs[entity_type] = self.pandas_in_memory._add_entity_df(entity_type, entities)
for entity_type in self.in_memory_dfs.keys():
entity_df = self.in_memory_dfs[entity_type]
if " " in entity_type or "." in entity_type:
entity_type = entity_type.replace(" ", "_").replace(".", "_")
if self.write_to_file:
logger.info(
f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
)
entity_df.to_csv(
f"{self.output_directory}/{entity_type}.csv",
sep=self.delimiter,
)
self.stored_dfs[entity_type] = entity_df
self.in_memory_dfs = {}
return True
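
For illustration, a minimal sketch of what _construct_import_call() emits, assuming two hypothetical stored DataFrames named protein and gene:

# hypothetical content of import_pandas_csv.py
import pandas as pd

protein = pd.read_csv('./protein.csv', header=0, index_col=0)
gene = pd.read_csv('./gene.csv', header=0, index_col=0)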

View File

@@ -0,0 +1,320 @@
import os
import glob
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _PostgreSQLBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk in the
format required by the PostgreSQL "COPY FROM" command. Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
PostgreSQL-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
DATA_TYPE_LOOKUP = {
"str": "VARCHAR", # VARCHAR needs limit
"int": "INTEGER",
"long": "BIGINT",
"float": "NUMERIC",
"double": "NUMERIC",
"dbl": "NUMERIC",
"boolean": "BOOLEAN",
"str[]": "VARCHAR[]",
"string[]": "VARCHAR[]",
}
def __init__(self, *args, **kwargs):
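# collected "\copy" statements; a set avoids duplicate commands for the same part file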
self._copy_from_csv_commands = set()
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location for the psql command
"""
return ""
def _get_data_type(self, string) -> str:
"""Map a schema data type to its PostgreSQL equivalent, falling back to VARCHAR."""
try:
return self.DATA_TYPE_LOOKUP[string]
except KeyError:
logger.info(
f'Could not determine data type {string}. Using default "VARCHAR".'
)
return "VARCHAR"
def _write_array_string(self, string_list) -> str:
"""
Write the string representation of an array into a CSV file,
as required by the PostgreSQL COPY command, with '{', '}' brackets and ',' separation.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for postgres COPY
"""
string = ",".join(string_list)
string = f'"{{{string}}}"'
return string
def _get_import_script_name(self) -> str:
"""
Returns the name of the psql import script
Returns:
str: The name of the import script (ending in .sh)
"""
return f"{self.db_name}-import-call.sh"
def _adjust_pascal_to_psql(self, string):
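# e.g. "Protein.Isoform" -> "protein_isoform"; PostgreSQL folds unquoted identifiers to lower case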
string = string.replace(".", "_")
string = string.lower()
return string
def _write_node_headers(self):
"""
Write a single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts = f"{pascal_label}-part*.csv"
parts_paths = os.path.join(self.outdir, parts)
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = ["_ID VARCHAR"]
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
columns.append(f"{col_name} {col_type}")
columns.append("_LABEL VARCHAR[]")
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_nodes.add(table_create_command_path)
return True
def _write_edge_headers(self):
"""
Write a single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
parts_paths = glob.glob(parts_paths)
parts_paths.sort()
# adjust label for import to psql
pascal_label = self._adjust_pascal_to_psql(pascal_label)
table_create_command_path = os.path.join(
self.outdir,
f"{pascal_label}-create_table.sql",
)
# check if file already exists
if os.path.exists(table_create_command_path):
logger.warning(
f"File {table_create_command_path} already exists. Overwriting.",
)
# concatenate key:value in props
columns = []
for col_name, col_type in props.items():
col_type = self._get_data_type(col_type)
col_name = self._adjust_pascal_to_psql(col_name)
if col_name == "_ID":
# should ideally never happen
raise ValueError(
"Column name '_ID' is reserved for internal use, "
"denoting the relationship ID. Please choose a "
"different name for your column."
)
columns.append(f"{col_name} {col_type}")
# assemble the full column list by unpacking;
# this avoids a separate empty check of the property columns
out_list = [
"_START_ID VARCHAR",
"_ID VARCHAR",
*columns,
"_END_ID VARCHAR",
"_TYPE VARCHAR",
]
with open(table_create_command_path, "w", encoding="utf-8") as f:
command = ""
if self.wipe:
command += f"DROP TABLE IF EXISTS {pascal_label};\n"
# table creation requires comma separation
command += (
f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
)
f.write(command)
for parts_path in parts_paths:
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
parts_path = parts_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self._copy_from_csv_commands.add(
f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
)
# add file path to import statement
# if import_call_file_prefix is set, replace actual path
# with prefix
if self.import_call_file_prefix != self.outdir:
table_create_command_path = table_create_command_path.replace(
self.outdir,
self.import_call_file_prefix,
)
self.import_call_edges.add(table_create_command_path)
return True
def _construct_import_call(self) -> str:
"""
Construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that node tables are created before any edge tables.
Returns:
str: A bash command for the PostgreSQL import.
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
if self.db_password:
# set the password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += (
f"{self.import_call_bin_prefix}psql -f {import_file_path}"
)
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
# copy data to tables
for command in self._copy_from_csv_commands:
table_part = command.split(" ")[3]
import_call += f'echo "Importing {table_part}..."\n'
if self.db_password:
# set the password variable inline
import_call += f"PGPASSWORD={self.db_password} "
import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
import_call += f" --dbname {self.db_name}"
import_call += f" --host {self.db_host}"
import_call += f" --port {self.db_port}"
import_call += f" --user {self.db_user}"
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call
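
Taken together, for a hypothetical node label "Protein" with a single "name: str" property, wipe enabled, and a tab delimiter, the constructed call would look roughly like the following (a sketch assembled from the code above, with placeholder paths and credentials, not verbatim output):

-- protein-create_table.sql
DROP TABLE IF EXISTS protein;
CREATE TABLE protein(_ID VARCHAR,name VARCHAR,_LABEL VARCHAR[]);

# <db_name>-import-call.sh
echo "Setup /outdir/protein-create_table.sql..."
PGPASSWORD=<password> psql -f /outdir/protein-create_table.sql --dbname <db> --host <host> --port <port> --user <user>
echo "Done!"

echo "Importing '/outdir/Protein-part000.csv'..."
PGPASSWORD=<password> psql -c "\copy protein FROM '/outdir/Protein-part000.csv' DELIMITER E'\t' CSV;" --dbname <db> --host <host> --port <port> --user <user>
echo "Done!"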

View File

@@ -0,0 +1,51 @@
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
"""
Class for writing node and edge representations to a SQLite database.
It uses the _PostgreSQLBatchWriter class under the hood, which already
implements the logic to write the nodes/edges to a relational DBMS.
Only the import bash script differs between PostgreSQL and SQLite,
so only this method is overridden here:
- _construct_import_call
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _construct_import_call(self) -> str:
"""
Construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that node tables are created before any edge tables.
Returns:
str: A bash command for the SQLite import.
"""
import_call = ""
# create tables
# At this point, csv files of nodes and edges do not require differentiation
for import_file_path in [
*self.import_call_nodes,
*self.import_call_edges,
]:
import_call += f'echo "Setup {import_file_path}..."\n'
import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
import_call += '\necho "Done!"\n'
import_call += "\n"
for command in self._copy_from_csv_commands:
table_name = command.split(" ")[1]
table_part = command.split(" ")[3].replace("'", "")
import_call += f'echo "Importing {table_part}..."\n'
separator = self.delim
import_part = f".import {table_part} {table_name}"
import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
import_call += '\necho "Done!"\n'
import_call += "\n"
return import_call
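
For the same hypothetical "Protein" example, the SQLite variant would emit roughly the following (a sketch with placeholder paths, assuming a tab delimiter):

echo "Setup /outdir/protein-create_table.sql..."
sqlite3 <db_name> < /outdir/protein-create_table.sql
echo "Done!"

echo "Importing /outdir/Protein-part000.csv..."
sqlite3 -separator $'\t' <db_name> ".import /outdir/Protein-part000.csv protein"
echo "Done!"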