release commit

2025-04-16 22:12:19 +02:00
commit a9db0be88a
89 changed files with 2336827 additions and 0 deletions

View File

@ -0,0 +1,241 @@
import os
from biocypher._logger import logger
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
"""
Class for writing node and edge representations to disk using the format
specified by ArangoDB for the use of "arangoimport". Output files are
similar to Neo4j, but with a different header format.
"""
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location prefix for the ArangoDB import call (empty, since the binary is expected to be on the PATH)
"""
return ""
def _get_import_script_name(self) -> str:
"""
Returns the name of the ArangoDB import call script
Returns:
str: The name of the import script (ending in .sh)
"""
return "arangodb-import-call.sh"
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
# create header CSV with ID, properties, labels
_id = "_key"
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# collect property names for the header
props_list = []
for k in props.keys():
props_list.append(f"{k}")
# create list of lists and flatten
# removes need for empty check of property list
out_list = [[_id], props_list]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the ArangoDB import statement
# do this once for each part file
parts = self.parts.get(label, [])
if not parts:
raise ValueError(
f"No parts found for node label {label}. "
f"Check that the data was parsed first.",
)
for part in parts:
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
part,
)
self.import_call_nodes.add(
(
import_call_header_path,
import_call_parts_path,
collection,
)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(label)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check if the file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file {header_path} already exists. Overwriting."
)
# collect property names for the header
props_list = []
for k in props.keys():
props_list.append(f"{k}")
out_list = ["_from", "_key", *props_list, "_to"]
with open(header_path, "w", encoding="utf-8") as f:
# concatenate with delimiter
row = self.delim.join(out_list)
f.write(row)
# add collection from schema config
if not self.translator.ontology.mapping.extended_schema.get(label):
for (
_,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
collection = v.get("db_collection_name", None)
break
else:
collection = self.translator.ontology.mapping.extended_schema[
label
].get("db_collection_name", None)
# add file path to the ArangoDB import statement (the import call path
# may differ from the actual output path)
header_import_call_path = os.path.join(
self.import_call_file_prefix,
header,
)
parts_import_call_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(
header_import_call_path,
parts_import_call_path,
collection,
)
)
return True
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for the ArangoDB import
"""
import_call = (
f"{self.import_call_bin_prefix}arangoimp "
f"--type csv "
f'--separator="{self.escaped_delim}" '
)
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
node_lines = ""
# node import calls: one line per node type
for header_path, parts_path, collection in self.import_call_nodes:
line = (
f"{import_call} "
f"--headers-file {header_path} "
f"--file= {parts_path} "
)
if collection:
line += f"--create-collection --collection {collection} "
node_lines += f"{line}\n"
edge_lines = ""
# edge import calls: one line per edge type
for header_path, parts_path, collection in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return node_lines + edge_lines
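# Illustrative sketch (not part of the writer): how a single node import line
# is assembled from the pieces above, assuming a "|" delimiter, a single quote
# as quote character, and a hypothetical "proteins" collection.
if __name__ == "__main__":
    import_call = 'arangoimp --type csv --separator="|" --quote="\'" '
    header_path = "/outdir/Protein-header.csv"
    parts_path = "/outdir/Protein-part000.csv"
    collection = "proteins"
    line = f"{import_call} --headers-file {header_path} --file={parts_path} "
    if collection:
        line += f"--create-collection --collection {collection} "
    print(line)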

View File

@ -0,0 +1,502 @@
import os
import glob
import pandas as pd
from biocypher._logger import logger
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
class _Neo4jBatchWriter(_BatchWriter):
"""
Class for writing node and edge representations to disk using the
format specified by Neo4j for the use of admin import. Each batch
writer instance has a fixed representation that needs to be passed
at instantiation via the :py:attr:`schema` argument. The instance
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
to convert and extend the hierarchy.
This class inherits from the abstract class "_BatchWriter" and implements the
Neo4j-specific methods:
- _write_node_headers
- _write_edge_headers
- _construct_import_call
- _write_array_string
"""
def __init__(self, *args, **kwargs):
"""
Constructor.
Checks the version of Neo4j and adds a command scope if the version is >= 5.
Returns:
_Neo4jBatchWriter: An instance of the writer.
"""
# Should read the configuration and setup import_call_bin_prefix.
super().__init__(*args, **kwargs)
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location prefix for the neo4j-admin binary
"""
return "bin/"
def _write_array_string(self, string_list):
"""
Method to write the string representation of an array into a .csv file
as required by neo4j admin import.
Args:
string_list (list): list of ontology strings
Returns:
str: The string representation of an array for the neo4j admin import
"""
string = self.adelim.join(string_list)
return f"{self.quote}{string}{self.quote}"
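# Example (illustrative): assuming the array delimiter ";" and the quote
# character '"', _write_array_string(["a", "b", "c"]) returns '"a;b;c"',
# which neo4j-admin import reads as a three-element array value.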
def _write_node_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`,
containing only the header for this type of node.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.node_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.node_property_dict.items():
_id = ":ID"
## MeDaX dev remark:
## FHIR data yields case-sensitive labels, e.g. 'Procedure' and 'procedure' are two distinct node types,
## because Resources are converted to more specific node classes using their "resourceType" attribute.
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double", "dbl"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in ["bool", "boolean"]:
# TODO Neo4j boolean support / spelling?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
# create list of lists and flatten
out_list = [[_id], props_list, [":LABEL"]]
out_list = [val for sublist in out_list for val in sublist]
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write the realigned data back WITHOUT the header row
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.error(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to the neo4j-admin import statement (the import call file
# path may differ from the actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_nodes.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _write_edge_headers(self):
"""
Writes single CSV file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
Returns:
bool: The return value. True for success, False otherwise.
"""
# load headers from data parse
if not self.edge_property_dict:
logger.error(
"Header information not found. Was the data parsed first?",
)
return False
for label, props in self.edge_property_dict.items():
# translate label to PascalCase
pascal_label = self.translator.name_sentence_to_pascal(
parse_label(label)
)
# paths
header = f"{pascal_label}-header.csv"
header_path = os.path.join(
self.outdir,
header,
)
parts = f"{pascal_label}-part.*"
# check if the file already exists
if os.path.exists(header_path):
logger.warning(
f"File {header_path} already exists. Overwriting."
)
# concatenate key:value in props
props_list = []
for k, v in props.items():
if v in ["int", "long", "integer"]:
props_list.append(f"{k}:long")
elif v in ["int[]", "long[]", "integer[]"]:
props_list.append(f"{k}:long[]")
elif v in ["float", "double"]:
props_list.append(f"{k}:double")
elif v in ["float[]", "double[]"]:
props_list.append(f"{k}:double[]")
elif v in [
"bool",
"boolean",
]: # TODO does Neo4j support bool?
props_list.append(f"{k}:boolean")
elif v in ["bool[]", "boolean[]"]:
props_list.append(f"{k}:boolean[]")
elif v in ["str[]", "string[]"]:
props_list.append(f"{k}:string[]")
else:
props_list.append(f"{k}")
skip_id = False
schema_label = None
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
skip_id = True
elif not self.translator.ontology.mapping.extended_schema.get(
label
):
# find label in schema by label_as_edge
for (
k,
v,
) in self.translator.ontology.mapping.extended_schema.items():
if v.get("label_as_edge") == label:
schema_label = k
break
else:
schema_label = label
out_list = [":START_ID"]
if schema_label:
if (
self.translator.ontology.mapping.extended_schema.get(
schema_label
).get("use_id")
is False
):
skip_id = True
if not skip_id:
out_list.append("id")
out_list.extend(props_list)
out_list.extend([":END_ID", ":TYPE"])
existing_header = False
# check if file already exists
if os.path.exists(header_path):
logger.warning(
f"Header file `{header_path}` already exists. Overwriting.",
)
with open(header_path, "r", encoding="utf-8") as existing:
existing_header = existing.read().strip().split(self.delim)
with open(header_path, "w", encoding="utf-8") as f:
# Check if header file already exists and has different columns
if os.path.exists(header_path):
if existing_header:
# Compare existing and new headers
if set(existing_header) != set(out_list):
# Get part files associated with this header
base_name = os.path.basename(header_path).replace("-header.csv", "")
part_files = glob.glob(os.path.join(os.path.dirname(header_path), f"{base_name}-part*.csv"))
# Find the highest numbered part file without full sorting
highest_part = None
highest_number = -1
for part_file in part_files:
try:
# Extract number from filename (assuming format like "part123.csv")
file_name = os.path.basename(part_file)
number_part = file_name.split("part")[1].split(".")[0]
number = int(number_part)
if number > highest_number:
highest_number = number
highest_part = part_file
except (IndexError, ValueError):
# Skip files that don't match the expected pattern
continue
# Update each part file with the new columns
for part_file in part_files:
if part_file == highest_part:
print(f"Skipping the highest part file: {highest_part}")
continue
try:
df = self.adapt_csv_to_new_header(existing_header, out_list, part_file)
# Write the realigned data back WITHOUT the header row
df.to_csv(part_file, sep=self.delim, index=False, header=False)
logger.info(f"Updated {part_file} with new columns in correct positions.")
except Exception as e:
logger.error(f"Error updating {part_file}: {e}")
# Write the new header
row = self.delim.join(out_list)
f.write(row)
# add file path to the neo4j-admin import statement (the import call file
# path may differ from the actual file path)
import_call_header_path = os.path.join(
self.import_call_file_prefix,
header,
)
import_call_parts_path = os.path.join(
self.import_call_file_prefix,
parts,
)
self.import_call_edges.add(
(import_call_header_path, import_call_parts_path)
)
return True
def _get_import_script_name(self) -> str:
"""
Returns the name of the neo4j admin import script
Returns:
str: The name of the import script (ending in .sh)
"""
return "neo4j-admin-import-call.sh"
def _construct_import_call(self) -> str:
"""
Function to construct the import call detailing folder and
individual node and edge headers and data files, as well as
delimiters and database name. Built after all data has been
processed to ensure that nodes are called before any edges.
Returns:
str: a bash command for neo4j-admin import
"""
import_call_neo4j_v4 = self._get_import_call(
"import", "--database=", "--force="
)
import_call_neo4j_v5 = self._get_import_call(
"database import full", "", "--overwrite-destination="
)
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
return import_script
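# The generated script has roughly this shape (illustrative, with the default
# "bin/" prefix and placeholder values):
#   #!/bin/bash
#   version=$(bin/neo4j-admin --version | cut -d '.' -f 1)
#   if [[ $version -ge 5 ]]; then
#       bin/neo4j-admin database import full <db_name> ... --nodes="<header>,<parts>" ...
#   else
#       bin/neo4j-admin import --database=<db_name> ... --nodes="<header>,<parts>" ...
#   fi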
def _get_import_call(
self, import_cmd: str, database_cmd: str, wipe_cmd: str
) -> str:
"""Get parametrized import call for Neo4j 4 or 5+.
Args:
import_cmd (str): The import command to use.
database_cmd (str): The database command to use.
wipe_cmd (str): The wipe command to use.
Returns:
str: The import call.
"""
import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
import_call += f"{database_cmd}{self.db_name} "
import_call += f'--delimiter="{self.escaped_delim}" '
import_call += f'--array-delimiter="{self.escaped_adelim}" '
if self.quote == "'":
import_call += f'--quote="{self.quote}" '
else:
import_call += f"--quote='{self.quote}' "
if self.wipe:
import_call += f"{wipe_cmd}true "
if self.skip_bad_relationships:
import_call += "--skip-bad-relationships=true "
if self.skip_duplicate_nodes:
import_call += "--skip-duplicate-nodes=true "
# append node import calls
for header_path, parts_path in self.import_call_nodes:
import_call += f'--nodes="{header_path},{parts_path}" '
# append edge import calls
for header_path, parts_path in self.import_call_edges:
import_call += f'--relationships="{header_path},{parts_path}" '
return import_call
def adapt_csv_to_new_header(self, old_header, new_header, csv_file_path):
"""
Adapt a CSV table to a new header structure, placing new columns in their correct positions.
Args:
old_header (list): The original header columns
new_header (list): The new header columns
csv_file_path (str): Path to the CSV file
Returns:
pandas.DataFrame: CSV data with the new header structure
"""
# Step 1: Read the CSV data without headers
df = pd.read_csv(csv_file_path, sep=self.delim, header=None)
# Step 2: If the file is empty, return empty DataFrame with new headers
if df.empty:
return pd.DataFrame(columns=new_header)
# Step 3: If column count doesn't match old_header length, handle the mismatch
if len(df.columns) != len(old_header):
print(f"Warning: CSV columns count ({len(df.columns)}) doesn't match the provided old header count ({len(old_header)})")
# If file has fewer columns than old_header, pad with NaN
if len(df.columns) < len(old_header):
for i in range(len(df.columns), len(old_header)):
df[i] = None
# If file has more columns than old_header, truncate
else:
df = df.iloc[:, :len(old_header)]
# Step 4: Assign old header names to the dataframe
df.columns = old_header
# Step 5: Create a new DataFrame with the correct structure
new_df = pd.DataFrame(columns=new_header)
# Step 6: For each column in the new header, find its position in the old header
for new_col_idx, new_col in enumerate(new_header):
if new_col in old_header:
# If column exists in old header, copy data
new_df[new_col] = df[new_col]
else:
# If new column, add empty column
new_df[new_col] = None
# Step 7: Ensure columns are in the exact order of new_header
new_df = new_df[new_header]
return new_df
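# Illustrative sketch (not part of the writer): the column-realignment idea
# behind adapt_csv_to_new_header, shown on in-memory data with an assumed ";"
# delimiter. Rows written under an old header are re-ordered and padded with
# empty columns to match a new header.
if __name__ == "__main__":
    import io
    import pandas as pd

    old_header = [":ID", "name", ":LABEL"]
    new_header = [":ID", "name", "taxon:long", ":LABEL"]
    csv_data = io.StringIO("p1;BRCA1;Protein\np2;TP53;Protein\n")
    df = pd.read_csv(csv_data, sep=";", header=None)
    df.columns = old_header
    adapted = pd.DataFrame(
        {col: (df[col] if col in old_header else None) for col in new_header}
    )
    print(adapted.to_csv(sep=";", index=False, header=False))
    # p1;BRCA1;;Protein
    # p2;TP53;;Protein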

View File

@ -0,0 +1,76 @@
import pickle
import networkx as nx
from biocypher._logger import logger
from biocypher.output.write._writer import _Writer
from biocypher.output.write.relational._csv import _PandasCSVWriter
class _NetworkXWriter(_Writer):
"""
Class for writing node and edges to a networkx DiGraph.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.csv_writer = _PandasCSVWriter(*args, write_to_file=False, **kwargs)
self.G = nx.DiGraph()
def _construct_import_call(self) -> str:
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
Returns:
str: Python code to load the csv files into Pandas dfs.
"""
logger.info(
f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
)
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
pickle.dump(self.G, f)
import_call = "import pickle\n"
import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
return import_call
def _get_import_script_name(self) -> str:
"""Function to return the name of the import script."""
return "import_networkx.py"
def _write_node_data(self, nodes) -> bool:
passed = self.csv_writer._write_entities_to_file(nodes)
self.add_to_networkx()
return passed
def _write_edge_data(self, edges) -> bool:
passed = self.csv_writer._write_entities_to_file(edges)
self.add_to_networkx()
return passed
def add_to_networkx(self) -> bool:
all_dfs = self.csv_writer.stored_dfs
node_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("node_id").any()
]
edge_dfs = [
df
for df in all_dfs.values()
if df.columns.str.contains("source_id").any()
and df.columns.str.contains("target_id").any()
]
for df in node_dfs:
nodes = df.set_index("node_id").to_dict(orient="index")
self.G.add_nodes_from(nodes.items())
for df in edge_dfs:
edges = df.set_index(["source_id", "target_id"]).to_dict(
orient="index"
)
self.G.add_edges_from(
(
(source, target, attrs)
for (source, target), attrs in edges.items()
)
)
return True
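# Illustrative sketch (not part of the writer): how add_to_networkx folds
# tabular node and edge data into a DiGraph; the column names mirror the ones
# the CSV writer is assumed to produce ("node_id", "source_id", "target_id").
if __name__ == "__main__":
    import networkx as nx
    import pandas as pd

    node_df = pd.DataFrame({"node_id": ["p1", "p2"], "name": ["BRCA1", "TP53"]})
    edge_df = pd.DataFrame(
        {"source_id": ["p1"], "target_id": ["p2"], "type": ["interacts_with"]}
    )
    G = nx.DiGraph()
    G.add_nodes_from(node_df.set_index("node_id").to_dict(orient="index").items())
    edges = edge_df.set_index(["source_id", "target_id"]).to_dict(orient="index")
    G.add_edges_from(
        (source, target, attrs) for (source, target), attrs in edges.items()
    )
    print(G)  # DiGraph with 2 nodes and 1 edge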

View File

@ -0,0 +1,515 @@
#!/usr/bin/env python
#
# Copyright 2021, Heidelberg University Clinic
#
# File author(s): Loes van den Biggelaar
# Sebastian Lobentanzer
#
# Distributed under MIT licence, see the file `LICENSE`.
#
"""
BioCypher 'offline' module. Handles the writing of node and edge representations
suitable for import into a DBMS.
"""
from types import GeneratorType
from typing import Union
import os
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
from rdflib.namespace import (
_NAMESPACE_PREFIXES_CORE,
_NAMESPACE_PREFIXES_RDFLIB,
)
from biocypher._create import BioCypherEdge, BioCypherNode
from biocypher._logger import logger
from biocypher.output.write._batch_writer import _BatchWriter
class _RDFWriter(_BatchWriter):
"""
Class to write BioCypher's property graph into an RDF format using
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
is done keeping only the minimum information about node and edges,
skipping all properties.
"""
def _get_import_script_name(self) -> str:
"""
Returns the name of the RDF import call script.
This function is applicable for RDF export.
Returns:
str: The name of the import script (ending in .sh)
"""
return "rdf-import-call.sh"
def _get_default_import_call_bin_prefix(self):
"""
Method to provide the default string for the import call bin prefix.
Returns:
str: The default location prefix for the RDF import call
"""
return "bin/"
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
"""
Function to check if the specified RDF format is supported.
Args:
rdf_format (str): The RDF format to check.
Returns:
bool: True if the RDF format is supported, False otherwise.
"""
supported_formats = [
"xml",
"n3",
"turtle",
"nt",
"pretty-xml",
"trix",
"trig",
"nquads",
"json-ld",
]
if rdf_format not in supported_formats:
logger.error(
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
)
return False
else:
# The RDF graph does not support the 'ttl' format name, only 'turtle'. However, the preferred file extension is always '.ttl'
if self.rdf_format == "turtle":
self.extension = "ttl"
elif self.rdf_format == "ttl":
self.rdf_format = "turtle"
self.extension = "ttl"
else:
self.extension = self.rdf_format
return True
def _write_single_edge_list_to_file(
self,
edge_list: list,
label: str,
prop_dict: dict,
):
"""
This function takes one list of biocypher edges and writes them
to an RDF file with the given format.
Args:
edge_list (list): list of BioCypherEdges to be written
label (str): the label (type) of the edge
prop_dict (dict): properties of the edge class passed from the parsing
function and their types
Returns:
bool: The return value. True for success, False otherwise.
"""
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
logger.error("Edges must be passed as type BioCypherEdge.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for edge in edge_list:
rdf_subject = edge.get_source_id()
rdf_object = edge.get_target_id()
rdf_predicate = edge.get_id()
rdf_properties = edge.get_properties()
if rdf_predicate is None:
rdf_predicate = rdf_subject + rdf_object
edge_label = self.translator.name_sentence_to_pascal(
edge.get_label()
)
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
graph.add((edge_uri, RDF.type, RDFS.Class))
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
RDF.type,
edge_uri,
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["subject"],
self.subject_to_uri(rdf_subject),
)
)
graph.add(
(
self.rdf_namespaces["biocypher"][rdf_predicate],
self.rdf_namespaces["biocypher"]["object"],
self.subject_to_uri(rdf_object),
)
)
# add properties to the transformed edge --> node
for key, value in rdf_properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_predicate, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def add_property_to_graph(
self,
graph: Graph,
rdf_subject: str,
rdf_object: str,
rdf_predicate: str,
):
"""
Function to add a property to an RDF node. It takes the graph and the subject, object, and predicate of the RDF triple.
If the property is a list, each element is added to the graph. If it is a string that represents a list, it is
transformed into a list and added element-wise. If it is neither a list nor such a string, it is added as a literal.
Args:
graph (RDFLib.Graph): The RDF graph to add the nodes to.
rdf_subject (str): The subject of the RDF triple.
rdf_object (str): The object of the RDF triple.
rdf_predicate (str): The predicate of the RDF triple.
Returns:
None
"""
if isinstance(rdf_object, list):
for obj in rdf_object:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(obj),
)
)
elif isinstance(rdf_object, str):
if rdf_object.startswith("[") and rdf_object.endswith("]"):
self.add_property_to_graph(
graph,
rdf_subject,
self.transform_string_to_list(rdf_object),
rdf_predicate,
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
else:
graph.add(
(
self.subject_to_uri(rdf_subject),
self.property_to_uri(rdf_predicate),
Literal(rdf_object),
)
)
def transform_string_to_list(self, string_list: str) -> list:
"""
Function to transform a string representation of a list into a list.
Args:
string_list (str): The string representation of the list.
Returns:
list: The list representation of the input string.
"""
return (
string_list.replace("[", "")
.replace("]", "")
.replace("'", "")
.split(", ")
)
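# Example (illustrative): transform_string_to_list("['alpha', 'beta']") returns
# ['alpha', 'beta']. The conversion is purely string-based, so element values
# that themselves contain ", " would be split as well.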
def _write_single_node_list_to_file(
self,
node_list: list,
label: str,
prop_dict: dict,
labels: str,
):
"""
This function takes a list of BioCypherNodes and writes them
to an RDF file in the specified format.
Args:
node_list (list): A list of BioCypherNodes to be written.
label (str): The label (type) of the nodes.
prop_dict (dict): A dictionary of properties and their types for the node class.
Returns:
bool: True if the writing is successful, False otherwise.
"""
if not all(isinstance(n, BioCypherNode) for n in node_list):
logger.error("Nodes must be passed as type BioCypherNode.")
return False
# translate label to PascalCase
label_pascal = self.translator.name_sentence_to_pascal(label)
# create file name
file_name = os.path.join(
self.outdir, f"{label_pascal}.{self.extension}"
)
# write data in graph
graph = Graph()
self._init_namespaces(graph)
for n in node_list:
rdf_subject = n.get_id()
rdf_object = n.get_label()
properties = n.get_properties()
class_name = self.translator.name_sentence_to_pascal(rdf_object)
graph.add(
(
self.rdf_namespaces["biocypher"][class_name],
RDF.type,
RDFS.Class,
)
)
graph.add(
(
self.subject_to_uri(rdf_subject),
RDF.type,
self.rdf_namespaces["biocypher"][class_name],
)
)
for key, value in properties.items():
# only write value if it exists.
if value:
self.add_property_to_graph(graph, rdf_subject, value, key)
graph.serialize(destination=file_name, format=self.rdf_format)
logger.info(
f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
)
return True
def write_nodes(
self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
"""
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
Args:
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
batch_size (int): The number of nodes to write in each batch.
force (bool): Flag to force the writing even if the output file already exists.
Returns:
bool: True if the writing is successful, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing node data, wrong RDF format")
return False
# write node data using _write_node_data method
passed = self._write_node_data(nodes, batch_size, force)
if not passed:
logger.error("Error while writing node data.")
return False
return True
def write_edges(
self,
edges: Union[list, GeneratorType],
batch_size: int = int(1e6),
) -> bool:
"""
Wrapper for writing edges in RDF format. It calls the _write_edge_data()
function, specifying the edge data.
Args:
edges (BioCypherEdge): a list or generator of edges in
:py:class:`BioCypherEdge` format
batch_size (int): The number of edges to write in each batch.
Returns:
bool: The return value. True for success, False otherwise.
"""
# check if specified output format is correct
passed = self._is_rdf_format_supported(self.rdf_format)
if not passed:
logger.error("Error while writing edge data, wrong RDF format")
return False
# write edge data using _write_edge_data method
passed = self._write_edge_data(edges, batch_size=batch_size)
if not passed:
logger.error("Error while writing edge data.")
return False
return True
def _construct_import_call(self) -> str:
"""
Function to write the import call.
This function is not applicable for RDF.
Returns:
str: An empty string, as no import call is needed for RDF.
"""
return ""
def _write_array_string(self, string_list):
"""
Abstract method to write the string representation of an array into a .csv file
as required by the RDF admin-import.
This function is not applicable for RDF.
Args:
string_list (list): list of ontology strings
Returns:
bool: True (not applicable for RDF).
"""
return True
def _write_node_headers(self):
"""
Abstract method that takes care of importing properties of a graph entity that is represented
as a node as per the definition in the `schema_config.yaml`
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def _write_edge_headers(self):
"""
Abstract method to write a database import-file for a graph entity that is represented
as an edge as per the definition in the `schema_config.yaml`,
containing only the header for this type of edge.
This function is not applicable for RDF.
Returns:
bool: The return value. True for success, False otherwise.
"""
return True
def subject_to_uri(self, subject: str) -> str:
"""
Converts the subject to a proper URI using the available namespaces.
If the conversion fails, it defaults to the biocypher prefix.
Args:
subject (str): The subject to be converted to a URI.
Returns:
str: The corresponding URI for the subject.
"""
try:
_pref, _id = subject.split(":")
if _pref in self.rdf_namespaces.keys():
return self.rdf_namespaces[_pref][_id]
else:
return self.rdf_namespaces["biocypher"][subject]
except ValueError:
return self.rdf_namespaces["biocypher"][subject]
def property_to_uri(self, property_name: str) -> str:
"""
Converts a property name to its corresponding URI.
This function takes a property name and searches for its corresponding URI in various namespaces.
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
Args:
property_name (str): The property name to be converted to a URI.
Returns:
str: The corresponding URI for the input property name.
"""
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
for namespace in _NAMESPACE_PREFIXES_CORE.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is not found in the core namespaces, search in the SKOS, DC, and DCTERMS namespaces
for namespace in [SKOS, DC, DCTERMS]:
if property_name in namespace:
return namespace[property_name]
# If the property name is still not found, try other namespaces from rdflib.
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
if property_name in namespace:
return namespace[property_name]
# If the property name is "licence", it recursively calls the function with "license" as the input.
if property_name == "licence":
return self.property_to_uri("license")
# TODO: add an option to search trough manually implemented namespaces
# If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
# TODO: give a warning and try to prevent this option altogether
return self.rdf_namespaces["biocypher"][property_name]
def _init_namespaces(self, graph: Graph):
"""
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
This function adds the biocypher standard namespace to the `rdf_namespaces` attribute of the class.
If `rdf_namespaces` is empty, it sets it to the biocypher standard namespace. Otherwise, it merges
the biocypher standard namespace with the namespaces defined in the biocypher_config.yaml.
Args:
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
Returns:
None
"""
# add biocypher standard to self.rdf_namespaces
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
if not self.rdf_namespaces:
self.rdf_namespaces = biocypher_standard
else:
self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
for key, value in self.rdf_namespaces.items():
namespace = Namespace(value)
self.rdf_namespaces[key] = namespace
graph.bind(key, namespace)
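# Illustrative sketch (not part of the writer): the triple pattern the node
# writer emits, using the same "biocypher" namespace that _init_namespaces
# binds; the identifiers and the property value are assumptions for the example.
if __name__ == "__main__":
    from rdflib import RDF, RDFS, Graph, Literal, Namespace

    biocypher = Namespace("https://biocypher.org/biocypher#")
    g = Graph()
    g.bind("biocypher", biocypher)
    g.add((biocypher["Protein"], RDF.type, RDFS.Class))
    g.add((biocypher["P38398"], RDF.type, biocypher["Protein"]))
    g.add((biocypher["P38398"], biocypher["name"], Literal("BRCA1")))
    print(g.serialize(format="turtle"))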