# Source code for bioneuralnet.network_embedding.node2vec

from typing import Optional
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from ..utils.logger import get_logger


class Node2VecEmbedding:
    """
    Node2VecEmbedding Class for Generating Node2Vec-Based Embeddings.

    This class handles the execution of the Node2Vec algorithm on a provided graph adjacency matrix
    and returns the resulting node embeddings.

    Attributes:
        adjacency_matrix (pd.DataFrame): Adjacency matrix the graph is built from.
        embedding_dim (int): Dimension of the embeddings.
        walk_length (int): Length of each walk.
        num_walks (int): Number of walks per node.
        window_size (int): Window size for Word2Vec.
        workers (int): Number of worker threads.
        seed (int): Random seed for reproducibility.
        p (float): Return parameter for Node2Vec.
        q (float): In-Out parameter for Node2Vec.
        weight_key (str): Edge weight parameter name.
        logger: Logger obtained from ``get_logger(__name__)``.
        embeddings (Optional[pd.DataFrame]): Result of the last successful ``run()``;
            ``None`` until ``run()`` has completed.
    """

    def __init__(
        self,
        adjacency_matrix: pd.DataFrame,
        embedding_dim: int = 128,
        walk_length: int = 80,
        num_walks: int = 10,
        window_size: int = 10,
        workers: int = 4,
        seed: int = 42,
        p: float = 1.0,
        q: float = 1.0,
        weight_key: str = 'weight',
    ):
        """
        Initializes the Node2VecEmbedding instance.

        Only stores the configuration and logs it; no graph is built and no
        embedding is computed until ``run()`` is called.

        Args:
            adjacency_matrix (pd.DataFrame): Adjacency matrix representing the graph.
            embedding_dim (int, optional): Dimension of the embeddings. Defaults to 128.
            walk_length (int, optional): Length of each walk. Defaults to 80.
            num_walks (int, optional): Number of walks per node. Defaults to 10.
            window_size (int, optional): Window size for Word2Vec. Defaults to 10.
            workers (int, optional): Number of worker threads. Defaults to 4.
            seed (int, optional): Random seed for reproducibility. Defaults to 42.
            p (float, optional): Return parameter for Node2Vec. Defaults to 1.0.
            q (float, optional): In-Out parameter for Node2Vec. Defaults to 1.0.
            weight_key (str, optional): Edge weight parameter name. Defaults to 'weight'.
        """
        self.adjacency_matrix = adjacency_matrix
        self.embedding_dim = embedding_dim
        self.walk_length = walk_length
        self.num_walks = num_walks
        self.window_size = window_size
        self.workers = workers
        self.seed = seed
        self.p = p
        self.q = q
        self.weight_key = weight_key
        self.logger = get_logger(__name__)

        self.logger.info("Initialized Node2VecEmbedding with the following parameters:")
        for label, value in (
            ("Embedding Dimension", self.embedding_dim),
            ("Walk Length", self.walk_length),
            ("Number of Walks", self.num_walks),
            ("Window Size", self.window_size),
            ("Workers", self.workers),
            ("Seed", self.seed),
            ("Return Parameter (p)", self.p),
            ("In-Out Parameter (q)", self.q),
            ("Weight Key", self.weight_key),
        ):
            self.logger.info(f"{label}: {value}")

        self.embeddings: Optional[pd.DataFrame] = None

    @staticmethod
    def _apply_weight_key(edges, weight_key: str) -> None:
        """
        Makes sure every edge carries its weight under the ``'weight'`` attribute.

        The attribute dictionaries are modified in place.

        Args:
            edges: Iterable of ``(u, v, data)`` triples where ``data`` is the
                mutable edge-attribute dictionary (e.g. ``G.edges(data=True)``).
            weight_key (str): Name of the attribute that holds the edge weight.
        """
        if weight_key == 'weight':
            return

        for _, _, data in edges:
            if weight_key in data:
                data['weight'] = data.pop(weight_key)
            else:
                # Only fall back to 1.0 when the edge has no weight at all.
                # Overwriting unconditionally would discard the weights that
                # the adjacency-matrix conversion already stored under 'weight'.
                data.setdefault('weight', 1.0)

    def _convert_to_networkx(self) -> "nx.Graph":
        """
        Converts the adjacency matrix to a NetworkX graph.

        The graph is undirected and its edge weights end up under the
        ``'weight'`` attribute regardless of ``weight_key``.

        Returns:
            nx.Graph: The converted NetworkX graph.

        Raises:
            Exception: Any error raised during the conversion is logged and re-raised.
        """
        try:
            G = nx.from_pandas_adjacency(self.adjacency_matrix, create_using=nx.Graph())
            self._apply_weight_key(G.edges(data=True), self.weight_key)

            self.logger.info(
                f"Converted adjacency matrix to NetworkX graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges."
            )
            return G
        except Exception as e:
            self.logger.error(f"Error converting adjacency matrix to NetworkX graph: {e}")
            raise

    def _generate_embeddings(self, G: "nx.Graph") -> pd.DataFrame:
        """
        Generates node embeddings using Node2Vec.

        Args:
            G (nx.Graph): The NetworkX graph.

        Returns:
            pd.DataFrame: DataFrame with a ``'node'`` column followed by one
            column per embedding dimension, named ``'0'`` .. ``str(embedding_dim - 1)``.

        Raises:
            Exception: Any error raised while building or fitting the model is
                logged and re-raised.
        """
        try:
            self.logger.info("Initializing Node2Vec model.")
            node2vec = Node2Vec(
                G,
                dimensions=self.embedding_dim,
                walk_length=self.walk_length,
                num_walks=self.num_walks,
                workers=self.workers,
                seed=self.seed,
                p=self.p,
                q=self.q,
                quiet=True,
            )

            self.logger.info("Fitting Node2Vec model to generate embeddings.")
            model = node2vec.fit(window=self.window_size, min_count=1, batch_words=4)

            self.logger.info("Generating embeddings DataFrame.")
            embeddings_df = pd.DataFrame(
                model.wv.vectors,
                index=model.wv.index_to_key,
                columns=[str(i) for i in range(self.embedding_dim)],
            )
            # Move the node labels out of the index into a regular 'node' column.
            embeddings_df.index.name = 'node'
            embeddings_df.reset_index(inplace=True)

            self.logger.info("Embeddings generated successfully.")
            return embeddings_df

        except Exception as e:
            self.logger.error(f"Error generating embeddings with Node2Vec: {e}")
            raise

    def run(self) -> pd.DataFrame:
        """
        Runs the Node2Vec embedding process.

        Steps:
            1. Converting to NetworkX Graph: converts the input adjacency
               matrix to a NetworkX graph object.
            2. Embedding Generation: executes the Node2Vec algorithm to
               generate low-dimensional embeddings for the graph nodes.
            3. Output Preparation: stores the result on ``self.embeddings``
               and returns it.

        Returns:
            pd.DataFrame: One row per node, with a ``'node'`` column followed
            by one column per embedding dimension.

        Raises:
            Exception: For any errors encountered during graph conversion or
                embedding generation. The error is logged and re-raised.

        Notes:
            - Ensure the adjacency matrix is properly formatted and reflects
              the graph's structure.
            - Adjust hyperparameters like ``walk_length`` or ``embedding_dim``
              to tune the Node2Vec process.

        Example:
            .. code-block:: python

                node2vec = Node2VecEmbedding(adjacency_matrix)
                embeddings = node2vec.run()
                print(embeddings.head())
        """
        try:
            self.logger.info("Starting Node2Vec embedding process.")
            G = self._convert_to_networkx()
            self.embeddings = self._generate_embeddings(G)
            self.logger.info("Node2Vec embedding process completed successfully.")
            return self.embeddings
        except Exception as e:
            self.logger.error(f"Error in Node2VecEmbedding run method: {e}")
            raise

    def get_embeddings(self) -> pd.DataFrame:
        """
        Retrieves the generated node embeddings.

        Returns:
            pd.DataFrame: DataFrame containing node embeddings (the same
            object stored on ``self.embeddings``, not a copy).

        Raises:
            ValueError: If embeddings have not been generated yet.
        """
        if self.embeddings is None:
            self.logger.error("Embeddings have not been generated yet. Call the run() method first.")
            raise ValueError("Embeddings have not been generated yet. Call the run() method first.")

        return self.embeddings

    def save_embeddings(self, filepath: str) -> None:
        """
        Saves the generated embeddings to a CSV file.

        The DataFrame index is not written.

        Args:
            filepath (str): Path to save the embeddings CSV file.

        Raises:
            ValueError: If embeddings have not been generated yet.
            Exception: Any error raised while writing the file is logged and re-raised.
        """
        # Reuse the accessor so the "not generated yet" check lives in one place.
        embeddings = self.get_embeddings()

        try:
            embeddings.to_csv(filepath, index=False)
            self.logger.info(f"Embeddings saved to {filepath}")
        except Exception as e:
            self.logger.error(f"Error saving embeddings to {filepath}: {e}")
            raise