Source code for dgl.data.gnn_benchmark

"""GNN Benchmark datasets for node classification."""
import os

import numpy as np
import scipy.sparse as sp

from .. import backend as F, transforms
from ..convert import graph as dgl_graph

from .dgl_dataset import DGLBuiltinDataset
from .utils import (
    _get_dgl_url,
    deprecate_class,
    deprecate_property,
    load_graphs,
    save_graphs,
)

__all__ = [
    "AmazonCoBuyComputerDataset",
    "AmazonCoBuyPhotoDataset",
    "CoauthorPhysicsDataset",
    "CoauthorCSDataset",
    "CoraFullDataset",
    "AmazonCoBuy",
    "Coauthor",
    "CoraFull",
]


def eliminate_self_loops(A):
    """Remove self-loops from the adjacency matrix."""
    A = A.tolil()
    A.setdiag(0)
    A = A.tocsr()
    A.eliminate_zeros()
    return A


class GNNBenchmarkDataset(DGLBuiltinDataset):
    r"""Base Class for GNN Benchmark dataset

    Reference: https://github.com/shchur/gnn-benchmark#datasets
    """

    def __init__(
        self,
        name,
        raw_dir=None,
        force_reload=False,
        verbose=False,
        transform=None,
    ):
        _url = _get_dgl_url("dataset/" + name + ".zip")
        super(GNNBenchmarkDataset, self).__init__(
            name=name,
            url=_url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        npz_path = os.path.join(self.raw_path, self.name + ".npz")
        g = self._load_npz(npz_path)
        g = transforms.reorder_graph(
            g,
            node_permute_algo="rcmk",
            edge_permute_algo="dst",
            store_ids=False,
        )
        self._graph = g
        self._data = [g]
        self._print_info()

    def has_cache(self):
        graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
        if os.path.exists(graph_path):
            return True
        return False

    def save(self):
        graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
        save_graphs(graph_path, self._graph)

    def load(self):
        graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
        graphs, _ = load_graphs(graph_path)
        self._graph = graphs[0]
        self._data = [graphs[0]]
        self._print_info()

    def _print_info(self):
        if self.verbose:
            print("  NumNodes: {}".format(self._graph.num_nodes()))
            print("  NumEdges: {}".format(self._graph.num_edges()))
            print("  NumFeats: {}".format(self._graph.ndata["feat"].shape[-1]))
            print("  NumbClasses: {}".format(self.num_classes))

    def _load_npz(self, file_name):
        with np.load(file_name, allow_pickle=True) as loader:
            loader = dict(loader)
            num_nodes = loader["adj_shape"][0]
            adj_matrix = sp.csr_matrix(
                (
                    loader["adj_data"],
                    loader["adj_indices"],
                    loader["adj_indptr"],
                ),
                shape=loader["adj_shape"],
            ).tocoo()

            if "attr_data" in loader:
                # Attributes are stored as a sparse CSR matrix
                attr_matrix = sp.csr_matrix(
                    (
                        loader["attr_data"],
                        loader["attr_indices"],
                        loader["attr_indptr"],
                    ),
                    shape=loader["attr_shape"],
                ).todense()
            elif "attr_matrix" in loader:
                # Attributes are stored as a (dense) np.ndarray
                attr_matrix = loader["attr_matrix"]
            else:
                attr_matrix = None

            if "labels_data" in loader:
                # Labels are stored as a CSR matrix
                labels = sp.csr_matrix(
                    (
                        loader["labels_data"],
                        loader["labels_indices"],
                        loader["labels_indptr"],
                    ),
                    shape=loader["labels_shape"],
                ).todense()
            elif "labels" in loader:
                # Labels are stored as a numpy array
                labels = loader["labels"]
            else:
                labels = None
        g = dgl_graph((adj_matrix.row, adj_matrix.col))
        g = transforms.to_bidirected(g)
        g.ndata["feat"] = F.tensor(attr_matrix, F.data_type_dict["float32"])
        g.ndata["label"] = F.tensor(labels, F.data_type_dict["int64"])
        return g

    @property
    def num_classes(self):
        """Number of classes."""
        raise NotImplementedError

    def __getitem__(self, idx):
        r"""Get graph by index

        Parameters
        ----------
        idx : int
            Item index

        Returns
        -------
        :class:`dgl.DGLGraph`

            The graph contains:

            - ``ndata['feat']``: node features
            - ``ndata['label']``: node labels
        """
        assert idx == 0, "This dataset has only one graph"
        if self._transform is None:
            return self._graph
        else:
            return self._transform(self._graph)

    def __len__(self):
        r"""Number of graphs in the dataset"""
        return 1


[docs]class CoraFullDataset(GNNBenchmarkDataset):
    r"""CORA-Full dataset for node classification task.

    Extended Cora dataset. Nodes represent paper and edges represent citations.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 19,793
    - Edges: 126,842 (note that the original dataset has 65,311 edges but DGL adds
      the reverse edges and remove the duplicates, hence with a different number)
    - Number of Classes: 70
    - Node feature size: 8,710

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = CoraFullDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(CoraFullDataset, self).__init__(
            name="cora_full",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 70


[docs]class CoauthorCSDataset(GNNBenchmarkDataset):
    r"""'Computer Science (CS)' part of the Coauthor dataset for node classification task.

    Coauthor CS and Coauthor Physics are co-authorship graphs based on the Microsoft Academic Graph
    from the KDD Cup 2016 challenge. Here, nodes are authors, that are connected by an edge if they
    co-authored a paper; node features represent paper keywords for each author’s papers, and class
    labels indicate most active fields of study for each author.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 18,333
    - Edges: 163,788 (note that the original dataset has 81,894 edges but DGL adds
      the reverse edges and remove the duplicates, hence with a different number)
    - Number of classes: 15
    - Node feature size: 6,805

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = CoauthorCSDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(CoauthorCSDataset, self).__init__(
            name="coauthor_cs",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 15


[docs]class CoauthorPhysicsDataset(GNNBenchmarkDataset):
    r"""'Physics' part of the Coauthor dataset for node classification task.

    Coauthor CS and Coauthor Physics are co-authorship graphs based on the Microsoft Academic Graph
    from the KDD Cup 2016 challenge. Here, nodes are authors, that are connected by an edge if they
    co-authored a paper; node features represent paper keywords for each author’s papers, and class
    labels indicate most active fields of study for each author.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics

    - Nodes: 34,493
    - Edges: 495,924 (note that the original dataset has 247,962 edges but DGL adds
      the reverse edges and remove the duplicates, hence with a different number)
    - Number of classes: 5
    - Node feature size: 8,415

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = CoauthorPhysicsDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(CoauthorPhysicsDataset, self).__init__(
            name="coauthor_physics",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 5


[docs]class AmazonCoBuyComputerDataset(GNNBenchmarkDataset):
    r"""'Computer' part of the AmazonCoBuy dataset for node classification task.

    Amazon Computers and Amazon Photo are segments of the Amazon co-purchase graph [McAuley et al., 2015],
    where nodes represent goods, edges indicate that two goods are frequently bought together, node
    features are bag-of-words encoded product reviews, and class labels are given by the product category.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 13,752
    - Edges: 491,722 (note that the original dataset has 245,778 edges but DGL adds
      the reverse edges and remove the duplicates, hence with a different number)
    - Number of classes: 10
    - Node feature size: 767

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = AmazonCoBuyComputerDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(AmazonCoBuyComputerDataset, self).__init__(
            name="amazon_co_buy_computer",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 10


[docs]class AmazonCoBuyPhotoDataset(GNNBenchmarkDataset):
    r"""AmazonCoBuy dataset for node classification task.

    Amazon Computers and Amazon Photo are segments of the Amazon co-purchase graph [McAuley et al., 2015],
    where nodes represent goods, edges indicate that two goods are frequently bought together, node
    features are bag-of-words encoded product reviews, and class labels are given by the product category.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics

    - Nodes: 7,650
    - Edges: 238,163 (note that the original dataset has 119,043 edges but DGL adds
      the reverse edges and remove the duplicates, hence with a different number)
    - Number of classes: 8
    - Node feature size: 745

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = AmazonCoBuyPhotoDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(AmazonCoBuyPhotoDataset, self).__init__(
            name="amazon_co_buy_photo",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 8


class CoraFull(CoraFullDataset):
    def __init__(self, **kwargs):
        deprecate_class("CoraFull", "CoraFullDataset")
        super(CoraFull, self).__init__(**kwargs)


def AmazonCoBuy(name):
    if name == "computers":
        deprecate_class("AmazonCoBuy", "AmazonCoBuyComputerDataset")
        return AmazonCoBuyComputerDataset()
    elif name == "photo":
        deprecate_class("AmazonCoBuy", "AmazonCoBuyPhotoDataset")
        return AmazonCoBuyPhotoDataset()
    else:
        raise ValueError('Dataset name should be "computers" or "photo".')


def Coauthor(name):
    if name == "cs":
        deprecate_class("Coauthor", "CoauthorCSDataset")
        return CoauthorCSDataset()
    elif name == "physics":
        deprecate_class("Coauthor", "CoauthorPhysicsDataset")
        return CoauthorPhysicsDataset()
    else:
        raise ValueError('Dataset name should be "cs" or "physics".')