"""GNN Benchmark datasets for node classification."""
import scipy.sparse as sp
import numpy as np
import os

from .dgl_dataset import DGLBuiltinDataset
from .utils import save_graphs, load_graphs, _get_dgl_url, deprecate_property, deprecate_class
from ..convert import graph as dgl_graph
from .. import backend as F
from .. import transform

__all__ = ["AmazonCoBuyComputerDataset", "AmazonCoBuyPhotoDataset", "CoauthorPhysicsDataset", "CoauthorCSDataset",
           "CoraFullDataset", "AmazonCoBuy", "Coauthor", "CoraFull"]


def eliminate_self_loops(A):
    """Remove self-loops from the adjacency matrix."""
    A = A.tolil()
    A.setdiag(0)
    A = A.tocsr()
    A.eliminate_zeros()
    return A
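
# A minimal usage sketch for ``eliminate_self_loops`` (illustrative only; not
# executed when this module is imported):
#
#   >>> A = sp.csr_matrix(np.array([[1, 1, 0],
#   ...                             [1, 0, 1],
#   ...                             [0, 1, 1]]))
#   >>> eliminate_self_loops(A).toarray()
#   array([[0, 1, 0],
#          [1, 0, 1],
#          [0, 1, 0]])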


class GNNBenchmarkDataset(DGLBuiltinDataset):
    r"""Base class for the GNN Benchmark datasets.

    Reference: https://github.com/shchur/gnn-benchmark#datasets
    """
    def __init__(self, name, raw_dir=None, force_reload=False, verbose=False):
        _url = _get_dgl_url('dataset/' + name + '.zip')
        super(GNNBenchmarkDataset, self).__init__(name=name,
                                                  url=_url,
                                                  raw_dir=raw_dir,
                                                  force_reload=force_reload,
                                                  verbose=verbose)

    def process(self):
        npz_path = os.path.join(self.raw_path, self.name + '.npz')
        g = self._load_npz(npz_path)
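        # Reorder nodes with reverse Cuthill-McKee ('rcmk') and sort edges by
        # destination ('dst') for better locality; original IDs are not stored.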
        g = transform.reorder_graph(
            g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)
        self._graph = g
        self._data = [g]
        self._print_info()

    def has_cache(self):
        graph_path = os.path.join(self.save_path, 'dgl_graph_v1.bin')
        return os.path.exists(graph_path)

    def save(self):
        graph_path = os.path.join(self.save_path, 'dgl_graph_v1.bin')
        save_graphs(graph_path, self._graph)

    def load(self):
        graph_path = os.path.join(self.save_path, 'dgl_graph_v1.bin')
        graphs, _ = load_graphs(graph_path)
        self._graph = graphs[0]
        self._data = [graphs[0]]
        self._print_info()

    def _print_info(self):
        if self.verbose:
            print('  NumNodes: {}'.format(self._graph.number_of_nodes()))
            print('  NumEdges: {}'.format(self._graph.number_of_edges()))
            print('  NumFeats: {}'.format(self._graph.ndata['feat'].shape[-1]))
            print('  NumClasses: {}'.format(self.num_classes))

    def _load_npz(self, file_name):
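        """Load a graph from a gnn-benchmark ``.npz`` file.

        The file stores the adjacency matrix in CSR pieces
        (``adj_data``, ``adj_indices``, ``adj_indptr``, ``adj_shape``),
        node attributes either as CSR pieces (``attr_*``) or as a dense
        ``attr_matrix``, and labels either as CSR pieces (``labels_*``) or as
        a dense ``labels`` array, as handled by the branches below.
        """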
        with np.load(file_name, allow_pickle=True) as loader:
            loader = dict(loader)
            num_nodes = loader['adj_shape'][0]
            adj_matrix = sp.csr_matrix((loader['adj_data'], loader['adj_indices'], loader['adj_indptr']),
                                       shape=loader['adj_shape']).tocoo()

            if 'attr_data' in loader:
                # Attributes are stored as a sparse CSR matrix
                attr_matrix = sp.csr_matrix((loader['attr_data'], loader['attr_indices'], loader['attr_indptr']),
                                            shape=loader['attr_shape']).todense()
            elif 'attr_matrix' in loader:
                # Attributes are stored as a (dense) np.ndarray
                attr_matrix = loader['attr_matrix']
            else:
                attr_matrix = None

            if 'labels_data' in loader:
                # Labels are stored as a CSR matrix
                labels = sp.csr_matrix((loader['labels_data'], loader['labels_indices'], loader['labels_indptr']),
                                       shape=loader['labels_shape']).todense()
            elif 'labels' in loader:
                # Labels are stored as a numpy array
                labels = loader['labels']
            else:
                labels = None
        # Pass num_nodes explicitly so that isolated nodes (if any) are kept.
        g = dgl_graph((adj_matrix.row, adj_matrix.col), num_nodes=num_nodes)
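        # Make the graph bidirected: add reverse edges and coalesce duplicates.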
        g = transform.to_bidirected(g)
        g.ndata['feat'] = F.tensor(attr_matrix, F.data_type_dict['float32'])
        g.ndata['label'] = F.tensor(labels, F.data_type_dict['int64'])
        return g

    @property
    def num_classes(self):
        """Number of classes."""
        raise NotImplementedError

    @property
    def data(self):
        deprecate_property('dataset.data', 'dataset[0]')
        return self._data

    def __getitem__(self, idx):
        r""" Get graph by index

        Parameters
        ----------
        idx : int
            Item index

        Returns
        -------
        :class:`dgl.DGLGraph`

            The graph contains:

            - ``ndata['feat']``: node features
            - ``ndata['label']``: node labels
        """
        assert idx == 0, "This dataset has only one graph"
        return self._graph

    def __len__(self):
        r"""Number of graphs in the dataset"""
        return 1


class CoraFullDataset(GNNBenchmarkDataset):
    r"""CORA-Full dataset for node classification task.

    .. deprecated:: 0.5.0

        - ``data`` is deprecated, it is replaced by:

            >>> dataset = CoraFullDataset()
            >>> graph = dataset[0]

    Extended Cora dataset. Nodes represent papers and edges represent citations.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 19,793
    - Edges: 126,842 (note that the original dataset has 65,311 edges
      but DGL adds the reverse edges and removes the duplicates,
      hence the different number)
    - Number of classes: 70
    - Node feature size: 8,710

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contain the input data.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.
    data : list
        A list of DGLGraph objects

    Examples
    --------
    >>> data = CoraFullDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node features
    >>> label = g.ndata['label']  # get node labels
    """
    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(CoraFullDataset, self).__init__(name="cora_full",
                                              raw_dir=raw_dir,
                                              force_reload=force_reload,
                                              verbose=verbose)

    @property
    def num_classes(self):
        """Number of classes.

        Returns
        -------
        int
        """
        return 70


class CoauthorCSDataset(GNNBenchmarkDataset):
    r"""'Computer Science (CS)' part of the Coauthor dataset for node classification task.

    .. deprecated:: 0.5.0

        - ``data`` is deprecated, it is replaced by:

            >>> dataset = CoauthorCSDataset()
            >>> graph = dataset[0]

    Coauthor CS and Coauthor Physics are co-authorship graphs based on the
    Microsoft Academic Graph from the KDD Cup 2016 challenge. Here, nodes are
    authors, who are connected by an edge if they co-authored a paper;
    node features represent paper keywords for each author's papers, and
    class labels indicate the most active fields of study for each author.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 18,333
    - Edges: 163,788 (note that the original dataset has 81,894 edges
      but DGL adds the reverse edges and removes the duplicates,
      hence the different number)
    - Number of classes: 15
    - Node feature size: 6,805

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contain the input data.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.
    data : list
        A list of DGLGraph objects

    Examples
    --------
    >>> data = CoauthorCSDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node features
    >>> label = g.ndata['label']  # get node labels
    """
    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(CoauthorCSDataset, self).__init__(name='coauthor_cs',
                                                raw_dir=raw_dir,
                                                force_reload=force_reload,
                                                verbose=verbose)

    @property
    def num_classes(self):
        """Number of classes.

        Returns
        -------
        int
        """
        return 15


class CoauthorPhysicsDataset(GNNBenchmarkDataset):
    r"""'Physics' part of the Coauthor dataset for node classification task.

    .. deprecated:: 0.5.0

        - ``data`` is deprecated, it is replaced by:

            >>> dataset = CoauthorPhysicsDataset()
            >>> graph = dataset[0]

    Coauthor CS and Coauthor Physics are co-authorship graphs based on the
    Microsoft Academic Graph from the KDD Cup 2016 challenge. Here, nodes are
    authors, who are connected by an edge if they co-authored a paper;
    node features represent paper keywords for each author's papers, and
    class labels indicate the most active fields of study for each author.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 34,493
    - Edges: 495,924 (note that the original dataset has 247,962 edges
      but DGL adds the reverse edges and removes the duplicates,
      hence the different number)
    - Number of classes: 5
    - Node feature size: 8,415

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contain the input data.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.
    data : list
        A list of DGLGraph objects

    Examples
    --------
    >>> data = CoauthorPhysicsDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node features
    >>> label = g.ndata['label']  # get node labels
    """
    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(CoauthorPhysicsDataset, self).__init__(name='coauthor_physics',
                                                     raw_dir=raw_dir,
                                                     force_reload=force_reload,
                                                     verbose=verbose)

    @property
    def num_classes(self):
        """Number of classes.

        Returns
        -------
        int
        """
        return 5


class AmazonCoBuyComputerDataset(GNNBenchmarkDataset):
    r"""'Computer' part of the AmazonCoBuy dataset for node classification task.

    .. deprecated:: 0.5.0

        - ``data`` is deprecated, it is replaced by:

            >>> dataset = AmazonCoBuyComputerDataset()
            >>> graph = dataset[0]

    Amazon Computers and Amazon Photo are segments of the Amazon co-purchase
    graph [McAuley et al., 2015], where nodes represent goods, edges indicate
    that two goods are frequently bought together, node features are
    bag-of-words encoded product reviews, and class labels are given by the
    product category.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 13,752
    - Edges: 491,722 (note that the original dataset has 245,778 edges
      but DGL adds the reverse edges and removes the duplicates,
      hence the different number)
    - Number of classes: 10
    - Node feature size: 767

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contain the input data.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.
    data : list
        A list of DGLGraph objects

    Examples
    --------
    >>> data = AmazonCoBuyComputerDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node features
    >>> label = g.ndata['label']  # get node labels
    """
    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(AmazonCoBuyComputerDataset, self).__init__(name='amazon_co_buy_computer',
                                                         raw_dir=raw_dir,
                                                         force_reload=force_reload,
                                                         verbose=verbose)

    @property
    def num_classes(self):
        """Number of classes.

        Returns
        -------
        int
        """
        return 10


class AmazonCoBuyPhotoDataset(GNNBenchmarkDataset):
    r"""'Photo' part of the AmazonCoBuy dataset for node classification task.

    .. deprecated:: 0.5.0

        - ``data`` is deprecated, it is replaced by:

            >>> dataset = AmazonCoBuyPhotoDataset()
            >>> graph = dataset[0]

    Amazon Computers and Amazon Photo are segments of the Amazon co-purchase
    graph [McAuley et al., 2015], where nodes represent goods, edges indicate
    that two goods are frequently bought together, node features are
    bag-of-words encoded product reviews, and class labels are given by the
    product category.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 7,650
    - Edges: 238,163 (note that the original dataset has 119,043 edges
      but DGL adds the reverse edges and removes the duplicates,
      hence the different number)
    - Number of classes: 8
    - Node feature size: 745

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contain the input data.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.
    data : list
        A list of DGLGraph objects

    Examples
    --------
    >>> data = AmazonCoBuyPhotoDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node features
    >>> label = g.ndata['label']  # get node labels
    """
    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(AmazonCoBuyPhotoDataset, self).__init__(name='amazon_co_buy_photo',
                                                      raw_dir=raw_dir,
                                                      force_reload=force_reload,
                                                      verbose=verbose)

    @property
    def num_classes(self):
        """Number of classes.

        Returns
        -------
        int
        """
        return 8


class CoraFull(CoraFullDataset):
    def __init__(self, **kwargs):
        deprecate_class('CoraFull', 'CoraFullDataset')
        super(CoraFull, self).__init__(**kwargs)


def AmazonCoBuy(name):
    if name == 'computers':
        deprecate_class('AmazonCoBuy', 'AmazonCoBuyComputerDataset')
        return AmazonCoBuyComputerDataset()
    elif name == 'photo':
        deprecate_class('AmazonCoBuy', 'AmazonCoBuyPhotoDataset')
        return AmazonCoBuyPhotoDataset()
    else:
        raise ValueError('Dataset name should be "computers" or "photo".')


def Coauthor(name):
    if name == 'cs':
        deprecate_class('Coauthor', 'CoauthorCSDataset')
        return CoauthorCSDataset()
    elif name == 'physics':
        deprecate_class('Coauthor', 'CoauthorPhysicsDataset')
        return CoauthorPhysicsDataset()
    else:
        raise ValueError('Dataset name should be "cs" or "physics".')
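
# Minimal usage sketch for the deprecated wrappers above; each call emits a
# deprecation warning and returns an instance of the corresponding new class:
#
#   >>> dataset = AmazonCoBuy('computers')   # AmazonCoBuyComputerDataset
#   >>> dataset = Coauthor('physics')        # CoauthorPhysicsDataset
#   >>> dataset = CoraFull()                 # CoraFullDataset subclass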