"""GNN Benchmark datasets for node classification."""
import os
import numpy as np
import scipy.sparse as sp
from .. import backend as F, transforms
from ..convert import graph as dgl_graph
from .dgl_dataset import DGLBuiltinDataset
from .utils import (
_get_dgl_url,
deprecate_class,
deprecate_property,
load_graphs,
save_graphs,
)
__all__ = [
"AmazonCoBuyComputerDataset",
"AmazonCoBuyPhotoDataset",
"CoauthorPhysicsDataset",
"CoauthorCSDataset",
"CoraFullDataset",
"AmazonCoBuy",
"Coauthor",
"CoraFull",
]
def eliminate_self_loops(A):
"""Remove self-loops from the adjacency matrix."""
A = A.tolil()
A.setdiag(0)
A = A.tocsr()
A.eliminate_zeros()
return A
class GNNBenchmarkDataset(DGLBuiltinDataset):
r"""Base Class for GNN Benchmark dataset
Reference: https://github.com/shchur/gnn-benchmark#datasets
"""
def __init__(
self,
name,
raw_dir=None,
force_reload=False,
verbose=False,
transform=None,
):
_url = _get_dgl_url("dataset/" + name + ".zip")
super(GNNBenchmarkDataset, self).__init__(
name=name,
url=_url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
def process(self):
npz_path = os.path.join(self.raw_path, self.name + ".npz")
g = self._load_npz(npz_path)
g = transforms.reorder_graph(
g,
node_permute_algo="rcmk",
edge_permute_algo="dst",
store_ids=False,
)
self._graph = g
self._data = [g]
self._print_info()
def has_cache(self):
graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
if os.path.exists(graph_path):
return True
return False
def save(self):
graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
save_graphs(graph_path, self._graph)
def load(self):
graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
graphs, _ = load_graphs(graph_path)
self._graph = graphs[0]
self._data = [graphs[0]]
self._print_info()
def _print_info(self):
if self.verbose:
print(" NumNodes: {}".format(self._graph.num_nodes()))
print(" NumEdges: {}".format(self._graph.num_edges()))
print(" NumFeats: {}".format(self._graph.ndata["feat"].shape[-1]))
print(" NumbClasses: {}".format(self.num_classes))
def _load_npz(self, file_name):
with np.load(file_name, allow_pickle=True) as loader:
loader = dict(loader)
num_nodes = loader["adj_shape"][0]
adj_matrix = sp.csr_matrix(
(
loader["adj_data"],
loader["adj_indices"],
loader["adj_indptr"],
),
shape=loader["adj_shape"],
).tocoo()
if "attr_data" in loader:
# Attributes are stored as a sparse CSR matrix
attr_matrix = sp.csr_matrix(
(
loader["attr_data"],
loader["attr_indices"],
loader["attr_indptr"],
),
shape=loader["attr_shape"],
).todense()
elif "attr_matrix" in loader:
# Attributes are stored as a (dense) np.ndarray
attr_matrix = loader["attr_matrix"]
else:
attr_matrix = None
if "labels_data" in loader:
# Labels are stored as a CSR matrix
labels = sp.csr_matrix(
(
loader["labels_data"],
loader["labels_indices"],
loader["labels_indptr"],
),
shape=loader["labels_shape"],
).todense()
elif "labels" in loader:
# Labels are stored as a numpy array
labels = loader["labels"]
else:
labels = None
g = dgl_graph((adj_matrix.row, adj_matrix.col))
g = transforms.to_bidirected(g)
g.ndata["feat"] = F.tensor(attr_matrix, F.data_type_dict["float32"])
g.ndata["label"] = F.tensor(labels, F.data_type_dict["int64"])
return g
@property
def num_classes(self):
"""Number of classes."""
raise NotImplementedError
def __getitem__(self, idx):
r"""Get graph by index
Parameters
----------
idx : int
Item index
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['feat']``: node features
- ``ndata['label']``: node labels
"""
assert idx == 0, "This dataset has only one graph"
if self._transform is None:
return self._graph
else:
return self._transform(self._graph)
def __len__(self):
r"""Number of graphs in the dataset"""
return 1
[docs]class CoraFullDataset(GNNBenchmarkDataset):
r"""CORA-Full dataset for node classification task.
Extended Cora dataset. Nodes represent paper and edges represent citations.
Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_
Statistics:
- Nodes: 19,793
- Edges: 126,842 (note that the original dataset has 65,311 edges but DGL adds
the reverse edges and remove the duplicates, hence with a different number)
- Number of Classes: 70
- Node feature size: 8,710
Parameters
----------
raw_dir : str
Raw file directory to download/contains the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
Whether to print out progress information. Default: True.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
num_classes : int
Number of classes for each node.
Examples
--------
>>> data = CoraFullDataset()
>>> g = data[0]
>>> num_class = data.num_classes
>>> feat = g.ndata['feat'] # get node feature
>>> label = g.ndata['label'] # get node labels
"""
def __init__(
self, raw_dir=None, force_reload=False, verbose=False, transform=None
):
super(CoraFullDataset, self).__init__(
name="cora_full",
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
@property
def num_classes(self):
"""Number of classes.
Return
-------
int
"""
return 70
[docs]class CoauthorCSDataset(GNNBenchmarkDataset):
r"""'Computer Science (CS)' part of the Coauthor dataset for node classification task.
Coauthor CS and Coauthor Physics are co-authorship graphs based on the Microsoft Academic Graph
from the KDD Cup 2016 challenge. Here, nodes are authors, that are connected by an edge if they
co-authored a paper; node features represent paper keywords for each authorโs papers, and class
labels indicate most active fields of study for each author.
Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_
Statistics:
- Nodes: 18,333
- Edges: 163,788 (note that the original dataset has 81,894 edges but DGL adds
the reverse edges and remove the duplicates, hence with a different number)
- Number of classes: 15
- Node feature size: 6,805
Parameters
----------
raw_dir : str
Raw file directory to download/contains the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
Whether to print out progress information. Default: True.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
num_classes : int
Number of classes for each node.
Examples
--------
>>> data = CoauthorCSDataset()
>>> g = data[0]
>>> num_class = data.num_classes
>>> feat = g.ndata['feat'] # get node feature
>>> label = g.ndata['label'] # get node labels
"""
def __init__(
self, raw_dir=None, force_reload=False, verbose=False, transform=None
):
super(CoauthorCSDataset, self).__init__(
name="coauthor_cs",
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
@property
def num_classes(self):
"""Number of classes.
Return
-------
int
"""
return 15
[docs]class CoauthorPhysicsDataset(GNNBenchmarkDataset):
r"""'Physics' part of the Coauthor dataset for node classification task.
Coauthor CS and Coauthor Physics are co-authorship graphs based on the Microsoft Academic Graph
from the KDD Cup 2016 challenge. Here, nodes are authors, that are connected by an edge if they
co-authored a paper; node features represent paper keywords for each authorโs papers, and class
labels indicate most active fields of study for each author.
Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_
Statistics
- Nodes: 34,493
- Edges: 495,924 (note that the original dataset has 247,962 edges but DGL adds
the reverse edges and remove the duplicates, hence with a different number)
- Number of classes: 5
- Node feature size: 8,415
Parameters
----------
raw_dir : str
Raw file directory to download/contains the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
Whether to print out progress information. Default: True.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
num_classes : int
Number of classes for each node.
Examples
--------
>>> data = CoauthorPhysicsDataset()
>>> g = data[0]
>>> num_class = data.num_classes
>>> feat = g.ndata['feat'] # get node feature
>>> label = g.ndata['label'] # get node labels
"""
def __init__(
self, raw_dir=None, force_reload=False, verbose=False, transform=None
):
super(CoauthorPhysicsDataset, self).__init__(
name="coauthor_physics",
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
@property
def num_classes(self):
"""Number of classes.
Return
-------
int
"""
return 5
[docs]class AmazonCoBuyComputerDataset(GNNBenchmarkDataset):
r"""'Computer' part of the AmazonCoBuy dataset for node classification task.
Amazon Computers and Amazon Photo are segments of the Amazon co-purchase graph [McAuley et al., 2015],
where nodes represent goods, edges indicate that two goods are frequently bought together, node
features are bag-of-words encoded product reviews, and class labels are given by the product category.
Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_
Statistics:
- Nodes: 13,752
- Edges: 491,722 (note that the original dataset has 245,778 edges but DGL adds
the reverse edges and remove the duplicates, hence with a different number)
- Number of classes: 10
- Node feature size: 767
Parameters
----------
raw_dir : str
Raw file directory to download/contains the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
Whether to print out progress information. Default: True.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
num_classes : int
Number of classes for each node.
Examples
--------
>>> data = AmazonCoBuyComputerDataset()
>>> g = data[0]
>>> num_class = data.num_classes
>>> feat = g.ndata['feat'] # get node feature
>>> label = g.ndata['label'] # get node labels
"""
def __init__(
self, raw_dir=None, force_reload=False, verbose=False, transform=None
):
super(AmazonCoBuyComputerDataset, self).__init__(
name="amazon_co_buy_computer",
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
@property
def num_classes(self):
"""Number of classes.
Return
-------
int
"""
return 10
[docs]class AmazonCoBuyPhotoDataset(GNNBenchmarkDataset):
r"""AmazonCoBuy dataset for node classification task.
Amazon Computers and Amazon Photo are segments of the Amazon co-purchase graph [McAuley et al., 2015],
where nodes represent goods, edges indicate that two goods are frequently bought together, node
features are bag-of-words encoded product reviews, and class labels are given by the product category.
Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_
Statistics
- Nodes: 7,650
- Edges: 238,163 (note that the original dataset has 119,043 edges but DGL adds
the reverse edges and remove the duplicates, hence with a different number)
- Number of classes: 8
- Node feature size: 745
Parameters
----------
raw_dir : str
Raw file directory to download/contains the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
Whether to print out progress information. Default: True.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
num_classes : int
Number of classes for each node.
Examples
--------
>>> data = AmazonCoBuyPhotoDataset()
>>> g = data[0]
>>> num_class = data.num_classes
>>> feat = g.ndata['feat'] # get node feature
>>> label = g.ndata['label'] # get node labels
"""
def __init__(
self, raw_dir=None, force_reload=False, verbose=False, transform=None
):
super(AmazonCoBuyPhotoDataset, self).__init__(
name="amazon_co_buy_photo",
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
@property
def num_classes(self):
"""Number of classes.
Return
-------
int
"""
return 8
class CoraFull(CoraFullDataset):
def __init__(self, **kwargs):
deprecate_class("CoraFull", "CoraFullDataset")
super(CoraFull, self).__init__(**kwargs)
def AmazonCoBuy(name):
if name == "computers":
deprecate_class("AmazonCoBuy", "AmazonCoBuyComputerDataset")
return AmazonCoBuyComputerDataset()
elif name == "photo":
deprecate_class("AmazonCoBuy", "AmazonCoBuyPhotoDataset")
return AmazonCoBuyPhotoDataset()
else:
raise ValueError('Dataset name should be "computers" or "photo".')
def Coauthor(name):
if name == "cs":
deprecate_class("Coauthor", "CoauthorCSDataset")
return CoauthorCSDataset()
elif name == "physics":
deprecate_class("Coauthor", "CoauthorPhysicsDataset")
return CoauthorPhysicsDataset()
else:
raise ValueError('Dataset name should be "cs" or "physics".')