"""Fraud Dataset
"""
import os
from scipy import io
import numpy as np
from .utils import save_graphs, load_graphs, _get_dgl_url
from ..convert import heterograph
from .dgl_dataset import DGLBuiltinDataset
from .. import backend as F
class FraudDataset(DGLBuiltinDataset):
    r"""Fraud node prediction dataset.

    The dataset includes two multi-relational graphs extracted from Yelp and Amazon
    where nodes represent fraudulent reviews or fraudulent reviewers.

    It was first proposed in a CIKM'20 paper <https://arxiv.org/pdf/2008.08692.pdf> and
    has been used by a recent WWW'21 paper <https://ponderly.github.io/pub/PCGNN_WWW2021.pdf>
    as a benchmark. Another paper <https://arxiv.org/pdf/2104.01404.pdf> also takes
    the dataset as an example to study non-homophilous graphs. This dataset is built
    upon industrial data and has rich relational information and unique properties like
    class imbalance and feature inconsistency, which make it a good instance
    to investigate how GNNs perform on real-world noisy graphs. These graphs are bidirected
    and not self connected.

    Reference: <https://github.com/YingtongDou/CARE-GNN>

    Parameters
    ----------
    name : str
        Name of the dataset, either ``'yelp'`` or ``'amazon'``.
    raw_dir : str
        Specifying the directory that will store the
        downloaded data or the directory that
        already stores the input data.
        Default: ~/.dgl/
    random_seed : int
        Specifying the random seed in splitting the dataset.
        Default: 717
    train_size : float
        Training set size of the dataset, as a fraction of the nodes being split.
        Default: 0.7
    val_size : float
        Validation set size of the dataset, as a fraction of the nodes being
        split; the size of the testing set is (1 - train_size - val_size).
        Default: 0.1
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.

    Attributes
    ----------
    num_classes : int
        Number of label classes
    graph : dgl.DGLGraph
        Graph structure, etc.
    seed : int
        Random seed in splitting the dataset.
    train_size : float
        Training set size of the dataset.
    val_size : float
        Validation set size of the dataset

    Examples
    --------
    >>> dataset = FraudDataset('yelp')
    >>> graph = dataset[0]
    >>> num_classes = dataset.num_classes
    >>> feat = graph.ndata['feature']
    >>> label = graph.ndata['label']
    """
    # Archive location of each dataset, resolved through ``_get_dgl_url``.
    file_urls = {
        'yelp': 'dataset/FraudYelp.zip',
        'amazon': 'dataset/FraudAmazon.zip'
    }
    # Keys of the adjacency matrices in the raw .mat file; each one becomes
    # an edge type of the resulting heterograph.
    relations = {
        'yelp': ['net_rsr', 'net_rtr', 'net_rur'],
        'amazon': ['net_upu', 'net_usu', 'net_uvu']
    }
    # Name of the raw .mat file for each dataset.
    file_names = {
        'yelp': 'YelpChi.mat',
        'amazon': 'Amazon.mat'
    }
    # The single node type used by each dataset's graph.
    node_name = {
        'yelp': 'review',
        'amazon': 'user'
    }

    def __init__(self, name, raw_dir=None, random_seed=717, train_size=0.7,
                 val_size=0.1, force_reload=False, verbose=True):
        assert name in ['yelp', 'amazon'], "only supports 'yelp', or 'amazon'"
        url = _get_dgl_url(self.file_urls[name])
        # The split settings are stored before the base-class constructor
        # runs so that they are already available to ``process`` if the
        # constructor triggers it (NOTE(review): presumably it does - confirm
        # against DGLBuiltinDataset).
        self.seed = random_seed
        self.train_size = train_size
        self.val_size = val_size
        # The split settings are part of the hash key, and ``self.hash`` is
        # embedded in the cache file name used by save/load/has_cache.
        super(FraudDataset, self).__init__(name=name,
                                           url=url,
                                           raw_dir=raw_dir,
                                           hash_key=(random_seed, train_size, val_size),
                                           force_reload=force_reload,
                                           verbose=verbose)

    def process(self):
        """process raw data to graph, labels, splitting masks"""
        file_path = os.path.join(self.raw_path, self.file_names[self.name])

        data = io.loadmat(file_path)
        # Features are stored sparse in the .mat file; densify them.
        node_features = data['features'].todense()
        node_labels = data['label']

        # One edge type per relation, all between nodes of the same type.
        ntype = self.node_name[self.name]
        graph_data = {}
        for relation in self.relations[self.name]:
            adj = data[relation].tocoo()
            graph_data[(ntype, relation, ntype)] = (adj.row, adj.col)
        g = heterograph(graph_data)

        g.ndata['feature'] = F.tensor(node_features)
        # Labels are transposed before being attached as node data.
        g.ndata['label'] = F.tensor(node_labels.T)
        self.graph = g

        self._random_split(g.ndata['feature'], self.seed, self.train_size, self.val_size)

    def __getitem__(self, idx):
        r""" Get graph object

        Parameters
        ----------
        idx : int
            Item index

        Returns
        -------
        :class:`dgl.DGLGraph`
            graph structure, node features, node labels and masks

            - ``ndata['feature']``: node features
            - ``ndata['label']``: node labels
            - ``ndata['train_mask']``: mask of training set
            - ``ndata['val_mask']``: mask of validation set
            - ``ndata['test_mask']``: mask of testing set
        """
        assert idx == 0, "This dataset has only one graph"
        return self.graph

    def __len__(self):
        """number of data examples"""
        # The dataset holds exactly one graph (``__getitem__`` only accepts
        # index 0), so its length is 1 rather than ``len(self.graph)``.
        return 1

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 2

    def save(self):
        """save processed data to directory `self.save_path`"""
        graph_path = os.path.join(self.save_path, self.name + '_dgl_graph_{}.bin'.format(self.hash))
        save_graphs(str(graph_path), self.graph)

    def load(self):
        """load processed data from directory `self.save_path`"""
        graph_path = os.path.join(self.save_path, self.name + '_dgl_graph_{}.bin'.format(self.hash))
        graph_list, _ = load_graphs(str(graph_path))
        self.graph = graph_list[0]

    def has_cache(self):
        """check whether there are processed data in `self.save_path`"""
        graph_path = os.path.join(self.save_path, self.name + '_dgl_graph_{}.bin'.format(self.hash))
        return os.path.exists(graph_path)

    def _random_split(self, x, seed=717, train_size=0.7, val_size=0.1):
        """split the dataset into training set, validation set and testing set

        The candidate node indices are shuffled with ``seed``; the first
        ``int(train_size * n)`` of them form the training set, the last
        ``int(val_size * n)`` form the validation set, and whatever lies in
        between forms the testing set. The resulting boolean masks are stored
        in ``self.graph.ndata`` under ``'train_mask'``, ``'val_mask'`` and
        ``'test_mask'``.

        Parameters
        ----------
        x : tensor
            Node features; only ``x.shape[0]`` (the number of nodes) is used.
        seed : int
            Seed of the random permutation. Default: 717
        train_size : float
            Fraction of the candidate nodes used for training. Default: 0.7
        val_size : float
            Fraction of the candidate nodes used for validation. Default: 0.1
        """
        assert 0 <= train_size + val_size <= 1, \
            "The sum of valid training set size and validation set size " \
            "must between 0 and 1 (inclusive)."

        N = x.shape[0]
        index = np.arange(N)
        if self.name == 'amazon':
            # 0-3304 are unlabeled nodes
            index = np.arange(3305, N)

        index = np.random.RandomState(seed).permutation(index)
        num_train = int(train_size * len(index))
        num_val = int(val_size * len(index))
        train_idx = index[:num_train]
        val_idx = index[len(index) - num_val:]
        test_idx = index[num_train:len(index) - num_val]

        # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        train_mask = np.zeros(N, dtype=bool)
        val_mask = np.zeros(N, dtype=bool)
        test_mask = np.zeros(N, dtype=bool)
        train_mask[train_idx] = True
        val_mask[val_idx] = True
        test_mask[test_idx] = True

        self.graph.ndata['train_mask'] = F.tensor(train_mask)
        self.graph.ndata['val_mask'] = F.tensor(val_mask)
        self.graph.ndata['test_mask'] = F.tensor(test_mask)
class FraudYelpDataset(FraudDataset):
    r"""Fraud Yelp Dataset

    Hotel and restaurant reviews from Yelp, each either filtered (spam) or
    recommended (legitimate) by Yelp. The dataset supports a spam review
    detection task, i.e. binary node classification. The raw node features are
    32 handcrafted features from <http://dx.doi.org/10.1145/2783258.2783370>.

    Every node is a review, and reviews are linked by three relations:

    1. R-U-R: it connects reviews posted by the same user
    2. R-S-R: it connects reviews under the same product with the same star rating (1-5 stars)
    3. R-T-R: it connects two reviews under the same product posted in the same month.

    Statistics:

    - Nodes: 45,954
    - Edges:

        - R-U-R: 98,630
        - R-T-R: 1,147,232
        - R-S-R: 6,805,486

    - Classes:

        - Positive (spam): 6,677
        - Negative (legitimate): 39,277

    - Positive-Negative ratio: 1 : 5.9
    - Node feature size: 32

    Parameters
    ----------
    raw_dir : str
        Directory that will hold the downloaded data, or that already
        holds the input data.
        Default: ~/.dgl/
    random_seed : int
        Random seed used when splitting the dataset.
        Default: 717
    train_size : float
        Training set size of the dataset.
        Default: 0.7
    val_size : float
        Validation set size of the dataset; the testing set
        takes the remaining (1 - train_size - val_size).
        Default: 0.1
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.

    Examples
    --------
    >>> dataset = FraudYelpDataset()
    >>> graph = dataset[0]
    >>> num_classes = dataset.num_classes
    >>> feat = graph.ndata['feature']
    >>> label = graph.ndata['label']
    """

    def __init__(self, raw_dir=None, random_seed=717, train_size=0.7,
                 val_size=0.1, force_reload=False, verbose=True):
        # Only the dataset name is fixed here; every other option is
        # forwarded unchanged to FraudDataset.
        super().__init__(
            name='yelp',
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            random_seed=random_seed,
            train_size=train_size,
            val_size=val_size,
        )
class FraudAmazonDataset(FraudDataset):
    r"""Fraud Amazon Dataset

    Product reviews from Amazon's Musical Instruments category. Users with
    more than 80% helpful votes are labelled as benign entities, and users
    with less than 20% helpful votes are labelled as fraudulent entities. The
    dataset supports a fraudulent user detection task, i.e. binary node
    classification. The raw node features are 25 handcrafted features from
    <https://arxiv.org/pdf/2005.10150.pdf>.

    Every node is a user, and users are linked by three relations:

    1. U-P-U : it connects users reviewing at least one same product
    2. U-S-U : it connects users having at least one same star rating within one week
    3. U-V-U : it connects users with top 5% mutual review text similarities (measured by
       TF-IDF) among all users.

    Statistics:

    - Nodes: 11,944
    - Edges:

        - U-P-U: 351,216
        - U-S-U: 7,132,958
        - U-V-U: 2,073,474

    - Classes:

        - Positive (fraudulent): 821
        - Negative (benign): 7,818
        - Unlabeled: 3,305

    - Positive-Negative ratio: 1 : 10.5
    - Node feature size: 25

    Parameters
    ----------
    raw_dir : str
        Directory that will hold the downloaded data, or that already
        holds the input data.
        Default: ~/.dgl/
    random_seed : int
        Random seed used when splitting the dataset.
        Default: 717
    train_size : float
        Training set size of the dataset.
        Default: 0.7
    val_size : float
        Validation set size of the dataset; the testing set
        takes the remaining (1 - train_size - val_size).
        Default: 0.1
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.

    Examples
    --------
    >>> dataset = FraudAmazonDataset()
    >>> graph = dataset[0]
    >>> num_classes = dataset.num_classes
    >>> feat = graph.ndata['feature']
    >>> label = graph.ndata['label']
    """

    def __init__(self, raw_dir=None, random_seed=717, train_size=0.7,
                 val_size=0.1, force_reload=False, verbose=True):
        # Only the dataset name is fixed here; every other option is
        # forwarded unchanged to FraudDataset.
        super().__init__(
            name='amazon',
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            random_seed=random_seed,
            train_size=train_size,
            val_size=val_size,
        )