"""Fraud Dataset
"""
import os

import numpy as np
from scipy import io

from .. import backend as F
from ..convert import heterograph
from .dgl_dataset import DGLBuiltinDataset
from .utils import _get_dgl_url, load_graphs, save_graphs



class FraudDataset(DGLBuiltinDataset):
    r"""Fraud node prediction dataset.

    The dataset includes two multi-relational graphs extracted from Yelp and
    Amazon where nodes represent fraudulent reviews or fraudulent reviewers.

    It was first proposed in a CIKM'20 paper
    <https://arxiv.org/pdf/2008.08692.pdf> and has been used by a recent
    WWW'21 paper <https://ponderly.github.io/pub/PCGNN_WWW2021.pdf> as a
    benchmark. Another paper <https://arxiv.org/pdf/2104.01404.pdf> also
    takes the dataset as an example to study non-homophilous graphs.

    This dataset is built upon industrial data and has rich relational
    information and unique properties like class imbalance and feature
    inconsistency, which makes it a good instance to investigate how GNNs
    perform on real-world noisy graphs. These graphs are bidirected and
    contain no self-loops.

    Reference: <https://github.com/YingtongDou/CARE-GNN>

    Parameters
    ----------
    name : str
        Name of the dataset, either ``'yelp'`` or ``'amazon'``.
    raw_dir : str
        Directory that will store the downloaded data or that already
        stores the input data. Default: ~/.dgl/
    random_seed : int
        Random seed used to split the dataset. Default: 717
    train_size : float
        Training set size of the dataset. Default: 0.7
    val_size : float
        Validation set size of the dataset; the size of the testing set is
        ``1 - train_size - val_size``. Default: 0.1
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and
        returns a transformed version. The :class:`~dgl.DGLGraph` object
        will be transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of label classes
    graph : dgl.DGLGraph
        Graph structure, node features, node labels and splitting masks
    seed : int
        Random seed used to split the dataset
    train_size : float
        Training set size of the dataset
    val_size : float
        Validation set size of the dataset

    Examples
    --------
    >>> dataset = FraudDataset('yelp')
    >>> graph = dataset[0]
    >>> num_classes = dataset.num_classes
    >>> feat = graph.ndata['feature']
    >>> label = graph.ndata['label']
    """

    file_urls = {
        "yelp": "dataset/FraudYelp.zip",
        "amazon": "dataset/FraudAmazon.zip",
    }
    relations = {
        "yelp": ["net_rsr", "net_rtr", "net_rur"],
        "amazon": ["net_upu", "net_usu", "net_uvu"],
    }
    file_names = {"yelp": "YelpChi.mat", "amazon": "Amazon.mat"}
    node_name = {"yelp": "review", "amazon": "user"}

    def __init__(
        self,
        name,
        raw_dir=None,
        random_seed=717,
        train_size=0.7,
        val_size=0.1,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        assert name in ["yelp", "amazon"], "only supports 'yelp' or 'amazon'"
        url = _get_dgl_url(self.file_urls[name])
        self.seed = random_seed
        self.train_size = train_size
        self.val_size = val_size
        super(FraudDataset, self).__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            hash_key=(random_seed, train_size, val_size),
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Process raw data into graph, labels and splitting masks."""
        file_path = os.path.join(self.raw_path, self.file_names[self.name])
        data = io.loadmat(file_path)
        node_features = data["features"].todense()
        # remove additional dimension of length 1 in raw .mat file
        node_labels = data["label"].squeeze()

        graph_data = {}
        for relation in self.relations[self.name]:
            adj = data[relation].tocoo()
            row, col = adj.row, adj.col
            graph_data[
                (self.node_name[self.name], relation, self.node_name[self.name])
            ] = (row, col)
        g = heterograph(graph_data)

        g.ndata["feature"] = F.tensor(
            node_features, dtype=F.data_type_dict["float32"]
        )
        g.ndata["label"] = F.tensor(
            node_labels, dtype=F.data_type_dict["int64"]
        )
        self.graph = g

        self._random_split(
            g.ndata["feature"], self.seed, self.train_size, self.val_size
        )

    def __getitem__(self, idx):
        r"""Get graph object

        Parameters
        ----------
        idx : int
            Item index

        Returns
        -------
        :class:`dgl.DGLGraph`
            graph structure, node features, node labels and masks

            - ``ndata['feature']``: node features
            - ``ndata['label']``: node labels
            - ``ndata['train_mask']``: mask of training set
            - ``ndata['val_mask']``: mask of validation set
            - ``ndata['test_mask']``: mask of testing set
        """
        assert idx == 0, "This dataset has only one graph"
        if self._transform is None:
            return self.graph
        else:
            return self._transform(self.graph)

    def __len__(self):
        """Number of data examples; the dataset holds a single graph."""
        return 1

    @property
    def num_classes(self):
        """Number of classes.

        Returns
        -------
        int
        """
        return 2

    def save(self):
        """Save processed data to directory `self.save_path`."""
        graph_path = os.path.join(
            self.save_path, self.name + "_dgl_graph_{}.bin".format(self.hash)
        )
        save_graphs(str(graph_path), self.graph)

    def load(self):
        """Load processed data from directory `self.save_path`."""
        graph_path = os.path.join(
            self.save_path, self.name + "_dgl_graph_{}.bin".format(self.hash)
        )
        graph_list, _ = load_graphs(str(graph_path))
        g = graph_list[0]
        self.graph = g

    def has_cache(self):
        """Check whether there is processed data in `self.save_path`."""
        graph_path = os.path.join(
            self.save_path, self.name + "_dgl_graph_{}.bin".format(self.hash)
        )
        return os.path.exists(graph_path)

    def _random_split(self, x, seed=717, train_size=0.7, val_size=0.1):
        """Split the dataset into training, validation and testing sets."""
        assert 0 <= train_size + val_size <= 1, (
            "The sum of training set size and validation set size "
            "must be between 0 and 1 (inclusive)."
        )
        N = x.shape[0]
        index = np.arange(N)
        if self.name == "amazon":
            # nodes 0-3304 are unlabeled and excluded from all splits
            index = np.arange(3305, N)

        index = np.random.RandomState(seed).permutation(index)
        train_idx = index[: int(train_size * len(index))]
        val_idx = index[len(index) - int(val_size * len(index)) :]
        test_idx = index[
            int(train_size * len(index)) : len(index)
            - int(val_size * len(index))
        ]
        train_mask = np.zeros(N, dtype=np.bool_)
        val_mask = np.zeros(N, dtype=np.bool_)
        test_mask = np.zeros(N, dtype=np.bool_)
        train_mask[train_idx] = True
        val_mask[val_idx] = True
        test_mask[test_idx] = True
        self.graph.ndata["train_mask"] = F.tensor(train_mask)
        self.graph.ndata["val_mask"] = F.tensor(val_mask)
        self.graph.ndata["test_mask"] = F.tensor(test_mask)
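
# A minimal, self-contained sketch (not part of the dataset class) of the
# index arithmetic used by ``_random_split`` above: a permuted index array is
# cut into a leading training slice and a trailing validation slice, with the
# middle kept for testing. Runnable with numpy alone; the sizes are
# illustrative, and the function name is hypothetical.
def _example_split_arithmetic(N=100, seed=717, train_size=0.7, val_size=0.1):
    """Hypothetical helper: reproduce the train/val/test slicing on 0..N-1."""
    index = np.random.RandomState(seed).permutation(np.arange(N))
    n_train = int(train_size * len(index))
    n_val = int(val_size * len(index))
    train_idx = index[:n_train]
    val_idx = index[len(index) - n_val :]
    test_idx = index[n_train : len(index) - n_val]
    # the three slices are disjoint and together cover every index exactly once
    assert len(set(train_idx) | set(val_idx) | set(test_idx)) == N
    return train_idx, val_idx, test_idx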

class FraudYelpDataset(FraudDataset):
    r"""Fraud Yelp Dataset

    The Yelp dataset includes hotel and restaurant reviews filtered (spam)
    and recommended (legitimate) by Yelp. A spam review detection task can be
    conducted on it, which is a binary classification task. 32 handcrafted
    features from <http://dx.doi.org/10.1145/2783258.2783370> are taken as
    the raw node features.

    Reviews are nodes in the graph, and the three relations are:

    1. R-U-R: it connects reviews posted by the same user
    2. R-S-R: it connects reviews under the same product with the same
       star rating (1-5 stars)
    3. R-T-R: it connects two reviews under the same product posted in
       the same month

    Statistics:

    - Nodes: 45,954
    - Edges:

        - R-U-R: 98,630
        - R-T-R: 1,147,232
        - R-S-R: 6,805,486

    - Classes:

        - Positive (spam): 6,677
        - Negative (legitimate): 39,277

    - Positive-Negative ratio: 1 : 5.9
    - Node feature size: 32

    Parameters
    ----------
    raw_dir : str
        Directory that will store the downloaded data or that already
        stores the input data. Default: ~/.dgl/
    random_seed : int
        Random seed used to split the dataset. Default: 717
    train_size : float
        Training set size of the dataset. Default: 0.7
    val_size : float
        Validation set size of the dataset; the size of the testing set is
        ``1 - train_size - val_size``. Default: 0.1
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and
        returns a transformed version. The :class:`~dgl.DGLGraph` object
        will be transformed before every access.

    Examples
    --------
    >>> dataset = FraudYelpDataset()
    >>> graph = dataset[0]
    >>> num_classes = dataset.num_classes
    >>> feat = graph.ndata['feature']
    >>> label = graph.ndata['label']
    """

    def __init__(
        self,
        raw_dir=None,
        random_seed=717,
        train_size=0.7,
        val_size=0.1,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        super(FraudYelpDataset, self).__init__(
            name="yelp",
            raw_dir=raw_dir,
            random_seed=random_seed,
            train_size=train_size,
            val_size=val_size,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
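
# A hedged usage sketch for ``FraudYelpDataset``: the returned graph is a
# multi-relational heterograph, so each relation can be inspected or pulled
# out as a subgraph with standard DGL calls. Running it downloads the dataset,
# so it is not executed at import time; the function name is hypothetical.
def _example_yelp_relations():
    """Hypothetical helper: inspect the three Yelp relations."""
    dataset = FraudYelpDataset()
    graph = dataset[0]
    for relation in ["net_rsr", "net_rtr", "net_rur"]:
        print(relation, graph.num_edges(etype=relation))
    # single-relation view, e.g. for training a relation-specific model
    rur_graph = graph.edge_type_subgraph(["net_rur"])
    return rur_graph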

class FraudAmazonDataset(FraudDataset):
    r"""Fraud Amazon Dataset

    The Amazon dataset includes product reviews under the Musical Instruments
    category. Users with more than 80% helpful votes are labelled as benign
    entities and users with less than 20% helpful votes are labelled as
    fraudulent entities. A fraudulent user detection task can be conducted on
    the Amazon dataset, which is a binary classification task. 25 handcrafted
    features from <https://arxiv.org/pdf/2005.10150.pdf> are taken as the raw
    node features.

    Users are nodes in the graph, and the three relations are:

    1. U-P-U: it connects users reviewing at least one same product
    2. U-S-U: it connects users having at least one same star rating within
       one week
    3. U-V-U: it connects users with top 5% mutual review text similarities
       (measured by TF-IDF) among all users

    Statistics:

    - Nodes: 11,944
    - Edges:

        - U-P-U: 351,216
        - U-S-U: 7,132,958
        - U-V-U: 2,073,474

    - Classes:

        - Positive (fraudulent): 821
        - Negative (benign): 7,818
        - Unlabeled: 3,305

    - Positive-Negative ratio: 1 : 10.5
    - Node feature size: 25

    Parameters
    ----------
    raw_dir : str
        Directory that will store the downloaded data or that already
        stores the input data. Default: ~/.dgl/
    random_seed : int
        Random seed used to split the dataset. Default: 717
    train_size : float
        Training set size of the dataset. Default: 0.7
    val_size : float
        Validation set size of the dataset; the size of the testing set is
        ``1 - train_size - val_size``. Default: 0.1
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and
        returns a transformed version. The :class:`~dgl.DGLGraph` object
        will be transformed before every access.

    Examples
    --------
    >>> dataset = FraudAmazonDataset()
    >>> graph = dataset[0]
    >>> num_classes = dataset.num_classes
    >>> feat = graph.ndata['feature']
    >>> label = graph.ndata['label']
    """

    def __init__(
        self,
        raw_dir=None,
        random_seed=717,
        train_size=0.7,
        val_size=0.1,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        super(FraudAmazonDataset, self).__init__(
            name="amazon",
            raw_dir=raw_dir,
            random_seed=random_seed,
            train_size=train_size,
            val_size=val_size,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
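
# A hedged usage sketch for ``FraudAmazonDataset``: nodes 0-3304 are unlabeled,
# so ``_random_split`` leaves them out of every mask. This check assumes the
# PyTorch backend (for ``.numpy()``) and, if run, downloads the dataset; the
# function name is hypothetical.
def _example_amazon_unlabeled_check():
    """Hypothetical helper: verify that unlabeled nodes appear in no split."""
    dataset = FraudAmazonDataset()
    graph = dataset[0]
    train_mask = graph.ndata["train_mask"].numpy()
    val_mask = graph.ndata["val_mask"].numpy()
    test_mask = graph.ndata["test_mask"].numpy()
    # the unlabeled nodes 0-3304 should be excluded from all three splits
    assert not (train_mask[:3305] | val_mask[:3305] | test_mask[:3305]).any()
    labels = graph.ndata["label"].numpy()
    # class counts over labeled nodes only (positive: fraudulent)
    print("positive:", int((labels[3305:] == 1).sum()),
          "negative:", int((labels[3305:] == 0).sum()))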