"""MovieLens dataset"""
import os
import numpy as np
import pandas as pd
from torch import LongTensor, Tensor
from ..base import dgl_warning
from ..convert import heterograph
from .dgl_dataset import DGLDataset
from .utils import (
_get_dgl_url,
download,
extract_archive,
load_graphs,
load_info,
save_graphs,
save_info,
split_dataset,
)
# Genre vocabularies used to one-hot encode movie genres for each dataset
# variant; the list order fixes the column layout of the movie features.
GENRES_ML_100K = [
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
# ml-1m drops the "unknown" genre; ml-10m additionally includes "IMAX".
GENRES_ML_1M = GENRES_ML_100K[1:]
GENRES_ML_10M = GENRES_ML_100K + ["IMAX"]
# PyTorch is an optional backend; record its availability once at import time.
try:
    import torch

    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False


def check_pytorch():
    """Raise ``ModuleNotFoundError`` unless PyTorch is available as the backend."""
    if not HAS_TORCH:
        raise ModuleNotFoundError(
            "MovieLensDataset requires PyTorch to be the backend."
        )
class MovieLensDataset(DGLDataset):
    r"""MovieLens dataset for edge prediction tasks.

    The raw datasets are extracted from
    `MovieLens <https://grouplens.org/datasets/movielens/>`, introduced by
    `Movielens unplugged: experiences with an occasionally connected recommender
    system <https://dl.acm.org/doi/10.1145/604045.604094>`.
    The datasets consist of user ratings for movies and incorporate additional
    user/movie information in the form of features. The nodes represent users
    and movies, and the edges store ratings that users assign to movies.

    Statistics:

    MovieLens-100K (ml-100k)

    - Users: 943
    - Movies: 1,682
    - Ratings: 100,000 (1, 2, 3, 4, 5)

    MovieLens-1M (ml-1m)

    - Users: 6,040
    - Movies: 3,706
    - Ratings: 1,000,209 (1, 2, 3, 4, 5)

    MovieLens-10M (ml-10m)

    - Users: 69,878
    - Movies: 10,677
    - Ratings: 10,000,054 (0.5, 1, 1.5, ..., 4.5, 5.0)

    Parameters
    ----------
    name : str
        Dataset name. (:obj:`"ml-100k"`, :obj:`"ml-1m"`, :obj:`"ml-10m"`).
    valid_ratio : float
        Ratio of validation samples out of the whole dataset. Should be in (0.0, 1.0).
    test_ratio : float, optional
        Ratio of testing samples out of the whole dataset. Should be in (0.0, 1.0). And its sum with
        :obj:`valid_ratio` should be in (0.0, 1.0) as well. This parameter is invalid
        when :obj:`name` is :obj:`"ml-100k"`, since its testing samples are pre-specified.
        Default: None
    raw_dir : str, optional
        Raw file directory to download/store the data.
        Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download(if the dataset has not been downloaded) and re-process the dataset.
        Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
    random_state : int, optional
        Random seed used for random dataset split. Default: 0

    Notes
    -----
    - When :obj:`name` is :obj:`"ml-100k"`, the :obj:`test_ratio` is invalid, and the
      training ratio is equal to 1-:obj:`valid_ratio`. When :obj:`name` is :obj:`"ml-1m"`
      or :obj:`"ml-10m"`, the :obj:`test_ratio` is valid, and the training ratio is
      equal to 1-:obj:`valid_ratio`-:obj:`test_ratio`.
    - The number of edges is doubled to form an undirected(bidirected) graph structure.

    Examples
    --------
    >>> from dgl.data import MovieLensDataset
    >>> dataset = MovieLensDataset(name='ml-100k', valid_ratio=0.2)
    >>> g = dataset[0]
    >>> g
    Graph(num_nodes={'movie': 1682, 'user': 943},
          num_edges={('movie', 'movie-user', 'user'): 100000, ('user', 'user-movie', 'movie'): 100000},
          metagraph=[('movie', 'user', 'movie-user'), ('user', 'movie', 'user-movie')])
    >>> # get ratings of edges in the training graph.
    >>> rate = g.edges['user-movie'].data['rate'] # or rate = g.edges['movie-user'].data['rate']
    >>> rate
    tensor([5., 5., 3., ..., 3., 3., 5.])
    >>> # get train, valid and test mask of edges
    >>> train_mask = g.edges['user-movie'].data['train_mask']
    >>> valid_mask = g.edges['user-movie'].data['valid_mask']
    >>> test_mask = g.edges['user-movie'].data['test_mask']
    >>> # get train, valid and test ratings
    >>> train_ratings = rate[train_mask]
    >>> valid_ratings = rate[valid_mask]
    >>> test_ratings = rate[test_mask]
    >>> # get input features of users
    >>> g.nodes["user"].data["feat"] # or g.nodes["movie"].data["feat"] for movie nodes
    tensor([[0.4800, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [1.0600, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.4600, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            ...,
            [0.4000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.9600, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.4400, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000]])
    """

    # Relative download paths under the DGL data server, keyed by dataset name.
    _url = {
        "ml-100k": "dataset/ml-100k.zip",
        "ml-1m": "dataset/ml-1m.zip",
        "ml-10m": "dataset/ml-10m.zip",
    }
def __init__(
self,
name,
valid_ratio,
test_ratio=None,
raw_dir=None,
force_reload=None,
verbose=None,
transform=None,
random_state=0,
):
check_pytorch()
assert name in [
"ml-100k",
"ml-1m",
"ml-10m",
], f"currently movielens does not support {name}"
# test regarding valid and test split ratio
assert (
valid_ratio > 0.0 and valid_ratio < 1.0
), f"valid_ratio {valid_ratio} must be in (0.0, 1.0)"
if name in ["ml-1m", "ml-10m"]:
assert (
test_ratio is not None and test_ratio > 0.0 and test_ratio < 1.0
), f"test_ratio({test_ratio}) must be set to a value in (0.0, 1.0) when using ml-1m and ml-10m"
assert (
test_ratio + valid_ratio > 0.0
and test_ratio + valid_ratio < 1.0
), f"test_ratio({test_ratio}) + valid_ratio({valid_ratio}) must be set to (0.0, 1.0) when using ml-1m and ml-10m"
if name == "ml-100k" and test_ratio is not None:
dgl_warning(
f"test_ratio ({test_ratio}) is not set to None for ml-100k. "
"Note that dataset split would not be affected by the test_ratio since "
"testing samples of ml-100k have been pre-specified."
)
self.valid_ratio = valid_ratio
self.test_ratio = test_ratio
self.random_state = random_state
if name == "ml-100k":
self.genres = GENRES_ML_100K
elif name == "ml-1m":
self.genres = GENRES_ML_1M
elif name == "ml-10m":
self.genres = GENRES_ML_10M
else:
raise NotImplementedError
super(MovieLensDataset, self).__init__(
name=name,
url=_get_dgl_url(self._url[name]),
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)
def check_version(self):
valid_ratio, test_ratio = load_info(self.version_path)
if self.valid_ratio == valid_ratio and (
self.test_ratio == test_ratio if self.name != "ml-100k" else True
):
return True
else:
if self.name == "ml-100k":
print(
f"The current valid ratio ({self.valid_ratio}) "
"is not the same as the last setting "
f"(valid: {valid_ratio}). "
f"MovieLens {self.name} will be re-processed with the new dataset split setting."
)
else:
print(
f"At least one of current valid ({self.valid_ratio}) and test ({self.test_ratio}) ratio "
"are not the same as the last setting "
f"(valid: {valid_ratio}, test: {test_ratio}). "
f"MovieLens {self.name} will be re-processed with the new dataset split setting."
)
return False
def download(self):
zip_file_path = os.path.join(self.raw_dir, self.name + ".zip")
download(self.url, path=zip_file_path)
extract_archive(zip_file_path, self.raw_dir, overwrite=True)
    def process(self):
        """Build the rating heterograph from the raw files.

        Pipeline: load the precomputed movie feature matrix, split the
        ratings into train/valid/test, drop users/movies that never appear
        in a rating, remap raw ids to contiguous node indices, and assemble
        the bidirected graph with split masks (stored in ``self.graph``).
        """
        print(f"Starting processing {self.name} ...")
        # 0. loading movie features
        movie_feat = load_info(
            os.path.join(self.raw_path, "movie_feat.pkl")
        ).to(torch.float)
        # 1. dataset split: train + (valid + ) test
        if self.name == "ml-100k":
            # ml-100k ships a fixed train/test split (u1.base / u1.test);
            # only the validation set is carved out of the training file.
            train_rating_data = self._load_raw_rates(
                os.path.join(self.raw_path, "u1.base"), "\t"
            )
            test_rating_data = self._load_raw_rates(
                os.path.join(self.raw_path, "u1.test"), "\t"
            )
            indices = np.arange(len(train_rating_data))
            train, valid, _ = split_dataset(
                indices,
                [1 - self.valid_ratio, self.valid_ratio, 0.0],
                shuffle=True,
                random_state=self.random_state,
            )
            train_rating_data, valid_rating_data = (
                train_rating_data.iloc[train.indices],
                train_rating_data.iloc[valid.indices],
            )
            all_rating_data = pd.concat(
                [train_rating_data, valid_rating_data, test_rating_data]
            )
        elif self.name == "ml-1m" or self.name == "ml-10m":
            # ml-1m / ml-10m: split the single ratings file three ways.
            all_rating_data = self._load_raw_rates(
                os.path.join(self.raw_path, "ratings.dat"), "::"
            )
            indices = np.arange(len(all_rating_data))
            train, valid, test = split_dataset(
                indices,
                [
                    1 - self.valid_ratio - self.test_ratio,
                    self.valid_ratio,
                    self.test_ratio,
                ],
                shuffle=True,
                random_state=self.random_state,
            )
            train_rating_data, valid_rating_data, test_rating_data = (
                all_rating_data.iloc[train.indices],
                all_rating_data.iloc[valid.indices],
                all_rating_data.iloc[test.indices],
            )
        # 2. load user and movie data, and drop those unseen in rating_data
        user_data = self._load_raw_user_data()
        movie_data = self._load_raw_movie_data()
        user_data = self._drop_unseen_nodes(
            data_df=user_data,
            col_name="id",
            reserved_ids_set=set(all_rating_data["user_id"].values),
        )
        movie_data = self._drop_unseen_nodes(
            data_df=movie_data,
            col_name="id",
            reserved_ids_set=set(all_rating_data["movie_id"].values),
        )
        user_feat = Tensor(self._process_user_feat(user_data))
        # 3. generate rating pairs
        # Map user/movie to the global id
        self._global_user_id_map = {
            ele: i for i, ele in enumerate(user_data["id"])
        }
        self._global_movie_id_map = {
            ele: i for i, ele in enumerate(movie_data["id"])
        }
        # pair value is idx rather than id
        u_indices, v_indices, labels = self._generate_pair_value(
            all_rating_data
        )
        all_rating_pairs = (
            LongTensor(u_indices),
            LongTensor(v_indices),
        )
        all_rating_values = Tensor(labels)
        graph = self.construct_g(
            all_rating_pairs, all_rating_values, user_feat, movie_feat
        )
        self.graph = self.add_masks(
            graph, train_rating_data, valid_rating_data, test_rating_data
        )
        print(f"End processing {self.name} ...")
def construct_g(self, rate_pairs, rate_values, user_feat, movie_feat):
g = heterograph(
{
("user", "user-movie", "movie"): (rate_pairs[0], rate_pairs[1]),
("movie", "movie-user", "user"): (rate_pairs[1], rate_pairs[0]),
}
)
ndata = {"user": user_feat, "movie": movie_feat}
edata = {"user-movie": rate_values, "movie-user": rate_values}
g.ndata["feat"] = ndata
g.edata["rate"] = edata
return g
def add_masks(
self, g, train_rating_data, valid_rating_data, test_rating_data
):
train_u_indices, train_v_indices, _ = self._generate_pair_value(
train_rating_data
)
valid_u_indices, valid_v_indices, _ = self._generate_pair_value(
valid_rating_data
)
test_u_indices, test_v_indices, _ = self._generate_pair_value(
test_rating_data
)
# user-movie
train_mask = torch.zeros((g.num_edges("user-movie"),), dtype=torch.bool)
train_mask[
g.edge_ids(train_u_indices, train_v_indices, etype="user-movie")
] = True
valid_mask = torch.zeros((g.num_edges("user-movie"),), dtype=torch.bool)
valid_mask[
g.edge_ids(valid_u_indices, valid_v_indices, etype="user-movie")
] = True
test_mask = torch.zeros((g.num_edges("user-movie"),), dtype=torch.bool)
test_mask[
g.edge_ids(test_u_indices, test_v_indices, etype="user-movie")
] = True
g.edges["user-movie"].data["train_mask"] = train_mask
g.edges["user-movie"].data["valid_mask"] = valid_mask
g.edges["user-movie"].data["test_mask"] = test_mask
# movie-user
train_mask_rev = torch.zeros(
(g.num_edges("movie-user"),), dtype=torch.bool
)
train_mask_rev[
g.edge_ids(train_v_indices, train_u_indices, etype="movie-user")
] = True
valid_mask_rev = torch.zeros(
(g.num_edges("movie-user"),), dtype=torch.bool
)
valid_mask_rev[
g.edge_ids(valid_v_indices, valid_u_indices, etype="movie-user")
] = True
test_mask_rev = torch.zeros(
(g.num_edges("movie-user"),), dtype=torch.bool
)
test_mask_rev[
g.edge_ids(test_v_indices, test_u_indices, etype="movie-user")
] = True
g.edges["movie-user"].data["train_mask"] = train_mask_rev
g.edges["movie-user"].data["valid_mask"] = valid_mask_rev
g.edges["movie-user"].data["test_mask"] = test_mask_rev
return g
def has_cache(self):
if (
os.path.exists(self.graph_path)
and os.path.exists(self.version_path)
and self.check_version()
):
return True
return False
    def save(self):
        """Cache the processed graph and record the split ratios used,
        so :meth:`check_version` can validate the cache later."""
        save_graphs(self.graph_path, [self.graph])
        save_info(self.version_path, [self.valid_ratio, self.test_ratio])
        if self.verbose:
            print(f"Done saving data into {self.raw_path}.")
def load(self):
g_list, _ = load_graphs(self.graph_path)
self.graph = g_list[0]
"""
To avoid the problem each time loading boolean tensor from the disk, boolean values
would be automatically converted into torch.uint8 types, and a deprecation warning would
be raised for using torch.uint8
"""
for e in self.graph.etypes:
self.graph.edges[e].data["train_mask"] = (
self.graph.edges[e].data["train_mask"].to(torch.bool)
)
self.graph.edges[e].data["valid_mask"] = (
self.graph.edges[e].data["valid_mask"].to(torch.bool)
)
self.graph.edges[e].data["test_mask"] = (
self.graph.edges[e].data["test_mask"].to(torch.bool)
)
[docs] def __getitem__(self, idx):
assert (
idx == 0
), "This dataset has only one set of training, validation and testing graph"
if self._transform is None:
return self.graph
else:
return self._transform(self.graph)
    def __len__(self):
        """The dataset holds exactly one graph."""
        return 1
    @property
    def raw_path(self):
        """Directory containing the extracted raw files: ``<raw_dir>/<name>``."""
        return os.path.join(self.raw_dir, self.name)
@property
def graph_path(self):
return os.path.join(self.raw_path, self.name + ".bin")
@property
def version_path(self):
return os.path.join(self.raw_path, self.name + "_version.pkl")
def _process_user_feat(self, user_data):
if self.name == "ml-100k" or self.name == "ml-1m":
ages = user_data["age"].values.astype(np.float32)
gender = (user_data["gender"] == "F").values.astype(np.float32)
all_occupations = set(user_data["occupation"])
occupation_map = {ele: i for i, ele in enumerate(all_occupations)}
occupation_one_hot = np.zeros(
shape=(user_data.shape[0], len(all_occupations)),
dtype=np.float32,
)
occupation_one_hot[
np.arange(user_data.shape[0]),
np.array(
[occupation_map[ele] for ele in user_data["occupation"]]
),
] = 1
user_features = np.concatenate(
[
ages.reshape((user_data.shape[0], 1)) / 50.0,
gender.reshape((user_data.shape[0], 1)),
occupation_one_hot,
],
axis=1,
)
elif self.name == "ml-10m":
user_features = np.zeros(
shape=(user_data.shape[0], 1), dtype=np.float32
)
else:
raise NotImplementedError
return user_features
def _load_raw_user_data(self):
if self.name == "ml-100k":
user_data = pd.read_csv(
os.path.join(self.raw_path, "u.user"),
sep="|",
header=None,
names=["id", "age", "gender", "occupation", "zip_code"],
engine="python",
)
elif self.name == "ml-1m":
user_data = pd.read_csv(
os.path.join(self.raw_path, "users.dat"),
sep="::",
header=None,
names=["id", "gender", "age", "occupation", "zip_code"],
engine="python",
)
elif self.name == "ml-10m":
rating_info = pd.read_csv(
os.path.join(self.raw_path, "ratings.dat"),
sep="::",
header=None,
names=["user_id", "movie_id", "rating", "timestamp"],
dtype={
"user_id": np.int32,
"movie_id": np.int32,
"ratings": np.float32,
"timestamp": np.int64,
},
engine="python",
)
user_data = pd.DataFrame(
np.unique(rating_info["user_id"].values.astype(np.int32)),
columns=["id"],
)
else:
raise NotImplementedError
return user_data
def _load_raw_movie_data(self):
file_path = os.path.join(self.raw_path, "u.item")
if self.name == "ml-100k":
movie_data = pd.read_csv(
file_path,
sep="|",
header=None,
names=[
"id",
"title",
"release_date",
"video_release_date",
"url",
]
+ GENRES_ML_100K,
engine="python",
encoding="ISO-8859-1",
)
elif self.name == "ml-1m" or self.name == "ml-10m":
file_path = os.path.join(self.raw_path, "movies.dat")
movie_data = pd.read_csv(
file_path,
sep="::",
header=None,
names=["id", "title", "genres"],
encoding="iso-8859-1",
engine="python",
)
genre_map = {ele: i for i, ele in enumerate(self.genres)}
genre_map["Children's"] = genre_map["Children"]
genre_map["Childrens"] = genre_map["Children"]
movie_genres = np.zeros(
shape=(movie_data.shape[0], len(self.genres)), dtype=np.float32
)
for i, genres in enumerate(movie_data["genres"]):
for ele in genres.split("|"):
if ele in genre_map:
movie_genres[i, genre_map[ele]] = 1.0
else:
movie_genres[i, genre_map["unknown"]] = 1.0
for idx, genre_name in enumerate(self.genres):
movie_data[genre_name] = movie_genres[:, idx]
movie_data = movie_data.drop(columns=["genres"])
else:
raise NotImplementedError
return movie_data
def _load_raw_rates(self, file_path, sep):
rating_data = pd.read_csv(
file_path,
sep=sep,
header=None,
names=["user_id", "movie_id", "rating", "timestamp"],
dtype={
"user_id": np.int32,
"movie_id": np.int32,
"ratings": np.float32,
"timestamp": np.int64,
},
engine="python",
)
rating_data = rating_data.reset_index(drop=True)
return rating_data
def _drop_unseen_nodes(self, data_df, col_name, reserved_ids_set):
data_df = data_df[data_df[col_name].isin(reserved_ids_set)]
data_df.reset_index(drop=True, inplace=True)
return data_df
def _generate_pair_value(self, rating_data):
rating_pairs = (
np.array(
[
self._global_user_id_map[ele]
for ele in rating_data["user_id"]
],
dtype=np.int32,
),
np.array(
[
self._global_movie_id_map[ele]
for ele in rating_data["movie_id"]
],
dtype=np.int32,
),
)
rating_values = rating_data["rating"].values.astype(np.float32)
return rating_pairs[0], rating_pairs[1], rating_values
def __repr__(self):
return (
f'Dataset("{self.name}", num_graphs={len(self)},'
+ f" save_path={self.raw_path}), valid_ratio={self.valid_ratio}, test_ratio={self.test_ratio}"
)