"""RDF datasets
Datasets from "A Collection of Benchmark Datasets for
Systematic Evaluations of Machine Learning on
the Semantic Web"
"""
import os
from collections import OrderedDict
import itertools
import abc
import re
try:
import rdflib as rdf
except ImportError:
pass
import networkx as nx
import numpy as np
import dgl
import dgl.backend as F
from .dgl_dataset import DGLBuiltinDataset
from .utils import save_graphs, load_graphs, save_info, load_info, _get_dgl_url
from .utils import generate_mask_tensor, idx2mask, deprecate_property, deprecate_class
__all__ = ['AIFB', 'MUTAG', 'BGS', 'AM', 'AIFBDataset', 'MUTAGDataset', 'BGSDataset', 'AMDataset']
# Dictionary for renaming reserved node/edge type names to the ones
# that are allowed by nn.Module.
RENAME_DICT = {
'type' : 'rdftype',
'rev-type' : 'rev-rdftype',
}
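# For example, RENAME_DICT.get('type', 'type') returns 'rdftype', while a
# name that needs no renaming passes through unchanged.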
class Entity:
"""Class for entities
Parameters
----------
id : str
ID of this entity
cls : str
Type of this entity
"""
def __init__(self, e_id, cls):
self.id = e_id
self.cls = cls
def __str__(self):
return '{}/{}'.format(self.cls, self.id)
class Relation:
"""Class for relations
Parameters
----------
cls : str
Type of this relation
"""
def __init__(self, cls):
self.cls = cls
def __str__(self):
return str(self.cls)
class RDFGraphDataset(DGLBuiltinDataset):
"""Base graph dataset class from RDF tuples.
To derive from this, implement the following abstract methods:
* ``parse_entity``
* ``parse_relation``
* ``process_tuple``
* ``process_idx_file_line``
* ``predict_category``
Preprocessed graph and other data will be cached in the download folder
to speed up data loading.
The dataset should contain a "trainingSet.tsv" and a "testSet.tsv" file
for training and testing samples.
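A minimal subclass sketch (``MyRDFDataset`` and everything in it are
hypothetical placeholders, not a shipped dataset):
>>> class MyRDFDataset(RDFGraphDataset):
...     def parse_entity(self, term):
...         return Entity(e_id=str(term), cls='entity')
...     def parse_relation(self, term):
...         return Relation(cls=str(term))
...     def process_tuple(self, raw_tuple, sbj, rel, obj):
...         # keep the tuple only if all three parts parsed successfully
...         if sbj is None or rel is None or obj is None:
...             return None
...         return (sbj, rel, obj)
...     def process_idx_file_line(self, line):
...         sample, _, label = line.strip().split('\t')
...         return sample, label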
Attributes
----------
graph : dgl.DGLGraph
Graph structure
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
labels : Tensor
All the labels of the entities in ``predict_category``
Parameters
----------
name : str
Name of the dataset
url : str
URL from which to download the raw dataset.
predict_category : str
The entity category (node type) that has labels for prediction.
print_every : int, optional
Print progress every ``print_every`` processed tuples. Default: 10000.
insert_reverse : bool, optional
If True, add reverse edges and reverse relations to the final graph.
raw_dir : str
Directory that contains (or will contain after download) the input data.
Default: ~/.dgl/
force_reload : bool, optional
If True, reprocess from raw data and ignore cached preprocessed data.
verbose : bool
Whether to print out progress information. Default: True.
"""
def __init__(self, name, url, predict_category,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
import rdflib as rdf
self._insert_reverse = insert_reverse
self._print_every = print_every
self._predict_category = predict_category
super(RDFGraphDataset, self).__init__(name, url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def process(self):
raw_tuples = self.load_raw_tuples(self.raw_path)
self.process_raw_tuples(raw_tuples, self.raw_path)
def load_raw_tuples(self, root_path):
"""Loading raw RDF dataset
Parameters
----------
root_path : str
Root path containing the data
Returns
-------
Loaded rdf data
"""
raw_rdf_graphs = []
for filename in os.listdir(root_path):
fmt = None
if filename.endswith('.nt'):
fmt = 'nt'
elif filename.endswith('.n3'):
fmt = 'n3'
if fmt is None:
continue
g = rdf.Graph()
print('Parsing file %s ...' % filename)
g.parse(os.path.join(root_path, filename), format=fmt)
raw_rdf_graphs.append(g)
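# iterating an rdflib.Graph yields its (subject, predicate, object)
# triples, so chaining the graphs gives one stream of raw triples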
return itertools.chain(*raw_rdf_graphs)
def process_raw_tuples(self, raw_tuples, root_path):
"""Processing raw RDF dataset
Parameters
----------
raw_tuples:
Raw rdf tuples
root_path: str
Root path containing the data
"""
mg = nx.MultiDiGraph()
ent_classes = OrderedDict()
rel_classes = OrderedDict()
entities = OrderedDict()
src = []
dst = []
ntid = []
etid = []
sorted_tuples = sorted(raw_tuples)
for i, (sbj, pred, obj) in enumerate(sorted_tuples):
if self.verbose and i % self._print_every == 0:
print('Processed %d tuples, found %d valid tuples.' % (i, len(src)))
sbjent = self.parse_entity(sbj)
rel = self.parse_relation(pred)
objent = self.parse_entity(obj)
processed = self.process_tuple((sbj, pred, obj), sbjent, rel, objent)
if processed is None:
# ignored
continue
# meta graph
sbjclsid = _get_id(ent_classes, sbjent.cls)
objclsid = _get_id(ent_classes, objent.cls)
relclsid = _get_id(rel_classes, rel.cls)
mg.add_edge(sbjent.cls, objent.cls, key=rel.cls)
if self._insert_reverse:
mg.add_edge(objent.cls, sbjent.cls, key='rev-%s' % rel.cls)
# instance graph
src_id = _get_id(entities, str(sbjent))
if len(entities) > len(ntid): # found new entity
ntid.append(sbjclsid)
dst_id = _get_id(entities, str(objent))
if len(entities) > len(ntid): # found new entity
ntid.append(objclsid)
src.append(src_id)
dst.append(dst_id)
etid.append(relclsid)
src = np.asarray(src)
dst = np.asarray(dst)
ntid = np.asarray(ntid)
etid = np.asarray(etid)
ntypes = list(ent_classes.keys())
etypes = list(rel_classes.keys())
# add reverse edge with reverse relation
if self._insert_reverse:
if self.verbose:
print('Adding reverse edges ...')
newsrc = np.hstack([src, dst])
newdst = np.hstack([dst, src])
src = newsrc
dst = newdst
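# a reverse edge reuses its forward relation's type ID, offset by the
# number of forward relation types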
etid = np.hstack([etid, etid + len(etypes)])
etypes.extend(['rev-%s' % t for t in etypes])
hg = self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)
if self.verbose:
print('Load training/validation/testing split ...')
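# after dgl.to_heterogeneous, ndata[dgl.NID] holds each node's ID in the
# original homogeneous graph; invert it to map those global IDs back to
# per-type local IDs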
idmap = F.asnumpy(hg.nodes[self.predict_category].data[dgl.NID])
glb2lcl = {glbid : lclid for lclid, glbid in enumerate(idmap)}
def findidfn(ent):
if ent not in entities:
return None
else:
return glb2lcl[entities[ent]]
self._hg = hg
train_idx, test_idx, labels, num_classes = self.load_data_split(findidfn, root_path)
train_mask = idx2mask(train_idx, self._hg.number_of_nodes(self.predict_category))
test_mask = idx2mask(test_idx, self._hg.number_of_nodes(self.predict_category))
labels = F.tensor(labels, F.data_type_dict['int64'])
train_mask = generate_mask_tensor(train_mask)
test_mask = generate_mask_tensor(test_mask)
self._hg.nodes[self.predict_category].data['train_mask'] = train_mask
self._hg.nodes[self.predict_category].data['test_mask'] = test_mask
self._hg.nodes[self.predict_category].data['labels'] = labels
self._num_classes = num_classes
# save for backward compatibility
self._train_idx = F.tensor(train_idx)
self._test_idx = F.tensor(test_idx)
self._labels = labels
def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
"""Build the graphs
Parameters
----------
mg: MultiDiGraph
Input metagraph
src: Numpy array
Source nodes
dst: Numpy array
Destination nodes
ntid: Numpy array
Node types for each node
etid: Numpy array
Edge types for each edge
ntypes: list
Node types
etypes: list
Edge types
Returns
-------
hg: DGLGraph
The constructed heterograph
"""
# create homo graph
if self.verbose:
print('Creating one whole graph ...')
g = dgl.graph((src, dst))
g.ndata[dgl.NTYPE] = F.tensor(ntid)
g.edata[dgl.ETYPE] = F.tensor(etid)
if self.verbose:
print('Total #nodes:', g.number_of_nodes())
print('Total #edges:', g.number_of_edges())
# rename reserved names such as 'type' so that they can be used as keys
# to nn.ModuleDict
etypes = [RENAME_DICT.get(ty, ty) for ty in etypes]
mg_edges = mg.edges(keys=True)
mg = nx.MultiDiGraph()
for sty, dty, ety in mg_edges:
mg.add_edge(sty, dty, key=RENAME_DICT.get(ety, ety))
# convert to heterograph
if self.verbose:
print('Convert to heterograph ...')
hg = dgl.to_heterogeneous(g,
ntypes,
etypes,
metagraph=mg)
if self.verbose:
print('#Node types:', len(hg.ntypes))
print('#Canonical edge types:', len(hg.etypes))
print('#Unique edge type names:', len(set(hg.etypes)))
return hg
def load_data_split(self, ent2id, root_path):
"""Load data split
Parameters
----------
ent2id: func
A function mapping entity to id
root_path: str
Root path containing the data
Returns
-------
train_idx: Numpy array
Training set
test_idx: Numpy array
Testing set
labels: Numpy array
Labels
num_classes: int
Number of classes
"""
label_dict = {}
labels = np.zeros((self._hg.number_of_nodes(self.predict_category),)) - 1
train_idx = self.parse_idx_file(
os.path.join(root_path, 'trainingSet.tsv'),
ent2id, label_dict, labels)
test_idx = self.parse_idx_file(
os.path.join(root_path, 'testSet.tsv'),
ent2id, label_dict, labels)
train_idx = np.array(train_idx)
test_idx = np.array(test_idx)
labels = np.array(labels)
num_classes = len(label_dict)
return train_idx, test_idx, labels, num_classes
def parse_idx_file(self, filename, ent2id, label_dict, labels):
"""Parse idx files
Parameters
----------
filename: str
File to parse
ent2id: func
A function mapping entity to id
label_dict: dict
Map label to label id
labels: dict
Map entity id to label id
Return
------
idx: list
Entity idss
"""
idx = []
with open(filename, 'r') as f:
for i, line in enumerate(f):
if i == 0:
continue # first line is the header
sample, label = self.process_idx_file_line(line)
ent = self.parse_entity(sample)
entid = ent2id(str(ent))
if entid is None:
print('Warning: entity "%s" does not have any valid links associated. Ignored.' % str(ent))
else:
idx.append(entid)
lblid = _get_id(label_dict, label)
labels[entid] = lblid
return idx
def has_cache(self):
"""Check whether processed data exists on disk."""
graph_path = os.path.join(self.save_path,
self.save_name + '.bin')
info_path = os.path.join(self.save_path,
self.save_name + '.pkl')
return os.path.exists(graph_path) and os.path.exists(info_path)
def save(self):
"""save the graph list and the labels"""
graph_path = os.path.join(self.save_path,
self.save_name + '.bin')
info_path = os.path.join(self.save_path,
self.save_name + '.pkl')
save_graphs(str(graph_path), self._hg)
save_info(str(info_path), {'num_classes': self.num_classes,
'predict_category': self.predict_category})
def load(self):
"""load the graph list and the labels from disk"""
graph_path = os.path.join(self.save_path,
self.save_name + '.bin')
info_path = os.path.join(self.save_path,
self.save_name + '.pkl')
graphs, _ = load_graphs(str(graph_path))
info = load_info(str(info_path))
self._num_classes = info['num_classes']
self._predict_category = info['predict_category']
self._hg = graphs[0]
train_mask = self._hg.nodes[self.predict_category].data['train_mask']
test_mask = self._hg.nodes[self.predict_category].data['test_mask']
self._labels = self._hg.nodes[self.predict_category].data['labels']
train_idx = F.nonzero_1d(train_mask)
test_idx = F.nonzero_1d(test_mask)
self._train_idx = train_idx
self._test_idx = test_idx
def __getitem__(self, idx):
r"""Gets the graph object
"""
return self._hg
def __len__(self):
r"""The number of graphs in the dataset."""
return 1
@property
def save_name(self):
return self.name + '_dgl_graph'
@property
def graph(self):
deprecate_property('dataset.graph', 'hg = dataset[0]')
return self._hg
@property
def predict_category(self):
return self._predict_category
@property
def num_classes(self):
return self._num_classes
@property
def train_idx(self):
deprecate_property('dataset.train_idx', 'train_mask = g.ndata[\'train_mask\']')
return self._train_idx
@property
def test_idx(self):
deprecate_property('dataset.test_idx', 'test_mask = g.ndata[\'test_mask\']')
return self._test_idx
@property
def labels(self):
deprecate_property('dataset.labels', 'labels = g.ndata[\'labels\']')
return self._labels
@abc.abstractmethod
def parse_entity(self, term):
"""Parse one entity from an RDF term.
Return None if the term does not represent a valid entity and the
whole tuple should be ignored.
Parameters
----------
term : rdflib.term.Identifier
RDF term
Returns
-------
Entity or None
An entity.
"""
pass
@abc.abstractmethod
def parse_relation(self, term):
"""Parse one relation from an RDF term.
Return None if the term does not represent a valid relation and the
whole tuple should be ignored.
Parameters
----------
term : rdflib.term.Identifier
RDF term
Returns
-------
Relation or None
A relation
"""
pass
@abc.abstractmethod
def process_tuple(self, raw_tuple, sbj, rel, obj):
"""Process the tuple.
Return an (Entity, Relation, Entity) tuple as the final tuple.
Return None if the tuple should be ignored.
Parameters
----------
raw_tuple : tuple of rdflib.term.Identifier
(subject, predicate, object) tuple
sbj : Entity
Subject entity
rel : Relation
Relation
obj : Entity
Object entity
Returns
-------
(Entity, Relation, Entity)
The final tuple, or None if the tuple should be ignored
"""
pass
@abc.abstractmethod
def process_idx_file_line(self, line):
"""Process one line of ``trainingSet.tsv`` or ``testSet.tsv``.
Parameters
----------
line : str
One line of the file
Returns
-------
(str, str)
One sample and its label
"""
pass
def _get_id(id_map, key):
"""Return the ID of ``key`` in ``id_map``, assigning the next sequential
ID if the key is new."""
key_id = id_map.get(key, None)
if key_id is None:
key_id = len(id_map)
id_map[key] = key_id
return key_id
class AIFBDataset(RDFGraphDataset):
r"""AIFB dataset for node classification task
.. deprecated:: 0.5.0
- ``graph`` is deprecated, it is replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
- ``train_idx`` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.predict_category].data['train_mask']
>>> train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
- ``test_idx`` is deprecated, it can be replaced by:
>>> dataset = AIFBDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.predict_category].data['test_mask']
>>> test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
The AIFB dataset is a Semantic Web (RDF) dataset used as a benchmark in
data mining. It records the organizational structure of AIFB at the
University of Karlsruhe.
AIFB dataset statistics:
- Nodes: 7262
- Edges: 48810 (including reverse edges)
- Target Category: Personen
- Number of Classes: 4
- Label Split:
- Train: 140
- Test: 36
Parameters
-----------
print_every: int
Print progress every ``print_every`` processed tuples. Default: 10000.
insert_reverse: bool
If True, add reverse edges and reverse relations to the final graph. Default: True.
raw_dir : str
Directory that contains (or will contain after download) the input data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : :class:`dgl.DGLGraph`
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
>>> dataset = dgl.data.rdf.AIFBDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
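>>> # Optional follow-up (assumes ``import torch as th``): recover index
>>> # tensors from the boolean masks
>>> train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
>>> test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()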
"""
entity_prefix = 'http://www.aifb.uni-karlsruhe.de/'
relation_prefix = 'http://swrc.ontoware.org/'
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
import rdflib as rdf
self.employs = rdf.term.URIRef("http://swrc.ontoware.org/ontology#employs")
self.affiliation = rdf.term.URIRef("http://swrc.ontoware.org/ontology#affiliation")
url = _get_dgl_url('dataset/rdf/aifb-hetero.zip')
name = 'aifb-hetero'
predict_category = 'Personen'
super(AIFBDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
r"""Gets the graph object
Parameters
-----------
idx: int
Item index, AIFBDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['train_mask']``: mask for training node set
- ``ndata['test_mask']``: mask for testing node set
- ``ndata['labels']``: node labels
"""
return super(AIFBDataset, self).__getitem__(idx)
def __len__(self):
r"""The number of graphs in the dataset.
Returns
-------
int
"""
return super(AIFBDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal")
if isinstance(term, rdf.BNode):
return None
entstr = str(term)
if entstr.startswith(self.entity_prefix):
sp = entstr.split('/')
return Entity(e_id=sp[5], cls=sp[3])
else:
return None
def parse_relation(self, term):
if term == self.employs or term == self.affiliation:
return None
relstr = str(term)
if relstr.startswith(self.relation_prefix):
return Relation(cls=relstr.split('/')[3])
else:
relstr = relstr.split('/')[-1]
return Relation(cls=relstr)
def process_tuple(self, raw_tuple, sbj, rel, obj):
if sbj is None or rel is None or obj is None:
return None
return (sbj, rel, obj)
def process_idx_file_line(self, line):
person, _, label = line.strip().split('\t')
return person, label
class AIFB(AIFBDataset):
"""AIFB dataset. Same as AIFBDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('AIFB', 'AIFBDataset')
super(AIFB, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
class MUTAGDataset(RDFGraphDataset):
r"""MUTAG dataset for node classification task
.. deprecated:: 0.5.0
- ``graph`` is deprecated, it is replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
- ``train_idx`` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.predict_category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
- ``test_idx`` is deprecated, it can be replaced by:
>>> dataset = MUTAGDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.predict_category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
MUTAG dataset statistics:
- Nodes: 27163
- Edges: 148100 (including reverse edges)
- Target Category: d
- Number of Classes: 2
- Label Split:
- Train: 272
- Test: 68
Parameters
-----------
print_every: int
Print progress every ``print_every`` processed tuples. Default: 10000.
insert_reverse: bool
If True, add reverse edges and reverse relations to the final graph. Default: True.
raw_dir : str
Directory that contains (or will contain after download) the input data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : :class:`dgl.DGLGraph`
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
>>> dataset = dgl.data.rdf.MUTAGDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
"""
d_entity = re.compile("d[0-9]")
bond_entity = re.compile("bond[0-9]")
entity_prefix = 'http://dl-learner.org/carcinogenesis#'
relation_prefix = entity_prefix
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
import rdflib as rdf
self.is_mutagenic = rdf.term.URIRef("http://dl-learner.org/carcinogenesis#isMutagenic")
self.rdf_type = rdf.term.URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
self.rdf_subclassof = rdf.term.URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
self.rdf_domain = rdf.term.URIRef("http://www.w3.org/2000/01/rdf-schema#domain")
url = _get_dgl_url('dataset/rdf/mutag-hetero.zip')
name = 'mutag-hetero'
predict_category = 'd'
super(MUTAGDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
r"""Gets the graph object
Parameters
-----------
idx: int
Item index, MUTAGDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['train_mask']``: mask for training node set
- ``ndata['test_mask']``: mask for testing node set
- ``ndata['labels']``: node labels
"""
return super(MUTAGDataset, self).__getitem__(idx)
def __len__(self):
r"""The number of graphs in the dataset.
Returns
-------
int
"""
return super(MUTAGDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return Entity(e_id=str(term), cls="_Literal")
elif isinstance(term, rdf.BNode):
return None
entstr = str(term)
if entstr.startswith(self.entity_prefix):
inst = entstr[len(self.entity_prefix):]
if self.d_entity.match(inst):
cls = 'd'
elif self.bond_entity.match(inst):
cls = 'bond'
else:
cls = None
return Entity(e_id=inst, cls=cls)
else:
return None
def parse_relation(self, term):
if term == self.is_mutagenic:
return None
relstr = str(term)
if relstr.startswith(self.relation_prefix):
cls = relstr[len(self.relation_prefix):]
return Relation(cls=cls)
else:
relstr = relstr.split('/')[-1]
return Relation(cls=relstr)
def process_tuple(self, raw_tuple, sbj, rel, obj):
if sbj is None or rel is None or obj is None:
return None
if not raw_tuple[1].startswith(self.relation_prefix):
obj.cls = 'SCHEMA'
if sbj.cls is None:
sbj.cls = 'SCHEMA'
if obj.cls is None:
obj.cls = rel.cls
assert sbj.cls is not None and obj.cls is not None
return (sbj, rel, obj)
def process_idx_file_line(self, line):
bond, _, label = line.strip().split('\t')
return bond, label
class MUTAG(MUTAGDataset):
"""MUTAG dataset. Same as MUTAGDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('MUTAG', 'MUTAGDataset')
super(MUTAG, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
class BGSDataset(RDFGraphDataset):
r"""BGS dataset for node classification task
.. deprecated:: 0.5.0
- ``graph`` is deprecated, it is replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
- ``train_idx`` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.predict_category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
- ``test_idx`` is deprecated, it can be replaced by:
>>> dataset = BGSDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.predict_category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
BGS namespace convention:
``http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE``.
We ignored all literal nodes and the relations connecting them in the
output graph. We also ignored the relation used to mark whether a
term is CURRENT or DEPRECATED.
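For illustration, a hypothetical instance URI such as
``http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/XYZ`` would be parsed
into an entity of class ``Lexicon/NamedRockUnit`` with ID ``XYZ``.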
BGS dataset statistics:
- Nodes: 94806
- Edges: 672884 (including reverse edges)
- Target Category: Lexicon/NamedRockUnit
- Number of Classes: 2
- Label Split:
- Train: 117
- Test: 29
Parameters
-----------
print_every: int
Print progress every ``print_every`` processed tuples. Default: 10000.
insert_reverse: bool
If True, add reverse edges and reverse relations to the final graph. Default: True.
raw_dir : str
Directory that contains (or will contain after download) the input data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : :class:`dgl.DGLGraph`
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
>>> dataset = dgl.data.rdf.BGSDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
"""
entity_prefix = 'http://data.bgs.ac.uk/'
status_prefix = 'http://data.bgs.ac.uk/ref/CurrentStatus'
relation_prefix = 'http://data.bgs.ac.uk/ref'
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
import rdflib as rdf
url = _get_dgl_url('dataset/rdf/bgs-hetero.zip')
name = 'bgs-hetero'
predict_category = 'Lexicon/NamedRockUnit'
self.lith = rdf.term.URIRef("http://data.bgs.ac.uk/ref/Lexicon/hasLithogenesis")
super(BGSDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
r"""Gets the graph object
Parameters
-----------
idx: int
Item index, BGSDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['train_mask']``: mask for training node set
- ``ndata['test_mask']``: mask for testing node set
- ``ndata['labels']``: node labels
"""
return super(BGSDataset, self).__getitem__(idx)
def __len__(self):
r"""The number of graphs in the dataset.
Returns
-------
int
"""
return super(BGSDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return None
elif isinstance(term, rdf.BNode):
return None
entstr = str(term)
if entstr.startswith(self.status_prefix):
return None
if entstr.startswith(self.entity_prefix):
sp = entstr.split('/')
if len(sp) != 7:
return None
# instance
cls = '%s/%s' % (sp[4], sp[5])
inst = sp[6]
return Entity(e_id=inst, cls=cls)
else:
return None
def parse_relation(self, term):
if term == self.lith:
return None
relstr = str(term)
if relstr.startswith(self.relation_prefix):
sp = relstr.split('/')
if len(sp) < 6:
return None
assert len(sp) == 6, relstr
cls = '%s/%s' % (sp[4], sp[5])
return Relation(cls=cls)
else:
relstr = relstr.replace('.', '_')
return Relation(cls=relstr)
def process_tuple(self, raw_tuple, sbj, rel, obj):
if sbj is None or rel is None or obj is None:
return None
return (sbj, rel, obj)
def process_idx_file_line(self, line):
_, rock, label = line.strip().split('\t')
return rock, label
class BGS(BGSDataset):
"""BGS dataset. Same as BGSDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('BGS', 'BGSDataset')
super(BGS, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
class AMDataset(RDFGraphDataset):
"""AM dataset for node classification task
.. deprecated:: 0.5.0
- ``graph`` is deprecated, it is replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
- ``train_idx`` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> train_mask = graph.nodes[dataset.predict_category].data['train_mask']
>>> train_idx = th.nonzero(train_mask).squeeze()
- ``test_idx`` is deprecated, it can be replaced by:
>>> dataset = AMDataset()
>>> graph = dataset[0]
>>> test_mask = graph.nodes[dataset.predict_category].data['test_mask']
>>> test_idx = th.nonzero(test_mask).squeeze()
Namespace convention:
- Instance: ``http://purl.org/collections/nl/am/<type>-<id>``
- Relation: ``http://purl.org/collections/nl/am/<name>``
We ignored all literal nodes and the relations connecting them in the
output graph.
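For illustration, a hypothetical instance URI such as
``http://purl.org/collections/nl/am/proxy-12345`` would be parsed into an
entity of class ``proxy`` with ID ``12345``.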
AM dataset statistics:
- Nodes: 881680
- Edges: 5668682 (including reverse edges)
- Target Category: proxy
- Number of Classes: 11
- Label Split:
- Train: 802
- Test: 198
Parameters
-----------
print_every: int
Print progress every ``print_every`` processed tuples. Default: 10000.
insert_reverse: bool
If True, add reverse edges and reverse relations to the final graph. Default: True.
raw_dir : str
Directory that contains (or will contain after download) the input data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True.
Attributes
----------
num_classes : int
Number of classes to predict
predict_category : str
The entity category (node type) that has labels for prediction
labels : Tensor
All the labels of the entities in ``predict_category``
graph : :class:`dgl.DGLGraph`
Graph structure
train_idx : Tensor
Entity IDs for training. All IDs are local IDs w.r.t. ``predict_category``.
test_idx : Tensor
Entity IDs for testing. All IDs are local IDs w.r.t. ``predict_category``.
Examples
--------
>>> dataset = dgl.data.rdf.AMDataset()
>>> graph = dataset[0]
>>> category = dataset.predict_category
>>> num_classes = dataset.num_classes
>>>
>>> train_mask = graph.nodes[category].data.pop('train_mask')
>>> test_mask = graph.nodes[category].data.pop('test_mask')
>>> labels = graph.nodes[category].data.pop('labels')
"""
entity_prefix = 'http://purl.org/collections/nl/am/'
relation_prefix = entity_prefix
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
import rdflib as rdf
self.objectCategory = rdf.term.URIRef("http://purl.org/collections/nl/am/objectCategory")
self.material = rdf.term.URIRef("http://purl.org/collections/nl/am/material")
url = _get_dgl_url('dataset/rdf/am-hetero.zip')
name = 'am-hetero'
predict_category = 'proxy'
super(AMDataset, self).__init__(name, url, predict_category,
print_every=print_every,
insert_reverse=insert_reverse,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def __getitem__(self, idx):
r"""Gets the graph object
Parameters
-----------
idx: int
Item index, AMDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['train_mask']``: mask for training node set
- ``ndata['test_mask']``: mask for testing node set
- ``ndata['labels']``: node labels
"""
return super(AMDataset, self).__getitem__(idx)
def __len__(self):
r"""The number of graphs in the dataset.
Returns
-------
int
"""
return super(AMDataset, self).__len__()
def parse_entity(self, term):
if isinstance(term, rdf.Literal):
return None
elif isinstance(term, rdf.BNode):
return Entity(e_id=str(term), cls='_BNode')
entstr = str(term)
if entstr.startswith(self.entity_prefix):
sp = entstr.split('/')
assert len(sp) == 7, entstr
spp = sp[6].split('-')
if len(spp) == 2:
# instance
cls, inst = spp
else:
cls = 'TYPE'
# no '<type>-<id>' split; use the whole trailing segment as the ID
inst = sp[6]
return Entity(e_id=inst, cls=cls)
else:
return None
def parse_relation(self, term):
if term == self.objectCategory or term == self.material:
return None
relstr = str(term)
if relstr.startswith(self.relation_prefix):
sp = relstr.split('/')
assert len(sp) == 7, relstr
cls = sp[6]
return Relation(cls=cls)
else:
relstr = relstr.replace('.', '_')
return Relation(cls=relstr)
def process_tuple(self, raw_tuple, sbj, rel, obj):
if sbj is None or rel is None or obj is None:
return None
return (sbj, rel, obj)
def process_idx_file_line(self, line):
proxy, _, label = line.strip().split('\t')
return proxy, label
class AM(AMDataset):
"""AM dataset. Same as AMDataset.
"""
def __init__(self,
print_every=10000,
insert_reverse=True,
raw_dir=None,
force_reload=False,
verbose=True):
deprecate_class('AM', 'AMDataset')
super(AM, self).__init__(print_every,
insert_reverse,
raw_dir,
force_reload,
verbose)
if __name__ == '__main__':
dataset = AIFBDataset()
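# Small smoke-test sketch (assumes the AIFB data can be downloaded):
# print basic statistics of the loaded heterograph.
g = dataset[0]
category = dataset.predict_category
print('Predict category:', category)
print('#Classes:', dataset.num_classes)
print('#Nodes of predict category:', g.number_of_nodes(category))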