Source code for dgl.homophily

"""Utils for tacking graph homophily and heterophily"""
# pylint: disable=W0611
from . import function as fn

try:
    import torch
except ImportError:
    HAS_TORCH = False
else:
    HAS_TORCH = True

__all__ = ["node_homophily", "edge_homophily", "linkx_homophily"]


def check_pytorch():
    """Check if PyTorch is the backend."""
    if HAS_TORCH is False:
        raise ModuleNotFoundError(
            "This function requires PyTorch to be the backend."
        )


def get_long_edges(graph):
    """Internal function for getting the edges of a graph as long tensors."""
    src, dst = graph.edges()
    return src.long(), dst.long()


[docs]def node_homophily(graph, y):
    r"""Homophily measure from `Geom-GCN: Geometric Graph Convolutional
    Networks <https://arxiv.org/abs/2002.05287>`__

    We follow the practice of a later paper `Large Scale Learning on
    Non-Homophilous Graphs: New Benchmarks and Strong Simple Methods
    <https://arxiv.org/abs/2110.14446>`__ to call it node homophily.

    Mathematically it is defined as follows:

    .. math::
      \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \frac{ | \{u
      \in \mathcal{N}(v): y_v = y_u \} |  } { |\mathcal{N}(v)| },

    where :math:`\mathcal{V}` is the set of nodes, :math:`\mathcal{N}(v)` is
    the predecessors of node :math:`v`, and :math:`y_v` is the class of node
    :math:`v`.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    y : torch.Tensor
        The node labels, which is a tensor of shape (|V|).

    Returns
    -------
    float
        The node homophily value.

    Examples
    --------
    >>> import dgl
    >>> import torch

    >>> graph = dgl.graph(([1, 2, 0, 4], [0, 1, 2, 3]))
    >>> y = torch.tensor([0, 0, 0, 0, 1])
    >>> dgl.node_homophily(graph, y)
    0.6000000238418579
    """
    check_pytorch()
    with graph.local_scope():
        # Handle the case where graph is of dtype int32.
        src, dst = get_long_edges(graph)
        # Compute y_v = y_u for all edges.
        graph.edata["same_class"] = (y[src] == y[dst]).float()
        graph.update_all(
            fn.copy_e("same_class", "m"), fn.mean("m", "same_class_deg")
        )
        return graph.ndata["same_class_deg"].mean(dim=0).item()


[docs]def edge_homophily(graph, y):
    r"""Homophily measure from `Beyond Homophily in Graph Neural Networks:
    Current Limitations and Effective Designs
    <https://arxiv.org/abs/2006.11468>`__

    Mathematically it is defined as follows:

    .. math::
      \frac{| \{ (u,v) : (u,v) \in \mathcal{E} \wedge y_u = y_v \} | }
      {|\mathcal{E}|},

    where :math:`\mathcal{E}` is the set of edges, and :math:`y_u` is the class
    of node :math:`u`.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    y : torch.Tensor
        The node labels, which is a tensor of shape (|V|).

    Returns
    -------
    float
        The edge homophily ratio value.

    Examples
    --------
    >>> import dgl
    >>> import torch

    >>> graph = dgl.graph(([1, 2, 0, 4], [0, 1, 2, 3]))
    >>> y = torch.tensor([0, 0, 0, 0, 1])
    >>> dgl.edge_homophily(graph, y)
    0.75
    """
    check_pytorch()
    with graph.local_scope():
        # Handle the case where graph is of dtype int32.
        src, dst = get_long_edges(graph)
        # Compute y_v = y_u for all edges.
        edge_indicator = (y[src] == y[dst]).float()
        return edge_indicator.mean(dim=0).item()


[docs]def linkx_homophily(graph, y):
    r"""Homophily measure from `Large Scale Learning on Non-Homophilous Graphs:
    New Benchmarks and Strong Simple Methods
    <https://arxiv.org/abs/2110.14446>`__

    Mathematically it is defined as follows:

    .. math::
      \frac{1}{C-1} \sum_{k=1}^{C} \max \left(0, \frac{\sum_{v\in C_k}|\{u\in
      \mathcal{N}(v): y_v = y_u \}|}{\sum_{v\in C_k}|\mathcal{N}(v)|} -
      \frac{|\mathcal{C}_k|}{|\mathcal{V}|} \right),

    where :math:`C` is the number of node classes, :math:`C_k` is the set of
    nodes that belong to class k, :math:`\mathcal{N}(v)` are the predecessors
    of node :math:`v`, :math:`y_v` is the class of node :math:`v`, and
    :math:`\mathcal{V}` is the set of nodes.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    y : torch.Tensor
        The node labels, which is a tensor of shape (|V|).

    Returns
    -------
    float
        The homophily value.

    Examples
    --------
    >>> import dgl
    >>> import torch

    >>> graph = dgl.graph(([0, 1, 2, 3], [1, 2, 0, 4]))
    >>> y = torch.tensor([0, 0, 0, 0, 1])
    >>> dgl.linkx_homophily(graph, y)
    0.19999998807907104
    """
    check_pytorch()
    with graph.local_scope():
        # Compute |{u\in N(v): y_v = y_u}| for each node v.
        # Handle the case where graph is of dtype int32.
        src, dst = get_long_edges(graph)
        # Compute y_v = y_u for all edges.
        graph.edata["same_class"] = (y[src] == y[dst]).float()
        graph.update_all(
            fn.copy_e("same_class", "m"), fn.sum("m", "same_class_deg")
        )

        deg = graph.in_degrees().float()
        num_nodes = graph.num_nodes()
        num_classes = y.max(dim=0).values.item() + 1

        value = torch.tensor(0.0).to(graph.device)
        for k in range(num_classes):
            # Get the nodes that belong to class k.
            class_mask = y == k
            same_class_deg_k = graph.ndata["same_class_deg"][class_mask].sum()
            deg_k = deg[class_mask].sum()
            num_nodes_k = class_mask.sum()
            value += max(0, same_class_deg_k / deg_k - num_nodes_k / num_nodes)

        return value.item() / (num_classes - 1)