"""Initialize the distributed services"""
# pylint: disable=line-too-long
import multiprocessing as mp
import traceback
import atexit
import time
import os
import sys
import queue
from enum import Enum
from . import rpc
from .constants import MAX_QUEUE_SIZE
from .kvstore import init_kvstore, close_kvstore
from .rpc_client import connect_to_server, shutdown_servers
from .role import init_role
from .. import utils
SAMPLER_POOL = None
NUM_SAMPLER_WORKERS = 0
INITIALIZED = False
def set_initialized(value=True):
"""Set the initialized state of rpc"""
global INITIALIZED
INITIALIZED = value
def get_sampler_pool():
"""Return the sampler pool and num_workers"""
return SAMPLER_POOL, NUM_SAMPLER_WORKERS
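# Illustrative sketch: the module-level state above is what ``initialize`` fills in
# and what ``is_initialized``/``get_sampler_pool`` (defined further below) expose.
# ``_example_report_state`` is a hypothetical helper added only for illustration.
def _example_report_state():
    """Return a small snapshot of the module-level distributed state."""
    pool, num_workers = get_sampler_pool()
    return {
        'initialized': INITIALIZED,
        'has_sampler_pool': pool is not None,
        'num_sampler_workers': num_workers,
    }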
def _init_rpc(ip_config, num_servers, max_queue_size, net_type, role, num_threads):
'''This init function is called inside the sampler worker processes.
'''
try:
utils.set_num_threads(num_threads)
if os.environ.get('DGL_DIST_MODE', 'standalone') != 'standalone':
connect_to_server(ip_config, num_servers, max_queue_size, net_type)
init_role(role)
init_kvstore(ip_config, num_servers, role)
except Exception as e:
print(e, flush=True)
traceback.print_exc()
raise e
class MpCommand(Enum):
"""Enum class for multiprocessing command"""
INIT_RPC = 0 # Not used in the task queue
SET_COLLATE_FN = 1
CALL_BARRIER = 2
DELETE_COLLATE_FN = 3
CALL_COLLATE_FN = 4
CALL_FN_ALL_WORKERS = 5
FINALIZE_POOL = 6
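# Illustrative sketch (not part of the module's API): every entry placed on a
# worker's task queue is a ``(MpCommand, args)`` tuple, which is exactly what the
# dispatch loop in ``init_process`` below unpacks. The helper name
# ``_example_enqueue_tasks`` and the ``'loader-0'`` dataloader name are hypothetical.
def _example_enqueue_tasks(task_queue, collate_fn):
    """Show the (command, args) wire format consumed by a worker."""
    # Register a collate function under a dataloader name.
    task_queue.put((MpCommand.SET_COLLATE_FN, ('loader-0', collate_fn)))
    # Run the registered collate function on some arguments; the result is pushed
    # to the shared result queue as (dataloader_name, collated_batch).
    task_queue.put((MpCommand.CALL_COLLATE_FN, ('loader-0', [0, 1, 2])))
    # Ask the worker to finalize and stop polling.
    task_queue.put((MpCommand.FINALIZE_POOL, tuple()))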
def init_process(rpc_config, mp_contexts):
"""Work loop in the worker"""
try:
_init_rpc(*rpc_config)
keep_polling = True
data_queue, task_queue, barrier = mp_contexts
collate_fn_dict = {}
while keep_polling:
try:
# Follow https://github.com/pytorch/pytorch/blob/d57ce8cf8989c0b737e636d8d7abe16c1f08f70b/torch/utils/data/_utils/worker.py#L260
command, args = task_queue.get(timeout=5)
except queue.Empty:
continue
if command == MpCommand.SET_COLLATE_FN:
dataloader_name, func = args
collate_fn_dict[dataloader_name] = func
elif command == MpCommand.CALL_BARRIER:
barrier.wait()
elif command == MpCommand.DELETE_COLLATE_FN:
dataloader_name, = args
del collate_fn_dict[dataloader_name]
elif command == MpCommand.CALL_COLLATE_FN:
dataloader_name, collate_args = args
data_queue.put(
(dataloader_name, collate_fn_dict[dataloader_name](collate_args)))
elif command == MpCommand.CALL_FN_ALL_WORKERS:
func, func_args = args
func(func_args)
elif command == MpCommand.FINALIZE_POOL:
_exit()
keep_polling = False
else:
raise Exception("Unknown command")
except Exception as e:
traceback.print_exc()
raise e
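# Sketch of how results flow back: ``CALL_COLLATE_FN`` answers arrive on the shared
# result queue as ``(dataloader_name, collated_batch)`` pairs, which is what
# ``CustomPool.get_result`` below unpacks. ``_example_drain_results`` is a
# hypothetical helper added only for illustration.
def _example_drain_results(result_queue, num_results, timeout=1800):
    """Collect collated batches from the result queue, grouped by dataloader name."""
    batches = {}
    for _ in range(num_results):
        name, batch = result_queue.get(timeout=timeout)
        batches.setdefault(name, []).append(batch)
    return batches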
class CustomPool:
"""Customized worker pool"""
def __init__(self, num_workers, rpc_config):
"""
Customized worker pool init function
"""
ctx = mp.get_context("spawn")
self.num_workers = num_workers
self.queue_size = num_workers * 4
self.result_queue = ctx.Queue(self.queue_size)
self.task_queues = []
self.process_list = []
self.current_proc_id = 0
self.cache_result_dict = {}
self.barrier = ctx.Barrier(num_workers)
for _ in range(num_workers):
task_queue = ctx.Queue(self.queue_size)
self.task_queues.append(task_queue)
proc = ctx.Process(target=init_process, args=(
rpc_config, (self.result_queue, task_queue, self.barrier)))
proc.daemon = True
proc.start()
self.process_list.append(proc)
def set_collate_fn(self, func, dataloader_name):
"""Set collate function in subprocess"""
for i in range(self.num_workers):
self.task_queues[i].put(
(MpCommand.SET_COLLATE_FN, (dataloader_name, func)))
def submit_task(self, dataloader_name, args):
"""Submit task to workers"""
# Round robin
self.task_queues[self.current_proc_id].put(
(MpCommand.CALL_COLLATE_FN, (dataloader_name, args)))
self.current_proc_id = (self.current_proc_id + 1) % self.num_workers
def submit_task_to_all_workers(self, func, args):
"""Submit task to all workers"""
for i in range(self.num_workers):
self.task_queues[i].put(
(MpCommand.CALL_FN_ALL_WORKERS, (func, args)))
def get_result(self, dataloader_name, timeout=1800):
"""Get result from result queue"""
result_dataloader_name, result = self.result_queue.get(timeout=timeout)
assert result_dataloader_name == dataloader_name
return result
def delete_collate_fn(self, dataloader_name):
"""Delete collate function"""
for i in range(self.num_workers):
self.task_queues[i].put(
(MpCommand.DELETE_COLLATE_FN, (dataloader_name, )))
def call_barrier(self):
"""Call barrier at all workers"""
for i in range(self.num_workers):
self.task_queues[i].put(
(MpCommand.CALL_BARRIER, tuple()))
def close(self):
"""Close worker pool"""
for i in range(self.num_workers):
self.task_queues[i].put((MpCommand.FINALIZE_POOL, tuple()), block=False)
time.sleep(0.5) # Workaround for older Python versions
def join(self):
"""Join the close process of worker pool"""
for i in range(self.num_workers):
self.process_list[i].join()
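# Minimal usage sketch for CustomPool. It assumes a distributed (non-standalone)
# setup with reachable servers, and that ``rpc_config`` matches the argument tuple
# expected by ``_init_rpc``. All ``_example_*`` names are hypothetical and only
# illustrate the call order; ``initialize`` below creates the pool for real use.
def _example_collate(args):
    """Hypothetical collate function; defined at module level so it can be
    pickled for the 'spawn' start method."""
    return list(args)

def _example_custom_pool(ip_config):
    """Walk through the typical lifecycle of a CustomPool."""
    rpc_config = (ip_config, 1, MAX_QUEUE_SIZE, 'socket', 'sampler', 1)
    pool = CustomPool(num_workers=2, rpc_config=rpc_config)
    pool.set_collate_fn(_example_collate, 'loader-0')
    pool.submit_task('loader-0', (1, 2, 3))   # round-robin dispatch
    batch = pool.get_result('loader-0')       # -> [1, 2, 3]
    pool.call_barrier()                       # sync all workers
    pool.delete_collate_fn('loader-0')
    pool.close()
    pool.join()
    return batch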
def initialize(ip_config, num_servers=1, num_workers=0,
max_queue_size=MAX_QUEUE_SIZE, net_type='socket',
num_worker_threads=1):
"""Initialize DGL's distributed module
This function initializes DGL's distributed module. It acts differently in server
or client modes. In the server mode, it runs the server code and never returns.
In the client mode, it builds connections with servers for communication and
creates worker processes for distributed sampling. `num_workers` specifies
the number of sampling worker processes per trainer process.
Users also have to provide the number of server processes on each machine in order
to connect to all the server processes in the cluster of machines correctly.
Parameters
----------
ip_config: str
File path of ip_config file
num_servers : int
The number of server processes on each machine. This argument is deprecated in DGL 0.7.0.
num_workers: int
Number of worker process on each machine. The worker processes are used
for distributed sampling. This argument is deprecated in DGL 0.7.0.
max_queue_size : int
Maximal size (bytes) of client queue buffer (~20 GB on default).
Note that the 20 GB is just an upper-bound and DGL uses zero-copy and
it will not allocate 20GB memory at once.
net_type : str, optional
Networking type. Currently the only valid option is ``'socket'``.
Default: ``'socket'``
num_worker_threads: int
The number of threads in a worker process.
Note
----
Users have to invoke this API before any DGL's distributed API and framework-specific
distributed API. For example, when used with Pytorch, users have to invoke this function
before Pytorch's `pytorch.distributed.init_process_group`.
"""
if os.environ.get('DGL_ROLE', 'client') == 'server':
from .dist_graph import DistGraphServer
assert os.environ.get('DGL_SERVER_ID') is not None, \
'Please define DGL_SERVER_ID to run DistGraph server'
assert os.environ.get('DGL_IP_CONFIG') is not None, \
'Please define DGL_IP_CONFIG to run DistGraph server'
assert os.environ.get('DGL_NUM_SERVER') is not None, \
'Please define DGL_NUM_SERVER to run DistGraph server'
assert os.environ.get('DGL_NUM_CLIENT') is not None, \
'Please define DGL_NUM_CLIENT to run DistGraph server'
assert os.environ.get('DGL_CONF_PATH') is not None, \
'Please define DGL_CONF_PATH to run DistGraph server'
formats = os.environ.get('DGL_GRAPH_FORMAT', 'csc').split(',')
formats = [f.strip() for f in formats]
serv = DistGraphServer(int(os.environ.get('DGL_SERVER_ID')),
os.environ.get('DGL_IP_CONFIG'),
int(os.environ.get('DGL_NUM_SERVER')),
int(os.environ.get('DGL_NUM_CLIENT')),
os.environ.get('DGL_CONF_PATH'),
graph_format=formats)
serv.start()
sys.exit()
else:
if os.environ.get('DGL_NUM_SAMPLER') is not None:
num_workers = int(os.environ.get('DGL_NUM_SAMPLER'))
else:
num_workers = 0
if os.environ.get('DGL_NUM_SERVER') is not None:
num_servers = int(os.environ.get('DGL_NUM_SERVER'))
else:
num_servers = 1
rpc.reset()
global SAMPLER_POOL
global NUM_SAMPLER_WORKERS
is_standalone = os.environ.get(
'DGL_DIST_MODE', 'standalone') == 'standalone'
if num_workers > 0 and not is_standalone:
SAMPLER_POOL = CustomPool(num_workers, (ip_config, num_servers, max_queue_size,
net_type, 'sampler', num_worker_threads))
else:
SAMPLER_POOL = None
NUM_SAMPLER_WORKERS = num_workers
if not is_standalone:
assert num_servers is not None and num_servers > 0, \
'The number of servers per machine must be specified with a positive number.'
connect_to_server(ip_config, num_servers, max_queue_size, net_type)
init_role('default')
init_kvstore(ip_config, num_servers, 'default')
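# Sketch of the trainer-side call order described in the docstring above. It assumes
# the launch tool has already exported DGL_DIST_MODE, DGL_ROLE and the other DGL_*
# environment variables checked in ``initialize``; ``_example_trainer_setup`` and
# the reference to PyTorch's process group are illustrative, not prescribed here.
def _example_trainer_setup(ip_config_path):
    """Initialize DGL's distributed module before any framework-specific setup."""
    initialize(ip_config_path)
    # Only after initialize() should a framework-specific process group be created,
    # e.g. torch.distributed.init_process_group(backend='gloo') when using PyTorch.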
def finalize_client():
"""Release resources of this client."""
if os.environ.get('DGL_DIST_MODE', 'standalone') != 'standalone':
rpc.finalize_sender()
rpc.finalize_receiver()
global INITIALIZED
INITIALIZED = False
def _exit():
exit_client()
time.sleep(1)
def finalize_worker():
"""Finalize workers.

Python's multiprocessing pool does not run atexit handlers when it is closed,
so the sampler worker pool has to be shut down explicitly here.
"""
global SAMPLER_POOL
if SAMPLER_POOL is not None:
SAMPLER_POOL.close()
def join_finalize_worker():
"""join the worker close process"""
global SAMPLER_POOL
if SAMPLER_POOL is not None:
SAMPLER_POOL.join()
SAMPLER_POOL = None
def is_initialized():
"""Is RPC initialized?
"""
return INITIALIZED
def exit_client():
"""Trainer exits
This function is called automatically when a Python process exits. Normally,
the training script does not need to invoke this function at the end.
In the case that the training script needs to initialize the distributed module
multiple times (so far, this is needed in the unit tests), the training script
needs to call `exit_client` before calling `initialize` again.
"""
# Only the client with rank 0 sends the shutdown request to the servers.
finalize_worker() # Finalizing workers must happen before the barrier and is non-blocking
if os.environ.get('DGL_DIST_MODE', 'standalone') != 'standalone':
rpc.client_barrier()
shutdown_servers()
finalize_client()
join_finalize_worker()
close_kvstore()
atexit.unregister(exit_client)
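# Sketch of the re-initialization pattern mentioned in ``exit_client``'s docstring
# (so far only needed in unit tests). ``_example_reinitialize`` is a hypothetical
# name; the key point is that ``exit_client`` must finish before ``initialize`` is
# called a second time in the same process.
def _example_reinitialize(ip_config_path):
    """Tear down and re-create the distributed module within one process."""
    initialize(ip_config_path)
    # ... first round of distributed training / sampling ...
    exit_client()
    # A second initialize() is only valid after exit_client() has returned.
    initialize(ip_config_path)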