Revert D29413019: [torch] Various improvements to torch.distributed.launch and torch.distributed.run

VitalyFedyunin · facebook-github-bot · commit ccfdb30644b4 · 2021-07-01T08:44:51.000-07:00
Test Plan: revert-hammer · Differential Revision: D29413019 (pytorch@4e181df) · Original commit changeset: 323bfbad9d0e · fbshipit-source-id: 1f8ae4b3d0a23f3eaff28c37e9148efff25fafe2
diff --git a/docs/source/elastic/errors.rst b/docs/source/elastic/errors.rst
@@ -1,5 +1,3 @@
-.. _elastic_errors-api:
-
 Error Propagation
 ==================
 
diff --git a/docs/source/elastic/run.rst b/docs/source/elastic/run.rst
@@ -1,6 +1,9 @@
 .. _launcher-api:
 
-torch.distributed.run (Elastic Launch)
-======================================
+Elastic Launch
+============================
+
+torch.distributed.run
+----------------------
 
 .. automodule:: torch.distributed.run
diff --git a/docs/source/elastic/train_script.rst b/docs/source/elastic/train_script.rst
@@ -1,5 +1,3 @@
-.. _elastic_train_script:
-
 Train script
 -------------
 
@@ -9,20 +7,18 @@ working with ``torch.distributed.run`` with these differences:
 1. No need to manually pass ``RANK``, ``WORLD_SIZE``,
    ``MASTER_ADDR``, and ``MASTER_PORT``.
 
-2. ``rdzv_backend`` and ``rdzv_endpoint`` can be provided. For most users
-   this will be set to ``c10d`` (see `rendezvous <rendezvous.html>`_). The default
-   ``rdzv_backend`` creates a non-elastic rendezvous where ``rdzv_endpoint`` holds
-   the master address.
+2. ``rdzv_backend`` and ``rdzv_endpoint`` must be provided. For most users
+   this will be set to ``c10d`` (see `rendezvous <rendezvous.html>`_).
 
 3. Make sure you have a ``load_checkpoint(path)`` and
-   ``save_checkpoint(path)`` logic in your script. When any number of
-   workers fail we restart all the workers with the same program
-   arguments so you will lose progress up to the most recent checkpoint
+   ``save_checkpoint(path)`` logic in your script. When workers fail
+   we restart all the workers with the same program arguments so you will
+   lose progress up to the most recent checkpoint
    (see `elastic launch <distributed.html>`_).
 
 4. ``use_env`` flag has been removed. If you were parsing local rank by parsing
    the ``--local_rank`` option, you need to get the local rank from the
-   environment variable ``LOCAL_RANK`` (e.g. ``int(os.environ["LOCAL_RANK"])``).
+   environment variable ``LOCAL_RANK`` (e.g. ``os.environ["LOCAL_RANK"]``).
 
 Below is an expository example of a training script that checkpoints on each
 epoch, hence the worst-case progress lost on failure is one full epoch worth
@@ -35,7 +31,7 @@ of training.
        state = load_checkpoint(args.checkpoint_path)
        initialize(state)
 
-       # torch.distributed.run ensures that this will work
+       # torch.distributed.run ensure that this will work
        # by exporting all the env vars needed to initialize the process group
        torch.distributed.init_process_group(backend=args.backend)
 
diff --git a/torch/distributed/elastic/agent/server/local_elastic_agent.py b/torch/distributed/elastic/agent/server/local_elastic_agent.py
@@ -205,6 +205,7 @@ def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
         result = self._pcontext.wait(0)
         if result:
             if result.is_failed():
+                log.error(f"[{role}] Worker group failed")
                 # map local rank failure to global rank
                 worker_failures = {}
                 for local_rank, failure in result.failures.items():
diff --git a/torch/distributed/elastic/events/__init__.py b/torch/distributed/elastic/events/__init__.py
@@ -19,7 +19,6 @@
 
 """
 
-import os
 import logging
 
 from torch.distributed.elastic.events.handlers import get_logging_handler
@@ -47,12 +46,12 @@ def _get_or_create_logger(destination: str = "null") -> logging.Logger:
         return _events_logger
     logging_handler = get_logging_handler(destination)
     _events_logger = logging.getLogger(f"torchelastic-events-{destination}")
-    _events_logger.setLevel(os.environ.get("LOGLEVEL", "INFO"))
+    _events_logger.setLevel(logging.DEBUG)
     # Do not propagate message to the root logger
     _events_logger.propagate = False
     _events_logger.addHandler(logging_handler)
     return _events_logger
 
 
-def record(event: Event, destination: str = "null") -> None:
+def record(event: Event, destination: str = "console") -> None:
     _get_or_create_logger(destination).info(event.serialize())
diff --git a/torch/distributed/elastic/events/handlers.py b/torch/distributed/elastic/events/handlers.py
@@ -12,9 +12,8 @@
 
 _log_handlers: Dict[str, logging.Handler] = {
     "console": logging.StreamHandler(),
-    "null": logging.NullHandler(),
 }
 
 
-def get_logging_handler(destination: str = "null") -> logging.Handler:
+def get_logging_handler(destination: str = "console") -> logging.Handler:
     return _log_handlers[destination]
diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py
@@ -497,17 +497,6 @@ def close(self):
             self._stderr.close()
 
 
-def _pr_set_pdeathsig() -> None:
-    """
-    Sets PR_SET_PDEATHSIG to ensure a child process is
-    terminated appropriately.
-
-    See http://stackoverflow.com/questions/1884941/ for more information.
-    For libc.so.6 read http://www.linux-m68k.org/faq/glibcinfo.html
-    """
-    mp._prctl_pr_set_pdeathsig(signal.SIGTERM)  # type: ignore[attr-defined]
-
-
 class SubprocessContext(PContext):
     """
     ``PContext`` holding worker processes invoked as a binary.
@@ -552,7 +541,7 @@ def _start(self):
                 entrypoint=self.entrypoint,  # type: ignore[arg-type] # entrypoint is always a str
                 args=self.args[local_rank],
                 env=self.envs[local_rank],
-                preexec_fn=_pr_set_pdeathsig,
+                preexec_fn=mp._prctl_pr_set_pdeathsig(signal.SIGTERM),  # type: ignore[attr-defined]
                 stdout=self.stdouts[local_rank],
                 stderr=self.stderrs[local_rank],
             )
diff --git a/torch/distributed/elastic/utils/logging.py b/torch/distributed/elastic/utils/logging.py
@@ -17,7 +17,7 @@ def get_logger(name: Optional[str] = None):
     """
     Util function to set up a simple logger that writes
     into stderr. The loglevel is fetched from the LOGLEVEL
-    env. variable or WARNING as default. The function will use the
+    env. variable or INFO as default. The function will use the
     module name of the caller if no name is provided.
 
     Args:
@@ -32,7 +32,7 @@ def get_logger(name: Optional[str] = None):
 
 def _setup_logger(name: Optional[str] = None):
     log = logging.getLogger(name)
-    log.setLevel(os.environ.get("LOGLEVEL", "WARNING"))
+    log.setLevel(os.environ.get("LOGLEVEL", "INFO"))
     return log
 
 
diff --git a/torch/distributed/elastic/utils/store.py b/torch/distributed/elastic/utils/store.py
@@ -6,6 +6,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import warnings
 from datetime import timedelta
 from typing import List
 
@@ -63,5 +64,8 @@ def barrier(
     Note: Since the data is not removed from the store, the barrier can be used
         once per unique ``key_prefix``.
     """
+    warnings.warn(
+        "This is an experimental API and will be changed in future.", FutureWarning
+    )
     data = f"{rank}".encode(encoding="UTF-8")
     synchronize(store, data, rank, world_size, key_prefix, barrier_timeout)
diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py
@@ -1,10 +1,8 @@
 r"""
-``torch.distributed.launch`` is a module that spawns up multiple distributed
+`torch.distributed.launch` is a module that spawns up multiple distributed
 training processes on each of the training nodes.
 
-.. warning::
-
-    This module is going to be deprecated in favor of :ref:`torch.distributed.run <launcher-api>`.
+NOTE: This module is deprecated, use torch.distributed.run.
 
 The utility can be used for single-node distributed training, in which one or
 more processes per node will be spawned. The utility can be used for either
@@ -138,12 +136,9 @@
     https://github.com/pytorch/pytorch/issues/12042 for an example of
     how things can go wrong if you don't do this correctly.
 
-
-
 """
 
 import logging
-import warnings
 
 from torch.distributed.run import get_args_parser, run
 
@@ -164,27 +159,14 @@ def parse_args(args):
     return parser.parse_args(args)
 
 
-def launch(args):
-    if args.no_python and not args.use_env:
-        raise ValueError(
-            "When using the '--no_python' flag,"
-            " you must also set the '--use_env' flag."
-        )
-    run(args)
-
-
 def main(args=None):
-    warnings.warn(
-        "The module torch.distributed.launch is deprecated\n"
-        "and will be removed in future. Use torch.distributed.run.\n"
-        "Note that --use_env is set by default in torch.distributed.run.\n"
-        "If your script expects `--local_rank` argument to be set, please\n"
-        "change it to read from `os.environ('LOCAL_RANK')` instead. See \n"
-        "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n"
-        "further instructions\n", FutureWarning
+    logger.warning(
+        "The module torch.distributed.launch is deprecated "
+        "and going to be removed in future."
+        "Migrate to torch.distributed.run"
     )
     args = parse_args(args)
-    launch(args)
+    run(args)
 
 
 if __name__ == "__main__":
diff --git a/torch/distributed/launcher/api.py b/torch/distributed/launcher/api.py
@@ -15,7 +15,7 @@
 from torch.distributed.elastic.agent.server.api import WorkerSpec, WorkerState
 from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent
 from torch.distributed.elastic.multiprocessing import Std
-from torch.distributed.elastic.multiprocessing.errors import ChildFailedError
+from torch.distributed.elastic.multiprocessing.errors import ChildFailedError, record
 from torch.distributed.elastic.rendezvous import RendezvousParameters
 from torch.distributed.elastic.rendezvous.utils import parse_rendezvous_endpoint
 from torch.distributed.elastic.utils.logging import get_logger
@@ -172,6 +172,7 @@ def _get_addr_and_port(
 
 # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
 # torch.distributed.elastic.multiprocessing.errors.record.
+@record
 def launch_agent(
     config: LaunchConfig,
     entrypoint: Union[Callable, str, None],
diff --git a/torch/distributed/run.py b/torch/distributed/run.py

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,3 @@`
`1`		`-.. _elastic_errors-api:`
`2`		`-`
`3`	`1`	`Error Propagation`
`4`	`2`	`==================`
`5`	`3`
Original file line number	Diff line number	Diff line change
`@@ -12,9 +12,8 @@`
`12`	`12`
`13`	`13`	`_log_handlers: Dict[str, logging.Handler] = {`
`14`	`14`	`"console": logging.StreamHandler(),`
`15`		`- "null": logging.NullHandler(),`
`16`	`15`	`}`
`17`	`16`
`18`	`17`
`19`		`-def get_logging_handler(destination: str = "null") -> logging.Handler:`
	`18`	`+def get_logging_handler(destination: str = "console") -> logging.Handler:`
`20`	`19`	`return _log_handlers[destination]`