mila-iqia · lebrice · Aug 7, 2023 · Jul 5, 2023 · Jul 5, 2023 · Jul 7, 2023
@@ -3,4 +3,4 @@ _build
 .idea
 **/__pycache__
 /docs/examples/**/*.diff
-**/slurm-*.out
+/docs/examples/**/slurm-*.out
@@ -1,8 +1,22 @@
-.. ****************
-.. Minimal Examples
-.. ****************
+****************
+Minimal Examples
+****************
 
+This section contains some minimal examples of how to run jobs on the Mila cluster.
 
-.. include:: examples/frameworks/index.rst
-.. include:: examples/distributed/index.rst
-.. include:: examples/good_practices/index.rst
+Each example is self-contained and can be run as-is directly on the cluster without error.
+Each example has the following structure:
+
+* ``job.sh``: SLURM ``sbatch`` script. Can be launched with ``sbatch job.sh``.
+* ``main.py``: Example python script.
+
+Some examples are displayed as a difference with respect to a "base" example. For instance, the :doc:`multi-gpu example <examples/distributed/multi_gpu/index>` is shown as a difference with respect to the :doc:`single-gpu example <examples/distributed/single_gpu/index>`.
+
+
+
+.. toctree::
+    :maxdepth: 1
+
+    examples/frameworks/index
+    examples/distributed/index
+    examples/good_practices/index
@@ -9,39 +9,42 @@
 import sphinx_theme
 
 extensions = [
-            'sphinx-prompt',
-            'sphinx_copybutton',
-            # 'recommonmark',
-            'sphinx.ext.autosectionlabel',
-            'sphinx.ext.todo',
-            'myst_parser']
-
-templates_path = ['templates', '_templates', '.templates']
-source_suffix = ['.rst', '.md']
-master_doc = 'index'
-project = u'MILA Technical Documentation'
+    "sphinx-prompt",
+    "sphinx_copybutton",
+    # 'recommonmark',
+    "sphinx.ext.autosectionlabel",
+    "sphinx.ext.todo",
+    "myst_parser",
+]
+
+templates_path = ["templates", "_templates", ".templates"]
+source_suffix = [".rst", ".md"]
+master_doc = "index"
+project = "MILA Technical Documentation"
 copyright = str(datetime.now().year)
-version = 'latest'
-release = 'latest'
+version = "latest"
+release = "latest"
 
-htmlhelp_basename = 'mila-docs'
+htmlhelp_basename = "mila-docs"
 file_insertion_enabled = False
 latex_documents = [
-  ('index', 'mila-docs.tex', u'Mila technical Documentation',
-   u'', 'manual'),
+    ("index", "mila-docs.tex", "Mila technical Documentation", "", "manual"),
 ]
 
-exclude_patterns = ['_build',
-                    'Userguide_*',
-                    'Theory_cluster_*',
-                    'Information_*',
-                    'Purpose_*',
-                    'Extra_compute_*',
-                    'IDT_*',
-                    'singularity/*',
-                    'examples/*',]
+exclude_patterns = [
+    "_build",
+    "Userguide_*",
+    "Theory_cluster_*",
+    "Information_*",
+    "Purpose_*",
+    "Extra_compute_*",
+    "IDT_*",
+    "singularity/*",
+    "examples/**/README.rst",
+    # 'examples/*',
+]
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # -- Options for HTML output -------------------------------------------------
 
@@ -53,16 +56,14 @@
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
-html_theme_options = {
-    'logo_only': True
-    }
+html_theme_options = {"logo_only": True}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
-html_js_files = ['documentation_options.js', 'documentation_options_fix.js']
+html_js_files = ["documentation_options.js", "documentation_options_fix.js"]
 
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -73,30 +74,30 @@
 # 'searchbox.html']``.
 #
 # html_sidebars = {}
-#---sphinx-themes-----
-html_theme = 'sphinx_rtd_theme'
+# ---sphinx-themes-----
+html_theme = "sphinx_rtd_theme"
 html_theme_path = [sphinx_theme.get_html_theme_path()]
-html_logo = '_static/image.png'
+html_logo = "_static/image.png"
 
 html_context = {
     # Enable the "Edit in GitHub" link within the header of each page.
-    'display_github': True,
+    "display_github": True,
     # Set the following variables to generate the resulting github URL for each page.
     # Format Template: https://{{ github_host|default("github.com") }}/{{ github_user }}/{{ github_repo }}/blob/{{ github_version }}{{ conf_py_path }}{{ pagename }}{{ suffix }}
-    'github_user': 'mila-iqia',
-    'github_repo': 'mila-docs',
-    'github_version': 'master/',
-    'conf_py_path': 'docs/'
+    "github_user": "mila-iqia",
+    "github_repo": "mila-docs",
+    "github_version": "master/",
+    "conf_py_path": "docs/",
 }
 
 # Include CNAME file so GitHub Pages can set Custom Domain name
-html_extra_path = ['CNAME']
+html_extra_path = ["CNAME"]
 
 
 docs_root = Path(__file__).absolute().parent
 pyscript = docs_root / "examples/preprocess.py"
 try:
-    _proc = subprocess.run(["python3", str(pyscript)], capture_output=True, check=True)
+    subprocess.run(["python3", str(pyscript)], capture_output=True, check=True)
 except subprocess.CalledProcessError as err:
     raise RuntimeError(
         "Could not generate github README's and/or diff files:\n"
@@ -106,4 +107,4 @@
 
 
 def setup(app):
-    app.add_css_file('custom.css')
+    app.add_css_file("custom.css")
@@ -2,6 +2,11 @@
 Distributed Training
 ********************
 
-.. include:: /examples/distributed/001_single_gpu/_index.rst
-.. include:: /examples/distributed/002_multi_gpu/_index.rst
-.. include:: /examples/distributed/003_multi_node/_index.rst
+.. toctree::
+    :maxdepth: 1
+    :glob:
+    :numbered:
+
+    single_gpu/index
+    multi_gpu/index
+    multi_node/index
@@ -1,25 +1,29 @@
-002 - Multi-GPU Job
-====================
+.. NOTE: This file is auto-generated from examples/distributed/multi_gpu/index.rst
+.. This is done so this file can be easily viewed from the GitHub UI.
+.. **DO NOT EDIT**
+
+Multi-GPU Job
+=============
 
 
 Prerequisites:
 
-* :ref:`pytorch_setup`
-* :ref:`001 - Single GPU Job`
+* `examples/frameworks/pytorch_setup <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/frameworks/pytorch_setup>`_
+* `examples/distributed/single_gpu <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/single_gpu>`_
 
 Other interesting resources:
 
 * `<https://sebarnold.net/dist_blog/>`_
 * `<https://lambdalabs.com/blog/multi-node-pytorch-distributed-training-guide>`_
 
 Click here to see `the code for this example
-<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/002_multi_gpu>`_
+<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/multi_gpu>`_
 
-**Job.sh**
+**job.sh**
 
 .. code:: diff
 
-    # distributed/001_single_gpu/job.sh -> distributed/002_multi_gpu/job.sh
+    # distributed/single_gpu/job.sh -> distributed/multi_gpu/job.sh
     #!/bin/bash
     #SBATCH --gpus-per-task=rtx8000:1
     #SBATCH --cpus-per-task=4
@@ -71,9 +75,11 @@ Click here to see `the code for this example
    +# Execute Python script in each task (one per GPU)
    +srun python main.py
 
+**main.py**
+
 .. code:: diff
 
-    # distributed/001_single_gpu/main.py -> distributed/002_multi_gpu/main.py
+    # distributed/single_gpu/main.py -> distributed/multi_gpu/main.py
    -"""Single-GPU training example."""
    +"""Multi-GPU Training example."""
     import logging
@@ -238,7 +244,7 @@ Click here to see `the code for this example
    +                logger.debug(f"Accuracy: {accuracy.item():.2%}")
    +                logger.debug(f"Average Loss: {loss.item()}")
 
-                # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+                # Advance the progress bar one step and update the progress bar text.
                 progress_bar.update(1)
                 progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
             progress_bar.close()

@@ -1,26 +1,28 @@
-002 - Multi-GPU Job
-====================
+Multi-GPU Job
+=============
 
 
 Prerequisites:
 
-* :ref:`pytorch_setup`
-* :ref:`001 - Single GPU Job`
+* :doc:`/examples/frameworks/pytorch_setup/index`
+* :doc:`/examples/distributed/single_gpu/index`
 
 Other interesting resources:
 
 * `<https://sebarnold.net/dist_blog/>`_
 * `<https://lambdalabs.com/blog/multi-node-pytorch-distributed-training-guide>`_
 
 Click here to see `the code for this example
-<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/002_multi_gpu>`_
+<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/multi_gpu>`_
 
-**Job.sh**
+**job.sh**
 
-.. literalinclude:: examples/distributed/002_multi_gpu/job.sh.diff
+.. literalinclude:: job.sh.diff
     :language: diff
 
-.. literalinclude:: examples/distributed/002_multi_gpu/main.py.diff
+**main.py**
+
+.. literalinclude:: main.py.diff
     :language: diff
 
 

@@ -151,7 +151,7 @@ def main():
                 logger.debug(f"Accuracy: {accuracy.item():.2%}")
                 logger.debug(f"Average Loss: {loss.item()}")
 
-            # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+            # Advance the progress bar one step and update the progress bar text.
             progress_bar.update(1)
             progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
         progress_bar.close()

@@ -1,12 +1,16 @@
-003 - Multi-Node (DDP) Job
-=====================================
+.. NOTE: This file is auto-generated from examples/distributed/multi_node/index.rst
+.. This is done so this file can be easily viewed from the GitHub UI.
+.. **DO NOT EDIT**
+
+Multi-Node (DDP) Job
+====================
 
 
 Prerequisites:
 
-* :ref:`pytorch_setup`
-* :ref:`001 - Single GPU Job`
-* :ref:`002 - Multi-GPU Job`
+* `examples/frameworks/pytorch_setup <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/frameworks/pytorch_setup>`_
+* `examples/distributed/single_gpu <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/single_gpu>`_
+* `examples/distributed/multi_gpu <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/multi_gpu>`_
 
 Other interesting resources:
 
@@ -15,13 +19,13 @@ Other interesting resources:
 
 
 Click here to see `the source code for this example
-<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/003_multi_node>`_
+<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/multi_node>`_
 
-**Job.sh**
+**job.sh**
 
 .. code:: diff
 
-    # distributed/002_multi_gpu/job.sh -> distributed/003_multi_node/job.sh
+    # distributed/multi_gpu/job.sh -> distributed/multi_node/job.sh
     #!/bin/bash
     #SBATCH --gpus-per-task=rtx8000:1
     #SBATCH --cpus-per-task=4
@@ -75,7 +79,7 @@ Click here to see `the source code for this example
 
 .. code:: diff
 
-    # distributed/002_multi_gpu/main.py -> distributed/003_multi_node/main.py
+    # distributed/multi_gpu/main.py -> distributed/multi_node/main.py
     """Multi-GPU Training example."""
     import logging
     import os
@@ -239,7 +243,7 @@ Click here to see `the source code for this example
                     logger.debug(f"Accuracy: {accuracy.item():.2%}")
                     logger.debug(f"Average Loss: {loss.item()}")
 
-                # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+                # Advance the progress bar one step and update the progress bar text.
                 progress_bar.update(1)
                 progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
             progress_bar.close()

@@ -1,12 +1,12 @@
-003 - Multi-Node (DDP) Job
-=====================================
+Multi-Node (DDP) Job
+====================
 
 
 Prerequisites:
 
-* :ref:`pytorch_setup`
-* :ref:`001 - Single GPU Job`
-* :ref:`002 - Multi-GPU Job`
+* :doc:`/examples/frameworks/pytorch_setup/index`
+* :doc:`/examples/distributed/single_gpu/index`
+* :doc:`/examples/distributed/multi_gpu/index`
 
 Other interesting resources:
 
@@ -15,16 +15,16 @@ Other interesting resources:
 
 
 Click here to see `the source code for this example
-<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/003_multi_node>`_
+<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/multi_node>`_
 
-**Job.sh**
+**job.sh**
 
-.. literalinclude:: examples/distributed/003_multi_node/job.sh.diff
+.. literalinclude:: job.sh.diff
     :language: diff
 
 **main.py**
 
-.. literalinclude:: examples/distributed/003_multi_node/main.py.diff
+.. literalinclude:: main.py.diff
     :language: diff
 
 

@@ -156,7 +156,7 @@ def main():
                 logger.debug(f"Accuracy: {accuracy.item():.2%}")
                 logger.debug(f"Average Loss: {loss.item()}")
 
-            # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+            # Advance the progress bar one step and update the progress bar text.
             progress_bar.update(1)
             progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
         progress_bar.close()