From bfd61b6bc209404ba5f784d546ccbc778a0aeb49 Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Sun, 5 May 2024 19:35:58 -0400
Subject: [PATCH] Add spell-checking that is run by CI

Signed-off-by: Russell Bryant
---
 .github/workflows/spellcheck.yml |  45 ++++++
 .spellcheck-en-custom.txt        | 259 +++++++++++++++++++++++++++++++
 .spellcheck.yml                  |  28 ++++
 Makefile                         |   8 +
 README.md                        |  34 ++--
 5 files changed, 357 insertions(+), 17 deletions(-)
 create mode 100644 .github/workflows/spellcheck.yml
 create mode 100644 .spellcheck-en-custom.txt
 create mode 100644 .spellcheck.yml

diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
new file mode 100644
index 0000000..7067940
--- /dev/null
+++ b/.github/workflows/spellcheck.yml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: Spellcheck
+
+on:
+  push:
+    branches:
+      - "main"
+    paths:
+      - '**.md'
+      - '.github/workflows/spellcheck.yml' # This workflow
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - '**.md'
+      - '.github/workflows/spellcheck.yml' # This workflow
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  spellcheck:
+    name: Spellcheck (en_US)
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1
+        with:
+          egress-policy: audit # TODO: change to 'egress-policy: block' after a couple of runs
+
+      - name: "Checkout"
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
+        with:
+          fetch-depth: 0
+
+      - name: Spellcheck
+        uses: rojopolis/spellcheck-github-actions@dbd2f1da869c05ad874fffeb6fe1ed50cd1a6e98 # v0.36.0
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
new file mode 100644
index 0000000..bb73aa5
--- /dev/null
+++ b/.spellcheck-en-custom.txt
@@ -0,0 +1,259 @@
+
+# make spellcheck-sort
+# Please keep this file sorted:
+# SPDX-License-Identifier: Apache-2.0
+Aakanksha
+aakankshaduggal
+abhi
+Abramson
+Acknowledgements
+Adminstrators
+akashgit
+al
+aldopareja
+alimaredia
+Alina
+alinaryan
+Anh
+anik
+Anza
+Approver
+Approvers
+AQuA
+Arapahoe
+ARB
+arge
+Asghar
+Ashgar
+backend
+Bernardino
+bjhargrave
+Boelkins
+bootable
+bootc
+Byars
+Bzostek
+caradelia
+CDLA
+cdoern
+CHANGELOG
+Chatbot
+Choi
+ckadner
+CLI
+CNCF
+coc
+CoCC's
+codebase
+codebases
+compositional
+Conala
+Corbett
+curation
+cybette
+danmcp
+darrellreimer
+datacenter
+Dataset
+dataset
+datasets
+dave
+DCO
+De
+deployable
+DeSaix
+dev
+DM
+Doern
+Duggal
+Eder
+Eggebrecht
+EPEL
+et
+Fraknoi
+Freeform
+freeform
+Frontend
+frontend
+Gair
+gcc
+Greenlaw
+GSM
+Haver
+Helpsteer
+HH
+hickeyma
+Hidaka
+Hinrichs
+Huben
+ibm
+ics
+ilab
+Illowsky
+inglis
+InstructLab
+integrations
+Jaideep
+jaideepr
+JamesKunstle
+Janssen
+jeremyeder
+JJ
+jjasghar
+joesepi
+Jolla
+Jonick
+judgement
+juliadenham
+Keene
+kelbrown
+khaledsulayman
+Kickstart
+kickstart
+Kordas
+Korol
+Kruse
+Kubernetes
+Kunstle
+Lenovo
+LGTM
+LGTMs
+lhawthorn
+lignment
+LLM
+LLM's
+Lovett
+luke
+Lund
+Lyryx
+Mahbobi
+Maintainership
+maintainership
+mairin
+Máirín
+Makefiles
+Maredia
+markstur
+Marymount
+Masaki
+McElaney
+md
+Merlinite
+mingxzhao
+Miniforge
+Mixtral
+mmcelaney
+MMLU
+Moebs
+mrutkows
+mscherer
+Multivariable
+Musique
+nathan
+nerdalert
+Neth
+NOIRLab
+noone
+Norwood
+NumGLUE
+nvidia
+OASST
+obuzek
+OCI
+ODC
+oindrillac
+Oksana
+Oleg
+openbookQA
+Oswego
+ots
+overfitting
+Pfannestiel
+PII
+Podman
+Prahl
+pre
+Precalculus
+PRM
+pyenv
+PyPI
+pypi
+qa
+quantized
+Quinnipiac
+Rao
+README
+Rebecca
+Redbooks
+redbooks
+Repo
+repo
+resynthesizes
+RHEL
+RHLF
+RHUI
+Roadmap
+Roush
+runtime
+russellb
+Saftey
+Sandhills
+Sanny
+Schlicker
+Schneegurt
+Seminario
+Sepi
+SETI
+sexualized
+shivchander
+signoff
+signoffs
+Silkin
+socio
+soltysh
+SPDX
+Spelman
+Spielman
+spzala
+Stanberry
+Standup
+subdirectory
+Sudalairaj
+supermajority
+Tatlock
+TBD
+templated
+Theopold
+Thi
+Tiemann
+TODO
+Toolbx
+transactional
+Triager
+triagers
+Triaging
+UI
+Urone
+USC
+userspace
+Usings
+Ventura
+venv
+Vickery
+vishnoianil
+VLLM
+Volker
+Vretta
+Wakefield
+Waskiewicz
+weinberg
+Wikimedia
+wikimultihop
+wordmarks
+workstreams
+xukai
+YAML
+yhwang
+Zach
+Zedalis
+Zimmitti
diff --git a/.spellcheck.yml b/.spellcheck.yml
new file mode 100644
index 0000000..f45e84a
--- /dev/null
+++ b/.spellcheck.yml
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+
+matrix:
+- name: markdown
+  aspell:
+    lang: en
+    d: en_US
+  camel-case: true
+  mode: markdown
+  sources:
+  - "**/*.md|!.tox/**|!training/**"
+  dictionary:
+    wordlists:
+    - .spellcheck-en-custom.txt
+  pipeline:
+  - pyspelling.filters.context:
+      context_visible_first: true
+      escapes: '\\[\\`~]'
+      delimiters:
+      # Ignore multiline content between fences (fences can have 3 or more back ticks)
+      # ```language
+      # content
+      # ```
+      - open: '(?s)^(?P<open> *`{3,}).*?$'
+        close: '^(?P=open)$'
+      # Ignore text between inline back ticks
+      - open: '(?P<open>`+)'
+        close: '(?P=open)'
diff --git a/Makefile b/Makefile
index 515946a..3e99296 100644
--- a/Makefile
+++ b/Makefile
@@ -34,6 +34,14 @@ update-training-dir: ## Update the contents of the training directory
 	$(CMD_PREFIX) rm -rf ai-lab-recipes
 	$(CMD_PREFIX) git add training
 
+.PHONY: spellcheck
+spellcheck:
+	$(CMD_PREFIX) python -m pyspelling --config .spellcheck.yml --spellchecker aspell
+
+.PHONY: spellcheck-sort
+spellcheck-sort: .spellcheck-en-custom.txt
+	$(CMD_PREFIX) sort -d -f -o $< $<
+
 # Catch-all target to pass through any other target to the training directory
 %:
 	$(CMD_PREFIX) make -C training $@
diff --git a/README.md b/README.md
index 023907f..a562cb2 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Developer Preview.
 
 For the best experience using the RHEL AI developer preview period, we have
 included a pruned taxonomy tree inside the InstructLab container. This will
-allow for validating training to complete in a reasonable timeframe on a single
+allow for validating training to complete in a reasonable time frame on a single
 server.
 
 - Add your knowledge and skills to this version of the taxonomy. We recommend you add no more than 5 additions to the taxonomy tree to keep the resource requirements reasonable.
@@ -73,8 +73,8 @@ delivery format for base operating system updates.
 
 The container image includes a Linux kernel (in e.g. `/usr/lib/modules`), which
 is used to boot. At runtime on a target system, the base userspace is not itself
-running in a container by default. For example, assuming systemd is in use,
-systemd acts as pid1 as usual - there's no "outer" process.
+running in a container by default. For example, assuming `systemd` is in use,
+`systemd` acts as `pid1` as usual - there's no "outer" process.
 
 In the following example, the bootc container is labeled **Node Base Image**
@@ -85,8 +85,8 @@ and uploading container images could take up to 2 hours.
 
 - RHEL 9.4
 - Connection to the internet (some images are > 15GB)
-- 4 CPU, 16GB RAM, 400GB disk space (tested with AWS EC2 m5.xlarge using GP3 storage)
-- A place to push container images that you will build – e.g., quay.io or another image registry.
+- 4 CPU, 16GB RAM, 400GB disk space (tested with AWS EC2 `m5.xlarge` using GP3 storage)
+- A place to push container images that you will build – e.g., `quay.io` or another image registry.
 
 ## Preparing the Build Host
 
@@ -112,7 +112,7 @@ git clone https://github.com/RedHatOfficial/rhelai-dev-preview
 
 Authenticate to the Red Hat registry ([Red Hat Container Registry
 Authentication](https://access.redhat.com/RegistryAuthentication)) using your
-redhat.com account.
+`redhat.com` account.
 
 ```shell
 podman login registry.redhat.io --username <Your_login_here> --password <Your_password_here>
 ```
 
 Ensure you have an SSH key on the build host. This is used during the driver
-toolkit image build. ([Using ssh-keygen and sharing for key-based authentication
+toolkit image build. ([Using `ssh-keygen` and sharing for key-based authentication
 in Linux | Enable Sysadmin](https://www.redhat.com/sysadmin/configure-ssh-keygen))
 
 ### Creating bootc containers
 
 Build the InstructLab NVIDIA container image.
 
 ```sh
 make instruct-nvidia
 ```
 
-Build the [vllm](https://github.com/vllm-project/vllm) container image.
+Build the [`vllm`](https://github.com/vllm-project/vllm) container image.
 
 ```sh
 make vllm
 ```
 
-Build the [deepspeed](https://www.deepspeed.ai/) container image.
+Build the [`deepspeed`](https://www.deepspeed.ai/) container image.
 
 ```sh
 make deepspeed
 ```
 
@@ -179,7 +179,7 @@ podman push quay.io/<your-user>/nvidia-bootc:latest
 
 [Anaconda](https://docs.anaconda.com/free/anaconda/install/index.html) is the
 Red Hat Enterprise Linux installer, and it is embedded in all RHEL downloadable
 ISO images. The main method of automating RHEL installation is
 via scripts called Kickstart. For more information about Anaconda and
 Kickstart, [read these documents](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/performing_an_advanced_rhel_9_installation/index#what-are-kickstart-installations_kickstart-installation-basics).
 
@@ -217,12 +217,12 @@ Here is an example of a kickstart file. Copy it to a file called
 # reboot
 ```
 
 ### Embed your kickstart into the RHEL Boot ISO
 
 [Download the RHEL 9.4](https://developers.redhat.com/products/rhel/download#rhel-new-product-download-list-61451)
 “Boot ISO”, and use the `mkksiso` command to embed the kickstart into the RHEL
 boot ISO.
 
 ```sh
 mkksiso rhelai-dev-preview-bootc.ks rhel-9.4-x86_64-boot.iso rhelai-dev-preview-bootc-ks.iso
 ```
 
@@ -418,10 +418,10 @@ INFO: Finished server process [1]
 
 With VLLM stopped and the new data generated, the training process can be
 launched using the `ilab train` command. By default, the training process saves
 a model checkpoint after every 4999 samples. You can adjust this using the
-–num-samples parameter. Additionally, training defaults to running for 10
-epochs, which can also be adjusted with the –num-epochs parameter. Generally,
-more epochs are better, but after a certain point, the model can become
-overfitted. It is typically recommended to stay within 10 or fewer epochs and to
+`--num-samples` parameter. Additionally, training defaults to running for 10
+epochs, which can also be adjusted with the `--num-epochs` parameter. Generally,
+more epochs are better, but after a certain point, more epochs will result in
+overfitting. It is typically recommended to stay within 10 or fewer epochs and to
 look at different sample points to find the best result.
 
 ```sh
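 # Illustrative completion, not part of the original patch: this truncated block
 # introduces the `ilab train` invocation described in the paragraph above. The
 # flag values shown simply echo the documented defaults (a checkpoint every
 # 4999 samples, 10 epochs); adjust --num-samples and --num-epochs as needed.
 ilab train --num-samples 4999 --num-epochs 10
 ```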