From bfd61b6bc209404ba5f784d546ccbc778a0aeb49 Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Sun, 5 May 2024 19:35:58 -0400
Subject: [PATCH] Add spell-checking that is run by CI

Signed-off-by: Russell Bryant
---
 .github/workflows/spellcheck.yml |  45 ++++++
 .spellcheck-en-custom.txt        | 259 +++++++++++++++++++++++++++++++
 .spellcheck.yml                  |  28 ++++
 Makefile                         |   8 +
 README.md                        |  34 ++--
 5 files changed, 357 insertions(+), 17 deletions(-)
 create mode 100644 .github/workflows/spellcheck.yml
 create mode 100644 .spellcheck-en-custom.txt
 create mode 100644 .spellcheck.yml

diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
new file mode 100644
index 0000000..7067940
--- /dev/null
+++ b/.github/workflows/spellcheck.yml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: Spellcheck
+
+on:
+  push:
+    branches:
+      - "main"
+    paths:
+      - '**.md'
+      - '.github/workflows/spellcheck.yml' # This workflow
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - '**.md'
+      - '.github/workflows/spellcheck.yml' # This workflow
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  spellcheck:
+    name: Spellcheck (en_US)
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1
+        with:
+          egress-policy: audit # TODO: change to 'egress-policy: block' after a couple of runs
+
+      - name: "Checkout"
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
+        with:
+          fetch-depth: 0
+
+      - name: Spellcheck
+        uses: rojopolis/spellcheck-github-actions@dbd2f1da869c05ad874fffeb6fe1ed50cd1a6e98 # v0.36.0
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
new file mode 100644
index 0000000..bb73aa5
--- /dev/null
+++ b/.spellcheck-en-custom.txt
@@ -0,0 +1,259 @@
+
+# make spellcheck-sort
+# Please keep this file sorted:
+# SPDX-License-Identifier: Apache-2.0
+Aakanksha
+aakankshaduggal
+abhi
+Abramson
+Acknowledgements
+Adminstrators
+akashgit
+al
+aldopareja
+alimaredia
+Alina
+alinaryan
+Anh
+anik
+Anza
+Approver
+Approvers
+AQuA
+Arapahoe
+ARB
+arge
+Asghar
+Ashgar
+backend
+Bernardino
+bjhargrave
+Boelkins
+bootable
+bootc
+Byars
+Bzostek
+caradelia
+CDLA
+cdoern
+CHANGELOG
+Chatbot
+Choi
+ckadner
+CLI
+CNCF
+coc
+CoCC's
+codebase
+codebases
+compositional
+Conala
+Corbett
+curation
+cybette
+danmcp
+darrellreimer
+datacenter
+Dataset
+dataset
+datasets
+dave
+DCO
+De
+deployable
+DeSaix
+dev
+DM
+Doern
+Duggal
+Eder
+Eggebrecht
+EPEL
+et
+Fraknoi
+Freeform
+freeform
+Frontend
+frontend
+Gair
+gcc
+Greenlaw
+GSM
+Haver
+Helpsteer
+HH
+hickeyma
+Hidaka
+Hinrichs
+Huben
+ibm
+ics
+ilab
+Illowsky
+inglis
+InstructLab
+integrations
+Jaideep
+jaideepr
+JamesKunstle
+Janssen
+jeremyeder
+JJ
+jjasghar
+joesepi
+Jolla
+Jonick
+judgement
+juliadenham
+Keene
+kelbrown
+khaledsulayman
+Kickstart
+kickstart
+Kordas
+Korol
+Kruse
+Kubernetes
+Kunstle
+Lenovo
+LGTM
+LGTMs
+lhawthorn
+lignment
+LLM
+LLM's
+Lovett
+luke
+Lund
+Lyryx
+Mahbobi
+Maintainership
+maintainership
+mairin
+Máirín
+Makefiles
+Maredia
+markstur
+Marymount
+Masaki
+McElaney
+md
+Merlinite
+mingxzhao
+Miniforge
+Mixtral
+mmcelaney
+MMLU
+Moebs
+mrutkows
+mscherer
+Multivariable
+Musique
+nathan
+nerdalert
+Neth
+NOIRLab
+noone
+Norwood
+NumGLUE
+nvidia
+OASST
+obuzek
+OCI
+ODC
+oindrillac
+Oksana
+Oleg
+openbookQA
+Oswego
+ots
+overfitting
+Pfannestiel
+PII
+Podman
+Prahl
+pre
+Precalculus
+PRM
+pyenv
+PyPI
+pypi
+qa
+quantized
+Quinnipiac
+Rao
+README
+Rebecca
+Redbooks
+redbooks
+Repo
+repo
+resynthesizes
+RHEL
+RHLF
+RHUI
+Roadmap
+Roush
+runtime
+russellb
+Saftey
+Sandhills
+Sanny
+Schlicker
+Schneegurt
+Seminario
+Sepi
+SETI
+sexualized
+shivchander
+signoff
+signoffs
+Silkin
+socio
+soltysh
+SPDX
+Spelman
+Spielman
+spzala
+Stanberry
+Standup
+subdirectory
+Sudalairaj
+supermajority
+Tatlock
+TBD
+templated
+Theopold
+Thi
+Tiemann
+TODO
+Toolbx
+transactional
+Triager
+triagers
+Triaging
+UI
+Urone
+USC
+userspace
+Usings
+Ventura
+venv
+Vickery
+vishnoianil
+VLLM
+Volker
+Vretta
+Wakefield
+Waskiewicz
+weinberg
+Wikimedia
+wikimultihop
+wordmarks
+workstreams
+xukai
+YAML
+yhwang
+Zach
+Zedalis
+Zimmitti
diff --git a/.spellcheck.yml b/.spellcheck.yml
new file mode 100644
index 0000000..f45e84a
--- /dev/null
+++ b/.spellcheck.yml
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+
+matrix:
+- name: markdown
+  aspell:
+    lang: en
+    d: en_US
+  camel-case: true
+  mode: markdown
+  sources:
+  - "**/*.md|!.tox/**|!training/**"
+  dictionary:
+    wordlists:
+    - .spellcheck-en-custom.txt
+  pipeline:
+  - pyspelling.filters.context:
+      context_visible_first: true
+      escapes: '\\[\\`~]'
+      delimiters:
+      # Ignore multiline content between fences (fences can have 3 or more back ticks)
+      # ```language
+      # content
+      # ```
+      - open: '(?s)^(?P<open> *`{3,}).*?$'
+        close: '^(?P=open)$'
+      # Ignore text between inline back ticks
+      - open: '(?P<open>`+)'
+        close: '(?P=open)'
diff --git a/Makefile b/Makefile
index 515946a..3e99296 100644
--- a/Makefile
+++ b/Makefile
@@ -34,6 +34,14 @@ update-training-dir: ## Update the contents of the training directory
 	$(CMD_PREFIX) rm -rf ai-lab-recipes
 	$(CMD_PREFIX) git add training
 
+.PHONY: spellcheck
+spellcheck:
+	$(CMD_PREFIX) python -m pyspelling --config .spellcheck.yml --spellchecker aspell
+
+.PHONY: spellcheck-sort
+spellcheck-sort: .spellcheck-en-custom.txt
+	$(CMD_PREFIX) sort -d -f -o $< $<
+
 # Catch-all target to pass through any other target to the training directory
 %:
 	$(CMD_PREFIX) make -C training $@
diff --git a/README.md b/README.md
index 023907f..a562cb2 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Developer Preview.
 
 For the best experience using the RHEL AI developer preview period, we have
 included a pruned taxonomy tree inside the InstructLab container. This will
-allow for validating training to complete in a reasonable timeframe on a single
+allow for validating training to complete in a reasonable time frame on a single
 server.
 
 - Add your knowledge and skills to this version of the taxonomy. We recommend you add no more than 5 additions to the taxonomy tree to keep the resource requirements reasonable.
@@ -73,8 +73,8 @@ delivery format for base operating system updates.
 
 The container image includes a Linux kernel (in e.g. `/usr/lib/modules`), which
 is used to boot. At runtime on a target system, the base userspace is not itself
-running in a container by default. For example, assuming systemd is in use,
-systemd acts as pid1 as usual - there's no "outer" process.
+running in a container by default. For example, assuming `systemd` is in use,
+`systemd` acts as `pid1` as usual - there's no "outer" process.
 
 In the following example, the bootc container is labeled **Node Base Image**
@@ -85,8 +85,8 @@ and uploading container images could take up to 2 hours.
 
 - RHEL 9.4
 - Connection to the internet (some images are > 15GB)
-- 4 CPU, 16GB RAM, 400GB disk space (tested with AWS EC2 m5.xlarge using GP3 storage)
-- A place to push container images that you will build – e.g., quay.io or another image registry.
+- 4 CPU, 16GB RAM, 400GB disk space (tested with AWS EC2 `m5.xlarge` using GP3 storage)
+- A place to push container images that you will build – e.g., `quay.io` or another image registry.
 
 ## Preparing the Build Host
 
@@ -112,7 +112,7 @@ git clone https://github.com/RedHatOfficial/rhelai-dev-preview
 
 Authenticate to the Red Hat registry ([Red Hat Container Registry
 Authentication](https://access.redhat.com/RegistryAuthentication)) using your
-redhat.com account.
+`redhat.com` account.
 
 ```shell
 podman login registry.redhat.io --username <Your_login_here> --password <Your_password_here>
 ```
 
 Ensure you have an SSH key on the build host. This is used during the driver
-toolkit image build. ([Using ssh-keygen and sharing for key-based authentication
+toolkit image build. ([Using `ssh-keygen` and sharing for key-based authentication
 in Linux | Enable Sysadmin](https://www.redhat.com/sysadmin/configure-ssh-keygen))
 
 ### Creating bootc containers
 
 Build the InstructLab NVIDIA container image.
 
 ```sh
 make instruct-nvidia
 ```
 
-Build the [vllm](https://github.com/vllm-project/vllm) container image.
+Build the [`vllm`](https://github.com/vllm-project/vllm) container image.
 
 ```sh
 make vllm
 ```
 
-Build the [deepspeed](https://www.deepspeed.ai/) container image.
+Build the [`deepspeed`](https://www.deepspeed.ai/) container image.
 
 ```sh
 make deepspeed
 ```
 
@@ -179,7 +179,7 @@ podman push quay.io/<your-user>/nvidia-bootc:latest
 
 [Anaconda](https://docs.anaconda.com/free/anaconda/install/index.html) is the
 Red Hat Enterprise Linux installer, and it is embedded in all RHEL downloadable
 ISO images. The main method of automating RHEL installation is
 via scripts called Kickstart. For more information about Anaconda and
 Kickstart, [read these documents](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html-single/performing_an_advanced_rhel_9_installation/index#what-are-kickstart-installations_kickstart-installation-basics).
 
@@ -217,12 +217,12 @@ Here is an example of a kickstart file. Copy it to a file called
 # reboot
 ```
 
 ### Embed your kickstart into the RHEL Boot ISO
 
 [Download the RHEL 9.4](https://developers.redhat.com/products/rhel/download#rhel-new-product-download-list-61451)
 “Boot ISO”, and use the `mkksiso` command to embed the kickstart into the RHEL
 boot ISO.
 
 ```sh
 mkksiso rhelai-dev-preview-bootc.ks rhel-9.4-x86_64-boot.iso rhelai-dev-preview-bootc-ks.iso
 ```
 
@@ -418,10 +418,10 @@ INFO: Finished server process [1]
 
 With VLLM stopped and the new data generated, the training process can be
 launched using the `ilab train` command. By default, the training process saves
 a model checkpoint after every 4999 samples. You can adjust this using the
-–num-samples parameter. Additionally, training defaults to running for 10
-epochs, which can also be adjusted with the –num-epochs parameter. Generally,
-more epochs are better, but after a certain point, the model can become
-overfitted. It is typically recommended to stay within 10 or fewer epochs and to
+`--num-samples` parameter. Additionally, training defaults to running for 10
+epochs, which can also be adjusted with the `--num-epochs` parameter. Generally,
+more epochs are better, but after a certain point, more epochs will result in
+overfitting. It is typically recommended to stay within 10 or fewer epochs and to
 look at different sample points to find the best result.
 
 ```sh
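 # Illustrative completion, not part of the original patch: this truncated block
 # introduces the `ilab train` invocation described in the paragraph above. The
 # flag values shown simply echo the documented defaults (a checkpoint every
 # 4999 samples, 10 epochs); adjust --num-samples and --num-epochs as needed.
 ilab train --num-samples 4999 --num-epochs 10
 ```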