
Commit

chore(style): add ruff format
Now "make style" will reformat the code.
I also applied the formatting to all files, so CI should no longer fail due to code formatting.
tengomucho committed Jan 29, 2025
1 parent: c84afc9, commit: c233b35
Showing 41 changed files with 149 additions and 163 deletions.
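For reference, here is a minimal sketch of how the updated targets can be run locally (the commands are taken from the Makefile diff below; it is assumed that ruff is installed in the active environment, e.g. the venv used in CI):

    make style_check   # report only: "ruff check ." plus "ruff format . --diff", which exits non-zero if any file would be reformatted (CI runs the equivalent ruff commands directly)
    make style         # "ruff check . --fix" plus "ruff format ." to rewrite files in place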
1 change: 1 addition & 0 deletions .github/workflows/check_code_quality.yml
@@ -48,4 +48,5 @@ jobs:
- name: Check style with ruff
run: |
source venv/bin/activate
ruff format . --diff
ruff check .
2 changes: 2 additions & 0 deletions Makefile
@@ -60,9 +60,11 @@ transformers_examples:
# Run code quality checks
style_check:
ruff check .
ruff format . --diff

style:
ruff check . --fix
ruff format .

# Utilities to release to PyPi
build_dist_install_tools:
2 changes: 1 addition & 1 deletion examples/language-modeling/run_clm.py
@@ -462,7 +462,7 @@ def main():
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)

n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
1 change: 1 addition & 0 deletions examples/question-answering/trainer_qa.py
@@ -15,6 +15,7 @@
"""
A subclass of `Trainer` specific to Question-Answering tasks
"""

import math
import time

1 change: 1 addition & 0 deletions examples/question-answering/trainer_seq2seq_qa.py
@@ -15,6 +15,7 @@
"""
A subclass of `Trainer` specific to Question-Answering tasks
"""

import math
import time
from typing import Dict, List, Optional
1 change: 1 addition & 0 deletions examples/question-answering/utils_qa.py
@@ -15,6 +15,7 @@
"""
Post-processing utilities for question answering.
"""

import collections
import json
import logging
6 changes: 3 additions & 3 deletions examples/summarization/run_summarization.py
@@ -525,9 +525,9 @@ def main():
return

if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
assert (
data_args.lang is not None
), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
assert data_args.lang is not None, (
f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
)

tokenizer.src_lang = data_args.lang
tokenizer.tgt_lang = data_args.lang
14 changes: 7 additions & 7 deletions examples/text-classification/run_glue.py
@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE."""
"""Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.

import logging
@@ -158,9 +158,9 @@ def __post_init__(self):
train_extension = self.train_file.split(".")[-1]
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
validation_extension = self.validation_file.split(".")[-1]
assert (
validation_extension == train_extension
), "`validation_file` should have the same extension (csv or json) as `train_file`."
assert validation_extension == train_extension, (
"`validation_file` should have the same extension (csv or json) as `train_file`."
)


@dataclass
@@ -329,9 +329,9 @@ def main():
if data_args.test_file is not None:
train_extension = data_args.train_file.split(".")[-1]
test_extension = data_args.test_file.split(".")[-1]
assert (
test_extension == train_extension
), "`test_file` should have the same extension (csv or json) as `train_file`."
assert test_extension == train_extension, (
"`test_file` should have the same extension (csv or json) as `train_file`."
)
data_files["test"] = data_args.test_file
else:
raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
4 changes: 2 additions & 2 deletions examples/text-classification/run_xnli.py
@@ -14,8 +14,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM).
Adapted from `examples/text-classification/run_glue.py`"""
"""Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM).
Adapted from `examples/text-classification/run_glue.py`"""

import logging
import os
6 changes: 3 additions & 3 deletions notebooks/sentence-transformers/getting-started.ipynb
@@ -103,14 +103,14 @@
"\n",
"# Run inference\n",
"prompt = \"I like to eat apples\"\n",
"encoded_input = tokenizer(prompt, return_tensors='pt')\n",
"encoded_input = tokenizer(prompt, return_tensors=\"pt\")\n",
"outputs = model(**encoded_input)\n",
"\n",
"token_embeddings = outputs.token_embeddings\n",
"sentence_embedding = outputs.sentence_embedding\n",
"\n",
"print(f\"token embeddings: {token_embeddings.shape}\") # torch.Size([1, 7, 384])\n",
"print(f\"sentence_embedding: {sentence_embedding.shape}\") # torch.Size([1, 384])"
"print(f\"token embeddings: {token_embeddings.shape}\") # torch.Size([1, 7, 384])\n",
"print(f\"sentence_embedding: {sentence_embedding.shape}\") # torch.Size([1, 384])"
]
},
{
8 changes: 5 additions & 3 deletions notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb
@@ -86,7 +86,9 @@
"outputs": [],
"source": [
"# Push and share your model to the HuggingFace hub\n",
"repository_id = \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-2-1-neuronx\".\n",
"repository_id = (\n",
" \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-2-1-neuronx\".\n",
")\n",
"stable_diffusion.push_to_hub(save_directory, repository_id=repository_id, use_auth_token=True)"
]
},
@@ -659,7 +661,7 @@
" \"engineers eating lunch at the opera\",\n",
" \"panda eating bamboo on a plane\",\n",
" \"A digital illustration of a steampunk flying machine in the sky with cogs and mechanisms, 4k, detailed, trending in artstation, fantasy vivid colors\",\n",
" \"kids playing soccer at the FIFA World Cup\"\n",
" \"kids playing soccer at the FIFA World Cup\",\n",
"]\n",
"\n",
"\n",
@@ -675,7 +677,7 @@
" print(f\"[Inference Time] {np.round(inf_time, 2)} seconds.\")\n",
" image.save(\"image.png\")\n",
" image = mpimg.imread(\"image.png\")\n",
" #clear_output(wait=True)\n",
" # clear_output(wait=True)\n",
" plt.imshow(image)\n",
" plt.show()"
]
8 changes: 5 additions & 3 deletions notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb
@@ -87,7 +87,9 @@
"outputs": [],
"source": [
"# Push and share your model to the HuggingFace hub\n",
"repository_id = \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-xl-base-1.0-neuronx\".\n",
"repository_id = (\n",
" \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-xl-base-1.0-neuronx\".\n",
")\n",
"stable_diffusion_xl.push_to_hub(save_directory, repository_id=repository_id, use_auth_token=True)"
]
},
@@ -708,7 +710,7 @@
" \"engineers eating lunch at the opera\",\n",
" \"panda eating bamboo on a plane\",\n",
" \"A digital illustration of a steampunk flying machine in the sky with cogs and mechanisms, 4k, detailed, trending in artstation, fantasy vivid colors\",\n",
" \"kids playing soccer at the FIFA World Cup\"\n",
" \"kids playing soccer at the FIFA World Cup\",\n",
"]\n",
"\n",
"\n",
@@ -724,7 +726,7 @@
" print(f\"[Inference Time] {np.round(inf_time, 2)} seconds.\")\n",
" image.save(\"image.png\")\n",
" image = mpimg.imread(\"image.png\")\n",
" #clear_output(wait=True)\n",
" # clear_output(wait=True)\n",
" plt.imshow(image)\n",
" plt.show()"
]
14 changes: 8 additions & 6 deletions notebooks/text-classification/notebook.ipynb
@@ -118,8 +118,8 @@
"from random import randrange\n",
"\n",
"\n",
"random_id = randrange(len(raw_dataset['train']))\n",
"raw_dataset['train'][random_id]\n",
"random_id = randrange(len(raw_dataset[\"train\"]))\n",
"raw_dataset[\"train\"][random_id]\n",
"# {'text': 'i feel isolated and alone in my trade', 'label': 0}"
]
},
@@ -152,18 +152,20 @@
"# Load Tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"\n",
"\n",
"# Tokenize helper function\n",
"def tokenize(batch):\n",
" return tokenizer(batch['text'], padding='max_length', truncation=True,return_tensors=\"pt\")\n",
" return tokenizer(batch[\"text\"], padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
"\n",
"\n",
"# Tokenize dataset\n",
"raw_dataset = raw_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n",
"raw_dataset = raw_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n",
"tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n",
"tokenized_dataset = tokenized_dataset.with_format(\"torch\")\n",
"\n",
"# save dataset to disk\n",
"tokenized_dataset[\"train\"].save_to_disk(os.path.join(save_dataset_path,\"train\"))\n",
"tokenized_dataset[\"test\"].save_to_disk(os.path.join(save_dataset_path,\"eval\"))"
"tokenized_dataset[\"train\"].save_to_disk(os.path.join(save_dataset_path, \"train\"))\n",
"tokenized_dataset[\"test\"].save_to_disk(os.path.join(save_dataset_path, \"eval\"))"
]
},
{
17 changes: 10 additions & 7 deletions notebooks/text-generation/CodeLlama-7B-Compilation.ipynb
@@ -97,8 +97,9 @@
"from optimum.neuron import pipeline\n",
"\n",
"\n",
"p = pipeline('text-generation', 'aws-neuron/CodeLlama-7b-hf-neuron-8xlarge')\n",
"p(\"import socket\\n\\ndef ping_exponential_backoff(host: str):\",\n",
"p = pipeline(\"text-generation\", \"aws-neuron/CodeLlama-7b-hf-neuron-8xlarge\")\n",
"p(\n",
" \"import socket\\n\\ndef ping_exponential_backoff(host: str):\",\n",
" do_sample=True,\n",
" top_k=10,\n",
" temperature=0.1,\n",
@@ -191,10 +192,12 @@
"from optimum.neuron import NeuronModelForCausalLM\n",
"\n",
"\n",
"#num_cores should be changed based on the instance. inf2.24xlarge has 6 neuron processors (they have two cores each) so 12 total\n",
"compiler_args = {\"num_cores\": 2, \"auto_cast_type\": 'fp16'}\n",
"# num_cores should be changed based on the instance. inf2.24xlarge has 6 neuron processors (they have two cores each) so 12 total\n",
"compiler_args = {\"num_cores\": 2, \"auto_cast_type\": \"fp16\"}\n",
"input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n",
"model = NeuronModelForCausalLM.from_pretrained(\"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes)"
"model = NeuronModelForCausalLM.from_pretrained(\n",
" \"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes\n",
")"
]
},
{
@@ -214,7 +217,7 @@
"metadata": {},
"outputs": [],
"source": [
"model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")\n"
"model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")"
]
},
{
@@ -255,7 +258,7 @@
"from huggingface_hub.hf_api import HfFolder\n",
"\n",
"\n",
"HfFolder.save_token('MY_HUGGINGFACE_TOKEN_HERE')"
"HfFolder.save_token(\"MY_HUGGINGFACE_TOKEN_HERE\")"
]
},
{
41 changes: 16 additions & 25 deletions notebooks/text-generation/llama2-13b-chatbot.ipynb
@@ -103,13 +103,11 @@
"from optimum.neuron import NeuronModelForCausalLM\n",
"\n",
"\n",
"compiler_args = {\"num_cores\": 24, \"auto_cast_type\": 'fp16'}\n",
"compiler_args = {\"num_cores\": 24, \"auto_cast_type\": \"fp16\"}\n",
"input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n",
"model = NeuronModelForCausalLM.from_pretrained(\n",
" \"NousResearch/Llama-2-13b-chat-hf\",\n",
" export=True,\n",
" **compiler_args,\n",
" **input_shapes)"
" \"NousResearch/Llama-2-13b-chat-hf\", export=True, **compiler_args, **input_shapes\n",
")"
]
},
{
@@ -177,7 +175,7 @@
"from huggingface_hub import whoami\n",
"\n",
"\n",
"org = whoami()['name']\n",
"org = whoami()[\"name\"]\n",
"\n",
"repo_id = f\"{org}/llama-2-13b-chat-neuron\"\n",
"\n",
@@ -245,7 +243,7 @@
" model\n",
"except NameError:\n",
" # Edit this to use another base model\n",
" model = NeuronModelForCausalLM.from_pretrained('aws-neuron/Llama-2-13b-chat-hf-neuron-latency')"
" model = NeuronModelForCausalLM.from_pretrained(\"aws-neuron/Llama-2-13b-chat-hf-neuron-latency\")"
]
},
{
@@ -290,12 +288,7 @@
"outputs": [],
"source": [
"inputs = tokenizer(\"What is deep-learning ?\", return_tensors=\"pt\")\n",
"outputs = model.generate(**inputs,\n",
" max_new_tokens=128,\n",
" do_sample=True,\n",
" temperature=0.9,\n",
" top_k=50,\n",
" top_p=0.9)\n",
"outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.9, top_k=50, top_p=0.9)\n",
"tokenizer.batch_decode(outputs, skip_special_tokens=True)"
]
},
@@ -323,7 +316,7 @@
"outputs": [],
"source": [
"def format_chat_prompt(message, history, max_tokens):\n",
" \"\"\" Convert a history of messages to a chat prompt\n",
" \"\"\"Convert a history of messages to a chat prompt\n",
" Args:\n",
" message(str): the new user message.\n",
" history (List[str]): the list of user messages and assistant responses.\n",
@@ -334,10 +327,10 @@
" chat = []\n",
" # Convert all messages in history to chat interactions\n",
" for interaction in history:\n",
" chat.append({\"role\": \"user\", \"content\" : interaction[0]})\n",
" chat.append({\"role\": \"assistant\", \"content\" : interaction[1]})\n",
" chat.append({\"role\": \"user\", \"content\": interaction[0]})\n",
" chat.append({\"role\": \"assistant\", \"content\": interaction[1]})\n",
" # Add the new message\n",
" chat.append({\"role\": \"user\", \"content\" : message})\n",
" chat.append({\"role\": \"user\", \"content\": message})\n",
" # Generate the prompt, verifying that we don't go beyond the maximum number of tokens\n",
" for i in range(0, len(chat), 2):\n",
" # Generate candidate prompt with the last n-i entries\n",
@@ -372,19 +365,17 @@
"history = []\n",
"max_tokens = 1024\n",
"\n",
"\n",
"def chat(message, history, max_tokens):\n",
" prompt = format_chat_prompt(message, history, max_tokens)\n",
" # Uncomment the line below to see what the formatted prompt looks like\n",
" #print(prompt)\n",
" # print(prompt)\n",
" inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
" outputs = model.generate(**inputs,\n",
" max_length=2048,\n",
" do_sample=True,\n",
" temperature=0.9,\n",
" top_k=50,\n",
" repetition_penalty=1.2)\n",
" outputs = model.generate(\n",
" **inputs, max_length=2048, do_sample=True, temperature=0.9, top_k=50, repetition_penalty=1.2\n",
" )\n",
" # Do not include the input tokens\n",
" outputs = outputs[0, inputs.input_ids.size(-1):]\n",
" outputs = outputs[0, inputs.input_ids.size(-1) :]\n",
" response = tokenizer.decode(outputs, skip_special_tokens=True)\n",
" history.append([message, response])\n",
" return response"