diff --git a/chapters/zh-CN/chapter3/2.mdx b/chapters/zh-CN/chapter3/2.mdx
index 2591001fd..b9d79afa2 100644
--- a/chapters/zh-CN/chapter3/2.mdx
+++ b/chapters/zh-CN/chapter3/2.mdx
@@ -104,7 +104,7 @@ raw_datasets = load_dataset("glue", "mrpc")
 raw_datasets
 ```
 
-```python
+```python out
 DatasetDict({
     train: Dataset({
         features: ['sentence1', 'sentence2', 'label', 'idx'],
@@ -133,10 +133,12 @@ raw_train_dataset[0]
 ```
 
 ```python
-{'idx': 0,
- 'label': 1,
- 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
- 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}
+{
+    "idx": 0,
+    "label": 1,
+    "sentence1": 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
+    "sentence2": 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
+}
 ```
 
 现在可以看到标签已经是整数了,因此不需要对标签做任何预处理。如果想要知道不同数字对应标签的实际含义,我们可以查看 `raw_train_dataset` 的 `features` 。这告诉我们每列的类型:
@@ -146,10 +148,14 @@ raw_train_dataset.features
 ```
 
 ```python
-{'sentence1': Value(dtype='string', id=None),
- 'sentence2': Value(dtype='string', id=None),
- 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
- 'idx': Value(dtype='int32', id=None)}
+{
+    "sentence1": Value(dtype="string", id=None),
+    "sentence2": Value(dtype="string", id=None),
+    "label": ClassLabel(
+        num_classes=2, names=["not_equivalent", "equivalent"], names_file=None, id=None
+    ),
+    "idx": Value(dtype="int32", id=None),
+}
 ```
 
 上面的例子中的 `Label(标签)` 是一种 `ClassLabel(分类标签)` ,也就是使用整数建立起类别标签的映射关系。 `0` 对应于 `not_equivalent(非同义)` , `1` 对应于 `equivalent(同义)` 。
@@ -187,10 +193,26 @@ inputs
 ```
 
 ```python
-{ 
-  'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102],
-  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
-  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+{
+    "input_ids": [
+        101,
+        2023,
+        2003,
+        1996,
+        2034,
+        6251,
+        1012,
+        102,
+        2023,
+        2003,
+        1996,
+        2117,
+        2028,
+        1012,
+        102,
+    ],
+    "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
+    "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 }
 ```
 
@@ -211,14 +233,46 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])
 将得到:
 
 ```python
-['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']
+[
+    "[CLS]",
+    "this",
+    "is",
+    "the",
+    "first",
+    "sentence",
+    ".",
+    "[SEP]",
+    "this",
+    "is",
+    "the",
+    "second",
+    "one",
+    ".",
+    "[SEP]",
+]
 ```
 
 所以我们看到模型需要输入的形式是 `[CLS] sentence1 [SEP] sentence2 [SEP]` 。所以当有两句话的时候, `token类型ID(token_type_ids)` 的值是:
 
 ```python
-['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']
-[ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+[
+    "[CLS]",
+    "this",
+    "is",
+    "the",
+    "first",
+    "sentence",
+    ".",
+    "[SEP]",
+    "this",
+    "is",
+    "the",
+    "second",
+    "one",
+    ".",
+    "[SEP]",
+]
+[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
 ```
 
 现在输入中 `[CLS] sentence1 [SEP]` 它们的 `token_type_ids` 均为 `0` ,而其他部分例如 `sentence2 [SEP]` ,所有的 `token_type_ids` 均为 `1` 。
@@ -267,7 +321,7 @@ tokenized_datasets
 
 🤗Datasets 库进行这种处理的方式是向数据集添加新的字段,每个字段对应预处理函数返回的字典中的每个键:
 
-```python
+```python out
 DatasetDict({
     train: Dataset({
         features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
@@ -348,19 +402,23 @@ batch = data_collator(samples)
 
 {#if fw === 'tf'}
 
 ```python
-{'attention_mask': TensorShape([8, 67]),
- 'input_ids': TensorShape([8, 67]),
- 'token_type_ids': TensorShape([8, 67]),
- 'labels': TensorShape([8])}
+{
+    "attention_mask": TensorShape([8, 67]),
+    "input_ids": TensorShape([8, 67]),
+    "token_type_ids": TensorShape([8, 67]),
+    "labels": TensorShape([8]),
+}
 ```
 
 {:else}
 
 ```python
-{'attention_mask': torch.Size([8, 67]),
- 'input_ids': torch.Size([8, 67]),
- 'token_type_ids': torch.Size([8, 67]),
- 'labels': torch.Size([8])}
+{
+    "attention_mask": torch.Size([8, 67]),
+    "input_ids": torch.Size([8, 67]),
+    "token_type_ids": torch.Size([8, 67]),
+    "labels": torch.Size([8]),
+}
 ```
 
 看起来不错!现在,我们已经从原始文本转化为了模型可以处理的数据,我们准备好对其进行微调。
diff --git a/chapters/zh-CN/chapter5/3.mdx b/chapters/zh-CN/chapter5/3.mdx
index 0feb10d03..163623d9f 100644
--- a/chapters/zh-CN/chapter5/3.mdx
+++ b/chapters/zh-CN/chapter5/3.mdx
@@ -102,6 +102,7 @@ DatasetDict({
 def lowercase_condition(example):
     return {"condition": example["condition"].lower()}
 
+
 drug_dataset.map(lowercase_condition)
 ```
 
@@ -281,8 +282,10 @@ new_drug_dataset = drug_dataset.map(
 ```python
 from transformers import AutoTokenizer
 
+
 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 
+
 def tokenize_function(examples):
     return tokenizer(examples["review"], truncation=True)
 ```
@@ -315,9 +318,11 @@ def tokenize_function(examples):
 ```py
 slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
 
+
 def slow_tokenize_function(examples):
     return slow_tokenizer(examples["review"], truncation=True)
 
+
 tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)
 ```
 
diff --git a/chapters/zh-CN/chapter6/2.mdx b/chapters/zh-CN/chapter6/2.mdx
index 918d0c2f4..1c24c35f4 100644
--- a/chapters/zh-CN/chapter6/2.mdx
+++ b/chapters/zh-CN/chapter6/2.mdx
@@ -76,7 +76,7 @@ def handle_simple_responses(
 ```py
 # 除非你的数据集很小,否则不要直接运行下面的代码!
 # training_corpus = [
-#     raw_datasets["train"][i: i + 1000]["whole_func_string"] 
+#     raw_datasets["train"][i: i + 1000]["whole_func_string"]
 #     for i in range(0, len(raw_datasets["train"]), 1000)
 # ]
 ```
@@ -253,7 +253,7 @@ tokenizer.push_to_hub("code-search-net-tokenizer")
 这将在你的账户中创建一个名为 `code-search-net-tokenizer` 的新仓库,其中将包含 tokenizer 文件。然后,你可以使用 tokenizer 的 `from_pretrained()` 方法从任何地方加载 tokenizer 。
 
 ```py
-# 将下面的 "huggingface-course" 替换为你的用户名来加载你的 tokenizer 
+# 将下面的 "huggingface-course" 替换为你的用户名来加载你的 tokenizer
 tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
 ```
 
diff --git a/chapters/zh-CN/chapter6/3b.mdx b/chapters/zh-CN/chapter6/3b.mdx
index b2b31eaf3..c6c366722 100644
--- a/chapters/zh-CN/chapter6/3b.mdx
+++ b/chapters/zh-CN/chapter6/3b.mdx
@@ -183,7 +183,7 @@ import torch
 sequence_ids = inputs.sequence_ids()
 # 屏蔽除 context 之外的所有内容
 mask = [i != 1 for i in sequence_ids]
-# 不屏蔽 [CLS] token 
+# 不屏蔽 [CLS] token
 mask[0] = False
 mask = torch.tensor(mask)[None]
 
@@ -199,7 +199,7 @@ import tensorflow as tf
 sequence_ids = inputs.sequence_ids()
 # 屏蔽除 context 之外的所有内容
 mask = [i != 1 for i in sequence_ids]
-# 不屏蔽 [CLS] token 
+# 不屏蔽 [CLS] token
 mask[0] = False
 mask = tf.constant(mask)[None]
 
diff --git a/chapters/zh-CN/chapter6/7.mdx b/chapters/zh-CN/chapter6/7.mdx
index cbe85e855..98e16f6fb 100644
--- a/chapters/zh-CN/chapter6/7.mdx
+++ b/chapters/zh-CN/chapter6/7.mdx
@@ -318,7 +318,7 @@ def compute_scores(model):
     scores = {}
     model_loss = compute_loss(model)
     for token, score in model.items():
-        # 我们将保留长度为 1 的 tokens 
+        # 我们将保留长度为 1 的 tokens
         if len(token) == 1:
             continue
         model_without_token = copy.deepcopy(model)
@@ -372,6 +372,7 @@ def tokenize(text, model):
     encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]
     return sum(encoded_words, [])
 
+
 tokenize("This is the Hugging Face course.", model)
 ```
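
For reference, the `chapter3/2.mdx` outputs reformatted above all come from the MRPC sentence-pair preprocessing workflow: paired tokenization (which produces the `token_type_ids`), `Dataset.map` with `batched=True`, and dynamic padding with `DataCollatorWithPadding`. A minimal sketch of that workflow follows; the `bert-base-uncased` checkpoint is an assumption for illustration, since no checkpoint name appears in this diff.

```python
# Sketch of the preprocessing pipeline behind the reformatted outputs above.
# Assumption: "bert-base-uncased" stands in for the chapter's checkpoint.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    # Tokenizing the two sentences as a pair is what yields the
    # [CLS] sentence1 [SEP] sentence2 [SEP] layout and the token_type_ids.
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# batched=True adds the input_ids, token_type_ids and attention_mask columns to every split.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Dynamic padding: each batch is padded only to the length of its longest sample.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})
```

The printed shapes have the `torch.Size([8, 67])` form shown in the hunks above; the second dimension depends on the longest sample in the sampled batch.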