Using pretrained models in step_tokenize_sentencepiece #278

Open
jrosell opened this issue Dec 10, 2024 · 2 comments
Labels: feature (a feature request or enhancement), question

Comments

jrosell commented Dec 10, 2024

I'm trying to get BPE tokenization working in step_tokenize_sentencepiece.

Could I use an already trained model? Here are some examples trying different approaches:

library(tidymodels)
library(textrecipes)
library(sentencepiece)

dataf <- data.frame(
  "text" = c("positive sentiment", "super neg", "bad outcome", "good results"),
  "label_col" = c("pos", "neg", "neg", "pos", "neg")
)

test <- data.frame(
  "text" = c("negative results", "neg"),
  "label_col" = c("neg", "neg")
)

rec0 <- dataf |> 
  recipe(label_col ~ text) |> 
  step_tokenize_sentencepiece(text, vocabulary_size = 25) |> 
  step_tfidf(text) 

# It splits by character, not by BPE
rec0 |> 
  prep() |> 
  juice() |>
  print()


# 0.853 for .pred_neg in the first case
rec0 |> 
  workflow(logistic_reg()) |> 
  fit(dataf) |> 
  augment(test, type = c("prob", "class"))

# I can't split by BPE in step_tokenize_sentencepiece
tryCatch({
    rec1 <- dataf |> 
      recipe(label_col ~ text) |> 
      step_tokenize_sentencepiece(text, vocabulary_size = 25, options = list(type = "bpe")) |> 
      step_tfidf(text) 

    rec1 |> 
      prep() |> 
      juice() |>
      print()
  },
  error = \(e) message(e)
)

# I can do it in step_tokenize_bpe
rec2 <- dataf |> 
  recipe(label_col ~ text) |> 
  step_tokenize_bpe(text) |>   
  step_tfidf(text) 

rec2 |> 
  prep() |> 
  juice() |>
  print()

# 1.00 for .pred_neg in the first case
rec2 |> 
  workflow(logistic_reg()) |> 
  fit(dataf) |> 
  augment(test, type = c("prob", "class"))


# Using a sentencepiece trained model before
all <- bind_rows(dataf |> mutate(data = "train"), test |> mutate(data = "test"))
download.file("https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model", "en.wiki.bpe.vs1000.model")
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |> 
  tokenlist()

rec3 <- all |> 
  filter(data == "train") |>
  recipe(label_col ~ text) |> 
  step_tfidf(text) 

rec3 |> 
  prep() |> 
  juice() |>
  print()

rec3 |> 
  workflow(logistic_reg()) |> 
  fit(all |> filter(data == "train")) |> 
  augment(all |> filter(data == "test"), type = c("prob", "class"))



# Using a sentencepiece trained model before
# Using word embeddings instead of step_tfidf
all <- bind_rows(dataf |> mutate(data = "train"), test |> mutate(data = "test"))
download.file("https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model", "en.wiki.bpe.vs1000.model")
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")
# One-column "embedding": each token is represented by its sentencepiece id
embeddings <- tibble(
  tokens = sentencepiece_encode(model, all$text, type = "subwords") |> unlist() |> unique(),
  ids = sentencepiece_encode(model, all$text, type = "ids") |> unlist() |> unique()
)
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |> 
  tokenlist()
rec4 <- all |> 
  filter(data == "train") |>
  recipe(label_col ~ text) |> 
  step_word_embeddings(text, embeddings = embeddings, aggregation = "sum", keep_original_cols = TRUE, prefix = "sum") |> 
  step_word_embeddings(text, embeddings = embeddings, aggregation = "mean", keep_original_cols = TRUE, prefix = "mean") |> 
  step_word_embeddings(text, embeddings = embeddings, aggregation = "max", keep_original_cols = TRUE, prefix = "max") |> 
  step_rm(text)

rec4 |> 
  prep() |> 
  juice() |>
  print()

rec4 |> 
  workflow(logistic_reg()) |> 
  fit(all |> filter(data == "train")) |> 
  augment(all |> filter(data == "test"), type = c("prob", "class"))
EmilHvitfeldt (Member) commented Dec 10, 2024

First part:

The reason you are getting characters when you were expecting BPE is that the vocabulary size is set too small. A vocabulary_size of 25 is too low to allow for combined characters.

library(textrecipes)

dataf <- data.frame(
  "text" = c("positive sentiment", "super neg", "bad outcome", "good results"),
  "label_col" = c("pos", "neg", "neg", "pos")
)

rec0 <- dataf |> 
  recipe(label_col ~ text) |> 
  step_tokenize_sentencepiece(text, vocabulary_size = 100) |> 
  step_tfidf(text) 

rec0 |> 
  prep() |> 
  juice() |>
  names()
#>   [1] "label_col"             "tfidf_text_</s>"       "tfidf_text_<s>"       
#>   [4] "tfidf_text_<unk>"      "tfidf_text_▁"          "tfidf_text_▁b"        
#>   [7] "tfidf_text_▁bad"       "tfidf_text_▁g"         "tfidf_text_▁go"       
#>  [10] "tfidf_text_▁good"      "tfidf_text_▁n"         "tfidf_text_▁neg"      
#>  [13] "tfidf_text_▁o"         "tfidf_text_▁ou"        "tfidf_text_▁out"      
#>  [16] "tfidf_text_▁outcome"   "tfidf_text_▁p"         "tfidf_text_▁pos"      
#>  [19] "tfidf_text_▁positive"  "tfidf_text_▁r"         "tfidf_text_▁re"       
#>  [22] "tfidf_text_▁results"   "tfidf_text_▁s"         "tfidf_text_▁sen"      
#>  [25] "tfidf_text_▁sentiment" "tfidf_text_▁su"        "tfidf_text_▁super"    
#>  [28] "tfidf_text_a"          "tfidf_text_ad"         "tfidf_text_b"         
#>  [31] "tfidf_text_ba"         "tfidf_text_bad"        "tfidf_text_c"         
#>  [34] "tfidf_text_co"         "tfidf_text_com"        "tfidf_text_come"      
#>  [37] "tfidf_text_d"          "tfidf_text_e"          "tfidf_text_eg"        
#>  [40] "tfidf_text_en"         "tfidf_text_ent"        "tfidf_text_enti"      
#>  [43] "tfidf_text_er"         "tfidf_text_es"         "tfidf_text_esu"       
#>  [46] "tfidf_text_g"          "tfidf_text_go"         "tfidf_text_goo"       
#>  [49] "tfidf_text_i"          "tfidf_text_im"         "tfidf_text_it"        
#>  [52] "tfidf_text_iti"        "tfidf_text_itive"      "tfidf_text_iv"        
#>  [55] "tfidf_text_l"          "tfidf_text_lt"         "tfidf_text_lts"       
#>  [58] "tfidf_text_m"          "tfidf_text_me"         "tfidf_text_men"       
#>  [61] "tfidf_text_n"          "tfidf_text_ne"         "tfidf_text_neg"       
#>  [64] "tfidf_text_nt"         "tfidf_text_o"          "tfidf_text_od"        
#>  [67] "tfidf_text_om"         "tfidf_text_oo"         "tfidf_text_os"        
#>  [70] "tfidf_text_osi"        "tfidf_text_ou"         "tfidf_text_out"       
#>  [73] "tfidf_text_p"          "tfidf_text_pe"         "tfidf_text_per"       
#>  [76] "tfidf_text_po"         "tfidf_text_pos"        "tfidf_text_r"         
#>  [79] "tfidf_text_re"         "tfidf_text_s"          "tfidf_text_se"        
#>  [82] "tfidf_text_sen"        "tfidf_text_si"         "tfidf_text_su"        
#>  [85] "tfidf_text_sul"        "tfidf_text_sults"      "tfidf_text_sup"       
#>  [88] "tfidf_text_t"          "tfidf_text_tc"         "tfidf_text_tco"       
#>  [91] "tfidf_text_ti"         "tfidf_text_tim"        "tfidf_text_timent"    
#>  [94] "tfidf_text_tiv"        "tfidf_text_ts"         "tfidf_text_u"         
#>  [97] "tfidf_text_ul"         "tfidf_text_up"         "tfidf_text_ut"        
#> [100] "tfidf_text_v"          "tfidf_text_ve"

Created on 2024-12-10 with reprex v2.1.0

EmilHvitfeldt (Member) commented:

It is not currently possible to use a pretrained model, but it is a valid thing to want to do.
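
In the meantime, one possible workaround (a sketch, not an official textrecipes feature; sp_tokenize is just a hypothetical wrapper name) is to load the pretrained model outside the recipe and pass it through the custom_token argument of step_tokenize(), which accepts any function that takes a character vector and returns a list of character vectors:

library(textrecipes)
library(sentencepiece)

# Assumes the pretrained model downloaded in the examples above
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")

# sentencepiece_encode() already returns a list of character vectors,
# which is the shape custom_token expects
sp_tokenize <- function(x) {
  sentencepiece_encode(model, x, type = "subwords")
}

rec5 <- dataf |>
  recipe(label_col ~ text) |>
  step_tokenize(text, custom_token = sp_tokenize) |>
  step_tfidf(text)

Because nothing is trained at prep() time, the training and test sets are tokenized with the same pretrained vocabulary.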

EmilHvitfeldt added the feature (a feature request or enhancement) and question labels on Dec 10, 2024
jrosell changed the title from "Using bpe in step_tokenize_sentencepiece should work" to "Using pretrained models in step_tokenize_sentencepiece should work" on Dec 19, 2024
jrosell changed the title from "Using pretrained models in step_tokenize_sentencepiece should work" to "Using pretrained models in step_tokenize_sentencepiece" on Dec 19, 2024