fix: escape regexes (use raw string literals for the regex patterns)

snapADDY · May 24, 2023 · 64c3c1f
1 parent 54a84ec
commit 64c3c1f
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 16 deletions.
diff --git a/jieba_pyfast/__init__.py b/jieba_pyfast/__init__.py
@@ -38,16 +38,16 @@
 
 pool = None
 
-re_userdict = re.compile("^(.+?)( [0-9]+)?( [a-z]+)?$", re.U)
+re_userdict = re.compile(r"^(.+?)( [0-9]+)?( [a-z]+)?$", re.U)
 
-re_eng = re.compile("[a-zA-Z0-9]", re.U)
+re_eng = re.compile(r"[a-zA-Z0-9]", re.U)
 
 # \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
 # \r\n|\s : whitespace characters. Will not be handled.
-re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
-re_skip_default = re.compile("(\r\n|\s)", re.U)
-re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
-re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
+re_han_default = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
+re_skip_default = re.compile(r"(\r\n|\s)", re.U)
+re_han_cut_all = re.compile(r"([\u4E00-\u9FD5]+)", re.U)
+re_skip_cut_all = re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
 
 
 def setLogLevel(log_level):

diff --git a/jieba_pyfast/finalseg/__init__.py b/jieba_pyfast/finalseg/__init__.py
@@ -88,8 +88,8 @@ def __cut(sentence):
         yield sentence[nexti:]
 
 
-re_han = re.compile("([\u4E00-\u9FD5]+)")
-re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
+re_han = re.compile(r"([\u4E00-\u9FD5]+)")
+re_skip = re.compile(r"([a-zA-Z0-9]+(?:\.\d+)?%?)")
 
 
 def add_force_split(word):

diff --git a/jieba_pyfast/posseg/__init__.py b/jieba_pyfast/posseg/__init__.py
@@ -12,15 +12,15 @@
 PROB_EMIT_P = "prob_emit.p"
 CHAR_STATE_TAB_P = "char_state_tab.p"
 
-re_han_detail = re.compile("([\u4E00-\u9FD5]+)")
-re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
-re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
-re_skip_internal = re.compile("(\r\n|\s)")
+re_han_detail = re.compile(r"([\u4E00-\u9FD5]+)")
+re_skip_detail = re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
+re_han_internal = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
+re_skip_internal = re.compile(r"(\r\n|\s)")
 
-re_eng = re.compile("[a-zA-Z0-9]+")
-re_num = re.compile("[\.0-9]+")
+re_eng = re.compile(r"[a-zA-Z0-9]+")
+re_num = re.compile(r"[\.0-9]+")
 
-re_eng1 = re.compile("^[a-zA-Z0-9]$", re.U)
+re_eng1 = re.compile(r"^[a-zA-Z0-9]$", re.U)
 
 
 def load_model():

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "jieba_pyfast"
-version = "3.11.0"
+version = "3.11.1"
 description = "Tokenize Chinese characters"
 license = "MIT"
 authors = ["snapADDY GmbH <[email protected]>"]