Skip to content

Commit

Permalink
fix: escape regexes
Browse files Browse the repository at this point in the history
  • Loading branch information
severinsimmler committed May 24, 2023
1 parent 54a84ec commit 64c3c1f
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 16 deletions.
12 changes: 6 additions & 6 deletions jieba_pyfast/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,16 @@

pool = None

re_userdict = re.compile("^(.+?)( [0-9]+)?( [a-z]+)?$", re.U)
re_userdict = re.compile(r"^(.+?)( [0-9]+)?( [a-z]+)?$", re.U)

re_eng = re.compile("[a-zA-Z0-9]", re.U)
re_eng = re.compile(r"[a-zA-Z0-9]", re.U)

# \u4E00-\u9FD5a-zA-Z0-9+#&\._% : CJK ideographs, ASCII letters/digits and the symbols + # & . _ %. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
re_skip_default = re.compile("(\r\n|\s)", re.U)
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
re_han_default = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
re_skip_default = re.compile(r"(\r\n|\s)", re.U)
re_han_cut_all = re.compile(r"([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile(r"[^a-zA-Z0-9+#\n]", re.U)


def setLogLevel(log_level):
Expand Down
4 changes: 2 additions & 2 deletions jieba_pyfast/finalseg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ def __cut(sentence):
yield sentence[nexti:]


re_han = re.compile("([\u4E00-\u9FD5]+)")
re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
re_han = re.compile(r"([\u4E00-\u9FD5]+)")
re_skip = re.compile(r"([a-zA-Z0-9]+(?:\.\d+)?%?)")


def add_force_split(word):
Expand Down
14 changes: 7 additions & 7 deletions jieba_pyfast/posseg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"

re_han_detail = re.compile("([\u4E00-\u9FD5]+)")
re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
re_skip_internal = re.compile("(\r\n|\s)")
re_han_detail = re.compile(r"([\u4E00-\u9FD5]+)")
re_skip_detail = re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
re_han_internal = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
re_skip_internal = re.compile(r"(\r\n|\s)")

re_eng = re.compile("[a-zA-Z0-9]+")
re_num = re.compile("[\.0-9]+")
re_eng = re.compile(r"[a-zA-Z0-9]+")
re_num = re.compile(r"[\.0-9]+")

re_eng1 = re.compile("^[a-zA-Z0-9]$", re.U)
re_eng1 = re.compile(r"^[a-zA-Z0-9]$", re.U)


def load_model():
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "jieba_pyfast"
version = "3.11.0"
version = "3.11.1"
description = "Tokenize Chinese characters"
license = "MIT"
authors = ["snapADDY GmbH <[email protected]>"]
Expand Down

0 comments on commit 64c3c1f

Please sign in to comment.