forked from macournoyer/neuralconvo
-
Notifications
You must be signed in to change notification settings - Fork 128
/
Copy pathtokenizer.lua
executable file
·60 lines (48 loc) · 1.09 KB
/
tokenizer.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
-- Penlight lexer module; M.tokenize below calls its scan() function.
local lexer = require "pl.lexer"
-- Shorthand for coroutine.yield, used by the token handler functions below
-- to hand a (type, text) pair back to whoever resumed the coroutine.
local yield = coroutine.yield
-- Module table; public functions are attached to it and it is returned at
-- the end of the file.
local M = {}
--- Token handler for the "word" type.
-- Listed as the action of the `^%w+` rule in M.tokenize.
-- Yields the pair ("word", lexeme) out of the current coroutine and returns
-- whatever that yield call returns (the values passed to the next resume).
-- @param lexeme the matched text
local function word(lexeme)
  local kind = "word"
  return yield(kind, lexeme)
end
--- Token handler for the "quote" type.
-- Listed as the action of the `^['\"]` rule in M.tokenize.
-- Yields the pair ("quote", lexeme) out of the current coroutine and returns
-- whatever that yield call returns (the values passed to the next resume).
-- @param lexeme the matched text
local function quote(lexeme)
  local kind = "quote"
  return yield(kind, lexeme)
end
--- Token handler for the "space" type.
-- Listed as the action of both the `^%s+` and the `^%-+` rules in
-- M.tokenize, and also used there as a key of the table passed as the third
-- argument to lexer.scan.
-- Yields the pair ("space", lexeme) out of the current coroutine and returns
-- whatever that yield call returns (the values passed to the next resume).
-- @param lexeme the matched text
local function space(lexeme)
  local kind = "space"
  return yield(kind, lexeme)
end
--- Token handler for the "tag" type.
-- Listed as the action of the `^</?.->` rule in M.tokenize, and also used
-- there as a key of the table passed as the third argument to lexer.scan.
-- Yields the pair ("tag", lexeme) out of the current coroutine and returns
-- whatever that yield call returns (the values passed to the next resume).
-- @param lexeme the matched text
local function tag(lexeme)
  local kind = "tag"
  return yield(kind, lexeme)
end
--- Token handler for the "punct" type.
-- Listed as the action of the `^[,:;%-]` rule in M.tokenize.
-- Yields the pair ("punct", lexeme) out of the current coroutine and returns
-- whatever that yield call returns (the values passed to the next resume).
-- @param lexeme the matched text
local function punct(lexeme)
  local kind = "punct"
  return yield(kind, lexeme)
end
--- Token handler for the "endpunct" type.
-- Listed as the action of both the `^%.+` and the `^[%.%?!]` rules in
-- M.tokenize.
-- Yields the pair ("endpunct", lexeme) out of the current coroutine and
-- returns whatever that yield call returns (the values passed to the next
-- resume).
-- @param lexeme the matched text
local function endpunct(lexeme)
  local kind = "endpunct"
  return yield(kind, lexeme)
end
--- Token handler for the "unknown" type.
-- Listed as the action of the catch-all `^.` rule in M.tokenize.
-- Prints the line "unknown" to standard output, then yields the pair
-- ("unknown", lexeme) out of the current coroutine and returns whatever that
-- yield call returns (the values passed to the next resume).
-- NOTE(review): the print looks like leftover debug output -- confirm
-- whether callers rely on it before removing.
-- @param lexeme the matched text
local function unknown(lexeme)
  local kind = "unknown"
  print(kind)
  return yield(kind, lexeme)
end
--- Tokenize a piece of text with pl.lexer.
-- Prints `text` to standard output, then hands it to lexer.scan together
-- with a list of {pattern, handler} rules and a table keyed by the `space`
-- and `tag` handlers.
-- NOTE(review): the print looks like leftover debug output -- confirm
-- whether callers rely on it before removing.
-- @param text the input passed straight through to lexer.scan
-- @return whatever lexer.scan returns (per the Penlight docs, a token
--   iterator -- confirm against the installed Penlight version)
function M.tokenize(text)
  print(text)

  -- Each entry pairs an anchored Lua pattern with the handler that emits the
  -- token. Order is significant only if lexer.scan tries rules top to bottom
  -- and takes the first match -- presumably so; confirm in the Penlight docs.
  local rules = {
    -- Disabled rule: would hand runs of bytes 128-193 to `word`.
    --{ "^[\128-\193]+", word },
    { "^%s+", space },
    { "^['\"]", quote },
    { "^%w+", word },
    { "^%-+", space },       -- runs of dashes use the `space` handler
    { "^[,:;%-]", punct },
    { "^%.+", endpunct },    -- runs of dots become a single endpunct token
    { "^[%.%?!]", endpunct },
    { "^</?.->", tag },      -- lazy match up to the first '>'
    { "^.", unknown },       -- catch-all for any other single character
  }

  -- Passed as lexer.scan's filter argument; presumably tokens whose handler
  -- is a key here are dropped from the output -- confirm in the Penlight docs.
  local filtered = { [space] = true, [tag] = true }

  return lexer.scan(text, rules, filtered)
end
--- Join a list of tokens back into a single string.
-- The tokens are concatenated with single spaces, then three fix-ups are
-- applied in order: a leading lowercase letter is upper-cased, an apostrophe
-- surrounded by spaces loses both spaces, and a space directly before one of
-- `, : ; - . ? !` is removed.
-- @param words sequence of values accepted by table.concat
-- @return the resulting string (a single value)
function M.join(words)
  local joined = table.concat(words, " ")

  -- Each assignment keeps only gsub's first result (the string); the
  -- substitution count is discarded.
  local capitalized = string.gsub(joined, "^%l", string.upper)
  local requoted = string.gsub(capitalized, " (') ", "%1")
  local tightened = string.gsub(requoted, " ([,:;%-%.%?!])", "%1")

  return tightened
end
return M