-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathkiwi.go
274 lines (223 loc) · 7.12 KB
/
kiwi.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
// Package kiwi is a Go binding for Kiwi (https://github.com/bab2min/Kiwi) project.
package kiwi
/*
#cgo LDFLAGS: -l kiwi
#include <stdlib.h>
#include <string.h>
#include <stdint.h> // for uintptr_t
#include <kiwi/capi.h>
extern int KiwiReaderBridge(int lineNumber, char *buffer, void *userData);
*/
import "C"
import (
"io"
"runtime/cgo"
"unsafe"
"github.com/codingpot/kiwigo/internal"
)
// BuildOption is a bitwise OR of the KiwiBuildOption values.
// It is the type of the options argument accepted by New and NewBuilder,
// which convert it to a C int and hand it to the C library.
type BuildOption int
// Each constant below takes its value from the C constant of the same name
// made visible by the cgo preamble (kiwi/capi.h).
const (
KIWI_BUILD_LOAD_DEFAULT_DICT BuildOption = C.KIWI_BUILD_LOAD_DEFAULT_DICT
KIWI_BUILD_INTEGRATE_ALLOMORPH BuildOption = C.KIWI_BUILD_INTEGRATE_ALLOMORPH
KIWI_BUILD_DEFAULT BuildOption = C.KIWI_BUILD_DEFAULT
)
// AnalyzeOption is a bitwise OR of the KiwiAnalyzeOption values.
// It is the type of the options argument accepted by Analyze and
// SplitSentence, which convert it to a C int and hand it to the C library.
type AnalyzeOption int
// Each constant below takes its value from the C constant of the same name
// made visible by the cgo preamble (kiwi/capi.h).
const (
KIWI_MATCH_URL AnalyzeOption = C.KIWI_MATCH_URL
KIWI_MATCH_EMAIL AnalyzeOption = C.KIWI_MATCH_EMAIL
KIWI_MATCH_HASHTAG AnalyzeOption = C.KIWI_MATCH_HASHTAG
KIWI_MATCH_MENTION AnalyzeOption = C.KIWI_MATCH_MENTION
KIWI_MATCH_ALL AnalyzeOption = C.KIWI_MATCH_ALL
KIWI_MATCH_NORMALIZE_CODA AnalyzeOption = C.KIWI_MATCH_NORMALIZE_CODA
KIWI_MATCH_ALL_WITH_NORMALIZING AnalyzeOption = C.KIWI_MATCH_ALL_WITH_NORMALIZING
)
// KiwiVersion reports the version string of the linked Kiwi C library,
// as returned by kiwi_version and copied into a Go string.
func KiwiVersion() string {
	version := C.kiwi_version()
	return C.GoString(version)
}
// KiwiError reports the message currently returned by kiwi_error, copied
// into a Go string. C.GoString yields "" when the C pointer is NULL.
func KiwiError() string {
	message := C.kiwi_error()
	return C.GoString(message)
}
// KiwiClearError calls kiwi_clear_error in the C library.
// NOTE(review): presumably this resets the message reported by KiwiError;
// confirm against kiwi/capi.h.
func KiwiClearError() {
C.kiwi_clear_error()
}
// Kiwi is a wrapper for the kiwi C library.
// Instances are produced by New or by KiwiBuilder.Build and should be
// released with Close.
type Kiwi struct {
// handler is the native kiwi_h handle; Close sets it to nil after
// releasing it.
handler C.kiwi_h
}
// New returns a new Kiwi instance backed by kiwi_init.
//
// modelPath, numThread and options are forwarded to kiwi_init unchanged
// (after conversion to C types). The handle is stored exactly as returned;
// it is not checked for failure here.
//
// Don't forget to call Close after this.
func New(modelPath string, numThread int, options BuildOption) *Kiwi {
	// C.CString allocates with malloc, so the copy must be freed explicitly
	// once the C call has returned.
	cModelPath := C.CString(modelPath)
	defer C.free(unsafe.Pointer(cModelPath))

	return &Kiwi{
		handler: C.kiwi_init(cModelPath, C.int(numThread), C.int(options)),
	}
}
// TokenInfo describes a single token produced by Analyze.
type TokenInfo struct {
// Position is the position of this token in the original text, as
// reported by kiwi_res_position.
// NOTE(review): the unit (bytes vs. characters) is not visible here — confirm.
Position int
// Tag represents a type of this token (e.g. VV, NNG, ...).
Tag POSType
// Form is the actual string of this token.
Form string
}
// TokenResult is one candidate analysis returned by Analyze.
type TokenResult struct {
// Tokens holds the tokens of this candidate, in the order the C API
// indexes them.
Tokens []TokenInfo
// Score is the value reported by kiwi_res_prob for this candidate.
Score float32
}
// Analyze runs kiwi_analyze on text and converts every result into Go values.
//
// topN and options are forwarded to kiwi_analyze unchanged. Each returned
// TokenResult carries the tokens and score of one candidate analysis.
//
// An error is returned when a tag reported by the C library cannot be parsed
// by ParsePOSType. If the C library reports a negative size (its way of
// signalling failure), an empty result is returned instead of panicking;
// NOTE(review): KiwiError presumably holds the reason in that case — confirm.
func (k *Kiwi) Analyze(text string, topN int, options AnalyzeOption) ([]TokenResult, error) {
	// C.CString allocates with malloc, so the copy must be freed explicitly.
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))

	kiwiResH := C.kiwi_analyze(k.handler, cText, C.int(topN), C.int(options))
	defer C.kiwi_res_close(kiwiResH)

	// A negative size would make the allocation below panic; treat it as
	// "no results", matching what ExtractWords does.
	resSize := int(C.kiwi_res_size(kiwiResH))
	if resSize < 0 {
		resSize = 0
	}

	res := make([]TokenResult, resSize)
	for i := range res {
		ci := C.int(i)

		// Same guard for the per-result token count.
		wordNum := int(C.kiwi_res_word_num(kiwiResH, ci))
		if wordNum < 0 {
			wordNum = 0
		}

		tokens := make([]TokenInfo, wordNum)
		for j := range tokens {
			cj := C.int(j)
			pos, err := ParsePOSType(C.GoString(C.kiwi_res_tag(kiwiResH, ci, cj)))
			if err != nil {
				return nil, err
			}
			tokens[j] = TokenInfo{
				Form:     C.GoString(C.kiwi_res_form(kiwiResH, ci, cj)),
				Tag:      pos,
				Position: int(C.kiwi_res_position(kiwiResH, ci, cj)),
			}
		}

		res[i] = TokenResult{
			Tokens: tokens,
			Score:  float32(C.kiwi_res_prob(kiwiResH, ci)),
		}
	}
	return res, nil
}
// SplitResult describes one sentence found by SplitSentence.
type SplitResult struct {
// Text is the slice text[Begin:End] of the string passed to SplitSentence.
Text string
// Begin is the start position reported by kiwi_ss_begin_position; it is
// used as the lower index when slicing the input string.
Begin int
// End is the end position reported by kiwi_ss_end_position; it is used
// as the upper (exclusive) index when slicing the input string.
End int
}
// SplitSentence splits text into sentences using kiwi_split_into_sents.
//
// options is forwarded to the C call unchanged. Each SplitResult holds the
// begin/end positions reported by the C library together with the matching
// slice of text.
//
// If the C library reports a negative size (its way of signalling failure),
// an empty result is returned instead of panicking. The returned error is
// currently always nil.
func (k *Kiwi) SplitSentence(text string, options AnalyzeOption) ([]SplitResult, error) {
	// C.CString allocates with malloc, so the copy must be freed explicitly.
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))

	kiwiSsH := C.kiwi_split_into_sents(k.handler, cText, C.int(options), nil)
	defer C.kiwi_ss_close(kiwiSsH)

	// A negative size would make the allocation below panic; treat it as
	// "no sentences", matching what ExtractWords does.
	resSize := int(C.kiwi_ss_size(kiwiSsH))
	if resSize < 0 {
		resSize = 0
	}

	res := make([]SplitResult, resSize)
	for i := range res {
		begin := int(C.kiwi_ss_begin_position(kiwiSsH, C.int(i)))
		end := int(C.kiwi_ss_end_position(kiwiSsH, C.int(i)))
		res[i] = SplitResult{
			Text:  text[begin:end],
			Begin: begin,
			End:   end,
		}
	}
	return res, nil
}
// Close releases the native handle held by k via kiwi_close and returns the
// status code of that call.
// It must be called once the instance obtained from New is no longer needed.
// When the handle has already been released it does nothing and returns 0,
// so calling it repeatedly is safe.
func (k *Kiwi) Close() int {
	if k.handler == nil {
		return 0
	}

	status := int(C.kiwi_close(k.handler))
	// Drop the handle so later calls take the early return above.
	k.handler = nil
	return status
}
// KiwiBuilder is a wrapper for the kiwi C library's builder handle.
// It is created by NewBuilder and turned into a Kiwi by Build.
type KiwiBuilder struct {
// handler is the native kiwi_builder_h handle; Close sets it to nil
// after releasing it.
handler C.kiwi_builder_h
}
// NewBuilder returns a new KiwiBuilder instance backed by kiwi_builder_init.
//
// modelPath, numThread and options are forwarded to kiwi_builder_init
// unchanged (after conversion to C types). The handle is stored exactly as
// returned; it is not checked for failure here.
//
// Don't forget to call Close after this (Build does so on its own).
func NewBuilder(modelPath string, numThread int, options BuildOption) *KiwiBuilder {
	// C.CString allocates with malloc, so the copy must be freed explicitly
	// once the C call has returned.
	cModelPath := C.CString(modelPath)
	defer C.free(unsafe.Pointer(cModelPath))

	return &KiwiBuilder{
		handler: C.kiwi_builder_init(cModelPath, C.int(numThread), C.int(options)),
	}
}
// AddWord registers a custom word with the given part-of-speech tag and
// score through kiwi_builder_add_word.
// It returns the status code of that C call unchanged.
func (kb *KiwiBuilder) AddWord(word string, pos POSType, score float32) int {
	// Both C strings are malloc-allocated copies and must be freed explicitly.
	cWord := C.CString(word)
	defer C.free(unsafe.Pointer(cWord))
	cPos := C.CString(string(pos))
	defer C.free(unsafe.Pointer(cPos))

	return int(C.kiwi_builder_add_word(kb.handler, cWord, cPos, C.float(score)))
}
// LoadDict loads a user dictionary from the file at dictPath through
// kiwi_builder_load_dict.
// It returns the status code of that C call unchanged.
func (kb *KiwiBuilder) LoadDict(dictPath string) int {
	// C.CString allocates with malloc, so the copy must be freed explicitly.
	cDictPath := C.CString(dictPath)
	defer C.free(unsafe.Pointer(cDictPath))

	return int(C.kiwi_builder_load_dict(kb.handler, cDictPath))
}
// Build produces a Kiwi instance from the builder's current state via
// kiwi_builder_build and then closes the builder, so the builder must not
// be reused afterwards. The status returned by Close is discarded.
func (kb *KiwiBuilder) Build() *Kiwi {
	built := &Kiwi{handler: C.kiwi_builder_build(kb.handler)}
	// The builder is single-use: release it as soon as the build call is done.
	kb.Close()
	return built
}
// Close releases the native builder handle via kiwi_builder_close and
// returns the status code of that call.
// It must be called for a builder obtained from NewBuilder, except after
// Build, which closes the builder itself.
// When the handle has already been released it does nothing and returns 0,
// so calling it repeatedly is safe.
func (kb *KiwiBuilder) Close() int {
	if kb.handler == nil {
		return 0
	}

	status := int(C.kiwi_builder_close(kb.handler))
	// Drop the handle so later calls take the early return above.
	kb.handler = nil
	return status
}
// WordInfo describes one word returned by ExtractWords.
type WordInfo struct {
// Form is the word string reported by kiwi_ws_form.
Form string
// Freq is the value reported by kiwi_ws_freq.
Freq int
// POSScore is the value reported by kiwi_ws_pos_score.
POSScore float32
// Score is the value reported by kiwi_ws_score.
Score float32
}
// KiwiReaderImpl is the Go half of the line-reader callback used by
// ExtractWords. userData must be a cgo.Handle wrapping an
// *internal.RewindScanner (ExtractWords creates it).
// NOTE(review): presumably invoked from the C-side KiwiReaderBridge — confirm.
//
// The callback works in two phases, selected by buffer:
//   - buffer == nil: advance to the next line (rewinding first when
//     lineNumber is 0) and return its byte length plus one, or 0 when the
//     scanner has no further line.
//   - buffer != nil: copy the current line, NUL-terminated, into buffer and
//     return 0.
//
//export KiwiReaderImpl
func KiwiReaderImpl(lineNumber C.int, buffer *C.char, userData unsafe.Pointer) C.int {
	lines := cgo.Handle(userData).Value().(*internal.RewindScanner)

	// Copy phase: the caller supplied a buffer for the line scanned earlier.
	if buffer != nil {
		line := C.CString(lines.Text())
		defer C.free(unsafe.Pointer(line))
		C.strcpy(buffer, line)
		return 0
	}

	// Size phase: move to the next line and report how much room it needs.
	if lineNumber == 0 {
		lines.Rewind()
	}
	if !lines.Scan() {
		return 0
	}
	// +1 leaves room for the terminating NUL written by strcpy above.
	return C.int(len(lines.Text()) + 1)
}
// ExtractWords feeds the lines of readSeeker to kiwi_builder_extract_words
// and returns the words the C library reports.
//
// minCnt, maxWordLen, minScore and posThreshold are forwarded to the C call
// unchanged. The input is wrapped in a rewindable scanner whose cgo.Handle is
// passed as the callback's user data and deleted when the call returns.
// A negative size reported by the C library is treated as zero words. The
// returned error is currently always nil.
func (kb *KiwiBuilder) ExtractWords(readSeeker io.ReadSeeker, minCnt int, maxWordLen int, minScore float32, posThreshold float32) ([]WordInfo, error) {
	handle := cgo.NewHandle(internal.NewRewindScanner(readSeeker))
	defer handle.Delete()

	words := C.kiwi_builder_extract_words(
		kb.handler,
		C.kiwi_reader_t(C.KiwiReaderBridge),
		unsafe.Pointer(handle),
		C.int(minCnt), C.int(maxWordLen), C.float(minScore), C.float(posThreshold))
	defer C.kiwi_ws_close(words)

	count := int(C.kiwi_ws_size(words))
	if count < 0 {
		count = 0
	}

	out := make([]WordInfo, 0, count)
	for i := 0; i < count; i++ {
		idx := C.int(i)
		out = append(out, WordInfo{
			Form:     C.GoString(C.kiwi_ws_form(words, idx)),
			Freq:     int(C.kiwi_ws_freq(words, idx)),
			POSScore: float32(C.kiwi_ws_pos_score(words, idx)),
			Score:    float32(C.kiwi_ws_score(words, idx)),
		})
	}
	return out, nil
}