Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

添加新规则报错,_pywrapfst.FstOpError: Operation failed #159

Closed
HankLiu10 opened this issue Nov 16, 2023 · 7 comments
Closed

添加新规则报错,_pywrapfst.FstOpError: Operation failed #159

HankLiu10 opened this issue Nov 16, 2023 · 7 comments
Labels
good first issue Good for newcomers

Comments

@HankLiu10
Copy link

HankLiu10 commented Nov 16, 2023

你好,我尝试为ITN加入新的基本规则,运行后报错, 麻烦看看能否解决,谢谢!:

其中,添加了WeTextProcessing/itn/chinese/data/dataflow/flow.tsv如下:

兆	M
g	G

另添加了WeTextProcessing/itn/chinese/rules/dataflow.py如下:

from itn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor

from pynini import string_file
from pynini.lib.pynutil import delete, insert


class DataFlow(Processor):

    def __init__(self, enable_0_to_9=True):
        super().__init__(name='dataflow')
        self.enable_0_to_9 = enable_0_to_9
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        flow = string_file('itn/chinese/data/dataflow/flow.tsv')
        digit = string_file('itn/chinese/data/number/digit.tsv')  # 1 ~ 9

        number = Cardinal().number if self.enable_0_to_9 else \
            Cardinal().number_exclude_0_to_9
        number |= digit
        tagger = (insert('value: "') + number + insert('"') +
                  insert(' dataflow: "') + flow + insert('"'))
        self.tagger = self.add_tokens(tagger)

    def build_verbalizer(self):
        dataflow = delete('dataflow: "') + self.SIGMA + delete('"')
        value = delete(' value: "') + self.SIGMA + delete('"')
        verbalizer = value + dataflow
        self.verbalizer = self.delete_tokens(verbalizer)

同时,修改了WeTextProcessing/itn/chinese/inverse_normalizer.py 的内容如下:

from tn.processor import Processor
from itn.chinese.rules.cardinal import Cardinal
from itn.chinese.rules.char import Char
from itn.chinese.rules.date import Date
from itn.chinese.rules.fraction import Fraction
from itn.chinese.rules.math import Math
from itn.chinese.rules.measure import Measure
from itn.chinese.rules.money import Money
from itn.chinese.rules.whitelist import Whitelist
from itn.chinese.rules.time import Time
from itn.chinese.rules.postprocessor import PostProcessor
from itn.chinese.rules.license_plate import LicensePlate
from itn.chinese.rules.dataflow import DataFlow     ####

from pynini.lib.pynutil import add_weight, delete
from importlib_resources import files


class InverseNormalizer(Processor):

    def __init__(self, cache_dir=None, overwrite_cache=False,
                 enable_standalone_number=True,
                 enable_0_to_9=False):
        super().__init__(name='inverse_normalizer', ordertype='itn')
        self.convert_number = enable_standalone_number
        self.enable_0_to_9 = enable_0_to_9
        if cache_dir is None:
            cache_dir = files("itn")
        self.build_fst('zh_itn', cache_dir, overwrite_cache)

    def build_tagger(self):
        tagger = (add_weight(Date().tagger, 1.02)
                  | add_weight(Whitelist().tagger, 1.01)
                  | add_weight(Fraction().tagger, 1.05)
                  | add_weight(Measure(enable_0_to_9=self.enable_0_to_9).tagger, 1.05)  # noqa
                  | add_weight(Money(enable_0_to_9=self.enable_0_to_9).tagger, 1.04)  # noqa
                  | add_weight(Time().tagger, 1.05)
                  | add_weight(Cardinal(self.convert_number, self.enable_0_to_9).tagger, 1.06)  # noqa
                  | add_weight(Math().tagger, 1.10)
                  | add_weight(LicensePlate().tagger, 1.0)
                  | add_weight(DataFlow().tagger, 1.11)            ####
                  | add_weight(Char().tagger, 100)).optimize()

        tagger = tagger.star
        # remove the last space
        self.tagger = tagger @ self.build_rule(delete(' '), '', '[EOS]')

    def build_verbalizer(self):
        verbalizer = (Cardinal(self.convert_number, self.enable_0_to_9).verbalizer  # noqa
                      | Char().verbalizer
                      | Date().verbalizer
                      | Fraction().verbalizer
                      | Math().verbalizer
                      | Measure(enable_0_to_9=self.enable_0_to_9).verbalizer
                      | Money(enable_0_to_9=self.enable_0_to_9).verbalizer
                      | Time().verbalizer
                      | LicensePlate().verbalizer
                      | DataFlow(enable_0_to_9=self.enable_0_to_9).verbalizer        ####
                      | Whitelist().verbalizer).optimize()
        postprocessor = PostProcessor(remove_interjections=True).processor

        self.verbalizer = (verbalizer @ postprocessor).star

最后运行python -m itn --text "八兆流量" --overwrite_cache后报错如下,
(wenetITN) liuhangchen@G08:~/WeNetITN/WeTextProcessing$ python -m itn --text "八兆流量" --overwrite_cache
dataflow { value: "8" dataflow: "M" } char { value: "流" } char { value: "量" }
ERROR: StringFstToOutputLabels: Invalid start state
Traceback (most recent call last):
File "/storage1/liuhangchen/anaconda3/envs/wenetITN/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/storage1/liuhangchen/anaconda3/envs/wenetITN/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/itn/main.py", line 4, in
main()
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/itn/main.py", line 53, in main
print(normalizer.normalize(args.text))
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/tn/processor.py", line 96, in normalize
return self.verbalize(self.tag(input))
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/tn/processor.py", line 93, in verbalize
return shortestpath(lattice, nshortest=1, unique=True).string()
File "extensions/_pynini.pyx", line 462, in _pynini.Fst.string
File "extensions/_pynini.pyx", line 507, in _pynini.Fst.string
_pywrapfst.FstOpError: Operation failed

@xingchensong
Copy link
Member

这种大概率是少了空格引起的,tagger里dataflow前面有个空格,verblizer里没把这个空格删掉

image

@HankLiu10
Copy link
Author

这种大概率是少了空格引起的,tagger里dataflow前面有个空格,verblizer里没把这个空格删掉

image

谢谢,已经修改为:

from itn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor

from pynini import string_file
from pynini.lib.pynutil import delete, insert


class DataFlow(Processor):

    def __init__(self, enable_0_to_9=True):
        super().__init__(name='dataflow')
        self.enable_0_to_9 = enable_0_to_9
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        flow = string_file('itn/chinese/data/dataflow/flow.tsv')
        digit = string_file('itn/chinese/data/number/digit.tsv')  # 1 ~ 9

        number = Cardinal().number if self.enable_0_to_9 else \
            Cardinal().number_exclude_0_to_9
        number |= digit
        tagger = (insert('value: "') + number + insert('"') +
                  insert('dataflow: "') + flow + insert('"'))
        self.tagger = self.add_tokens(tagger)

    def build_verbalizer(self):
        dataflow = delete('dataflow: "') + self.SIGMA + delete('"')
        value = delete('value: "') + self.SIGMA + delete('"')
        verbalizer = value + dataflow
        self.verbalizer = self.delete_tokens(verbalizer)

重新执行后似乎在输出时有问题,麻烦看看~

dataflow { value: "8"dataflow: "M" } char { value: "流" } char { value: "量" }
ERROR: StringFstToOutputLabels: Invalid start state
Traceback (most recent call last):
  File "/storage1/liuhangchen/anaconda3/envs/wenetITN/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/storage1/liuhangchen/anaconda3/envs/wenetITN/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/itn/__main__.py", line 4, in <module>
    main()
  File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/itn/main.py", line 53, in main
    print(normalizer.normalize(args.text))
  File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/tn/processor.py", line 96, in normalize
    return self.verbalize(self.tag(input))
  File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/tn/processor.py", line 93, in verbalize
    return shortestpath(lattice, nshortest=1, unique=True).string()
  File "extensions/_pynini.pyx", line 462, in _pynini.Fst.string
  File "extensions/_pynini.pyx", line 507, in _pynini.Fst.string
_pywrapfst.FstOpError: Operation failed

似乎代码已经正确识别并转换了,但在输出时出了bug

@xingchensong
Copy link
Member

保留空格,不要删除空格

@xingchensong xingchensong changed the title 添加新规则报错 添加新规则报错,_pywrapfst.FstOpError: Operation failed Nov 16, 2023
@xingchensong xingchensong added the good first issue Good for newcomers label Nov 16, 2023
@xingchensong xingchensong pinned this issue Nov 16, 2023
@HankLiu10
Copy link
Author

HankLiu10 commented Nov 16, 2023

保留空格,不要删除空格

非常感谢,已经解决,确认一下,报错原因是解析时verbalizer无法匹配前面tagger留的空格导致的是吗

@xingchensong
Copy link
Member

不是,是因为tagger和verblizer之间还有一个tokenparser,他要靠空格来parse

https://github.com/wenet-e2e/WeTextProcessing/blob/master/tn/token_parser.py#L120-L136

@xingchensong
Copy link
Member

image

没有空格,parse不成功

@HankLiu10
Copy link
Author

image

没有空格,parse不成功

明白了,感谢

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
good first issue Good for newcomers
Projects
None yet
Development

No branches or pull requests

2 participants