diff --git a/CHANGELOG.md b/CHANGELOG.md
index af2f3f7..9f67720 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,12 @@
 CHANGELOG
 ---------
 
+v0.3.8:
+  - multiprocessing support (get_vocab and apply_bpe)
+  - progress bar for learn_bpe
+  - seed parameter for deterministic BPE dropout
+  - ignore some unicode line separators which would crash subword-nmt
+
 v0.3.7:
   - BPE dropout (Provilkov et al., 2019)
   - more efficient glossaries (https://github.com/rsennrich/subword-nmt/pull/69)
diff --git a/setup.py b/setup.py
index 836b7e0..23d16db 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ def test_suite():
 
 setup(
     name='subword_nmt',
-    version='0.3.7',
+    version='0.3.8',
     description='Unsupervised Word Segmentation for Neural Machine Translation and Text Generation',
     long_description=(codecs.open("README.md", encoding='utf-8').read() +
         "\n\n" + codecs.open("CHANGELOG.md", encoding='utf-8').read()),
@@ -28,6 +28,8 @@ def test_suite():
         'Programming Language :: Python :: 2',
         'Programming Language :: Python :: 3',
     ],
+    install_requires=['mock',
+                      'tqdm'],
     packages=find_packages(),
     entry_points={
         'console_scripts': ['subword-nmt=subword_nmt.subword_nmt:main'],
diff --git a/subword_nmt/learn_bpe.py b/subword_nmt/learn_bpe.py
index 2adee1b..7b01f04 100755
--- a/subword_nmt/learn_bpe.py
+++ b/subword_nmt/learn_bpe.py
@@ -25,6 +25,12 @@
 from multiprocessing import Pool, cpu_count
 from collections import defaultdict, Counter
 
+try:
+    from tqdm import tqdm
+except ImportError:
+    def tqdm(iterator, *args, **kwargs):
+        return iterator
+
 # hack for python2/3 compatibility
 from io import open
 argparse.open = open
@@ -294,7 +300,7 @@ def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_d
 
     # threshold is inspired by Zipfian assumption, but should only affect speed
     threshold = max(stats.values()) / 10
-    for i in range(num_symbols):
+    for i in tqdm(range(num_symbols)):
         if stats:
             most_frequent = max(stats, key=lambda x: (stats[x], x))
 