Skip to content

Commit

Permalink
Fix preprocess filenames + cleaning. Bump version 0.4.1 (#983)
Browse files: browse the repository at this point in the history
* Fix preprocess filenames + cleaning. Bump version 0.4.1
  • Loading branch information
vince62s authored Oct 11, 2018
1 parent 6de42cd commit 70a99a9
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 6 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

### Fixes and improvements

## [0.4.1](https://github.com/OpenNMT/OpenNMT-py/tree/v0.4.1) (2018-10-11)
* Fixed preprocessing file names; intermediary files are now cleaned up.

## [0.4.0](https://github.com/OpenNMT/OpenNMT-py/tree/v0.4.0) (2018-10-08)
* Fixed Speech2Text training (thanks Yuntian)

Expand Down
2 changes: 1 addition & 1 deletion onmt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
__all__ = [onmt.inputters, onmt.encoders, onmt.decoders, onmt.models,
onmt.utils, onmt.modules, "Trainer"]

__version__ = "0.2.0"
__version__ = "0.4.1"
12 changes: 8 additions & 4 deletions preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import glob
import sys
import gc
import os
import codecs
import torch
from onmt.utils.logging import init_logger, logger
Expand Down Expand Up @@ -59,14 +60,14 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,

with codecs.open(src_corpus, "r", encoding="utf-8") as fsrc:
with codecs.open(tgt_corpus, "r", encoding="utf-8") as ftgt:
logger.info("Reading source and target files: %s %s."
% (src_corpus, tgt_corpus))
src_data = fsrc.readlines()
tgt_data = ftgt.readlines()

src_corpus = "".join(src_corpus.split(".")[:-1])
tgt_corpus = "".join(tgt_corpus.split(".")[:-1])

num_shards = int(len(src_data) / opt.shard_size)
for x in range(num_shards):
logger.info("Splitting shard %d." % x)
f = codecs.open(src_corpus + ".{0}.txt".format(x), "w",
encoding="utf-8")
f.writelines(
Expand All @@ -79,6 +80,7 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
f.close()
num_written = num_shards * opt.shard_size
if len(src_data) > num_written:
logger.info("Splitting shard %d." % num_shards)
f = codecs.open(src_corpus + ".{0}.txt".format(num_shards),
'w', encoding="utf-8")
f.writelines(
Expand All @@ -96,6 +98,7 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
ret_list = []

for index, src in enumerate(src_list):
logger.info("Building shard %d." % index)
dataset = inputters.build_dataset(
fields, opt.data_type,
src_path=src,
Expand Down Expand Up @@ -124,7 +127,8 @@ def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
torch.save(dataset, pt_file)

ret_list.append(pt_file)

os.remove(src)
os.remove(tgt_list[index])
del dataset.examples
gc.collect()
del dataset
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(name='OpenNMT-py',
description='A python implementation of OpenNMT',
version='0.2.1',
version='0.4.1',

packages=['onmt', 'onmt.encoders', 'onmt.modules', 'onmt.tests',
'onmt.translate', 'onmt.decoders', 'onmt.inputters',
Expand Down

0 comments on commit 70a99a9

Please sign in to comment.