From 9965647df8fd5bc2756a42dc89274267f11e3497 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 17:11:36 +0900
Subject: [PATCH 1/9] Fix prefix in get_input_fn

---
 data_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_utils.py b/data_utils.py
index ae9073f2e..a716bc82d 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -778,7 +778,7 @@ def get_input_fn(

   # Merge all record infos into a single one
   record_glob_base = format_filename(
-      prefix="record_info-{}-*".format(split),
+      prefix="record_info-{}-{}-{}".format(split, FLAGS.task, FLAGS.pass_id),
       bsz_per_host=bsz_per_host,
       seq_len=seq_len,
       bi_data=bi_data,

From 1b61bf27311a14a4447f7997cef339e828ef6e61 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 17:29:30 +0900
Subject: [PATCH 2/9] fix errors with get_input_fn in train_gpu

---
 data_utils.py | 4 +++-
 train_gpu.py  | 7 +++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/data_utils.py b/data_utils.py
index a716bc82d..75bfd66fb 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -762,6 +762,8 @@ def parser(record):
 def get_input_fn(
     tfrecord_dir,
     split,
+    task,
+    pass_id,
     bsz_per_host,
     seq_len,
     reuse_len,
@@ -778,7 +780,7 @@

   # Merge all record infos into a single one
   record_glob_base = format_filename(
-      prefix="record_info-{}-{}-{}".format(split, FLAGS.task, FLAGS.pass_id),
+      prefix="record_info-{}-{}-{}".format(split, task, pass_id),
       bsz_per_host=bsz_per_host,
       seq_len=seq_len,
       bi_data=bi_data,
diff --git a/train_gpu.py b/train_gpu.py
index 790c024a0..885e18746 100644
--- a/train_gpu.py
+++ b/train_gpu.py
@@ -127,6 +127,11 @@
 flags.DEFINE_float("init_range", default=0.1,
                    help="Initialization std when init is uniform.")

+# TFRecord Path
+flags.DEFINE_integer("pass_id", 0, help="ID of the current pass. "
+                     "Different passes sample different negative segments.")
+flags.DEFINE_integer("task", 0, help="The Task ID. This value is used when "
+                     "using multiple workers to identify each worker.")

 FLAGS = flags.FLAGS

@@ -186,6 +191,8 @@ def train(ps_device):
   train_input_fn, record_info_dict = data_utils.get_input_fn(
       tfrecord_dir=FLAGS.record_info_dir,
       split="train",
+      pass_id=FLAGS.pass_id,
+      task=FLAGS.task,
       bsz_per_host=FLAGS.train_batch_size,
       seq_len=FLAGS.seq_len,
       reuse_len=FLAGS.reuse_len,

From 4c574d688f47140ccf85f3e1882e056cc675d541 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 17:39:17 +0900
Subject: [PATCH 3/9] fix minor issue

---
 train_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train_gpu.py b/train_gpu.py
index 885e18746..3d4e7469d 100644
--- a/train_gpu.py
+++ b/train_gpu.py
@@ -191,8 +191,8 @@ def train(ps_device):
   train_input_fn, record_info_dict = data_utils.get_input_fn(
       tfrecord_dir=FLAGS.record_info_dir,
       split="train",
-      pass_id=FLAGS.pass_id,
       task=FLAGS.task,
+      pass_id=FLAGS.pass_id,
       bsz_per_host=FLAGS.train_batch_size,
       seq_len=FLAGS.seq_len,
       reuse_len=FLAGS.reuse_len,

From 7ef2f2ddb6aea4ab931b2f5762579e419c78944c Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 19:37:20 +0900
Subject: [PATCH 4/9] fix assertion for even number of batch size

---
 modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modeling.py b/modeling.py
index a7d719cfb..713be5637 100644
--- a/modeling.py
+++ b/modeling.py
@@ -233,7 +233,7 @@ def relative_positional_encoding(qlen, klen, d_model, clamp_len, attn_type,

   if bsz is not None:
     # With bi_data, the batch size should be divisible by 2.
-    assert bsz%2 == 0
+    tf.debugging.assert_equal(bsz % 2, 0)
     fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
     bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
   else:

From 1ea884d155cf789829fa7e0fa384b6987de4b809 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 21:05:08 +0900
Subject: [PATCH 5/9] change default value of uncased to False

---
 data_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_utils.py b/data_utils.py
index 75bfd66fb..5e80d1d33 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -885,7 +885,7 @@ def input_fn(params):
   flags.DEFINE_integer("reuse_len", 256,
                        help="Number of token that can be reused as memory. "
                        "Could be half of `seq_len`.")
-  flags.DEFINE_bool("uncased", True, help="Use uncased inputs or not.")
+  flags.DEFINE_bool("uncased", False, help="Use uncased inputs or not.")
   flags.DEFINE_bool("bi_data", True,
                     help="whether to create bidirectional data")
   flags.DEFINE_integer("mask_alpha", default=6,

From 0391d1e2a792b6e147eb0b120bf2d0f057494b2a Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Sat, 6 Jul 2019 14:34:08 +0900
Subject: [PATCH 6/9] Fix curr_loss calculation

When restarting training, prev_step is -1, so curr_loss for the first
log line would be calculated incorrectly.
---
 train_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train_gpu.py b/train_gpu.py
index 3d4e7469d..9b5994728 100644
--- a/train_gpu.py
+++ b/train_gpu.py
@@ -300,7 +300,7 @@ def train(ps_device):
         total_loss += loss_np

         if curr_step > 0 and curr_step % FLAGS.iterations == 0:
-          curr_loss = total_loss / (curr_step - prev_step)
+          curr_loss = total_loss / FLAGS.iterations
           tf.logging.info("[{}] | gnorm {:.2f} lr {:8.6f} "
                           "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                               curr_step, fetched[-3], fetched[-2],

From c4850b9160955cbcc4fdc22f21c99feea491dd73 Mon Sep 17 00:00:00 2001
From: Cong
Date: Fri, 11 Oct 2019 15:31:24 +0900
Subject: [PATCH 7/9] Config for TPU pod

I ran `train.py` on a TPU pod v3-256 and got the following error:

    ValueError: TPUConfig.num_shards is not set correctly ....

The docs at
https://cloud.google.com/tpu/docs/training-on-tpu-pods#providing_the_tpu_name_and_region_to_tpuclusterresolver
note that:

> For single device training, you can specify either the TPU name or an
> IP address, for example: `grpc://1.2.3.4:8470`.
> For TPU Pods you must use the TPU name so that TensorFlow can discover
> the IP addresses of all the hosts available for training distribution.

So in the case of a TPU pod, setting `master` doesn't work. I tried
setting `cluster` instead and it worked: all 32 hosts in the TPU pod
were detected and used correctly.
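For reference, a minimal sketch of the resulting wiring (not part of the
diff below; the TPU name, zone, project, bucket, and shard count are
placeholder values for a v3-256, using the TF 1.x `tf.contrib` API this
repo targets):

    import tensorflow as tf

    # Placeholder values -- substitute your own TPU name, zone, project,
    # and model directory. For a pod, the TPU *name* is required; an IP
    # address only works for single devices.
    tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
        "my-tpu-pod", zone="europe-west4-a", project="my-gcp-project")

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster,  # lets TF discover every host in the pod
        master=None,          # tpu_cluster.get_master() breaks on pods
        model_dir="gs://my-bucket/model",
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=1000,
            num_shards=256,   # total cores on a v3-256
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))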
---
 model_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/model_utils.py b/model_utils.py
index fd8d6d8d9..e7b66e241 100644
--- a/model_utils.py
+++ b/model_utils.py
@@ -19,7 +19,7 @@ def configure_tpu(FLAGS):
   if FLAGS.use_tpu:
     tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
         FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
-    master = tpu_cluster.get_master()
+    master = None
   else:
     tpu_cluster = None
     master = FLAGS.master
@@ -42,6 +42,7 @@
   per_host_input = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

   run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster,
       master=master,
       model_dir=FLAGS.model_dir,
       session_config=session_config,

From 0265911ccb09a080e4d579ea476c944f72984570 Mon Sep 17 00:00:00 2001
From: Cong
Date: Wed, 20 Nov 2019 23:28:20 +0700
Subject: [PATCH 8/9] Add pass_id and task in flags to train.py

---
 train.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/train.py b/train.py
index 9abb15167..7c020b33d 100644
--- a/train.py
+++ b/train.py
@@ -140,6 +140,12 @@
 flags.DEFINE_float("init_range", default=0.1,
                    help="Initialization std when init is uniform.")

+# TFRecord Path
+flags.DEFINE_integer("pass_id", 0, help="ID of the current pass. "
+                     "Different passes sample different negative segments.")
+flags.DEFINE_integer("task", 0, help="The Task ID. This value is used when "
+                     "using multiple workers to identify each worker.")
+
 FLAGS = flags.FLAGS
@@ -226,6 +232,8 @@ def get_input_fn(split):
   input_fn, record_info_dict = data_utils.get_input_fn(
       tfrecord_dir=FLAGS.record_info_dir,
       split=split,
+      task=FLAGS.task,
+      pass_id=FLAGS.pass_id,
       bsz_per_host=batch_size // FLAGS.num_hosts,
       seq_len=FLAGS.seq_len,
       reuse_len=FLAGS.reuse_len,

From c4966b6a21b23735daf4ad815947a519de81347c Mon Sep 17 00:00:00 2001
From: Cong
Date: Thu, 21 Nov 2019 07:31:50 +0700
Subject: [PATCH 9/9] Add flag n_token to data_utils.py

---
 data_utils.py     | 4 ++--
 run_classifier.py | 2 +-
 run_race.py       | 2 +-
 run_squad.py      | 2 +-
 train.py          | 1 -
 train_gpu.py      | 2 --
 6 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index 5e80d1d33..3ecf8415d 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -32,7 +32,6 @@
     "<eop>" : 8,
 }

-VOCAB_SIZE = 32000
 UNK_ID = special_symbols["<unk>"]
 CLS_ID = special_symbols["<cls>"]
 SEP_ID = special_symbols["<sep>"]
@@ -188,7 +187,7 @@ def create_data(_):
   # Create and dump corpus_info from task 0
   if FLAGS.task == 0:
     corpus_info = {
-        "vocab_size": VOCAB_SIZE,
+        "vocab_size": FLAGS.n_token,
         "bsz_per_host": FLAGS.bsz_per_host,
         "num_core_per_host": FLAGS.num_core_per_host,
         "seq_len": FLAGS.seq_len,
@@ -886,6 +885,7 @@ def input_fn(params):
                        help="Number of token that can be reused as memory. "
" "Could be half of `seq_len`.") flags.DEFINE_bool("uncased", False, help="Use uncased inputs or not.") + flags.DEFINE_integer("n_token", 32000, help="Vocab size") flags.DEFINE_bool("bi_data", True, help="whether to create bidirectional data") flags.DEFINE_integer("mask_alpha", default=6, diff --git a/run_classifier.py b/run_classifier.py index c6eb1baa1..dc20022c9 100644 --- a/run_classifier.py +++ b/run_classifier.py @@ -21,7 +21,7 @@ import sentencepiece as spm -from data_utils import SEP_ID, VOCAB_SIZE, CLS_ID +from data_utils import SEP_ID, CLS_ID import model_utils import function_builder from classifier_utils import PaddingInputExample diff --git a/run_race.py b/run_race.py index 5f4973c6c..0859f13fb 100644 --- a/run_race.py +++ b/run_race.py @@ -23,7 +23,7 @@ import tensorflow as tf import sentencepiece as spm -from data_utils import SEP_ID, VOCAB_SIZE, CLS_ID +from data_utils import SEP_ID, CLS_ID import model_utils import function_builder from classifier_utils import PaddingInputExample diff --git a/run_squad.py b/run_squad.py index 156fa4396..78b1d2cda 100644 --- a/run_squad.py +++ b/run_squad.py @@ -28,7 +28,7 @@ import function_builder import model_utils import squad_utils -from data_utils import SEP_ID, CLS_ID, VOCAB_SIZE +from data_utils import SEP_ID, CLS_ID SPIECE_UNDERLINE = u'▁' diff --git a/train.py b/train.py index 7c020b33d..fc619c25d 100644 --- a/train.py +++ b/train.py @@ -259,7 +259,6 @@ def main(unused_argv): assert FLAGS.seq_len > 0 assert FLAGS.perm_size > 0 - FLAGS.n_token = data_utils.VOCAB_SIZE tf.logging.info("n_token {}".format(FLAGS.n_token)) if not tf.gfile.Exists(FLAGS.model_dir): diff --git a/train_gpu.py b/train_gpu.py index 9b5994728..f7df62115 100644 --- a/train_gpu.py +++ b/train_gpu.py @@ -321,8 +321,6 @@ def main(unused_argv): tf.logging.set_verbosity(tf.logging.INFO) - # Get corpus info - FLAGS.n_token = data_utils.VOCAB_SIZE tf.logging.info("n_token {}".format(FLAGS.n_token)) if not tf.gfile.Exists(FLAGS.model_dir):