From 9965647df8fd5bc2756a42dc89274267f11e3497 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 17:11:36 +0900
Subject: [PATCH 1/9] Fix prefix in get_input_fn

---
 data_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_utils.py b/data_utils.py
index ae9073f2e..a716bc82d 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -778,7 +778,7 @@ def get_input_fn(

   # Merge all record infos into a single one
   record_glob_base = format_filename(
-      prefix="record_info-{}-*".format(split),
+      prefix="record_info-{}-{}-{}".format(split, FLAGS.task, FLAGS.pass_id),
       bsz_per_host=bsz_per_host,
       seq_len=seq_len,
       bi_data=bi_data,

From 1b61bf27311a14a4447f7997cef339e828ef6e61 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 17:29:30 +0900
Subject: [PATCH 2/9] fix errors with get_input_fn in train_gpu

---
 data_utils.py | 4 +++-
 train_gpu.py  | 7 +++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/data_utils.py b/data_utils.py
index a716bc82d..75bfd66fb 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -762,6 +762,8 @@ def parser(record):
 def get_input_fn(
     tfrecord_dir,
     split,
+    task,
+    pass_id,
     bsz_per_host,
     seq_len,
     reuse_len,
@@ -778,7 +780,7 @@

   # Merge all record infos into a single one
   record_glob_base = format_filename(
-      prefix="record_info-{}-{}-{}".format(split, FLAGS.task, FLAGS.pass_id),
+      prefix="record_info-{}-{}-{}".format(split, task, pass_id),
       bsz_per_host=bsz_per_host,
       seq_len=seq_len,
       bi_data=bi_data,
diff --git a/train_gpu.py b/train_gpu.py
index 790c024a0..885e18746 100644
--- a/train_gpu.py
+++ b/train_gpu.py
@@ -127,6 +127,11 @@
 flags.DEFINE_float("init_range", default=0.1,
                    help="Initialization std when init is uniform.")

+# TFRecord Path
+flags.DEFINE_integer("pass_id", 0, help="ID of the current pass. "
+                     "Different passes sample different negative segments.")
+flags.DEFINE_integer("task", 0, help="The Task ID. This value is used when "
+                     "using multiple workers to identify each worker.")

 FLAGS = flags.FLAGS

@@ -186,6 +191,8 @@ def train(ps_device):
   train_input_fn, record_info_dict = data_utils.get_input_fn(
       tfrecord_dir=FLAGS.record_info_dir,
       split="train",
+      pass_id=FLAGS.pass_id,
+      task=FLAGS.task,
       bsz_per_host=FLAGS.train_batch_size,
       seq_len=FLAGS.seq_len,
       reuse_len=FLAGS.reuse_len,

From 4c574d688f47140ccf85f3e1882e056cc675d541 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 17:39:17 +0900
Subject: [PATCH 3/9] fix minor issue

---
 train_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train_gpu.py b/train_gpu.py
index 885e18746..3d4e7469d 100644
--- a/train_gpu.py
+++ b/train_gpu.py
@@ -191,8 +191,8 @@ def train(ps_device):
   train_input_fn, record_info_dict = data_utils.get_input_fn(
       tfrecord_dir=FLAGS.record_info_dir,
       split="train",
-      pass_id=FLAGS.pass_id,
       task=FLAGS.task,
+      pass_id=FLAGS.pass_id,
       bsz_per_host=FLAGS.train_batch_size,
       seq_len=FLAGS.seq_len,
       reuse_len=FLAGS.reuse_len,

From 7ef2f2ddb6aea4ab931b2f5762579e419c78944c Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 19:37:20 +0900
Subject: [PATCH 4/9] fix assertion for even number of batch size

---
 modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modeling.py b/modeling.py
index a7d719cfb..713be5637 100644
--- a/modeling.py
+++ b/modeling.py
@@ -233,7 +233,7 @@ def relative_positional_encoding(qlen, klen, d_model, clamp_len, attn_type,

   if bsz is not None:
     # With bi_data, the batch size should be divisible by 2.
-    assert bsz%2 == 0
+    tf.debugging.assert_equal(bsz % 2, 0)
     fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
     bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
   else:

From 1ea884d155cf789829fa7e0fa384b6987de4b809 Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Thu, 4 Jul 2019 21:05:08 +0900
Subject: [PATCH 5/9] change default value of uncased to False

---
 data_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_utils.py b/data_utils.py
index 75bfd66fb..5e80d1d33 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -885,7 +885,7 @@ def input_fn(params):
   flags.DEFINE_integer("reuse_len", 256,
                        help="Number of token that can be reused as memory. "
                        "Could be half of `seq_len`.")
-  flags.DEFINE_bool("uncased", True, help="Use uncased inputs or not.")
+  flags.DEFINE_bool("uncased", False, help="Use uncased inputs or not.")
   flags.DEFINE_bool("bi_data", True,
                     help="whether to create bidirectional data")
   flags.DEFINE_integer("mask_alpha", default=6,

From 0391d1e2a792b6e147eb0b120bf2d0f057494b2a Mon Sep 17 00:00:00 2001
From: Minho Ryu
Date: Sat, 6 Jul 2019 14:34:08 +0900
Subject: [PATCH 6/9] Fix curr_loss calculation

When restarting training, prev_step is -1, so curr_loss for the first
log line would be calculated incorrectly.
---
 train_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train_gpu.py b/train_gpu.py
index 3d4e7469d..9b5994728 100644
--- a/train_gpu.py
+++ b/train_gpu.py
@@ -300,7 +300,7 @@ def train(ps_device):
         total_loss += loss_np

         if curr_step > 0 and curr_step % FLAGS.iterations == 0:
-          curr_loss = total_loss / (curr_step - prev_step)
+          curr_loss = total_loss / FLAGS.iterations
           tf.logging.info("[{}] | gnorm {:.2f} lr {:8.6f} "
                           "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                               curr_step, fetched[-3], fetched[-2],

From c4850b9160955cbcc4fdc22f21c99feea491dd73 Mon Sep 17 00:00:00 2001
From: Cong
Date: Fri, 11 Oct 2019 15:31:24 +0900
Subject: [PATCH 7/9] Config for TPU pod

I ran `train.py` on a TPU pod v3-256 and got the following error:

    ValueError: TPUConfig.num_shards is not set correctly ....

The docs at
https://cloud.google.com/tpu/docs/training-on-tpu-pods#providing_the_tpu_name_and_region_to_tpuclusterresolver
note that:

> For single device training, you can specify either the TPU name or an
> IP address, for example: `grpc://1.2.3.4:8470`.
> For TPU Pods you must use the TPU name so that TensorFlow can discover
> the IP addresses of all the hosts available for training distribution.

So in the case of a TPU pod, setting `master` doesn't work. I tried
setting `cluster` instead and it worked: all 32 hosts in the TPU pod
were detected and used correctly.
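For reference, a minimal sketch of the resulting wiring (not part of the
diff below; the TPU name, zone, project, bucket, and shard count are
placeholder values for a v3-256, using the TF 1.x `tf.contrib` API this
repo targets):

    import tensorflow as tf

    # Placeholder values -- substitute your own TPU name, zone, project,
    # and model directory. For a pod, the TPU *name* is required; an IP
    # address only works for single devices.
    tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
        "my-tpu-pod", zone="europe-west4-a", project="my-gcp-project")

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster,  # lets TF discover every host in the pod
        master=None,          # tpu_cluster.get_master() breaks on pods
        model_dir="gs://my-bucket/model",
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=1000,
            num_shards=256,   # total cores on a v3-256
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))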
---
 model_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/model_utils.py b/model_utils.py
index fd8d6d8d9..e7b66e241 100644
--- a/model_utils.py
+++ b/model_utils.py
@@ -19,7 +19,7 @@ def configure_tpu(FLAGS):
   if FLAGS.use_tpu:
     tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
         FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
-    master = tpu_cluster.get_master()
+    master = None
   else:
     tpu_cluster = None
     master = FLAGS.master
@@ -42,6 +42,7 @@
   per_host_input = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

   run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster,
       master=master,
       model_dir=FLAGS.model_dir,
       session_config=session_config,

From 0265911ccb09a080e4d579ea476c944f72984570 Mon Sep 17 00:00:00 2001
From: Cong
Date: Wed, 20 Nov 2019 23:28:20 +0700
Subject: [PATCH 8/9] Add pass_id and task in flags to train.py

---
 train.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/train.py b/train.py
index 9abb15167..7c020b33d 100644
--- a/train.py
+++ b/train.py
@@ -140,6 +140,12 @@
 flags.DEFINE_float("init_range", default=0.1,
                    help="Initialization std when init is uniform.")

+# TFRecord Path
+flags.DEFINE_integer("pass_id", 0, help="ID of the current pass. "
+                     "Different passes sample different negative segments.")
+flags.DEFINE_integer("task", 0, help="The Task ID. This value is used when "
+                     "using multiple workers to identify each worker.")
+
 FLAGS = flags.FLAGS
@@ -226,6 +232,8 @@ def get_input_fn(split):
   input_fn, record_info_dict = data_utils.get_input_fn(
       tfrecord_dir=FLAGS.record_info_dir,
       split=split,
+      task=FLAGS.task,
+      pass_id=FLAGS.pass_id,
       bsz_per_host=batch_size // FLAGS.num_hosts,
       seq_len=FLAGS.seq_len,
       reuse_len=FLAGS.reuse_len,

From c4966b6a21b23735daf4ad815947a519de81347c Mon Sep 17 00:00:00 2001
From: Cong
Date: Thu, 21 Nov 2019 07:31:50 +0700
Subject: [PATCH 9/9] Add flag n_token to data_utils.py

---
 data_utils.py     | 4 ++--
 run_classifier.py | 2 +-
 run_race.py       | 2 +-
 run_squad.py      | 2 +-
 train.py          | 1 -
 train_gpu.py      | 2 --
 6 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index 5e80d1d33..3ecf8415d 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -32,7 +32,6 @@
     "<eop>" : 8,
 }

-VOCAB_SIZE = 32000
 UNK_ID = special_symbols["<unk>"]
 CLS_ID = special_symbols["<cls>"]
 SEP_ID = special_symbols["<sep>"]
@@ -188,7 +187,7 @@ def create_data(_):
   # Create and dump corpus_info from task 0
   if FLAGS.task == 0:
     corpus_info = {
-        "vocab_size": VOCAB_SIZE,
+        "vocab_size": FLAGS.n_token,
         "bsz_per_host": FLAGS.bsz_per_host,
         "num_core_per_host": FLAGS.num_core_per_host,
         "seq_len": FLAGS.seq_len,
@@ -886,6 +885,7 @@ def input_fn(params):
                        help="Number of token that can be reused as memory. "
" "Could be half of `seq_len`.") flags.DEFINE_bool("uncased", False, help="Use uncased inputs or not.") + flags.DEFINE_integer("n_token", 32000, help="Vocab size") flags.DEFINE_bool("bi_data", True, help="whether to create bidirectional data") flags.DEFINE_integer("mask_alpha", default=6, diff --git a/run_classifier.py b/run_classifier.py index c6eb1baa1..dc20022c9 100644 --- a/run_classifier.py +++ b/run_classifier.py @@ -21,7 +21,7 @@ import sentencepiece as spm -from data_utils import SEP_ID, VOCAB_SIZE, CLS_ID +from data_utils import SEP_ID, CLS_ID import model_utils import function_builder from classifier_utils import PaddingInputExample diff --git a/run_race.py b/run_race.py index 5f4973c6c..0859f13fb 100644 --- a/run_race.py +++ b/run_race.py @@ -23,7 +23,7 @@ import tensorflow as tf import sentencepiece as spm -from data_utils import SEP_ID, VOCAB_SIZE, CLS_ID +from data_utils import SEP_ID, CLS_ID import model_utils import function_builder from classifier_utils import PaddingInputExample diff --git a/run_squad.py b/run_squad.py index 156fa4396..78b1d2cda 100644 --- a/run_squad.py +++ b/run_squad.py @@ -28,7 +28,7 @@ import function_builder import model_utils import squad_utils -from data_utils import SEP_ID, CLS_ID, VOCAB_SIZE +from data_utils import SEP_ID, CLS_ID SPIECE_UNDERLINE = u'▁' diff --git a/train.py b/train.py index 7c020b33d..fc619c25d 100644 --- a/train.py +++ b/train.py @@ -259,7 +259,6 @@ def main(unused_argv): assert FLAGS.seq_len > 0 assert FLAGS.perm_size > 0 - FLAGS.n_token = data_utils.VOCAB_SIZE tf.logging.info("n_token {}".format(FLAGS.n_token)) if not tf.gfile.Exists(FLAGS.model_dir): diff --git a/train_gpu.py b/train_gpu.py index 9b5994728..f7df62115 100644 --- a/train_gpu.py +++ b/train_gpu.py @@ -321,8 +321,6 @@ def main(unused_argv): tf.logging.set_verbosity(tf.logging.INFO) - # Get corpus info - FLAGS.n_token = data_utils.VOCAB_SIZE tf.logging.info("n_token {}".format(FLAGS.n_token)) if not tf.gfile.Exists(FLAGS.model_dir):