diff --git a/data/locale/ar-SA.ini b/data/locale/ar-SA.ini
index 5c08e38..770d659 100644
--- a/data/locale/ar-SA.ini
+++ b/data/locale/ar-SA.ini
@@ -51,3 +51,4 @@ translate_add_context="الترجمة مع السياق"
 whisper_translate="ترجمة إلى الإنجليزية (Whisper)"
 buffer_size_msec="حجم الذاكرة المؤقتة (ملي ثانية)"
 overlap_size_msec="حجم التداخل (ملي ثانية)"
+buffer_output_type="نوع مخرجات الذاكرة المؤقتة"
diff --git a/data/locale/de-DE.ini b/data/locale/de-DE.ini
index df2e450..d171104 100644
--- a/data/locale/de-DE.ini
+++ b/data/locale/de-DE.ini
@@ -51,3 +51,4 @@ translate_add_context="Mit Kontext übersetzen"
 whisper_translate="Ins Englische übersetzen (Flüstern)"
 buffer_size_msec="Puffergröße (ms)"
 overlap_size_msec="Überlappungsgröße (ms)"
+buffer_output_type="Pufferausgabetyp"
diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
index 03efa24..9db3e12 100644
--- a/data/locale/en-US.ini
+++ b/data/locale/en-US.ini
@@ -70,4 +70,5 @@ translation_no_repeat_ngram_size="No-repeat ngram size"
 translation_max_input_length="Max input length"
 buffered_output_parameters="Buffered output parameters"
 buffer_num_lines="Number of lines"
-buffer_num_chars_per_line="Characters per line"
+buffer_num_chars_per_line="Amount per line"
+buffer_output_type="Output type"
diff --git a/data/locale/es-ES.ini b/data/locale/es-ES.ini
index 59dc350..120677d 100644
--- a/data/locale/es-ES.ini
+++ b/data/locale/es-ES.ini
@@ -51,3 +51,4 @@ translate_add_context="Traducir con contexto"
 whisper_translate="Traducir al inglés (Whisper)"
 buffer_size_msec="Tamaño del búfer (ms)"
 overlap_size_msec="Tamaño de superposición (ms)"
+buffer_output_type="Tipo de salida de búfer"
diff --git a/data/locale/fr-FR.ini b/data/locale/fr-FR.ini
index 0b6cd0c..3dbbd88 100644
--- a/data/locale/fr-FR.ini
+++ b/data/locale/fr-FR.ini
@@ -51,3 +51,4 @@ translate_add_context="Traduire avec contexte"
 whisper_translate="Traduire en anglais (Whisper)"
 buffer_size_msec="Taille du tampon (ms)"
 overlap_size_msec="Taille de chevauchement (ms)"
+buffer_output_type="Type de sortie du tampon"
diff --git a/data/locale/hi-IN.ini b/data/locale/hi-IN.ini
index a7b2d26..d265128 100644
--- a/data/locale/hi-IN.ini
+++ b/data/locale/hi-IN.ini
@@ -51,3 +51,4 @@ translate_add_context="संदर्भ के साथ अनुवाद 
 whisper_translate="अंग्रेजी में अनुवाद करें (व्हिस्पर)"
 buffer_size_msec="बफ़र आकार (ms)"
 overlap_size_msec="ओवरलैप आकार (ms)"
+buffer_output_type="बफ़र आउटपुट प्रकार"
diff --git a/data/locale/ja-JP.ini b/data/locale/ja-JP.ini
index 18445bc..7887bfc 100644
--- a/data/locale/ja-JP.ini
+++ b/data/locale/ja-JP.ini
@@ -51,3 +51,4 @@ translate_add_context="コンテキスト付きで翻訳"
 whisper_translate="英語に翻訳（ウィスパー）"
 buffer_size_msec="バッファサイズ（ms）"
 overlap_size_msec="オーバーラップサイズ（ms）"
+buffer_output_type="バッファ出力タイプ"
diff --git a/data/locale/ko-KR.ini b/data/locale/ko-KR.ini
index 8dbe564..60cd002 100644
--- a/data/locale/ko-KR.ini
+++ b/data/locale/ko-KR.ini
@@ -51,3 +51,4 @@ translate_add_context="컨텍스트와 함께 번역"
 whisper_translate="영어로 번역 (속삭임)"
 buffer_size_msec="버퍼 크기 (ms)"
 overlap_size_msec="오버랩 크기 (ms)"
+buffer_output_type="버퍼 출력 유형"
diff --git a/data/locale/pl-PL.ini b/data/locale/pl-PL.ini
index 64ee55b..752b787 100644
--- a/data/locale/pl-PL.ini
+++ b/data/locale/pl-PL.ini
@@ -51,3 +51,4 @@ translate_add_context="Tłumacz z kontekstem"
 whisper_translate="Tłumacz na angielski (Whisper)"
 buffer_size_msec="Rozmiar bufora (ms)"
 overlap_size_msec="Rozmiar nakładki (ms)"
+buffer_output_type="Typ wyjścia bufora"
diff --git a/data/locale/pt-BR.ini b/data/locale/pt-BR.ini
index 2f0a27e..cd9360c 100644
--- a/data/locale/pt-BR.ini
+++ b/data/locale/pt-BR.ini
@@ -51,3 +51,4 @@ translate_add_context="Traduzir com contexto"
 whisper_translate="Traduzir para inglês (Whisper)"
 buffer_size_msec="Tamanho do buffer (ms)"
 overlap_size_msec="Tamanho da sobreposição (ms)"
+buffer_output_type="Tipo de saída do buffer"
diff --git a/data/locale/ru-RU.ini b/data/locale/ru-RU.ini
index 23090b6..543200d 100644
--- a/data/locale/ru-RU.ini
+++ b/data/locale/ru-RU.ini
@@ -50,3 +50,4 @@ translate_add_context="Перевести с контекстом"
 whisper_translate="Перевести на английский (Whisper)"
 buffer_size_msec="Размер буфера (мс)"
 overlap_size_msec="Размер перекрытия (мс)"
+buffer_output_type="Тип выходных данных буфера"
diff --git a/data/locale/zh-CN.ini b/data/locale/zh-CN.ini
index a561f35..48e8dfa 100644
--- a/data/locale/zh-CN.ini
+++ b/data/locale/zh-CN.ini
@@ -51,3 +51,4 @@ translate_add_context="带上下文翻译"
 whisper_translate="翻译为英语（Whisper）"
 buffer_size_msec="缓冲区大小（毫秒）"
 overlap_size_msec="重叠大小（毫秒）"
+buffer_output_type="缓冲区输出类型"
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 9cb26c0..c16f9cd 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -108,6 +108,8 @@ struct transcription_filter_data {
 	TokenBufferThread captions_monitor;
 	int buffered_output_num_lines = 2;
 	int buffered_output_num_chars = 30;
+	TokenBufferSegmentation buffered_output_output_type =
+		TokenBufferSegmentation::SEGMENTATION_TOKEN;
 
 	// ctor
 	transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv()
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index e03e819..e62e5b9 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -167,9 +167,9 @@ void transcription_filter_destroy(void *data)
 
 void transcription_filter_update(void *data, obs_data_t *s)
 {
-	obs_log(LOG_INFO, "LocalVocal filter update");
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);
+	obs_log(gf->log_level, "LocalVocal filter update");
 
 	gf->log_level = (int)obs_data_get_int(s, "log_level");
 	gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
@@ -188,11 +188,13 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	bool new_buffered_output = obs_data_get_bool(s, "buffered_output");
 	int new_buffer_num_lines = (int)obs_data_get_int(s, "buffer_num_lines");
 	int new_buffer_num_chars_per_line = (int)obs_data_get_int(s, "buffer_num_chars_per_line");
+	TokenBufferSegmentation new_buffer_output_type =
+		(TokenBufferSegmentation)obs_data_get_int(s, "buffer_output_type");
 
 	if (new_buffered_output) {
-		obs_log(LOG_INFO, "buffered_output enable");
+		obs_log(gf->log_level, "buffered_output enable");
 		if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) {
-			obs_log(LOG_INFO, "buffered_output currently disabled, enabling");
+			obs_log(gf->log_level, "buffered_output currently disabled, enabling");
 			gf->buffered_output = true;
 			gf->captions_monitor.initialize(
 				gf,
@@ -203,18 +205,23 @@ void transcription_filter_update(void *data, obs_data_t *s)
 					}
 				},
 				new_buffer_num_lines, new_buffer_num_chars_per_line,
-				std::chrono::seconds(10));
+				std::chrono::seconds(3), new_buffer_output_type);
 		} else {
 			if (new_buffer_num_lines != gf->buffered_output_num_lines ||
-			    new_buffer_num_chars_per_line != gf->buffered_output_num_chars) {
-				obs_log(LOG_INFO, "buffered_output parameters changed, updating");
+			    new_buffer_num_chars_per_line != gf->buffered_output_num_chars ||
+			    new_buffer_output_type != gf->buffered_output_output_type) {
+				obs_log(gf->log_level,
+					"buffered_output parameters changed, updating");
+				gf->captions_monitor.clear();
 				gf->captions_monitor.setNumSentences(new_buffer_num_lines);
 				gf->captions_monitor.setNumPerSentence(
 					new_buffer_num_chars_per_line);
-				gf->buffered_output_num_lines = new_buffer_num_lines;
-				gf->buffered_output_num_chars = new_buffer_num_chars_per_line;
+				gf->captions_monitor.setSegmentation(new_buffer_output_type);
 			}
 		}
+		gf->buffered_output_num_lines = new_buffer_num_lines;
+		gf->buffered_output_num_chars = new_buffer_num_chars_per_line;
+		gf->buffered_output_output_type = new_buffer_output_type;
 	} else {
 		obs_log(gf->log_level, "buffered_output disable");
 		if (gf->buffered_output) {
@@ -349,13 +356,23 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		}
 	}
 
-	if (gf->initial_creation && gf->context != nullptr && obs_source_enabled(gf->context)) {
-		obs_log(LOG_INFO, "Initial filter creation and source enabled");
+	if (gf->context != nullptr && obs_source_enabled(gf->context)) {
+		if (gf->initial_creation) {
+			obs_log(LOG_INFO, "Initial filter creation and source enabled");
 
-		// source was enabled on creation
-		update_whisper_model(gf);
-		gf->active = true;
-		gf->initial_creation = false;
+			// source was enabled on creation
+			update_whisper_model(gf);
+			gf->active = true;
+			gf->initial_creation = false;
+		} else {
+			// check if the whisper model selection has changed
+			const std::string new_model_path =
+				obs_data_get_string(s, "whisper_model_path");
+			if (gf->whisper_model_path != new_model_path) {
+				obs_log(LOG_INFO, "New model selected: %s", new_model_path.c_str());
+				update_whisper_model(gf);
+			}
+		}
 	}
 }
 
@@ -506,9 +523,11 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_bool(s, "buffered_output", false);
 	obs_data_set_default_int(s, "buffer_num_lines", 2);
 	obs_data_set_default_int(s, "buffer_num_chars_per_line", 30);
+	obs_data_set_default_int(s, "buffer_output_type",
+				 (int)TokenBufferSegmentation::SEGMENTATION_TOKEN);
 
 	obs_data_set_default_bool(s, "vad_enabled", true);
-	obs_data_set_default_double(s, "vad_threshold", 0.5);
+	obs_data_set_default_double(s, "vad_threshold", 0.65);
 	obs_data_set_default_int(s, "log_level", LOG_DEBUG);
 	obs_data_set_default_bool(s, "log_words", false);
 	obs_data_set_default_bool(s, "caption_to_stream", false);
@@ -669,6 +688,16 @@ obs_properties_t *transcription_filter_properties(void *data)
 		return true;
 	});
 
+	// Add language selector
+	obs_property_t *whisper_language_select_list =
+		obs_properties_add_list(ppts, "whisper_language_select", MT_("language"),
+					OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
+	// iterate over all available languages and add them to the list
+	for (auto const &pair : whisper_available_lang_reverse) {
+		obs_property_list_add_string(whisper_language_select_list, pair.first.c_str(),
+					     pair.second.c_str());
+	}
+
 	// add translation option group
 	obs_properties_t *translation_group = obs_properties_create();
 	obs_property_t *translation_group_prop = obs_properties_add_group(
@@ -806,7 +835,8 @@ obs_properties_t *transcription_filter_properties(void *data)
 		     {"whisper_params_group", "log_words", "caption_to_stream", "buffer_size_msec",
 		      "overlap_size_msec", "step_by_step_processing", "min_sub_duration",
 		      "process_while_muted", "buffered_output", "vad_enabled", "log_level",
-		      "suppress_sentences", "sentence_psum_accept_thresh", "vad_threshold"}) {
+		      "suppress_sentences", "sentence_psum_accept_thresh", "vad_threshold",
+		      "buffered_output_group"}) {
 			obs_property_set_visible(obs_properties_get(props, prop_name.c_str()),
 						 show_hide);
 		}
@@ -820,6 +850,12 @@ obs_properties_t *transcription_filter_properties(void *data)
 	obs_properties_t *buffered_output_group = obs_properties_create();
 	obs_properties_add_group(ppts, "buffered_output_group", MT_("buffered_output_parameters"),
 				 OBS_GROUP_NORMAL, buffered_output_group);
+	// add buffer "type" character or word
+	obs_property_t *buffer_type_list = obs_properties_add_list(
+		buffered_output_group, "buffer_output_type", MT_("buffer_output_type"),
+		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
+	obs_property_list_add_int(buffer_type_list, "Character", SEGMENTATION_TOKEN);
+	obs_property_list_add_int(buffer_type_list, "Word", SEGMENTATION_WORD);
 	// add buffer lines parameter
 	obs_properties_add_int_slider(buffered_output_group, "buffer_num_lines",
 				      MT_("buffer_num_lines"), 1, 5, 1);
@@ -868,16 +904,6 @@ obs_properties_t *transcription_filter_properties(void *data)
 	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
 				 OBS_GROUP_NORMAL, whisper_params_group);
 
-	// Add language selector
-	obs_property_t *whisper_language_select_list = obs_properties_add_list(
-		whisper_params_group, "whisper_language_select", MT_("language"),
-		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
-	// iterate over all available languages and add them to the list
-	for (auto const &pair : whisper_available_lang_reverse) {
-		obs_property_list_add_string(whisper_language_select_list, pair.first.c_str(),
-					     pair.second.c_str());
-	}
-
 	obs_property_t *whisper_sampling_method_list = obs_properties_add_list(
 		whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"),
 		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
diff --git a/src/transcription-utils.h b/src/transcription-utils.h
index 4e7f39c..e81b909 100644
--- a/src/transcription-utils.h
+++ b/src/transcription-utils.h
@@ -4,6 +4,8 @@
 #include <string>
 #include <vector>
 #include <chrono>
+#include <algorithm>
+#include <cctype>
 
 // Fix UTF8 string for Windows
 std::string fix_utf8(const std::string &str);
@@ -25,4 +27,18 @@ inline uint64_t now_ms()
 // Split a string into words based on spaces
 std::vector<std::string> split_words(const std::string &str_copy);
 
+// trim (strip) leading and trailing ASCII whitespace (space, \t \n \v \f \r).
+// Characters are compared in the string's own character type, so a wide
+// character such as U+0420 is never narrowed and mistaken for a space.
+template<typename StringLike> StringLike trim(const StringLike &str)
+{
+	using Char = typename StringLike::value_type;
+	const auto not_space = [](Char ch) {
+		return ch != Char(' ') && (ch < Char('\t') || ch > Char('\r'));
+	};
+	const auto first = std::find_if(str.begin(), str.end(), not_space);
+	const auto last = std::find_if(str.rbegin(), str.rend(), not_space).base();
+	return first < last ? StringLike(first, last) : StringLike();
+}
+
 #endif // TRANSCRIPTION_UTILS_H
diff --git a/src/whisper-utils/token-buffer-thread.cpp b/src/whisper-utils/token-buffer-thread.cpp
index e88d76c..b2a4093 100644
--- a/src/whisper-utils/token-buffer-thread.cpp
+++ b/src/whisper-utils/token-buffer-thread.cpp
@@ -4,6 +4,7 @@
 
 #include "token-buffer-thread.h"
 #include "whisper-utils.h"
+#include "transcription-utils.h"
 
 #include <obs-module.h>
 
@@ -18,12 +19,13 @@
 
 TokenBufferThread::TokenBufferThread() noexcept
 	: gf(nullptr),
-	  numSentences(1),
-	  numPerSentence(1),
+	  numSentences(2),
+	  numPerSentence(30),
 	  maxTime(0),
 	  stop(true),
 	  presentationQueueMutex(),
-	  inputQueueMutex()
+	  inputQueueMutex(),
+	  segmentation(SEGMENTATION_TOKEN)
 {
 }
 
@@ -110,6 +112,8 @@ void TokenBufferThread::clear()
 		std::lock_guard<std::mutex> lock(presentationQueueMutex);
 		presentationQueue.clear();
 	}
+	this->lastCaption = "";
+	this->lastCaptionTime = std::chrono::steady_clock::now();
 	this->callback("");
 }
 
@@ -134,6 +138,13 @@ void TokenBufferThread::monitor()
 				for (size_t i = 0; i < this->numPerSentence; i++) {
 					presentationQueue.pop_front();
 				}
+				if (this->segmentation == SEGMENTATION_TOKEN) {
+					// pop tokens until a space is found
+					while (!presentationQueue.empty() &&
+					       presentationQueue.front() != SPACE) {
+						presentationQueue.pop_front();
+					}
+				}
 			}
 
 			{
@@ -146,10 +157,23 @@ void TokenBufferThread::monitor()
 						for (const auto &token : inputQueue) {
 							presentationQueue.push_back(token);
 						}
-					} else {
+					} else if (this->segmentation == SEGMENTATION_TOKEN) {
 						// add one token to the presentation queue
 						presentationQueue.push_back(inputQueue.front());
 						inputQueue.pop_front();
+					} else {
+						// skip spaces in the beginning of the input queue
+						while (inputQueue.front() == SPACE) {
+							inputQueue.pop_front();
+						}
+						// add one word to the presentation queue
+						TokenBufferString word;
+						while (!inputQueue.empty() &&
+						       inputQueue.front() != SPACE) {
+							word += inputQueue.front();
+							inputQueue.pop_front();
+						}
+						presentationQueue.push_back(word);
 					}
 				}
 			}
@@ -158,49 +182,64 @@ void TokenBufferThread::monitor()
 				// build a caption from the presentation queue in sentences
 				// with a maximum of numPerSentence tokens/words per sentence
 				// and a newline between sentences
-				TokenBufferString caption;
+				std::vector<TokenBufferString> sentences(1);
+
 				if (this->segmentation == SEGMENTATION_WORD) {
-					// iterate through the presentation queue tokens and make words (based on spaces)
-					// then build a caption with a maximum of numPerSentence words per sentence
+					// add words from the presentation queue to the sentences
+					// if a sentence is full - start a new one
 					size_t wordsInSentence = 0;
-					TokenBufferString word;
-					for (const auto &token : presentationQueue) {
-						// keep adding tokens to the word until a space is found
-						word += token;
-						if (word.find(SPACE) != TokenBufferString::npos) {
-							// cut the word at the space and add it to the caption
-							caption += word.substr(0, word.find(SPACE));
-							wordsInSentence++;
-							// keep the rest of the word for the next iteration
-							word = word.substr(word.find(SPACE) + 1);
-
-							if (wordsInSentence ==
-							    this->numPerSentence) {
-								caption += word;
-								caption += SPACE;
-								wordsInSentence = 0;
-								word.clear();
-							}
+					for (size_t i = 0; i < presentationQueue.size(); i++) {
+						const auto &word = presentationQueue[i];
+						sentences.back() += word + SPACE;
+						wordsInSentence++;
+						if (wordsInSentence == this->numPerSentence) {
+							sentences.push_back(TokenBufferString());
 						}
 					}
 				} else {
 					// iterate through the presentation queue tokens and build a caption
-					size_t tokensInSentence = 0;
-					for (const auto &token : presentationQueue) {
+					for (size_t i = 0; i < presentationQueue.size(); i++) {
+						const auto &token = presentationQueue[i];
 						// skip spaces in the beginning of a sentence (tokensInSentence == 0)
-						if (token == SPACE && tokensInSentence == 0) {
+						if (token == SPACE &&
+						    sentences.back().length() == 0) {
 							continue;
 						}
 
-						caption += token;
-						tokensInSentence++;
-						if (tokensInSentence == this->numPerSentence) {
-							caption += NEWLINE;
-							tokensInSentence = 0;
+						sentences.back() += token;
+						if (sentences.back().length() ==
+						    this->numPerSentence) {
+							// if the next character is not a space - this is a broken word
+							// roll back to the last space, replace it with a newline
+							size_t lastSpace =
+								sentences.back().find_last_of(
+									SPACE);
+							sentences.push_back(sentences.back().substr(
+								lastSpace + 1));
+							sentences[sentences.size() - 2] =
+								sentences[sentences.size() - 2]
+									.substr(0, lastSpace);
 						}
 					}
 				}
 
+				TokenBufferString caption;
+				// if there are more sentences than numSentences - remove the oldest ones
+				while (sentences.size() > this->numSentences) {
+					sentences.erase(sentences.begin());
+				}
+				// if there are less sentences than numSentences - add empty sentences
+				while (sentences.size() < this->numSentences) {
+					sentences.push_back(TokenBufferString());
+				}
+				// build the caption from the sentences
+				for (const auto &sentence : sentences) {
+					if (!sentence.empty()) {
+						caption += trim<TokenBufferString>(sentence);
+					}
+					caption += NEWLINE;
+				}
+
 #ifdef _WIN32
 				// convert caption to multibyte for obs
 				int count = WideCharToMultiByte(CP_UTF8, 0, caption.c_str(),
@@ -222,19 +261,59 @@ void TokenBufferThread::monitor()
 
 		if (caption_out.empty()) {
 			// if no caption was built, sleep for a while
+			this->lastCaption = "";
+			this->lastCaptionTime = std::chrono::steady_clock::now();
 			std::this_thread::sleep_for(std::chrono::milliseconds(100));
 			continue;
 		}
 
-		// emit the caption
-		this->callback(caption_out);
+		if (caption_out == lastCaption) {
+			// if it has been max_time since the last caption - clear the presentation queue
+			if (this->maxTime.count() > 0) {
+				auto now = std::chrono::steady_clock::now();
+				auto duration = std::chrono::duration_cast<std::chrono::seconds>(
+					now - this->lastCaptionTime);
+				if (duration > this->maxTime) {
+					this->clear();
+				}
+			}
+		} else {
+			// emit the caption
+			this->callback(caption_out);
+			this->lastCaption = caption_out;
+			this->lastCaptionTime = std::chrono::steady_clock::now();
+		}
 
 		// check the input queue size (iqs), if it's big - sleep less
-		std::this_thread::sleep_for(std::chrono::milliseconds(inputQueue.size() > 30 ? 33
-								      : inputQueue.size() > 15
-									      ? 66
-									      : 100));
+		std::this_thread::sleep_for(std::chrono::milliseconds(
+			inputQueue.size() > 30   ? getWaitTime(SPEED_FAST)
+			: inputQueue.size() > 15 ? getWaitTime(SPEED_NORMAL)
+						 : getWaitTime(SPEED_SLOW)));
 	}
 
 	obs_log(LOG_INFO, "TokenBufferThread::monitor: done");
 }
+
+int TokenBufferThread::getWaitTime(TokenBufferSpeed speed) const
+{
+	// Sleep interval, in milliseconds, that monitor() uses between
+	// caption refreshes:
+	//   word segmentation:  200 / 150 / 100 (slow / normal / fast)
+	//   token segmentation: 100 /  66 /  33 (slow / normal / fast)
+	const bool byWord = this->segmentation == SEGMENTATION_WORD;
+	const bool byToken = this->segmentation == SEGMENTATION_TOKEN;
+	if (byWord || byToken) {
+		if (speed == SPEED_SLOW) {
+			return byWord ? 200 : 100;
+		}
+		if (speed == SPEED_NORMAL) {
+			return byWord ? 150 : 66;
+		}
+		if (speed == SPEED_FAST) {
+			return byWord ? 100 : 33;
+		}
+	}
+	// Any other segmentation mode or speed value falls back to a one
+	// second wait.
+	return 1000;
+}
diff --git a/src/whisper-utils/token-buffer-thread.h b/src/whisper-utils/token-buffer-thread.h
index 0dbe14e..a27b244 100644
--- a/src/whisper-utils/token-buffer-thread.h
+++ b/src/whisper-utils/token-buffer-thread.h
@@ -23,6 +23,7 @@ typedef std::string TokenBufferString;
 struct transcription_filter_data;
 
 enum TokenBufferSegmentation { SEGMENTATION_WORD = 0, SEGMENTATION_TOKEN, SEGMENTATION_SENTENCE };
+enum TokenBufferSpeed { SPEED_SLOW = 0, SPEED_NORMAL, SPEED_FAST };
 
 class TokenBufferThread {
 public:
@@ -43,10 +44,16 @@ class TokenBufferThread {
 
 	void setNumSentences(size_t numSentences_) { numSentences = numSentences_; }
 	void setNumPerSentence(size_t numPerSentence_) { numPerSentence = numPerSentence_; }
+	void setMaxTime(std::chrono::seconds maxTime_) { maxTime = maxTime_; }
+	void setSegmentation(TokenBufferSegmentation segmentation_)
+	{
+		segmentation = segmentation_;
+	}
 
 private:
 	void monitor();
 	void log_token_vector(const std::vector<std::string> &tokens);
+	int getWaitTime(TokenBufferSpeed speed) const;
 	struct transcription_filter_data *gf;
 	std::deque<TokenBufferString> inputQueue;
 	std::deque<TokenBufferString> presentationQueue;
@@ -61,6 +68,9 @@ class TokenBufferThread {
 	size_t numSentences;
 	size_t numPerSentence;
 	TokenBufferSegmentation segmentation;
+	// timestamp of the last caption
+	std::chrono::time_point<std::chrono::steady_clock> lastCaptionTime;
+	std::string lastCaption;
 };
 
 #endif