***Add new files for testing perplexity***
***Update CMakeLists.txt to include new files and enable examples***

***Update README with instructions for evaluating model quality***
yvonwin committed Apr 16, 2024
1 parent 17afbfb commit 32fd347
Showing 7 changed files with 300 additions and 49 deletions.
21 changes: 18 additions & 3 deletions CMakeLists.txt
@@ -12,7 +12,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall")
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()
# set(CMAKE_BUILD_TYPE "Debug")

# third-party libraries
set(ABSL_ENABLE_INSTALL ON)
@@ -36,6 +35,8 @@ if (GGML_METAL)
configure_file(third_party/ggml/src/ggml-metal.metal ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
endif ()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

file(GLOB CPP_SOURCES
${PROJECT_SOURCE_DIR}/*.h
${PROJECT_SOURCE_DIR}/*.cpp)
@@ -49,8 +50,22 @@ endif()
add_library(qwen STATIC qwen.cpp)
target_link_libraries(qwen PUBLIC re2::re2 ggml)

add_executable(main main.cpp)
target_link_libraries(main PRIVATE qwen)
# add_executable(main main.cpp)
# target_link_libraries(main PRIVATE qwen)

# c++ examples
option(QWEN_ENABLE_EXAMPLES "qwen: enable c++ examples" ON)
if (QWEN_ENABLE_EXAMPLES)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE qwen)

find_package(OpenMP)
if (OpenMP_CXX_FOUND)
set(QWEN_OPENMP_TARGET OpenMP::OpenMP_CXX)
endif ()
add_executable(perplexity tests/perplexity.cpp)
target_link_libraries(perplexity PRIVATE qwen ${QWEN_OPENMP_TARGET})
endif ()
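With the default `QWEN_ENABLE_EXAMPLES=ON`, configuring the project now builds the `perplexity` example from `tests/perplexity.cpp` alongside `main` (linking OpenMP when a C++ OpenMP runtime is found); passing `-DQWEN_ENABLE_EXAMPLES=OFF` at configure time should skip both executables.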

# GoogleTest
option(QWEN_ENABLE_TESTING "qwen: enable testing" OFF)
16 changes: 16 additions & 0 deletions README.md
@@ -257,6 +257,22 @@ output time: 15149.7 ms / 159 tokens (95.281 ms/token)
total time: 15948.1 ms
```


## Model Quality

We measure model quality by evaluating perplexity over the WikiText-2 test set, following the strided sliding-window strategy described in https://huggingface.co/docs/transformers/perplexity. Lower perplexity generally indicates a better model; a short sketch of the computation is given after the commands below.

Download and unzip the dataset:

```sh
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip
```

Then run the perplexity evaluation on the test split:

```sh
./build/bin/perplexity -m <model_path> -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```
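
Perplexity is the exponential of the average negative log-likelihood of the scored tokens, i.e. `ppl = exp((1/N) * Σ -log p(token | context))`. The snippet below is only a minimal sketch of that final reduction, not the shipped `tests/perplexity.cpp`; it assumes the per-token negative log-likelihoods have already been collected window by window with the strided strategy above.

```cpp
// Minimal sketch (not the actual tests/perplexity.cpp): reduce per-token
// negative log-likelihoods, gathered over strided windows, to a perplexity.
#include <cmath>
#include <cstdio>
#include <vector>

double perplexity(const std::vector<double> &nlls) {
    double sum = 0.0;
    for (double nll : nlls) sum += nll;  // total -log p over all scored tokens
    return std::exp(sum / nlls.size());  // exp of the mean negative log-likelihood
}

int main() {
    std::vector<double> nlls = {2.1, 1.8, 2.4, 2.0};  // toy values for illustration
    std::printf("ppl = %.3f\n", perplexity(nlls));
    return 0;
}
```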

## Development

**Unit Test**
90 changes: 51 additions & 39 deletions qwen.cpp
@@ -261,7 +261,7 @@ auto ModelLoader::read_tensor(const std::string &name, ggml_tensor *tensor) -> v
// read and check tensor shape
{
int ndim = read_basic<int>();
int n_dims = ggml_n_dims((tensor));
int n_dims = ggml_n_dims(tensor);
// a quick fix
if ((n_dims == 1) && (ndim == 2) && (tensor->ne[1] == 1))
n_dims = 2;
@@ -431,14 +431,14 @@ QwenAttention::QwenAttention(ModelContext *ctx, int hidden_size, int num_attenti
v_cache(ggml_new_tensor_3d(ctx->ctx_kv.get(), GGML_TYPE_F16, max_length, hidden_size / num_attention_heads,
num_kv_heads)) {}

auto QwenAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * {
auto QwenAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();

const int hidden_size = hidden_states->ne[0];
const int qlen = hidden_states->ne[1];
const int head_size = hidden_size / num_attention_heads;
const int rope_dim = head_size;
const int n_past = static_cast<int *>(KQ_pos->data)[0];
// const int n_past = static_cast<int *>(KQ_pos->data)[0];

ggml_tensor *q = q_proj.forward(ctx, hidden_states); // [qlen, hidden]
// [qlen, heads, head_size]
@@ -580,12 +580,12 @@ auto Qwen2MoeSparseMoeBlock::forward(ModelContext *ctx, ggml_tensor *hidden_stat
return final_hidden_states;
}

auto QwenBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * {
auto QwenBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();

ggml_tensor *residual = hidden_states;
hidden_states = input_layernorm.forward(ctx, hidden_states, 1e-6f);
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_past, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]
hidden_states = tensor_assign_buffers(ggml_add_inplace(gctx, hidden_states, residual));

residual = hidden_states;
@@ -595,12 +595,12 @@ auto QwenBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tens
return hidden_states;
}

auto QwenMoeBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx , int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
auto QwenMoeBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx , int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();

ggml_tensor *residual = hidden_states;
hidden_states = input_layernorm.forward(ctx, hidden_states, 1e-6f);
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_past, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]

hidden_states = tensor_assign_buffers(ggml_add_inplace(gctx, hidden_states, residual));

@@ -620,12 +620,12 @@ QwenModel::QwenModel(ModelContext *ctx, const QwenConfig &config)
}
}

auto QwenModel::forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * {
auto QwenModel::forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();
ggml_tensor *KQ_pos = pos_ids_gen_(gctx, input_ids->ne[0], n_past, n_ctx);
if(KQ_pos){
tensor_to_device(KQ_pos);
}
ggml_tensor *hidden_states = embed_tokens.forward(ctx, input_ids);
for (const auto &layer : layers) {
ggml_set_scratch(gctx, ctx->scratch);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos, n_ctx);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos,n_past, n_ctx);
}
if(KQ_pos){
tensor_to_cpu(KQ_pos);
}
ggml_scratch empty_scratch = {0, 0, nullptr};
ggml_set_scratch(gctx, empty_scratch);
@@ -642,13 +649,21 @@ QwenMoeModel::QwenMoeModel(ModelContext *ctx, const QwenMoeConfig &config)
}
}

auto QwenMoeModel::forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
auto QwenMoeModel::forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();
ggml_tensor *KQ_pos = pos_ids_gen_(gctx, input_ids->ne[0], n_past, n_ctx);
if(KQ_pos){
tensor_to_device(KQ_pos);
}
ggml_tensor *hidden_states = embed_tokens.forward(ctx, input_ids);
for (const auto &layer : layers) {
ggml_set_scratch(gctx, ctx->scratch);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos, n_ctx, num_experts, num_experts_per_tok);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos, n_past, n_ctx, num_experts, num_experts_per_tok);
}
if(KQ_pos){
tensor_to_cpu(KQ_pos);
}

ggml_scratch empty_scratch = {0, 0, nullptr};
ggml_set_scratch(gctx, empty_scratch);
hidden_states = norm.forward(ctx, hidden_states, 1e-6f);
@@ -819,18 +834,13 @@ QwenMoeForCausalLM::~QwenMoeForCausalLM() {
}
}

auto QwenForCausalLM::generate_next_token(
const std::vector<int32_t> &input_ids,
const GenerationConfig &gen_config,
int n_past,
int n_ctx
) -> int32_t {
auto QwenForCausalLM::forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,int n_threads, bool is_decoding) -> ggml_tensor*{
ctx_.ctx_b = make_unique_ggml_context(ctx_.compute_buffer.size(), ctx_.compute_buffer.data(), false);
// ctx_.gf = ggml_new_graph(ctx_.ctx_b.get()); // dafault graph size didn't fit larger model like 32b, moe
size_t GRAPH_SIZE = 4096 * 2;
ctx_.gf = ggml_new_graph_custom(ctx_.ctx_b.get(), GRAPH_SIZE, false);

int n_threads = gen_config.num_threads; // user defined
// int n_threads = gen_config.num_threads; // user defined
if (n_threads <= 0) {
n_threads = get_default_num_threads(); // default thread num
}
@@ -842,28 +852,28 @@ auto QwenForCausalLM::generate_next_token(
ggml_tensor *curr_input_ids = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
memcpy(curr_input_ids->data, input_ids.data() + n_past, ggml_nbytes(curr_input_ids));

ggml_tensor *KQ_pos = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
int * data = static_cast<int *>(KQ_pos->data);
for (int i = 0; i < curr_input_ids_size; ++i) {
data[i] = n_past + i;
}
if (KQ_pos) {
tensor_to_device(KQ_pos);
}
ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx, is_decoding);
lm_logits->backend = GGML_BACKEND_CPU;
// lm_logits->backend = GGML_BACKEND_TYPE_CPU; //newer ggml

ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, KQ_pos, n_ctx);
lm_logits->backend = GGML_BACKEND_CPU; // newer ggml version
if (KQ_pos) {
tensor_to_cpu(KQ_pos);
}

ggml_build_forward_expand(ctx_.gf, lm_logits);
#ifdef GGML_USE_METAL
ggml_metal_graph_compute(ctx_.ctx_metal.get(), ctx_.gf);
#else
ggml_graph_compute_helper(ctx_.work_buffer, ctx_.gf, n_threads);
#endif
// std::cout << lm_logits -> ne[1] << std::endl;
return lm_logits;
}

auto QwenForCausalLM::generate_next_token(
const std::vector<int32_t> &input_ids,
const GenerationConfig &gen_config,
int n_past,
int n_ctx
) -> int32_t {
ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, gen_config.num_threads, true);
int vocab_size = lm_logits->ne[0];
float *next_token_logits = (float *)lm_logits->data;

@@ -1068,12 +1078,13 @@ auto QwenMoeForCausalLM::load(ModelLoader &loader) -> void {
auto QwenForCausalLM::forward(
ModelContext *ctx,
ggml_tensor *input_ids,
ggml_tensor *KQ_pos,
int n_ctx
int n_past,
int n_ctx,
bool is_decoding
) const -> ggml_tensor * {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, KQ_pos, n_ctx);
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
@@ -1085,12 +1096,13 @@ auto QwenMoeForCausalLM::forward(
auto QwenMoeForCausalLM::forward(
ModelContext *ctx,
ggml_tensor *input_ids,
ggml_tensor *KQ_pos,
int n_ctx
int n_past,
int n_ctx,
bool is_decoding
) const -> ggml_tensor * {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, KQ_pos, n_ctx, config.num_experts, config.num_experts_per_tok);
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx, config.num_experts, config.num_experts_per_tok);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
29 changes: 22 additions & 7 deletions qwen.h
@@ -1,5 +1,6 @@
#pragma once

#include <cmath>
#include "tiktoken.h"

#include <ggml.h>
@@ -356,7 +357,7 @@ class QwenAttention {
QwenAttention() : num_attention_heads(0), num_kv_heads(0) {}
QwenAttention(ModelContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length);

auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx) const -> ggml_tensor *;

int num_attention_heads;
int num_kv_heads;
@@ -425,14 +426,24 @@ class QwenBlock {
post_attention_layernorm(ctx, hidden_size, false),
mlp(ctx, hidden_size, intermediate_size) {}

auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos,int n_past, int n_ctx) const -> ggml_tensor *;

RMSNorm input_layernorm;
QwenAttention attn;
RMSNorm post_attention_layernorm;
QwenMLP mlp;
};

struct BasicPositionIdsGenerator {
ggml_tensor *operator()(ggml_context *ctx, int qlen, int n_past, int n_ctx) const {
ggml_tensor *position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, qlen);
for (int i = 0; i < qlen; i++) {
((int *)position_ids->data)[i] = n_past + i;
}
return position_ids;
}
};

class QwenMoeBlock {
public:
QwenMoeBlock() = default;
@@ -442,7 +453,7 @@ class QwenMoeBlock {
post_attention_layernorm(ctx, hidden_size, false),
mlp(ctx, hidden_size, intermediate_size, moe_intermediate_size, shared_expert_intermediate_size, num_experts,num_experts_per_tok) {}

auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;

RMSNorm input_layernorm;
QwenAttention attn;
@@ -455,11 +466,12 @@ class QwenModel {
QwenModel() = default;
QwenModel(ModelContext *ctx, const QwenConfig &config);

auto forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const -> ggml_tensor *;

Embedding embed_tokens;
std::vector<QwenBlock> layers;
RMSNorm norm;
BasicPositionIdsGenerator pos_ids_gen_;
};

class QwenMoeModel {
Expand All @@ -468,11 +480,12 @@ class QwenMoeModel {
QwenMoeModel(ModelContext *ctx, const QwenMoeConfig &config);

// Attention: These parameters should not be set to fixed values. I did this for quick implementation.
auto forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;

Embedding embed_tokens;
std::vector<QwenMoeBlock> layers;
RMSNorm norm;
BasicPositionIdsGenerator pos_ids_gen_;
};

class QwenForCausalLM {
Expand Down Expand Up @@ -505,7 +518,9 @@ class QwenForCausalLM {

virtual void load(ModelLoader &loader);

virtual ggml_tensor * forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const;
virtual ggml_tensor * forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const;

auto forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,int n_threads, bool is_decoding)-> ggml_tensor*;

static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1280 * MB; // 2k context
@@ -526,7 +541,7 @@ class QwenMoeForCausalLM {
// Override methods here if needed

auto load(ModelLoader &loader) -> void override;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * override;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const -> ggml_tensor * override;

static constexpr size_t MEM_SIZE = 812ull * MB;
static constexpr size_t SCRATCH_SIZE = 1844ull * MB;
11 changes: 11 additions & 0 deletions tests/data/get-wikitext-2.sh
@@ -0,0 +1,11 @@
#!/bin/bash

wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip

echo "Usage:"
echo ""
echo " ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]"
echo ""

exit 0
