***Add new files for testing perplexity***
***Update CMakeLists.txt to include new files and enable examples***

***Update README with instructions for evaluating model quality***
yvonwin committed Apr 16, 2024
1 parent 17afbfb commit 32fd347
Showing 7 changed files with 300 additions and 49 deletions.
21 changes: 18 additions & 3 deletions CMakeLists.txt
@@ -12,7 +12,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall")
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()
# set(CMAKE_BUILD_TYPE "Debug")

# third-party libraries
set(ABSL_ENABLE_INSTALL ON)
@@ -36,6 +35,8 @@ if (GGML_METAL)
configure_file(third_party/ggml/src/ggml-metal.metal ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
endif ()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

file(GLOB CPP_SOURCES
${PROJECT_SOURCE_DIR}/*.h
${PROJECT_SOURCE_DIR}/*.cpp)
@@ -49,8 +50,22 @@ endif()
add_library(qwen STATIC qwen.cpp)
target_link_libraries(qwen PUBLIC re2::re2 ggml)

add_executable(main main.cpp)
target_link_libraries(main PRIVATE qwen)
# add_executable(main main.cpp)
# target_link_libraries(main PRIVATE qwen)

# c++ examples
option(QWEN_ENABLE_EXAMPLES "qwen: enable c++ examples" ON)
if (QWEN_ENABLE_EXAMPLES)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE qwen)

find_package(OpenMP)
if (OpenMP_CXX_FOUND)
set(QWEN_OPENMP_TARGET OpenMP::OpenMP_CXX)
endif ()
add_executable(perplexity tests/perplexity.cpp)
target_link_libraries(perplexity PRIVATE qwen ${QWEN_OPENMP_TARGET})
endif ()
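With the default `QWEN_ENABLE_EXAMPLES=ON`, configuring the project now builds the `perplexity` example from `tests/perplexity.cpp` alongside `main` (linking OpenMP when a C++ OpenMP runtime is found); passing `-DQWEN_ENABLE_EXAMPLES=OFF` at configure time should skip both executables.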

# GoogleTest
option(QWEN_ENABLE_TESTING "qwen: enable testing" OFF)
16 changes: 16 additions & 0 deletions README.md
@@ -257,6 +257,22 @@ output time: 15149.7 ms / 159 tokens (95.281 ms/token)
total time: 15948.1 ms
```


## Model Quality

We measure model quality by evaluating perplexity over the WikiText-2 test set, following the strided sliding-window strategy described in https://huggingface.co/docs/transformers/perplexity. Lower perplexity generally indicates a better model; a short sketch of the computation is given after the commands below.

Download and unzip the dataset:

```sh
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip
```

Then run the perplexity evaluation on the test split:

```sh
./build/bin/perplexity -m <model_path> -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048
```
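
Perplexity is the exponential of the average negative log-likelihood of the scored tokens, i.e. `ppl = exp((1/N) * Σ -log p(token | context))`. The snippet below is only a minimal sketch of that final reduction, not the shipped `tests/perplexity.cpp`; it assumes the per-token negative log-likelihoods have already been collected window by window with the strided strategy above.

```cpp
// Minimal sketch (not the actual tests/perplexity.cpp): reduce per-token
// negative log-likelihoods, gathered over strided windows, to a perplexity.
#include <cmath>
#include <cstdio>
#include <vector>

double perplexity(const std::vector<double> &nlls) {
    double sum = 0.0;
    for (double nll : nlls) sum += nll;  // total -log p over all scored tokens
    return std::exp(sum / nlls.size());  // exp of the mean negative log-likelihood
}

int main() {
    std::vector<double> nlls = {2.1, 1.8, 2.4, 2.0};  // toy values for illustration
    std::printf("ppl = %.3f\n", perplexity(nlls));
    return 0;
}
```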

## Development

**Unit Test**
90 changes: 51 additions & 39 deletions qwen.cpp
@@ -261,7 +261,7 @@ auto ModelLoader::read_tensor(const std::string &name, ggml_tensor *tensor) -> v
// read and check tensor shape
{
int ndim = read_basic<int>();
int n_dims = ggml_n_dims((tensor));
int n_dims = ggml_n_dims(tensor);
// a quick fix
if ((n_dims == 1) && (ndim == 2) && (tensor->ne[1] == 1))
n_dims = 2;
@@ -431,14 +431,14 @@ QwenAttention::QwenAttention(ModelContext *ctx, int hidden_size, int num_attenti
v_cache(ggml_new_tensor_3d(ctx->ctx_kv.get(), GGML_TYPE_F16, max_length, hidden_size / num_attention_heads,
num_kv_heads)) {}

auto QwenAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * {
auto QwenAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();

const int hidden_size = hidden_states->ne[0];
const int qlen = hidden_states->ne[1];
const int head_size = hidden_size / num_attention_heads;
const int rope_dim = head_size;
const int n_past = static_cast<int *>(KQ_pos->data)[0];
// const int n_past = static_cast<int *>(KQ_pos->data)[0];

ggml_tensor *q = q_proj.forward(ctx, hidden_states); // [qlen, hidden]
// [qlen, heads, head_size]
@@ -580,12 +580,12 @@ auto Qwen2MoeSparseMoeBlock::forward(ModelContext *ctx, ggml_tensor *hidden_stat
return final_hidden_states;
}

auto QwenBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * {
auto QwenBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();

ggml_tensor *residual = hidden_states;
hidden_states = input_layernorm.forward(ctx, hidden_states, 1e-6f);
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_past, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]
hidden_states = tensor_assign_buffers(ggml_add_inplace(gctx, hidden_states, residual));

residual = hidden_states;
@@ -595,12 +595,12 @@ auto QwenBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tens
return hidden_states;
}

auto QwenMoeBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx , int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
auto QwenMoeBlock::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx , int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();

ggml_tensor *residual = hidden_states;
hidden_states = input_layernorm.forward(ctx, hidden_states, 1e-6f);
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]
hidden_states = attn.forward(ctx, hidden_states, KQ_pos, n_past, n_ctx); // FAILE HERE a->ne[2] == b->ne[0]

hidden_states = tensor_assign_buffers(ggml_add_inplace(gctx, hidden_states, residual));

@@ -620,12 +620,12 @@ QwenModel::QwenModel(ModelContext *ctx, const QwenConfig &config)
}
}

auto QwenModel::forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * {
auto QwenModel::forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();
ggml_tensor *KQ_pos = pos_ids_gen_(gctx, input_ids->ne[0], n_past, n_ctx);
if(KQ_pos){
tensor_to_device(KQ_pos);
}
ggml_tensor *hidden_states = embed_tokens.forward(ctx, input_ids);
for (const auto &layer : layers) {
ggml_set_scratch(gctx, ctx->scratch);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos, n_ctx);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos,n_past, n_ctx);
}
if(KQ_pos){
tensor_to_cpu(KQ_pos);
}
ggml_scratch empty_scratch = {0, 0, nullptr};
ggml_set_scratch(gctx, empty_scratch);
@@ -642,13 +649,21 @@ QwenMoeModel::QwenMoeModel(ModelContext *ctx, const QwenMoeConfig &config)
}
}

auto QwenMoeModel::forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
auto QwenMoeModel::forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor * {
ggml_context *gctx = ctx->ctx_b.get();
ggml_tensor *KQ_pos = pos_ids_gen_(gctx, input_ids->ne[0], n_past, n_ctx);
if(KQ_pos){
tensor_to_device(KQ_pos);
}
ggml_tensor *hidden_states = embed_tokens.forward(ctx, input_ids);
for (const auto &layer : layers) {
ggml_set_scratch(gctx, ctx->scratch);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos, n_ctx, num_experts, num_experts_per_tok);
hidden_states = layer.forward(ctx, hidden_states, KQ_pos, n_past, n_ctx, num_experts, num_experts_per_tok);
}
if(KQ_pos){
tensor_to_cpu(KQ_pos);
}

ggml_scratch empty_scratch = {0, 0, nullptr};
ggml_set_scratch(gctx, empty_scratch);
hidden_states = norm.forward(ctx, hidden_states, 1e-6f);
@@ -819,18 +834,13 @@ QwenMoeForCausalLM::~QwenMoeForCausalLM() {
}
}

auto QwenForCausalLM::generate_next_token(
const std::vector<int32_t> &input_ids,
const GenerationConfig &gen_config,
int n_past,
int n_ctx
) -> int32_t {
auto QwenForCausalLM::forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,int n_threads, bool is_decoding) -> ggml_tensor*{
ctx_.ctx_b = make_unique_ggml_context(ctx_.compute_buffer.size(), ctx_.compute_buffer.data(), false);
// ctx_.gf = ggml_new_graph(ctx_.ctx_b.get()); // dafault graph size didn't fit larger model like 32b, moe
size_t GRAPH_SIZE = 4096 * 2;
ctx_.gf = ggml_new_graph_custom(ctx_.ctx_b.get(), GRAPH_SIZE, false);

int n_threads = gen_config.num_threads; // user defined
// int n_threads = gen_config.num_threads; // user defined
if (n_threads <= 0) {
n_threads = get_default_num_threads(); // default thread num
}
@@ -842,28 +852,28 @@ auto QwenForCausalLM::generate_next_token(
ggml_tensor *curr_input_ids = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
memcpy(curr_input_ids->data, input_ids.data() + n_past, ggml_nbytes(curr_input_ids));

ggml_tensor *KQ_pos = ggml_new_tensor_1d(ctx_.ctx_b.get(), GGML_TYPE_I32, curr_input_ids_size);
int * data = static_cast<int *>(KQ_pos->data);
for (int i = 0; i < curr_input_ids_size; ++i) {
data[i] = n_past + i;
}
if (KQ_pos) {
tensor_to_device(KQ_pos);
}
ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, n_past, n_ctx, is_decoding);
lm_logits->backend = GGML_BACKEND_CPU;
// lm_logits->backend = GGML_BACKEND_TYPE_CPU; //newer ggml

ggml_tensor *lm_logits = forward(&ctx_, curr_input_ids, KQ_pos, n_ctx);
lm_logits->backend = GGML_BACKEND_CPU; // newer ggml version
if (KQ_pos) {
tensor_to_cpu(KQ_pos);
}

ggml_build_forward_expand(ctx_.gf, lm_logits);
#ifdef GGML_USE_METAL
ggml_metal_graph_compute(ctx_.ctx_metal.get(), ctx_.gf);
#else
ggml_graph_compute_helper(ctx_.work_buffer, ctx_.gf, n_threads);
#endif
// std::cout << lm_logits -> ne[1] << std::endl;
return lm_logits;
}

auto QwenForCausalLM::generate_next_token(
const std::vector<int32_t> &input_ids,
const GenerationConfig &gen_config,
int n_past,
int n_ctx
) -> int32_t {
ggml_tensor *lm_logits = forward_graph_compute(input_ids, n_past, n_ctx, gen_config.num_threads, true);
int vocab_size = lm_logits->ne[0];
float *next_token_logits = (float *)lm_logits->data;

@@ -1068,12 +1078,13 @@ auto QwenMoeForCausalLM::load(ModelLoader &loader) -> void {
auto QwenForCausalLM::forward(
ModelContext *ctx,
ggml_tensor *input_ids,
ggml_tensor *KQ_pos,
int n_ctx
int n_past,
int n_ctx,
bool is_decoding
) const -> ggml_tensor * {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, KQ_pos, n_ctx);
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
@@ -1085,12 +1096,13 @@ auto QwenMoeForCausalLM::forward(
auto QwenMoeForCausalLM::forward(
ModelContext *ctx,
ggml_tensor *input_ids,
ggml_tensor *KQ_pos,
int n_ctx
int n_past,
int n_ctx,
bool is_decoding
) const -> ggml_tensor * {
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, KQ_pos, n_ctx, config.num_experts, config.num_experts_per_tok);
ggml_tensor *transformer_outputs = transformer.forward(ctx, input_ids, n_past, n_ctx, config.num_experts, config.num_experts_per_tok);
// NOTE: only compute next_token_logits for the last token
if (input_ids->ne[0] > 1) {
if (is_decoding && input_ids->ne[0] > 1) {
transformer_outputs = tensor_assign_buffers(
ggml_view_1d(ctx->ctx_b.get(), transformer_outputs, config.hidden_size,
(input_ids->ne[0] - 1) * config.hidden_size * ggml_element_size(transformer_outputs)));
29 changes: 22 additions & 7 deletions qwen.h
@@ -1,5 +1,6 @@
#pragma once

#include <cmath>
#include "tiktoken.h"

#include <ggml.h>
@@ -356,7 +357,7 @@ class QwenAttention {
QwenAttention() : num_attention_heads(0), num_kv_heads(0) {}
QwenAttention(ModelContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length);

auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx) const -> ggml_tensor *;

int num_attention_heads;
int num_kv_heads;
@@ -425,14 +426,24 @@ class QwenBlock {
post_attention_layernorm(ctx, hidden_size, false),
mlp(ctx, hidden_size, intermediate_size) {}

auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos,int n_past, int n_ctx) const -> ggml_tensor *;

RMSNorm input_layernorm;
QwenAttention attn;
RMSNorm post_attention_layernorm;
QwenMLP mlp;
};

struct BasicPositionIdsGenerator {
ggml_tensor *operator()(ggml_context *ctx, int qlen, int n_past, int n_ctx) const {
ggml_tensor *position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, qlen);
for (int i = 0; i < qlen; i++) {
((int *)position_ids->data)[i] = n_past + i;
}
return position_ids;
}
};

class QwenMoeBlock {
public:
QwenMoeBlock() = default;
@@ -442,7 +453,7 @@ class QwenMoeBlock {
post_attention_layernorm(ctx, hidden_size, false),
mlp(ctx, hidden_size, intermediate_size, moe_intermediate_size, shared_expert_intermediate_size, num_experts,num_experts_per_tok) {}

auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;

RMSNorm input_layernorm;
QwenAttention attn;
@@ -455,11 +466,12 @@ class QwenModel {
QwenModel() = default;
QwenModel(ModelContext *ctx, const QwenConfig &config);

auto forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx) const -> ggml_tensor *;

Embedding embed_tokens;
std::vector<QwenBlock> layers;
RMSNorm norm;
BasicPositionIdsGenerator pos_ids_gen_;
};

class QwenMoeModel {
Expand All @@ -468,11 +480,12 @@ class QwenMoeModel {
QwenMoeModel(ModelContext *ctx, const QwenMoeConfig &config);

// Attention: These parameters should not be set to fixed values. I did this for quick implementation.
auto forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, int num_experts, int num_experts_per_tok) const -> ggml_tensor *;

Embedding embed_tokens;
std::vector<QwenMoeBlock> layers;
RMSNorm norm;
BasicPositionIdsGenerator pos_ids_gen_;
};

class QwenForCausalLM {
Expand Down Expand Up @@ -505,7 +518,9 @@ class QwenForCausalLM {

virtual void load(ModelLoader &loader);

virtual ggml_tensor * forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const;
virtual ggml_tensor * forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const;

auto forward_graph_compute(const std::vector<int> &input_ids, int n_past, int n_ctx,int n_threads, bool is_decoding)-> ggml_tensor*;

static constexpr size_t MEM_SIZE = 1280 * MB; // 2k context
static constexpr size_t SCRATCH_SIZE = 1280 * MB; // 2k context
@@ -526,7 +541,7 @@ class QwenMoeForCausalLM {
// Override methods here if needed

auto load(ModelLoader &loader) -> void override;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, ggml_tensor *KQ_pos, int n_ctx) const -> ggml_tensor * override;
auto forward(ModelContext *ctx, ggml_tensor *input_ids, int n_past, int n_ctx, bool is_decoding) const -> ggml_tensor * override;

static constexpr size_t MEM_SIZE = 812ull * MB;
static constexpr size_t SCRATCH_SIZE = 1844ull * MB;
11 changes: 11 additions & 0 deletions tests/data/get-wikitext-2.sh
@@ -0,0 +1,11 @@
#!/bin/bash

wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip

echo "Usage:"
echo ""
echo " ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]"
echo ""

exit 0
