Commit

update custom rope_theta
yvonwin committed Apr 28, 2024
1 parent cb1b376 commit 076595b
Showing 1 changed file with 6 additions and 4 deletions.

qwen.cpp — 10 changes: 6 additions & 4 deletions
@@ -554,6 +554,7 @@ LlamaAttention::LlamaAttention(ModelContext *ctx, int hidden_size, int num_atten
 auto QwenAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_tensor *KQ_pos, int n_past, int n_ctx) const -> ggml_tensor * {
     ggml_context *gctx = ctx->ctx_b.get();
 
+    float rope_theta = 1000000.0;
     const int hidden_size = hidden_states->ne[0];
     const int qlen = hidden_states->ne[1];
     const int head_size = hidden_size / num_attention_heads;
@@ -577,15 +578,15 @@ auto QwenAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml_
         query_layer = tensor_assign_buffers(ggml_cont(gctx, query_layer));
     }
 #endif
-    query_layer = tensor_assign_buffers(ggml_rope_inplace(gctx, query_layer, KQ_pos, rope_dim, 2, n_ctx));
+    query_layer = tensor_assign_buffers(ggml_rope_custom_inplace(gctx, query_layer, KQ_pos, rope_dim, 2, n_ctx, 0, rope_theta, 1.f, 0.0f, 1.0f, 0.0f, 0.0f));
     query_layer = tensor_assign_buffers(ggml_permute(gctx, query_layer, 0, 2, 1, 3)); // [heads, qlen, head_size]
 
 #ifdef GGML_USE_CUBLAS
     if (!ggml_is_contiguous(key_layer)) {
         key_layer = tensor_assign_buffers(ggml_cont(gctx, key_layer));
     }
 #endif
-    key_layer = tensor_assign_buffers(ggml_rope_inplace(gctx, key_layer, KQ_pos, rope_dim, 2, n_ctx));
+    key_layer = tensor_assign_buffers(ggml_rope_custom_inplace(gctx, key_layer, KQ_pos, rope_dim, 2, n_ctx, 0, rope_theta, 1.f, 0.0f, 1.0f, 0.0f, 0.0f));
     key_layer = tensor_assign_buffers(ggml_permute(gctx, key_layer, 0, 2, 1, 3)); // [kv_heads, qlen, head_size]
     value_layer = tensor_assign_buffers(ggml_permute(gctx, value_layer, 1, 2, 0, 3)); // [kv_heads, head_size, qlen]
 
@@ -634,6 +635,7 @@ auto LlamaAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml
     // std::cout << "debug llamaattention" << std::endl;
     ggml_context *gctx = ctx->ctx_b.get();
 
+    float rope_theta = 500000.0;
     const int hidden_size = hidden_states->ne[0];
     const int qlen = hidden_states->ne[1];
     const int head_size = hidden_size / num_attention_heads;
@@ -657,15 +659,15 @@ auto LlamaAttention::forward(ModelContext *ctx, ggml_tensor *hidden_states, ggml
         query_layer = tensor_assign_buffers(ggml_cont(gctx, query_layer));
     }
 #endif
-    query_layer = tensor_assign_buffers(ggml_rope_inplace(gctx, query_layer, KQ_pos, rope_dim, 2, n_ctx));
+    query_layer = tensor_assign_buffers(ggml_rope_custom_inplace(gctx, query_layer, KQ_pos, rope_dim, 2, n_ctx, 0, rope_theta, 1.f, 0.0f, 1.0f, 0.0f, 0.0f));
     query_layer = tensor_assign_buffers(ggml_permute(gctx, query_layer, 0, 2, 1, 3)); // [heads, qlen, head_size]
 
 #ifdef GGML_USE_CUBLAS
     if (!ggml_is_contiguous(key_layer)) {
         key_layer = tensor_assign_buffers(ggml_cont(gctx, key_layer));
     }
 #endif
-    key_layer = tensor_assign_buffers(ggml_rope_inplace(gctx, key_layer, KQ_pos, rope_dim, 2, n_ctx));
+    key_layer = tensor_assign_buffers(ggml_rope_custom_inplace(gctx, key_layer, KQ_pos, rope_dim, 2, n_ctx, 0, rope_theta, 1.f, 0.0f, 1.0f, 0.0f, 0.0f));
     key_layer = tensor_assign_buffers(ggml_permute(gctx, key_layer, 0, 2, 1, 3)); // [kv_heads, qlen, head_size]
     value_layer = tensor_assign_buffers(ggml_permute(gctx, value_layer, 1, 2, 0, 3)); // [kv_heads, head_size, qlen]
 
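Note on the change (a reading of the diff, not part of the commit): ggml_rope_inplace applies RoPE with ggml's default frequency base of 10000, while ggml_rope_custom_inplace takes the base explicitly; here the commit passes rope_theta = 1000000.0 for QwenAttention and 500000.0 for LlamaAttention (the value used by Llama 3). In the ggml API of this vintage the trailing arguments after n_ctx are, as far as I can tell, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast and beta_slow, so rope_theta lands in the freq_base slot. In RoPE the base fixes the per-pair rotation rates theta^(-2i/d); a larger base makes the high-index pairs rotate more slowly, which long-context models rely on. The standalone C++ sketch below (not code from this repository; the head size of 128 is an assumption for illustration) prints those rates for the old default base and the two new ones.

    #include <cmath>
    #include <cstdio>

    // Print RoPE per-pair rotation rates theta^(-2i/d) for a given frequency base.
    // head_size = 128 is an illustrative assumption, not a value taken from qwen.cpp.
    static void print_rates(const char *label, double freq_base, int head_size) {
        std::printf("%s (freq_base = %.0f)\n", label, freq_base);
        for (int i = 0; i < head_size / 2; i += 16) {  // sample every 16th rotation pair
            double rate = std::pow(freq_base, -2.0 * i / head_size);
            std::printf("  pair %3d: rate = %.3e\n", i, rate);
        }
    }

    int main() {
        const int head_size = 128;
        print_rates("ggml_rope_inplace default", 10000.0,   head_size);  // old call
        print_rates("Qwen rope_theta",           1000000.0, head_size);  // new Qwen base
        print_rates("Llama rope_theta",          500000.0,  head_size);  // new Llama base
        return 0;
    }

With the larger bases, the highest-index pairs rotate far more slowly, so distant positions remain distinguishable over much longer contexts.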
