Skip to content

Commit

Permalink
fix: ExLlama Backend Context Size & Rope Scaling (#1311)
Browse files Browse the repository at this point in the history
* fix: context_size not propagated to exllama backend

* fix: exllama rope scaling
  • Loading branch information
ok2sh authored Nov 21, 2023
1 parent 480b14c commit 20d637e
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions backend/python/exllama/exllama.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,19 @@ def LoadModel(self, request, context):

config = ExLlamaConfig(model_config_path) # create config from config.json
config.model_path = model_path # supply path to model weights file
if (request.ContextSize):
config.max_seq_len = request.ContextSize # override max sequence length
config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2.
# https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163

# Set RoPE scaling.
if (request.RopeFreqScale):
# Alpha value for RoPE scaling.
# A higher value extends the usable context but increases perplexity.
# alpha_value and compress_pos_emb are mutually exclusive; set only one.
# See https://github.com/turboderp/exllama/issues/115
config.alpha_value = request.RopeFreqScale
config.calculate_rotary_embedding_base()

model = ExLlama(config) # create ExLlama instance and load the weights
tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file
Expand Down

0 comments on commit 20d637e

Please sign in to comment.