@@ -174,6 +174,7 @@ impl Default for InferenceSessionParameters {
     }
 }
 
+#[derive(Clone, Debug, PartialEq)]
 /// The parameters that drive text generation.
 pub struct InferenceParameters {
     pub n_threads: i32,
@@ -184,6 +185,7 @@ pub struct InferenceParameters {
     pub temp: f32,
     pub bias_tokens: TokenBias,
     pub play_back_previous_tokens: bool,
+    pub increased_determinism: bool,
 }
 
 impl Default for InferenceParameters {
@@ -197,6 +199,7 @@ impl Default for InferenceParameters {
             temp: 0.80,
             bias_tokens: TokenBias::default(),
             play_back_previous_tokens: false,
+            increased_determinism: true,
         }
     }
 }
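
For context on the new field, a minimal caller-side sketch; the struct and its `Default` impl are exactly as in the hunks above, while the thread count is illustrative:

```rust
// Start from the defaults and override selectively. `increased_determinism`
// defaults to `true`, trading an extra tensor copy per layer for
// run-to-run reproducibility (see the `evaluate` changes below).
let params = InferenceParameters {
    n_threads: 8,
    // Opt back into the faster path that feeds the permuted value-cache
    // view straight into the matmul.
    increased_determinism: false,
    ..Default::default()
};
```
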
@@ -1094,11 +1097,13 @@ impl Model {
     pub fn evaluate(
         &self,
         session: &mut InferenceSession,
-        n_threads: i32,
+        params: &InferenceParameters,
         input_tokens: &[TokenId],
     ) {
         let n = input_tokens.len();
         let n_past = session.n_past as i32;
+        let n_threads = params.n_threads;
+        let increased_determinism = params.increased_determinism;
 
         let Hyperparameters {
             n_vocab,
@@ -1127,6 +1132,27 @@ impl Model {
 
         let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd);
 
+        // Defined here to avoid repetition and creating a binding inside nested loops.
+        // See the call site below for more context.
+        let vtrans_fun = |il: usize| -> ggml::Tensor {
+            ctx0.op_permute(
+                &ctx0.op_reshape_3d(
+                    &ctx0.op_view_1d(
+                        &session.memory_v,
+                        (n_past + n as i32) * n_embd,
+                        il * n_ctx as usize * session.memory_v.element_size() * n_embd as usize,
+                    ),
+                    n_embd / n_head,
+                    n_head,
+                    n_past + n as i32,
+                ),
+                1,
+                2,
+                0,
+                3,
+            )
+        };
+
         for il in 0..n_layer as usize {
             let input_self_attention = input_layer.share();
             let mut current: ggml::Tensor;
@@ -1226,22 +1252,21 @@ impl Model {
             let k_q_soft_max = ctx0.op_soft_max(&k_q_masked);
 
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            let v_transposed = ctx0.op_permute(
-                &ctx0.op_reshape_3d(
-                    &ctx0.op_view_1d(
-                        &session.memory_v,
-                        (n_past + n as i32) * n_embd,
-                        il * n_ctx as usize * session.memory_v.element_size() * n_embd as usize,
-                    ),
-                    n_embd / n_head,
-                    n_head,
-                    n_past + n as i32,
-                ),
-                1,
-                2,
-                0,
-                3,
-            );
+            let v_transposed = {
+                if !increased_determinism {
+                    vtrans_fun(il)
+                } else {
+                    ctx0.op_cpy(
+                        &vtrans_fun(il),
+                        &ctx0.new_tensor_3d(
+                            ggml::TYPE_F32,
+                            n_past + n as i32,
+                            n_embd / n_head,
+                            n_head,
+                        ),
+                    )
+                }
+            };
 
             // KQV = transpose(V) * KQ_soft_max
             let k_q_v = ctx0.op_mul_mat(&v_transposed, &k_q_soft_max);
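
A note on intent (this is a reading of the hunk above, not upstream documentation): `vtrans_fun(il)` yields a permuted, non-contiguous view of the layer's slice of the value cache. Multiplying against that view directly is cheaper, but the strided access appears to let ggml's multi-threaded `op_mul_mat` accumulate in an order that can vary between runs. With `increased_determinism` enabled, the view is first materialized into a freshly allocated contiguous `TYPE_F32` tensor via `op_cpy`, so the matmul sees a fixed layout and produces reproducible logits, at the cost of one extra copy per layer per evaluation.
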
@@ -1393,7 +1418,7 @@ impl InferenceSession {
         }
 
         for batch in prompt_tokens.chunks(8) {
-            model.evaluate(self, params.n_threads, batch);
+            model.evaluate(self, params, batch);
             for &tk in batch {
                 // NOTE: No string ever tokenizes to the end of sentence. So we
                 // can just return the id here.
@@ -1427,7 +1452,7 @@ impl InferenceSession {
         self.tokens.push(next_token);
 
         // Then, evaluate the network again to compute the new last_logits
-        model.evaluate(self, params.n_threads, &[next_token]);
+        model.evaluate(self, params, &[next_token]);
 
         // Return the next token
         Ok(if next_token as TokenId == EOD_TOKEN_ID {
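
Downstream, callers now pass the whole parameter struct instead of a bare thread count, so future knobs like `increased_determinism` reach `evaluate` without further signature churn. A hedged usage sketch (`model`, `session`, and `prompt_tokens` are assumed to exist; their setup is outside this diff):

```rust
// Evaluate a batch of prompt tokens with the default parameters,
// which enable the deterministic value-cache copy introduced above.
let params = InferenceParameters::default();
model.evaluate(&mut session, &params, &prompt_tokens);
```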