Merge pull request #341 from utilityai/update-llama-cpp-2024-06-13

MarcusDunn · web-flow · commit 4c3de32ac3c4 · 2024-06-14T15:00:33.000-07:00
Updated llama-cpp (bot)
diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs
@@ -139,10 +139,10 @@ fn main() -> Result<()> {
         for token in token_line {
             // Attempt to convert token to string and print it; if it fails, print the token instead
             match model.token_to_str(*token, Special::Tokenize) {
-                Ok(token_str) => eprintln!(" {} --> {}", token, token_str),
+                Ok(token_str) => eprintln!("{token} --> {token_str}"),
                 Err(e) => {
-                    eprintln!("Failed to convert token to string, error: {}", e);
-                    eprintln!("Token value: {}", token);
+                    eprintln!("Failed to convert token to string, error: {e}");
+                    eprintln!("Token value: {token}");
                 }
             }
         }
diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs
@@ -76,7 +76,7 @@ impl<'model> LlamaContext<'model> {
 
         match NonZeroI32::new(result) {
             None => {
-                self.initialized_logits = batch.initialized_logits.clone();
+                self.initialized_logits.clone_from(&batch.initialized_logits);
                 Ok(())
             }
             Some(error) => Err(DecodeError::from(error)),
diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
@@ -203,7 +203,7 @@ pub enum StringToTokenError {
     #[error("{0}")]
     NulError(#[from] NulError),
     #[error("{0}")]
-    /// Failed to convert a provided integer to a c_int.
+    /// Failed to convert a provided integer to a [`c_int`].
     CIntConversionError(#[from] std::num::TryFromIntError),
 }
 
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
@@ -9,7 +9,7 @@ use crate::context::LlamaContext;
 use crate::llama_backend::LlamaBackend;
 use crate::model::params::LlamaModelParams;
 use crate::token::LlamaToken;
-use crate::token_type::LlamaTokenType;
+use crate::token_type::LlamaTokenAttr;
 use crate::{
     ApplyChatTemplateError, ChatTemplateError, LlamaContextLoadError, LlamaModelLoadError,
     NewLlamaChatMessageError, StringToTokenError, TokenToStringError,
@@ -238,9 +238,9 @@ impl LlamaModel {
     ///
     /// If the token type is not known to this library.
     #[must_use]
-    pub fn token_type(&self, LlamaToken(id): LlamaToken) -> LlamaTokenType {
-        let token_type = unsafe { llama_cpp_sys_2::llama_token_get_type(self.model.as_ptr(), id) };
-        LlamaTokenType::try_from(token_type).expect("token type is valid")
+    pub fn token_attr(&self, LlamaToken(id): LlamaToken) -> LlamaTokenAttr {
+        let token_type = unsafe { llama_cpp_sys_2::llama_token_get_attr(self.model.as_ptr(), id) };
+        LlamaTokenAttr::try_from(token_type).expect("token type is valid")
     }
 
     /// Convert a token to a string with a specified buffer size.
@@ -292,18 +292,23 @@ impl LlamaModel {
             return Ok(String::from("\n").into_bytes());
         }
 
-        // unsure what to do with this in the face of the 'special' arg
-        match self.token_type(token) {
-            LlamaTokenType::Normal | LlamaTokenType::UserDefined => {}
-            LlamaTokenType::Control => {
+        // unsure what to do with this in the face of the 'special' arg + attr changes
+        match self.token_attr(token) {
+            LlamaTokenAttr::Normal
+            | LlamaTokenAttr::UserDefined
+            | LlamaTokenAttr::Normalized
+            | LlamaTokenAttr::LStrip
+            | LlamaTokenAttr::RStrip
+            | LlamaTokenAttr::SingleWord => {}
+            LlamaTokenAttr::Control => {
                 if token == self.token_bos() || token == self.token_eos() {
                     return Ok(Vec::new());
                 }
             }
-            LlamaTokenType::Unknown
-            | LlamaTokenType::Undefined
-            | LlamaTokenType::Byte
-            | LlamaTokenType::Unused => {
+            LlamaTokenAttr::Unknown
+            | LlamaTokenAttr::Undefined
+            | LlamaTokenAttr::Byte
+            | LlamaTokenAttr::Unused => {
                 return Ok(Vec::new());
             }
         }
@@ -471,7 +476,7 @@ impl LlamaModel {
         // Set the tmpl pointer
         let tmpl = tmpl.map(CString::new);
         let tmpl_ptr = match &tmpl {
-            Some(str) => str.as_ref().map_err(|e| e.clone())?.as_ptr(),
+            Some(str) => str.as_ref().map_err(Clone::clone)?.as_ptr(),
             None => std::ptr::null(),
         };
 
diff --git a/llama-cpp-2/src/token_type.rs b/llama-cpp-2/src/token_type.rs
@@ -3,22 +3,19 @@
 /// A rust flavored equivalent of `llama_token_type`.
 #[repr(u32)]
 #[derive(Eq, PartialEq, Debug, Clone, Copy)]
-#[allow(clippy::module_name_repetitions)]
-pub enum LlamaTokenType {
-    /// An undefined token type.
-    Undefined = llama_cpp_sys_2::LLAMA_TOKEN_TYPE_UNDEFINED as _,
-    /// A normal token type.
-    Normal = llama_cpp_sys_2::LLAMA_TOKEN_TYPE_NORMAL as _,
-    /// An unknown token type.
-    Unknown = llama_cpp_sys_2::LLAMA_TOKEN_TYPE_UNKNOWN as _,
-    /// A control token type.
-    Control = llama_cpp_sys_2::LLAMA_TOKEN_TYPE_CONTROL as _,
-    /// A user defined token type.
-    UserDefined = llama_cpp_sys_2::LLAMA_TOKEN_TYPE_USER_DEFINED as _,
-    /// An unused token type.
-    Unused = llama_cpp_sys_2::LLAMA_TOKEN_TYPE_UNUSED as _,
-    /// A byte token type.
-    Byte = llama_cpp_sys_2::LLAMA_TOKEN_TYPE_BYTE as _,
+#[allow(clippy::module_name_repetitions, missing_docs)]
+pub enum LlamaTokenAttr {
+    Undefined = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_UNDEFINED as _,
+    Unknown = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_UNKNOWN as _,
+    Unused = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_UNUSED as _,
+    Normal = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_NORMAL as _,
+    Control = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_CONTROL as _,
+    UserDefined = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_USER_DEFINED as _,
+    Byte = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_BYTE as _,
+    Normalized = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_NORMALIZED as _,
+    LStrip = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_LSTRIP as _,
+    RStrip = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_RSTRIP as _,
+    SingleWord = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_SINGLE_WORD as _,
 }
 
 /// A safe wrapper for converting potentially deceptive `llama_token_type` values into
@@ -31,27 +28,31 @@ pub enum LlamaTokenType {
 /// # use std::ffi::c_int;
 /// # use std::num::TryFromIntError;
 /// # use std::result::Result;
-/// # use llama_cpp_2::token_type::{LlamaTokenTypeFromIntError, LlamaTokenType};
+/// # use llama_cpp_2::token_type::{LlamaTokenTypeFromIntError, LlamaTokenAttr};
 /// # fn main() -> Result<(), LlamaTokenTypeFromIntError> {
-/// let llama_token_type = LlamaTokenType::try_from(0 as llama_cpp_sys_2::llama_token_type)?;
-/// assert_eq!(llama_token_type, LlamaTokenType::Undefined);
+/// let llama_token_type = LlamaTokenAttr::try_from(0 as llama_cpp_sys_2::llama_token_type)?;
+/// assert_eq!(llama_token_type, LlamaTokenAttr::Undefined);
 ///
-/// let bad_llama_token_type = LlamaTokenType::try_from(100 as llama_cpp_sys_2::llama_token_type);
+/// let bad_llama_token_type = LlamaTokenAttr::try_from(100 as llama_cpp_sys_2::llama_token_type);
 /// assert_eq!(Err(LlamaTokenTypeFromIntError::UnknownValue(100)), bad_llama_token_type);
 /// # Ok(())
 /// # }
-impl TryFrom<llama_cpp_sys_2::llama_token_type> for LlamaTokenType {
+impl TryFrom<llama_cpp_sys_2::llama_token_type> for LlamaTokenAttr {
     type Error = LlamaTokenTypeFromIntError;
 
     fn try_from(value: llama_cpp_sys_2::llama_vocab_type) -> Result<Self, Self::Error> {
         match value {
-            llama_cpp_sys_2::LLAMA_TOKEN_TYPE_UNDEFINED => Ok(LlamaTokenType::Undefined),
-            llama_cpp_sys_2::LLAMA_TOKEN_TYPE_NORMAL => Ok(LlamaTokenType::Normal),
-            llama_cpp_sys_2::LLAMA_TOKEN_TYPE_UNKNOWN => Ok(LlamaTokenType::Unknown),
-            llama_cpp_sys_2::LLAMA_TOKEN_TYPE_CONTROL => Ok(LlamaTokenType::Control),
-            llama_cpp_sys_2::LLAMA_TOKEN_TYPE_USER_DEFINED => Ok(LlamaTokenType::UserDefined),
-            llama_cpp_sys_2::LLAMA_TOKEN_TYPE_UNUSED => Ok(LlamaTokenType::Unused),
-            llama_cpp_sys_2::LLAMA_TOKEN_TYPE_BYTE => Ok(LlamaTokenType::Byte),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_UNDEFINED => Ok(Self::Undefined),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_UNKNOWN => Ok(Self::Unknown),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_UNUSED => Ok(Self::Unused),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_NORMAL => Ok(Self::Normal),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_CONTROL => Ok(Self::Control),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_USER_DEFINED => Ok(Self::UserDefined),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_BYTE => Ok(Self::Byte),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_NORMALIZED => Ok(Self::Normalized),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_LSTRIP => Ok(Self::LStrip),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_RSTRIP => Ok(Self::RStrip),
+            llama_cpp_sys_2::LLAMA_TOKEN_ATTR_SINGLE_WORD => Ok(Self::SingleWord),
             _ => Err(LlamaTokenTypeFromIntError::UnknownValue(value as _)),
         }
     }
diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
@@ -39,6 +39,8 @@ include = [
     "/llama.cpp/ggml-common.h",
     "/llama.cpp/ggml-cuda",
     "/llama.cpp/sgemm.h",
+    "/llama.cpp/ggml-cuda/*",
+    "/llama.cpp/ggml-cuda/template_instances/*",
 ]
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
@@ -463,10 +463,17 @@ fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'stati
         .map(|f| f.unwrap())
         .filter(|entry| entry.file_name().to_string_lossy().ends_with(".cu"))
         .map(|entry| entry.path());
+    
+    let template_instances = read_dir(cuda_path.join("template-instances"))
+        .unwrap()
+        .map(|f| f.unwrap())
+        .filter(|entry| entry.file_name().to_string_lossy().ends_with(".cu"))
+        .map(|entry| entry.path());
 
     nvcc.include(cuda_path.as_path())
         .include(LLAMA_PATH.as_path())
         .files(cuda_sources)
+        .files(template_instances)
         .file(LLAMA_PATH.join("ggml-cuda.cu"))
         .compile(lib_name);
 
@@ -555,7 +562,7 @@ fn compile_metal(cx: &mut Build, cxx: &mut Build) {
     // Create a static library for our metal embed code.
     let ggml_metal_embed_library_path = PathBuf::from(&out_dir).join("libggml-metal-embed.a");
     Command::new("ar")
-        .args(&[
+        .args([
             "crus",
             ggml_metal_embed_library_path.to_str().unwrap(),
             ggml_metal_embed_object_path.to_str().unwrap(),
diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp
@@ -1 +1 @@
-Subproject commit 917dc8cfa67a72fb7c8bf7392270da3bf4833af4
+Subproject commit 172c8256840ffd882ab9992ecedbb587d9b21f15
diff --git a/simple/src/main.rs b/simple/src/main.rs
@@ -117,6 +117,7 @@ impl Model {
     }
 }
 
+#[allow(clippy::too_many_lines)]
 fn main() -> Result<()> {
     let Args {
         n_len,
@@ -263,7 +264,7 @@ either reduce n_len or increase n_ctx"
             // use `Decoder.decode_to_string()` to avoid the intermediate buffer
             let mut output_string = String::with_capacity(32);
             let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);
-            print!("{}", output_string);
+            print!("{output_string}");
             std::io::stdout().flush()?;
 
             batch.clear();

Original file line number	Diff line number	Diff line change
`@@ -139,10 +139,10 @@ fn main() -> Result<()> {`
`139`	`139`	`for token in token_line {`
`140`	`140`	`// Attempt to convert token to string and print it; if it fails, print the token instead`
`141`	`141`	`match model.token_to_str(*token, Special::Tokenize) {`
`142`		`- Ok(token_str) => eprintln!(" {} --> {}", token, token_str),`
	`142`	`+ Ok(token_str) => eprintln!("{token} --> {token_str}"),`
`143`	`143`	`Err(e) => {`
`144`		`- eprintln!("Failed to convert token to string, error: {}", e);`
`145`		`- eprintln!("Token value: {}", token);`
	`144`	`+ eprintln!("Failed to convert token to string, error: {e}");`
	`145`	`+ eprintln!("Token value: {token}");`
`146`	`146`	`}`
`147`	`147`	`}`
`148`	`148`	`}`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ impl<'model> LlamaContext<'model> {`
`76`	`76`
`77`	`77`	`match NonZeroI32::new(result) {`
`78`	`78`	`None => {`
`79`		`- self.initialized_logits = batch.initialized_logits.clone();`
	`79`	`+ self.initialized_logits.clone_from(&batch.initialized_logits);`
`80`	`80`	`Ok(())`
`81`	`81`	`}`
`82`	`82`	`Some(error) => Err(DecodeError::from(error)),`
Original file line number	Diff line number	Diff line change
`@@ -203,7 +203,7 @@ pub enum StringToTokenError {`
`203`	`203`	`#[error("{0}")]`
`204`	`204`	`NulError(#[from] NulError),`
`205`	`205`	`#[error("{0}")]`
`206`		`- /// Failed to convert a provided integer to a c_int.`
	`206`	`` + /// Failed to convert a provided integer to a [`c_int`]. ``
`207`	`207`	`CIntConversionError(#[from] std::num::TryFromIntError),`
`208`	`208`	`}`
`209`	`209`
Original file line number	Diff line number	Diff line change
`@@ -39,6 +39,8 @@ include = [`
`39`	`39`	`"/llama.cpp/ggml-common.h",`
`40`	`40`	`"/llama.cpp/ggml-cuda",`
`41`	`41`	`"/llama.cpp/sgemm.h",`
	`42`	`+ "/llama.cpp/ggml-cuda/*",`
	`43`	`+ "/llama.cpp/ggml-cuda/template_instances/*",`
`42`	`44`	`]`
`43`	`45`
`44`	`46`	`# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html`
Original file line number	Diff line number	Diff line change
`@@ -117,6 +117,7 @@ impl Model {`
`117`	`117`	`}`
`118`	`118`	`}`
`119`	`119`
	`120`	`+#[allow(clippy::too_many_lines)]`
`120`	`121`	`fn main() -> Result<()> {`
`121`	`122`	`let Args {`
`122`	`123`	`n_len,`
`@@ -263,7 +264,7 @@ either reduce n_len or increase n_ctx"`
`263`	`264`	`` // use `Decoder.decode_to_string()` to avoid the intermediate buffer ``
`264`	`265`	`let mut output_string = String::with_capacity(32);`
`265`	`266`	`let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);`
`266`		`- print!("{}", output_string);`
	`267`	`+ print!("{output_string}");`
`267`	`268`	`std::io::stdout().flush()?;`
`268`	`269`
`269`	`270`	`batch.clear();`