diff --git a/llamafile/BUILD.mk b/llamafile/BUILD.mk index a41805e167..4adcb3e0b8 100644 --- a/llamafile/BUILD.mk +++ b/llamafile/BUILD.mk @@ -53,6 +53,7 @@ o/$(MODE)/llamafile: \ o/$(MODE)/llamafile/tokenize \ o/$(MODE)/llamafile/addnl \ o/$(MODE)/llamafile/high \ + o/$(MODE)/llamafile/highlight_test.runs \ o/$(MODE)/llamafile/highlight_c_test.runs \ o/$(MODE)/llamafile/highlight_python_test.runs \ o/$(MODE)/llamafile/pool_test.runs \ @@ -165,6 +166,10 @@ o/$(MODE)/llamafile/pool_test: \ o/$(MODE)/llamafile/crash.o \ o/$(MODE)/llamafile/pool.o \ +o/$(MODE)/llamafile/highlight_test: \ + o/$(MODE)/llamafile/highlight_test.o \ + o/$(MODE)/llama.cpp/llama.cpp.a \ + o/$(MODE)/llamafile/highlight_c_test: \ o/$(MODE)/llamafile/highlight_c_test.o \ o/$(MODE)/llamafile/highlight_c.o \ diff --git a/llamafile/highlight_c.cpp b/llamafile/highlight_c.cpp index 862531e463..e92aea4cfb 100644 --- a/llamafile/highlight_c.cpp +++ b/llamafile/highlight_c.cpp @@ -22,13 +22,15 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define DQUOTE 3 -#define SLASH 4 -#define SLASH_SLASH 5 -#define SLASH_STAR 6 -#define SLASH_STAR_STAR 7 -#define TICK 8 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define DQUOTE 4 +#define DQUOTE_BACKSLASH 5 +#define SLASH 6 +#define SLASH_SLASH 7 +#define SLASH_STAR 8 +#define SLASH_STAR_STAR 9 +#define TICK 10 +#define TICK_BACKSLASH 11 HighlightC::HighlightC(is_keyword_f *is_keyword, is_keyword_f *is_type) : is_keyword_(is_keyword), is_type_(is_type) { @@ -41,17 +43,6 @@ void HighlightC::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -147,25 +138,46 @@ void HighlightC::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } 
break; + case QUOTE_BACKSLASH: + *r += c; + t_ = QUOTE; + break; + case DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + case TICK: *r += c; if (c == '`') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = TICK_BACKSLASH; } break; + case TICK_BACKSLASH: + *r += c; + t_ = TICK; + break; + default: __builtin_unreachable(); } @@ -173,7 +185,6 @@ void HighlightC::feed(std::string *r, std::string_view input) { } void HighlightC::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case WORD: if (is_keyword_(word_.data(), word_.size())) { @@ -193,8 +204,11 @@ void HighlightC::flush(std::string *r) { *r += '/'; break; case TICK: + case TICK_BACKSLASH: case QUOTE: + case QUOTE_BACKSLASH: case DQUOTE: + case DQUOTE_BACKSLASH: case SLASH_SLASH: case SLASH_STAR: case SLASH_STAR_STAR: diff --git a/llamafile/highlight_cobol.cpp b/llamafile/highlight_cobol.cpp index e0dd6f3e2c..fa26a3aeab 100644 --- a/llamafile/highlight_cobol.cpp +++ b/llamafile/highlight_cobol.cpp @@ -22,9 +22,10 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define DQUOTE 3 -#define COMMENT 4 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define DQUOTE 4 +#define DQUOTE_BACKSLASH 5 +#define COMMENT 6 HighlightCobol::HighlightCobol() { } @@ -58,16 +59,6 @@ void HighlightCobol::feed(std::string *r, std::string_view input) { } } - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -136,17 +127,31 @@ void HighlightCobol::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } break; + case QUOTE_BACKSLASH: + *r += c; + t_ = QUOTE; + break; + case DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = 
DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + default: __builtin_unreachable(); } @@ -166,7 +171,9 @@ void HighlightCobol::flush(std::string *r) { word_.clear(); break; case QUOTE: + case QUOTE_BACKSLASH: case DQUOTE: + case DQUOTE_BACKSLASH: case COMMENT: *r += HI_RESET; break; diff --git a/llamafile/highlight_css.cpp b/llamafile/highlight_css.cpp index 6bd2e72ec3..bf40ffc8fa 100644 --- a/llamafile/highlight_css.cpp +++ b/llamafile/highlight_css.cpp @@ -24,11 +24,12 @@ #define PROPERTY 2 #define VALUE 3 #define QUOTE 4 -#define DQUOTE 5 -#define SLASH 6 -#define SLASH_STAR 7 -#define SLASH_STAR_STAR 8 -#define BACKSLASH 0x10000 +#define QUOTE_BACKSLASH 5 +#define DQUOTE 6 +#define DQUOTE_BACKSLASH 7 +#define SLASH 8 +#define SLASH_STAR 9 +#define SLASH_STAR_STAR 10 HighlightCss::HighlightCss() { } @@ -41,16 +42,6 @@ void HighlightCss::feed(std::string *r, std::string_view input) { for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - TryAgain: switch (t_ & 255) { @@ -170,17 +161,35 @@ void HighlightCss::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; goto Pop; + } else if (c == '\\') { + t_ &= -256; + t_ |= QUOTE_BACKSLASH; } break; + case QUOTE_BACKSLASH: + *r += c; + t_ &= -256; + t_ |= QUOTE; + break; + case DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; goto Pop; + } else if (c == '\\') { + t_ &= -256; + t_ |= DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ &= -256; + t_ |= DQUOTE; + break; + Pop: t_ >>= 8; if (t_ == SELECTOR) @@ -200,6 +209,12 @@ void HighlightCss::flush(std::string *r) { case SLASH: *r += '/'; break; + case SELECTOR: + case PROPERTY: + case DQUOTE: + case DQUOTE_BACKSLASH: + *r += HI_RESET; + break; default: break; } diff --git a/llamafile/highlight_fortran.cpp b/llamafile/highlight_fortran.cpp 
index f235b16da9..2442901337 100644 --- a/llamafile/highlight_fortran.cpp +++ b/llamafile/highlight_fortran.cpp @@ -22,9 +22,10 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define DQUOTE 3 -#define COMMENT 4 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define DQUOTE 4 +#define DQUOTE_BACKSLASH 5 +#define COMMENT 6 HighlightFortran::HighlightFortran() { } @@ -58,16 +59,6 @@ void HighlightFortran::feed(std::string *r, std::string_view input) { } } - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -140,17 +131,31 @@ void HighlightFortran::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } break; + case QUOTE_BACKSLASH: + *r += c; + t_ = QUOTE; + break; + case DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + default: __builtin_unreachable(); } @@ -170,7 +175,9 @@ void HighlightFortran::flush(std::string *r) { word_.clear(); break; case QUOTE: + case QUOTE_BACKSLASH: case DQUOTE: + case DQUOTE_BACKSLASH: case COMMENT: *r += HI_RESET; break; diff --git a/llamafile/highlight_haskell.cpp b/llamafile/highlight_haskell.cpp index bdbfb5a1ba..3a05f3fece 100644 --- a/llamafile/highlight_haskell.cpp +++ b/llamafile/highlight_haskell.cpp @@ -22,18 +22,20 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define DQUOTE 3 -#define TICK 4 -#define CURL 5 -#define CURL_HYPHEN 6 -#define CURL_HYPHEN_HYPHEN 7 -#define HYPHEN 8 -#define HYPHEN_HYPHEN 9 -#define HYPHEN_LT 10 -#define EQUAL 11 -#define COLON 12 -#define LT 13 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define DQUOTE 4 +#define DQUOTE_BACKSLASH 5 +#define TICK 6 +#define TICK_BACKSLASH 7 +#define CURL 8 +#define CURL_HYPHEN 9 +#define CURL_HYPHEN_HYPHEN 10 
+#define HYPHEN 11 +#define HYPHEN_HYPHEN 12 +#define HYPHEN_LT 13 +#define EQUAL 14 +#define COLON 15 +#define LT 16 HighlightHaskell::HighlightHaskell() { } @@ -45,17 +47,6 @@ void HighlightHaskell::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -239,25 +230,46 @@ void HighlightHaskell::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } break; + case QUOTE_BACKSLASH: + *r += c; + t_ = QUOTE; + break; + case DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + case TICK: *r += c; if (c == '`') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = TICK_BACKSLASH; } break; + case TICK_BACKSLASH: + *r += c; + t_ = TICK; + break; + default: __builtin_unreachable(); } @@ -265,7 +277,6 @@ void HighlightHaskell::feed(std::string *r, std::string_view input) { } void HighlightHaskell::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case WORD: if (is_keyword_haskell(symbol_.data(), symbol_.size())) { @@ -298,8 +309,11 @@ void HighlightHaskell::flush(std::string *r) { *r += HI_RESET; break; case TICK: + case TICK_BACKSLASH: case QUOTE: + case QUOTE_BACKSLASH: case DQUOTE: + case DQUOTE_BACKSLASH: case HYPHEN_HYPHEN: case CURL_HYPHEN: case CURL_HYPHEN_HYPHEN: diff --git a/llamafile/highlight_html.cpp b/llamafile/highlight_html.cpp index ad931398dd..3b24f8b7e5 100644 --- a/llamafile/highlight_html.cpp +++ b/llamafile/highlight_html.cpp @@ -315,6 +315,7 @@ void HighlightHtml::flush(std::string *r) { case COMMENT_HYPHEN_HYPHEN: case COMMENT_HYPHEN: case COMMENT: + case ENTITY: case DQUOTE: case QUOTE: 
case TAG2: diff --git a/llamafile/highlight_lisp.cpp b/llamafile/highlight_lisp.cpp index df0ae64a6c..0b4f001f52 100644 --- a/llamafile/highlight_lisp.cpp +++ b/llamafile/highlight_lisp.cpp @@ -21,9 +21,9 @@ #define NORMAL 0 #define SYMBOL 1 -#define DQUOTE 3 +#define DQUOTE 2 +#define DQUOTE_BACKSLASH 3 #define COMMENT 4 -#define BACKSLASH 64 HighlightLisp::HighlightLisp() { } @@ -35,17 +35,6 @@ void HighlightLisp::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -116,9 +105,16 @@ void HighlightLisp::feed(std::string *r, std::string_view input) { if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + case COMMENT: if (c == '\n') { *r += HI_RESET; @@ -136,7 +132,6 @@ void HighlightLisp::feed(std::string *r, std::string_view input) { } void HighlightLisp::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case SYMBOL: if (is_first_ && is_keyword_lisp(symbol_.data(), symbol_.size())) { @@ -153,6 +148,7 @@ void HighlightLisp::flush(std::string *r) { symbol_.clear(); break; case DQUOTE: + case DQUOTE_BACKSLASH: case COMMENT: *r += HI_RESET; break; diff --git a/llamafile/highlight_lua.cpp b/llamafile/highlight_lua.cpp index 7ba797e977..94a9539066 100644 --- a/llamafile/highlight_lua.cpp +++ b/llamafile/highlight_lua.cpp @@ -22,14 +22,15 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define DQUOTE 3 -#define HYPHEN 4 -#define COMMENT 5 -#define TICK 6 -#define LSB 7 -#define LITERAL 8 -#define LITERAL_RSB 9 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define DQUOTE 4 +#define DQUOTE_BACKSLASH 5 +#define HYPHEN 6 +#define COMMENT 7 +#define TICK 8 +#define LSB 9 +#define LITERAL 10 +#define LITERAL_RSB 11 
HighlightLua::HighlightLua() { } @@ -41,17 +42,6 @@ void HighlightLua::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -105,7 +95,7 @@ void HighlightLua::feed(std::string *r, std::string_view input) { *r += "--"; t_ = COMMENT; } else { - *r += '/'; + *r += '-'; t_ = NORMAL; goto Normal; } @@ -126,17 +116,31 @@ void HighlightLua::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } break; + case QUOTE_BACKSLASH: + *r += c; + t_ = QUOTE; + break; + case DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + case LSB: if (c == '=') { ++level1_; @@ -185,7 +189,6 @@ void HighlightLua::feed(std::string *r, std::string_view input) { } void HighlightLua::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case WORD: if (is_keyword_lua(word_.data(), word_.size())) { @@ -205,11 +208,14 @@ void HighlightLua::flush(std::string *r) { *r += '['; for (int i = 0; i < level1_; ++i) *r += '='; + break; case HYPHEN: *r += '-'; break; case QUOTE: + case QUOTE_BACKSLASH: case DQUOTE: + case DQUOTE_BACKSLASH: case COMMENT: case LITERAL: case LITERAL_RSB: diff --git a/llamafile/highlight_markdown.cpp b/llamafile/highlight_markdown.cpp index 965fc1490b..9723ef7f29 100644 --- a/llamafile/highlight_markdown.cpp +++ b/llamafile/highlight_markdown.cpp @@ -179,6 +179,10 @@ void HighlightMarkdown::flush(std::string *r) { case STAR: *r += '*'; break; + case TICK: + *r += '`'; + break; + case INCODE: case STRONG: case STRONG_STAR: *r += HI_RESET; diff --git a/llamafile/highlight_php.cpp b/llamafile/highlight_php.cpp index 
5a13cd1314..507dea3818 100644 --- a/llamafile/highlight_php.cpp +++ b/llamafile/highlight_php.cpp @@ -22,14 +22,16 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define DQUOTE 3 -#define SLASH 4 -#define SLASH_SLASH 5 -#define SLASH_STAR 6 -#define SLASH_STAR_STAR 7 -#define TICK 8 -#define VAR 9 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define DQUOTE 4 +#define DQUOTE_BACKSLASH 5 +#define SLASH 6 +#define SLASH_SLASH 7 +#define SLASH_STAR 8 +#define SLASH_STAR_STAR 9 +#define TICK 10 +#define TICK_BACKSLASH 11 +#define VAR 12 HighlightPhp::HighlightPhp() { } @@ -41,17 +43,6 @@ void HighlightPhp::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -157,25 +148,46 @@ void HighlightPhp::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } break; + case QUOTE_BACKSLASH: + *r += c; + t_ = QUOTE; + break; + case DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + case TICK: *r += c; if (c == '`') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = TICK_BACKSLASH; } break; + case TICK_BACKSLASH: + *r += c; + t_ = TICK; + break; + default: __builtin_unreachable(); } @@ -183,7 +195,6 @@ void HighlightPhp::feed(std::string *r, std::string_view input) { } void HighlightPhp::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case WORD: if (is_keyword_php(word_.data(), word_.size())) { diff --git a/llamafile/highlight_python.cpp b/llamafile/highlight_python.cpp index 5b5fe7a35a..85373d485c 100644 --- a/llamafile/highlight_python.cpp +++ b/llamafile/highlight_python.cpp @@ -25,19 +25,19 @@ 
#define SQUOTE 3 // ' #define SQUOTESTR 4 // '... -#define SQUOTE2 5 // '' -#define SQUOTE3 6 // '''... -#define SQUOTE31 7 // '''...' -#define SQUOTE32 8 // '''...'' +#define SQUOTESTR_BACKSLASH 5 // '... +#define SQUOTE2 6 // '' +#define SQUOTE3 7 // '''... +#define SQUOTE31 8 // '''...' +#define SQUOTE32 9 // '''...'' -#define DQUOTE 9 // " -#define DQUOTESTR 10 // "... -#define DQUOTE2 11 // "" -#define DQUOTE3 12 // """... -#define DQUOTE31 13 // """..." -#define DQUOTE32 14 // """..."" - -#define BACKSLASH 64 +#define DQUOTE 10 // " +#define DQUOTESTR 11 // "... +#define DQUOTESTR_BACKSLASH 12 // "... +#define DQUOTE2 13 // "" +#define DQUOTE3 14 // """... +#define DQUOTE31 15 // """..." +#define DQUOTE32 16 // """..."" HighlightPython::HighlightPython() { } @@ -49,17 +49,6 @@ void HighlightPython::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -117,6 +106,8 @@ void HighlightPython::feed(std::string *r, std::string_view input) { *r += c; if (c == '\'') { t_ = SQUOTE2; + } else if (c == '\\') { + t_ = SQUOTESTR_BACKSLASH; } else { t_ = SQUOTESTR; } @@ -126,8 +117,14 @@ void HighlightPython::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = SQUOTESTR_BACKSLASH; } break; + case SQUOTESTR_BACKSLASH: + *r += c; + t_ = SQUOTESTR; + break; // handle '''string''' case SQUOTE2: @@ -168,6 +165,8 @@ void HighlightPython::feed(std::string *r, std::string_view input) { *r += c; if (c == '"') { t_ = DQUOTE2; + } else if (c == '\\') { + t_ = DQUOTESTR_BACKSLASH; } else { t_ = DQUOTESTR; } @@ -177,8 +176,14 @@ void HighlightPython::feed(std::string *r, std::string_view input) { if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = 
DQUOTESTR_BACKSLASH; } break; + case DQUOTESTR_BACKSLASH: + *r += c; + t_ = DQUOTESTR; + break; // handle """string""" case DQUOTE2: @@ -221,7 +226,6 @@ void HighlightPython::feed(std::string *r, std::string_view input) { } void HighlightPython::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case WORD: if (is_keyword_python(word_.data(), word_.size())) { @@ -236,12 +240,14 @@ void HighlightPython::flush(std::string *r) { case COM: case SQUOTE: case SQUOTESTR: + case SQUOTESTR_BACKSLASH: case SQUOTE2: case SQUOTE3: case SQUOTE31: case SQUOTE32: case DQUOTE: case DQUOTESTR: + case DQUOTESTR_BACKSLASH: case DQUOTE2: case DQUOTE3: case DQUOTE31: diff --git a/llamafile/highlight_rust.cpp b/llamafile/highlight_rust.cpp index d0b1d1c084..3a9a9ccffc 100644 --- a/llamafile/highlight_rust.cpp +++ b/llamafile/highlight_rust.cpp @@ -22,16 +22,17 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define QUOTE2 3 -#define DQUOTE 4 -#define SLASH 5 -#define SLASH_SLASH 6 -#define SLASH_STAR 7 -#define SLASH_STAR_STAR 8 -#define HASH 9 -#define HASH_EXCLAIM 10 -#define ATTRIB 11 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define QUOTE2 4 +#define DQUOTE 5 +#define DQUOTE_BACKSLASH 6 +#define SLASH 7 +#define SLASH_SLASH 8 +#define SLASH_STAR 9 +#define SLASH_STAR_STAR 10 +#define HASH 11 +#define HASH_EXCLAIM 12 +#define ATTRIB 13 HighlightRust::HighlightRust() { } @@ -43,19 +44,6 @@ void HighlightRust::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - if (t_ == QUOTE) - t_ = QUOTE2; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -149,11 +137,18 @@ void HighlightRust::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } else { t_ = QUOTE2; } break; + case QUOTE_BACKSLASH: + *r += c; 
+ t_ = QUOTE2; + break; + case QUOTE2: if (c == '\'') { *r += c; @@ -171,9 +166,16 @@ void HighlightRust::feed(std::string *r, std::string_view input) { if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + case HASH: if (c == '!') { t_ = HASH_EXCLAIM; @@ -221,7 +223,6 @@ void HighlightRust::feed(std::string *r, std::string_view input) { } void HighlightRust::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case WORD: if (is_keyword_rust(word_.data(), word_.size())) { @@ -241,8 +242,10 @@ void HighlightRust::flush(std::string *r) { *r += '/'; break; case QUOTE: + case QUOTE_BACKSLASH: case QUOTE2: case DQUOTE: + case DQUOTE_BACKSLASH: case ATTRIB: case SLASH_SLASH: case SLASH_STAR: diff --git a/llamafile/highlight_sql.cpp b/llamafile/highlight_sql.cpp index 80b246e3e6..471e219889 100644 --- a/llamafile/highlight_sql.cpp +++ b/llamafile/highlight_sql.cpp @@ -22,13 +22,14 @@ #define NORMAL 0 #define WORD 1 #define QUOTE 2 -#define DQUOTE 3 -#define HYPHEN 4 -#define HYPHEN_HYPHEN 5 -#define SLASH 6 -#define SLASH_STAR 7 -#define SLASH_STAR_STAR 8 -#define BACKSLASH 64 +#define QUOTE_BACKSLASH 3 +#define DQUOTE 4 +#define DQUOTE_BACKSLASH 5 +#define HYPHEN 6 +#define HYPHEN_HYPHEN 7 +#define SLASH 8 +#define SLASH_STAR 9 +#define SLASH_STAR_STAR 10 HighlightSql::HighlightSql() { } @@ -40,17 +41,6 @@ void HighlightSql::feed(std::string *r, std::string_view input) { int c; for (size_t i = 0; i < input.size(); ++i) { c = input[i] & 255; - - if (t_ & BACKSLASH) { - t_ &= ~BACKSLASH; - *r += c; - continue; - } else if (c == '\\') { - *r += c; - t_ |= BACKSLASH; - continue; - } - switch (t_) { Normal: @@ -126,17 +116,31 @@ void HighlightSql::feed(std::string *r, std::string_view input) { if (c == '\'') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = QUOTE_BACKSLASH; } break; + case QUOTE_BACKSLASH: + *r += c; + t_ = QUOTE; + break; + case 
DQUOTE: *r += c; if (c == '"') { *r += HI_RESET; t_ = NORMAL; + } else if (c == '\\') { + t_ = DQUOTE_BACKSLASH; } break; + case DQUOTE_BACKSLASH: + *r += c; + t_ = DQUOTE; + break; + case HYPHEN: if (c == '-') { *r += HI_COMMENT; @@ -166,7 +170,6 @@ void HighlightSql::feed(std::string *r, std::string_view input) { } void HighlightSql::flush(std::string *r) { - t_ &= ~BACKSLASH; switch (t_) { case WORD: if (is_keyword_sql(word_.data(), word_.size())) { @@ -185,7 +188,9 @@ void HighlightSql::flush(std::string *r) { *r += '-'; break; case QUOTE: + case QUOTE_BACKSLASH: case DQUOTE: + case DQUOTE_BACKSLASH: case SLASH_STAR: case SLASH_STAR_STAR: case HYPHEN_HYPHEN: diff --git a/llamafile/highlight_test.cpp b/llamafile/highlight_test.cpp new file mode 100644 index 0000000000..f925b37373 --- /dev/null +++ b/llamafile/highlight_test.cpp @@ -0,0 +1,155 @@ +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// Copyright 2024 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "highlight.h"
+#include "macros.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <string>
+
+// Fuzz parameters: every language is fed ITERATIONS random strings of
+// LENGTH characters, each character drawn from CHARSET.
+#define LENGTH 10
+#define ITERATIONS 200000
+#define CHARSET "abc123{}[]!@#$%^*().\"'`\\/\n-_=&;:<>,"
+
+// Language names handed to Highlight::create(), one fuzz run per entry.
+const char *const kLanguages[] = {
+    "ada", //
+    "c", //
+    "c#", //
+    "c++", //
+    "cobol", //
+    "cs", //
+    "csharp", //
+    "css", //
+    "fortran", //
+    "go", //
+    "haskell", //
+    "html", //
+    "java", //
+    "javascript", //
+    "kotlin", //
+    "lisp", //
+    "lua", //
+    "markdown", //
+    "pascal", //
+    "php", //
+    "plain", //
+    "python", //
+    "rust", //
+    "sql", //
+};
+
+// Returns the upper 32 bits of a 64-bit linear congruential generator.
+//
+// The state is a function-local static that starts at 1, so every run
+// of the program produces the same sequence. The result may be
+// negative, because the upper 32 bits are narrowed to int.
+int rand32(void) {
+    /* Knuth, D.E., "The Art of Computer Programming," Vol 2,
+       Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
+       p. 106 (line 26) & p. 108 */
+    static unsigned long long lcg = 1;
+    lcg *= 6364136223846793005;
+    lcg += 1442695040888963407;
+    return static_cast<int>(lcg >> 32);
+}
+
+// Builds a string of n characters picked pseudo-randomly from CHARSET.
+std::string generate_random_string(int n) {
+    std::string s;
+    s.reserve(n);
+    // sizeof(CHARSET) - 1 excludes the terminating NUL. The modulo is
+    // evaluated in size_t, so a negative rand32() result still yields
+    // an in-range index.
+    for (int i = 0; i < n; ++i)
+        s += CHARSET[rand32() % (sizeof(CHARSET) - 1)];
+    return s;
+}
+
+// Returns a copy of input with every escape sequence removed.
+//
+// A sequence starts at an ESC byte (033) and runs through the next 'm'
+// inclusive; all other bytes are copied unchanged. An ESC that is never
+// followed by 'm' swallows the remainder of the input.
+std::string remove_ansi_sgr_codes(const std::string &input) {
+    std::string result;
+    result.reserve(input.length());
+    bool in_escape_sequence = false;
+    for (char c : input) {
+        if (c == '\033') {
+            in_escape_sequence = true;
+        } else if (in_escape_sequence) {
+            if (c == 'm')
+                in_escape_sequence = false;
+        } else {
+            result += c;
+        }
+    }
+    return result;
+}
+
+// Reports whether the escape sequences in input leave color turned off.
+//
+// Scans input for sequences of the form ESC '[' digits (';' digits)* 'm'
+// and remembers whether the most recently completed numeric parameter
+// was nonzero. Returns true when that parameter was zero or empty, or
+// when input holds no escape sequence at all.
+//
+// Any other byte after ESC, or inside the parameter list, prints a
+// diagnostic and terminates the process with status 1. Input that ends
+// in the middle of a sequence is not diagnosed.
+bool is_color_reset(const std::string &input) {
+    int t = 0; // 0: ordinary text, 1: just saw ESC, 2: inside parameters
+    int number = 0;
+    bool has_color = false;
+    for (char c : input) {
+        switch (t) {
+        case 0:
+            if (c == 033)
+                t = 1;
+            break;
+        case 1:
+            if (c == '[') {
+                t = 2;
+                number = 0;
+            } else {
+                fprintf(stderr, "unexpected ansi escape structure\n");
+                exit(1);
+            }
+            break;
+        case 2:
+            // <ctype.h> functions are only defined for EOF and values
+            // representable as unsigned char, hence the cast.
+            if (isdigit(static_cast<unsigned char>(c))) {
+                number *= 10;
+                number += c - '0';
+            } else if (c == 'm') {
+                has_color = !!number;
+                t = 0;
+            } else if (c == ';') {
+                has_color = !!number;
+                number = 0;
+            } else {
+                fprintf(stderr, "unexpected ansi escape structure\n");
+                exit(1);
+            }
+            break;
+        default:
+            __builtin_unreachable();
+        }
+    }
+    return !has_color;
+}
+
+// Fuzzes every highlighter listed in kLanguages.
+//
+// For each random input, two properties are checked after feed() and
+// flush(): the colorized output must end with color reset, and removing
+// the escape sequences must give back the input byte for byte. The
+// first violation is reported on stderr and exits with status 1.
+int main(int argc, char *argv[]) {
+    for (const char *language : kLanguages) {
+        // NOTE(review): Highlight::create() is declared in highlight.h,
+        // which is not visible here; guard in case it can return null.
+        std::unique_ptr<Highlight> h(Highlight::create(language));
+        if (!h) {
+            fprintf(stderr, "%s highlight could not be created\n", language);
+            exit(1);
+        }
+        // The same highlighter object is reused for every iteration.
+        for (int i = 0; i < ITERATIONS; ++i) {
+            std::string sauce = generate_random_string(LENGTH);
+            std::string colorized;
+            h->feed(&colorized, sauce);
+            h->flush(&colorized);
+            if (!is_color_reset(colorized)) {
+                fprintf(stderr, "%s highlight failed to reset color: %`'s -> %`'s\n", language,
+                        sauce.c_str(), colorized.c_str());
+                exit(1);
+            }
+            std::string plain = remove_ansi_sgr_codes(colorized);
+            if (sauce != plain) {
+                fprintf(stderr, "%s highlight failed to preserve code: %`'s -> %`'s -> %`'s\n",
+                        language, sauce.c_str(), colorized.c_str(), plain.c_str());
+                exit(1);
+            }
+        }
+    }
+}