@@ -362,6 +362,7 @@ ssize_t utf8_len(const char *str, size_t len) {
362362
363363 while (1 ) {
364364 char_len = utf8proc_iterate (ptr , -1 , & ch );
365+ if (char_len <= 0 ) break ;
365366
366367 if (ch == 0 ) break ;
367368 remaining -= char_len ;
@@ -387,6 +388,7 @@ uint32_array *unicode_codepoints(const char *str) {
387388
388389 while (1 ) {
389390 char_len = utf8proc_iterate (ptr , -1 , & ch );
391+ if (char_len <= 0 ) break ;
390392
391393 if (ch == 0 ) break ;
392394
@@ -527,7 +529,8 @@ size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len) {
527529 len1 = utf8proc_iterate (ptr1 , -1 , & c1 );
528530 len2 = utf8proc_iterate (ptr2 , -1 , & c2 );
529531
530- if (c1 <= 0 || c2 <= 0 ) break ;
532+ if (len1 <= 0 || len2 <= 0 || c1 <= 0 || c2 <= 0 ) break ;
533+
531534 if (c1 == c2 ) {
532535 ptr1 += len1 ;
533536 ptr2 += len2 ;
@@ -572,6 +575,9 @@ size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *st
572575 len1 = utf8proc_iterate (ptr1 , -1 , & c1 );
573576 len2 = utf8proc_iterate (ptr2 , -1 , & c2 );
574577
578+ /* Note: utf8 comparison can handle a non-valid UTF-8 sequence e.g. for trie
579+ ** suffix comparison where we may be in the middle of a multi-byte character
580+ **/
575581 if (len1 < 0 && len2 < 0 && * ptr1 == * ptr2 ) {
576582 ptr1 ++ ;
577583 ptr2 ++ ;
@@ -631,6 +637,9 @@ bool utf8_equal_ignore_separators_len(const char *str1, const char *str2, size_t
631637 len1 = utf8proc_iterate (ptr1 , -1 , & c1 );
632638 len2 = utf8proc_iterate (ptr2 , -1 , & c2 );
633639
640+ /* Note: utf8 comparison can handle a non-valid UTF-8 sequence e.g. for trie
641+ ** suffix comparison where we may be in the middle of a multi-byte character
642+ **/
634643 if (len1 < 0 && len2 < 0 && * ptr1 == * ptr2 ) {
635644 ptr1 ++ ;
636645 ptr2 ++ ;
@@ -821,7 +830,7 @@ size_t string_right_spaces_len(char *str, size_t len) {
821830 while (1 ) {
822831 ssize_t char_len = utf8proc_iterate_reversed (ptr , index , & ch );
823832
824- if (ch <= 0 ) break ;
833+ if (char_len <= 0 || ch == 0 ) break ;
825834
826835 if (!utf8_is_whitespace (ch )) {
827836 break ;
@@ -840,6 +849,7 @@ inline size_t string_hyphen_prefix_len(char *str, size_t len) {
840849 int32_t unichr ;
841850 uint8_t * ptr = (uint8_t * )str ;
842851 ssize_t char_len = utf8proc_iterate (ptr , len , & unichr );
852+ if (char_len <= 0 || unichr == 0 ) return 0 ;
843853 if (utf8_is_hyphen (unichr )) {
844854 return (size_t )char_len ;
845855 }
@@ -851,6 +861,7 @@ inline size_t string_hyphen_suffix_len(char *str, size_t len) {
851861 int32_t unichr ;
852862 uint8_t * ptr = (uint8_t * )str ;
853863 ssize_t char_len = utf8proc_iterate_reversed (ptr , len , & unichr );
864+ if (char_len <= 0 || unichr == 0 ) return 0 ;
854865 if (utf8_is_hyphen (unichr )) {
855866 return (size_t )char_len ;
856867 }
@@ -867,7 +878,7 @@ size_t string_left_spaces_len(char *str, size_t len) {
867878 while (1 ) {
868879 ssize_t char_len = utf8proc_iterate (ptr , len , & ch );
869880
870- if (ch <= 0 ) break ;
881+ if (char_len <= 0 || ch == 0 ) break ;
871882
872883 if (!utf8_is_whitespace (ch )) {
873884 break ;
0 commit comments