diff --git a/components/tidb_query_datatype/src/codec/collation/charset.rs b/components/tidb_query_datatype/src/codec/collation/charset.rs
index 9982582225f..5aeca5089c1 100644
--- a/components/tidb_query_datatype/src/codec/collation/charset.rs
+++ b/components/tidb_query_datatype/src/codec/collation/charset.rs
@@ -47,3 +47,6 @@ impl Charset for CharsetUtf8mb4 {
         })
     }
 }
+
+/// GBK character data is actually stored in utf8mb4 encoding, so `CharsetGbk` aliases `CharsetUtf8mb4`.
+pub type CharsetGbk = CharsetUtf8mb4;
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.data b/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.data
new file mode 100644
index 00000000000..8422aadbf78
Binary files /dev/null and b/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.data differ
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.rs b/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.rs
new file mode 100644
index 00000000000..ebc1e85e28b
--- /dev/null
+++ b/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.rs
@@ -0,0 +1,95 @@
+// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0.
+
+use super::*;
+
+/// Collator for `gbk_bin` collation with padding behavior (trims right spaces).
+///
+/// Input is UTF-8 text; every character is weighted by the GBK code looked up in
+/// `GBK_BIN_TABLE`.
+#[derive(Debug)]
+pub struct CollatorGbkBin;
+
+impl Collator for CollatorGbkBin {
+    type Charset = CharsetGbk;
+    type Weight = u16;
+
+    const IS_CASE_INSENSITIVE: bool = false;
+
+    /// Returns the weight of `ch`: its entry in `GBK_BIN_TABLE`, or `'?'` for any
+    /// character outside the BMP.
+    #[inline]
+    fn char_weight(ch: char) -> Self::Weight {
+        // All GBK code points are in the BMP; if the incoming character is not, convert it
+        // to '?'. This should not happen.
+        let r = ch as usize;
+        if r > 0xFFFF {
+            return '?' as u16;
+        }
+
+        // Every BMP code point has a 2-byte slot in the table and `r <= 0xFFFF` here, so the
+        // slice is in bounds and exactly two bytes long; the read is not expected to fail.
+        (&GBK_BIN_TABLE[r * 2..r * 2 + 2]).read_u16().unwrap()
+    }
+
+    /// Writes the sort key of `bstr` (trailing padding spaces ignored) to `writer` and
+    /// returns the number of bytes written.
+    ///
+    /// Weights above `0xFF` are written as two big-endian bytes, all others as one byte.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `bstr` is not valid UTF-8 or if writing to `writer` fails.
+    #[inline]
+    fn write_sort_key<W: BufferWriter>(writer: &mut W, bstr: &[u8]) -> Result<usize> {
+        let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
+        let mut n = 0;
+        for ch in s.chars() {
+            let weight = Self::char_weight(ch);
+            if weight > 0xFF {
+                writer.write_u16_be(weight)?;
+                n += 2;
+            } else {
+                writer.write_u8(weight as u8)?;
+                n += 1;
+            }
+        }
+        // `n` already counts bytes, so it is the key length as-is.
+        Ok(n)
+    }
+
+    /// Compares `a` and `b` by the weights of their characters, ignoring trailing padding
+    /// spaces.
+    ///
+    /// # Errors
+    ///
+    /// Fails if either input is not valid UTF-8.
+    #[inline]
+    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
+        let sa = str::from_utf8(a)?.trim_end_matches(PADDING_SPACE);
+        let sb = str::from_utf8(b)?.trim_end_matches(PADDING_SPACE);
+        Ok(sa
+            .chars()
+            .map(Self::char_weight)
+            .cmp(sb.chars().map(Self::char_weight)))
+    }
+
+    /// Feeds the weight of every character of `bstr` (trailing padding spaces ignored) into
+    /// `state`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `bstr` is not valid UTF-8.
+    #[inline]
+    fn sort_hash<H: Hasher>(state: &mut H, bstr: &[u8]) -> Result<()> {
+        let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
+        for weight in s.chars().map(Self::char_weight) {
+            weight.hash(state);
+        }
+        Ok(())
+    }
+}
+
+// GBK_BIN_TABLE is the encoding table from Unicode to GBK codes; it is exactly the same as
+// golang's GBK encoding.
+// If there is no mapping code in GBK, 0x3F (?) is used instead. This should not happen.
+static GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data");
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs
index e2ddb1a4e3f..33697ff396e 100644
--- a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs
+++ b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs
@@ -1,12 +1,14 @@
 // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0.
 
 mod binary;
+mod gbk_bin;
 mod latin1_bin;
 mod utf8mb4_binary;
 mod utf8mb4_general_ci;
 mod utf8mb4_unicode_ci;
 
 pub use binary::*;
+pub use gbk_bin::*;
 pub use latin1_bin::*;
 pub use utf8mb4_binary::*;
 pub use utf8mb4_general_ci::*;
@@ -42,9 +44,10 @@ mod tests {
             (Collation::Utf8Mb4GeneralCi, 2),
             (Collation::Utf8Mb4UnicodeCi, 3),
             (Collation::Latin1Bin, 4),
+            (Collation::GbkBin, 5),
         ];
         let cases = vec![
-            // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi])
+            // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1Bin, GbkBin])
             (
                 "a".as_bytes(),
                 "a".as_bytes(),
@@ -54,6 +57,7 @@ mod tests {
                     Ordering::Equal,
                     Ordering::Equal,
                     Ordering::Equal,
+                    Ordering::Equal,
                 ],
             ),
             (
@@ -65,6 +69,7 @@ mod tests {
                     Ordering::Equal,
                     Ordering::Equal,
                     Ordering::Equal,
+                    Ordering::Equal,
                 ],
             ),
             (
@@ -76,6 +81,7 @@ mod tests {
                     Ordering::Equal,
                     Ordering::Equal,
                     Ordering::Greater,
+                    Ordering::Greater,
                 ],
             ),
             (
@@ -87,6 +93,7 @@ mod tests {
                     Ordering::Greater,
                     Ordering::Greater,
                     Ordering::Greater,
+                    Ordering::Greater,
                 ],
             ),
             (
@@ -98,6 +105,7 @@ mod tests {
                     Ordering::Less,
                     Ordering::Less,
                     Ordering::Less,
+                    Ordering::Less,
                 ],
             ),
             (
@@ -109,6 +117,7 @@ mod tests {
                     Ordering::Equal,
                     Ordering::Equal,
                     Ordering::Less,
+                    Ordering::Less,
                 ],
             ),
             (
@@ -120,6 +129,7 @@ mod tests {
                     Ordering::Equal,
                     Ordering::Equal,
                     Ordering::Less,
+                    Ordering::Less,
                 ],
             ),
             (
@@ -131,6 +141,19 @@ mod tests {
                     Ordering::Less,
                     Ordering::Equal,
                     Ordering::Greater,
+                    Ordering::Less,
+                ],
+            ),
+            (
+                "中文".as_bytes(),
+                "汉字".as_bytes(),
+                [
+                    Ordering::Less,
+                    Ordering::Less,
+                    Ordering::Less,
+                    Ordering::Less,
+                    Ordering::Less,
+                    Ordering::Greater,
                 ],
             ),
         ];
@@ -185,9 +208,10 @@ mod tests {
             (Collation::Utf8Mb4GeneralCi, 2),
             (Collation::Utf8Mb4UnicodeCi, 3),
             (Collation::Latin1Bin, 4),
+            (Collation::GbkBin, 5),
         ];
         let cases = vec![
-            // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi])
+            // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1Bin, GbkBin])
             (
                 "a",
                 [
@@ -196,6 +220,7 @@ mod tests {
                     vec![0x00, 0x41],
                     vec![0x0E, 0x33],
                     vec![0x61],
+                    vec![0x61],
                 ],
             ),
             (
@@ -206,6 +231,7 @@ mod tests {
                     vec![0x00, 0x41],
                     vec![0x0E, 0x33],
                     vec![0x41],
+                    vec![0x41],
                 ],
             ),
             (
@@ -216,6 +242,7 @@ mod tests {
                     vec![0x00, 0x41],
                     vec![0x0E, 0x33],
                     vec![0x41],
+                    vec![0x41],
                 ],
             ),
             (
@@ -226,6 +253,7 @@ mod tests {
                     vec![0xff, 0xfd],
                     vec![0xff, 0xfd],
                     vec![0xF0, 0x9F, 0x98, 0x83],
+                    vec![0x3F],
                 ],
             ),
             (
@@ -258,6 +286,10 @@ mod tests {
                         0x9D, 0x8C, 0x86, 0x20, 0x62, 0x61, 0x7A, 0x20, 0xE2, 0x98, 0x83, 0x20,
                         0x71, 0x75, 0x78,
                     ],
+                    vec![
+                        0x46, 0x6f, 0x6f, 0x20, 0x3f, 0x20, 0x62, 0x61, 0x72, 0x20, 0x3f, 0x20,
+                        0x62, 0x61, 0x7a, 0x20, 0x3f, 0x20, 0x71, 0x75, 0x78,
+                    ],
                 ],
             ),
             (
@@ -271,6 +303,18 @@ mod tests {
                         0x13, 0xAB, 0x13, 0xB7,
                     ],
                     vec![0xEF, 0xB7, 0xBB],
+                    vec![0x3f],
+                ],
+            ),
+            (
+                "中文",
+                [
+                    vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
+                    vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
+                    vec![0x4E, 0x2D, 0x65, 0x87],
+                    vec![0xFB, 0x40, 0xCE, 0x2D, 0xFB, 0x40, 0xE5, 0x87],
+                    vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
+                    vec![0xD6, 0xD0, 0xCE, 0xC4],
                 ],
             ),
         ];
diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs
index d1559e56178..e1e8d731362 100644
--- a/components/tidb_query_datatype/src/codec/collation/mod.rs
+++ b/components/tidb_query_datatype/src/codec/collation/mod.rs
@@ -27,6 +27,7 @@ macro_rules! match_template_collator {
                 Utf8Mb4GeneralCi => CollatorUtf8Mb4GeneralCi,
                 Utf8Mb4UnicodeCi => CollatorUtf8Mb4UnicodeCi,
                 Latin1Bin => CollatorLatin1Bin,
+                GbkBin => CollatorGbkBin,
             ],
             $($tail)*
         }
diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs
index 3a7a98fb132..d6e77e6b2d6 100644
--- a/components/tidb_query_datatype/src/def/field_type.rs
+++ b/components/tidb_query_datatype/src/def/field_type.rs
@@ -111,6 +111,7 @@ pub enum Collation {
     Utf8Mb4GeneralCi = -45,
     Utf8Mb4UnicodeCi = -224,
     Latin1Bin = -47,
+    GbkBin = -87,
 }
 
 impl Collation {
@@ -126,6 +127,7 @@ impl Collation {
             -47 => Ok(Collation::Latin1Bin),
             -63 | 63 | 47 => Ok(Collation::Binary),
             -224 | -192 => Ok(Collation::Utf8Mb4UnicodeCi),
+            -87 => Ok(Collation::GbkBin),
             n if n >= 0 => Ok(Collation::Utf8Mb4BinNoPadding),
             n => Err(DataTypeError::UnsupportedCollation { code: n }),
         }