diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gbk_chinese_ci.data b/components/tidb_query_datatype/src/codec/collation/collator/gbk_chinese_ci.data new file mode 100644 index 00000000000..a10335ac035 Binary files /dev/null and b/components/tidb_query_datatype/src/codec/collation/collator/gbk_chinese_ci.data differ diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.rs b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs similarity index 64% rename from components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.rs rename to components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs index ebc1e85e28b..9c2dd2497f1 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs @@ -2,15 +2,16 @@ use super::*; -/// Collator for `gbk_bin` collation with padding behavior (trims right spaces). -#[derive(Debug)] -pub struct CollatorGbkBin; +pub trait GbkCollator: 'static + std::marker::Send + std::marker::Sync + std::fmt::Debug { + const IS_CASE_INSENSITIVE: bool; + const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2]; +} -impl Collator for CollatorGbkBin { +impl<T: GbkCollator> Collator for T { type Charset = CharsetGbk; type Weight = u16; - const IS_CASE_INSENSITIVE: bool = false; + const IS_CASE_INSENSITIVE: bool = T::IS_CASE_INSENSITIVE; #[inline] fn char_weight(ch: char) -> Self::Weight { @@ -21,7 +22,7 @@ impl Collator for CollatorGbkBin { return '?' as u16; } - (&GBK_BIN_TABLE[r * 2..r * 2 + 2]).read_u16().unwrap() + (&Self::WEIGHT_TABLE[r * 2..r * 2 + 2]).read_u16().unwrap() } #[inline] @@ -61,6 +62,28 @@ impl Collator for CollatorGbkBin { } } +/// Collator for `gbk_bin` collation with padding behavior (trims right spaces). 
+#[derive(Debug)] +pub struct CollatorGbkBin; + +impl GbkCollator for CollatorGbkBin { + const IS_CASE_INSENSITIVE: bool = false; + const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_BIN_TABLE; +} + +/// Collator for `gbk_chinese_ci` collation with padding behavior (trims right spaces). +#[derive(Debug)] +pub struct CollatorGbkChineseCi; + +impl GbkCollator for CollatorGbkChineseCi { + const IS_CASE_INSENSITIVE: bool = true; + const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_CHINESE_CI_TABLE; +} + // GBK_BIN_TABLE are the encoding tables from Unicode to GBK code, it is totally the same with golang's GBK encoding. // If there is no mapping code in GBK, use 0x3F(?) instead. It should not happened. -static GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data"); +const GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data"); + +// GBK_CHINESE_CI_TABLE is the sort key table for GBK code points. +// If there is no mapping code in GBK, use 0x3F(?) instead. It should not happen. +const GBK_CHINESE_CI_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_chinese_ci.data"); diff --git a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs index 33697ff396e..f1c694d0f67 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs @@ -1,14 +1,14 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
mod binary; -mod gbk_bin; +mod gbk_collation; mod latin1_bin; mod utf8mb4_binary; mod utf8mb4_general_ci; mod utf8mb4_unicode_ci; pub use binary::*; -pub use gbk_bin::*; +pub use gbk_collation::*; pub use latin1_bin::*; pub use utf8mb4_binary::*; pub use utf8mb4_general_ci::*; @@ -45,9 +45,10 @@ mod tests { (Collation::Utf8Mb4UnicodeCi, 3), (Collation::Latin1Bin, 4), (Collation::GbkBin, 5), + (Collation::GbkChineseCi, 6), ]; let cases = vec![ - // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, GBKBin]) + // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi]) ( "a".as_bytes(), "a".as_bytes(), @@ -58,6 +59,7 @@ mod tests { Ordering::Equal, Ordering::Equal, Ordering::Equal, + Ordering::Equal, ], ), ( @@ -70,6 +72,7 @@ mod tests { Ordering::Equal, Ordering::Equal, Ordering::Equal, + Ordering::Equal, ], ), ( @@ -82,6 +85,7 @@ mod tests { Ordering::Equal, Ordering::Greater, Ordering::Greater, + Ordering::Equal, ], ), ( @@ -94,6 +98,7 @@ mod tests { Ordering::Greater, Ordering::Greater, Ordering::Greater, + Ordering::Greater, ], ), ( @@ -106,6 +111,7 @@ mod tests { Ordering::Less, Ordering::Less, Ordering::Less, + Ordering::Less, ], ), ( @@ -118,6 +124,7 @@ mod tests { Ordering::Equal, Ordering::Less, Ordering::Less, + Ordering::Less, ], ), ( @@ -130,6 +137,7 @@ mod tests { Ordering::Equal, Ordering::Less, Ordering::Less, + Ordering::Less, ], ), ( @@ -142,6 +150,7 @@ mod tests { Ordering::Equal, Ordering::Greater, Ordering::Less, + Ordering::Less, ], ), ( @@ -154,6 +163,20 @@ mod tests { Ordering::Less, Ordering::Less, Ordering::Greater, + Ordering::Greater, + ], + ), + ( + "啊".as_bytes(), + "把".as_bytes(), + [ + Ordering::Less, + Ordering::Less, + Ordering::Less, + Ordering::Less, + Ordering::Less, + Ordering::Less, + Ordering::Less, ], ), ]; @@ -209,9 +232,10 @@ mod tests { (Collation::Utf8Mb4UnicodeCi, 3), (Collation::Latin1Bin, 4), (Collation::GbkBin, 5), + 
(Collation::GbkChineseCi, 6), ]; let cases = vec![ - // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, GBKbin]) + // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi]) ( "a", [ @@ -221,6 +245,7 @@ mod tests { vec![0x0E, 0x33], vec![0x61], vec![0x61], + vec![0x41], ], ), ( @@ -232,6 +257,7 @@ mod tests { vec![0x0E, 0x33], vec![0x41], vec![0x41], + vec![0x41], ], ), ( @@ -243,6 +269,7 @@ mod tests { vec![0x0E, 0x33], vec![0x41], vec![0x41], + vec![0x41], ], ), ( @@ -254,6 +281,7 @@ mod tests { vec![0xff, 0xfd], vec![0xF0, 0x9F, 0x98, 0x83], vec![0x3F], + vec![0x3F], ], ), ( @@ -290,6 +318,10 @@ mod tests { 0x46, 0x6f, 0x6f, 0x20, 0x3f, 0x20, 0x62, 0x61, 0x72, 0x20, 0x3f, 0x20, 0x62, 0x61, 0x7a, 0x20, 0x3f, 0x20, 0x71, 0x75, 0x78, ], + vec![ + 0x46, 0x4f, 0x4f, 0x20, 0x3f, 0x20, 0x42, 0x41, 0x52, 0x20, 0x3f, 0x20, + 0x42, 0x41, 0x5a, 0x20, 0x3f, 0x20, 0x51, 0x55, 0x58, + ], ], ), ( @@ -304,6 +336,7 @@ mod tests { ], vec![0xEF, 0xB7, 0xBB], vec![0x3f], + vec![0x3f], ], ), ( @@ -315,6 +348,7 @@ mod tests { vec![0xFB, 0x40, 0xCE, 0x2D, 0xFB, 0x40, 0xE5, 0x87], vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87], vec![0xD6, 0xD0, 0xCE, 0xC4], + vec![0xD3, 0x21, 0xC1, 0xAD], ], ), ]; diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index e1e8d731362..ed9cbe654ef 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -28,6 +28,7 @@ macro_rules! 
match_template_collator { Utf8Mb4UnicodeCi => CollatorUtf8Mb4UnicodeCi, Latin1Bin => CollatorLatin1Bin, GbkBin => CollatorGbkBin, + GbkChineseCi => CollatorGbkChineseCi, ], $($tail)* } diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index d6e77e6b2d6..561f1f53c1a 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ b/components/tidb_query_datatype/src/def/field_type.rs @@ -112,6 +112,7 @@ pub enum Collation { Utf8Mb4UnicodeCi = -224, Latin1Bin = -47, GbkBin = -87, + GbkChineseCi = -28, } impl Collation {