Skip to content

Commit

Permalink
coprocessor: add gbk_chinese_ci support in coprocessor (tikv#11024)
Browse files Browse the repository at this point in the history
Signed-off-by: xiongjiwei <[email protected]>

Co-authored-by: Ti Chi Robot <[email protected]>
  • Loading branch information
xiongjiwei and ti-chi-bot authored Oct 11, 2021
1 parent 085ee22 commit 3adc033
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 11 deletions.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@

use super::*;

/// Collator for `gbk_bin` collation with padding behavior (trims right spaces).
#[derive(Debug)]
pub struct CollatorGbkBin;
/// Compile-time description of a single GBK collation.
///
/// A type implementing this trait only supplies two constants; the blanket
/// `impl<T: GbkCollator> Collator for T` in this file turns them into a full
/// collator.
pub trait GbkCollator: Send + Sync + std::fmt::Debug + 'static {
    /// Forwarded unchanged as the `Collator::IS_CASE_INSENSITIVE` constant of
    /// the implementing type.
    const IS_CASE_INSENSITIVE: bool;

    /// Raw weight table of `(0xffff + 1) * 2` bytes, i.e. two bytes for each
    /// of the `0x10000` possible indices. `char_weight` reads one `u16` from
    /// the byte range `[r * 2, r * 2 + 2)` of this table.
    const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2];
}

impl Collator for CollatorGbkBin {
impl<T: GbkCollator> Collator for T {
type Charset = CharsetGbk;
type Weight = u16;

const IS_CASE_INSENSITIVE: bool = false;
const IS_CASE_INSENSITIVE: bool = T::IS_CASE_INSENSITIVE;

#[inline]
fn char_weight(ch: char) -> Self::Weight {
Expand All @@ -21,7 +22,7 @@ impl Collator for CollatorGbkBin {
return '?' as u16;
}

(&GBK_BIN_TABLE[r * 2..r * 2 + 2]).read_u16().unwrap()
(&Self::WEIGHT_TABLE[r * 2..r * 2 + 2]).read_u16().unwrap()
}

#[inline]
Expand Down Expand Up @@ -61,6 +62,28 @@ impl Collator for CollatorGbkBin {
}
}

/// Collator for the `gbk_bin` collation, with padding behavior (right spaces
/// are trimmed).
#[derive(Debug)]
pub struct CollatorGbkBin;

impl GbkCollator for CollatorGbkBin {
    /// Character weights are looked up in `GBK_BIN_TABLE`.
    const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_BIN_TABLE;

    /// `gbk_bin` is not a case-insensitive collation.
    const IS_CASE_INSENSITIVE: bool = false;
}

/// Collator for the `gbk_chinese_ci` collation, with padding behavior (right
/// spaces are trimmed).
#[derive(Debug)]
pub struct CollatorGbkChineseCi;

impl GbkCollator for CollatorGbkChineseCi {
    /// Character weights are looked up in `GBK_CHINESE_CI_TABLE`.
    const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_CHINESE_CI_TABLE;

    /// `gbk_chinese_ci` is a case-insensitive collation.
    const IS_CASE_INSENSITIVE: bool = true;
}

// GBK_BIN_TABLE is the encoding table from Unicode to GBK code; it is exactly the same as Golang's GBK encoding.
// If a character has no mapping in GBK, 0x3F ('?') is used instead. This should not happen.
static GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data");
const GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data");

// GBK_CHINESE_CI_TABLE is the sort key table for GBK code points.
// If a character has no mapping in GBK, 0x3F ('?') is used instead. This should not happen.
const GBK_CHINESE_CI_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_chinese_ci.data");
42 changes: 38 additions & 4 deletions components/tidb_query_datatype/src/codec/collation/collator/mod.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0.

mod binary;
mod gbk_bin;
mod gbk_collation;
mod latin1_bin;
mod utf8mb4_binary;
mod utf8mb4_general_ci;
mod utf8mb4_unicode_ci;

pub use binary::*;
pub use gbk_bin::*;
pub use gbk_collation::*;
pub use latin1_bin::*;
pub use utf8mb4_binary::*;
pub use utf8mb4_general_ci::*;
Expand Down Expand Up @@ -45,9 +45,10 @@ mod tests {
(Collation::Utf8Mb4UnicodeCi, 3),
(Collation::Latin1Bin, 4),
(Collation::GbkBin, 5),
(Collation::GbkChineseCi, 6),
];
let cases = vec![
// (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, GBKBin])
// (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi])
(
"a".as_bytes(),
"a".as_bytes(),
Expand All @@ -58,6 +59,7 @@ mod tests {
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
],
),
(
Expand All @@ -70,6 +72,7 @@ mod tests {
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
],
),
(
Expand All @@ -82,6 +85,7 @@ mod tests {
Ordering::Equal,
Ordering::Greater,
Ordering::Greater,
Ordering::Equal,
],
),
(
Expand All @@ -94,6 +98,7 @@ mod tests {
Ordering::Greater,
Ordering::Greater,
Ordering::Greater,
Ordering::Greater,
],
),
(
Expand All @@ -106,6 +111,7 @@ mod tests {
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
],
),
(
Expand All @@ -118,6 +124,7 @@ mod tests {
Ordering::Equal,
Ordering::Less,
Ordering::Less,
Ordering::Less,
],
),
(
Expand All @@ -130,6 +137,7 @@ mod tests {
Ordering::Equal,
Ordering::Less,
Ordering::Less,
Ordering::Less,
],
),
(
Expand All @@ -142,6 +150,7 @@ mod tests {
Ordering::Equal,
Ordering::Greater,
Ordering::Less,
Ordering::Less,
],
),
(
Expand All @@ -154,6 +163,20 @@ mod tests {
Ordering::Less,
Ordering::Less,
Ordering::Greater,
Ordering::Greater,
],
),
(
"啊".as_bytes(),
"把".as_bytes(),
[
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
],
),
];
Expand Down Expand Up @@ -209,9 +232,10 @@ mod tests {
(Collation::Utf8Mb4UnicodeCi, 3),
(Collation::Latin1Bin, 4),
(Collation::GbkBin, 5),
(Collation::GbkChineseCi, 6),
];
let cases = vec![
// (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, GBKbin])
// (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi])
(
"a",
[
Expand All @@ -221,6 +245,7 @@ mod tests {
vec![0x0E, 0x33],
vec![0x61],
vec![0x61],
vec![0x41],
],
),
(
Expand All @@ -232,6 +257,7 @@ mod tests {
vec![0x0E, 0x33],
vec![0x41],
vec![0x41],
vec![0x41],
],
),
(
Expand All @@ -243,6 +269,7 @@ mod tests {
vec![0x0E, 0x33],
vec![0x41],
vec![0x41],
vec![0x41],
],
),
(
Expand All @@ -254,6 +281,7 @@ mod tests {
vec![0xff, 0xfd],
vec![0xF0, 0x9F, 0x98, 0x83],
vec![0x3F],
vec![0x3F],
],
),
(
Expand Down Expand Up @@ -290,6 +318,10 @@ mod tests {
0x46, 0x6f, 0x6f, 0x20, 0x3f, 0x20, 0x62, 0x61, 0x72, 0x20, 0x3f, 0x20,
0x62, 0x61, 0x7a, 0x20, 0x3f, 0x20, 0x71, 0x75, 0x78,
],
vec![
0x46, 0x4f, 0x4f, 0x20, 0x3f, 0x20, 0x42, 0x41, 0x52, 0x20, 0x3f, 0x20,
0x42, 0x41, 0x5a, 0x20, 0x3f, 0x20, 0x51, 0x55, 0x58,
],
],
),
(
Expand All @@ -304,6 +336,7 @@ mod tests {
],
vec![0xEF, 0xB7, 0xBB],
vec![0x3f],
vec![0x3f],
],
),
(
Expand All @@ -315,6 +348,7 @@ mod tests {
vec![0xFB, 0x40, 0xCE, 0x2D, 0xFB, 0x40, 0xE5, 0x87],
vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
vec![0xD6, 0xD0, 0xCE, 0xC4],
vec![0xD3, 0x21, 0xC1, 0xAD],
],
),
];
Expand Down
1 change: 1 addition & 0 deletions components/tidb_query_datatype/src/codec/collation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ macro_rules! match_template_collator {
Utf8Mb4UnicodeCi => CollatorUtf8Mb4UnicodeCi,
Latin1Bin => CollatorLatin1Bin,
GbkBin => CollatorGbkBin,
GbkChineseCi => CollatorGbkChineseCi,
],
$($tail)*
}
Expand Down
1 change: 1 addition & 0 deletions components/tidb_query_datatype/src/def/field_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ pub enum Collation {
Utf8Mb4UnicodeCi = -224,
Latin1Bin = -47,
GbkBin = -87,
GbkChineseCi = -28,
}

impl Collation {
Expand Down

0 comments on commit 3adc033

Please sign in to comment.