Skip to content

Commit

Permalink
coprocessor: add gbk_bin support in coprocessor (tikv#10996)
Browse files Browse the repository at this point in the history
* add gbk_bin support

Signed-off-by: xiongjiwei <[email protected]>

* address comments

Signed-off-by: xiongjiwei <[email protected]>
  • Loading branch information
xiongjiwei authored Sep 28, 2021
1 parent f9bbdbc commit b0938c2
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 2 deletions.
3 changes: 3 additions & 0 deletions components/tidb_query_datatype/src/codec/collation/charset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,6 @@ impl Charset for CharsetUtf8mb4 {
})
}
}

// GBK character data is actually stored using the utf8mb4 character encoding,
// so the GBK charset is simply an alias of the utf8mb4 one.
pub type CharsetGbk = CharsetUtf8mb4;
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0.

use super::*;

/// Collator for the `gbk_bin` collation with padding behavior.
///
/// Trailing padding spaces are trimmed before a string is compared, hashed or
/// turned into a sort key; the remaining characters are ordered by the weight
/// that `char_weight` assigns to each of them.
#[derive(Debug)]
pub struct CollatorGbkBin;

impl Collator for CollatorGbkBin {
    type Charset = CharsetGbk;
    type Weight = u16;

    const IS_CASE_INSENSITIVE: bool = false;

    /// Looks up the weight of `ch` in `GBK_BIN_TABLE`.
    ///
    /// The table holds one two-byte entry per code point in `0..=0xFFFF`, so the
    /// entry for a code point starts at twice its value.
    #[inline]
    fn char_weight(ch: char) -> Self::Weight {
        let code_point = ch as usize;
        // Every GBK code point lies in the BMP. A character outside of it is not
        // expected here; if one shows up anyway it is weighted as '?'.
        if code_point > 0xFFFF {
            return '?' as u16;
        }

        let offset = code_point * 2;
        (&GBK_BIN_TABLE[offset..offset + 2]).read_u16().unwrap()
    }

    /// Writes the sort key of `bstr` into `writer` after trimming trailing padding
    /// spaces.
    ///
    /// A weight that fits into one byte is written as a single byte, any larger
    /// weight as a big-endian `u16`. Returns the number of bytes tallied for the
    /// written weights, or an error if `bstr` is not valid UTF-8 or a write fails.
    #[inline]
    fn write_sort_key<W: BufferWriter>(writer: &mut W, bstr: &[u8]) -> Result<usize> {
        let trimmed = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
        let mut written = 0;
        for weight in trimmed.chars().map(Self::char_weight) {
            if weight <= 0xFF {
                writer.write_u8(weight as u8)?;
                written += 1;
            } else {
                writer.write_u16_be(weight)?;
                written += 2;
            }
        }
        Ok(written)
    }

    /// Compares `a` and `b` by the sequence of their character weights, ignoring
    /// trailing padding spaces on both sides.
    ///
    /// Returns an error if either input is not valid UTF-8.
    #[inline]
    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
        let lhs = str::from_utf8(a)?.trim_end_matches(PADDING_SPACE);
        let rhs = str::from_utf8(b)?.trim_end_matches(PADDING_SPACE);
        let lhs_weights = lhs.chars().map(Self::char_weight);
        let rhs_weights = rhs.chars().map(Self::char_weight);
        Ok(lhs_weights.cmp(rhs_weights))
    }

    /// Feeds the weight of every character of `bstr` (trailing padding spaces
    /// removed) into `state`.
    ///
    /// Returns an error if `bstr` is not valid UTF-8.
    #[inline]
    fn sort_hash<H: Hasher>(state: &mut H, bstr: &[u8]) -> Result<()> {
        let trimmed = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
        trimmed
            .chars()
            .for_each(|ch| Self::char_weight(ch).hash(state));
        Ok(())
    }
}

// GBK_BIN_TABLE is the encoding table from Unicode code points to GBK codes; it is
// identical to golang's GBK encoding. It holds one two-byte entry for every code
// point in 0..=0xFFFF, so the entry for code point `r` starts at byte `r * 2`.
// If a code point has no mapping in GBK, 0x3F ('?') is used instead. That should
// not happen.
static GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data");
48 changes: 46 additions & 2 deletions components/tidb_query_datatype/src/codec/collation/collator/mod.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0.

mod binary;
mod gbk_bin;
mod latin1_bin;
mod utf8mb4_binary;
mod utf8mb4_general_ci;
mod utf8mb4_unicode_ci;

pub use binary::*;
pub use gbk_bin::*;
pub use latin1_bin::*;
pub use utf8mb4_binary::*;
pub use utf8mb4_general_ci::*;
Expand Down Expand Up @@ -42,9 +44,10 @@ mod tests {
(Collation::Utf8Mb4GeneralCi, 2),
(Collation::Utf8Mb4UnicodeCi, 3),
(Collation::Latin1Bin, 4),
(Collation::GbkBin, 5),
];
let cases = vec![
// (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi])
// (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1Bin, GbkBin])
(
"a".as_bytes(),
"a".as_bytes(),
Expand All @@ -54,6 +57,7 @@ mod tests {
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
],
),
(
Expand All @@ -65,6 +69,7 @@ mod tests {
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
],
),
(
Expand All @@ -76,6 +81,7 @@ mod tests {
Ordering::Equal,
Ordering::Equal,
Ordering::Greater,
Ordering::Greater,
],
),
(
Expand All @@ -87,6 +93,7 @@ mod tests {
Ordering::Greater,
Ordering::Greater,
Ordering::Greater,
Ordering::Greater,
],
),
(
Expand All @@ -98,6 +105,7 @@ mod tests {
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
],
),
(
Expand All @@ -109,6 +117,7 @@ mod tests {
Ordering::Equal,
Ordering::Equal,
Ordering::Less,
Ordering::Less,
],
),
(
Expand All @@ -120,6 +129,7 @@ mod tests {
Ordering::Equal,
Ordering::Equal,
Ordering::Less,
Ordering::Less,
],
),
(
Expand All @@ -131,6 +141,19 @@ mod tests {
Ordering::Less,
Ordering::Equal,
Ordering::Greater,
Ordering::Less,
],
),
(
"中文".as_bytes(),
"汉字".as_bytes(),
[
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Greater,
],
),
];
Expand Down Expand Up @@ -185,9 +208,10 @@ mod tests {
(Collation::Utf8Mb4GeneralCi, 2),
(Collation::Utf8Mb4UnicodeCi, 3),
(Collation::Latin1Bin, 4),
(Collation::GbkBin, 5),
];
let cases = vec![
// (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi])
// (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1Bin, GbkBin])
(
"a",
[
Expand All @@ -196,6 +220,7 @@ mod tests {
vec![0x00, 0x41],
vec![0x0E, 0x33],
vec![0x61],
vec![0x61],
],
),
(
Expand All @@ -206,6 +231,7 @@ mod tests {
vec![0x00, 0x41],
vec![0x0E, 0x33],
vec![0x41],
vec![0x41],
],
),
(
Expand All @@ -216,6 +242,7 @@ mod tests {
vec![0x00, 0x41],
vec![0x0E, 0x33],
vec![0x41],
vec![0x41],
],
),
(
Expand All @@ -226,6 +253,7 @@ mod tests {
vec![0xff, 0xfd],
vec![0xff, 0xfd],
vec![0xF0, 0x9F, 0x98, 0x83],
vec![0x3F],
],
),
(
Expand Down Expand Up @@ -258,6 +286,10 @@ mod tests {
0x9D, 0x8C, 0x86, 0x20, 0x62, 0x61, 0x7A, 0x20, 0xE2, 0x98, 0x83, 0x20,
0x71, 0x75, 0x78,
],
vec![
0x46, 0x6f, 0x6f, 0x20, 0x3f, 0x20, 0x62, 0x61, 0x72, 0x20, 0x3f, 0x20,
0x62, 0x61, 0x7a, 0x20, 0x3f, 0x20, 0x71, 0x75, 0x78,
],
],
),
(
Expand All @@ -271,6 +303,18 @@ mod tests {
0x13, 0xAB, 0x13, 0xB7,
],
vec![0xEF, 0xB7, 0xBB],
vec![0x3f],
],
),
(
"中文",
[
vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
vec![0x4E, 0x2D, 0x65, 0x87],
vec![0xFB, 0x40, 0xCE, 0x2D, 0xFB, 0x40, 0xE5, 0x87],
vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
vec![0xD6, 0xD0, 0xCE, 0xC4],
],
),
];
Expand Down
1 change: 1 addition & 0 deletions components/tidb_query_datatype/src/codec/collation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ macro_rules! match_template_collator {
Utf8Mb4GeneralCi => CollatorUtf8Mb4GeneralCi,
Utf8Mb4UnicodeCi => CollatorUtf8Mb4UnicodeCi,
Latin1Bin => CollatorLatin1Bin,
GbkBin => CollatorGbkBin,
],
$($tail)*
}
Expand Down
2 changes: 2 additions & 0 deletions components/tidb_query_datatype/src/def/field_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ pub enum Collation {
Utf8Mb4GeneralCi = -45,
Utf8Mb4UnicodeCi = -224,
Latin1Bin = -47,
GbkBin = -87,
}

impl Collation {
Expand All @@ -126,6 +127,7 @@ impl Collation {
-47 => Ok(Collation::Latin1Bin),
-63 | 63 | 47 => Ok(Collation::Binary),
-224 | -192 => Ok(Collation::Utf8Mb4UnicodeCi),
-87 => Ok(Collation::GbkBin),
n if n >= 0 => Ok(Collation::Utf8Mb4BinNoPadding),
n => Err(DataTypeError::UnsupportedCollation { code: n }),
}
Expand Down

0 comments on commit b0938c2

Please sign in to comment.