forked from tikv/tikv
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
coprocessor: add gbk_bin support in coprocessor (tikv#10996)
* add gbk_bin support Signed-off-by: xiongjiwei <[email protected]> * address comments Signed-off-by: xiongjiwei <[email protected]>
- Loading branch information
1 parent
f9bbdbc
commit b0938c2
Showing
6 changed files
with
118 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+128 KB
components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.data
Binary file not shown.
66 changes: 66 additions & 0 deletions
66
components/tidb_query_datatype/src/codec/collation/collator/gbk_bin.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. | ||
|
||
use super::*; | ||
|
||
/// Collator implementing the `gbk_bin` collation.
///
/// This is a padding collation: trailing padding spaces are trimmed from the
/// input before it is compared, hashed, or turned into a sort key.
#[derive(Debug)]
pub struct CollatorGbkBin;
|
||
impl Collator for CollatorGbkBin { | ||
type Charset = CharsetGbk; | ||
type Weight = u16; | ||
|
||
const IS_CASE_INSENSITIVE: bool = false; | ||
|
||
#[inline] | ||
fn char_weight(ch: char) -> Self::Weight { | ||
// All GBK code point are in BMP, if the incoming character is not, convert it to '?'. | ||
// This should not happened. | ||
let r = ch as usize; | ||
if r > 0xFFFF { | ||
return '?' as u16; | ||
} | ||
|
||
(&GBK_BIN_TABLE[r * 2..r * 2 + 2]).read_u16().unwrap() | ||
} | ||
|
||
#[inline] | ||
fn write_sort_key<W: BufferWriter>(writer: &mut W, bstr: &[u8]) -> Result<usize> { | ||
let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE); | ||
let mut n = 0; | ||
for ch in s.chars() { | ||
let weight = Self::char_weight(ch); | ||
if weight > 0xFF { | ||
writer.write_u16_be(weight)?; | ||
n += 2; | ||
} else { | ||
writer.write_u8(weight as u8)?; | ||
n += 1; | ||
} | ||
} | ||
Ok(n * std::mem::size_of::<u8>()) | ||
} | ||
|
||
#[inline] | ||
fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> { | ||
let sa = str::from_utf8(a)?.trim_end_matches(PADDING_SPACE); | ||
let sb = str::from_utf8(b)?.trim_end_matches(PADDING_SPACE); | ||
Ok(sa | ||
.chars() | ||
.map(Self::char_weight) | ||
.cmp(sb.chars().map(Self::char_weight))) | ||
} | ||
|
||
#[inline] | ||
fn sort_hash<H: Hasher>(state: &mut H, bstr: &[u8]) -> Result<()> { | ||
let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE); | ||
for ch in s.chars().map(Self::char_weight) { | ||
ch.hash(state); | ||
} | ||
Ok(()) | ||
} | ||
} | ||
|
||
// GBK_BIN_TABLE are the encoding tables from Unicode to GBK code, it is totally the same with golang's GBK encoding. | ||
// If there is no mapping code in GBK, use 0x3F(?) instead. It should not happened. | ||
static GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data"); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters