Skip to content

Commit

Permalink
Merge DBCS lead type and trail byte array.
Browse files Browse the repository at this point in the history
  • Loading branch information
zufuliu committed Nov 23, 2023
1 parent 332a113 commit 3c8699f
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 51 deletions.
43 changes: 42 additions & 1 deletion scintilla/scripts/DBCS.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
from enum import IntFlag
import MultiStageTable

DBCSCodePages = [
'cp932', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
Expand All @@ -9,6 +10,45 @@
'cp1361', 'johab',
]

# Byte ranges found in Wikipedia articles with relevant search strings in each case
DBCSByteRanges = {
    # Shift-JIS
    'cp932': {
        'lead': [(0x81, 0x9F), (0xE0, 0xFC)],
        'trail': [(0x40, 0x7E), (0x80, 0xFC)],
    },
    # GBK
    'cp936': {
        'lead': [(0x81, 0xFE)],
        'trail': [(0x40, 0x7E), (0x80, 0xFE)],
    },
    # Korean Unified Hangul Code
    'cp949': {
        'lead': [(0x81, 0xFE)],
        'trail': [(0x41, 0x5A), (0x61, 0x7A), (0x81, 0xFE)],
    },
    # Big5
    'cp950': {
        'lead': [(0x81, 0xFE)],
        'trail': [(0x40, 0x7E), (0xA1, 0xFE)],
    },
    # Korean Johab
    'cp1361': {
        'lead': [(0x84, 0xD3), (0xD8, 0xDE), (0xE0, 0xF9)],
        'trail': [(0x31, 0x7E), (0x81, 0xFE)],
    },
}

def print_dbcs_byte_ranges():
    """Print a brace-wrapped, run-length-encoded byte table for each code page.

    For every entry in DBCSByteRanges a 256-element table is built in which
    a value of 1 marks a lead byte and bit value 2 marks a trail byte (a byte
    that is both ends up as 3). The table is passed through
    MultiStageTable.runLengthEncode and MultiStageTable.dumpArray, and the
    first element of the dumped result is printed between '{' and '}'.
    """
    LEAD_FLAG = 1
    TRAIL_FLAG = 2
    for codePage, ranges in DBCSByteRanges.items():
        flags = [0] * 256
        # Lead ranges are applied first, onto an all-zero table.
        for first, last in ranges['lead']:
            for byte in range(first, last + 1):
                flags[byte] = LEAD_FLAG
        # Trail ranges are OR-ed in so an existing lead flag is preserved.
        for first, last in ranges['trail']:
            for byte in range(first, last + 1):
                flags[byte] |= TRAIL_FLAG
        # Only the third returned value (the encoded data) is used here.
        _, _, encoded = MultiStageTable.runLengthEncode(codePage, flags)
        # NOTE(review): 20 is presumably the number of items per dumped line -- confirm in MultiStageTable.
        dumped = MultiStageTable.dumpArray(encoded, 20)
        print(f'{{{dumped[0]}}}')

def to_byte_ranges(items):
ranges = []
if items:
Expand Down Expand Up @@ -140,5 +180,6 @@ def print_dbcs_valid_bytes():
print(' lead:', format_byte_ranges(validLead))
print(' trail:', format_byte_ranges(validTrail))

#print_dbcs_byte_ranges()
#print_dbcs_test_char(DBCSTrailKind.All)
print_dbcs_valid_bytes()
#print_dbcs_valid_bytes()
64 changes: 21 additions & 43 deletions scintilla/src/CharClassify.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -678,10 +678,6 @@ void CharClassify::InitUnicodeData() noexcept {

namespace {

inline void SetRange(bool (&bs)[256], int low, int high) noexcept {
memset(bs + low, true, high - low + 1);
}

//dbcs++Autogenerated -- start of section automatically generated
// Created with Python 3.13.0a1, Unicode 15.1.0
const uint16_t CharClassifyRLE_CP932[] = {
Expand Down Expand Up @@ -891,67 +887,49 @@ const uint8_t CharClassify_CP1361Data[] = {

DBCSCharClassify::DBCSCharClassify(int codePage_) noexcept:
codePage(codePage_),
minTrailByte(0x40),
leadByte{},
trailByte{} {
// Byte ranges found in Wikipedia articles with relevant search strings in each case
minTrailByte(0x40) {
switch (codePage_) {
case 932:
case 932: {
// Shift-JIS
SetRange(leadByte, 0x81, 0x9F);
SetRange(leadByte, 0xE0, 0xFC);
// Lead bytes F0 to FC may be a Microsoft addition.

SetRange(trailByte, 0x40, 0x7E);
SetRange(trailByte, 0x80, 0xFC);
constexpr uint8_t BytesRLE_CP932[] = {252, 4, 254, 4, 6, 127, 254, 6, 119, 12,};
ExpandRLE(BytesRLE_CP932, leadByte);

ExpandRLE(CharClassifyRLE_CP932, classifyMap);
break;
} break;

case 936:
case 936: {
// GBK
SetRange(leadByte, 0x81, 0xFE);

SetRange(trailByte, 0x40, 0x7E);
SetRange(trailByte, 0x80, 0xFE);
constexpr uint8_t BytesRLE_CP936[] = {252, 4, 254, 4, 6, 255, 255, 4,};
ExpandRLE(BytesRLE_CP936, leadByte);

ExpandRLE(CharClassifyRLE_CP936, classifyMap);
break;
} break;

case 949:
case 949: {
// Korean Unified Hangul Code, Wansung KS C-5601-1987
minTrailByte = 0x41;
SetRange(leadByte, 0x81, 0xFE);

SetRange(trailByte, 0x41, 0x5A);
SetRange(trailByte, 0x61, 0x7A);
SetRange(trailByte, 0x81, 0xFE);
constexpr uint8_t BytesRLE_CP949[] = {252, 8, 106, 24, 106, 24, 255, 255, 4,};
ExpandRLE(BytesRLE_CP949, leadByte);

ExpandRLE(CharClassifyRLE_CP949, classifyMap);
break;
} break;

case 950:
case 950: {
// Big5
SetRange(leadByte, 0x81, 0xFE);

SetRange(trailByte, 0x40, 0x7E);
SetRange(trailByte, 0xA1, 0xFE);
constexpr uint8_t BytesRLE_CP950[] = {252, 4, 254, 8, 129, 255, 127, 4,};
ExpandRLE(BytesRLE_CP950, leadByte);

ExpandRLE(CharClassifyRLE_CP950, classifyMap);
break;
} break;

case 1361:
default: {
// Korean Johab, KS C-5601-1992
minTrailByte = 0x31;
SetRange(leadByte, 0x84, 0xD3);
SetRange(leadByte, 0xD8, 0xDE);
SetRange(leadByte, 0xE0, 0xF9);

SetRange(trailByte, 0x31, 0x7E);
SetRange(trailByte, 0x81, 0xFE);
constexpr uint8_t BytesRLE_CP1361[] = {196, 254, 62, 8, 14, 255, 71, 18, 31, 6, 107, 22, 4,};
ExpandRLE(BytesRLE_CP1361, leadByte);

ExpandRunBlock(CharClassify_CP1361Index, CharClassify_CP1361Data, classifyMap,
CharClassify_CP1361IndexBit, CharClassify_CP1361BlockBit);
break;
} break;
}
}
7 changes: 3 additions & 4 deletions scintilla/src/CharClassify.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,10 @@ class DBCSCharClassify {
explicit DBCSCharClassify(int codePage_) noexcept;

bool IsLeadByte(unsigned char ch) const noexcept {
return leadByte[ch];
return leadByte[ch] & true;
}
bool IsTrailByte(unsigned char ch) const noexcept {
return trailByte[ch];
return leadByte[ch] & 2;
}

CharacterClass ClassifyCharacter(uint32_t ch) const noexcept {
Expand All @@ -164,8 +164,7 @@ class DBCSCharClassify {
private:
const int codePage;
int minTrailByte;
bool leadByte[256];
bool trailByte[256];
uint8_t leadByte[256];
unsigned char classifyMap[0xffff + 1];
};

Expand Down
3 changes: 1 addition & 2 deletions scintilla/src/Document.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ Document::Document(DocumentOption options) :
enteredModification = 0;
enteredStyling = 0;
enteredReadOnlyCount = 0;
matchesValid = false;
insertionSet = false;
tabInChars = 8;
indentInChars = 0;
Expand All @@ -164,8 +165,6 @@ Document::Document(DocumentOption options) :
tabIndents = true;
backspaceUnindents = false;

matchesValid = false;

perLineData[ldMarkers] = std::make_unique<LineMarkers>();
perLineData[ldLevels] = std::make_unique<LineLevels>();
perLineData[ldState] = std::make_unique<LineState>();
Expand Down
2 changes: 1 addition & 1 deletion scintilla/src/RESearch.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,7 @@ const char *RESearch::DoCompile(const char *pattern, size_t length, FindOption f

case '[': { /* match char class */
int prevChar = 0;
bool negative = false;/* xor mask -CCL/NCL */
bool negative = false;

i++;
++p;
Expand Down

0 comments on commit 3c8699f

Please sign in to comment.