From 3c8699f8854b915bb22f14182054279f7c515b94 Mon Sep 17 00:00:00 2001 From: zufuliu Date: Thu, 23 Nov 2023 18:07:45 +0800 Subject: [PATCH] Merge DBCS lead type and trail byte array. --- scintilla/scripts/DBCS.py | 43 ++++++++++++++++++++++- scintilla/src/CharClassify.cxx | 64 +++++++++++----------------------- scintilla/src/CharClassify.h | 7 ++-- scintilla/src/Document.cxx | 3 +- scintilla/src/RESearch.cxx | 2 +- 5 files changed, 68 insertions(+), 51 deletions(-) diff --git a/scintilla/scripts/DBCS.py b/scintilla/scripts/DBCS.py index 9d10349e8d..114d07e958 100644 --- a/scintilla/scripts/DBCS.py +++ b/scintilla/scripts/DBCS.py @@ -1,5 +1,6 @@ import sys from enum import IntFlag +import MultiStageTable DBCSCodePages = [ 'cp932', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213', @@ -9,6 +10,45 @@ 'cp1361', 'johab', ] +# Byte ranges found in Wikipedia articles with relevant search strings in each case +DBCSByteRanges = { + 'cp932': { + 'lead': [(0x81, 0x9F), (0xE0, 0xFC)], + 'trail': [(0x40, 0x7E), (0x80, 0xFC)], + }, + 'cp936': { + 'lead': [(0x81, 0xFE)], + 'trail': [(0x40, 0x7E), (0x80, 0xFE)], + }, + 'cp949': { + 'lead': [(0x81, 0xFE)], + 'trail': [(0x41, 0x5A), (0x61, 0x7A), (0x81, 0xFE)], + }, + 'cp950': { + 'lead': [(0x81, 0xFE)], + 'trail': [(0x40, 0x7E), (0xA1, 0xFE)], + }, + 'cp1361': { + 'lead': [(0x84, 0xD3), (0xD8, 0xDE), (0xE0, 0xF9)], + 'trail': [(0x31, 0x7E), (0x81, 0xFE)], + }, +} + +def print_dbcs_byte_ranges(): + for codePage, ranges in DBCSByteRanges.items(): + table = [0]*256 + for start, end in ranges['lead']: + while start <= end: + table[start] = 1 + start += 1 + for start, end in ranges['trail']: + while start <= end: + table[start] |= 2 + start += 1 + valueBit, totalBit, data = MultiStageTable.runLengthEncode(codePage, table) + data = MultiStageTable.dumpArray(data, 20) + print(f'{{{data[0]}}}') + def to_byte_ranges(items): ranges = [] if items: @@ -140,5 +180,6 @@ def print_dbcs_valid_bytes(): print(' lead:', format_byte_ranges(validLead)) print(' trail:', format_byte_ranges(validTrail)) +#print_dbcs_byte_ranges() #print_dbcs_test_char(DBCSTrailKind.All) -print_dbcs_valid_bytes() +#print_dbcs_valid_bytes() diff --git a/scintilla/src/CharClassify.cxx b/scintilla/src/CharClassify.cxx index 1167e1c0ee..20b29f8554 100644 --- a/scintilla/src/CharClassify.cxx +++ b/scintilla/src/CharClassify.cxx @@ -678,10 +678,6 @@ void CharClassify::InitUnicodeData() noexcept { namespace { -inline void SetRange(bool (&bs)[256], int low, int high) noexcept { - memset(bs + low, true, high - low + 1); -} - //dbcs++Autogenerated -- start of section automatically generated // Created with Python 3.13.0a1, Unicode 15.1.0 const uint16_t CharClassifyRLE_CP932[] = { @@ -891,67 +887,49 @@ const uint8_t CharClassify_CP1361Data[] = { DBCSCharClassify::DBCSCharClassify(int codePage_) noexcept: codePage(codePage_), - minTrailByte(0x40), - leadByte{}, - trailByte{} { - // Byte ranges found in Wikipedia articles with relevant search strings in each case + minTrailByte(0x40) { switch (codePage_) { - case 932: + case 932: { // Shift-JIS - SetRange(leadByte, 0x81, 0x9F); - SetRange(leadByte, 0xE0, 0xFC); - // Lead bytes F0 to FC may be a Microsoft addition. - - SetRange(trailByte, 0x40, 0x7E); - SetRange(trailByte, 0x80, 0xFC); + constexpr uint8_t BytesRLE_CP932[] = {252, 4, 254, 4, 6, 127, 254, 6, 119, 12,}; + ExpandRLE(BytesRLE_CP932, leadByte); ExpandRLE(CharClassifyRLE_CP932, classifyMap); - break; + } break; - case 936: + case 936: { // GBK - SetRange(leadByte, 0x81, 0xFE); - - SetRange(trailByte, 0x40, 0x7E); - SetRange(trailByte, 0x80, 0xFE); + constexpr uint8_t BytesRLE_CP936[] = {252, 4, 254, 4, 6, 255, 255, 4,}; + ExpandRLE(BytesRLE_CP936, leadByte); ExpandRLE(CharClassifyRLE_CP936, classifyMap); - break; + } break; - case 949: + case 949: { // Korean Unified Hangul Code, Wansung KS C-5601-1987 minTrailByte = 0x41; - SetRange(leadByte, 0x81, 0xFE); - - SetRange(trailByte, 0x41, 0x5A); - SetRange(trailByte, 0x61, 0x7A); - SetRange(trailByte, 0x81, 0xFE); + constexpr uint8_t BytesRLE_CP949[] = {252, 8, 106, 24, 106, 24, 255, 255, 4,}; + ExpandRLE(BytesRLE_CP949, leadByte); ExpandRLE(CharClassifyRLE_CP949, classifyMap); - break; + } break; - case 950: + case 950: { // Big5 - SetRange(leadByte, 0x81, 0xFE); - - SetRange(trailByte, 0x40, 0x7E); - SetRange(trailByte, 0xA1, 0xFE); + constexpr uint8_t BytesRLE_CP950[] = {252, 4, 254, 8, 129, 255, 127, 4,}; + ExpandRLE(BytesRLE_CP950, leadByte); ExpandRLE(CharClassifyRLE_CP950, classifyMap); - break; + } break; - case 1361: + default: { // Korean Johab, KS C-5601-1992 minTrailByte = 0x31; - SetRange(leadByte, 0x84, 0xD3); - SetRange(leadByte, 0xD8, 0xDE); - SetRange(leadByte, 0xE0, 0xF9); - - SetRange(trailByte, 0x31, 0x7E); - SetRange(trailByte, 0x81, 0xFE); + constexpr uint8_t BytesRLE_CP1361[] = {196, 254, 62, 8, 14, 255, 71, 18, 31, 6, 107, 22, 4,}; + ExpandRLE(BytesRLE_CP1361, leadByte); ExpandRunBlock(CharClassify_CP1361Index, CharClassify_CP1361Data, classifyMap, CharClassify_CP1361IndexBit, CharClassify_CP1361BlockBit); - break; + } break; } } diff --git a/scintilla/src/CharClassify.h b/scintilla/src/CharClassify.h index 66deccef9c..0d6a734162 100644 --- a/scintilla/src/CharClassify.h +++ b/scintilla/src/CharClassify.h @@ -140,10 +140,10 @@ class DBCSCharClassify { explicit DBCSCharClassify(int codePage_) noexcept; bool IsLeadByte(unsigned char ch) const noexcept { - return leadByte[ch]; + return leadByte[ch] & true; } bool IsTrailByte(unsigned char ch) const noexcept { - return trailByte[ch]; + return leadByte[ch] & 2; } CharacterClass ClassifyCharacter(uint32_t ch) const noexcept { @@ -164,8 +164,7 @@ class DBCSCharClassify { private: const int codePage; int minTrailByte; - bool leadByte[256]; - bool trailByte[256]; + uint8_t leadByte[256]; unsigned char classifyMap[0xffff + 1]; }; diff --git a/scintilla/src/Document.cxx b/scintilla/src/Document.cxx index 088659d850..f08a437a65 100644 --- a/scintilla/src/Document.cxx +++ b/scintilla/src/Document.cxx @@ -156,6 +156,7 @@ Document::Document(DocumentOption options) : enteredModification = 0; enteredStyling = 0; enteredReadOnlyCount = 0; + matchesValid = false; insertionSet = false; tabInChars = 8; indentInChars = 0; @@ -164,8 +165,6 @@ Document::Document(DocumentOption options) : tabIndents = true; backspaceUnindents = false; - matchesValid = false; - perLineData[ldMarkers] = std::make_unique(); perLineData[ldLevels] = std::make_unique(); perLineData[ldState] = std::make_unique(); diff --git a/scintilla/src/RESearch.cxx b/scintilla/src/RESearch.cxx index d8cdf5dbee..78897ade95 100644 --- a/scintilla/src/RESearch.cxx +++ b/scintilla/src/RESearch.cxx @@ -479,7 +479,7 @@ const char *RESearch::DoCompile(const char *pattern, size_t length, FindOption f case '[': { /* match char class */ int prevChar = 0; - bool negative = false;/* xor mask -CCL/NCL */ + bool negative = false; i++; ++p;