From 3c8699f8854b915bb22f14182054279f7c515b94 Mon Sep 17 00:00:00 2001
From: zufuliu <zufuliu@gmail.com>
Date: Thu, 23 Nov 2023 18:07:45 +0800
Subject: [PATCH] Merge DBCS lead type and trail byte array.

---
 scintilla/scripts/DBCS.py      | 43 ++++++++++++++++++++++-
 scintilla/src/CharClassify.cxx | 64 +++++++++++-----------------------
 scintilla/src/CharClassify.h   |  7 ++--
 scintilla/src/Document.cxx     |  3 +-
 scintilla/src/RESearch.cxx     |  2 +-
 5 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/scintilla/scripts/DBCS.py b/scintilla/scripts/DBCS.py
index 9d10349e8d..114d07e958 100644
--- a/scintilla/scripts/DBCS.py
+++ b/scintilla/scripts/DBCS.py
@@ -1,5 +1,6 @@
 import sys
 from enum import IntFlag
+import MultiStageTable
 
 DBCSCodePages = [
 	'cp932', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
@@ -9,6 +10,45 @@
 	'cp1361', 'johab',
 ]
 
+# Byte ranges found in Wikipedia articles with relevant search strings in each case
+DBCSByteRanges = {  # code page -> inclusive (first, last) lead and trail byte ranges
+	'cp932': {  # Shift-JIS
+		'lead': [(0x81, 0x9F), (0xE0, 0xFC)],
+		'trail': [(0x40, 0x7E), (0x80, 0xFC)],
+	},
+	'cp936': {  # GBK
+		'lead': [(0x81, 0xFE)],
+		'trail': [(0x40, 0x7E), (0x80, 0xFE)],
+	},
+	'cp949': {  # Korean Unified Hangul Code
+		'lead': [(0x81, 0xFE)],
+		'trail': [(0x41, 0x5A), (0x61, 0x7A), (0x81, 0xFE)],
+	},
+	'cp950': {  # Big5
+		'lead': [(0x81, 0xFE)],
+		'trail': [(0x40, 0x7E), (0xA1, 0xFE)],
+	},
+	'cp1361': {  # Korean Johab
+		'lead': [(0x84, 0xD3), (0xD8, 0xDE), (0xE0, 0xF9)],
+		'trail': [(0x31, 0x7E), (0x81, 0xFE)],
+	},
+}
+
+def print_dbcs_byte_ranges():
+	"""Print one brace-wrapped, run-length encoded lead/trail byte table per entry of DBCSByteRanges."""
+	# table[byte] flags: bit 0 (1) = lead byte, bit 1 (2) = trail byte
+	for codePage, ranges in DBCSByteRanges.items():
+		table = [0]*256
+		for first, last in ranges['lead']:
+			for byte in range(first, last + 1):  # ranges are inclusive
+				table[byte] = 1
+		for first, last in ranges['trail']:
+			for byte in range(first, last + 1):
+				table[byte] |= 2
+		_, _, data = MultiStageTable.runLengthEncode(codePage, table)  # only the encoded data is used
+		data = MultiStageTable.dumpArray(data, 20)
+		print(f'{{{data[0]}}}')
+
 def to_byte_ranges(items):
 	ranges = []
 	if items:
@@ -140,5 +180,6 @@ def print_dbcs_valid_bytes():
 		print('    lead:', format_byte_ranges(validLead))
 		print('   trail:', format_byte_ranges(validTrail))
 
+#print_dbcs_byte_ranges()
 #print_dbcs_test_char(DBCSTrailKind.All)
-print_dbcs_valid_bytes()
+#print_dbcs_valid_bytes()
diff --git a/scintilla/src/CharClassify.cxx b/scintilla/src/CharClassify.cxx
index 1167e1c0ee..20b29f8554 100644
--- a/scintilla/src/CharClassify.cxx
+++ b/scintilla/src/CharClassify.cxx
@@ -678,10 +678,6 @@ void CharClassify::InitUnicodeData() noexcept {
 
 namespace {
 
-inline void SetRange(bool (&bs)[256], int low, int high) noexcept {
-	memset(bs + low, true, high - low + 1);
-}
-
 //dbcs++Autogenerated -- start of section automatically generated
 // Created with Python 3.13.0a1, Unicode 15.1.0
 const uint16_t CharClassifyRLE_CP932[] = {
@@ -891,67 +887,49 @@ const uint8_t CharClassify_CP1361Data[] = {
 
 DBCSCharClassify::DBCSCharClassify(int codePage_) noexcept:
 	codePage(codePage_),
-	minTrailByte(0x40),
-	leadByte{},
-	trailByte{} {
-	// Byte ranges found in Wikipedia articles with relevant search strings in each case
+	minTrailByte(0x40) {
 	switch (codePage_) {
-	case 932:
+	case 932: {
 		// Shift-JIS
-		SetRange(leadByte, 0x81, 0x9F);
-		SetRange(leadByte, 0xE0, 0xFC);
-		// Lead bytes F0 to FC may be a Microsoft addition.
-
-		SetRange(trailByte, 0x40, 0x7E);
-		SetRange(trailByte, 0x80, 0xFC);
+		constexpr uint8_t BytesRLE_CP932[] = {252, 4, 254, 4, 6, 127, 254, 6, 119, 12,}; // lead 0x81-0x9F, 0xE0-0xFC; trail 0x40-0x7E, 0x80-0xFC
+		ExpandRLE(BytesRLE_CP932, leadByte); // leadByte[]: bit 0 = lead byte, bit 1 = trail byte
 
 		ExpandRLE(CharClassifyRLE_CP932, classifyMap);
-		break;
+	} break;
 
-	case 936:
+	case 936: {
 		// GBK
-		SetRange(leadByte, 0x81, 0xFE);
-
-		SetRange(trailByte, 0x40, 0x7E);
-		SetRange(trailByte, 0x80, 0xFE);
+		constexpr uint8_t BytesRLE_CP936[] = {252, 4, 254, 4, 6, 255, 255, 4,}; // lead 0x81-0xFE; trail 0x40-0x7E, 0x80-0xFE
+		ExpandRLE(BytesRLE_CP936, leadByte);
 
 		ExpandRLE(CharClassifyRLE_CP936, classifyMap);
-		break;
+	} break;
 
-	case 949:
+	case 949: {
 		// Korean Unified Hangul Code, Wansung KS C-5601-1987
 		minTrailByte = 0x41;
-		SetRange(leadByte, 0x81, 0xFE);
-
-		SetRange(trailByte, 0x41, 0x5A);
-		SetRange(trailByte, 0x61, 0x7A);
-		SetRange(trailByte, 0x81, 0xFE);
+		constexpr uint8_t BytesRLE_CP949[] = {252, 8, 106, 24, 106, 24, 255, 255, 4,}; // lead 0x81-0xFE; trail 0x41-0x5A, 0x61-0x7A, 0x81-0xFE
+		ExpandRLE(BytesRLE_CP949, leadByte);
 
 		ExpandRLE(CharClassifyRLE_CP949, classifyMap);
-		break;
+	} break;
 
-	case 950:
+	case 950: {
 		// Big5
-		SetRange(leadByte, 0x81, 0xFE);
-
-		SetRange(trailByte, 0x40, 0x7E);
-		SetRange(trailByte, 0xA1, 0xFE);
+		constexpr uint8_t BytesRLE_CP950[] = {252, 4, 254, 8, 129, 255, 127, 4,}; // lead 0x81-0xFE; trail 0x40-0x7E, 0xA1-0xFE
+		ExpandRLE(BytesRLE_CP950, leadByte);
 
 		ExpandRLE(CharClassifyRLE_CP950, classifyMap);
-		break;
+	} break;
 
-	case 1361:
+	default: { // 1361; any code page not listed above is also handled as Johab
 		// Korean Johab, KS C-5601-1992
 		minTrailByte = 0x31;
-		SetRange(leadByte, 0x84, 0xD3);
-		SetRange(leadByte, 0xD8, 0xDE);
-		SetRange(leadByte, 0xE0, 0xF9);
-
-		SetRange(trailByte, 0x31, 0x7E);
-		SetRange(trailByte, 0x81, 0xFE);
+		constexpr uint8_t BytesRLE_CP1361[] = {196, 254, 62, 8, 14, 255, 71, 18, 31, 6, 107, 22, 4,}; // lead 0x84-0xD3, 0xD8-0xDE, 0xE0-0xF9; trail 0x31-0x7E, 0x81-0xFE
+		ExpandRLE(BytesRLE_CP1361, leadByte);
 
 		ExpandRunBlock(CharClassify_CP1361Index, CharClassify_CP1361Data, classifyMap,
 			CharClassify_CP1361IndexBit, CharClassify_CP1361BlockBit);
-		break;
+	} break;
 	}
 }
diff --git a/scintilla/src/CharClassify.h b/scintilla/src/CharClassify.h
index 66deccef9c..0d6a734162 100644
--- a/scintilla/src/CharClassify.h
+++ b/scintilla/src/CharClassify.h
@@ -140,10 +140,10 @@ class DBCSCharClassify {
 	explicit DBCSCharClassify(int codePage_) noexcept;
 
 	bool IsLeadByte(unsigned char ch) const noexcept {
-		return leadByte[ch];
+		return (leadByte[ch] & 1) != 0; // bit 0 marks a lead byte
 	}
 	bool IsTrailByte(unsigned char ch) const noexcept {
-		return trailByte[ch];
+		return leadByte[ch] & 2; // bit 1 marks a trail byte
 	}
 
 	CharacterClass ClassifyCharacter(uint32_t ch) const noexcept {
@@ -164,8 +164,7 @@ class DBCSCharClassify {
 private:
 	const int codePage;
 	int minTrailByte;
-	bool leadByte[256];
-	bool trailByte[256];
+	uint8_t leadByte[256]; // per byte value: bit 0 = lead byte, bit 1 = trail byte
 	unsigned char classifyMap[0xffff + 1];
 };
 
diff --git a/scintilla/src/Document.cxx b/scintilla/src/Document.cxx
index 088659d850..f08a437a65 100644
--- a/scintilla/src/Document.cxx
+++ b/scintilla/src/Document.cxx
@@ -156,6 +156,7 @@ Document::Document(DocumentOption options) :
 	enteredModification = 0;
 	enteredStyling = 0;
 	enteredReadOnlyCount = 0;
+	matchesValid = false;
 	insertionSet = false;
 	tabInChars = 8;
 	indentInChars = 0;
@@ -164,8 +165,6 @@ Document::Document(DocumentOption options) :
 	tabIndents = true;
 	backspaceUnindents = false;
 
-	matchesValid = false;
-
 	perLineData[ldMarkers] = std::make_unique<LineMarkers>();
 	perLineData[ldLevels] = std::make_unique<LineLevels>();
 	perLineData[ldState] = std::make_unique<LineState>();
diff --git a/scintilla/src/RESearch.cxx b/scintilla/src/RESearch.cxx
index d8cdf5dbee..78897ade95 100644
--- a/scintilla/src/RESearch.cxx
+++ b/scintilla/src/RESearch.cxx
@@ -479,7 +479,7 @@ const char *RESearch::DoCompile(const char *pattern, size_t length, FindOption f
 
 		case '[': {               /* match char class */
 			int prevChar = 0;
-			bool negative = false;/* xor mask -CCL/NCL */
+			bool negative = false;
 
 			i++;
 			++p;