Skip to content

Commit

Permalink
Auto detect common CSV delimiter, issue #892.
Browse files Browse the repository at this point in the history
  • Loading branch information
zufuliu committed Oct 26, 2024
1 parent 1c63449 commit 4243079
Showing 1 changed file with 100 additions and 2 deletions.
102 changes: 100 additions & 2 deletions src/Styles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1934,6 +1934,98 @@ PEDITLEXER Style_SniffShebang(char *pchText) noexcept {
return nullptr;
}

void Style_SniffCSV() noexcept {
const Sci_Line lines = SciCall_GetLineCount();
const Sci_Position endPos = SciCall_PositionFromLine(min<Sci_Line>(lines, 2));
const char *ptr = SciCall_GetRangePointer(0, endPos);
if (ptr == nullptr) { // empty document
return;
}
#if 0
StopWatch watch;
watch.Start();
#endif
int table[32*2]{};
int offset = 0;
const char * const end = ptr + endPos;
#if defined(_WIN64)
uint64_t maskSet = 0;
while (ptr < end) {
const uint8_t ch = *ptr++;
constexpr uint64_t mask = (1 << '\r') | (1 << '\n') | (1 << '\t') | (UINT64_C(1) << ',') | (UINT64_C(1) << ';');
if (ch == '|' || (ch <= ';' && (mask & (UINT64_C(1) << ch)) != 0)) {
if (ch == '\r' || ch == '\n') {
offset = 32;
if (ch == '\r' && *ptr == '\n') {
++ptr;
}
} else {
const unsigned index = offset + (ch & 31);
maskSet |= UINT64_C(1) << index;
table[index] += 1;
}
}
}
const uint32_t mask = (maskSet & UINT32_MAX) & (maskSet >> 32);
#else
uint32_t maskSet[2]{};
while (ptr < end) {
const uint8_t ch = *ptr++;
if (ch == ',' || ch == '\t' || ch == ';' || ch == '|') {
const int index = (ch & 31);
maskSet[offset] |= 1U << index;
table[index] += 1;
} else if (ch == '\r' || ch == '\n') {
offset = 1;
if (ch == '\r' && *ptr == '\n') {
++ptr;
}
}
}
const uint32_t mask = maskSet[0] & maskSet[1];
#endif // _WIN64

constexpr uint32_t preferred = ';' | ('|' << 8) | ('\t' << 16) | (',' << 24);
uint8_t delimiter = '\0';
if (mask != 0) { // delimiter occurs in both row
uint32_t value = preferred;
int maxColumn = 0;
while (value != 0) {
const int index = value & 31;
if (mask & (1U << index)) {
const int first = table[index];
const int second = table[32 + index];
const int column = min(first, second);
if (column > maxColumn && column >= 4*abs(first - second)) { // 75%
maxColumn = column;
delimiter = value & 0xff;
}
}
value >>= 8;
}
}
if (delimiter == 0) {
uint32_t value = preferred;
int maxColumn = 0;
while (value != 0) {
const int index = value & 31;
const int column = table[index] + table[32 + index];
if (column > maxColumn) {
maxColumn = column;
delimiter = value & 0xff;
}
value >>= 8;
}
}
#if 0
watch.Stop();
watch.ShowLog(__func__);
#endif
if (delimiter != 0) {
iCsvOption = ('\"' << 8) | delimiter;
}
}

//=============================================================================
// find lexer from <!DOCTYPE type PUBLIC > or <%@ Page Language="lang" %>
// Style_GetDocTypeLanguage()
Expand Down Expand Up @@ -2606,6 +2698,9 @@ bool Style_SetLexerFromFile(LPCWSTR lpszFile) noexcept {
pLexNew = pLexArray[iDefaultLexerIndex];
}
// Apply the new lexer
if (pLexNew->iLexer == SCLEX_CSV) {
Style_SniffCSV();
}
Style_SetLexer(pLexNew, true);
return bFound;
}
Expand All @@ -2620,6 +2715,9 @@ void Style_SetLexerFromName(LPCWSTR lpszFile, LPCWSTR lpszName) noexcept {
pLexNew = Style_MatchLexer(lpszName, true);
}
if (pLexNew != nullptr) {
if (pLexNew->iLexer == SCLEX_CSV) {
Style_SniffCSV();
}
Style_SetLexer(pLexNew, true);
} else {
Style_SetLexerFromFile(lpszFile);
Expand Down Expand Up @@ -2796,7 +2894,7 @@ void Style_SetLexerByLangIndex(int lang) noexcept {
}
if (pLex != nullptr) {
if (bLexerChanged || pLex != pLexCurrent || langIndex != np2LexLangIndex) {
Style_SetLexer(pLex, (pLex != pLexCurrent) + true);
Style_SetLexer(pLex, (pLex != pLexCurrent) + TRUE);
}
}
}
Expand Down Expand Up @@ -5099,7 +5197,7 @@ void Style_SelectLexerDlg(HWND hwnd, bool favorite) noexcept {
if (pLexCurrent->iLexer == SCLEX_CSV) {
SelectCSVOptionsDlg();
}
Style_SetLexer(pLexCurrent, (pLex != pLexCurrent) + true);
Style_SetLexer(pLexCurrent, (pLex != pLexCurrent) + TRUE);
}
} else {
if (favorite) {
Expand Down

0 comments on commit 4243079

Please sign in to comment.