-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcppure.cpp
153 lines (119 loc) · 3.16 KB
/
cppure.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#include "cppure.h"
#include <cctype>
#include <cstddef>
#include <fstream>
#include <set>
#include <string>
#include <vector>
using namespace std;
namespace cppure
{
    // Matching is done one word at a time, where a word is a maximal run of
    // ASCII characters that are neither whitespace nor punctuation. A wordlist
    // entry that contains whitespace, punctuation or non-ASCII bytes (for
    // example "a b") can therefore never match anything.
    //
    // The list is a mutable global shared by every function in this namespace.
    std::vector<std::string> wordlist;

    // Scanner state used by cleanse().
    enum state {
        WaitForWord,    // between words: characters are copied through as-is
        ProcessingWord  // inside a word: characters are collected in a buffer
    };

    namespace
    {
        // True when `c` can be part of a word: an ASCII byte that is neither
        // whitespace nor punctuation. The <cctype> classifiers are only called
        // with values representable as unsigned char; passing a negative char
        // to them is undefined behaviour.
        bool is_word_char(char c)
        {
            const unsigned char uc = static_cast<unsigned char>(c);
            return uc < 128 && !std::isspace(uc) && !std::ispunct(uc);
        }

        // Lowercases one byte without ever handing a negative value to tolower.
        int lower(char c)
        {
            return std::tolower(static_cast<unsigned char>(c));
        }
    }

    // Replaces the contents of `wordlist` with the lines of the file at `path`.
    //
    // The existing list is cleared before the file is opened, so it is empty
    // after a failed load. A trailing '\r' (CRLF line ending) is stripped and
    // empty lines are skipped, since neither could ever match a word.
    //
    // Returns false if the file cannot be opened, true otherwise.
    bool load_wordlist(std::string const& path)
    {
        wordlist.clear();

        std::ifstream ifs(path);
        if (!ifs.good()) return false;

        std::string word;
        // Testing the result of getline (rather than good() before the read)
        // avoids appending a bogus entry after the final line.
        while (std::getline(ifs, word)) {
            if (!word.empty() && word.back() == '\r') word.pop_back();
            if (word.empty()) continue;
            wordlist.push_back(word);
        }
        return true;
    }

    // Returns true when `a` and `b` have the same length and are equal after
    // lowercasing every character with tolower.
    bool icase_match(std::string const& a, std::string const& b)
    {
        if (a.size() != b.size()) return false;

        for (std::size_t j = 0; j < a.size(); ++j) {
            if (lower(a[j]) != lower(b[j])) return false;
        }
        return true;
    }

    // Looks for `buf` among the wordlist entries whose indices are listed in
    // `possible`, comparing case-insensitively.
    //
    // Returns the first matching index in the set's iteration order (ascending),
    // or -1 if none matches. Indices outside the current wordlist are
    // ignored instead of being dereferenced.
    int icase_match_any(std::string const& buf, std::set<int> const& possible)
    {
        for (int i : possible) {
            if (i < 0 || static_cast<std::size_t>(i) >= wordlist.size()) continue;
            if (icase_match(buf, wordlist[i])) return i;
        }
        return -1;
    }

    namespace
    {
        // Appends the finished word in `buf` to `out`, replaced by one '*' per
        // character when it matches any wordlist entry, then empties `buf`.
        void emit_word(std::string& buf, std::string& out)
        {
            bool impure = false;
            for (std::string const& entry : wordlist) {
                if (icase_match(buf, entry)) {
                    impure = true;
                    break;
                }
            }

            if (impure) {
                out.append(buf.size(), '*');
            } else {
                out += buf;
            }
            buf.clear();
        }
    }

    // Returns a copy of `str` in which every word that matches a wordlist entry
    // (case-insensitively) is replaced by asterisks of the same length.
    //
    // Whitespace, punctuation and bytes >= 0x80 act as word separators and are
    // copied to the output unchanged, so the result always has the same length
    // as the input.
    std::string cleanse(std::string const& str)
    {
        std::string ret;
        ret.reserve(str.size());

        std::string buf;
        state st = WaitForWord;

        // Walk once through the input string.
        for (char c : str) {
            if (is_word_char(c)) {
                st = ProcessingWord;
                buf += c;
                continue;
            }

            // A separator ends the word currently being collected.
            if (st == ProcessingWord) {
                emit_word(buf, ret);
                st = WaitForWord;
            }
            ret += c;
        }

        // The input may end in the middle of a word.
        if (st == ProcessingWord) {
            emit_word(buf, ret);
        }
        return ret;
    }
}