-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclass.cb_text.php
197 lines (171 loc) · 6.72 KB
/
class.cb_text.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
<?php
/* This file is part of cbutil.
* Copyright © 2011-2012 stiftung kulturserver.de ggmbh <[email protected]>
*
* cbutil is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* cbutil is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with cbutil. If not, see <http://www.gnu.org/licenses/>.
*/
require_once 'class.cb_transliterator.php';
/**
 * Collection of static helpers for cleaning, templating, truncating and
 * slugifying text.
 */
class CbText {
    /**
     * Removes all known markup from the text.
     *
     * Currently the only markup handled is the one produced by the URL parser
     * (see removeMakeUrl()).
     *
     * @param string $text
     * @return string plain text
     */
    public static function plain($text) {
        return self::removeMakeUrl($text);
    }

    /**
     * Removes the markup produced by the URL parser and replaces it with a plain
     * text version.
     *
     * The markup has the shape "[label]http..." or "[label.intern]http...";
     * the optional ".intern" suffix is not part of the extracted label.
     *
     * @param string $text
     * @param string $format any combination of "{TEXT}" and "{URL}"; every
     *                       other character (including "$" and "\") is emitted
     *                       literally
     * @return string processed text
     */
    public static function removeMakeUrl($text, $format = '{TEXT}') {
        // The format ends up as a PCRE replacement string, so "$" and "\" typed
        // by the caller have to be neutralised before the real back-references
        // are injected. The braced form "${n}" keeps a digit that directly
        // follows a placeholder from being read as part of the group number.
        $replacement = self::inject(self::escapeReplacement($format), array(
            'text' => '${1}',
            'url' => '${2}'
        ));
        return preg_replace('/\[(.+?)(?:\.intern|)\](http[^\s]+)/', $replacement, $text);
    }

    /**
     * removes invalid UTF8 characters from the given string. The algorithm was
     * proposed on http://webcollab.sourceforge.net/unicode.html in
     * section "Character Validation".
     *
     * The first pass deletes, byte-wise:
     *  - the listed ASCII control bytes and DEL (0x7F),
     *  - continuation bytes (0x80-0xBF) at the start of the string or directly
     *    after an ASCII byte,
     *  - the bytes 0xC0, 0xC1 and 0xF0-0xFF together with any continuation
     *    bytes following them (this includes every four-byte sequence),
     *  - two-byte and three-byte lead bytes followed by the wrong number of
     *    continuation bytes.
     * The second pass replaces overlong three-byte forms (0xE0 0x80-0x9F ..)
     * and the range 0xED 0xA0-0xBF .. with "?".
     *
     * NOTE(review): the control-byte class ends at 0x19 (and lists 0x10 twice),
     * so 0x1A-0x1F are kept. This mirrors the referenced algorithm and is left
     * unchanged on purpose - confirm whether that is intended.
     *
     * @param string text corrupted by invalid UTF8 characters
     * @return string adjusted text
     */
    public static function removeInvalidUtf8($text) {
        $text = preg_replace('/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]' .
            '|(?<=^|[\x00-\x7F])[\x80-\xBF]+' .
            '|([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*' .
            '|[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})' .
            '|[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/', '', $text);
        return preg_replace('/\xE0[\x80-\x9F][\x80-\xBF]' .
            '|\xED[\xA0-\xBF][\x80-\xBF]/S', '?', $text);
    }

    /**
     * Removes invalid XML 1.0 characters.
     * See http://en.wikipedia.org/wiki/Valid_characters_in_XML
     *
     * Strips every C0 control character except tab (0x09), line feed (0x0A)
     * and carriage return (0x0D), i.e. 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F.
     * Only these single-byte characters are handled; other code points are
     * passed through untouched.
     *
     * @param string $str
     * @return string replaced string
     */
    public static function removeInvalidXML1Characters($str) {
        // The upper bound is 0x1F (hex), not 0x19: 0x1A-0x1F are forbidden in
        // XML 1.0 as well.
        return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', '', $str);
    }

    /**
     * Replaces uppercase keys enclosed in "{" and "}" with the provided values.
     *
     * Keys are upper-cased before matching, so array('name' => 'x') replaces
     * "{NAME}". All placeholders are substituted in one pass: a value that
     * itself contains "{SOMEKEY}" is inserted verbatim and not expanded again,
     * which makes the result independent of the order of $injections.
     *
     * @param string $text
     * @param array $injections map of keys and their replacements
     * @return string processed text
     */
    public static function inject($text, array $injections) {
        $pairs = array();
        foreach ($injections as $key => $replacement) {
            $placeholder = '{' . strtoupper((string) $key) . '}';
            // Keys that differ only in case map to the same placeholder; the
            // first one wins.
            if (!array_key_exists($placeholder, $pairs)) {
                $pairs[$placeholder] = $replacement;
            }
        }
        if ($pairs === array()) {
            return $text;
        }
        return strtr($text, $pairs);
    }

    /**
     * Cuts a text off if it is longer than a given maximum and appends an
     * indicator for omission.
     *
     * Whitespace at the end of the kept part is removed before the omission
     * indicator is appended. Texts that fit are returned unchanged.
     *
     * @param string $text
     * @param int $length maximum number of characters to keep
     * @param string $omission
     * @return string truncated text
     */
    public static function truncate($text, $length, $omission = '...') {
        if (mb_strlen($text) <= $length) {
            return $text;
        }
        return rtrim(mb_substr($text, 0, $length)) . $omission;
    }

    /**
     * Cuts a text off if it is longer than a given maximum and appends an
     * indicator for omission. Words are preserved.
     *
     * If the cut falls inside a word, the remainder of that word (up to the
     * next whitespace or "-") is kept, so the result may be longer than
     * $length. Trailing whitespace and trailing commas are dropped before the
     * omission indicator is appended.
     *
     * @param string $text
     * @param int $length
     * @param string $omission
     * @return string truncated text
     */
    public static function truncateWords($text, $length, $omission = ' ...') {
        if (mb_strlen($text) <= $length) {
            return $text;
        }

        $trailingWhitespaceRe = '/\s+$/';
        $leadingWordsRe = '/^[^\s-]*/';
        $trailingInsignificantCharsRe = '/[,]+$/';

        $start = mb_substr($text, 0, $length);
        if (preg_match($trailingWhitespaceRe, $start)) {
            // The cut landed right after a word: only strip the whitespace.
            $result = preg_replace($trailingWhitespaceRe, '', $start);
        } else {
            // The cut may have landed inside a word: append the rest of it.
            // The pattern can match the empty string, so $matches[0] is
            // always set.
            $matches = array();
            preg_match($leadingWordsRe, mb_substr($text, $length), $matches);
            $result = $start . $matches[0];
        }

        return preg_replace($trailingInsignificantCharsRe, '', $result) . $omission;
    }

    /**
     * Creates a slug representation of the string. It is meant to be used in
     * plain text ASCII, unicode-hostile environments like URLs, CSS rules and
     * cb-ml labels.
     *
     * Supported keys of $optionalParams:
     *  - "delimiter":  string placed between words (default "-"); it is
     *                  inserted literally,
     *  - "max_length": maximum length of the slug (default 42); null or a
     *                  value <= 0 disables truncation.
     * For backward compatibility a non-array second argument is treated as
     * the delimiter, in which case truncation is disabled.
     *
     * @param string $text
     * @param array $optionalParams
     * @return string URL-safe representation
     */
    public static function slugify($text, $optionalParams = null) {
        // Handle the legacy invocation where the second parameter was the
        // delimiter and no maximum length existed.
        if ($optionalParams === null) {
            $optionalParams = array();
        } else if (!is_array($optionalParams)) {
            $optionalParams = array(
                'delimiter' => $optionalParams,
                // Alternative delimiters were mainly used for cb-ml-labels, which
                // is why we disable the length in that case, as it is important
                // that they do not get truncated in older projects. Otherwise, some
                // things could break.
                'max_length' => null
            );
        }

        // Merge with defaults.
        $optionalParams = array_merge(array(
            'delimiter' => '-',
            'max_length' => 42 // interestingly, this seems to be a common default
        ), $optionalParams);
        $delimiter = $optionalParams['delimiter'];
        $maxLength = $optionalParams['max_length'];

        // Resolve HTML entities.
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Make sure that no special chars are in there.
        // NOTE(review): presumably yields an ASCII approximation - confirm
        // against CbTransliterator.
        $text = CbTransliterator::translit($text);

        // Remove all characters that cannot be represented unescaped, then
        // collapse whitespace runs into the delimiter. The delimiter is
        // escaped because it is used as a PCRE replacement string, where "$"
        // and "\" would otherwise be read as back-references.
        $text = preg_replace(
            array('/[^a-z\d\s]/i', '/\s+/'),
            array(' ', self::escapeReplacement($delimiter)),
            strtolower($text)
        );

        // Truncate the result if needed. Try to truncate by words. If that does
        // not work due to the nature of the string, truncate by characters.
        if ($maxLength !== null && $maxLength > 0) {
            $truncatedText = self::truncateWords($text, $maxLength, '');
            if (mb_strlen($truncatedText) > $maxLength) {
                $truncatedText = self::truncate($text, $maxLength, '');
            }
            $text = $truncatedText;
        }

        // Make sure that there are no delimiters at beginning and end.
        return trim($text, $delimiter);
    }

    /**
     * Escapes a literal string for use as a preg_replace() replacement, so
     * that "$" and "\" are emitted as-is instead of starting a back-reference.
     *
     * @param string $literal
     * @return string escaped replacement string
     */
    private static function escapeReplacement($literal) {
        return strtr((string) $literal, array('\\' => '\\\\', '$' => '\\$'));
    }
}