Skip to content

Commit 2e97b51

Browse files
Add support for diacritics and eszett (#709)
1 parent 8fd2708 commit 2e97b51

File tree

2 files changed

+41
-9
lines changed

2 files changed

+41
-9
lines changed

lib/natural/tokenizers/aggressive_tokenizer_de.js

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,12 @@ THE SOFTWARE.
2323
'use strict'
2424

2525
const Tokenizer = require('./tokenizer')
26-
const util = require('util')
2726

28-
const AggressiveTokenizer = function () {
29-
Tokenizer.call(this)
27+
/**
 * Aggressive tokenizer for German text.
 *
 * Every run of characters that is not a letter (ASCII or German umlaut /
 * eszett), digit, underscore, apostrophe or hyphen is treated as a separator.
 */
class AggressiveTokenizer extends Tokenizer {
  /**
   * Break a string up into an array of tokens by anything non-word.
   *
   * @param {string} text - The German text to tokenize.
   * @returns {string[]} The result of passing the split pieces through
   *   `this.trim` (provided by the base Tokenizer, which is not shown here;
   *   presumably it drops the empty strings that `split` leaves at either
   *   end when the text starts or ends with a separator).
   */
  tokenize (text) {
    // The a-z / A-Z ranges only cover ASCII, so every German letter has to be
    // listed explicitly. The upper-case umlauts (ÄÖÜ) and the capital eszett
    // (ẞ) are included alongside the lower-case forms; otherwise a word such
    // as "Über" would be cut down to "ber".
    return this.trim(text.split(/[^a-zA-Z0-9ßẞäöüÄÖÜ_'-]+/))
  }
}
31-
util.inherits(AggressiveTokenizer, Tokenizer)
3233

3334
module.exports = AggressiveTokenizer
34-
35-
AggressiveTokenizer.prototype.tokenize = function (text) {
36-
// break a string up into an array of tokens by anything non-word
37-
return this.trim(text.split(/[^a-zA-Z0-9_'-]+/))
38-
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
Copyright (c) 2023, Hugo W.L. ter Doest
3+
4+
Permission is hereby granted, free of charge, to any person obtaining a copy
5+
of this software and associated documentation files (the "Software"), to deal
6+
in the Software without restriction, including without limitation the rights
7+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
copies of the Software, and to permit persons to whom the Software is
9+
furnished to do so, subject to the following conditions:
10+
11+
The above copyright notice and this permission notice shall be included in
12+
all copies or substantial portions of the Software.
13+
14+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20+
THE SOFTWARE.
21+
*/
22+
23+
'use strict'
24+
25+
const Tokenizer = require('../lib/natural/tokenizers/aggressive_tokenizer_de')
26+
const tokenizer = new Tokenizer()
27+
28+
// Spec for the German aggressive tokenizer: sentences containing umlauts and
// eszett must come back as whole words with punctuation stripped.
describe('aggressive_tokenizer', function () {
  it('should tokenize strings with diacritics ä, ö and ü, and esszet ß', function () {
    // Each entry pairs an input sentence with the exact token list expected back.
    const samples = [
      {
        sentence: 'Es werden nur Maßnahmen gefördert, die nicht aufgrund einer Rechtsvorschrift umgesetzt werden müssen.',
        tokens: [
          'Es', 'werden', 'nur', 'Maßnahmen', 'gefördert', 'die', 'nicht',
          'aufgrund', 'einer', 'Rechtsvorschrift', 'umgesetzt', 'werden', 'müssen'
        ]
      },
      {
        sentence: 'Anträge sind vor Beginn der jeweiligen Maßnahme zu stellen.',
        tokens: [
          'Anträge', 'sind', 'vor', 'Beginn', 'der', 'jeweiligen', 'Maßnahme',
          'zu', 'stellen'
        ]
      }
    ]

    // Checked in declaration order, one expectation per sample.
    for (const { sentence, tokens } of samples) {
      expect(tokenizer.tokenize(sentence)).toEqual(tokens)
    }
  })
})

0 commit comments

Comments
 (0)