Skip to content

Commit 1b830b1

Browse files
Fixes issue #689 (#705)
* Fixed issue #689
* Fixed indentation
* Trailing whitespace
1 parent d41aca5 commit 1b830b1

File tree

2 files changed

+16
-2
lines changed

2 files changed

+16
-2
lines changed

lib/natural/tokenizers/sentence_tokenizer.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ util.inherits(SentenceTokenizer, Tokenizer)
3434

3535
SentenceTokenizer.prototype.tokenize = function (text) {
3636
// break string up into sentences based on punctuation and quotation marks
37-
let tokens = text.match(/(?<=\s+|^)["''"[({]?(.*?[.?!]|[^.?!]+)(\s[.?!])*["''"\])}]?(?=\s+|$)/g)
37+
// let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?.*?[.?!…](\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)
38+
let tokens = text.match(/(?<=\s+|^)["''"[({]?(.*?[.?!]|.+)(\s[.?!])*["''"\])}]?(?=\s+|$)/g)
3839

3940
DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens)
4041

spec/sentence_tokenizer_spec.js

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,18 +159,31 @@ describe('sentence_tokenizer', function () {
159159
])
160160
})
161161

162-
it('should handle text with the ellipsis symbol … and it should handle last sentence without punctuation (issue #648)', function () {
162+
it('Should handle text with the ellipsis symbol … (issue #648)', function () {
163163
expect(
164164
tokenizer.tokenize('We’re heading for a catastrophic global temperature rise… Fires are blazing from the Amazon to the Arctic.')
165165
).toEqual([
166166
'We’re heading for a catastrophic global temperature rise…',
167167
'Fires are blazing from the Amazon to the Arctic.'
168168
])
169+
})
170+
it('It should handle last sentence without punctuation (issue #648)', function () {
169171
expect(
170172
tokenizer.tokenize('We’re heading for a catastrophic global temperature rise. Fires are blazing from the Amazon to the Arctic')
171173
).toEqual([
172174
'We’re heading for a catastrophic global temperature rise.',
173175
'Fires are blazing from the Amazon to the Arctic'
174176
])
175177
})
178+
it('It should handle the example from issue #689 correctly', function () {
179+
const testInput = `
180+
This is some test content.
181+
182+
We're trying to figure out variations in versions of the package.
183+
`.trim()
184+
expect(tokenizer.tokenize(testInput)).toEqual([
185+
'This is some test content.',
186+
'We\'re trying to figure out variations in versions of the package.'
187+
])
188+
})
176189
})

0 commit comments

Comments
 (0)