1414from __future__ import print_function
1515from __future__ import unicode_literals
1616from abc import abstractproperty
17+ import collections
1718import logging
1819import re
1920
3435from ..nlp .pos import ChemCrfPosTagger
3536from ..nlp .tokenize import ChemSentenceTokenizer , ChemWordTokenizer , regex_span_tokenize
3637from ..text import CONTROL_RE
37- from ..utils import memoized_property
38+ from ..utils import memoized_property , python_2_unicode_compatible
3839from .element import BaseElement
3940
4041
4142log = logging .getLogger (__name__ )
4243
4344
44- @six . python_2_unicode_compatible
45+ @python_2_unicode_compatible
4546class BaseText (BaseElement ):
4647 """Abstract base class for a text Document Element."""
4748
@@ -59,7 +60,7 @@ def __init__(self, text, word_tokenizer=None, lexicon=None, abbreviation_detecto
5960 self .parsers = parsers if parsers is not None else self .parsers
6061
def __repr__(self):
    """Return a debug representation of this element.

    The result has the form ``ClassName(id=..., references=..., text=...)``,
    with each value rendered through ``%r``.

    The text is formatted directly instead of being UTF-8 encoded first:
    encoding would hand ``%r`` a bytes object, which Python 3 renders as a
    ``b'...'`` literal with escaped non-ASCII bytes.
    """
    return '%s(id=%r, references=%r, text=%r)' % (
        self.__class__.__name__,
        self.id,
        self.references,
        self._text,
    )
6364
def __str__(self):
    """Return the stored text of this element unchanged."""
    text = self._text
    return text
@@ -113,7 +114,7 @@ def _repr_html_(self):
113114 return self .text
114115
115116
116- class Text (BaseText ):
117+ class Text (collections . Sequence , BaseText ):
117118 """A passage of text, comprising one or more sentences."""
118119
119120 sentence_tokenizer = ChemSentenceTokenizer ()
@@ -129,6 +130,12 @@ def __init__(self, text, sentence_tokenizer=None, word_tokenizer=None, lexicon=N
129130 super (Text , self ).__init__ (text , word_tokenizer = word_tokenizer , lexicon = lexicon , abbreviation_detector = abbreviation_detector , pos_tagger = pos_tagger , ner_tagger = ner_tagger , parsers = None , ** kwargs )
130131 self .sentence_tokenizer = sentence_tokenizer if sentence_tokenizer is not None else self .sentence_tokenizer
131132
def __getitem__(self, index):
    """Return whatever ``self.sentences`` yields for ``index``.

    Indexing is delegated entirely to ``self.sentences``, so the accepted
    index types and any raised errors are those of that object.
    """
    sentences = self.sentences
    return sentences[index]
135+
def __len__(self):
    """Return the length of ``self.sentences``."""
    sentences = self.sentences
    return len(sentences)
138+
132139 @memoized_property
133140 def sentences (self ):
134141 """Return a list of Sentences that make up this text passage."""
@@ -310,7 +317,7 @@ def __init__(self, text, start=0, end=None, word_tokenizer=None, lexicon=None, a
310317 self .end = end if end is not None else len (text )
311318
def __repr__(self):
    """Return a debug representation of the form ``ClassName(text, start, end)``.

    Each value is rendered through ``%r``. The text is passed as-is rather
    than UTF-8 encoded, because an encoded value is a bytes object and
    Python 3 would print it as a ``b'...'`` literal.
    """
    return '%s(%r, %r, %r)' % (
        self.__class__.__name__,
        self._text,
        self.start,
        self.end,
    )
314321
315322 @memoized_property
316323 def tokens (self ):
@@ -532,7 +539,7 @@ def __add__(self, other):
532539 return NotImplemented
533540
534541
535- @six . python_2_unicode_compatible
542+ @python_2_unicode_compatible
536543class Span (object ):
537544 """A text span within a sentence."""
538545
@@ -545,7 +552,7 @@ def __init__(self, text, start, end):
545552 """The end offset of this token in the original text."""
546553
def __repr__(self):
    """Return a debug representation of the form ``ClassName(text, start, end)``.

    Each value is rendered through ``%r``. The text is passed as-is rather
    than UTF-8 encoded, because an encoded value is a bytes object and
    Python 3 would print it as a ``b'...'`` literal.
    """
    return '%s(%r, %r, %r)' % (
        self.__class__.__name__,
        self.text,
        self.start,
        self.end,
    )
549556
def __str__(self):
    """Return this span's ``text`` attribute unchanged."""
    text = self.text
    return text
0 commit comments