12
12
import re
13
13
import uuid
14
14
from types import MappingProxyType
15
- from typing import Any , Callable , Dict , FrozenSet , List , Optional , Tuple , Union
15
+ from typing import Any , Callable , Dict , FrozenSet , List , Optional , Sequence , Tuple , Union , cast
16
16
17
- from typing_extensions import ParamSpec , TypedDict
17
+ from typing_extensions import ParamSpec , TypeAlias , TypedDict
18
18
19
19
from unstructured .documents .coordinates import (
20
20
TYPE_TO_COORDINATE_SYSTEM_MAP ,
24
24
from unstructured .partition .utils .constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
25
25
from unstructured .utils import lazyproperty
26
26
27
+ Point : TypeAlias = Tuple [float , float ]
28
+ Points : TypeAlias = Tuple [Point , ...]
29
+
27
30
28
31
class NoID (abc .ABC ):
29
32
"""Class to indicate that an element does not have an ID."""
@@ -61,10 +64,10 @@ def from_dict(cls, input_dict: Dict[str, Any]):
61
64
class CoordinatesMetadata :
62
65
"""Metadata fields that pertain to the coordinates of the element."""
63
66
64
- points : Tuple [ Tuple [ float , float ], ... ]
65
- system : CoordinateSystem
67
+ points : Optional [ Points ]
68
+ system : Optional [ CoordinateSystem ]
66
69
67
- def __init__ (self , points , system ):
70
+ def __init__ (self , points : Optional [ Points ] , system : Optional [ CoordinateSystem ] ):
68
71
# Both `points` and `system` must be present; one is not meaningful without the other.
69
72
if (points is None and system is not None ) or (points is not None and system is None ):
70
73
raise ValueError (
@@ -94,30 +97,38 @@ def to_dict(self):
94
97
@classmethod
def from_dict(cls, input_dict: Dict[str, Any]):
    """Construct an instance from its dict form.

    Reads the "points", "system", "layout_width" and "layout_height" keys of `input_dict`; any
    of them may be absent.

    - "points" may be a tuple of tuples or a list of lists. Each list item is converted to a
      tuple, each tuple item is used as-is, and items of any other type are dropped.
    - "system" names the coordinate system. "RelativeCoordinateSystem" is constructed without
      arguments. Any other name is looked up in `TYPE_TO_COORDINATE_SYSTEM_MAP` and constructed
      with `(layout_width, layout_height)`, but only when both dimensions are present and the
      name is a key of that map; otherwise the system resolves to `None`.

    The resolved `points` and `system` are passed to the constructor by keyword.
    """

    def convert_to_points(sequence_of_sequences: Sequence[Sequence[float]]) -> Points:
        # -- lists become tuples, tuples pass through untouched, anything else is skipped --
        return tuple(
            cast(Point, seq if isinstance(seq, tuple) else tuple(seq))
            for seq in sequence_of_sequences
            if isinstance(seq, (list, tuple))
        )

    # -- parse points --
    input_points = input_dict.get("points")
    points = convert_to_points(input_points) if input_points is not None else None

    # -- parse system --
    system_name = input_dict.get("system")
    width = input_dict.get("layout_width")
    height = input_dict.get("layout_height")

    system: Optional[CoordinateSystem] = None
    if system_name == "RelativeCoordinateSystem":
        # -- the relative system takes no layout dimensions --
        system = RelativeCoordinateSystem()
    elif (
        system_name is not None
        and width is not None
        and height is not None
        and system_name in TYPE_TO_COORDINATE_SYSTEM_MAP
    ):
        system = TYPE_TO_COORDINATE_SYSTEM_MAP[system_name](width, height)

    return cls(points=points, system=system)
121
132
122
133
123
134
class RegexMetadata (TypedDict ):
@@ -637,14 +648,19 @@ def to_dict(self) -> Dict[str, Any]:
637
648
}
638
649
639
650
def convert_coordinates_to_new_system (
640
- self ,
641
- new_system : CoordinateSystem ,
642
- in_place = True ,
643
- ) -> Optional [Tuple [Tuple [Union [int , float ], Union [int , float ]], ...]]:
644
- """Converts the element location coordinates to a new coordinate system. If inplace is true,
645
- changes the coordinates in place and updates the coordinate system."""
646
- if self .metadata .coordinates is None :
651
+ self , new_system : CoordinateSystem , in_place : bool = True
652
+ ) -> Optional [Points ]:
653
+ """Converts the element location coordinates to a new coordinate system.
654
+
655
+ If `in_place` is true, changes the coordinates in place and updates the coordinate system.
656
+ """
657
+ if (
658
+ self .metadata .coordinates is None
659
+ or self .metadata .coordinates .system is None
660
+ or self .metadata .coordinates .points is None
661
+ ):
647
662
return None
663
+
648
664
new_coordinates = tuple (
649
665
self .metadata .coordinates .system .convert_coordinates_to_new_system (
650
666
new_system = new_system ,
@@ -653,15 +669,19 @@ def convert_coordinates_to_new_system(
653
669
)
654
670
for x , y in self .metadata .coordinates .points
655
671
)
672
+
656
673
if in_place :
657
674
self .metadata .coordinates .points = new_coordinates
658
675
self .metadata .coordinates .system = new_system
676
+
659
677
return new_coordinates
660
678
661
679
662
680
class CheckBox (Element ):
663
- """A checkbox with an attribute indicating whether its checked or not. Primarily used
664
- in documents that are forms"""
681
+ """A checkbox with an attribute indicating whether it's checked or not.
682
+
683
+ Primarily used in documents that are forms.
684
+ """
665
685
666
686
def __init__ (
667
687
self ,
@@ -682,12 +702,18 @@ def __init__(
682
702
)
683
703
self .checked : bool = checked
684
704
685
def __eq__(self, other: object) -> bool:
    """Return True when `other` is a `CheckBox` with equal `checked` and coordinates.

    Any object that is not a `CheckBox` compares unequal. Only `checked` and
    `metadata.coordinates` take part in the comparison.
    """
    if not isinstance(other, CheckBox):
        return False
    return all(
        (
            self.checked == other.checked,
            self.metadata.coordinates == other.metadata.coordinates,
        )
    )
689
714
690
- def to_dict (self ) -> dict :
715
+ def to_dict (self ) -> Dict [str , Any ]:
716
+ """Serialize to JSON-compatible (str keys) dict."""
691
717
out = super ().to_dict ()
692
718
out ["type" ] = "CheckBox"
693
719
out ["checked" ] = self .checked
@@ -729,20 +755,23 @@ def __init__(
729
755
detection_origin = detection_origin ,
730
756
)
731
757
732
- def __str__ (self ):
733
- return self .text
734
-
735
def __eq__(self, other: object) -> bool:
    """Return True when `other` is a `Text` element with matching content.

    Any object that is not a `Text` compares unequal. Two `Text` elements are equal when
    their `text`, `metadata.coordinates`, `category` and `embeddings` are all equal.
    """
    if not isinstance(other, Text):
        return False
    return all(
        (
            self.text == other.text,
            self.metadata.coordinates == other.metadata.coordinates,
            self.category == other.category,
            self.embeddings == other.embeddings,
        ),
    )
744
769
745
- def to_dict (self ) -> dict :
770
def __str__(self) -> str:
    """Return the text of this element."""
    return self.text
772
+
773
+ def to_dict (self ) -> Dict [str , Any ]:
774
+ """Serialize to JSON-compatible (str keys) dict."""
746
775
out = super ().to_dict ()
747
776
out ["element_id" ] = self .id
748
777
out ["type" ] = self .category
@@ -751,14 +780,17 @@ def to_dict(self) -> dict:
751
780
out ["embeddings" ] = self .embeddings
752
781
return out
753
782
754
def apply(self, *cleaners: Callable[[str], str]):
    """Applies a cleaning brick to the text element.

    Each cleaner should take a string as input and produce a string as output. Cleaners are
    applied in the order given, each receiving the output of the previous one, and the final
    result replaces `self.text`.

    Raises:
        ValueError: when the final result is not a `str`; `self.text` is left unchanged.
    """
    cleaned_text = self.text
    for cleaner in cleaners:
        cleaned_text = cleaner(cleaned_text)

    # -- annotations can't stop a caller passing a badly-behaved cleaner, so check at runtime --
    if not isinstance(cleaned_text, str):  # pyright: ignore[reportUnnecessaryIsInstance]
        raise ValueError("Cleaner produced a non-string output.")

    self.text = cleaned_text
0 commit comments