 import numpy as np

 from pandas._libs import lib
+from pandas._libs.missing import NA
 from pandas._libs.tslibs import (
     Timedelta,
     Timestamp,
@@ -351,7 +352,7 @@ def _from_sequence_of_strings(
             # duration to string casting behavior
             mask = isna(scalars)
             if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
-                strings = pa.array(strings, type=pa.string(), from_pandas=True)
+                strings = pa.array(strings, type=pa.string())
             strings = pc.if_else(mask, None, strings)
             try:
                 scalars = strings.cast(pa.int64())
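
Note on this hunk: missingness is now applied explicitly via the precomputed `mask` and `pc.if_else`, rather than relying on `from_pandas=True`. A rough standalone sketch of that pattern (sample values invented):

```python
import pyarrow as pa
import pyarrow.compute as pc

# Null out entries flagged by a boolean mask instead of letting
# pa.array(..., from_pandas=True) infer missing values implicitly.
strings = pa.array(["1", "2", "oops"], type=pa.string())
mask = pa.array([False, False, True])  # stand-in for isna(scalars)

print(pc.if_else(mask, None, strings))  # ["1", "2", null]
```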
@@ -372,7 +373,7 @@ def _from_sequence_of_strings(
             if isinstance(strings, (pa.Array, pa.ChunkedArray)):
                 scalars = strings
             else:
-                scalars = pa.array(strings, type=pa.string(), from_pandas=True)
+                scalars = pa.array(strings, type=pa.string())
             scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
             scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
             scalars = scalars.cast(pa.bool_())
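
For context, this string-to-boolean path rewrites `"1.0"`/`"0.0"` before the cast, since Arrow's string-to-bool cast only accepts spellings like `"true"`/`"false"`/`"1"`/`"0"`. A small illustrative sketch (sample data invented):

```python
import pyarrow as pa
import pyarrow.compute as pc

scalars = pa.array(["1.0", "0.0", "true", None], type=pa.string())
# Rewrite the float-like spellings so the boolean cast succeeds.
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
print(scalars.cast(pa.bool_()))  # [true, false, true, null]
```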
@@ -384,6 +385,13 @@ def _from_sequence_of_strings(
             from pandas.core.tools.numeric import to_numeric

             scalars = to_numeric(strings, errors="raise")
+            if not pa.types.is_decimal(pa_type):
+                # TODO: figure out why doing this cast breaks with decimal dtype
+                #  in test_from_sequence_of_strings_pa_array
+                mask = strings.is_null()
+                scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
+                # TODO: could we just do strings.cast(pa_type)?
+
         else:
             raise NotImplementedError(
                 f"Converting strings to {pa_type} is not implemented."
@@ -426,7 +434,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
         """
         if isinstance(value, pa.Scalar):
             pa_scalar = value
-        elif isna(value):
+        elif isna(value) and not lib.is_float(value):
             pa_scalar = pa.scalar(None, type=pa_type)
         else:
             # Workaround https://github.com/apache/arrow/issues/37291
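
The `and not lib.is_float(value)` guard keeps a float NaN out of the null branch, so it is boxed as an actual NaN value rather than as missing. A minimal sketch of the distinction:

```python
import pyarrow as pa

print(pa.scalar(float("nan"), type=pa.float64()).is_valid)  # True  -> NaN kept
print(pa.scalar(None, type=pa.float64()).is_valid)          # False -> missing
```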
@@ -443,7 +451,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
                     value = value.as_unit(pa_type.unit)
                 value = value._value

-            pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
+            pa_scalar = pa.scalar(value, type=pa_type)

         if pa_type is not None and pa_scalar.type != pa_type:
             pa_scalar = pa_scalar.cast(pa_type)
@@ -475,6 +483,13 @@ def _box_pa_array(
             if copy:
                 value = value.copy()
             pa_array = value.__arrow_array__()
+
+        elif hasattr(value, "__arrow_array__"):
+            # e.g. StringArray
+            if copy:
+                value = value.copy()
+            pa_array = value.__arrow_array__()
+
         else:
             if (
                 isinstance(value, np.ndarray)
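
The new `elif` branch relies on the `__arrow_array__` protocol, e.g. as implemented by StringArray. A minimal sketch of that protocol using a hypothetical container class:

```python
import pyarrow as pa

class MyStrings:
    """Hypothetical container exposing the __arrow_array__ protocol."""

    def __init__(self, data):
        self._data = list(data)

    def __arrow_array__(self, type=None):
        # Hand back a ready-made pyarrow array; pa.array(obj) also honors this.
        return pa.array(self._data, type=type or pa.string())

print(MyStrings(["a", None]).__arrow_array__())  # ["a", null]
```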
@@ -528,19 +543,32 @@ def _box_pa_array(
                 pa_array = pa.array(dta._ndarray, type=pa_type, mask=mask)
                 return pa_array

+            mask = None
+            if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
+                # similar to isna(value) but exclude NaN
+                # TODO: cythonize!
+                mask = np.array([x is NA or x is None for x in value], dtype=bool)
+
+            from_pandas = False
+            if pa.types.is_integer(pa_type):
+                # If user specifically asks to cast a numpy float array with NaNs
+                # to pyarrow integer, we'll treat those NaNs as NA
+                from_pandas = True
             try:
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(
+                    value, type=pa_type, mask=mask, from_pandas=from_pandas
+                )
             except (pa.ArrowInvalid, pa.ArrowTypeError):
                 # GH50430: let pyarrow infer type, then cast
-                pa_array = pa.array(value, from_pandas=True)
+                pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)

             if pa_type is None and pa.types.is_duration(pa_array.type):
                 # Workaround https://github.com/apache/arrow/issues/37291
                 from pandas.core.tools.timedeltas import to_timedelta

                 value = to_timedelta(value)
                 value = value.to_numpy()
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(value, type=pa_type)

             if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
                 # GH52843: upstream bug for duration types when originally
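
The mask/`from_pandas` combination above is the heart of the NaN-vs-NA distinction: only `None`/`pd.NA` become nulls, while float NaN is preserved unless the caller explicitly requests an integer type. A rough sketch of the underlying pyarrow behavior (not part of the diff):

```python
import numpy as np
import pyarrow as pa

values = np.array([1.0, np.nan])

print(pa.array(values, type=pa.float64()))                    # [1, nan]  NaN kept
print(pa.array(values, type=pa.float64(), from_pandas=True))  # [1, null] NaN -> NA
print(pa.array(values, type=pa.int64(), from_pandas=True))    # [1, null] NaN -> NA
```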
@@ -1187,7 +1215,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
         if not len(values):
             return np.zeros(len(self), dtype=bool)

-        result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
+        result = pc.is_in(self._pa_array, value_set=pa.array(values))
         # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
         # to False
         return np.array(result, dtype=np.bool_)
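
For reference, `pc.is_in` checks membership against the boxed value set; with `from_pandas=True` removed, a NaN in `values` is no longer silently turned into null before the lookup. A minimal usage sketch:

```python
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 3])
print(pc.is_in(arr, value_set=pa.array([2, 5])))  # [false, true, false]
```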
@@ -1992,7 +2020,7 @@ def __setitem__(self, key, value) -> None:
                 raise ValueError("Length of indexer and values mismatch")
             chunks = [
                 *self._pa_array[:key].chunks,
-                pa.array([value], type=self._pa_array.type, from_pandas=True),
+                pa.array([value], type=self._pa_array.type),
                 *self._pa_array[key + 1 :].chunks,
             ]
             data = pa.chunked_array(chunks).combine_chunks()
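
A standalone sketch of the splice performed above: the assigned scalar becomes a one-element chunk between the untouched slices, and the chunks are then flattened (values invented for illustration):

```python
import pyarrow as pa

ca = pa.chunked_array([[1, 2, 3]])
key = 1
chunks = [
    *ca[:key].chunks,
    pa.array([99], type=ca.type),  # the newly assigned element
    *ca[key + 1 :].chunks,
]
print(pa.chunked_array(chunks).combine_chunks())  # [1, 99, 3]
```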
@@ -2046,7 +2074,7 @@ def _rank_calc(
                 pa_type = pa.float64()
             else:
                 pa_type = pa.uint64()
-            result = pa.array(ranked, type=pa_type, from_pandas=True)
+            result = pa.array(ranked, type=pa_type)
             return result

         data = self._pa_array.combine_chunks()
@@ -2298,7 +2326,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
         right, right_type = _to_numpy_and_type(right)
         pa_type = left_type or right_type
         result = np.where(cond, left, right)
-        return pa.array(result, type=pa_type, from_pandas=True)
+        return pa.array(result, type=pa_type)

     @classmethod
     def _replace_with_mask(
@@ -2341,7 +2369,7 @@ def _replace_with_mask(
             replacements = replacements.as_py()
         result = np.array(values, dtype=object)
         result[mask] = replacements
-        return pa.array(result, type=values.type, from_pandas=True)
+        return pa.array(result, type=values.type)

     # ------------------------------------------------------------------
     # GroupBy Methods
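
The fallback path above round-trips through an object ndarray so the masked positions can be assigned directly; dropping `from_pandas=True` does not change which elements come back as null here, since `None` is always treated as missing. A small sketch with invented values:

```python
import numpy as np
import pyarrow as pa

values = pa.array([1, 2, 3])
mask = np.array([False, True, False])

result = np.array(values, dtype=object)    # object ndarray copy of the data
result[mask] = 99                          # assign replacements through the mask
print(pa.array(result, type=values.type))  # [1, 99, 3]
```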
@@ -2420,7 +2448,7 @@ def _groupby_op(
             return type(self)(pa_result)
         else:
             # DatetimeArray, TimedeltaArray
-            pa_result = pa.array(result, from_pandas=True)
+            pa_result = pa.array(result)
             return type(self)(pa_result)

     def _apply_elementwise(self, func: Callable) -> list[list[Any]]: