Describe the bug
Large batches produced after join operations exceed the Arrow vector 2 GB memory limit, causing an OversizedAllocationException.
To Reproduce
./bin/spark-sql \
  --conf spark.sql.autoBroadcastJoinThreshold=-1 \
  --conf spark.sql.adaptive.autoBroadcastJoinThreshold=-1
-- Create tmp_t1 (ORC) and fill it from two generated id sequences:
-- g1 exposes range(1) as column a, g2 exposes range(10000) as column b.
create table tmp_t1(a int, b int) stored as orc;
with g1 as (select id as a from range(1)),
g2 as (select id as b from range(10000))
-- The join has no condition, so every g1 row is paired with every g2 row.
insert overwrite tmp_t1
select g1.a, g2.b
from g1 join g2;
-- Create tmp_t2 (ORC) and fill it from two generated id sequences:
-- g1 exposes range(1) as column a, g2 exposes range(10000) as column b.
create table tmp_t2(a int, b int) stored as orc;
with g1 as (select id as a from range(1)),
g2 as (select id as b from range(10000))
-- The join has no condition, so every g1 row is paired with every g2 row.
insert overwrite tmp_t2
select g1.a, g2.b
from g1 join g2;
-- Query that triggers the reported failure.
-- joined_dates: joins tmp_t1 to tmp_t2 on column a and builds one string per joined
-- row by concatenating two dates (each a base date shifted by the row's b value).
-- NOTE(review): presumably every row in both tables shares the same a value, so the
-- join fans out to (rows in tmp_t1) x (rows in tmp_t2) strings; confirm against the
-- table contents.
with joined_dates as (
    select
        concat(
            cast(date_add('2010-01-01', t1.b) as string),
            cast(date_add('2010-01-02', t2.b) as string)
        ) as s
    from tmp_t1 as t1
    inner join tmp_t2 as t2
        on t1.a = t2.a
)
-- Count occurrences of each concatenated string and return 100 rows ordered by ascending count.
select
    s,
    count(*) as cnt
from joined_dates
group by s
order by cnt
limit 100;
Exception in thread "auron native task 0.0 in stage 4.0 (TID 10)" auron.org.apache.arrow.vector.util.OversizedAllocationException: Memory required for vector is (2147483648), which is overflow or more than max allowed (2147483647). You could consider using LargeVarCharVector/LargeVarBinaryVector for large strings/large bytes types
at auron.org.apache.arrow.vector.BaseVariableWidthVector.checkDataBufferSize(BaseVariableWidthVector.java:465)
at auron.org.apache.arrow.vector.BaseVariableWidthVector.reallocDataBuffer(BaseVariableWidthVector.java:574)
at auron.org.apache.arrow.vector.BaseVariableWidthVector.handleSafe(BaseVariableWidthVector.java:1344)
at auron.org.apache.arrow.vector.BaseVariableWidthVector.setSafe(BaseVariableWidthVector.java:1178)
at org.apache.spark.sql.execution.auron.arrowio.util.StringWriter.setValue(ArrowWriter.scala:247)
at org.apache.spark.sql.execution.auron.arrowio.util.ArrowFieldWriter.write(ArrowWriter.scala:126)
at org.apache.spark.sql.execution.auron.arrowio.util.ArrowWriter.write(ArrowWriter.scala:97)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$5(SparkAuronUDFWrapperContext.scala:78)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$5$adapted(SparkAuronUDFWrapperContext.scala:76)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$4(SparkAuronUDFWrapperContext.scala:76)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.$anonfun$eval$4$adapted(SparkAuronUDFWrapperContext.scala:69)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$9(Using.scala:395)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$8(Using.scala:394)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$7(Using.scala:393)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.$anonfun$resources$6(Using.scala:392)
at org.apache.spark.sql.auron.util.Using$.resource(Using.scala:273)
at org.apache.spark.sql.auron.util.Using$.resources(Using.scala:391)
at org.apache.auron.spark.sql.SparkAuronUDFWrapperContext.eval(SparkAuronUDFWrapperContext.scala:69)
Expected behavior
Query should execute without memory allocation errors.
Screenshots
Additional context
Describe the bug
Large batches produced after join operations exceed the Arrow vector 2 GB memory limit, causing an
OversizedAllocationException.
To Reproduce
Expected behavior
Query should execute without memory allocation errors.
Screenshots
Additional context