diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 96cf169b914285..8c621606502b4d 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -916,7 +916,7 @@ Status MarkForCompilationPassImpl::DeclusterNodes() {
     // increasing its live range.
     //
     // See b/221997940 for a real-world example of this.
-    if (n->op_def().name() == "Fill" &&
+    if ((n->op_def().name() == "Fill" || n->def().op() == "SplitV") &&
         n->out_nodes().begin() != n->out_nodes().end() &&
         absl::c_all_of(n->out_nodes(), [&](Node* user) {
           return GetClusterForNode(user) != cluster;
@@ -990,8 +990,7 @@ Status MarkForCompilationPassImpl::CreateClusters() {
     // trouble.
 
     if (cluster->effective_cluster_size() >= debug_options_.min_cluster_size ||
-        cluster->has_functional_control_flow() ||
-        cluster->is_xla_compile_attr_true()) {
+        cluster->has_functional_control_flow()) {
       string& name = cluster_names[cluster->cycles_graph_node_id()];
 
       if (name.empty()) {
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
index a9bf175f205f4e..8e2efd84549000 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
@@ -902,7 +902,7 @@ struct UnsortedSegmentFunctor {
     config = GetGpuLaunchConfig(data_size, d);
     TF_CHECK_OK(GpuLaunchKernel(
         UnsortedSegmentCustomKernel<
-            T, Index, typename ReduceUpdateOpFor::atomic_op>,
+            T, Index, typename ReduceUpdateOpFor::nonatomic_op>,
         config.block_count, config.thread_per_block, 0, d.stream(),
         input_outer_dim_size, input_inner_dim_size, output_outer_dim_size,
         unsorted_segment_ids.data(), data.data(), output.data()));
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index f58accf3e2583b..a4ed476a65686a 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -947,12 +947,16 @@ absl::Status AMDGPUTargetModuleLinker(
   // not has major impact as the hipcc path by default enables flush to zero for
   // compilation.
   // If ftz is enabled, set it as an attribute on every function in the module.
-  if (debug_options.xla_gpu_ftz()) {
-    for (llvm::Function& fn : *module) {
+  for (llvm::Function& fn : *module) {
+    if (debug_options.xla_gpu_ftz()) {
       // may be necessary for the compiler to generate atomics (confirm!)
       fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
       fn.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
     }
+
+    if (!fn.isDeclaration() && fn.hasInternalLinkage()) {
+      fn.addFnAttr(llvm::Attribute::AttrKind::AlwaysInline);
+    }
   }
 
   return absl::OkStatus();
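
For reference, the reworked attribute loop in gpu_backend_lib.cc can be read in isolation as roughly the following standalone sketch. The free-standing helper name and its bool parameter are hypothetical and not part of the XLA sources (the real code reads the flag from DebugOptions and runs inside AMDGPUTargetModuleLinker); only documented LLVM APIs are used.

// Hypothetical helper mirroring the post-patch loop; not part of the patch.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

void ApplyAmdgpuFunctionAttrs(llvm::Module& module, bool ftz_enabled) {
  for (llvm::Function& fn : module) {
    if (ftz_enabled) {
      // Flush f32 denormals to zero; the unsafe-fp-atomics attribute may be
      // needed for the backend to emit hardware atomics.
      fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
      fn.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
    }
    // Force-inline only definitions with internal linkage; declarations
    // (e.g. device-library functions resolved at link time) are left alone.
    if (!fn.isDeclaration() && fn.hasInternalLinkage()) {
      fn.addFnAttr(llvm::Attribute::AttrKind::AlwaysInline);
    }
  }
}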