diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 96cf169b914285..8c621606502b4d 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -916,7 +916,7 @@ Status MarkForCompilationPassImpl::DeclusterNodes() {
     // increasing its live range.
     //
     // See b/221997940 for a real-world example of this.
-    if (n->op_def().name() == "Fill" &&
+    if ((n->op_def().name() == "Fill" || n->def().op() == "SplitV") &&
         n->out_nodes().begin() != n->out_nodes().end() &&
         absl::c_all_of(n->out_nodes(), [&](Node* user) {
           return GetClusterForNode(user) != cluster;
@@ -990,8 +990,7 @@ Status MarkForCompilationPassImpl::CreateClusters() {
     // trouble.
 
     if (cluster->effective_cluster_size() >= debug_options_.min_cluster_size ||
-        cluster->has_functional_control_flow() ||
-        cluster->is_xla_compile_attr_true()) {
+        cluster->has_functional_control_flow()) {
       string& name = cluster_names[cluster->cycles_graph_node_id()];
 
       if (name.empty()) {
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
index a9bf175f205f4e..8e2efd84549000 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
@@ -902,7 +902,7 @@ struct UnsortedSegmentFunctor {
     config = GetGpuLaunchConfig(data_size, d);
     TF_CHECK_OK(GpuLaunchKernel(
         UnsortedSegmentCustomKernel<
-            T, Index, typename ReduceUpdateOpFor::atomic_op>,
+            T, Index, typename ReduceUpdateOpFor::nonatomic_op>,
         config.block_count, config.thread_per_block, 0, d.stream(),
         input_outer_dim_size, input_inner_dim_size, output_outer_dim_size,
         unsorted_segment_ids.data(), data.data(), output.data()));
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index f58accf3e2583b..a4ed476a65686a 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -947,12 +947,16 @@ absl::Status AMDGPUTargetModuleLinker(
   // not has major impact as the hipcc path by default enables flush to zero for
   // compilation.
   // If ftz is enabled, set it as an attribute on every function in the module.
-  if (debug_options.xla_gpu_ftz()) {
-    for (llvm::Function& fn : *module) {
+  for (llvm::Function& fn : *module) {
+    if (debug_options.xla_gpu_ftz()) {
       // may be necessary for the compiler to generate atomics (confirm!)
       fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
       fn.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
     }
+
+    if (!fn.isDeclaration() && fn.hasInternalLinkage()) {
+      fn.addFnAttr(llvm::Attribute::AttrKind::AlwaysInline);
+    }
   }
 
   return absl::OkStatus();
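
For reference, the reworked attribute loop in gpu_backend_lib.cc can be read in isolation as roughly the following standalone sketch. The free-standing helper name and its bool parameter are hypothetical and not part of the XLA sources (the real code reads the flag from DebugOptions and runs inside AMDGPUTargetModuleLinker); only documented LLVM APIs are used.

// Hypothetical helper mirroring the post-patch loop; not part of the patch.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

void ApplyAmdgpuFunctionAttrs(llvm::Module& module, bool ftz_enabled) {
  for (llvm::Function& fn : module) {
    if (ftz_enabled) {
      // Flush f32 denormals to zero; the unsafe-fp-atomics attribute may be
      // needed for the backend to emit hardware atomics.
      fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
      fn.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
    }
    // Force-inline only definitions with internal linkage; declarations
    // (e.g. device-library functions resolved at link time) are left alone.
    if (!fn.isDeclaration() && fn.hasInternalLinkage()) {
      fn.addFnAttr(llvm::Attribute::AttrKind::AlwaysInline);
    }
  }
}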