From d4c8f29c54f1bf45543fffd2d995b06dc015c53b Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 11 Jun 2025 13:18:18 -0700
Subject: [PATCH 1/6] [ExecuTorch] Add broadcast support for le op

The refactored HF repro requires this to support mask generation.

Differential Revision: [D76456398](https://our.internmc.facebook.com/intern/diff/D76456398/)

[ghstack-poisoned]
---
 kernels/optimized/cpu/op_le.cpp               |  81 +-
 kernels/test/op_le_test.cpp                   | 812 ++++++++++++++++++
 .../optimized/op_registration_util.bzl        |   3 +
 3 files changed, 851 insertions(+), 45 deletions(-)
diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 94c2d5ffa76..2513155aac1 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -6,12 +6,14 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/kernels/optimized/cpu/binary_ops.h>
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/optimized/vec/vec.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
+#include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
 namespace torch {
 namespace executor {
@@ -79,52 +81,41 @@ Tensor& opt_le_tensor_out(
     return out;
   }
 
-  ET_KERNEL_CHECK(ctx, tensors_have_same_shape(a, b), InvalidArgument, out);
-
-  // Resize for dynamic shape
-  auto error = resize_tensor(out, a.sizes());
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      error == Error::Ok,
-      InvalidArgument,
-      out,
-      "Failed to resize output tensor.");
-
-  if (a_type == b_type && a_type == out_type) {
-    ET_SWITCH_REAL_TYPES_AND(
-        Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
-          using Vec = executorch::vec::Vectorized<CTYPE>;
-          executorch::vec::map2<CTYPE>(
-              [](Vec x, Vec y) { return x.le(y); },
-              out.mutable_data_ptr<CTYPE>(),
-              a.const_data_ptr<CTYPE>(),
-              b.const_data_ptr<CTYPE>(),
-              a.numel());
-        });
+  // Check for optimized broadcast paths
+  auto selected_optimized_path = select_optimized_path(a, b, out);
+  printf("selected_optimized_path: %d\n", static_cast<int>(selected_optimized_path));
+  if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
+    // Resize for dynamic shape
+    auto error = resize_tensor(out, a.sizes());
+    ET_KERNEL_CHECK_MSG(
+        ctx,
+        error == Error::Ok,
+        InvalidArgument,
+        out,
+        "Failed to resize output tensor.");
+
+    ET_SWITCH_REALB_TYPES(a_type, ctx, "le.Tensor_out", CTYPE, [&]() {
+      using Vec = executorch::vec::Vectorized<CTYPE>;
+      executorch::vec::map2<CTYPE>(
+          [](Vec x, Vec y) { return x.le(y); },
+          out.mutable_data_ptr<CTYPE>(),
+          a.const_data_ptr<CTYPE>(),
+          b.const_data_ptr<CTYPE>(),
+          out.numel());
+    });
+  } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
+    // Handle optimized broadcast cases
+    ET_SWITCH_REALB_TYPES(out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
+      using Vec = executorch::vec::Vectorized<CTYPE>;
+      auto le_lambda = [](auto x, auto y) { return x.le(y); };
+      return torch::executor::handle_broadcast_elementwise<CTYPE>(
+          ctx, le_lambda, a, b, out, selected_optimized_path);
+    });
   } else {
-    ET_SWITCH_REAL_TYPES_AND(
-        Bool, a_type, ctx, "le.Tensor_out", CTYPE_A, [&]() {
-          ET_SWITCH_REAL_TYPES_AND(
-              Bool, b_type, ctx, "le.Tensor_out", CTYPE_B, [&]() {
-                using CTYPE_IN = typename torch::executor::
-                    promote_types<CTYPE_A, CTYPE_B>::type;
-                ET_DCHECK(
-                    CppTypeToScalarType<CTYPE_IN>::value ==
-                    promoteTypes(a_type, b_type));
-                ET_SWITCH_REAL_TYPES_AND(
-                    Bool, out_type, ctx, "le.Tensor_out", CTYPE_OUT, [&]() {
-                      const size_t n = a.numel();
-                      const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                      const CTYPE_B* b_data = b.const_data_ptr<CTYPE_B>();
-                      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-                      for (auto i = 0; i < n; ++i) {
-                        out_data[i] = static_cast<CTYPE_OUT>(
-                            static_cast<CTYPE_IN>(a_data[i]) <=
-                            static_cast<CTYPE_IN>(b_data[i]));
-                      }
-                    });
-              });
-        });
+    // @lint-ignore CLANGTIDY facebook-hte-CArray
+    static constexpr const char op_name[] = "le.Tensor_out";
+    return internal::comparison_tensor_out<std::less_equal, op_name>(
+        ctx, a, b, out);
   }
 
   return out;
diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp
index bcd40d24d89..eebad43ce75 100644
--- a/kernels/test/op_le_test.cpp
+++ b/kernels/test/op_le_test.cpp
@@ -186,3 +186,815 @@ TEST_F(OpLeTensorOutTest, BroadcastTest) {
   op_le_tensor_out(a, b, out);
   EXPECT_TENSOR_EQ(out, tf.make({1, 4}, {true, true, true, false}));
 }
+
+TEST_F(OpLeTensorOutTest, Broadcast2DTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (1, 10) and (6, 1) -> (6, 10)
+  Tensor a =
+      tf.make(/*sizes=*/{1, 10}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  Tensor b = tf.make({6, 1}, {2, 4, 6, 8, 10, 12});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: each row i should be [1<=b[i], 2<=b[i], ..., 10<=b[i]]
+  // Row 0: b[0]=2, so [1<=2, 2<=2, 3<=2, ...] = [true, true, false, false, ...]
+  // Row 1: b[1]=4, so [1<=4, 2<=4, 3<=4, 4<=4, 5<=4, ...] = [true, true, true,
+  // true, false, ...]
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      // Row 0 (b=2): 1<=2, 2<=2, 3<=2, 4<=2, 5<=2, 6<=2, 7<=2, 8<=2, 9<=2,
+      // 10<=2
+      true,
+      true,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      // Row 1 (b=4): 1<=4, 2<=4, 3<=4, 4<=4, 5<=4, 6<=4, 7<=4, 8<=4, 9<=4,
+      // 10<=4
+      true,
+      true,
+      true,
+      true,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      // Row 2 (b=6): 1<=6, 2<=6, 3<=6, 4<=6, 5<=6, 6<=6, 7<=6, 8<=6, 9<=6,
+      // 10<=6
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      false,
+      false,
+      false,
+      false,
+      // Row 3 (b=8): 1<=8, 2<=8, 3<=8, 4<=8, 5<=8, 6<=8, 7<=8, 8<=8, 9<=8,
+      // 10<=8
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      false,
+      false,
+      // Row 4 (b=10): 1<=10, 2<=10, 3<=10, 4<=10, 5<=10, 6<=10, 7<=10, 8<=10,
+      // 9<=10, 10<=10
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 5 (b=12): 1<=12, 2<=12, 3<=12, 4<=12, 5<=12, 6<=12, 7<=12, 8<=12,
+      // 9<=12, 10<=12
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true};
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make(
+          {6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast1DTo2DTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (6, 1) and (1, 10) -> (6, 10)
+  Tensor a = tf.make({6, 1}, {2, 4, 6, 8, 10, 12});
+  Tensor b =
+      tf.make(/*sizes=*/{1, 10}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      // Row 0 (a=2): 2<=1, 2<=2, 2<=3, 2<=4, 2<=5, 2<=6, 2<=7, 2<=8, 2<=9,
+      // 2<=10
+      false,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 1 (a=4): 4<=1, 4<=2, 4<=3, 4<=4, 4<=5, 4<=6, 4<=7, 4<=8, 4<=9,
+      // 4<=10
+      false,
+      false,
+      false,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 2 (a=6): 6<=1, 6<=2, 6<=3, 6<=4, 6<=5, 6<=6, 6<=7, 6<=8, 6<=9,
+      // 6<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 3 (a=8): 8<=1, 8<=2, 8<=3, 8<=4, 8<=5, 8<=6, 8<=7, 8<=8, 8<=9,
+      // 8<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      true,
+      true,
+      true,
+      // Row 4 (a=10): 10<=1, 10<=2, 10<=3, 10<=4, 10<=5, 10<=6, 10<=7, 10<=8,
+      // 10<=9, 10<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      true,
+      // Row 5 (a=12): 12<=1, 12<=2, 12<=3, 12<=4, 12<=5, 12<=6, 12<=7, 12<=8,
+      // 12<=9, 12<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false};
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make(
+          {6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (6, 1) and (1, 10) -> (6, 10) (reverse of the first broadcast
+  // test)
+  Tensor a = tf.make(/*sizes=*/{6, 1}, /*data=*/{2, 4, 6, 8, 10, 12});
+  Tensor b = tf.make({1, 10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: each row i should be [a[i]<=1, a[i]<=2, ..., a[i]<=10]
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      // Row 0 (a=2): 2<=1, 2<=2, 2<=3, 2<=4, 2<=5, 2<=6, 2<=7, 2<=8, 2<=9,
+      // 2<=10
+      false,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 1 (a=4): 4<=1, 4<=2, 4<=3, 4<=4, 4<=5, 4<=6, 4<=7, 4<=8, 4<=9,
+      // 4<=10
+      false,
+      false,
+      false,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 2 (a=6): 6<=1, 6<=2, 6<=3, 6<=4, 6<=5, 6<=6, 6<=7, 6<=8, 6<=9,
+      // 6<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 3 (a=8): 8<=1, 8<=2, 8<=3, 8<=4, 8<=5, 8<=6, 8<=7, 8<=8, 8<=9,
+      // 8<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      true,
+      true,
+      true,
+      // Row 4 (a=10): 10<=1, 10<=2, 10<=3, 10<=4, 10<=5, 10<=6, 10<=7, 10<=8,
+      // 10<=9, 10<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      true,
+      // Row 5 (a=12): 12<=1, 12<=2, 12<=3, 12<=4, 12<=5, 12<=6, 12<=7, 12<=8,
+      // 12<=9, 12<=10
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false};
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make(
+          {6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastLastDimTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastLastDim: (3, 4, 1) and (3, 4, 5) -> (3, 4, 5)
+  Tensor a = tf.make(
+      /*sizes=*/{3, 4, 1}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor b = tf.make(
+      {3, 4, 5},
+      {
+          // First 3x4 slice
+          1,
+          2,
+          3,
+          4,
+          5, // row 0
+          2,
+          3,
+          4,
+          5,
+          6, // row 1
+          3,
+          4,
+          5,
+          6,
+          7, // row 2
+          4,
+          5,
+          6,
+          7,
+          8, // row 3
+             // Second 3x4 slice
+          5,
+          6,
+          7,
+          8,
+          9, // row 0
+          6,
+          7,
+          8,
+          9,
+          10, // row 1
+          7,
+          8,
+          9,
+          10,
+          11, // row 2
+          8,
+          9,
+          10,
+          11,
+          12, // row 3
+              // Third 3x4 slice
+          9,
+          10,
+          11,
+          12,
+          13, // row 0
+          10,
+          11,
+          12,
+          13,
+          14, // row 1
+          11,
+          12,
+          13,
+          14,
+          15, // row 2
+          12,
+          13,
+          14,
+          15,
+          16 // row 3
+      });
+
+  Tensor out = tf_bool.zeros({3, 4, 5});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      // First slice: a values are 1,2,3,4
+      true,
+      true,
+      true,
+      true,
+      true, // 1 <= [1,2,3,4,5]
+      true,
+      true,
+      true,
+      true,
+      true, // 2 <= [2,3,4,5,6]
+      true,
+      true,
+      true,
+      true,
+      true, // 3 <= [3,4,5,6,7]
+      true,
+      true,
+      true,
+      true,
+      true, // 4 <= [4,5,6,7,8]
+      // Second slice: a values are 5,6,7,8
+      true,
+      true,
+      true,
+      true,
+      true, // 5 <= [5,6,7,8,9]
+      true,
+      true,
+      true,
+      true,
+      true, // 6 <= [6,7,8,9,10]
+      true,
+      true,
+      true,
+      true,
+      true, // 7 <= [7,8,9,10,11]
+      true,
+      true,
+      true,
+      true,
+      true, // 8 <= [8,9,10,11,12]
+      // Third slice: a values are 9,10,11,12
+      true,
+      true,
+      true,
+      true,
+      true, // 9 <= [9,10,11,12,13]
+      true,
+      true,
+      true,
+      true,
+      true, // 10 <= [10,11,12,13,14]
+      true,
+      true,
+      true,
+      true,
+      true, // 11 <= [11,12,13,14,15]
+      true,
+      true,
+      true,
+      true,
+      true// 12 <= [12,13,14,15,16]
+  };
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make({3, 4, 5}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastLastDimReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastLastDimReverseArguments: (3, 4, 5) and (3, 4, 1) ->
+  // (3, 4, 5)
+  Tensor a = tf.make(
+      {3, 4, 5},
+      {
+          // First 3x4 slice
+          1,
+          2,
+          3,
+          4,
+          5, // row 0
+          2,
+          3,
+          4,
+          5,
+          6, // row 1
+          3,
+          4,
+          5,
+          6,
+          7, // row 2
+          4,
+          5,
+          6,
+          7,
+          8, // row 3
+             // Second 3x4 slice
+          5,
+          6,
+          7,
+          8,
+          9, // row 0
+          6,
+          7,
+          8,
+          9,
+          10, // row 1
+          7,
+          8,
+          9,
+          10,
+          11, // row 2
+          8,
+          9,
+          10,
+          11,
+          12, // row 3
+              // Third 3x4 slice
+          9,
+          10,
+          11,
+          12,
+          13, // row 0
+          10,
+          11,
+          12,
+          13,
+          14, // row 1
+          11,
+          12,
+          13,
+          14,
+          15, // row 2
+          12,
+          13,
+          14,
+          15,
+          16 // row 3
+      });
+  Tensor b = tf.make(
+      /*sizes=*/{3, 4, 1},
+      /*data=*/{5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15});
+
+  Tensor out = tf_bool.zeros({3, 4, 5});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      // First slice: b values are all 5
+      true,
+      true,
+      true,
+      true,
+      true, // [1,2,3,4,5] <= 5
+      true,
+      true,
+      true,
+      true,
+      false, // [2,3,4,5,6] <= 5
+      true,
+      true,
+      true,
+      false,
+      false, // [3,4,5,6,7] <= 5
+      true,
+      true,
+      false,
+      false,
+      false, // [4,5,6,7,8] <= 5
+      // Second slice: b values are all 10
+      true,
+      true,
+      true,
+      true,
+      true, // [5,6,7,8,9] <= 10
+      true,
+      true,
+      true,
+      true,
+      true, // [6,7,8,9,10] <= 10
+      true,
+      true,
+      true,
+      true,
+      false, // [7,8,9,10,11] <= 10
+      true,
+      true,
+      true,
+      false,
+      false, // [8,9,10,11,12] <= 10
+      // Third slice: b values are all 15
+      true,
+      true,
+      true,
+      true,
+      true, // [9,10,11,12,13] <= 15
+      true,
+      true,
+      true,
+      true,
+      true, // [10,11,12,13,14] <= 15
+      true,
+      true,
+      true,
+      true,
+      true, // [11,12,13,14,15] <= 15
+      true,
+      true,
+      true,
+      true,
+      false // [12,13,14,15,16] <= 15
+  };
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make({3, 4, 5}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastNdByNdTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastNdByNd: (2, 1, 4) and (2, 3, 4) -> (2, 3, 4)
+  Tensor a = tf.make(/*sizes=*/{2, 1, 4}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8});
+  Tensor b = tf.make(
+      {2, 3, 4},
+      {
+          // First 2x3 slice
+          1,
+          2,
+          3,
+          4, // row 0
+          2,
+          3,
+          4,
+          5, // row 1
+          3,
+          4,
+          5,
+          6, // row 2
+             // Second 2x3 slice
+          5,
+          6,
+          7,
+          8, // row 0
+          6,
+          7,
+          8,
+          9, // row 1
+          7,
+          8,
+          9,
+          10 // row 2
+      });
+
+  Tensor out = tf_bool.zeros({2, 3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      // First slice: a[0,0,:] = [1,2,3,4]
+      true,
+      true,
+      true,
+      true, // [1,2,3,4] <= [1,2,3,4]
+      true,
+      true,
+      true,
+      true, // [1,2,3,4] <= [2,3,4,5]
+      true,
+      true,
+      true,
+      true, // [1,2,3,4] <= [3,4,5,6]
+      // Second slice: a[1,0,:] = [5,6,7,8]
+      true,
+      true,
+      true,
+      true, // [5,6,7,8] <= [5,6,7,8]
+      true,
+      true,
+      true,
+      true, // [5,6,7,8] <= [6,7,8,9]
+      true,
+      true,
+      true,
+      true // [5,6,7,8] <= [7,8,9,10]
+  };
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make({2, 3, 4}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastNdByNdReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastNdByNdReverseArguments: (2, 3, 4) and (2, 1, 4) ->
+  // (2, 3, 4)
+  Tensor a = tf.make(
+      {2, 3, 4},
+      {
+          // First 2x3 slice
+          1,
+          2,
+          3,
+          4, // row 0
+          2,
+          3,
+          4,
+          5, // row 1
+          3,
+          4,
+          5,
+          6, // row 2
+             // Second 2x3 slice
+          5,
+          6,
+          7,
+          8, // row 0
+          6,
+          7,
+          8,
+          9, // row 1
+          7,
+          8,
+          9,
+          10 // row 2
+      });
+  Tensor b = tf.make(/*sizes=*/{2, 1, 4}, /*data=*/{2, 3, 4, 5, 6, 7, 8, 9});
+
+  Tensor out = tf_bool.zeros({2, 3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      // First slice: b[0,0,:] = [2,3,4,5]
+      true,
+      true,
+      true,
+      true, // [1,2,3,4] <= [2,3,4,5]
+      true,
+      true,
+      true,
+      true, // [2,3,4,5] <= [2,3,4,5]
+      false,
+      false,
+      false,
+      false, // [3,4,5,6] <= [2,3,4,5]
+      // Second slice: b[1,0,:] = [6,7,8,9]
+      true,
+      true,
+      true,
+      true, // [5,6,7,8] <= [6,7,8,9]
+      true,
+      true,
+      true,
+      true, // [6,7,8,9] <= [6,7,8,9]
+      false,
+      false,
+      false,
+      false // [7,8,9,10] <= [6,7,8,9]
+  };
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make({2, 3, 4}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast2dBy1dTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcast2dBy1d: (3, 4) and (4,) -> (3, 4)
+  Tensor a = tf.make(
+      /*sizes=*/{3, 4}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor b = tf.make({4}, {2, 4, 6, 8});
+
+  Tensor out = tf_bool.zeros({3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      true,
+      true,
+      true,
+      true, // [1,2,3,4] <= [2,4,6,8]
+      false,
+      false,
+      false,
+      true, // [5,6,7,8] <= [2,4,6,8]
+      false,
+      false,
+      false,
+      false // [9,10,11,12] <= [2,4,6,8]
+  };
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make({3, 4}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast2dBy1dReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcast2dBy1dReverseArguments: (4,) and (3, 4) -> (3, 4)
+  Tensor a = tf.make({4}, {2, 4, 6, 8});
+  Tensor b = tf.make(
+      /*sizes=*/{3, 4}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  Tensor out = tf_bool.zeros({3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  std::vector<ctype> expected_data = {
+      false,
+      false,
+      false,
+      false, // [2,4,6,8] <= [1,2,3,4]
+      true,
+      true,
+      true,
+      true, // [2,4,6,8] <= [5,6,7,8]
+      true,
+      true,
+      true,
+      true // [2,4,6,8] <= [9,10,11,12]
+  };
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_bool.make({3, 4}, expected_data));
+}
diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
index 433072d78ba..5f09278c500 100644
--- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
@@ -206,8 +206,11 @@ OPTIMIZED_ATEN_OPS = (
     op_target(
         name = "op_le",
         deps = [
+            ":binary_ops",
             "//executorch/kernels/portable/cpu:scalar_utils",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
+            "//executorch/kernels/portable/cpu/pattern:comparison_op",
+            "//executorch/kernels/portable/cpu/util:elementwise_util",
         ],
     ),
     op_target(

From 07cd1023e20c522a9f38d76297415560e3f1d830 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 11 Jun 2025 14:54:11 -0700
Subject: [PATCH 2/6] Update on "[ExecuTorch] Add broadcast support for le op"

The refactored HF repro requires this to support mask generation.

Differential Revision: [D76456398](https://our.internmc.facebook.com/intern/diff/D76456398/)

[ghstack-poisoned]
---
 kernels/optimized/cpu/op_le.cpp |  6 ++-
 kernels/test/op_le_test.cpp     | 77 ++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 2513155aac1..095e6fd0cfb 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -9,11 +9,11 @@
 #include <executorch/kernels/optimized/cpu/binary_ops.h>
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/optimized/vec/vec.h>
+#include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
-#include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
 namespace torch {
 namespace executor {
@@ -83,7 +83,9 @@ Tensor& opt_le_tensor_out(
 
   // Check for optimized broadcast paths
   auto selected_optimized_path = select_optimized_path(a, b, out);
-  printf("selected_optimized_path: %d\n", static_cast<int>(selected_optimized_path));
+  printf(
+      "selected_optimized_path: %d\n",
+      static_cast<int>(selected_optimized_path));
   if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
     // Resize for dynamic shape
     auto error = resize_tensor(out, a.sizes());
diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp
index eebad43ce75..070a397dfbc 100644
--- a/kernels/test/op_le_test.cpp
+++ b/kernels/test/op_le_test.cpp
@@ -204,7 +204,9 @@ TEST_F(OpLeTensorOutTest, Broadcast2DTest) {
   // Row 0: b[0]=2, so [1<=2, 2<=2, 3<=2, ...] = [true, true, false, false, ...]
   // Row 1: b[1]=4, so [1<=4, 2<=4, 3<=4, 4<=4, 5<=4, ...] = [true, true, true,
   // true, false, ...]
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       // Row 0 (b=2): 1<=2, 2<=2, 3<=2, 4<=2, 5<=2, 6<=2, 7<=2, 8<=2, 9<=2,
       // 10<=2
@@ -279,10 +281,7 @@ TEST_F(OpLeTensorOutTest, Broadcast2DTest) {
       true,
       true};
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make(
-          {6, 10}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({6, 10}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, Broadcast1DTo2DTest) {
@@ -298,7 +297,9 @@ TEST_F(OpLeTensorOutTest, Broadcast1DTo2DTest) {
 
   op_le_tensor_out(a, b, out);
 
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       // Row 0 (a=2): 2<=1, 2<=2, 2<=3, 2<=4, 2<=5, 2<=6, 2<=7, 2<=8, 2<=9,
       // 2<=10
@@ -373,10 +374,7 @@ TEST_F(OpLeTensorOutTest, Broadcast1DTo2DTest) {
       false,
       false};
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make(
-          {6, 10}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({6, 10}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, BroadcastReverseTest) {
@@ -393,7 +391,9 @@ TEST_F(OpLeTensorOutTest, BroadcastReverseTest) {
   op_le_tensor_out(a, b, out);
 
   // Expected: each row i should be [a[i]<=1, a[i]<=2, ..., a[i]<=10]
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       // Row 0 (a=2): 2<=1, 2<=2, 2<=3, 2<=4, 2<=5, 2<=6, 2<=7, 2<=8, 2<=9,
       // 2<=10
@@ -468,10 +468,7 @@ TEST_F(OpLeTensorOutTest, BroadcastReverseTest) {
       false,
       false};
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make(
-          {6, 10}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({6, 10}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, BroadcastLastDimTest) {
@@ -553,7 +550,9 @@ TEST_F(OpLeTensorOutTest, BroadcastLastDimTest) {
 
   op_le_tensor_out(a, b, out);
 
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       // First slice: a values are 1,2,3,4
       true,
@@ -617,12 +616,10 @@ TEST_F(OpLeTensorOutTest, BroadcastLastDimTest) {
       true,
       true,
       true,
-      true// 12 <= [12,13,14,15,16]
+      true // 12 <= [12,13,14,15,16]
   };
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make({3, 4, 5}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4, 5}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, BroadcastLastDimReverseTest) {
@@ -706,7 +703,9 @@ TEST_F(OpLeTensorOutTest, BroadcastLastDimReverseTest) {
 
   op_le_tensor_out(a, b, out);
 
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       // First slice: b values are all 5
       true,
@@ -773,9 +772,7 @@ TEST_F(OpLeTensorOutTest, BroadcastLastDimReverseTest) {
       false // [12,13,14,15,16] <= 15
   };
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make({3, 4, 5}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4, 5}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, BroadcastNdByNdTest) {
@@ -819,7 +816,9 @@ TEST_F(OpLeTensorOutTest, BroadcastNdByNdTest) {
 
   op_le_tensor_out(a, b, out);
 
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       // First slice: a[0,0,:] = [1,2,3,4]
       true,
@@ -849,9 +848,7 @@ TEST_F(OpLeTensorOutTest, BroadcastNdByNdTest) {
       true // [5,6,7,8] <= [7,8,9,10]
   };
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make({2, 3, 4}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({2, 3, 4}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, BroadcastNdByNdReverseTest) {
@@ -896,7 +893,9 @@ TEST_F(OpLeTensorOutTest, BroadcastNdByNdReverseTest) {
 
   op_le_tensor_out(a, b, out);
 
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       // First slice: b[0,0,:] = [2,3,4,5]
       true,
@@ -926,9 +925,7 @@ TEST_F(OpLeTensorOutTest, BroadcastNdByNdReverseTest) {
       false // [7,8,9,10] <= [6,7,8,9]
   };
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make({2, 3, 4}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({2, 3, 4}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, Broadcast2dBy1dTest) {
@@ -944,7 +941,9 @@ TEST_F(OpLeTensorOutTest, Broadcast2dBy1dTest) {
 
   op_le_tensor_out(a, b, out);
 
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       true,
       true,
@@ -960,9 +959,7 @@ TEST_F(OpLeTensorOutTest, Broadcast2dBy1dTest) {
       false // [9,10,11,12] <= [2,4,6,8]
   };
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make({3, 4}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4}, expected_data));
 }
 
 TEST_F(OpLeTensorOutTest, Broadcast2dBy1dReverseTest) {
@@ -978,7 +975,9 @@ TEST_F(OpLeTensorOutTest, Broadcast2dBy1dReverseTest) {
 
   op_le_tensor_out(a, b, out);
 
-  using ctype = executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<ScalarType::Bool>::ctype; 
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
   std::vector<ctype> expected_data = {
       false,
       false,
@@ -994,7 +993,5 @@ TEST_F(OpLeTensorOutTest, Broadcast2dBy1dReverseTest) {
       true // [2,4,6,8] <= [9,10,11,12]
   };
 
-  EXPECT_TENSOR_EQ(
-      out,
-      tf_bool.make({3, 4}, expected_data));
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4}, expected_data));
 }

From ccda16eb24ddfae25cab6f4145dedc0dedc9d9ed Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 11 Jun 2025 16:16:53 -0700
Subject: [PATCH 3/6] Update on "[ExecuTorch] Add broadcast support for le op"

The refactored hf repro requires this to support mask generation.

Differential Revision: [D76456398](https://our.internmc.facebook.com/intern/diff/D76456398/)

[ghstack-poisoned]
---
 kernels/optimized/cpu/op_le.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 095e6fd0cfb..f5dad1c2c98 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -83,9 +83,6 @@ Tensor& opt_le_tensor_out(
 
   // Check for optimized broadcast paths
   auto selected_optimized_path = select_optimized_path(a, b, out);
-  printf(
-      "selected_optimized_path: %d\n",
-      static_cast<int>(selected_optimized_path));
   if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
     // Resize for dynamic shape
     auto error = resize_tensor(out, a.sizes());

From eeb437591c2c7516cb309a3a91e1bc4f0cc19faa Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 11 Jun 2025 17:34:44 -0700
Subject: [PATCH 4/6] Update on "[ExecuTorch] Add broadcast support for le op"

The refactored hf repro requires this to support mask generation.

Differential Revision: [D76456398](https://our.internmc.facebook.com/intern/diff/D76456398/)

[ghstack-poisoned]
---
 kernels/optimized/cpu/op_le.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index f5dad1c2c98..0603fdf4716 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -105,7 +105,6 @@ Tensor& opt_le_tensor_out(
   } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
     // Handle optimized broadcast cases
     ET_SWITCH_REALB_TYPES(out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
-      using Vec = executorch::vec::Vectorized<CTYPE>;
       auto le_lambda = [](auto x, auto y) { return x.le(y); };
       return torch::executor::handle_broadcast_elementwise<CTYPE>(
           ctx, le_lambda, a, b, out, selected_optimized_path);

From 0e17f38c9c2782d31b0491aaf0815f2de4235fee Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Thu, 12 Jun 2025 10:05:28 -0700
Subject: [PATCH 5/6] Update on "[ExecuTorch] Add broadcast support for le op"

The refactored hf repro requires this to support mask generation.

Differential Revision: [D76456398](https://our.internmc.facebook.com/intern/diff/D76456398/)

[ghstack-poisoned]
---
 kernels/optimized/cpu/op_le.cpp |  2 +-
 kernels/test/op_le_test.cpp     | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 0603fdf4716..1ce45558c6a 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -85,7 +85,7 @@ Tensor& opt_le_tensor_out(
   auto selected_optimized_path = select_optimized_path(a, b, out);
   if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
     // Resize for dynamic shape
-    auto error = resize_tensor(out, a.sizes());
+    auto error = resize_to_broadcast_target_size(a, b, out);
     ET_KERNEL_CHECK_MSG(
         ctx,
         error == Error::Ok,
diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp
index 070a397dfbc..d96d87be596 100644
--- a/kernels/test/op_le_test.cpp
+++ b/kernels/test/op_le_test.cpp
@@ -962,6 +962,35 @@ TEST_F(OpLeTensorOutTest, Broadcast2dBy1dTest) {
   EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4}, expected_data));
 }
 
+TEST_F(OpLeTensorOutTest, Broadcast1DTo2DShapeTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Checks le on a of shape (6,) against b of shape (1, 6) with a (1, 6) out.
+  Tensor a = tf.make({6}, {1, 3, 5, 7, 9, 11});
+  Tensor b = tf.make({1, 6}, {2, 4, 6, 8, 10, 12});
+
+  Tensor out = tf_bool.zeros({1, 6});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: out[0,i] = (a[i] <= b[0,i]) for all i
+  // [1, 3, 5, 7, 9, 11] <= [2, 4, 6, 8, 10, 12] -> every element is true
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      true, // 1 <= 2
+      true, // 3 <= 4
+      true, // 5 <= 6
+      true, // 7 <= 8
+      true, // 9 <= 10
+      true // 11 <= 12
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({1, 6}, expected_data));
+}
+
 TEST_F(OpLeTensorOutTest, Broadcast2dBy1dReverseTest) {
   TensorFactory<ScalarType::Int> tf;
   TensorFactory<ScalarType::Bool> tf_bool;

From 35c8c8d0fa8f5a72a5027738870b3090d640ff2d Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Thu, 12 Jun 2025 10:07:12 -0700
Subject: [PATCH 6/6] Update on "[ExecuTorch] Add broadcast support for le op"

The refactored hf repro requires this to support mask generation.

Differential Revision: [D76456398](https://our.internmc.facebook.com/intern/diff/D76456398/)

[ghstack-poisoned]
---
 kernels/test/op_le_test.cpp | 90 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp
index d96d87be596..d8ecec11c46 100644
--- a/kernels/test/op_le_test.cpp
+++ b/kernels/test/op_le_test.cpp
@@ -991,7 +991,95 @@ TEST_F(OpLeTensorOutTest, Broadcast1DTo2DShapeTest) {
   EXPECT_TENSOR_EQ(out, tf_bool.make({1, 6}, expected_data));
 }
 
-TEST_F(OpLeTensorOutTest, Broadcast2dBy1dReverseTest) {
+TEST_F(OpLeTensorOutTest, Broadcast2DBy1DShapeTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Checks le on a of shape (10,) against b of shape (6, 1) with a (6, 10) out.
+  Tensor a = tf.make({10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  Tensor b = tf.make({6, 1}, {2, 4, 6, 8, 10, 12});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: out[i,j] = (a[j] <= b[i,0]) for all i,j
+  // Row i is [a[0]<=b[i,0], a[1]<=b[i,0], ..., a[9]<=b[i,0]], in row-major order
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // Row 0 (b=2): [1,2,3,4,5,6,7,8,9,10] <= 2 -> 2 true, then 8 false
+      true,
+      true,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      // Row 1 (b=4): [1,2,3,4,5,6,7,8,9,10] <= 4 -> 4 true, then 6 false
+      true,
+      true,
+      true,
+      true,
+      false,
+      false,
+      false,
+      false,
+      false,
+      false,
+      // Row 2 (b=6): [1,2,3,4,5,6,7,8,9,10] <= 6 -> 6 true, then 4 false
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      false,
+      false,
+      false,
+      false,
+      // Row 3 (b=8): [1,2,3,4,5,6,7,8,9,10] <= 8 -> 8 true, then 2 false
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      false,
+      false,
+      // Row 4 (b=10): [1,2,3,4,5,6,7,8,9,10] <= 10 -> all 10 true
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      // Row 5 (b=12): [1,2,3,4,5,6,7,8,9,10] <= 12 -> all 10 true
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true};
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast22dBy1dReverseTest) {
   TensorFactory<ScalarType::Int> tf;
   TensorFactory<ScalarType::Bool> tf_bool;