diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 8cadd40a8da..4aeeb69323f 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -8,6 +8,8 @@
 #include
 #include
+#include <executorch/kernels/optimized/cpu/binary_ops.h>
+#include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 #include
 #include
 #include
@@ -79,52 +81,39 @@ Tensor& opt_le_tensor_out(
     return out;
   }

-  ET_KERNEL_CHECK(ctx, tensors_have_same_shape(a, b), InvalidArgument, out);
-
-  // Resize for dynamic shape
-  auto error = resize_tensor(out, a.sizes());
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      error == Error::Ok,
-      InvalidArgument,
-      out,
-      "Failed to resize output tensor.");
-
-  if (a_type == b_type && a_type == out_type) {
-    ET_SWITCH_REAL_TYPES_AND(
-        Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
-          using Vec = at::vec::Vectorized<CTYPE>;
-          at::vec::map2<CTYPE>(
-              [](Vec x, Vec y) { return x.le(y); },
-              out.mutable_data_ptr<CTYPE>(),
-              a.const_data_ptr<CTYPE>(),
-              b.const_data_ptr<CTYPE>(),
-              a.numel());
-        });
+  // Check for optimized broadcast paths
+  auto selected_optimized_path = select_optimized_path(a, b, out);
+  if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
+    // Resize for dynamic shape
+    auto error = resize_to_broadcast_target_size(a, b, out);
+    ET_KERNEL_CHECK_MSG(
+        ctx,
+        error == Error::Ok,
+        InvalidArgument,
+        out,
+        "Failed to resize output tensor.");
+
+    ET_SWITCH_REALB_TYPES(a_type, ctx, "le.Tensor_out", CTYPE, [&]() {
+      using Vec = at::vec::Vectorized<CTYPE>;
+      at::vec::map2<CTYPE>(
+          [](Vec x, Vec y) { return x.le(y); },
+          out.mutable_data_ptr<CTYPE>(),
+          a.const_data_ptr<CTYPE>(),
+          b.const_data_ptr<CTYPE>(),
+          out.numel());
+    });
+  } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
+    // Handle optimized broadcast cases
+    ET_SWITCH_REALB_TYPES(out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
+      auto le_lambda = [](auto x, auto y) { return x.le(y); };
+      return torch::executor::handle_broadcast_elementwise<CTYPE>(
+          ctx, le_lambda, a, b, out, selected_optimized_path);
+    });
   } else {
-    ET_SWITCH_REAL_TYPES_AND(
-        Bool, a_type, ctx, "le.Tensor_out", CTYPE_A, [&]() {
-          ET_SWITCH_REAL_TYPES_AND(
-              Bool, b_type, ctx, "le.Tensor_out", CTYPE_B, [&]() {
-                using CTYPE_IN = typename torch::executor::
-                    promote_types<CTYPE_A, CTYPE_B>::type;
-                ET_DCHECK(
-                    CppTypeToScalarType<CTYPE_IN>::value ==
-                    promoteTypes(a_type, b_type));
-                ET_SWITCH_REAL_TYPES_AND(
-                    Bool, out_type, ctx, "le.Tensor_out", CTYPE_OUT, [&]() {
-                      const size_t n = a.numel();
-                      const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                      const CTYPE_B* b_data = b.const_data_ptr<CTYPE_B>();
-                      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-                      for (auto i = 0; i < n; ++i) {
-                        out_data[i] = static_cast<CTYPE_OUT>(
-                            static_cast<CTYPE_IN>(a_data[i]) <=
-                            static_cast<CTYPE_IN>(b_data[i]));
-                      }
-                    });
-              });
-        });
+    // @lint-ignore CLANGTIDY facebook-hte-CArray
+    static constexpr const char op_name[] = "le.Tensor_out";
+    return internal::comparison_tensor_out<std::less_equal, op_name>(
+        ctx, a, b, out);
   }

   return out;
diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp
index bcd40d24d89..d8ecec11c46 100644
--- a/kernels/test/op_le_test.cpp
+++ b/kernels/test/op_le_test.cpp
@@ -186,3 +186,929 @@ TEST_F(OpLeTensorOutTest, BroadcastTest) {
   op_le_tensor_out(a, b, out);
   EXPECT_TENSOR_EQ(out, tf.make({1, 4}, {true, true, true, false}));
 }
+
+TEST_F(OpLeTensorOutTest, Broadcast2DTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (1, 10) and (6, 1) -> (6, 10)
+  Tensor a =
+      tf.make(/*sizes=*/{1, 10}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  Tensor b = tf.make({6, 1}, {2, 4, 6, 8, 10, 12});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: each row i should be
+  // [1<=b[i], 2<=b[i], ..., 10<=b[i]]
+  // Row 0: b[0]=2, so [1<=2, 2<=2, 3<=2, ...] = [true, true, false, false, ...]
+  // Row 1: b[1]=4, so [1<=4, 2<=4, 3<=4, 4<=4, 5<=4, ...] = [true, true, true, true, false, ...]
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // Row 0 (b=2): 1<=2, 2<=2, 3<=2, 4<=2, 5<=2, 6<=2, 7<=2, 8<=2, 9<=2, 10<=2
+      true, true, false, false, false, false, false, false, false, false,
+      // Row 1 (b=4): 1<=4, 2<=4, 3<=4, 4<=4, 5<=4, 6<=4, 7<=4, 8<=4, 9<=4, 10<=4
+      true, true, true, true, false, false, false, false, false, false,
+      // Row 2 (b=6): 1<=6, 2<=6, 3<=6, 4<=6, 5<=6, 6<=6, 7<=6, 8<=6, 9<=6, 10<=6
+      true, true, true, true, true, true, false, false, false, false,
+      // Row 3 (b=8): 1<=8, 2<=8, 3<=8, 4<=8, 5<=8, 6<=8, 7<=8, 8<=8, 9<=8, 10<=8
+      true, true, true, true, true, true, true, true, false, false,
+      // Row 4 (b=10): 1<=10, 2<=10, 3<=10, 4<=10, 5<=10, 6<=10, 7<=10, 8<=10, 9<=10, 10<=10
+      true, true, true, true, true, true, true, true, true, true,
+      // Row 5 (b=12): 1<=12, 2<=12, 3<=12, 4<=12, 5<=12, 6<=12, 7<=12, 8<=12, 9<=12, 10<=12
+      true, true, true, true, true, true, true, true, true, true};
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast1DTo2DTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (6, 1) and (1, 10) -> (6, 10)
+  Tensor a = tf.make({6, 1}, {2, 4, 6, 8, 10, 12});
+  Tensor b =
+      tf.make(/*sizes=*/{1, 10}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // Row 0 (a=2): 2<=1, 2<=2, 2<=3, 2<=4, 2<=5, 2<=6, 2<=7, 2<=8, 2<=9, 2<=10
+      false, true, true, true, true, true, true, true, true, true,
+      // Row 1 (a=4): 4<=1, 4<=2, 4<=3, 4<=4, 4<=5, 4<=6, 4<=7, 4<=8, 4<=9, 4<=10
+      false, false, false, true, true, true, true, true, true, true,
+      // Row 2 (a=6): 6<=1, 6<=2, 6<=3, 6<=4, 6<=5, 6<=6, 6<=7, 6<=8, 6<=9, 6<=10
+      false, false, false, false, false, true, true, true, true, true,
+      // Row 3 (a=8): 8<=1, 8<=2, 8<=3, 8<=4, 8<=5, 8<=6, 8<=7, 8<=8, 8<=9, 8<=10
+      false, false, false, false, false, false, false, true, true, true,
+      // Row 4 (a=10): 10<=1, 10<=2, 10<=3, 10<=4, 10<=5, 10<=6, 10<=7, 10<=8, 10<=9, 10<=10
+      false, false, false, false, false, false, false, false, false, true,
+      // Row 5 (a=12): 12<=1, 12<=2, 12<=3, 12<=4, 12<=5, 12<=6, 12<=7, 12<=8, 12<=9, 12<=10
+      false, false, false, false, false, false, false, false, false, false};
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (6, 1) and (1, 10) -> (6, 10) (reverse of the first broadcast
+  // test)
+  Tensor a = tf.make(/*sizes=*/{6, 1}, /*data=*/{2, 4, 6, 8, 10, 12});
+  Tensor b = tf.make({1, 10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: each row i should be [a[i]<=1, a[i]<=2, ..., a[i]<=10]
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // Row 0 (a=2): 2<=1, 2<=2, 2<=3, 2<=4, 2<=5, 2<=6, 2<=7, 2<=8, 2<=9, 2<=10
+      false, true, true, true, true, true, true, true, true, true,
+      // Row 1 (a=4): 4<=1, 4<=2, 4<=3, 4<=4, 4<=5, 4<=6, 4<=7, 4<=8, 4<=9, 4<=10
+      false, false, false, true, true, true, true, true, true, true,
+      // Row 2 (a=6): 6<=1, 6<=2, 6<=3, 6<=4, 6<=5, 6<=6, 6<=7, 6<=8, 6<=9, 6<=10
+      false, false, false, false, false, true, true, true, true, true,
+      // Row 3 (a=8): 8<=1, 8<=2, 8<=3, 8<=4, 8<=5, 8<=6, 8<=7, 8<=8, 8<=9, 8<=10
+      false, false, false, false, false, false, false, true, true, true,
+      // Row 4 (a=10): 10<=1, 10<=2, 10<=3, 10<=4, 10<=5, 10<=6, 10<=7, 10<=8, 10<=9, 10<=10
+      false, false, false, false, false, false, false, false, false, true,
+      // Row 5 (a=12): 12<=1, 12<=2, 12<=3, 12<=4, 12<=5, 12<=6, 12<=7, 12<=8, 12<=9, 12<=10
+      false, false, false, false, false, false, false, false, false, false};
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastLastDimTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastLastDim: (3, 4, 1) and (3, 4, 5) -> (3, 4, 5)
+  Tensor a = tf.make(
+      /*sizes=*/{3, 4, 1}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor b = tf.make(
+      {3, 4, 5},
+      {
+          // First 4x5 slice
+          1, 2, 3, 4, 5, // row 0
+          2, 3, 4, 5, 6, // row 1
+          3, 4, 5, 6, 7, // row 2
+          4, 5, 6, 7, 8, // row 3
+          // Second 4x5 slice
+          5, 6, 7, 8, 9, // row 0
+          6, 7, 8, 9, 10, // row 1
+          7, 8, 9, 10, 11, // row 2
+          8, 9, 10, 11, 12, // row 3
+          // Third 4x5 slice
+          9, 10, 11, 12, 13, // row 0
+          10, 11, 12, 13, 14, // row 1
+          11, 12, 13, 14, 15, // row 2
+          12, 13, 14, 15, 16 // row 3
+      });
+
+  Tensor out = tf_bool.zeros({3, 4, 5});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // First slice: a values are 1,2,3,4
+      true, true, true, true, true, // 1 <= [1,2,3,4,5]
+      true, true, true, true, true, // 2 <= [2,3,4,5,6]
+      true, true, true, true, true, // 3 <= [3,4,5,6,7]
+      true, true, true, true, true, // 4 <= [4,5,6,7,8]
+      // Second slice: a values are 5,6,7,8
+      true, true, true, true, true, // 5 <= [5,6,7,8,9]
+      true, true, true, true, true, // 6 <= [6,7,8,9,10]
+      true, true, true, true, true, // 7 <= [7,8,9,10,11]
+      true, true, true, true, true, // 8 <= [8,9,10,11,12]
+      // Third slice: a values are 9,10,11,12
+      true, true, true, true, true, // 9 <= [9,10,11,12,13]
+      true, true, true, true, true, // 10 <= [10,11,12,13,14]
+      true, true, true, true, true, // 11 <= [11,12,13,14,15]
+      true, true, true, true, true // 12 <= [12,13,14,15,16]
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4, 5}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastLastDimReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastLastDimReverseArguments: (3, 4, 5) and (3, 4, 1) ->
+  // (3, 4, 5)
+  Tensor a = tf.make(
+      {3, 4, 5},
+      {
+          // First 4x5 slice
+          1, 2, 3, 4, 5, // row 0
+          2, 3, 4, 5, 6, // row 1
+          3, 4,
+          5, 6, 7, // row 2
+          4, 5, 6, 7, 8, // row 3
+          // Second 4x5 slice
+          5, 6, 7, 8, 9, // row 0
+          6, 7, 8, 9, 10, // row 1
+          7, 8, 9, 10, 11, // row 2
+          8, 9, 10, 11, 12, // row 3
+          // Third 4x5 slice
+          9, 10, 11, 12, 13, // row 0
+          10, 11, 12, 13, 14, // row 1
+          11, 12, 13, 14, 15, // row 2
+          12, 13, 14, 15, 16 // row 3
+      });
+  Tensor b = tf.make(
+      /*sizes=*/{3, 4, 1},
+      /*data=*/{5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15});
+
+  Tensor out = tf_bool.zeros({3, 4, 5});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // First slice: b values are all 5
+      true, true, true, true, true, // [1,2,3,4,5] <= 5
+      true, true, true, true, false, // [2,3,4,5,6] <= 5
+      true, true, true, false, false, // [3,4,5,6,7] <= 5
+      true, true, false, false, false, // [4,5,6,7,8] <= 5
+      // Second slice: b values are all 10
+      true, true, true, true, true, // [5,6,7,8,9] <= 10
+      true, true, true, true, true, // [6,7,8,9,10] <= 10
+      true, true, true, true, false, // [7,8,9,10,11] <= 10
+      true, true, true, false, false, // [8,9,10,11,12] <= 10
+      // Third slice: b values are all 15
+      true, true, true, true, true, // [9,10,11,12,13] <= 15
+      true, true, true, true, true, // [10,11,12,13,14] <= 15
+      true, true, true, true, true, // [11,12,13,14,15] <= 15
+      true, true, true, true, false // [12,13,14,15,16] <= 15
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4, 5}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastNdByNdTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastNdByNd: (2, 1, 4) and (2, 3, 4) -> (2, 3, 4)
+  Tensor a = tf.make(/*sizes=*/{2, 1, 4}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8});
+  Tensor b = tf.make(
+      {2, 3, 4},
+      {
+          // First 3x4 slice
+          1, 2, 3, 4, // row 0
+          2, 3, 4, 5, // row 1
+          3, 4, 5, 6, // row 2
+          // Second 3x4 slice
+          5, 6, 7, 8, // row 0
+          6, 7, 8, 9, // row 1
+          7, 8, 9, 10 // row 2
+      });
+
+  Tensor out = tf_bool.zeros({2, 3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // First slice: a[0,0,:] = [1,2,3,4]
+      true, true, true, true, // [1,2,3,4] <= [1,2,3,4]
+      true, true, true, true, // [1,2,3,4] <= [2,3,4,5]
+      true, true, true, true, // [1,2,3,4] <= [3,4,5,6]
+      // Second slice: a[1,0,:] = [5,6,7,8]
+      true, true, true, true, // [5,6,7,8] <= [5,6,7,8]
+      true, true, true, true, // [5,6,7,8] <= [6,7,8,9]
+      true, true, true, true // [5,6,7,8] <= [7,8,9,10]
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({2, 3, 4}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, BroadcastNdByNdReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcastNdByNdReverseArguments: (2, 3, 4) and (2, 1, 4) ->
+  // (2, 3, 4)
+  Tensor a = tf.make(
+      {2, 3, 4},
+      {
+          // First 3x4 slice
+          1, 2, 3, 4, // row 0
+          2, 3, 4, 5, // row 1
+          3, 4, 5, 6, // row 2
+          // Second 3x4 slice
+          5, 6, 7, 8, // row 0
+          6, 7, 8, 9, // row 1
+          7, 8, 9, 10 // row 2
+      });
+  Tensor b = tf.make(/*sizes=*/{2, 1, 4}, /*data=*/{2, 3, 4, 5, 6, 7, 8, 9});
+
+  Tensor out = tf_bool.zeros({2, 3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // First slice: b[0,0,:] = [2,3,4,5]
+      true, true, true, true, // [1,2,3,4] <= [2,3,4,5]
+      true, true, true, true, // [2,3,4,5] <= [2,3,4,5]
+      false, false, false, false, // [3,4,5,6] <= [2,3,4,5]
+      // Second slice: b[1,0,:] = [6,7,8,9]
+      true, true, true, true, // [5,6,7,8] <= [6,7,8,9]
+      true, true, true, true, // [6,7,8,9] <= [6,7,8,9]
+      false, false, false, false // [7,8,9,10] <= [6,7,8,9]
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({2, 3, 4}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast2dBy1dTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcast2dBy1d: (3, 4) and (4,) -> (3, 4)
+  Tensor a = tf.make(
+      /*sizes=*/{3, 4}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor b = tf.make({4}, {2, 4, 6, 8});
+
+  Tensor out = tf_bool.zeros({3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      true, true, true, true, // [1,2,3,4] <= [2,4,6,8]
+      false, false, false, true, // [5,6,7,8] <= [2,4,6,8]
+      false, false, false, false // [9,10,11,12] <= [2,4,6,8]
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast1DTo2DShapeTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (6,) and (1, 6) -> (1, 6)
+  Tensor a = tf.make({6}, {1, 3, 5, 7, 9, 11});
+  Tensor b = tf.make({1, 6}, {2, 4, 6, 8, 10, 12});
+
+  Tensor out = tf_bool.zeros({1, 6});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: a[i] <= b[0,i] for all i
+  // [1, 3, 5, 7, 9, 11] <= [2, 4, 6, 8, 10, 12]
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      true, // 1 <= 2
+      true, // 3 <= 4
+      true, // 5 <= 6
+      true, // 7 <= 8
+      true, // 9 <= 10
+      true // 11 <= 12
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({1, 6}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast2DBy1DShapeTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: (10,) and (6, 1) -> (6, 10)
+  Tensor a = tf.make({10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  Tensor b = tf.make({6, 1}, {2, 4, 6, 8, 10, 12});
+
+  Tensor out = tf_bool.zeros({6, 10});
+
+  op_le_tensor_out(a, b, out);
+
+  // Expected: a[j] <= b[i,0] for all i,j
+  // Each row i should be [a[0]<=b[i,0], a[1]<=b[i,0], ..., a[9]<=b[i,0]]
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      // Row 0 (b=2): [1,2,3,4,5,6,7,8,9,10] <= 2
+      true, true, false, false, false, false, false, false, false, false,
+      // Row 1 (b=4): [1,2,3,4,5,6,7,8,9,10] <= 4
+      true, true, true, true, false, false, false, false, false, false,
+      // Row 2 (b=6): [1,2,3,4,5,6,7,8,9,10] <= 6
+      true, true, true, true, true, true, false, false, false, false,
+      // Row 3 (b=8): [1,2,3,4,5,6,7,8,9,10] <= 8
+      true, true, true, true, true, true, true, true, false, false,
+      // Row 4 (b=10): [1,2,3,4,5,6,7,8,9,10] <= 10
+      true, true, true, true, true, true, true, true, true, true,
+      // Row 5 (b=12): [1,2,3,4,5,6,7,8,9,10] <= 12
+      true, true, true, true, true, true, true, true, true, true};
+
+  EXPECT_TENSOR_EQ(out,
+      tf_bool.make({6, 10}, expected_data));
+}
+
+TEST_F(OpLeTensorOutTest, Broadcast2dBy1dReverseTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case for kBroadcast2dBy1dReverseArguments: (4,) and (3, 4) -> (3, 4)
+  Tensor a = tf.make({4}, {2, 4, 6, 8});
+  Tensor b = tf.make(
+      /*sizes=*/{3, 4}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  Tensor out = tf_bool.zeros({3, 4});
+
+  op_le_tensor_out(a, b, out);
+
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data = {
+      false, false, false, false, // [2,4,6,8] <= [1,2,3,4]
+      true, true, true, true, // [2,4,6,8] <= [5,6,7,8]
+      true, true, true, true // [2,4,6,8] <= [9,10,11,12]
+  };
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4}, expected_data));
+}
diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
index a95c3f6b368..4b49e966b9b 100644
--- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
@@ -218,8 +218,11 @@ OPTIMIZED_ATEN_OPS = (
     op_target(
         name = "op_le",
         deps = [
+            ":binary_ops",
             "//executorch/kernels/portable/cpu:scalar_utils",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
+            "//executorch/kernels/portable/cpu/pattern:comparison_op",
+            "//executorch/kernels/portable/cpu/util:elementwise_util",
             "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
         ],
     ),
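
For reference, the expected_data tables above can be cross-checked with plain loops. The following standalone sketch is illustrative only (it does not use the ExecuTorch API or TensorFactory); it recomputes the (1, 10) <= (6, 1) broadcast from Broadcast2DTest and prints the resulting 6x10 boolean grid.

// Standalone sketch of the broadcast semantics exercised by Broadcast2DTest.
// Illustrative only; out[i][j] = a[j] <= b[i], matching out = le(a, b) for
// a of shape (1, 10) and b of shape (6, 1).
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int> a = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; // shape (1, 10)
  const std::vector<int> b = {2, 4, 6, 8, 10, 12};            // shape (6, 1)
  std::vector<bool> out(6 * 10);                              // shape (6, 10)

  // Broadcast: row index comes from b, column index comes from a.
  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 10; ++j) {
      out[i * 10 + j] = a[j] <= b[i];
    }
  }

  // Print one row of the output per line, in the same order as expected_data.
  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 10; ++j) {
      std::printf("%s%s", out[i * 10 + j] ? "true" : "false",
                  j == 9 ? "\n" : ", ");
    }
  }
  return 0;
}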