From 28b06a4221fc233f73c7ca97946227d6539bb654 Mon Sep 17 00:00:00 2001 From: Surbhi Jain Date: Tue, 12 Sep 2023 22:20:48 +0000 Subject: [PATCH 1/3] Add default node pool to GKE cluster --- .../kubernetes/terraform/examples/v5e/main.tf | 2 ++ .../terraform/examples/v5e/terraform.tfvars | 7 +++++ tools/kubernetes/terraform/module/main.tf | 28 +++++++++++++++++++ .../terraform/module/terraform.tfvars | 7 +++++ .../kubernetes/terraform/module/variables.tf | 17 +++++++++++ 5 files changed, 61 insertions(+) diff --git a/tools/kubernetes/terraform/examples/v5e/main.tf b/tools/kubernetes/terraform/examples/v5e/main.tf index c3b6990c..5f2a84a9 100644 --- a/tools/kubernetes/terraform/examples/v5e/main.tf +++ b/tools/kubernetes/terraform/examples/v5e/main.tf @@ -2,6 +2,7 @@ variable "project_id" {} variable "resource_name_prefix" {} variable "region" {} variable "tpu_node_pools" {} +variable "default_pool" {} variable "maintenance_interval" {} @@ -11,5 +12,6 @@ module "tpu-gke" { resource_name_prefix = var.resource_name_prefix region = var.region tpu_node_pools = var.tpu_node_pools + default_pool = var.default_pool maintenance_interval = var.maintenance_interval } diff --git a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/examples/v5e/terraform.tfvars index a7f7b98c..481b2288 100644 --- a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars +++ b/tools/kubernetes/terraform/examples/v5e/terraform.tfvars @@ -26,4 +26,11 @@ tpu_node_pools = [{ topology = "16x16" policy = "sb-compact-1" }] +default_pool = { + zone = ["us-east5-a", "us-east5-b", "us-east5-c"] + machine_type = "e2-standard-32", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} maintenance_interval = "PERIODIC" diff --git a/tools/kubernetes/terraform/module/main.tf b/tools/kubernetes/terraform/module/main.tf index 6c551df3..c7766194 100644 --- a/tools/kubernetes/terraform/module/main.tf +++ 
b/tools/kubernetes/terraform/module/main.tf @@ -117,3 +117,31 @@ resource "google_container_node_pool" "multihost_tpu" { policy_name = var.tpu_node_pools[count.index].policy } } + +resource "google_container_node_pool" "default_pool" { + provider = google-beta + project = var.project_id + name = "default-pool" + location = var.region + node_locations = var.default_pool.zone + cluster = google_container_cluster.tpu_cluster.name + initial_node_count = var.default_pool.initial_node_count_per_zone + autoscaling { + min_node_count = var.default_pool.min_node_count_per_zone + max_node_count = var.default_pool.max_node_count_per_zone + } + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + spot = true + machine_type = var.default_pool.machine_type + + metadata = { + disable-legacy-endpoints = "true" + } + gcfs_config { + enabled = true + } + } +} diff --git a/tools/kubernetes/terraform/module/terraform.tfvars b/tools/kubernetes/terraform/module/terraform.tfvars index a24e1f9c..b899075a 100644 --- a/tools/kubernetes/terraform/module/terraform.tfvars +++ b/tools/kubernetes/terraform/module/terraform.tfvars @@ -18,4 +18,11 @@ tpu_node_pools = [{ machine_type = "ct4p-hightpu-4t" topology = "2x2x2" }] +default_pool = { + zone = ["us-central2-a", "us-central2-b", "us-central2-c"] + machine_type = "e2-standard-32", + initial_node_count_per_zone = 1, + min_node_count_per_zone = 1, + max_node_count_per_zone = 10 +} maintenance_interval = "AS_NEEDED" diff --git a/tools/kubernetes/terraform/module/variables.tf b/tools/kubernetes/terraform/module/variables.tf index 35f460aa..0ecc6dc0 100644 --- a/tools/kubernetes/terraform/module/variables.tf +++ b/tools/kubernetes/terraform/module/variables.tf @@ -38,6 +38,23 @@ variable "tpu_node_pools" { })) } +variable "default_pool" { + description = "default nodepool config" + type = object({ + zone = list(string), + machine_type = string, + initial_node_count_per_zone = number, + min_node_count_per_zone = 
number, + max_node_count_per_zone = number + }) + validation { + condition = ( + (var.default_pool.min_node_count_per_zone >=0 && var.default_pool.min_node_count_per_zone <= var.default_pool.max_node_count_per_zone) + ) + error_message = "default_pool.min_node_count_per_zone must be >= 0 and <= default_pool.max_node_count_per_zone." + } +} + variable "maintenance_interval" { default = "AS_NEEDED" description = "maintenance interval for TPU machines." From 64b4aecaa3563314574e9c838a6c69d1ce716be9 Mon Sep 17 00:00:00 2001 From: Surbhi Jain Date: Tue, 12 Sep 2023 22:20:48 +0000 Subject: [PATCH 2/3] Do not use Spot VMs for the default node pool --- tools/kubernetes/terraform/module/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/kubernetes/terraform/module/main.tf b/tools/kubernetes/terraform/module/main.tf index c7766194..56de5c16 100644 --- a/tools/kubernetes/terraform/module/main.tf +++ b/tools/kubernetes/terraform/module/main.tf @@ -134,7 +134,6 @@ resource "google_container_node_pool" "default_pool" { oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" ] - spot = true machine_type = var.default_pool.machine_type metadata = { From 8dcfb18862e8769b56de8daaf5ec6ab61997a574 Mon Sep 17 00:00:00 2001 From: Surbhi Jain Date: Tue, 12 Sep 2023 22:20:48 +0000 Subject: [PATCH 3/3] Rename default_pool to cpu_node_pool and use n2-standard-64 --- tools/kubernetes/terraform/examples/v5e/main.tf | 4 ++-- .../terraform/examples/v5e/terraform.tfvars | 4 ++-- tools/kubernetes/terraform/module/main.tf | 14 +++++++------- tools/kubernetes/terraform/module/terraform.tfvars | 4 ++-- tools/kubernetes/terraform/module/variables.tf | 8 ++++---- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tools/kubernetes/terraform/examples/v5e/main.tf b/tools/kubernetes/terraform/examples/v5e/main.tf index 5f2a84a9..304251dc 100644 --- a/tools/kubernetes/terraform/examples/v5e/main.tf +++ b/tools/kubernetes/terraform/examples/v5e/main.tf @@ -2,7 +2,7 @@ variable "project_id" {} variable 
"resource_name_prefix" {} variable "region" {} variable "tpu_node_pools" {} -variable "default_pool" {} +variable "cpu_node_pool" {} variable "maintenance_interval" {} @@ -12,6 +12,6 @@ module "tpu-gke" { resource_name_prefix = var.resource_name_prefix region = var.region tpu_node_pools = var.tpu_node_pools - default_pool = var.default_pool + cpu_node_pool = var.cpu_node_pool maintenance_interval = var.maintenance_interval } diff --git a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars b/tools/kubernetes/terraform/examples/v5e/terraform.tfvars index 481b2288..5330e786 100644 --- a/tools/kubernetes/terraform/examples/v5e/terraform.tfvars +++ b/tools/kubernetes/terraform/examples/v5e/terraform.tfvars @@ -26,9 +26,9 @@ tpu_node_pools = [{ topology = "16x16" policy = "sb-compact-1" }] -default_pool = { +cpu_node_pool = { zone = ["us-east5-a", "us-east5-b", "us-east5-c"] - machine_type = "e2-standard-32", + machine_type = "n2-standard-64", initial_node_count_per_zone = 1, min_node_count_per_zone = 1, max_node_count_per_zone = 10 diff --git a/tools/kubernetes/terraform/module/main.tf b/tools/kubernetes/terraform/module/main.tf index 56de5c16..ac88bc27 100644 --- a/tools/kubernetes/terraform/module/main.tf +++ b/tools/kubernetes/terraform/module/main.tf @@ -118,23 +118,23 @@ resource "google_container_node_pool" "multihost_tpu" { } } -resource "google_container_node_pool" "default_pool" { +resource "google_container_node_pool" "cpu_node_pool" { provider = google-beta project = var.project_id - name = "default-pool" + name = "cpu-node-pool" location = var.region - node_locations = var.default_pool.zone + node_locations = var.cpu_node_pool.zone cluster = google_container_cluster.tpu_cluster.name - initial_node_count = var.default_pool.initial_node_count_per_zone + initial_node_count = var.cpu_node_pool.initial_node_count_per_zone autoscaling { - min_node_count = var.default_pool.min_node_count_per_zone - max_node_count = var.default_pool.max_node_count_per_zone + 
min_node_count = var.cpu_node_pool.min_node_count_per_zone + max_node_count = var.cpu_node_pool.max_node_count_per_zone } node_config { oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" ] - machine_type = var.default_pool.machine_type + machine_type = var.cpu_node_pool.machine_type metadata = { disable-legacy-endpoints = "true" diff --git a/tools/kubernetes/terraform/module/terraform.tfvars b/tools/kubernetes/terraform/module/terraform.tfvars index b899075a..45697b73 100644 --- a/tools/kubernetes/terraform/module/terraform.tfvars +++ b/tools/kubernetes/terraform/module/terraform.tfvars @@ -18,9 +18,9 @@ tpu_node_pools = [{ machine_type = "ct4p-hightpu-4t" topology = "2x2x2" }] -default_pool = { +cpu_node_pool = { zone = ["us-central2-a", "us-central2-b", "us-central2-c"] - machine_type = "e2-standard-32", + machine_type = "n2-standard-64", initial_node_count_per_zone = 1, min_node_count_per_zone = 1, max_node_count_per_zone = 10 diff --git a/tools/kubernetes/terraform/module/variables.tf b/tools/kubernetes/terraform/module/variables.tf index 0ecc6dc0..8804cec6 100644 --- a/tools/kubernetes/terraform/module/variables.tf +++ b/tools/kubernetes/terraform/module/variables.tf @@ -38,8 +38,8 @@ variable "tpu_node_pools" { })) } -variable "default_pool" { - description = "default nodepool config" +variable "cpu_node_pool" { + description = "cpu nodepool config" type = object({ zone = list(string), machine_type = string, @@ -49,9 +49,9 @@ variable "default_pool" { }) validation { condition = ( - (var.default_pool.min_node_count_per_zone >=0 && var.default_pool.min_node_count_per_zone <= var.default_pool.max_node_count_per_zone) + (var.cpu_node_pool.min_node_count_per_zone >=0 && var.cpu_node_pool.min_node_count_per_zone <= var.cpu_node_pool.max_node_count_per_zone) ) - error_message = "default_pool.min_node_count_per_zone must be >= 0 and <= default_pool.max_node_count_per_zone." 
+ error_message = "cpu_node_pool.min_node_count_per_zone must be >= 0 and <= cpu_node_pool.max_node_count_per_zone." } }