Skip to content

feat+refactor: ability to skip some of the default checks #10324

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
315 changes: 196 additions & 119 deletions pkg/cluster/check/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,57 +13,171 @@ import (
"github.com/siderolabs/talos/pkg/machinery/config/machine"
)

// Names of the checks performed by PreBootSequenceChecks.
const (
	CheckEtcdHealthy                  = "etcd to be healthy"
	CheckEtcdConsistent               = "etcd members to be consistent across nodes"
	CheckEtcdControlPlane             = "etcd members to be control plane nodes"
	CheckApidReady                    = "apid to be ready"
	CheckAllNodesMemorySizes          = "all nodes memory sizes"
	CheckAllNodesDiskSizes            = "all nodes disk sizes"
	CheckNoDiagnostics                = "no diagnostics"
	CheckKubeletHealthy               = "kubelet to be healthy"
	CheckAllNodesBootSequenceFinished = "all nodes to finish boot sequence"
)

// Names of the checks performed by K8sComponentsReadinessChecks.
const (
	CheckK8sAllNodesReported           = "all k8s nodes to report"
	CheckControlPlaneStaticPodsRunning = "all control plane static pods to be running"
	CheckControlPlaneComponentsReady   = "all control plane components to be ready"
)

// Names of the additional checks performed by DefaultClusterChecks
// (on top of the pre-boot and k8s component readiness checks).
const (
	CheckK8sAllNodesReady    = "all k8s nodes to report ready"
	CheckKubeProxyReady      = "kube-proxy to report ready"
	CheckCoreDNSReady        = "coredns to report ready"
	CheckK8sNodesSchedulable = "all k8s nodes to report schedulable"
)

// getCheck returns the ClusterCheck registered under the given name.
//
// The name must be one of the package's Check* constants; passing an unknown
// name is a programmer error and causes a panic.
func getCheck(name string) ClusterCheck {
	// polling wraps an assertion into a ClusterCheck which polls the assertion
	// under the check's name with the given timeout and poll interval.
	polling := func(name string, timeout, interval time.Duration, assertion func(ctx context.Context, cluster ClusterInfo) error) ClusterCheck {
		return func(cluster ClusterInfo) conditions.Condition {
			return conditions.PollingCondition(name, func(ctx context.Context) error {
				return assertion(ctx, cluster)
			}, timeout, interval)
		}
	}

	switch name {
	// PreBootSequenceChecks
	case CheckEtcdHealthy:
		// wait for etcd to be healthy on all control plane nodes
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return ServiceHealthAssertion(ctx, cluster, "etcd", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
		})
	case CheckEtcdConsistent:
		// wait for etcd members to be consistent across nodes
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return EtcdConsistentAssertion(ctx, cluster)
		})
	case CheckEtcdControlPlane:
		// wait for etcd members to be the control plane nodes
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return EtcdControlPlaneNodesAssertion(ctx, cluster)
		})
	case CheckApidReady:
		// wait for apid to be ready on all the nodes
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return ApidReadyAssertion(ctx, cluster)
		})
	case CheckAllNodesMemorySizes:
		// wait for all nodes to report their memory size
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return AllNodesMemorySizes(ctx, cluster)
		})
	case CheckAllNodesDiskSizes:
		// wait for all nodes to report their disk size
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return AllNodesDiskSizes(ctx, cluster)
		})
	case CheckNoDiagnostics:
		// check diagnostics; this check uses a shorter timeout than the others
		return polling(name, time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return NoDiagnostics(ctx, cluster)
		})
	case CheckKubeletHealthy:
		// wait for kubelet to be healthy on all control plane nodes
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return ServiceHealthAssertion(ctx, cluster, "kubelet", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
		})
	case CheckAllNodesBootSequenceFinished:
		// wait for all nodes to finish booting
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return AllNodesBootedAssertion(ctx, cluster)
		})

	// K8sComponentsReadinessChecks
	case CheckK8sAllNodesReported:
		// give more time per each attempt, as this check is going to build and cache kubeconfig
		return polling(name, 5*time.Minute, 30*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return K8sAllNodesReportedAssertion(ctx, cluster)
		})
	case CheckControlPlaneStaticPodsRunning:
		// wait for k8s control plane static pods
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return K8sControlPlaneStaticPods(ctx, cluster)
		})
	case CheckControlPlaneComponentsReady:
		// wait for HA k8s control plane
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return K8sFullControlPlaneAssertion(ctx, cluster)
		})

	// Additional Checks for Default Cluster Checks
	case CheckK8sAllNodesReady:
		// wait for all the nodes to report ready at k8s level; this is the slowest check
		return polling(name, 10*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return K8sAllNodesReadyAssertion(ctx, cluster)
		})
	case CheckKubeProxyReady:
		// wait for kube-proxy to report ready
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			present, replicas, err := DaemonSetPresent(ctx, cluster, "kube-system", "k8s-app=kube-proxy")
			if err != nil {
				return err
			}

			// kube-proxy might be legitimately absent (e.g. replaced by the CNI), so skip instead of failing
			if !present {
				return conditions.ErrSkipAssertion
			}

			return K8sPodReadyAssertion(ctx, cluster, replicas, "kube-system", "k8s-app=kube-proxy")
		})
	case CheckCoreDNSReady:
		// wait for coredns to report ready
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			present, replicas, err := DeploymentPresent(ctx, cluster, "kube-system", "k8s-app=kube-dns")
			if err != nil {
				return err
			}

			// coredns might be legitimately absent, so skip instead of failing
			if !present {
				return conditions.ErrSkipAssertion
			}

			return K8sPodReadyAssertion(ctx, cluster, replicas, "kube-system", "k8s-app=kube-dns")
		})
	case CheckK8sNodesSchedulable:
		// wait for all the nodes to be schedulable
		return polling(name, 5*time.Minute, 5*time.Second, func(ctx context.Context, cluster ClusterInfo) error {
			return K8sAllNodesSchedulableAssertion(ctx, cluster)
		})
	default:
		panic("unknown check name: " + name)
	}
}

// DefaultClusterChecks returns a set of default Talos cluster readiness checks.
//
// The result is the concatenation of the pre-boot checks, the Kubernetes
// component readiness checks, and the additional default checks (node
// readiness, kube-proxy, coredns, schedulability) — in that order.
func DefaultClusterChecks() []ClusterCheck {
	return slices.Concat(
		PreBootSequenceChecks(),
		K8sComponentsReadinessChecks(),
		[]ClusterCheck{
			// wait for all the nodes to report ready at k8s level
			getCheck(CheckK8sAllNodesReady),
			// wait for kube-proxy to report ready
			getCheck(CheckKubeProxyReady),
			// wait for coredns to report ready
			getCheck(CheckCoreDNSReady),
			// wait for all the nodes to be schedulable
			getCheck(CheckK8sNodesSchedulable),
		},
	)
}
Expand All @@ -74,25 +188,11 @@ func DefaultClusterChecks() []ClusterCheck {
func K8sComponentsReadinessChecks() []ClusterCheck {
	return []ClusterCheck{
		// wait for all the nodes to report in at k8s level
		getCheck(CheckK8sAllNodesReported),
		// wait for k8s control plane static pods
		getCheck(CheckControlPlaneStaticPodsRunning),
		// wait for HA k8s control plane
		getCheck(CheckControlPlaneComponentsReady),
	}
}

Expand All @@ -103,70 +203,47 @@ func ExtraClusterChecks() []ClusterCheck {
return []ClusterCheck{}
}

// preBootSequenceCheckNames returns the names of the pre-boot checks, in the
// order they are run.
func preBootSequenceCheckNames() []string {
	names := make([]string, 0, 9)

	names = append(names,
		CheckEtcdHealthy,
		CheckEtcdConsistent,
		CheckEtcdControlPlane,
		CheckApidReady,
		CheckAllNodesMemorySizes,
		CheckAllNodesDiskSizes,
		CheckNoDiagnostics,
		CheckKubeletHealthy,
		CheckAllNodesBootSequenceFinished,
	)

	return names
}

// PreBootSequenceChecks returns a set of Talos cluster readiness checks which are run before boot sequence.
func PreBootSequenceChecks() []ClusterCheck {
return []ClusterCheck{
// wait for etcd to be healthy on all control plane nodes
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("etcd to be healthy", func(ctx context.Context) error {
return ServiceHealthAssertion(ctx, cluster, "etcd", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
}, 5*time.Minute, 5*time.Second)
},

// wait for etcd members to be consistent across nodes
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("etcd members to be consistent across nodes", func(ctx context.Context) error {
return EtcdConsistentAssertion(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// wait for etcd members to be the control plane nodes
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("etcd members to be control plane nodes", func(ctx context.Context) error {
return EtcdControlPlaneNodesAssertion(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// wait for apid to be ready on all the nodes
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("apid to be ready", func(ctx context.Context) error {
return ApidReadyAssertion(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// wait for all nodes to report their memory size
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all nodes memory sizes", func(ctx context.Context) error {
return AllNodesMemorySizes(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// wait for all nodes to report their disk size
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all nodes disk sizes", func(ctx context.Context) error {
return AllNodesDiskSizes(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// check diagnostics
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("no diagnostics", func(ctx context.Context) error {
return NoDiagnostics(ctx, cluster)
}, time.Minute, 5*time.Second)
},
return PreBootSequenceChecksFiltered(nil)
}

// wait for kubelet to be healthy on all
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("kubelet to be healthy", func(ctx context.Context) error {
return ServiceHealthAssertion(ctx, cluster, "kubelet", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
}, 5*time.Minute, 5*time.Second)
},
// PreBootSequenceChecksFiltered returns a filtered version of the PreBootSequenceChecks,
// removing any checks whose names appear in the provided 'skips' list.
func PreBootSequenceChecksFiltered(skips []string) []ClusterCheck {
checkNames := []string{
CheckEtcdHealthy,
CheckEtcdConsistent,
CheckEtcdControlPlane,
CheckApidReady,
CheckAllNodesMemorySizes,
CheckAllNodesDiskSizes,
CheckNoDiagnostics,
CheckKubeletHealthy,
CheckAllNodesBootSequenceFinished,
}

// wait for all nodes to finish booting
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all nodes to finish boot sequence", func(ctx context.Context) error {
return AllNodesBootedAssertion(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},
var filtered []ClusterCheck
for _, name := range checkNames {
if slices.Contains(skips, name) {
continue
}
filtered = append(filtered, getCheck(name))
}
return filtered
}