diff --git a/charts/model-engine/templates/_helpers.tpl b/charts/model-engine/templates/_helpers.tpl index 50383770..3d94a495 100644 --- a/charts/model-engine/templates/_helpers.tpl +++ b/charts/model-engine/templates/_helpers.tpl @@ -341,9 +341,12 @@ env: value: "true" - name: LAUNCH_SERVICE_TEMPLATE_FOLDER value: "/workspace/model-engine/model_engine_server/infra/gateways/resources/templates" - {{- if .Values.redis.auth}} + {{- if .Values.secrets.kubernetesRedisSecretName }} - name: REDIS_AUTH_TOKEN - value: {{ .Values.redis.auth }} + valueFrom: + secretKeyRef: + name: {{ .Values.secrets.kubernetesRedisSecretName }} + key: auth_token {{- end }} {{- if .Values.azure}} - name: AZURE_IDENTITY_NAME diff --git a/charts/model-engine/templates/aws_config_map.yaml b/charts/model-engine/templates/aws_config_map.yaml index 60b91c97..9e9eb041 100644 --- a/charts/model-engine/templates/aws_config_map.yaml +++ b/charts/model-engine/templates/aws_config_map.yaml @@ -20,6 +20,9 @@ data: [profile {{ $profileName }}] role_arn = {{ index $annotations "eks.amazonaws.com/role-arn" }} web_identity_token_file = /var/run/secrets/eks.amazonaws.com/serviceaccount/token + [profile {{ $.Values.serviceAccount.sqsProfileName }}] + role_arn = {{ index $annotations "eks.amazonaws.com/role-arn" }} + web_identity_token_file = /var/run/secrets/eks.amazonaws.com/serviceaccount/token --- {{- end }} {{- end }} diff --git a/charts/model-engine/templates/inference_framework_config.yaml b/charts/model-engine/templates/inference_framework_config.yaml index 45759d77..1b4aed66 100644 --- a/charts/model-engine/templates/inference_framework_config.yaml +++ b/charts/model-engine/templates/inference_framework_config.yaml @@ -2,17 +2,18 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ include "modelEngine.fullname" . 
}}-inference-framework-latest-config + namespace: {{ .Release.Namespace }} labels: product: common team: infra annotations: - "helm.sh/hook": pre-install + "helm.sh/hook": pre-install,pre-upgrade "helm.sh/hook-weight": "-2" data: deepspeed: "latest" text_generation_inference: "latest" - vllm: "latest" - vllm_batch: "latest" - vllm_batch_v2: "latest" + vllm: "{{ .Values.vllm.primaryTag }}" + vllm_batch: "{{ .Values.vllm.batchTag }}" + vllm_batch_v2: "{{ .Values.vllm.batchV2Tag }}" lightllm: "latest" tensorrt_llm: "latest" diff --git a/charts/model-engine/templates/istio-virtualservice.yaml b/charts/model-engine/templates/istio-virtualservice.yaml index 1bd26e14..f486489d 100644 --- a/charts/model-engine/templates/istio-virtualservice.yaml +++ b/charts/model-engine/templates/istio-virtualservice.yaml @@ -1,4 +1,4 @@ -{{- if .Values.virtualservice.enabled -}} +{{- if .Values.virtualService.enabled -}} {{- $fullName := include "modelEngine.fullname" . -}} apiVersion: networking.istio.io/v1alpha3 kind: VirtualService @@ -6,19 +6,15 @@ metadata: name: {{ $fullName }} labels: {{- include "modelEngine.labels" . | nindent 4 }} - {{- with .Values.virtualservice.annotations }} + {{- with .Values.virtualService.annotations }} annotations: {{- toYaml . | nindent 4 }} {{- end }} spec: hosts: - {{- range .Values.virtualservice.hostDomains }} - - "{{ $fullName }}.{{ . }}" - {{- end }} + - model-engine.{{ $.Values.global.networking.internalDomain }} gateways: - {{- range .Values.virtualservice.gateways }} - - {{ . 
| quote }} - {{- end }} + - {{ $.Values.global.networking.internalGateway }} http: - route: - destination: diff --git a/charts/model-engine/templates/service_account_inference.yaml b/charts/model-engine/templates/service_account_inference.yaml index c9fa94fb..669f7604 100644 --- a/charts/model-engine/templates/service_account_inference.yaml +++ b/charts/model-engine/templates/service_account_inference.yaml @@ -1,4 +1,4 @@ -{{- if and (.Values.serviceTemplate) (.Values.serviceTemplate.createServiceAccount) (.Values.serviceTemplate.serviceAccountAnnotations) (.Values.serviceTemplate.serviceAccountName) (.Values.config.values.launch.endpoint_namespace)}} +{{- if and (.Values.serviceTemplate) (.Values.serviceTemplate.createInferenceServiceAccount) (.Values.serviceTemplate.serviceAccountAnnotations) (.Values.serviceTemplate.serviceAccountName) (.Values.config.values.launch.endpoint_namespace)}} {{- $annotations := .Values.serviceTemplate.serviceAccountAnnotations }} {{- $inferenceServiceAccountName := .Values.serviceTemplate.serviceAccountName }} {{- $inferenceServiceAccountNamespace := .Values.config.values.launch.endpoint_namespace }} @@ -22,4 +22,4 @@ imagePullSecrets: - name: egp-ecr-regcred {{- end }} --- -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/model-engine/templates/service_config_map.yaml b/charts/model-engine/templates/service_config_map.yaml index 403bb552..0d7b290f 100644 --- a/charts/model-engine/templates/service_config_map.yaml +++ b/charts/model-engine/templates/service_config_map.yaml @@ -3,6 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ include "modelEngine.fullname" . }}-service-config + namespace: {{ .Release.Namespace }} labels: {{- include "modelEngine.labels" . 
| nindent 4 }} annotations: @@ -11,46 +12,110 @@ metadata: data: launch_service_config: |- dd_trace_enabled: {{ .Values.dd_trace_enabled | default false | quote }} + + # Config to know where model-engine is running gateway_namespace: {{ .Release.Namespace | quote }} - {{- with .Values.config.values.launch }} - {{- range $key, $value := . }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} - infra_service_config: |- - env: {{ .Values.context | quote }} - {{- with .Values.config.values.infra }} - {{- range $key, $value := . }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} ---- + # Config for scale-hosted Hosted Model Inference in the prod cluster, plus a bunch of other config-ish notes + # NOTE: If you add/change values inside this file that need to apply to all clusters, please make changes in + # all service_config_{env}.yaml files as well. -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "modelEngine.fullname" . }}-service-config - namespace: {{ .Values.config.values.launch.endpoint_namespace }} - labels: - {{- include "modelEngine.labels" . | nindent 4 }} - annotations: - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-2" -data: - launch_service_config: |- - dd_trace_enabled: {{ .Values.dd_trace_enabled | default false | quote }} - gateway_namespace: {{ .Release.Namespace | quote }} - {{- with .Values.config.values.launch }} - {{- range $key, $value := . 
}} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} + # Config for scale-hosted Hosted Model Inference in the prod cluster, see `service_config` for more details + model_primitive_host: model-server.{{ .Release.Namespace }}.svc.cluster.local + + # # Endpoint config + # K8s namespace the endpoints will be created in + endpoint_namespace: {{ .Release.Namespace | quote }} + + # Asynchronous endpoints + sqs_profile: {{ $.Values.serviceAccount.sqsProfileName }} + sqs_queue_policy_template: |- + { + "Version": "2012-10-17", + "Id": "__default_policy_ID", + "Statement": [ + { + "Sid": "__owner_statement", + "Effect": "Allow", + "Principal": { + "AWS": "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:root" + }, + "Action": "sqs:*", + "Resource": "arn:{{ .Values.aws.partition }}:sqs:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:${queue_name}" + }, + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:role/{{ $.Values.serviceAccount.sqsProfileName }}" + }, + "Action": "sqs:*", + "Resource": "arn:{{ .Values.aws.partition }}:sqs:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:${queue_name}" + }, + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:role/ml_hosted_model_inference" + }, + "Action": "sqs:*", + "Resource": "arn:{{ .Values.aws.partition }}:sqs:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:${queue_name}" + } + ] + } + + sqs_queue_tag_template: |- + { + "{{ .Values.tagging.organization }}/product": "{{ .Values.tagging.productTag }}", + "{{ .Values.tagging.organization }}/team": "${team}", + "{{ .Values.tagging.organization }}/contact": "{{ .Values.tagging.contactEmail }}", + "{{ .Values.tagging.organization }}/customer": "AllCustomers", + "{{ .Values.tagging.organization }}/financialOwner": "{{ .Values.tagging.contactEmail }}", + "Launch-Endpoint-Id": "${endpoint_id}", + "Launch-Endpoint-Name": 
"${endpoint_name}", + "Launch-Endpoint-Created-By": "${endpoint_created_by}" + } + + # Billing + billing_queue_arn: arn:aws:events:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:event-bus/money + + # The below redis URL would not work if we needed auth, which we do, so we have to pull cache_url from the cache_redis_aws_secret_name + cache_redis_aws_secret_name: "{{ .Values.secrets.redisAwsSecretName }}" + + cloud_file_llm_fine_tune_repository: "s3://{{ .Values.aws.s3Bucket }}/hosted-model-inference/llm-ft-job-repository/prod" + + dd_trace_enabled: true + istio_enabled: true + sensitive_log_mode: true + tgi_repository: "text-generation-inference" + vllm_repository: "vllm" + lightllm_repository: "lightllm" + tensorrt_llm_repository: "tensorrt-llm" + batch_inference_vllm_repository: "llm-engine/batch-infer-vllm" + user_inference_base_repository: "launch/inference" + user_inference_pytorch_repository: "hosted-model-inference/async-pytorch" + user_inference_tensorflow_repository: "hosted-model-inference/async-tensorflow-cpu" + docker_image_layer_cache_repository: "kaniko-cache" + + # S3 access + hf_user_fine_tuned_weights_prefix: "s3://{{ .Values.aws.s3Bucket }}/hosted-model-inference/fine_tuned_weights" infra_service_config: |- env: {{ .Values.context | quote }} - {{- with .Values.config.values.infra }} - {{- range $key, $value := . 
}} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} + cloud_provider: "aws" + # env is already set above from .Values.context; do not hardcode a duplicate key here + k8s_cluster_name: "{{ .Values.clusterName }}" + dns_host_domain: "model-engine.{{ $.Values.global.networking.internalDomain }}" + default_region: "{{ .Values.aws.region }}" + ml_account_id: "{{ .Values.aws.accountId }}" + docker_repo_prefix: "{{ .Values.aws.accountId }}.dkr.ecr.{{ .Values.aws.region }}.amazonaws.com" + redis_host: "{{ .Values.redis.hostname }}" + s3_bucket: "{{ .Values.aws.s3Bucket }}" + profile_ml_worker: "ml-worker" + profile_ml_inference_worker: "ml-worker" + identity_service_url: "{{ .Values.identityServiceUrl }}" + firehose_role_arn: "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:role/firehose-stream-logging-role" + firehose_stream_name: "{{ .Values.firehoseStreamName }}" + db_engine_pool_size: 20 + db_engine_max_overflow: 10 + db_engine_echo: false + db_engine_echo_pool: true + db_engine_disconnect_strategy: "pessimistic" {{- end }} diff --git a/charts/model-engine/templates/service_template_config_map.yaml b/charts/model-engine/templates/service_template_config_map.yaml index f721eb46..6c0a6158 100644 --- a/charts/model-engine/templates/service_template_config_map.yaml +++ b/charts/model-engine/templates/service_template_config_map.yaml @@ -486,7 +486,7 @@ data: protocol: TCP name: http ${NODE_PORT_DICT} - {{- if .Values.virtualservice.enabled }} + {{- if .Values.virtualService.enabled }} virtual-service.yaml: |- apiVersion: networking.istio.io/v1alpha3 kind: VirtualService @@ -522,6 +522,7 @@ data: loadBalancer: simple: LEAST_REQUEST {{- end }} + {{- if and (.Capabilities.APIVersions.Has "autoscaling.k8s.io/v1") (.Values.autoscaling.vertical.enabled) }} vertical-pod-autoscaler.yaml: |- apiVersion: "autoscaling.k8s.io/v1" kind: VerticalPodAutoscaler @@ -548,6 +549,7 @@ data: cpu: ${CPUS} memory: ${MEMORY} controlledResources: ["cpu", "memory"] + {{- end }} pod-disruption-budget.yaml: |- apiVersion: policy/v1 kind: 
PodDisruptionBudget diff --git a/charts/model-engine/values.yaml b/charts/model-engine/values.yaml index b75b6efa..26315b66 100644 --- a/charts/model-engine/values.yaml +++ b/charts/model-engine/values.yaml @@ -1,11 +1,313 @@ -dd_trace_enabled: true -spellbook: - enabled: false -redis: - auth: +# If specified, will override the name of the deployed services +# Otherwise, defaults to the Chart name, typically "model-engine" +# serviceIdentifier: + +# The Kubernetes cluster name in which the Model Engine is deployed +clusterName: + +secrets: + # Either of the below AWS secrets expect a key named "database_url" with a fully specified database URL including + # the username and password. + + # Use the Cloud database secret name to pull from AWS Secrets Manager + # cloudDatabaseSecretName: + # Use the Kubernetes database secret name to pull from Kubernetes Secrets + kubernetesDatabaseSecretName: + + # This secret must have a fully specified database URL including the password (auth token) + # It should be under "cache_url" key in the secret + # redisAwsSecretName: + # Kubernetes secret containing a key `auth_token` that contains the redis auth token for connection + # Will not be used if `redisAwsSecretName` is set. 
Used in conjunction with REDIS_HOST and REDIS_PORT env vars + kubernetesRedisSecretName: + db: + # Runs an initial database schema migration on deployment if set to true runDbInitScript: false -balloonNodeSelector: - node-lifecycle: normal -nodeSelector: - node-lifecycle: normal + +replicaCount: + # The gateway service is the entrypoint for all requests to the Model Engine + gateway: 1 + # The cacher service is responsible for caching kubernetes API requests + cacher: 1 + # The builder service is responsible for creating new deployments and other kubernetes resources + builder: 1 + +# Tag of the model engine image that will be used for the model engine deployments +tag: +# Sets the 'env' label on the pods and primarily used for metadata tagging +context: +# Specifies core services' image repositories +image: + gatewayRepository: public.ecr.aws/b2z8n5q1/model-engine + builderRepository: public.ecr.aws/b2z8n5q1/model-engine + cacherRepository: public.ecr.aws/b2z8n5q1/model-engine + forwarderRepository: public.ecr.aws/b2z8n5q1/model-engine + pullPolicy: Always + +# Specifiers for the core model engine service deployments +nodeSelector: { } +tolerations: [ ] +affinity: { } + +# Specifies the configuration on the Gateway service kube service +service: + type: ClusterIP + port: 80 + +# Creates istio virtual services for the Model Engine using the global domain nome and gateway specified below +virtualService: + enabled: true + +global: + networking: + # Internal domain name attached to the internal Istio gateway. + # The model engine deployment will be exposed at: + # model-engine. + # Deployed services will be exposed at: + # launch-endpoint-id-{endpoint_id}.model-engine. + internalDomain: + # namespace/service for the Istio internal gateway deployment + internalGateway: + +# Tag of the vLLM images to use for LLM Engine deployments +# These tags must exist in a 'vllm' repository in ECR, which will be found based on your +# AWS account ID and region. 
+vllm: + primaryTag: 0.5.4 + batchTag: 0.5.4 + batchV2Tag: 0.5.4 + +# Specifies the number of replicas for each "balloon" service for each GPU type. +# Used to warm up nodes prior to model deployment. +balloons: + - acceleratorName: nvidia-ampere-a10 + replicaCount: 0 + - acceleratorName: nvidia-ampere-a100 + replicaCount: 0 + - acceleratorName: cpu + replicaCount: 0 + - acceleratorName: nvidia-tesla-t4 + replicaCount: 0 + - acceleratorName: nvidia-hopper-h100 + replicaCount: 0 + +# Specific node labels that the "balloon" services should be scheduled on +balloonNodeSelector: { } + +# Metadata to be tagged on the deployed pods in the SQS queue +tagging: + organization: + contactEmail: + productTag: + +# Used to specify the https/http prefix for the model engine gateway URL for initialization jobs
# that must connect to the model engine gateway +hostDomain: + prefix: http:// + +destinationrule: + enabled: true + annotations: { } + +autoscaling: + horizontal: + enabled: true + minReplicas: 1 + maxReplicas: 5 + targetConcurrency: 3 + vertical: + enabled: false + prewarming: + enabled: false + +celery_autoscaler: + enabled: true + num_shards: 10 + +# Specifies a minimum number of pods that must be available at all times during upgrades or scaling +podDisruptionBudget: + enabled: true + minAvailable: 1 + +# Default resources for the Model Engine deployments +resources: + requests: + cpu: 2 + ephemeral-storage: 256Mi + +# Service Account information for the Model Engine deployments +serviceAccount: + annotations: + eks.amazonaws.com/role-arn: + sqsProfileName: +# The service account automatically gets created in the Release namespace
# namespaces: + +aws: + # Used to mount a configmap into the containers in order to supply AWS profiles + configMap: + name: ml-worker-config + create: true + mountPath: /opt/.aws/config + namespaces: + - default + profileName: ml-worker + s3WriteProfileName: ml-worker + partition: aws + region: + accountId: + # The Model Engine s3 bucket + s3Bucket: + +# Optional additional way of setting the Redis hostname aside from the REDIS_HOST env var +redis: + hostname: + +# Experimental additional inference image +triton: + image: + repository: + tag: + +serviceTemplate: + # createInferenceServiceAccount/serviceAccountName/serviceAccountAnnotations specify whether to create a serviceAccount for + # inference pods. Assumes the inference pods run in a separate namespace to the LLM Engine control plane. + createInferenceServiceAccount: true + securityContext: + capabilities: + drop: + - all + mountInfraConfig: false + serviceAccountName: model-engine + awsConfigMapName: ml-worker-config + serviceAccountAnnotations: + eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/llm-engine + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-2" + +# Specifies the type of broker to use for the celery autoscaler
# Can be either "sqs" or "servicebus" +celeryBrokerType: sqs + +# For each GPU type, specify tolerations associated with any taints associated with different GPU type node classes.
# This will only set the tolerations for the pods that exist to cache images on each node, however. They will NOT set
# tolerations for the deployed ML models. Each of those are set in service_template_config_map.yaml in model-engine templates
# to contain the standard "nvidia.com/gpu" toleration. 
+imageCache: + devices: + - name: cpu + nodeSelector: + cpu-only: "true" + - name: a10 + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-ampere-a10 + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - name: a100 + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-ampere-a100 + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - name: t4 + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-tesla-t4 + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - name: h100 + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-hopper-h100 + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - name: h100-mig-1g-20gb + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-1g20gb + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - name: h100-mig-3g-40gb + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-3g40gb + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + +# Requests will automatically receive these resource inputs if not otherwise specified, based on the GPU +# type associated with the deployment request. 
Please ensure that your infrastructure configuration labels each +# GPU node type with: "k8s.amazonaws.com/accelerator: ${GPU_TYPE}" +recommendedHardware: + byGpuMemoryGb: + - gpu_memory_le: 20 + cpus: 5 + gpus: 1 + memory: 20Gi + storage: 40Gi + gpu_type: nvidia-hopper-h100-1g20gb + - gpu_memory_le: 40 + cpus: 10 + gpus: 1 + memory: 40Gi + storage: 80Gi + gpu_type: nvidia-hopper-h100-3g40gb + - gpu_memory_le: 80 + cpus: 20 + gpus: 1 + memory: 80Gi + storage: 96Gi + gpu_type: nvidia-hopper-h100 + - gpu_memory_le: 160 + cpus: 40 + gpus: 2 + memory: 160Gi + storage: 160Gi + gpu_type: nvidia-hopper-h100 + - gpu_memory_le: 320 + cpus: 80 + gpus: 4 + memory: 320Gi + storage: 320Gi + gpu_type: nvidia-hopper-h100 + - gpu_memory_le: 640 + cpus: 160 + gpus: 8 + memory: 800Gi + storage: 640Gi + gpu_type: nvidia-hopper-h100 + byModelName: + - name: llama-3-8b-instruct-262k + cpus: 40 + gpus: 2 + memory: 160Gi + storage: 160Gi + gpu_type: nvidia-hopper-h100 + - name: deepseek-coder-v2 + cpus: 160 + gpus: 8 + memory: 800Gi + storage: 640Gi + gpu_type: nvidia-hopper-h100 + - name: deepseek-coder-v2-instruct + cpus: 160 + gpus: 8 + memory: 800Gi + storage: 640Gi + gpu_type: nvidia-hopper-h100 + +# Enables Datadog and associated tracing +datadog: + enabled: false +dd_trace_enabled: false + +# Deprecated service for deployment of LLM's +spellbook: + enabled: false \ No newline at end of file diff --git a/charts/model-engine/values_sample.yaml b/charts/model-engine/values_sample.yaml index 430abea6..5e4c5092 100644 --- a/charts/model-engine/values_sample.yaml +++ b/charts/model-engine/values_sample.yaml @@ -138,9 +138,9 @@ serviceTemplate: drop: - all mountInfraConfig: true - # createServiceAccount/serviceAccountName/serviceAccountAnnotations specify whether to create a serviceAccount for + # createInferenceServiceAccount/serviceAccountName/serviceAccountAnnotations specify whether to create a serviceAccount for # inference pods. 
Assumes the inference pods run in a separate namespace to the LLM Engine control plane. - createServiceAccount: true + createInferenceServiceAccount: true serviceAccountName: model-engine serviceAccountAnnotations: eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/llm-engine