Uria Franko
09/12/2023, 11:38 AM
I0912 11:30:33.606660 1 static_autoscaler.go:509] ip-10-2-54-164.eu-central-1.compute.internal is unneeded since 2023-09-12 11:15:47.260684064 +0000 UTC m=+10982.039451071 duration 14m46.343709026s
I0912 11:30:33.606586 1 scale_down.go:448] Node ip-10-2-165-69.eu-central-1.compute.internal - nvidia.com/gpu utilization 0.000000
I0912 11:30:33.606577 1 cluster.go:224] node ip-10-2-165-69.eu-central-1.compute.internal has unready GPU
I0912 11:30:33.606565 1 scale_down.go:448] Node ip-10-2-54-164.eu-central-1.compute.internal - nvidia.com/gpu utilization 0.000000
I0912 11:30:33.606547 1 cluster.go:224] node ip-10-2-54-164.eu-central-1.compute.internal has unready GPU
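The "has unready GPU" lines usually mean a node carries the GPU label but is not advertising allocatable nvidia.com/gpu yet, which typically points at the NVIDIA device plugin not having registered the resource on that node. A minimal check, as a rough sketch rather than something from this thread, assuming cluster access and the kubernetes Python client, with the accelerator label matching the node definitions below:

# Sketch: confirm whether the GPU-labeled nodes actually expose nvidia.com/gpu.
# Assumes a working kubeconfig and `pip install kubernetes`; the label selector
# matches the gpu_accelerator value used in the node definitions.
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()

nodes = v1.list_node(label_selector="k8s.amazonaws.com/accelerator=nvidia-tesla-t4")
for node in nodes.items:
    capacity = node.status.capacity.get("nvidia.com/gpu", "0")
    allocatable = node.status.allocatable.get("nvidia.com/gpu", "0")
    # A node stuck at 0 here is what the autoscaler reports as an "unready GPU":
    # the device plugin has not advertised the resource on it.
    print(f"{node.metadata.name}: capacity={capacity} allocatable={allocatable}")

Note that the AL2_x86_64_GPU AMI ships the NVIDIA driver, but the nvidia-device-plugin DaemonSet still has to be deployed separately, and it must tolerate the node taints used here (nvidia.com/gpu and flyte.org/node-role) before the resource shows up on the node.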
Node definition:
worker-single-gpu = {
  dedicated_node_role = "worker"
  instance_type       = "g4dn.xlarge"
  gpu_accelerator     = "nvidia-tesla-t4"
  gpu_count           = 1
  min_size            = 0
  max_size            = 10
  local_ssd_size_gb   = 160
  root_disk_size_gb   = 200
}
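Since this group scales from zero, the Cluster Autoscaler can only learn what a new worker-single-gpu node would provide from the k8s.io/cluster-autoscaler/node-template/* tags on its Auto Scaling Group, so it is worth confirming those tags actually landed on the ASG. A rough sketch, not from the thread, assuming boto3 credentials for the account and that the ASG name contains the node-group key:

# Sketch: list the cluster-autoscaler node-template tags on the GPU node group's
# ASG. The region and the "worker-single-gpu" name filter are assumptions; adjust
# them to the actual cluster.
import boto3

autoscaling = boto3.client("autoscaling", region_name="eu-central-1")

for page in autoscaling.get_paginator("describe_auto_scaling_groups").paginate():
    for asg in page["AutoScalingGroups"]:
        if "worker-single-gpu" not in asg["AutoScalingGroupName"]:
            continue
        for tag in asg["Tags"]:
            if tag["Key"].startswith("k8s.io/cluster-autoscaler/node-template/"):
                print(asg["AutoScalingGroupName"], tag["Key"], "=", tag["Value"])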
David Espejo (he/him)
09/12/2023, 4:27 PM
1. Configure flyte-binary to do so as described in the docs
2. Request the GPU resources as described here
I guess you edited eks.tf to enable nodes with GPUs? I haven't tried the modules with GPU nodes yet.
Uria Franko
09/12/2023, 4:30 PM
eks.tf looks like this:
locals {
  mng_defaults = {
    dedicated_node_role = null
    instance_type       = "t3.large"
    gpu_accelerator     = ""
    gpu_count           = 0
    min_size            = 0
    max_size            = 1
    root_disk_size_gb   = 20
    local_ssd_size_gb   = 0
    spot                = false
    subnet_ids          = module.vpc.private_subnets
  }

  mngs = {
    services = {
      max_size = 3
      min_size = 1
    }
    worker = {
      dedicated_node_role = "worker"
      instance_type       = "t3.large"
      min_size            = 0
      max_size            = 5
      root_disk_size_gb   = 250
    }
    worker-large = {
      dedicated_node_role = "worker"
      instance_type       = "t3.2xlarge"
      min_size            = 0
      max_size            = 5
      root_disk_size_gb   = 250
    }
    worker-single-gpu = {
      dedicated_node_role = "worker"
      instance_type       = "g4dn.xlarge"
      gpu_accelerator     = "nvidia-tesla-t4"
      gpu_count           = 1
      min_size            = 0
      max_size            = 10
      local_ssd_size_gb   = 160
      root_disk_size_gb   = 200
    }
    worker-multi-gpu = {
      dedicated_node_role = "worker"
      instance_type       = "g4dn.12xlarge"
      gpu_accelerator     = "nvidia-tesla-t4"
      gpu_count           = 4
      min_size            = 0
      max_size            = 1
      local_ssd_size_gb   = 200
      root_disk_size_gb   = 800
    }
  }

  _mngs_with_defaults = {
    for k, v in local.mngs : k => merge(local.mng_defaults, v)
  }

  # Autoscaling Group tags must be managed separately for Cluster Autoscaler
  # to correctly scale node pools from 0.
  # See: https://github.com/terraform-aws-modules/terraform-aws-eks/issues/1886
  # See: https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md#auto-discovery-setup
  _mngs_asg_tags = {
    for k, v in local._mngs_with_defaults : k => merge(
      # Spot
      v.spot ? {
        "k8s.io/cluster-autoscaler/node-template/label/eks.amazonaws.com/capacityType" = "SPOT"
      } : {
        "k8s.io/cluster-autoscaler/node-template/label/eks.amazonaws.com/capacityType" = "ON_DEMAND"
      },
      # Ephemeral storage
      {
        "k8s.io/cluster-autoscaler/node-template/resources/ephemeral-storage" = v.local_ssd_size_gb > 0 ? "${v.local_ssd_size_gb}G" : "${v.root_disk_size_gb}G"
      },
      # GPUs
      v.gpu_count == 0 ? {} : {
        "k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu"                = true
        "k8s.io/cluster-autoscaler/node-template/label/k8s.amazonaws.com/accelerator" = v.gpu_accelerator
        "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu"            = tostring(v.gpu_count)
        "k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu"                = "present:NoSchedule"
      },
      # Dedicated node role
      v.dedicated_node_role == null ? {} : {
        "k8s.io/cluster-autoscaler/node-template/label/flyte.org/node-role" = v.dedicated_node_role
        "k8s.io/cluster-autoscaler/node-template/taint/flyte.org/node-role" = "${v.dedicated_node_role}:NoSchedule"
      }
    )
  }
}
module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "19.10.0"
cluster_name = local.name_prefix
cluster_version = "1.24"
cluster_endpoint_private_access = true
cluster_endpoint_public_access = true
enable_irsa = true
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets
eks_managed_node_groups = {
for k, v in local._mngs_with_defaults : k => {
desired_size = v.min_size
max_size = v.max_size
min_size = v.min_size
ami_type = v.gpu_count == 0 ? null : "AL2_x86_64_GPU"
block_device_mappings = {
xvda = {
device_name = "/dev/xvda"
ebs = {
volume_size = v.root_disk_size_gb
}
}
}
capacity_type = v.spot ? "SPOT" : "ON_DEMAND"
instance_types = [v.instance_type]
labels = merge(
v.gpu_count == 0 ? {} : {
"<http://k8s.amazonaws.com/accelerator|k8s.amazonaws.com/accelerator>" = v.gpu_accelerator
},
v.dedicated_node_role == null ? {} : {
"<http://flyte.org/node-role|flyte.org/node-role>" = v.dedicated_node_role
}
)
# Setup local SSDs
pre_bootstrap_user_data = v.local_ssd_size_gb > 0 ? file("${path.module}/setup_local_ssd.sh") : ""
subnet_ids = v.subnet_ids
tags = {
"<http://k8s.io/cluster-autoscaler/enabled|k8s.io/cluster-autoscaler/enabled>" = true
"<http://k8s.io/cluster-autoscaler/${local.name_prefix}|k8s.io/cluster-autoscaler/${local.name_prefix}>" = true
}
taints = concat(
v.gpu_count == 0 ? [] : [
{
key = "<http://nvidia.com/gpu|nvidia.com/gpu>"
value = "present"
effect = "NO_SCHEDULE"
}
],
v.dedicated_node_role == null ? [] : [
{
key = "<http://flyte.org/node-role|flyte.org/node-role>"
value = v.dedicated_node_role
effect = "NO_SCHEDULE"
}
]
)
}
}
}
resource "aws_autoscaling_group_tag" "eks_managed_node_group_asg_tag" {
# Create a unique identifier for each tag by stripping
# "<http://k8s.io/cluster-autoscaler/node-template/|k8s.io/cluster-autoscaler/node-template/>" and adding as a suffix to the name of
# the managed node group
for_each = merge([
for mng, tags in local._mngs_asg_tags : {
for tag_key, tag_value in tags : "${mng}-${replace(tag_key, "<http://k8s.io/cluster-autoscaler/node-template/|k8s.io/cluster-autoscaler/node-template/>", "")}" => {
mng = mng
key = tag_key
value = tag_value
}
}
]...)
autoscaling_group_name = one(module.eks.eks_managed_node_groups[each.value.mng].node_group_autoscaling_group_names)
tag {
key = each.value.key
value = each.value.value
propagate_at_launch = false
}
depends_on = [module.eks]
}
data "aws_eks_cluster_auth" "default" {
name = module.eks.cluster_name
}
data "aws_iam_policy" "cloudwatch_agent_server_policy" {
name = "CloudWatchAgentServerPolicy"
}
module "aws_cloudwatch_metrics_irsa_role" {
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "5.11.2"
role_name = "${local.name_prefix}-aws-cloudwatch-metrics"
role_policy_arns = {
default = data.aws_iam_policy.cloudwatch_agent_server_policy.arn
}
oidc_providers = {
default = {
provider_arn = module.eks.oidc_provider_arn
namespace_service_accounts = ["kube-system:aws-cloudwatch-metrics"]
}
}
}
data "aws_iam_policy_document" "aws_for_fluent_bit_policy" {
source_policy_documents = [data.aws_iam_policy.cloudwatch_agent_server_policy.policy]
# Fluent-bit CloudWatch plugin manages log groups and retention policies
statement {
actions = [
"logs:DeleteRetentionPolicy",
"logs:PutRetentionPolicy"
]
resources = ["*"]
}
}
module "aws_load_balancer_controller_irsa_role" {
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "5.11.2"
role_name = "${local.name_prefix}-aws-load-balancer-controller"
attach_load_balancer_controller_policy = true
oidc_providers = {
ex = {
provider_arn = module.eks.oidc_provider_arn
namespace_service_accounts = ["kube-system:aws-load-balancer-controller"]
}
}
}
module "cluster_autoscaler_irsa_role" {
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "5.11.2"
role_name = "${local.name_prefix}-cluster-autoscaler"
attach_cluster_autoscaler_policy = true
cluster_autoscaler_cluster_ids = [module.eks.cluster_name]
oidc_providers = {
default = {
provider_arn = module.eks.oidc_provider_arn
namespace_service_accounts = ["kube-system:aws-cluster-autoscaler"]
}
}
}
resource "helm_release" "aws_cloudwatch_metrics" {
namespace = "kube-system"
wait = true
timeout = 600
name = "aws-cloudwatch-metrics"
repository = "<https://aws.github.io/eks-charts>"
chart = "aws-cloudwatch-metrics"
version = "0.0.8"
set {
name = "clusterName"
value = module.eks.cluster_name
}
set {
name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
value = module.aws_cloudwatch_metrics_irsa_role.iam_role_arn
}
set {
name = "tolerations[0].operator"
value = "Exists"
}
}
resource "helm_release" "aws_cluster_autoscaler" {
namespace = "kube-system"
wait = true
timeout = 600
name = "aws-cluster-autoscaler"
repository = "<https://kubernetes.github.io/autoscaler>"
chart = "cluster-autoscaler"
version = "9.24.0"
set {
name = "cloudProvider"
value = "aws"
}
set {
name = "autoDiscovery.clusterName"
value = module.eks.cluster_name
}
set {
name = "awsRegion"
value = data.aws_region.current.name
}
set {
name = "rbac.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
value = module.cluster_autoscaler_irsa_role.iam_role_arn
}
set {
name = "extraArgs.scale-down-unneeded-time"
value = "5m"
}
set {
name = "extraArgs.scale-down-delay-after-add"
value = "2m"
}
}
resource "helm_release" "aws_load_balancer_controller" {
namespace = "kube-system"
wait = true
timeout = 600
name = "aws-load-balancer-controller"
repository = "<https://aws.github.io/eks-charts>"
chart = "aws-load-balancer-controller"
version = "1.4.7"
set {
name = "clusterName"
value = module.eks.cluster_name
}
set {
name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
value = module.aws_load_balancer_controller_irsa_role.iam_role_arn
}
}
Which is kind of the same, but with GPU nodes and the GPU label added.
My plugins section in eks-production.yml looks like:
plugins:
  k8s:
    inject-finalizer: true
    default-env-vars:
      - AWS_METADATA_SERVICE_TIMEOUT: 5
      - AWS_METADATA_SERVICE_NUM_ATTEMPTS: 20
    gpu-resource-name: nvidia.com/gpu
    resource-tolerations:
      - nvidia.com/gpu:
          - key: 'nvidia.com/gpu'
            operator: 'Equal'
            value: 'present'
            effect: 'NoSchedule'
    default-tolerations:
      - key: 'flyte.org/node-role'
        operator: 'Equal'
        value: 'worker'
        effect: 'NoSchedule'
and the task itself looks like:
@task(
    requests=Resources(cpu="3", mem="4Gi", gpu="1"),
    container_image="anibali/pytorch:1.8.1-cuda11.1-ubuntu20.04",
    timeout="1h",
    retries=0,
)
def say_hello() -> str:
    import os
    try:
        import torch
        print("Cuda", torch.cuda.is_available())
    except ImportError:
        print("Torch not installed")
    print(
        f"There are {os.cpu_count()} cpus and {os.environ['CUDA_VISIBLE_DEVICES']} gpus"
    )
    return "hello world"
I've tried with no image specified and got the same results.
David Espejo (he/him)
09/12/2023, 5:10 PM
If mng_defaults.gpu_count is equal to 0, then no taint will be applied to the EKS nodes, and the AMI type for the EKS nodes won't have GPU support (it should be AL2_x86_64_GPU in this case).
Try changing the default to non-zero and let us know. Again, I haven't tried this myself, but it's part of the reference implementation and this is how it should work.
Uria Franko
09/13/2023, 7:11 AM
The nodes got the AL2_x86_64_GPU AMI, meaning the count worked on the _mngs_asg_tags. Should I still change the default from 0?
David Espejo (he/him)
09/13/2023, 2:16 PM
Is the worker-single-gpu definition one you created?
Uria Franko
09/13/2023, 2:22 PM
tolerations:
  - key: 'nvidia.com/gpu'
    effect: 'NoSchedule'
    value: 'present'
  - key: 'flyte.org/node-role'
    operator: 'Equal'
    value: 'worker'
    effect: 'NoSchedule'
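So the pod does get the nvidia.com/gpu toleration injected (via gpu-resource-name and resource-tolerations) plus the default worker toleration. If the task pod still stays Pending, the scheduler events on it usually say which predicate fails (untolerated taints, insufficient nvidia.com/gpu, and so on). A small sketch, not from the thread, with placeholder namespace and pod name, assuming the kubernetes Python client:

# Sketch: print the scheduling events for a Flyte task pod to see why it is not
# landing on the GPU node. Namespace and pod name are placeholders (Flyte puts
# task pods in the <project>-<domain> namespace).
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()

namespace = "flytesnacks-development"  # placeholder
pod_name = "<pending-task-pod>"        # placeholder: the pending task pod name

events = v1.list_namespaced_event(
    namespace, field_selector=f"involvedObject.name={pod_name}"
)
for event in events.items:
    print(event.reason, event.message)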
David Espejo (he/him)
09/13/2023, 2:28 PM
Uria Franko
09/15/2023, 1:59 PM
David Espejo (he/him)
09/15/2023, 2:57 PM
tolerations array defined