Add minimum specs for executor #237

Open · wants to merge 4 commits into main

28 changes: 28 additions & 0 deletions neurons/validators/src/services/const.py
@@ -346,3 +346,31 @@
"26.1.3": "52d8fcc2c4370bf324cdf17cbc586784",
"27.3.1": "40f1f7724fa0432ea6878692a05b998c",
}


GPU_MIN_CORE_COUNT = {
# Latest Gen NVIDIA GPUs
"NVIDIA H200": 16,
"NVIDIA H100 80GB HBM3": 16,
"NVIDIA H100 NVL": 16,
"NVIDIA H100 PCIe": 16,
"NVIDIA GeForce RTX 4090": 12,
"NVIDIA GeForce RTX 4090 D": 12,
"NVIDIA RTX 4000 Ada Generation": 12,
"NVIDIA RTX 6000 Ada Generation": 10,
"NVIDIA L4": 12,
"NVIDIA L40S": 10,
"NVIDIA L40": 12,
"NVIDIA RTX 2000 Ada Generation": 10,
# Previous Gen NVIDIA GPUs
"NVIDIA A100 80GB PCIe": 16,
"NVIDIA A100-SXM4-80GB": 16,
"NVIDIA RTX A6000": 6,
"NVIDIA RTX A5000": 8,
"NVIDIA RTX A4500": 6,
"NVIDIA RTX A4000": 4,
"NVIDIA A40": 6,
"NVIDIA A30": 6,
"NVIDIA GeForce RTX 3090": 12,
"NVIDIA V100": 6,
}
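
The new GPU_MIN_CORE_COUNT table maps each supported GPU model to the minimum number of CPU cores required per GPU, so a 4x "NVIDIA H100 PCIe" executor must expose at least 4 × 16 = 64 cores. A minimal sketch of the lookup, assuming callers mirror the inline check added to task_service.py below (the helper name is hypothetical, not part of this PR):

    from services.const import GPU_MIN_CORE_COUNT

    def meets_min_cpu_cores(gpu_model: str, gpu_count: int, cpu_count: int) -> bool:
        # Unlisted models are given no requirement here; the PR's inline
        # check indexes the dict directly and would raise KeyError instead.
        min_cores_per_gpu = GPU_MIN_CORE_COUNT.get(gpu_model, 0)
        return cpu_count >= min_cores_per_gpu * gpu_count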
55 changes: 37 additions & 18 deletions neurons/validators/src/services/task_service.py
@@ -30,6 +30,7 @@
GPU_UTILIZATION_LIMIT,
GPU_MEMORY_UTILIZATION_LIMIT,
VERIFY_JOB_REQUIRED_COUNT,
GPU_MIN_CORE_COUNT,
)
from services.redis_service import (
RedisService,
@@ -511,6 +512,7 @@ async def create_task(

ram = machine_spec.get("ram", {}).get("total", 0)
storage = machine_spec.get("hard_disk", {}).get("free", 0)
cpu_count = machine_spec.get("cpu", {}).get("count", 0)

gpu_processes = machine_spec.get("gpu_processes", [])

@@ -742,24 +744,41 @@ async def create_task(
log_text,
)

# if ram < vram * 0.9 or storage < vram * 1.5:
# log_status = "warning"
# log_text = _m(
# "Incorrect vram",
# extra=get_extra_info(
# {
# **default_extra,
# "gpu_model": gpu_model,
# "gpu_count": gpu_count,
# "memory": ram,
# "vram": vram,
# "storage": storage,
# "nvidia_driver": nvidia_driver,
# "libnvidia_ml": libnvidia_ml,
# }
# ),
# )
# logger.warning(log_text)
if ram < vram * gpu_count:
log_status = "warning"
log_text = _m(
"Executor below min vram specifications allowed",
extra=get_extra_info(
{
**default_extra,
"gpu_model": gpu_model,
"gpu_count": gpu_count,
"memory": ram,
"vram": vram,
"storage": storage,
"nvidia_driver": nvidia_driver,
"libnvidia_ml": libnvidia_ml,
}
),
)
logger.warning(log_text)
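
Assuming vram here is the per-GPU memory reported by the executor, this check requires system RAM to at least match total GPU memory (e.g. two H100 80GB cards would need 160 GB of RAM); the commented-out rule it replaces instead compared RAM against 90% of vram and free storage against 150% of vram.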

if cpu_count < GPU_MIN_CORE_COUNT[gpu_model] * gpu_count:
log_status = "warning"
log_text = _m(
"Executor is below min cpu core specifications allowed",
extra=get_extra_info(
{
**default_extra,
"gpu_model": gpu_model,
"gpu_count": gpu_count,
"cpu_count": cpu_count,
"memory": ram,
"vram": vram,
}
),
)
logger.warning(log_text)
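
One caveat: GPU_MIN_CORE_COUNT[gpu_model] raises KeyError for any model absent from the new table in const.py, so if gpu_model can ever fall outside that list, the exception would propagate instead of logging a warning. A defensive variant (a sketch, not part of this PR) could skip unknown models:

    # Hypothetical guard: only warn when the model has a known threshold.
    min_cores_per_gpu = GPU_MIN_CORE_COUNT.get(gpu_model)
    if min_cores_per_gpu is not None and cpu_count < min_cores_per_gpu * gpu_count:
        logger.warning(log_text)  # same warning payload as above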

# await self.clear_remote_directory(ssh_client, remote_dir)
# await self.redis_service.set_verified_job_info(