diff --git a/neurons/validators/src/services/const.py b/neurons/validators/src/services/const.py
index a187af2d..f02ad413 100644
--- a/neurons/validators/src/services/const.py
+++ b/neurons/validators/src/services/const.py
@@ -346,3 +346,32 @@
     "26.1.3": "52d8fcc2c4370bf324cdf17cbc586784",
     "27.3.1": "40f1f7724fa0432ea6878692a05b998c",
 }
+
+
+# Minimum number of CPU cores required per GPU, keyed by GPU model name.
+GPU_MIN_CORE_COUNT = {
+    # Latest Gen NVIDIA GPUs
+    "NVIDIA H200": 16,
+    "NVIDIA H100 80GB HBM3": 16,
+    "NVIDIA H100 NVL": 16,
+    "NVIDIA H100 PCIe": 16,
+    "NVIDIA GeForce RTX 4090": 12,
+    "NVIDIA GeForce RTX 4090 D": 12,
+    "NVIDIA RTX 4000 Ada Generation": 12,
+    "NVIDIA RTX 6000 Ada Generation": 10,
+    "NVIDIA L4": 12,
+    "NVIDIA L40S": 10,
+    "NVIDIA L40": 12,
+    "NVIDIA RTX 2000 Ada Generation": 10,
+    # Previous Gen NVIDIA GPUs
+    "NVIDIA A100 80GB PCIe": 16,
+    "NVIDIA A100-SXM4-80GB": 16,
+    "NVIDIA RTX A6000": 6,
+    "NVIDIA RTX A5000": 8,
+    "NVIDIA RTX A4500": 6,
+    "NVIDIA RTX A4000": 4,
+    "NVIDIA A40": 6,
+    "NVIDIA A30": 6,
+    "NVIDIA GeForce RTX 3090": 12,
+    "NVIDIA V100": 6,
+}
diff --git a/neurons/validators/src/services/task_service.py b/neurons/validators/src/services/task_service.py
index 6478033a..40bd95f5 100644
--- a/neurons/validators/src/services/task_service.py
+++ b/neurons/validators/src/services/task_service.py
@@ -30,6 +30,7 @@
     GPU_UTILIZATION_LIMIT,
     GPU_MEMORY_UTILIZATION_LIMIT,
     VERIFY_JOB_REQUIRED_COUNT,
+    GPU_MIN_CORE_COUNT,
 )
 from services.redis_service import (
     RedisService,
@@ -511,6 +512,7 @@ async def create_task(
 
                 ram = machine_spec.get("ram", {}).get("total", 0)
                 storage = machine_spec.get("hard_disk", {}).get("free", 0)
+                cpu_count = machine_spec.get("cpu", {}).get("count", 0)
 
                 gpu_processes = machine_spec.get("gpu_processes", [])
 
@@ -742,24 +744,46 @@ async def create_task(
                         log_text,
                     )
 
-                # if ram < vram * 0.9 or storage < vram * 1.5:
-                #     log_status = "warning"
-                #     log_text = _m(
-                #         "Incorrect vram",
-                #         extra=get_extra_info(
-                #             {
-                #                 **default_extra,
-                #                 "gpu_model": gpu_model,
-                #                 "gpu_count": gpu_count,
-                #                 "memory": ram,
-                #                 "vram": vram,
-                #                 "storage": storage,
-                #                 "nvidia_driver": nvidia_driver,
-                #                 "libnvidia_ml": libnvidia_ml,
-                #             }
-                #         ),
-                #     )
-                #     logger.warning(log_text)
+                # Warn when system RAM is smaller than the combined VRAM of all GPUs.
+                # NOTE(review): assumes ram and vram use the same unit - confirm.
+                if ram < vram * gpu_count:
+                    log_status = "warning"
+                    log_text = _m(
+                        "Executor below min vram specifications allowed",
+                        extra=get_extra_info(
+                            {
+                                **default_extra,
+                                "gpu_model": gpu_model,
+                                "gpu_count": gpu_count,
+                                "memory": ram,
+                                "vram": vram,
+                                "storage": storage,
+                                "nvidia_driver": nvidia_driver,
+                                "libnvidia_ml": libnvidia_ml,
+                            }
+                        ),
+                    )
+                    logger.warning(log_text)
+
+                # Warn when the executor has fewer CPU cores than the per-GPU minimum
+                # times the GPU count. Models absent from GPU_MIN_CORE_COUNT default
+                # to 0 (no minimum) so an unlisted GPU cannot raise KeyError here.
+                if cpu_count < GPU_MIN_CORE_COUNT.get(gpu_model, 0) * gpu_count:
+                    log_status = "warning"
+                    log_text = _m(
+                        "Executor is below min cpu core specifications allowed",
+                        extra=get_extra_info(
+                            {
+                                **default_extra,
+                                "gpu_model": gpu_model,
+                                "gpu_count": gpu_count,
+                                "cpu_count": cpu_count,
+                                "memory": ram,
+                                "vram": vram,
+                            }
+                        ),
+                    )
+                    logger.warning(log_text)
 
                 # await self.clear_remote_directory(ssh_client, remote_dir)
                 # await self.redis_service.set_verified_job_info(