Skip to content

feat(aci): Enforce quota limits when creating metric detectors #97445

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/sentry/features/temporary.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,8 @@ def register_temporary_features(manager: FeatureManager):
manager.add("organizations:workflow-engine-rule-serializers", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
# Enable single processing of metric issues
manager.add("organizations:workflow-engine-single-process-metric-issues", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
# Enable metric detector limits by plan type
manager.add("organizations:workflow-engine-metric-detector-limit", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
# Enable EventUniqueUserFrequencyConditionWithConditions special alert condition
manager.add("organizations:event-unique-user-frequency-condition-with-conditions", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
# Use spans instead of transactions for dynamic sampling calculations. This will become the new default.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from rest_framework.request import Request
from rest_framework.response import Response

from sentry import features, options
from sentry import features, options, quotas
from sentry.api.api_owners import ApiOwner
from sentry.api.api_publish_status import ApiPublishStatus
from sentry.api.base import Endpoint, region_silo_endpoint
Expand Down Expand Up @@ -562,6 +562,19 @@ def check_can_create_alert(self, request: Request, organization: Organization) -
permission, then we must verify that the user is a team admin with "alerts:write" access to the project(s)
in their request.
"""
if features.has(
"organizations:workflow-engine-metric-detector-limit", organization, actor=request.user
):
alert_limit = quotas.backend.get_metric_detector_limit(organization.id)
alert_count = AlertRule.objects.fetch_for_organization(
organization=organization
).count()

if alert_limit >= 0 and alert_count >= alert_limit:
raise ValidationError(
f"You may not exceed {alert_limit} metric alerts on your current plan."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Quota Enforcement Mismatch Across Paths

The metric detector quota is inconsistently enforced. The AlertRule creation endpoint counts all AlertRule objects (excluding snapshots), while the Detector creation path counts only active Detector objects. Both paths use the same quota limit, which allows users to exceed the intended limit.

Fix in Cursor Fix in Web

)

# if the requesting user has any of these org-level permissions, then they can create an alert
if (
request.access.has_scope("alerts:write")
Expand Down
28 changes: 28 additions & 0 deletions src/sentry/incidents/metric_issue_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from rest_framework import serializers

from sentry import features, quotas
from sentry.constants import ObjectStatus
from sentry.incidents.logic import enable_disable_subscriptions
from sentry.snuba.models import QuerySubscription, SnubaQuery, SnubaQueryEventType
from sentry.snuba.snuba_query_validator import SnubaQueryValidator
Expand Down Expand Up @@ -88,6 +90,32 @@ def validate(self, attrs):

return attrs

def enforce_quota_on_create(self) -> None:
"""
Enforce quota limits for metric detector creation.
Raises ValidationError if quota limits are exceeded.
"""
organization = self.context.get("organization")
request = self.context.get("request")

if not organization or not request:
raise serializers.ValidationError()

if features.has(
"organizations:workflow-engine-metric-detector-limit", organization, actor=request.user
):
detector_limit = quotas.backend.get_metric_detector_limit(organization.id)
detector_count = Detector.objects.filter(
project__organization=organization,
type="metric_issue", # Avoided circular import. TODO: move magic strings to constant file
status=ObjectStatus.ACTIVE,
).count()

if detector_limit >= 0 and detector_count >= detector_limit:
raise serializers.ValidationError(
f"You may not exceed {detector_limit} metric detectors on your current plan."
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Quota Error Grammar & Hardcoded String

The new metric detector quota enforcement has two issues. The error message for exceeding the limit uses incorrect grammar when the limit is 1 (e.g., "1 metric detectors" instead of "1 metric detector"). Additionally, the detector type filter uses a hardcoded string "metric_issue" instead of a constant, which makes the code less robust.

Fix in Cursor Fix in Web

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Inconsistent Quota Handling Across Detectors and Alerts

Quota enforcement for metric detectors and alert rules is inconsistent: detectors count only ACTIVE objects, while alert rules exclude only SNAPSHOT status. This leads to different quota behavior. Also, the quota exceeded message uses plural forms ('detectors', 'alerts') when the limit is 1.

Additional Locations (2)
Fix in Cursor Fix in Web

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Quota Enforcement Issues

The enforce_quota_on_create method raises an unhelpful, empty ValidationError if organization or request context is missing. This validation also incorrectly runs even when the quota feature flag is disabled. Additionally, the quota check is not atomic with detector creation, allowing concurrent requests to exceed the configured limit.

Fix in Cursor Fix in Web


def update_data_source(self, instance: Detector, data_source: SnubaQueryDataSourceType):
try:
source_instance = DataSource.objects.get(detector=instance)
Expand Down
6 changes: 6 additions & 0 deletions src/sentry/quotas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,3 +748,9 @@ def get_dashboard_limit(self, org_id: int) -> int:
Returns the maximum number of dashboards allowed for the organization's plan type.
"""
return -1

def get_metric_detector_limit(self, org_id: int) -> int:
"""
Returns the maximum number of detectors allowed for the organization's plan type.
"""
return -1
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def post(self, request: Request, organization: Organization) -> Response:
if not detector_type:
raise ValidationError({"type": ["This field is required."]})

# restrict creating metric issue detectors by plan type
# Restrict creating metric issue detectors by plan type
if detector_type == MetricIssue.slug and not features.has(
"organizations:incidents", organization, actor=request.user
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ def data_source(self) -> BaseDataSourceValidator:
def data_conditions(self) -> BaseDataConditionValidator:
raise NotImplementedError

def enforce_quota_on_create(self) -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you decorate this with @abc.abstractmethod? that way every detector will have to have this method. (hopefully makes it easier to find the quota for each detector type)

"""
Enforce quota limits for detector creation.
Raise ValidationError if quota limits are exceeded.

Only Metric Issues Detector has quota limits.
"""
pass

def update(self, instance: Detector, validated_data: dict[str, Any]):
with transaction.atomic(router.db_for_write(Detector)):
if "name" in validated_data:
Expand Down Expand Up @@ -110,6 +119,10 @@ def update(self, instance: Detector, validated_data: dict[str, Any]):
return instance

def create(self, validated_data):
# If quotas are exceeded, we will prevent creation of new detectors.
# Do not disable or prevent the users from updating existing detectors.
self.enforce_quota_on_create()
Comment on lines +122 to +124
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it might be a cleaner abstraction to make enforce_quota_on_create method return a boolean result (has quota or not), then have this raise the validation error. that way we can enforce the data flow control through the abstract method definition. another nit is for the name; maybe we can just name it enforce_quota -- that way it's a little cleaner if we want to reuse the method in the future.

# If quotas are exceeded, we will prevent creation of new detectors.
# Do not disable or prevent the users from updating existing detectors.
if not self.enforce_quota():
    serializers.ValidationError(f"Cannot create {validated_data.type} monitor, exceeds quota limit.")

we may also want to have the result include information about the quota itself to keep the nice error message per detector type. in that case, we could do something like:

@dataclass(frozen=True)
class DetectorQuota():
    limit: int | False # or something close for the typing
    count: int

class BaseDetectorValidator():
    @abc.abstractmethod
    def get_quota(self) -> DetectorQuota:
        pass
    
    def enforce_quota(self) -> None:
        quota = self.get_quota()
    
        if quota.limit and quota.limit >= quota.count:
            raise serializers.ValidationError(
                f"Cannot create {validated_data.type} monitor, exceeds quota of {quota.limit}."
            )
    
    def create(self, validated_data):
        self.enforce_quota()
        # ...


with transaction.atomic(router.db_for_write(Detector)):
condition_group = DataConditionGroup.objects.create(
logic_type=DataConditionGroup.Type.ANY,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
AlertRuleDetectionType,
AlertRuleSeasonality,
AlertRuleSensitivity,
AlertRuleStatus,
AlertRuleThresholdType,
AlertRuleTrigger,
AlertRuleTriggerAction,
Expand Down Expand Up @@ -1640,6 +1641,58 @@ def test_performance_alert(self, record_analytics: MagicMock) -> None:
resp = self.get_response(self.organization.slug, **valid_alert_rule)
assert resp.status_code == 201

@with_feature("organizations:incidents")
@with_feature("organizations:workflow-engine-metric-detector-limit")
@patch("sentry.quotas.backend.get_metric_detector_limit")
def test_metric_alert_limit(self, mock_get_limit: MagicMock) -> None:
# Set limit to 2 alert rules
mock_get_limit.return_value = 2

# Create 2 existing metric alert rules (1 active, 1 to be deleted)
alert_rule = self.create_alert_rule(organization=self.organization)
alert_rule.status = AlertRuleStatus.PENDING.value
alert_rule.save()

alert_rule = self.create_alert_rule(organization=self.organization)
alert_rule.status = AlertRuleStatus.SNAPSHOT.value
alert_rule.save()

# Create another alert rule, it should succeed
data = deepcopy(self.alert_rule_dict)
with outbox_runner():
resp = self.get_success_response(
self.organization.slug,
status_code=201,
**data,
)
alert_rule = AlertRule.objects.get(id=resp.data["id"])
assert alert_rule.name == "JustAValidTestRule"

# Create another alert rule, it should fail
data = deepcopy(self.alert_rule_dict)
resp = self.get_error_response(
self.organization.slug,
status_code=400,
**data,
)

@with_feature("organizations:incidents")
@with_feature("organizations:workflow-engine-metric-detector-limit")
def test_metric_alert_limit_unlimited_plan(self) -> None:
# Create many alert rules
for _ in range(5):
self.create_alert_rule(organization=self.organization)

# Creating another alert rule, it should succeed
with outbox_runner():
resp = self.get_success_response(
self.organization.slug,
status_code=201,
**self.alert_rule_dict,
)
alert_rule = AlertRule.objects.get(id=resp.data["id"])
assert alert_rule.name == "JustAValidTestRule"


@freeze_time()
class AlertRuleCreateEndpointTestCrashRateAlert(AlertRuleIndexBase):
Expand Down
50 changes: 49 additions & 1 deletion tests/sentry/incidents/endpoints/validators/test_validators.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from unittest import mock

from rest_framework.exceptions import ErrorDetail
from rest_framework.exceptions import ErrorDetail, ValidationError

from sentry import audit_log
from sentry.constants import ObjectStatus
from sentry.incidents.grouptype import MetricIssue
from sentry.incidents.metric_issue_detector import (
MetricIssueComparisonConditionValidator,
Expand All @@ -23,6 +24,7 @@
SnubaQuery,
SnubaQueryEventType,
)
from sentry.testutils.helpers.features import with_feature
from sentry.workflow_engine.models import DataCondition, DataConditionGroup, DataSource, Detector
from sentry.workflow_engine.models.data_condition import Condition
from sentry.workflow_engine.registry import data_source_type_registry
Expand Down Expand Up @@ -358,3 +360,49 @@ def test_too_many_conditions(self) -> None:
assert validator.errors.get("nonFieldErrors") == [
ErrorDetail(string="Too many conditions", code="invalid")
]

@mock.patch("sentry.quotas.backend.get_metric_detector_limit")
def test_enforce_quota_feature_disabled(self, mock_get_limit: mock.MagicMock) -> None:
mock_get_limit.return_value = 0
validator = MetricIssueDetectorValidator(data=self.valid_data, context=self.context)

assert validator.is_valid()
assert validator.save()
mock_get_limit.assert_not_called()

@mock.patch("sentry.quotas.backend.get_metric_detector_limit")
@with_feature("organizations:workflow-engine-metric-detector-limit")
def test_enforce_quota_within_limit(self, mock_get_limit: mock.MagicMock) -> None:
mock_get_limit.return_value = 1

# Create a not-metric detector
self.create_detector(
project_id=self.project.id,
name="Error Detector",
status=ObjectStatus.ACTIVE,
)
# Create 3 inactive detectors
for status in [
ObjectStatus.DISABLED,
ObjectStatus.PENDING_DELETION,
ObjectStatus.DELETION_IN_PROGRESS,
]:
self.create_detector(
project_id=self.project.id,
name=f"Inactive Detector {status}",
type=MetricIssue.slug,
status=status,
)

validator = MetricIssueDetectorValidator(data=self.valid_data, context=self.context)
assert validator.is_valid()
assert validator.save()
mock_get_limit.assert_called_once_with(self.project.organization.id)

validator = MetricIssueDetectorValidator(data=self.valid_data, context=self.context)
validator.is_valid()
with self.assertRaisesMessage(
ValidationError,
expected_message="You may not exceed 1 metric detectors on your current plan.",
):
validator.save()
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,115 @@ def test_owner_not_in_organization(self) -> None:
)
assert "owner" in response.data

@with_feature("organizations:workflow-engine-metric-detector-limit")
@mock.patch("sentry.quotas.backend.get_metric_detector_limit")
def test_metric_detector_limit(self, mock_get_limit: mock.MagicMock) -> None:
# Set limit to 2 detectors
mock_get_limit.return_value = 2

# Create 2 metric detectors (1 active, 1 to be deleted)
self.create_detector(
project_id=self.project.id,
name="Existing Detector 1",
type=MetricIssue.slug,
status=ObjectStatus.ACTIVE,
)
self.create_detector(
project_id=self.project.id,
name="Existing Detector 2",
type=MetricIssue.slug,
status=ObjectStatus.PENDING_DELETION,
)

# Create another metric detector, it should succeed
with self.tasks():
response = self.get_success_response(
self.organization.slug,
**self.valid_data,
status_code=201,
)
detector = Detector.objects.get(id=response.data["id"])
assert detector.name == "Test Detector"

# Create another metric detector, it should fail
response = self.get_error_response(
self.organization.slug,
**self.valid_data,
status_code=400,
)
assert response.status_code == 400

@with_feature("organizations:workflow-engine-metric-detector-limit")
@mock.patch("sentry.quotas.backend.get_metric_detector_limit")
def test_metric_detector_limit_unlimited_plan(self, mock_get_limit: mock.MagicMock) -> None:
# Set limit to -1 (unlimited)
mock_get_limit.return_value = -1

# Create many metric detectors
for i in range(5):
self.create_detector(
project_id=self.project.id,
name=f"Existing Detector {i+1}",
type=MetricIssue.slug,
status=ObjectStatus.ACTIVE,
)

# Create another detector, it should succeed
with self.tasks():
response = self.get_success_response(
self.organization.slug,
**self.valid_data,
status_code=201,
)
mock_get_limit.assert_called_once_with(self.organization.id)
detector = Detector.objects.get(id=response.data["id"])
assert detector.name == "Test Detector"

@with_feature("organizations:workflow-engine-metric-detector-limit")
@mock.patch("sentry.quotas.backend.get_metric_detector_limit")
def test_metric_detector_limit_only_applies_to_metric_detectors(
self, mock_get_limit: mock.MagicMock
) -> None:
# Set limit to 1 metric detector
mock_get_limit.return_value = 1

# Create a not-metric detector
self.create_detector(
project_id=self.project.id,
name="Error Detector",
type=ErrorGroupType.slug,
status=ObjectStatus.ACTIVE,
)

# Create 1 metric detector, it should succeed
response = self.get_success_response(
self.organization.slug,
**self.valid_data,
status_code=201,
)
detector = Detector.objects.get(id=response.data["id"])
assert detector.name == "Test Detector"

# Create another metric detector, it should fail
response = self.get_error_response(
self.organization.slug,
**self.valid_data,
status_code=400,
)
assert response.status_code == 400

# Create another not-metric detector, it should succeed
with self.tasks():
response = self.get_success_response(
self.organization.slug,
projectId=self.project.id,
name="Error Detector",
type=ErrorGroupType.slug,
status_code=201,
)
detector = Detector.objects.get(id=response.data["id"])
assert detector.type == ErrorGroupType.slug


@region_silo_test
@with_feature("organizations:incidents")
Expand Down
Loading