-
Notifications
You must be signed in to change notification settings - Fork 810
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Monitoring] Add auto-allocation of ECS resources #4407
base: master
Are you sure you want to change the base?
[Monitoring] Add auto-allocation of ECS resources #4407
Conversation
def get_cpu_metrics_for_challenge( | ||
challenge, | ||
cluster_name=COMMON_SETTINGS_DICT["CLUSTER"], | ||
range_days=1, | ||
period_seconds=300, | ||
): | ||
""" | ||
Get the CPU Utilization of the worker in the challenge. | ||
""" | ||
|
||
cloudwatch_client = get_boto3_client("cloudwatch", aws_keys) | ||
|
||
start_time = datetime.utcnow() - timedelta(days=range_days) | ||
end_time = datetime.utcnow() | ||
queue_name = challenge.queue | ||
service_name = "{}_service".format(queue_name) | ||
|
||
response = cloudwatch_client.get_metric_statistics( | ||
Namespace="AWS/ECS", | ||
MetricName="CPUUtilization", | ||
Dimensions=[ | ||
{"Name": "ClusterName", "Value": cluster_name}, | ||
{"Name": "ServiceName", "Value": service_name}, | ||
], | ||
StartTime=start_time, | ||
EndTime=end_time, | ||
Period=period_seconds, | ||
Statistics=["Average", "Maximum", "Minimum"], |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract CPU Utilization from workers with Cloudwatch
def get_memory_metrics_for_challenge( | ||
challenge, | ||
cluster_name=COMMON_SETTINGS_DICT["CLUSTER"], | ||
range_days=1, | ||
period_seconds=300, | ||
): | ||
""" | ||
Get the Memory Utilization of the worker in the challenge. | ||
""" | ||
|
||
cloudwatch_client = get_boto3_client("cloudwatch", aws_keys) | ||
|
||
start_time = datetime.utcnow() - timedelta(days=range_days) | ||
end_time = datetime.utcnow() | ||
queue_name = challenge.queue | ||
service_name = "{}_service".format(queue_name) | ||
response = cloudwatch_client.get_metric_statistics( | ||
Namespace="AWS/ECS", | ||
MetricName="MemoryUtilization", | ||
Dimensions=[ | ||
{"Name": "ClusterName", "Value": cluster_name}, | ||
{"Name": "ServiceName", "Value": service_name}, | ||
], | ||
StartTime=start_time, | ||
EndTime=end_time, | ||
Period=period_seconds, | ||
Statistics=["Average", "Maximum", "Minimum"], | ||
) | ||
|
||
return response["Datapoints"] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract Memory Utilization from worker with Cloudwatch
def get_storage_metrics_for_challenge( | ||
challenge, | ||
cluster_name=COMMON_SETTINGS_DICT["CLUSTER"], | ||
range_days=1, | ||
period_seconds=300, | ||
): | ||
""" | ||
Get the Storage Utilization of the worker in the challenge. | ||
""" | ||
|
||
from datetime import datetime, timedelta | ||
|
||
cloudwatch_client = get_boto3_client("cloudwatch", aws_keys) | ||
|
||
start_time = datetime.utcnow() - timedelta(days=range_days) | ||
end_time = datetime.utcnow() | ||
queue_name = challenge.queue | ||
service_name = "{}_service".format(queue_name) | ||
|
||
response = cloudwatch_client.get_metric_statistics( | ||
Namespace="ECS/ContainerInsights", | ||
MetricName="EphemeralStorageUtilized", | ||
Dimensions=[ | ||
{"Name": "ClusterName", "Value": cluster_name}, | ||
{"Name": "ServiceName", "Value": service_name}, | ||
], | ||
StartTime=start_time, | ||
EndTime=end_time, | ||
Period=period_seconds, | ||
Statistics=["Average"], |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract Storage Utilization from worker with Cloudwatch
def current_worker_limit(task_definition_name, metrics): | ||
ecs_client = get_boto3_client("ecs", aws_keys) | ||
try: | ||
response = ecs_client.describe_task_definition( | ||
taskDefinition=task_definition_name | ||
) | ||
except Exception as e: | ||
print(f"Error retrieving task definition: {str(e)}") | ||
return {} | ||
|
||
task_definition = response.get("taskDefinition", {}) | ||
return task_definition.get(metrics, 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract the current resources limit for workers, only applied for CPU and Memory
return task_definition.get(metrics, 0) | ||
|
||
|
||
def get_new_resource_limit( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function adjusts the worker limit based on how much they’re being used:
- Lower the Limit: If average utilization is below 25%, the limit is cut in half.
- Raise the Limit: If average utilization is above 75%, the limit is doubled.
There’s also room to add more complex logic later if needed.
average = sum( | ||
datapoint["Average"] for datapoint in metrics if "Average" in datapoint | ||
) / len(metrics) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Calculates the average of the "Average" metrics across all data points (where each data point represents a day). This essentially provides the overall average utilization percentage based on the daily average metrics.
# Apply separate logic based on whether the metric is CPU or Memory | ||
if metric_name == "CPU": | ||
# CPU-specific scaling logic | ||
if average <= 25: | ||
# if average smaller than 25%, scale down | ||
print( | ||
f"Scaling down {service_name} due to low {metric_name} utilization" | ||
) | ||
new_limit = str(int(current_metric_limit) // 2) | ||
elif average >= 75: | ||
# if average greater than 75%, scale up | ||
print( | ||
f"Scaling up {service_name} due to high {metric_name} utilization" | ||
) | ||
new_limit = str(int(current_metric_limit) * 2) | ||
else: | ||
# no scaling action required | ||
print( | ||
f"No scaling action required for {service_name} based on {metric_name} utilization" | ||
) | ||
new_limit = current_metric_limit | ||
return new_limit |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Logic to find new limit for CPU
elif metric_name == "Memory": | ||
if average <= 25: | ||
# if average smaller than 25%, scale down | ||
print( | ||
f"Scaling down {service_name} due to low {metric_name} utilization" | ||
) | ||
new_limit = str(int(current_metric_limit) // 2) | ||
elif average >= 75: | ||
# if average greater than 75%, scale up | ||
print( | ||
f"Scaling up {service_name} due to high {metric_name} utilization" | ||
) | ||
new_limit = str(int(current_metric_limit) * 2) | ||
else: | ||
# no scaling action required | ||
print( | ||
f"No scaling action required for {service_name} based on {metric_name} utilization" | ||
) | ||
new_limit = current_metric_limit | ||
return new_limit |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Logic to find new limit for Memory
Codecov ReportAll modified and coverable lines are covered by tests ✅
❗ Your organization needs to install the Codecov GitHub app to enable full functionality. Additional details and impacted files@@ Coverage Diff @@
## master #4407 +/- ##
==========================================
- Coverage 72.93% 69.30% -3.63%
==========================================
Files 83 20 -63
Lines 5368 3574 -1794
==========================================
- Hits 3915 2477 -1438
+ Misses 1453 1097 -356 see 64 files with indirect coverage changes see 64 files with indirect coverage changes Continue to review full report in Codecov by Sentry.
|
Thanks for this PR! Made a few changes. |
This pull request adds the functionality to automatically monitor and scale Fargate workers. This feature will help optimize resource allocation and improve the overall performance of the application.