diff --git a/deployment/grid/terraform/compute_plane/aws_iam.tf b/deployment/grid/terraform/compute_plane/aws_iam.tf index 14621ed2..3b9d41bb 100644 --- a/deployment/grid/terraform/compute_plane/aws_iam.tf +++ b/deployment/grid/terraform/compute_plane/aws_iam.tf @@ -14,7 +14,7 @@ module "keda_role" { } cluster_service_accounts = { - "${var.cluster_name}" = ["keda:keda-operator"] + (var.cluster_name) = ["keda:keda-operator"] } depends_on = [ diff --git a/deployment/grid/terraform/compute_plane/grafana_auth.tf b/deployment/grid/terraform/compute_plane/grafana_auth.tf index 615a1d1e..20336419 100644 --- a/deployment/grid/terraform/compute_plane/grafana_auth.tf +++ b/deployment/grid/terraform/compute_plane/grafana_auth.tf @@ -9,7 +9,7 @@ resource "aws_cognito_user_pool_client" "grafana" { allowed_oauth_flows_user_pool_client = true generate_secret = true allowed_oauth_flows = ["code"] - callback_urls = ["https://${data.kubernetes_ingress_v1.grafana_ingress.status.0.load_balancer.0.ingress.0.hostname}/oauth2/idpresponse"] + callback_urls = ["https://${data.kubernetes_ingress_v1.grafana_ingress.status[0].load_balancer[0].ingress[0].hostname}/oauth2/idpresponse"] allowed_oauth_scopes = [ "email", "openid" ] diff --git a/deployment/grid/terraform/compute_plane/main.tf b/deployment/grid/terraform/compute_plane/main.tf index 1c66495f..826d71e0 100644 --- a/deployment/grid/terraform/compute_plane/main.tf +++ b/deployment/grid/terraform/compute_plane/main.tf @@ -17,7 +17,7 @@ locals { xvda = { device_name = "/dev/xvda" ebs = { - volume_size = 50 + volume_size = var.eks_node_volume_size volume_type = "gp3" encrypted = true kms_key_id = module.eks_ebs_kms_key.key_arn @@ -50,7 +50,7 @@ locals { xvda = { device_name = "/dev/xvda" ebs = { - volume_size = 20 + volume_size = var.eks_node_volume_size volume_type = "gp3" encrypted = true kms_key_id = module.eks_ebs_kms_key.key_arn diff --git a/deployment/grid/terraform/compute_plane/outputs.tf b/deployment/grid/terraform/compute_plane/outputs.tf index a92b5a59..b83cdefe 100644 --- a/deployment/grid/terraform/compute_plane/outputs.tf +++ b/deployment/grid/terraform/compute_plane/outputs.tf @@ -25,12 +25,12 @@ output "certificate_authority" { output "nlb_influxdb" { description = "url of the NLB in front of the influx DB" - value = data.kubernetes_service_v1.influxdb_load_balancer.status.0.load_balancer.0.ingress.0.hostname + value = data.kubernetes_service_v1.influxdb_load_balancer.status[0].load_balancer[0].ingress[0].hostname } output "grafana_ingress_domain" { description = "Ingress Domain for Grafana" - value = "https://${data.kubernetes_ingress_v1.grafana_ingress.status.0.load_balancer.0.ingress.0.hostname}" + value = "https://${data.kubernetes_ingress_v1.grafana_ingress.status[0].load_balancer[0].ingress[0].hostname}" } output "eks_managed_node_groups" { diff --git a/deployment/grid/terraform/compute_plane/variables.tf b/deployment/grid/terraform/compute_plane/variables.tf index 1cae0eaf..8950b4e9 100644 --- a/deployment/grid/terraform/compute_plane/variables.tf +++ b/deployment/grid/terraform/compute_plane/variables.tf @@ -4,6 +4,7 @@ variable "region" { description = "AWS region" + type = string } variable "input_role" { @@ -17,31 +18,32 @@ variable "input_role" { variable "kubernetes_version" { description = "Name of EKS cluster in AWS" -} - -variable "htc_agent_namespace" { - description = "kubernetes namespace for the deployment of the agent" - default = "default" + type = string } variable "aws_htc_ecr" { description = "URL of Amazon ECR image repostiories" + type = string } variable "cluster_name" { description = "Name of EKS cluster in AWS" + type = string } variable "k8s_ca_version" { description = "Cluster autoscaler version" + type = string } variable "k8s_keda_version" { description = "Keda version" + type = string } variable "suffix" { description = "suffix for generating unique name for AWS resource" + type = string default = "" } @@ -51,22 +53,17 @@ variable "eks_worker_groups" { variable "vpc_private_subnet_ids" { description = "Private subnet IDs" + type = list(string) } variable "vpc_public_subnet_ids" { description = "Public subnet IDs" -} - -variable "vpc_default_security_group_id" { - description = "Default SG ID" + type = list(string) } variable "vpc_id" { description = "Default VPC ID" -} - -variable "vpc_cidr" { - description = "Default VPC CIDR" + type = string } variable "enable_private_subnet" { @@ -82,11 +79,13 @@ variable "grafana_admin_password" { variable "kms_deletion_window" { description = "Number of days after which KMS key will be permanently deleted" + type = number default = 7 } variable "kms_key_admin_roles" { description = "List of roles to assign KMS Key Administrator permissions" + type = list(string) default = [] } @@ -100,10 +99,10 @@ variable "node_drainer_lambda_role_arn" { type = string } -variable "allowed_access_cidr_blocks" { - description = "List of CIDR blocks which are allowed ingress/egress access from/to the VPC" - type = list(string) -} +# variable "allowed_access_cidr_blocks" { +# description = "List of CIDR blocks which are allowed ingress/egress access from/to the VPC" +# type = list(string) +# } variable "cognito_domain_name" { description = "Cognito Domain Name" @@ -119,3 +118,9 @@ variable "cognito_userpool_id" { description = "Cognito User Pool ID" type = string } + +variable "eks_node_volume_size" { + description = "Size in GB for EKS Worker Nodes" + type = number + default = 50 +} diff --git a/deployment/grid/terraform/control_plane/s3.tf b/deployment/grid/terraform/control_plane/s3.tf index e4b585c2..830140b4 100644 --- a/deployment/grid/terraform/control_plane/s3.tf +++ b/deployment/grid/terraform/control_plane/s3.tf @@ -98,6 +98,6 @@ module "htc_data_bucket" { } tags = { - Tag = "${var.suffix}" + Tag = var.suffix } } diff --git a/deployment/grid/terraform/control_plane/variables.tf b/deployment/grid/terraform/control_plane/variables.tf index 7f4eb368..770faabd 100644 --- a/deployment/grid/terraform/control_plane/variables.tf +++ b/deployment/grid/terraform/control_plane/variables.tf @@ -4,245 +4,294 @@ variable "region" { description = "AWS region" + type = string } variable "aws_htc_ecr" { description = "URL of Amazon ECR image repostiories" + type = string } variable "lambda_runtime" { description = "Python version" + type = string default = "python3.7" } variable "ddb_state_table" { description = "HTC DynamoDB table name" + type = string } variable "dynamodb_autoscaling_enabled" { description = "Switches autoscaling for the dynamodb table" + type = bool } variable "dynamodb_billing_mode" { description = "Sets billing mode [PROVISIONED] or [PAY_PER_REQUEST]" + type = string } variable "sqs_queue" { description = "HTC SQS queue name" + type = string } variable "sqs_dlq" { description = "HTC SQS queue dlq name" + type = string } variable "s3_bucket" { description = "S3 bucket name" + type = string } variable "grid_storage_service" { description = "Configuration string for internal results storage system" + type = string } variable "task_queue_service" { description = "Configuration string for the type of queuing service to use" + type = string } variable "task_queue_config" { description = "Dictionary configuration of the tasks queue" + type = string } variable "task_input_passed_via_external_storage" { description = "Indicator for passing the args through stdin" + type = number } variable "lambda_name_ttl_checker" { description = "Lambda name for ttl checker" + type = string } variable "lambda_name_submit_tasks" { description = "Lambda name for submit task" + type = string } variable "lambda_name_cancel_tasks" { description = "Lambda name for cancel tasks" + type = string } variable "lambda_name_get_results" { description = "Lambda name for get result task" + type = string } variable "lambda_name_scaling_metrics" { description = "Lambda function name for scaling_metrics" + type = string } variable "lambda_name_node_drainer" { description = "Lambda function name for node_drainer" + type = string } variable "metrics_are_enabled" { description = "If set to True(1) then metrics will be accumulated and delivered downstream for visualisation" + type = bool } variable "metrics_submit_tasks_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string } variable "metrics_get_results_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string } variable "metrics_cancel_tasks_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string } variable "metrics_ttl_checker_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string } -variable "agent_use_congestion_control" { - description = "Use Congestion Control protocol at pods to avoid overloading DDB" -} +# variable "agent_use_congestion_control" { +# description = "Use Congestion Control protocol at pods to avoid overloading DDB" +# type = bool +# } variable "error_log_group" { description = "Log group for errors" + type = string } variable "error_logging_stream" { description = "Log stream for errors" + type = string } variable "dynamodb_table_write_capacity" { description = "write capacity for the status table" + type = number } variable "dynamodb_table_read_capacity" { description = "read capacity for the status table" + type = number } variable "dynamodb_gsi_index_table_write_capacity" { description = "write capacity for the status table (gsi index)" + type = number } variable "dynamodb_gsi_index_table_read_capacity" { description = "read capacity for the status table (gsi index)" + type = number } variable "dynamodb_gsi_ttl_table_write_capacity" { description = "write capacity for the status table(gsi ttl)" + type = number } variable "dynamodb_gsi_ttl_table_read_capacity" { description = "read capacity for the status table (gsi ttl)" + type = number } -variable "dynamodb_gsi_parent_table_write_capacity" { - description = "write capacity for the status table (gsi parent)" -} +# variable "dynamodb_gsi_parent_table_write_capacity" { +# description = "write capacity for the status table (gsi parent)" +# type = number +# } -variable "dynamodb_gsi_parent_table_read_capacity" { - description = "read capacity for the status table (gsi parent)" -} +# variable "dynamodb_gsi_parent_table_read_capacity" { +# description = "read capacity for the status table (gsi parent)" +# type = number +# } variable "suffix" { description = "suffix for generating unique name for AWS resource" + type = string } variable "vpc_private_subnet_ids" { description = "Private subnet IDs" + type = list(string) } -variable "vpc_public_subnet_ids" { - description = "Public subnet IDs" -} +# variable "vpc_public_subnet_ids" { +# description = "Public subnet IDs" +# type = string +# } variable "vpc_default_security_group_id" { description = "Default SG ID" + type = string } variable "vpc_id" { description = "Default VPC ID" + type = string } variable "vpc_cidr" { description = "Default VPC CIDR" + type = string } variable "nlb_influxdb" { description = "network load balancer url in front of influxdb" + type = string default = "" } variable "cluster_name" { description = "Name of EKS cluster in AWS" + type = string } variable "api_gateway_version" { description = "version deployed by API Gateway" + type = string } variable "state_table_service" { description = "State Table service type" + type = string } variable "state_table_config" { description = "Status Table configuration" + type = string } variable "priorities" { + type = map(number) default = { "__0" = 0 } } -variable "kms_deletion_window" { - description = "Number of days after which KMS key will be permanently deleted" - default = 7 -} - variable "kms_key_admin_roles" { description = "List of roles to assign KMS Key Administrator permissions" + type = list(string) default = [] } -# Lambda Node Drainer - -# variable "dimension_value_metrics" { -# default = "[{DimensionName=cluster_name,DimensionValue=htc-aws}, {DimensionName=env,DimensionValue=dev}]" -# description = "Dimensions name/value for the CloudWatch metrics" -# } +variable "kms_deletion_window" { + description = "Number of days after which KMS key will be permanently deleted" + type = number + default = 7 +} variable "namespace_metrics" { description = "NameSpace for metrics" + type = string } variable "tasks_queue_name" { description = "HTC queue name" + type = string } variable "dimension_name_metrics" { description = "Dimensions name/value for the CloudWatch metrics" + type = string } variable "period_metrics" { description = "Period for metrics in minutes" + type = number } variable "metric_name" { description = "Metrics name" + type = string } variable "metrics_event_rule_time" { description = "Fires event rule to put metrics" + type = string } variable "graceful_termination_delay" { description = "graceful termination delay for scaled in action" + type = number } -variable "aws_xray_daemon_version" { - description = "version for the XRay daemon" - type = string -} +# variable "aws_xray_daemon_version" { +# description = "version for the XRay daemon" +# type = string +# } variable "eks_managed_node_groups" { description = "Map of names and ARNs of EKS Managed Node Group ASGs" diff --git a/deployment/grid/terraform/htc-agent/variables.tf b/deployment/grid/terraform/htc-agent/variables.tf index 028f77ea..c495d9b6 100644 --- a/deployment/grid/terraform/htc-agent/variables.tf +++ b/deployment/grid/terraform/htc-agent/variables.tf @@ -25,8 +25,8 @@ variable "agent_image_tag" { } variable "htc_agent_permissions_policy_arn" { - type = string description = "IAM Policy ARN for HTC Agent IRSA Permissions" + type = string } variable "suffix" { @@ -51,8 +51,8 @@ variable "test_agent_image_tag" { } variable "lambda_image_repository" { - type = string description = "repository to the lambda image" + type = string } variable "lambda_image_tag" { @@ -89,8 +89,8 @@ variable "agent_max_cpu" { } variable "lambda_min_cpu" { - type = number description = "Minimum CPU asisgned to the lambda (in milli)" + type = number } variable "lambda_max_cpu" { @@ -118,7 +118,7 @@ variable "lambda_min_memory" { variable "lambda_max_memory" { description = "Maximum memory asisgned to the agent (in MiB)" - type = string + type = number default = 100 } @@ -163,10 +163,10 @@ variable "lambda_configuration_s3_source" { type = string } -variable "lambda_configuration_s3_source_kms_key_arn" { - description = "The CMK KMS Key ARN for the Lambda Layer S3 bucket source" - type = string -} +# variable "lambda_configuration_s3_source_kms_key_arn" { +# description = "The CMK KMS Key ARN for the Lambda Layer S3 bucket source" +# type = string +# } variable "region" { description = "The region of the Lambda Layer" @@ -174,17 +174,17 @@ variable "region" { default = "eu-west-1" } -variable "lambda_configuration_layer_name" { - description = "The name of the lambda layer storing the source code" - type = string - default = "mock_layer" -} +# variable "lambda_configuration_layer_name" { +# description = "The name of the lambda layer storing the source code" +# type = string +# default = "mock_layer" +# } -variable "lambda_configuration_layer_version" { - description = "The version of the lambda layer storing the source code" - type = number - default = 1 -} +# variable "lambda_configuration_layer_version" { +# description = "The version of the lambda layer storing the source code" +# type = number +# default = 1 +# } variable "lambda_configuration_function_name" { description = "The name of the lambda function to be executed" @@ -204,33 +204,41 @@ variable "lambda_handler_function_name" { variable "namespace_metrics" { description = "NameSpace for metrics" + type = string } variable "dimension_name_metrics" { description = "Dimensions name for the CloudWatch metrics" + type = string } variable "dimension_value_metrics" { description = "Dimensions name for the CloudWatch metrics" + type = string } -variable "average_period" { - description = "Average period in second used by the HPA to compute the current load on the system" - default = 30 -} +# variable "average_period" { +# description = "Average period in second used by the HPA to compute the current load on the system" +# type = number +# default = 30 +# } variable "metric_name" { description = "Metrics name" + type = string } variable "max_htc_agents" { description = "maximum number of agents that can run on EKS" + type = number } variable "min_htc_agents" { description = "minimum number of agents that can run on EKS" + type = number } variable "htc_agent_target_value" { description = "target value for the load on the system" + type = number } diff --git a/deployment/grid/terraform/main.tf b/deployment/grid/terraform/main.tf index 890a4556..1f8b18ad 100644 --- a/deployment/grid/terraform/main.tf +++ b/deployment/grid/terraform/main.tf @@ -6,7 +6,6 @@ locals { account_id = data.aws_caller_identity.current.account_id dns_suffix = data.aws_partition.current.dns_suffix - partition = data.aws_partition.current.partition aws_htc_ecr = var.aws_htc_ecr != "" ? var.aws_htc_ecr : "${local.account_id}.dkr.ecr.${var.region}.${local.dns_suffix}" project_name = var.project_name != "" ? var.project_name : random_string.random_resources.result grafana_admin_password = var.grafana_admin_password != "" ? var.grafana_admin_password : random_password.password.result @@ -22,7 +21,6 @@ locals { lambda_name_scaling_metrics = "${var.lambda_name_scaling_metrics}-${local.project_name}" lambda_name_node_drainer = "${var.lambda_name_node_drainer}-${local.project_name}" metrics_name = "${var.metrics_name}-${local.project_name}" - config_name = "${var.config_name}-${local.project_name}" s3_bucket = "${var.s3_bucket}-${local.project_name}" error_log_group = "${var.error_log_group}-${local.project_name}" error_logging_stream = "${var.error_logging_stream}-${local.project_name}" @@ -118,6 +116,8 @@ module "vpc" { public_subnets = var.vpc_cidr_block_public enable_private_subnet = var.enable_private_subnet allowed_access_cidr_blocks = local.allowed_access_cidr_blocks + kms_key_admin_roles = var.kms_key_admin_roles + kms_deletion_window = var.kms_deletion_window } @@ -127,18 +127,15 @@ module "compute_plane" { vpc_id = module.vpc.vpc_id vpc_private_subnet_ids = module.vpc.private_subnet_ids vpc_public_subnet_ids = module.vpc.public_subnet_ids - vpc_default_security_group_id = module.vpc.default_security_group_id - vpc_cidr = module.vpc.vpc_cidr_block - allowed_access_cidr_blocks = local.allowed_access_cidr_blocks cluster_name = local.cluster_name kubernetes_version = var.kubernetes_version k8s_ca_version = var.k8s_ca_version k8s_keda_version = var.k8s_keda_version aws_htc_ecr = local.aws_htc_ecr - htc_agent_namespace = var.htc_agent_namespace suffix = local.project_name region = var.region eks_worker_groups = var.eks_worker_groups + eks_node_volume_size = var.eks_node_volume_size input_role = var.input_role enable_private_subnet = var.enable_private_subnet grafana_admin_password = local.grafana_admin_password @@ -147,15 +144,18 @@ module "compute_plane" { cognito_domain_name = module.control_plane.cognito_domain_name cognito_userpool_arn = module.control_plane.cognito_userpool_arn cognito_userpool_id = module.control_plane.cognito_userpool_id + kms_key_admin_roles = var.kms_key_admin_roles + kms_deletion_window = var.kms_deletion_window + # allowed_access_cidr_blocks = local.allowed_access_cidr_blocks } module "control_plane" { source = "./control_plane" - vpc_id = module.vpc.vpc_id - vpc_private_subnet_ids = module.vpc.private_subnet_ids - vpc_public_subnet_ids = module.vpc.public_subnet_ids + vpc_id = module.vpc.vpc_id + vpc_private_subnet_ids = module.vpc.private_subnet_ids + # vpc_public_subnet_ids = module.vpc.public_subnet_ids vpc_default_security_group_id = module.vpc.default_security_group_id vpc_cidr = module.vpc.vpc_cidr_block allowed_access_cidr_blocks = local.allowed_access_cidr_blocks @@ -192,9 +192,6 @@ module "control_plane" { dynamodb_gsi_index_table_read_capacity = var.dynamodb_default_read_capacity dynamodb_gsi_ttl_table_write_capacity = var.dynamodb_default_write_capacity dynamodb_gsi_ttl_table_read_capacity = var.dynamodb_default_read_capacity - dynamodb_gsi_parent_table_write_capacity = var.dynamodb_default_write_capacity - dynamodb_gsi_parent_table_read_capacity = var.dynamodb_default_read_capacity - agent_use_congestion_control = var.agent_use_congestion_control nlb_influxdb = module.compute_plane.nlb_influxdb cluster_name = local.cluster_name api_gateway_version = var.api_gateway_version @@ -208,9 +205,10 @@ module "control_plane" { metric_name = local.metrics_name metrics_event_rule_time = var.metrics_event_rule_time graceful_termination_delay = var.graceful_termination_delay - aws_xray_daemon_version = var.aws_xray_daemon_version lambda_configuration_s3_source = try(var.agent_configuration.lambda.s3_source, local.default_agent_configuration.lambda.s3_source) lambda_configuration_s3_source_kms_key_arn = try(var.agent_configuration.lambda.s3_source_kms_key_arn, local.default_agent_configuration.lambda.s3_source_kms_key_arn) + kms_key_admin_roles = var.kms_key_admin_roles + kms_deletion_window = var.kms_deletion_window depends_on = [ @@ -220,47 +218,45 @@ module "control_plane" { module "htc_agent" { - source = "./htc-agent" - region = var.region - agent_chart_url = lookup(var.agent_configuration, "agent_chart_url", local.default_agent_configuration.agent_chart_url) - termination_grace_period = var.graceful_termination_delay - suffix = local.project_name - agent_name = var.htc_agent_name - htc_agent_permissions_policy_arn = module.control_plane.htc_agent_permissions_policy_arn - eks_oidc_provider_arn = module.compute_plane.oidc_provider_arn - max_htc_agents = var.max_htc_agents - min_htc_agents = var.min_htc_agents - htc_agent_target_value = var.htc_agent_target_value - average_period = var.average_period - namespace_metrics = var.namespace_metrics - dimension_name_metrics = var.dimension_name_metrics - dimension_value_metrics = local.cluster_name - metric_name = local.metrics_name - agent_image_tag = try(var.agent_configuration.agent.tag, local.default_agent_configuration.agent.tag) - agent_image_repository = try(var.agent_configuration.agent.image, local.default_agent_configuration.agent.image) - agent_pull_policy = try(var.agent_configuration.agent.pullPolicy, local.default_agent_configuration.agent.pullPolicy) - agent_min_cpu = try(var.agent_configuration.agent.minCPU, local.default_agent_configuration.agent.minCPU) - agent_max_cpu = try(var.agent_configuration.agent.maxCPU, local.default_agent_configuration.agent.maxCPU) - agent_min_memory = try(var.agent_configuration.agent.minMemory, local.default_agent_configuration.agent.minMemory) - agent_max_memory = try(var.agent_configuration.agent.maxMemory, local.default_agent_configuration.agent.maxMemory) - get_layer_image_tag = try(var.agent_configuration.get_layer.tag, local.default_agent_configuration.get_layer.tag) - get_layer_image_repository = try(var.agent_configuration.get_layer.image, local.default_agent_configuration.get_layer.image) - get_layer_pull_policy = try(var.agent_configuration.get_layer.pullPolicy, local.default_agent_configuration.get_layer.pullPolicy) - lambda_image_tag = try(var.agent_configuration.lambda.runtime, local.default_agent_configuration.lambda.runtime) - lambda_image_repository = try(var.agent_configuration.lambda.image, local.default_agent_configuration.lambda.image) - lambda_pull_policy = try(var.agent_configuration.lambda.pullPolicy, local.default_agent_configuration.lambda.pullPolicy) - lambda_min_cpu = try(var.agent_configuration.lambda.minCPU, local.default_agent_configuration.lambda.minCPU) - lambda_max_cpu = try(var.agent_configuration.lambda.maxCPU, local.default_agent_configuration.lambda.maxCPU) - lambda_min_memory = try(var.agent_configuration.lambda.minMemory, local.default_agent_configuration.lambda.minMemory) - lambda_max_memory = try(var.agent_configuration.lambda.maxMemory, local.default_agent_configuration.lambda.maxMemory) - lambda_handler_file_name = try(var.agent_configuration.lambda.lambda_handler_file_name, local.default_agent_configuration.lambda.lambda_handler_file_name) - lambda_handler_function_name = try(var.agent_configuration.lambda.lambda_handler_function_name, local.default_agent_configuration.lambda.lambda_handler_function_name) - lambda_configuration_function_name = try(var.agent_configuration.lambda.function_name, local.default_agent_configuration.lambda.function_name) - lambda_configuration_s3_source = try(var.agent_configuration.lambda.s3_source, local.default_agent_configuration.lambda.s3_source) - lambda_configuration_s3_source_kms_key_arn = try(var.agent_configuration.lambda.s3_source_kms_key_arn, local.default_agent_configuration.lambda.s3_source_kms_key_arn) - test_agent_image_tag = try(var.agent_configuration.test.tag, local.default_agent_configuration.test.tag) - test_pull_policy = try(var.agent_configuration.test.pullPolicy, local.default_agent_configuration.test.pullPolicy) - test_agent_image_repository = try(var.agent_configuration.test.image, local.default_agent_configuration.test.image) + source = "./htc-agent" + region = var.region + agent_chart_url = lookup(var.agent_configuration, "agent_chart_url", local.default_agent_configuration.agent_chart_url) + termination_grace_period = var.graceful_termination_delay + suffix = local.project_name + agent_name = var.htc_agent_name + htc_agent_permissions_policy_arn = module.control_plane.htc_agent_permissions_policy_arn + eks_oidc_provider_arn = module.compute_plane.oidc_provider_arn + max_htc_agents = var.max_htc_agents + min_htc_agents = var.min_htc_agents + htc_agent_target_value = var.htc_agent_target_value + namespace_metrics = var.namespace_metrics + dimension_name_metrics = var.dimension_name_metrics + dimension_value_metrics = local.cluster_name + metric_name = local.metrics_name + agent_image_tag = try(var.agent_configuration.agent.tag, local.default_agent_configuration.agent.tag) + agent_image_repository = try(var.agent_configuration.agent.image, local.default_agent_configuration.agent.image) + agent_pull_policy = try(var.agent_configuration.agent.pullPolicy, local.default_agent_configuration.agent.pullPolicy) + agent_min_cpu = try(var.agent_configuration.agent.minCPU, local.default_agent_configuration.agent.minCPU) + agent_max_cpu = try(var.agent_configuration.agent.maxCPU, local.default_agent_configuration.agent.maxCPU) + agent_min_memory = try(var.agent_configuration.agent.minMemory, local.default_agent_configuration.agent.minMemory) + agent_max_memory = try(var.agent_configuration.agent.maxMemory, local.default_agent_configuration.agent.maxMemory) + get_layer_image_tag = try(var.agent_configuration.get_layer.tag, local.default_agent_configuration.get_layer.tag) + get_layer_image_repository = try(var.agent_configuration.get_layer.image, local.default_agent_configuration.get_layer.image) + get_layer_pull_policy = try(var.agent_configuration.get_layer.pullPolicy, local.default_agent_configuration.get_layer.pullPolicy) + lambda_image_tag = try(var.agent_configuration.lambda.runtime, local.default_agent_configuration.lambda.runtime) + lambda_image_repository = try(var.agent_configuration.lambda.image, local.default_agent_configuration.lambda.image) + lambda_pull_policy = try(var.agent_configuration.lambda.pullPolicy, local.default_agent_configuration.lambda.pullPolicy) + lambda_min_cpu = try(var.agent_configuration.lambda.minCPU, local.default_agent_configuration.lambda.minCPU) + lambda_max_cpu = try(var.agent_configuration.lambda.maxCPU, local.default_agent_configuration.lambda.maxCPU) + lambda_min_memory = try(var.agent_configuration.lambda.minMemory, local.default_agent_configuration.lambda.minMemory) + lambda_max_memory = try(var.agent_configuration.lambda.maxMemory, local.default_agent_configuration.lambda.maxMemory) + lambda_handler_file_name = try(var.agent_configuration.lambda.lambda_handler_file_name, local.default_agent_configuration.lambda.lambda_handler_file_name) + lambda_handler_function_name = try(var.agent_configuration.lambda.lambda_handler_function_name, local.default_agent_configuration.lambda.lambda_handler_function_name) + lambda_configuration_function_name = try(var.agent_configuration.lambda.function_name, local.default_agent_configuration.lambda.function_name) + lambda_configuration_s3_source = try(var.agent_configuration.lambda.s3_source, local.default_agent_configuration.lambda.s3_source) + test_agent_image_tag = try(var.agent_configuration.test.tag, local.default_agent_configuration.test.tag) + test_pull_policy = try(var.agent_configuration.test.pullPolicy, local.default_agent_configuration.test.pullPolicy) + test_agent_image_repository = try(var.agent_configuration.test.image, local.default_agent_configuration.test.image) depends_on = [ module.vpc, diff --git a/deployment/grid/terraform/providers.tf b/deployment/grid/terraform/providers.tf index 178e391f..6d647751 100644 --- a/deployment/grid/terraform/providers.tf +++ b/deployment/grid/terraform/providers.tf @@ -21,7 +21,7 @@ terraform { helm = { source = "hashicorp/helm" - version = ">= 2.0" + version = "~> 2.0" } tls = { @@ -33,7 +33,29 @@ terraform { source = "hashicorp/archive" version = "~> 2.0" } + + local = { + source = "hashicorp/local" + version = "~> 2.0" + } + + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + + time = { + source = "hashicorp/time" + version = "~> 0.0" + } + + null = { + source = "hashicorp/null" + version = "~> 3.0" + } } + + required_version = "~> 1.0" } diff --git a/deployment/grid/terraform/variables.tf b/deployment/grid/terraform/variables.tf index 727fed88..8a311c1b 100644 --- a/deployment/grid/terraform/variables.tf +++ b/deployment/grid/terraform/variables.tf @@ -4,6 +4,7 @@ variable "region" { description = "AWS region" + type = string default = "eu-west-1" } @@ -19,61 +20,43 @@ variable "input_role" { variable "cluster_name" { description = "Name of EKS cluster in AWS" - default = "htc" -} - -variable "config_name" { - description = "Default path for the SSM parameter storing the configuration of the grid" + type = string default = "htc" } variable "lambda_runtime" { description = "Lambda runtine" + type = string default = "python3.11" } variable "kubernetes_version" { description = "Name of EKS cluster in AWS" + type = string default = "1.25" } variable "k8s_ca_version" { description = "Cluster autoscaler version" + type = string default = "v1.21.0" } variable "k8s_keda_version" { description = "Keda version" + type = string default = "2.11.2" } variable "aws_htc_ecr" { description = "URL of Amazon ECR image repostiories" + type = string default = "" } -variable "cwa_version" { - description = "Cloud Watch Adapter for kubernetes version" - default = "v0.8.0" -} - -variable "aws_node_termination_handler" { - description = "version of the deployment managing node termination" - default = "v1.10.0" -} - -variable "cw_agent_version" { - description = "CloudWatch Agent version" - default = "1.247347.5b250583" -} - -variable "fluentbit_version" { - description = "Fluentbit version" - default = "2.10.0" -} - variable "ddb_state_table" { description = "htc DinamoDB state table name" + type = string default = "htc_tasks_state_table" } @@ -85,184 +68,208 @@ variable "dynamodb_autoscaling_enabled" { variable "dynamodb_billing_mode" { description = "Sets billing mode [PROVISIONED] or [PAY_PER_REQUEST]" + type = string default = "PROVISIONED" } variable "task_queue_service" { description = "Configuration string for the type of queuing service to be used" + type = string default = "SQS" } variable "task_queue_config" { description = "dictionary queue config" + type = string default = "{'priorities':3}" } variable "sqs_queue" { description = "htc SQS queue name" + type = string default = "htc_task_queue" } variable "sqs_dlq" { description = "htc SQS queue dlq name" + type = string default = "htc_task_queue_dlq" } variable "s3_bucket" { description = "S3 bucket name" + type = string default = "htc-data-bucket" } variable "grid_storage_service" { description = "Configuration string for internal results storage system" + type = string default = "S3 htc-data-bucket-1" } variable "state_table_service" { description = "State Table service type" + type = string default = "DynamoDB" } variable "state_table_config" { description = "Status Table configuration" + type = string default = "{'retries':{'max_attempts':10, 'mode':'adaptive'}}" } variable "lambda_name_ttl_checker" { description = "Lambda name for ttl checker" + type = string default = "ttl_checker" } variable "lambda_name_submit_tasks" { description = "Lambda name for submit task" + type = string default = "submit_task" } variable "lambda_name_get_results" { description = "Lambda name for get result task" + type = string default = "get_results" } variable "lambda_name_cancel_tasks" { description = "Lambda name for cancel tasks" + type = string default = "cancel_tasks" } -variable "lambda_alb_name" { - description = "Name of the load balancer for Lambdas" - default = "lambda-frontend" -} - variable "metrics_are_enabled" { description = "If set to True(1) then metrics will be accumulated and delivered downstream for visualisation" + type = string default = "1" } variable "metrics_submit_tasks_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string default = "influxdb 8086 measurementsdb submit_tasks" } variable "metrics_cancel_tasks_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string default = "influxdb 8086 measurementsdb cancel_tasks" } variable "metrics_get_results_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string default = "influxdb 8086 measurementsdb get_results" } variable "metrics_ttl_checker_lambda_connection_string" { description = "The type and the connection string for the downstream" + type = string default = "influxdb 8086 measurementsdb ttl_checker" } variable "agent_use_congestion_control" { description = "Use Congestion Control protocol at pods to avoid overloading DDB" + type = string default = "0" } variable "error_log_group" { description = "Log group for errors" + type = string default = "grid_errors" } variable "error_logging_stream" { description = "Log stream for errors" + type = string default = "lambda_errors" } variable "dynamodb_default_read_capacity" { description = "default read capacity for all tables" + type = number default = 100 } variable "dynamodb_default_write_capacity" { description = "default write capacity for all tables" + type = number default = 100 } variable "namespace_metrics" { description = "NameSpace for metrics" + type = string default = "CloudGrid/HTC/Scaling/" } variable "dimension_name_metrics" { description = "Dimensions name/value for the CloudWatch metrics" + type = string default = "cluster_name" } variable "htc_path_logs" { description = "Path to fluentD to search de logs application" + type = string default = "logs/" } variable "lambda_name_scaling_metrics" { description = "Lambda function name for metrics" + type = string default = "scaling_metrics" } variable "lambda_name_node_drainer" { description = "Lambda function name for metrics" + type = string default = "node_drainer" } variable "period_metrics" { description = "Period for metrics in minutes" + type = string default = "1" } variable "metrics_name" { description = "Metrics name" + type = string default = "pending_tasks_ddb" } -variable "average_period" { - description = "Average period in second used by the HPA to compute the current load on the system" - default = 30 -} +# variable "average_period" { +# description = "Average period in second used by the HPA to compute the current load on the system" +# type = number +# default = 30 +# } variable "metrics_event_rule_time" { description = "Fires event rule to put metrics" + type = string default = "rate(1 minute)" } variable "htc_agent_name" { description = "name of the htc agent to scale out/in" + type = string default = "htc-agent" } variable "htc_agent_namespace" { description = "kubernetes namespace for the deployment of the agent" + type = string default = "default" } -variable "suffix" { - description = "suffix for generating unique name for AWS resource" - default = "" -} - variable "eks_worker_groups" { type = any default = [] @@ -270,71 +277,85 @@ variable "eks_worker_groups" { variable "max_htc_agents" { description = "maximum number of agents that can run on EKS" + type = number default = 100 } variable "min_htc_agents" { description = "minimum number of agents that can run on EKS" + type = number default = 1 } variable "htc_agent_target_value" { description = "target value for the load on the system" + type = number default = 2 } variable "graceful_termination_delay" { description = "graceful termination delay in second for scaled in action" + type = number default = 30 } variable "empty_task_queue_backoff_timeout_sec" { description = "agent backoff timeout in second" + type = number default = 0.5 } variable "work_proc_status_pull_interval_sec" { description = "agent pulling interval" + type = number default = 0.5 } variable "task_ttl_expiration_offset_sec" { description = "agent TTL for task to time out in second" + type = number default = 30 } variable "task_ttl_refresh_interval_sec" { description = "reset interval for agent TTL" + type = number default = 5.0 } variable "dynamodb_results_pull_interval_sec" { description = "agent pulling interval for pending task in DDB" + type = number default = 0.5 } variable "agent_task_visibility_timeout_sec" { description = "default visibility timeout for SQS messages" + type = number default = 3600 } variable "task_input_passed_via_external_storage" { description = "Indicator for passing the args through stdin" + type = number default = 1 } variable "metrics_pre_agent_connection_string" { description = "pre agent connection string for monitoring" + type = string default = "influxdb 8086 measurementsdb agent_pre" } variable "metrics_post_agent_connection_string" { description = "post agent connection string for monitoring" + type = string default = "influxdb 8086 measurementsdb agent_post" } variable "agent_configuration_filename" { description = "filename where agent configuration (in json) is going to be stored" + type = string default = "agent_config.json" } @@ -350,11 +371,11 @@ variable "enable_xray" { default = 0 } -variable "aws_xray_daemon_version" { - description = "version for the XRay daemon" - type = string - default = "latest" -} +# variable "aws_xray_daemon_version" { +# description = "version for the XRay daemon" +# type = string +# default = "latest" +# } variable "enable_private_subnet" { description = "enable private subnet" @@ -377,13 +398,13 @@ variable "grafana_admin_password" { variable "vpc_cidr_block_public" { description = "netmask for the cidr for each public subnet" - type = string + type = number default = 24 } variable "vpc_cidr_block_private" { description = "netmask for the cidr for each private subnet" - type = string + type = number default = 24 } @@ -395,6 +416,7 @@ variable "project_name" { variable "kms_deletion_window" { description = "Number of days after which KMS key will be permanently deleted" + type = number default = 7 } @@ -409,3 +431,9 @@ variable "allowed_access_cidr_blocks" { type = list(string) default = [] } + +variable "eks_node_volume_size" { + description = "Size in GB for EKS Worker Nodes" + type = number + default = 50 +} diff --git a/deployment/grid/terraform/vpc/variables.tf b/deployment/grid/terraform/vpc/variables.tf index 5db09d2a..7c1acfcd 100644 --- a/deployment/grid/terraform/vpc/variables.tf +++ b/deployment/grid/terraform/vpc/variables.tf @@ -4,11 +4,13 @@ variable "region" { description = "AWS region" + type = string default = "eu-west-1" } variable "cluster_name" { description = "Name of EKS cluster in AWS" + type = string default = "htc" } @@ -35,11 +37,13 @@ variable "enable_private_subnet" { variable "kms_deletion_window" { description = "Number of days after which KMS key will be permanently deleted" + type = number default = 7 } variable "kms_key_admin_roles" { description = "List of roles to assign KMS Key Administrator permissions" + type = list(string) default = [] } diff --git a/deployment/image_repository/terraform/providers.tf b/deployment/image_repository/terraform/providers.tf index c7e0eb5d..34d2b0d1 100644 --- a/deployment/image_repository/terraform/providers.tf +++ b/deployment/image_repository/terraform/providers.tf @@ -13,11 +13,14 @@ terraform { source = "hashicorp/aws" version = "~> 5.0" } + null = { source = "hashicorp/null" version = "~> 3.0" } } + + required_version = ">= 1.0" } diff --git a/deployment/image_repository/terraform/variables.tf b/deployment/image_repository/terraform/variables.tf index 5f6e204e..ddd9c5b4 100644 --- a/deployment/image_repository/terraform/variables.tf +++ b/deployment/image_repository/terraform/variables.tf @@ -4,11 +4,13 @@ variable "region" { description = "the region where the ECR repository will be created" + type = string default = "eu-west-1" } variable "aws_htc_ecr" { description = "URL of Amazon ECR image repostiories" + type = string default = "" } @@ -36,10 +38,12 @@ variable "rebuild_runtimes" { variable "kms_deletion_window" { description = "Number of days after which KMS key will be permanently deleted" + type = number default = 7 } variable "kms_key_admin_roles" { description = "List of roles to assign KMS Key Administrator permissions" + type = list(string) default = [] }