Wednesday, 25 June 2025

Step Function - Terraform for Glue Job

 

# main.tf


# AWS provider; the target region is supplied through var.region.
provider "aws" {
  region = var.region
}


# Service role assumed by AWS Glue; shared by the crawler and the ETL job in this file.
resource "aws_iam_role" "glue_role" {
  name = "glue-service-role"

  # Trust policy: only the Glue service principal may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid       = ""
        Effect    = "Allow"
        Action    = "sts:AssumeRole"
        Principal = { Service = "glue.amazonaws.com" }
      },
    ]
  })
}


# Attach the AWS-managed baseline Glue service permissions to the Glue role.
resource "aws_iam_role_policy_attachment" "glue_policy" {
  role       = aws_iam_role.glue_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}

# The managed AWSGlueServiceRole policy only grants S3 object access on
# buckets named "aws-glue-*". The crawler reads s3://<bucket_name>/raw/ and the
# job loads its script from s3://<bucket_name>/scripts/, so the role needs
# explicit access to the project bucket or both fail with AccessDenied.
# NOTE(review): s3:PutObject assumes the transform job writes its output to
# this same bucket; drop it if the job writes elsewhere.
resource "aws_iam_role_policy" "glue_s3_access" {
  name = "glue-s3-access"
  role = aws_iam_role.glue_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        # Bucket-level permission needed to enumerate objects under a prefix.
        Effect   = "Allow"
        Action   = ["s3:ListBucket"]
        Resource = "arn:aws:s3:::${var.bucket_name}"
      },
      {
        # Object-level read/write within the project bucket.
        Effect   = "Allow"
        Action   = ["s3:GetObject", "s3:PutObject"]
        Resource = "arn:aws:s3:::${var.bucket_name}/*"
      },
    ]
  })
}


# Glue Data Catalog database that the crawler writes its tables into.
resource "aws_glue_catalog_database" "glue_db" {
  name = var.glue_db
}


# Crawler that scans the raw/ prefix of the project bucket and registers the
# discovered tables in the catalog database, each prefixed with "raw_".
resource "aws_glue_crawler" "crawler" {
  name          = var.crawler_name
  database_name = aws_glue_catalog_database.glue_db.name
  role          = aws_iam_role.glue_role.arn
  table_prefix  = "raw_"

  s3_target {
    path = "s3://${var.bucket_name}/raw/"
  }
}


# Glue Spark ETL job that runs scripts/transform_script.py from the project bucket.
resource "aws_glue_job" "transform" {
  name         = var.glue_job_name
  role_arn     = aws_iam_role.glue_role.arn
  glue_version = "4.0"

  # Glue 2.0+ jobs are sized by worker type and count; max_capacity is the
  # legacy DPU setting. Two G.1X workers (1 DPU each) keep the same 2-DPU
  # capacity the job had before.
  worker_type       = "G.1X"
  number_of_workers = 2

  command {
    name            = "glueetl"
    script_location = "s3://${var.bucket_name}/scripts/transform_script.py"
    python_version  = "3"
  }

  # Only one run at a time; a second StartJobRun while one is active is rejected.
  execution_property {
    max_concurrent_runs = 1
  }
}


# Glue-native schedule: starts the transform job once a day at 00:00 UTC.
# NOTE(review): the EventBridge rule "daily-etl-schedule" in this file uses the
# same cron expression to start the Step Functions workflow, which also runs
# this job. With max_concurrent_runs = 1 on the job, the two schedules can
# collide, and this trigger runs the job without waiting for the crawler.
# Confirm whether both schedules are intended.
resource "aws_glue_trigger" "daily_trigger" {
  name    = "daily-trigger"
  type    = "SCHEDULED"
  enabled = true

  # Six-field AWS cron: minute 0, hour 0, every day (UTC).
  schedule = "cron(0 0 * * ? *)"

  actions {
    job_name = aws_glue_job.transform.name
  }
}


# IAM role shared by the Step Functions state machine, the notification
# Lambda, and the EventBridge target that starts the state machine.
resource "aws_iam_role" "step_function_role" {
  name = "step-function-glue-role"

  # Every service that is handed this role must be listed as a trusted
  # principal. Previously only Step Functions was trusted, so Lambda rejected
  # the role at function creation and EventBridge could not assume it to call
  # states:StartExecution.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = [
            "states.amazonaws.com", # state machine execution role
            "lambda.amazonaws.com", # notify-complete-lambda execution role
            "events.amazonaws.com", # EventBridge target role
          ]
        }
      },
    ]
  })
}


# Inline permissions for the workflow role: drive the Glue crawler and job,
# invoke the notification Lambda, and start state machine executions.
resource "aws_iam_role_policy" "step_function_policy" {
  name = "step-function-policy"
  role = aws_iam_role.step_function_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          # Crawler start + status polling (aws-sdk integrations).
          "glue:StartCrawler",
          "glue:GetCrawler",

          # glue:startJobRun.sync — Step Functions starts the run, polls it
          # until it finishes, and stops it if the execution is aborted, so
          # the Get*/BatchStop actions are required alongside StartJobRun.
          "glue:StartJobRun",
          "glue:GetJobRun",
          "glue:GetJobRuns",
          "glue:BatchStopJobRun",

          "lambda:InvokeFunction",
          "states:StartExecution",
        ]
        Resource = "*"
      },
    ]
  })
}


# Lambda function invoked as the final step of the ETL workflow.
# NOTE(review): this reuses the Step Functions role as the execution role;
# Lambda can only assume it if that role's trust policy allows
# lambda.amazonaws.com — confirm.
resource "aws_lambda_function" "notify_lambda" {
  function_name = "notify-complete-lambda"
  role          = aws_iam_role.step_function_role.arn
  runtime       = "python3.9"
  handler       = "index.lambda_handler"

  # Resolve the package relative to this module rather than the shell's
  # working directory, so plan/apply behave the same under -chdir or when
  # this configuration is consumed as a child module.
  filename         = "${path.module}/lambda_function_payload.zip"
  source_code_hash = filebase64sha256("${path.module}/lambda_function_payload.zip")
}


# State machine orchestrating the ETL run:
#   StartCrawler -> (wait 30s / poll until the crawler is READY)
#   -> RunTransformJob (waits for the Glue job run to finish) -> NotifyLambda.
resource "aws_sfn_state_machine" "glue_workflow" {
  name     = "glue-etl-workflow"
  role_arn = aws_iam_role.step_function_role.arn

  # The crawler and job are referenced through their resources (not the raw
  # variables) so Terraform creates them before the state machine and the
  # names cannot drift apart; the rendered values are the same.
  definition = jsonencode({
    Comment = "ETL with Glue and Lambda"
    StartAt = "StartCrawler"

    States = {
      # Kick off the crawler via the AWS SDK integration (returns immediately).
      StartCrawler = {
        Type     = "Task"
        Resource = "arn:aws:states:::aws-sdk:glue:startCrawler"
        Parameters = {
          Name = aws_glue_crawler.crawler.name
        }
        Next = "Wait30Seconds"
      }

      # Polling delay between crawler status checks.
      Wait30Seconds = {
        Type    = "Wait"
        Seconds = 30
        Next    = "GetCrawlerStatus"
      }

      # Fetch the crawler description; its State field is inspected next.
      GetCrawlerStatus = {
        Type     = "Task"
        Resource = "arn:aws:states:::aws-sdk:glue:getCrawler"
        Parameters = {
          Name = aws_glue_crawler.crawler.name
        }
        Next = "CheckCrawlerState"
      }

      # Proceed once the crawler reports READY; otherwise loop back and wait.
      CheckCrawlerState = {
        Type = "Choice"
        Choices = [
          {
            Variable     = "$.Crawler.State"
            StringEquals = "READY"
            Next         = "RunTransformJob"
          },
        ]
        Default = "Wait30Seconds"
      }

      # ".sync" makes the state wait for the Glue job run to complete.
      RunTransformJob = {
        Type     = "Task"
        Resource = "arn:aws:states:::glue:startJobRun.sync"
        Parameters = {
          JobName = aws_glue_job.transform.name
        }
        Next = "NotifyLambda"
      }

      # Report completion to the notification Lambda and end the execution.
      NotifyLambda = {
        Type     = "Task"
        Resource = "arn:aws:states:::lambda:invoke"
        Parameters = {
          FunctionName = aws_lambda_function.notify_lambda.arn
          Payload = {
            job    = aws_glue_job.transform.name
            status = "SUCCESS"
          }
        }
        End = true
      }
    }
  })
}


# EventBridge schedule that fires once a day at 00:00 UTC; its target (below)
# starts the ETL state machine.
resource "aws_cloudwatch_event_rule" "daily_etl_rule" {
  name = "daily-etl-schedule"

  # Six-field AWS cron: minute 0, hour 0, every day.
  schedule_expression = "cron(0 0 * * ? *)"
}


# Wires the daily rule to the state machine: when the rule fires, EventBridge
# assumes role_arn to start an execution.
# NOTE(review): the role passed here must be assumable by events.amazonaws.com
# and allow states:StartExecution — confirm against the role's trust policy.
resource "aws_cloudwatch_event_target" "etl_target" {
  target_id = "StepFunctionTrigger"
  rule      = aws_cloudwatch_event_rule.daily_etl_rule.name
  arn       = aws_sfn_state_machine.glue_workflow.arn
  role_arn  = aws_iam_role.step_function_role.arn
}


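# Prerequisites before running `terraform apply`:

# - Upload glue_job/transform_script.py to s3://${var.bucket_name}/scripts/

# - Place the Lambda deployment package, lambda_function_payload.zip, alongside main.tf (Terraform reads it from the local filesystem; it is not uploaded to S3)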


No comments:

Post a Comment