Wednesday, 25 June 2025

Terraform EMR + Step Snippet

 

# EMR cluster running Spark that is created with a single spark-submit step
# and is configured to shut down once it has no more steps to run.
resource "aws_emr_cluster" "example" {
  name          = "emr-transform"
  release_label = "emr-6.10.0"
  applications  = ["Spark"]
  service_role  = aws_iam_role.emr_service.arn
  log_uri       = "s3://my-bucket/emr-logs/"

  # aws_emr_cluster has no `auto_terminate` argument. Terminating the cluster
  # after the last step completes is expressed by turning off keep-alive
  # (the provider default is to keep the cluster running).
  keep_job_flow_alive_when_no_steps = false

  ec2_attributes {
    key_name         = "my-key"
    instance_profile = aws_iam_instance_profile.emr_profile.arn
  }

  # The flat master_instance_type / core_instance_type / core_instance_count
  # arguments were removed in AWS provider v3.0; instance sizing is declared
  # through the instance-group blocks instead.
  master_instance_group {
    instance_type = "m5.xlarge"
  }

  core_instance_group {
    instance_type  = "m5.xlarge"
    instance_count = 2
  }

  step {
    name = "Transform CSV to Parquet"

    # A failed step does not abort the job flow; the cluster proceeds (and,
    # with keep-alive off, terminates once no steps remain).
    action_on_failure = "CONTINUE"

    hadoop_jar_step {
      # command-runner.jar executes the argument list as a command on the
      # cluster, here a spark-submit in cluster deploy mode.
      jar = "command-runner.jar"
      args = [
        "spark-submit",
        "--deploy-mode", "cluster",
        "s3://my-bucket/scripts/transform_data.py",
        # Positional arguments handed to the script: two S3 prefixes
        # (presumably input then output; confirm against transform_data.py).
        "s3://my-bucket/input/",
        "s3://my-bucket/output/",
      ]
    }
  }
}


No comments:

Post a Comment