Wednesday, 17 September 2025

Day 1

main.tf


provider "google" {
  project = var.project_id
  region  = var.region
}

# Enable required services
resource "google_project_service" "services" {
  for_each = toset([
    "pubsub.googleapis.com",
    "logging.googleapis.com",
    "cloudfunctions.googleapis.com"
  ])
  service = each.key
}

    variable "project_id" {
  description = "Your GCP Project ID"
}
variable "region" {
  default     = "us-central1"
}
    
# Pub/Sub topic that receives log events
resource "google_pubsub_topic" "log_topic" {
  name = "storage-policy-violations"
}

# Pub/Sub topic for SOC alerts
resource "google_pubsub_topic" "soc_alerts" {
  name = "soc-alerts"
}

# Log sink to capture public bucket IAM changes
resource "google_logging_project_sink" "storage_sink" {
  name        = "storage-public-bucket-sink"
  destination = "pubsub.googleapis.com/${google_pubsub_topic.log_topic.id}"

  # Filter: only when public access is granted
  filter = <<EOT
resource.type="gcs_bucket"
protoPayload.methodName="storage.setIamPermissions"
(protoPayload.serviceData.policyDelta.bindingDeltas.member="allUsers"
 OR protoPayload.serviceData.policyDelta.bindingDeltas.member="allAuthenticatedUsers")
EOT

  unique_writer_identity = true
}

# Give sink permission to publish
resource "google_pubsub_topic_iam_member" "sink_pub" {
  topic  = google_pubsub_topic.log_topic.name
  role   = "roles/pubsub.publisher"
  member = google_logging_project_sink.storage_sink.writer_identity
}

# Storage bucket for function code
resource "google_storage_bucket" "function_bucket" {
  name          = "${var.project_id}-function-src"
  location      = var.region
  force_destroy = true
}

# Upload function zip
resource "google_storage_bucket_object" "function_source" {
  name   = "function-source.zip"
  bucket = google_storage_bucket.function_bucket.name
  source = "function-source.zip"
}

# Cloud Function
resource "google_cloudfunctions_function" "notify_soc" {
  name        = "storage-public-alert"
  runtime     = "python39"
  region      = var.region
  entry_point = "process_pubsub"

  source_archive_bucket = google_storage_bucket.function_bucket.name
  source_archive_object = google_storage_bucket_object.function_source.name

  event_trigger {
    event_type = "google.pubsub.topic.publish"
    resource   = google_pubsub_topic.log_topic.name
  }

  available_memory_mb = 256
  description         = "Notifies SOC when a bucket is made public"
}

# Allow function to publish to SOC topic
resource "google_pubsub_topic_iam_member" "function_pub" {
  topic  = google_pubsub_topic.soc_alerts.name
  role   = "roles/pubsub.publisher"
  member = "serviceAccount:${google_cloudfunctions_function.notify_soc.service_account_email}"
}




main.py
import base64
import json
from google.cloud import pubsub_v1

SOC_TOPIC = "soc-alerts"

def process_pubsub(event, context):
    """Triggered when a bucket is made public"""
    if "data" not in event:
        print("No data found in event")
        return

    # Decode log entry
    payload = base64.b64decode(event["data"]).decode("utf-8")
    try:
        log_entry = json.loads(payload)
    except Exception as e:
        print(f"Could not parse log entry: {e}")
        return

    bucket_name = log_entry.get("resource", {}).get("labels", {}).get("bucket_name", "unknown")

    # Create alert message
    message = {
        "alert": "PUBLIC_BUCKET_DETECTED",
        "bucket": bucket_name,
        "log": log_entry
    }

    # Publish to SOC topic
    publisher = pubsub_v1.PublisherClient()
    project_id = log_entry.get("resource", {}).get("labels", {}).get("project_id", "")
    topic_path = publisher.topic_path(project_id, SOC_TOPIC)

    future = publisher.publish(topic_path, json.dumps(message).encode("utf-8"))
    future.result()  # wait for the publish to complete before the function returns
    print(f"⚠️ SOC ALERT: Public bucket detected -> {bucket_name}")
    
# Cleanup helper: remove local __pycache__ directories
# python -c "import pathlib, shutil; [shutil.rmtree(p) for p in pathlib.Path('.').rglob('__pycache__')]"
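
To sanity-check the handler before deploying, a synthetic Pub/Sub event can be fed straight to process_pubsub. This is a minimal local sketch, not part of the deployment: the bucket and project names are placeholders, and because it really publishes to soc-alerts it needs Application Default Credentials and the topic to already exist.

# test_local.py (hypothetical helper): smoke-test process_pubsub with a fake log entry
import base64
import json

from main import process_pubsub

# Shaped like the Cloud Logging audit entry the sink forwards
fake_log_entry = {
    "resource": {
        "labels": {
            "bucket_name": "demo-bucket",      # placeholder bucket name
            "project_id": "my-gcp-project"     # placeholder; must be a real project for the publish to succeed
        }
    },
    "protoPayload": {"methodName": "storage.setIamPermissions"}
}

event = {"data": base64.b64encode(json.dumps(fake_log_entry).encode("utf-8"))}
process_pubsub(event, context=None)

Note that a local run publishes with your own credentials, so the user running it also needs roles/pubsub.publisher on the soc-alerts topic.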

  

Monday, 15 September 2025

AWS Data & ETL Training Master Deck

10-Day instructor-led hands-on training — outline & slides

Day 1: AWS Basics & Account Setup

  • Slide 1: Title, Duration, Instructor
    Course title slide showing Day 1, total duration for the session, and instructor name.
  • Slide 2: Agenda & Learning Objectives
    List the day's agenda and measurable learning objectives (account setup, billing monitoring, MFA, AWS infra concepts).
  • Slide 3: What is Cloud & Why AWS?
    High-level cloud concepts, benefits of cloud vs on-prem, reasons to choose AWS (services, scale, ecosystem).
  • Slide 4: AWS Global Infrastructure Diagram
    Diagram illustrating Regions, Availability Zones, and Edge Locations with brief notes on use-cases (latency, fault-isolation).
  • Slide 5: AWS Account Setup Steps (screenshots)
    Step-by-step account creation guidance with placeholders for screenshots: sign-up, billing info, support plan, root account safety.
  • Slide 6: Hands-on Demo: Billing alarm, MFA
    Step-by-step technical tasks students must perform in the lab:
    1. Enable IAM Billing Access — Console: Account settings → activate IAM access to billing info.
    2. Create CloudWatch Billing Alarm — Console: CloudWatch → Alarms → Create Alarm → Metric: Billing → Total Estimated Charge; set threshold (e.g. $5) → create SNS topic for email notifications → subscribe student email.
    3. Enable MFA on Root/Users — Console: IAM → Users → select user (or root) → Security credentials → Manage MFA → choose Virtual MFA → scan QR with Authenticator app (Google Authenticator/Authy) → verify codes.
    4. Test Access — Demonstrate logging in with an IAM user and validate MFA prompts; verify billing alarm notification by temporarily lowering threshold or using simulated billing metric if available.
    # Example AWS CLI (for reference - optional)
    aws cloudwatch put-metric-alarm \
      --alarm-name "EstimatedChargesAlarm" \
      --metric-name "EstimatedCharges" \
      --namespace "AWS/Billing" \
      --statistic Maximum \
      --period 21600 \
      --evaluation-periods 1 \
      --threshold 5 \
      --comparison-operator GreaterThanOrEqualToThreshold \
      --dimensions Name=Currency,Value=USD \
      --alarm-actions arn:aws:sns:us-east-1:123456789012:BillingAlerts
  • Slide 7: Summary & Q&A
    Recap key takeaways: cloud fundamentals, AWS infra, account safety practices (MFA, billing alarms). Open floor for questions.

Day 2: IAM & Security

  • Slide 1: Agenda & Objectives
    Outline of day: IAM concepts, hands-on user & group creation, policies, best practices.
  • Slide 2: IAM Concepts (Users, Groups, Roles, Policies)
    Explain IAM building blocks: Users, Groups, Roles, Policies, trust vs permissions.
  • Slide 3: IAM Architecture Diagram
    Diagram showing relationship between identities, roles, STS, and resources.
  • Slide 4: Hands-on: Create IAM user/group, attach policy
    Lab steps for students (a boto3 version of these steps appears after this day's outline):
    1. Create an IAM group (e.g., etl-developers).
    2. Create an IAM user (e.g., student01) and add to group.
    3. Create and attach an inline or managed policy (least-privilege example: S3 read/write to a specific bucket).
    4. Test access using AWS CLI with generated access key (recommend temporary credentials or role-based cross-account testing).
  • Slide 5: Best Practices: Least Privilege, MFA
    Guidelines: use roles for services, avoid root, enable MFA, rotate keys, use IAM Access Analyzer, and log with CloudTrail.
  • Slide 6: Summary & Q&A
    Recap and Q&A.
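
Scripted version of the Day 2 lab. A rough boto3 sketch assuming admin credentials in the default profile; the group, user, bucket, and policy names below are placeholders, not fixed course values.

# Day 2 lab sketch (boto3): group, user, and a least-privilege S3 policy (placeholder names)
import json
import boto3

iam = boto3.client("iam")

iam.create_group(GroupName="etl-developers")
iam.create_user(UserName="student01")
iam.add_user_to_group(GroupName="etl-developers", UserName="student01")

# Inline policy scoped to a single training bucket
policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Action": ["s3:GetObject", "s3:PutObject", "s3:ListBucket"],
        "Resource": [
            "arn:aws:s3:::etl-training-bucket",
            "arn:aws:s3:::etl-training-bucket/*"
        ]
    }]
}
iam.put_group_policy(GroupName="etl-developers",
                     PolicyName="etl-s3-least-privilege",
                     PolicyDocument=json.dumps(policy))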

Day 3: Amazon S3 Basics

  • Slide 1: Agenda & Objectives
    Intro to S3, storage classes, basic operations, versioning & lifecycle.
  • Slide 2: S3 Overview (Buckets, Objects, Storage Classes)
    Explain buckets, objects, keys, metadata, and storage classes (Standard, Intelligent-Tiering, IA, Glacier).
  • Slide 3: Versioning & Lifecycle Diagram
    Diagram and examples of versioning and lifecycle rules to transition objects to cheaper storage.
  • Slide 4: Hands-on: Create bucket, upload/download objects
    Lab steps: create bucket, set bucket policy, upload/download via console and CLI, enable versioning (scripted boto3 version below, after the outline).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
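
For students who prefer scripting the Day 3 lab, a boto3 sketch is below. The bucket name and region are placeholders; bucket names must be globally unique, and outside us-east-1 create_bucket also needs a LocationConstraint.

# Day 3 lab sketch (boto3): bucket, versioning, upload/download (placeholder names)
import boto3

s3 = boto3.client("s3", region_name="us-east-1")
bucket = "etl-training-student01-bucket"   # must be globally unique

s3.create_bucket(Bucket=bucket)
s3.put_bucket_versioning(Bucket=bucket,
                         VersioningConfiguration={"Status": "Enabled"})

s3.upload_file("sample.csv", bucket, "raw/sample.csv")
s3.download_file(bucket, "raw/sample.csv", "sample_copy.csv")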

Day 4: Amazon S3 Advanced

  • Slide 1: Agenda & Objectives
    Encryption, bucket policies, event notifications and integration with Lambda/SNS/SQS.
  • Slide 2: Encryption & Security (SSE-S3, SSE-KMS, ACL, Bucket Policy)
    Explain server-side encryption options, KMS keys, ACLs vs bucket policies, and public access blocks.
  • Slide 3: Event Notifications Diagram (S3 → Lambda/SNS/SQS)
    Diagram showing S3 event notification flows to Lambda, SNS, and SQS for processing pipelines.
  • Slide 4: Hands-on: Trigger Lambda on S3 upload
    Lab: create Lambda function, add S3 trigger, upload object to test invocation, view CloudWatch logs (trigger-wiring sketch after this day's outline).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
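
The S3 trigger in the Day 4 lab is normally added from the console; the equivalent wiring in boto3 looks roughly like the sketch below. The bucket name and Lambda ARN are placeholders, and it assumes lambda:AddPermission has already been used so S3 is allowed to invoke the function.

# Day 4 lab sketch (boto3): send ObjectCreated events from a bucket to a Lambda (placeholder names/ARN)
import boto3

s3 = boto3.client("s3")

s3.put_bucket_notification_configuration(
    Bucket="etl-training-student01-bucket",
    NotificationConfiguration={
        "LambdaFunctionConfigurations": [{
            "LambdaFunctionArn": "arn:aws:lambda:us-east-1:123456789012:function:s3-upload-handler",
            "Events": ["s3:ObjectCreated:*"]
        }]
    }
)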

Day 5: Amazon RDS

  • Slide 1: Agenda & Objectives
    Relational databases on AWS, engines, HA patterns, backups and restores.
  • Slide 2: RDS Overview (Engines, Multi-AZ, Read Replica)
    Discuss supported engines (MySQL, PostgreSQL, Aurora), Multi-AZ, read replicas, and failover behavior.
  • Slide 3: Security & VPC integration Diagram
    Diagram showing RDS inside VPC, subnets, SGs, route for application access, and IAM authentication options.
  • Slide 4: Hands-on: Launch RDS instance, connect & query
    Lab: launch a small RDS instance (free tier if available), configure security group, connect via psql/mysql client, run sample queries (Python connection sketch below).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
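
Students who want to run the Day 5 queries from Python instead of the mysql/psql client can use something like the sketch below for a MySQL-engine instance. It assumes pymysql is installed and the security group allows the client IP; the endpoint, credentials, and database name are placeholders.

# Day 5 lab sketch: connect to a MySQL-engine RDS instance and run a query (placeholder endpoint/credentials)
import pymysql  # pip install pymysql

conn = pymysql.connect(
    host="mydb.abcdefghijkl.us-east-1.rds.amazonaws.com",
    user="admin",
    password="change-me",
    database="training"
)
with conn.cursor() as cur:
    cur.execute("SELECT VERSION()")
    print(cur.fetchone())
conn.close()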

Day 6: AWS Glue Basics & Data Catalog

  • Slide 1: Agenda & Objectives
    Intro to Glue, Data Catalog, Crawlers, Jobs and Studio.
  • Slide 2: Glue Architecture Diagram
    Architecture showing Glue interacting with S3, Catalog, and compute (Glue jobs).
  • Slide 3: Glue Components (Catalog, Crawler, Jobs, Studio)
    Explain each component and how they fit into ETL workflows.
  • Slide 4: Hands-on: Catalog S3 CSV/JSON → Glue table
    Lab: create a Glue Crawler to catalogue S3 files and validate the Glue table schema (boto3 crawler sketch after the outline).
  • Slide 5: Query with Athena
    Show how to query Glue cataloged tables using Athena.
  • Slide 6: Summary & Q&A
    Recap and Q&A.
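
The Day 6 crawler can also be created programmatically. A boto3 sketch assuming an existing IAM role with Glue and S3 read permissions; the crawler, role, database, and S3 path are placeholders.

# Day 6 lab sketch (boto3): create and start a Glue crawler over the raw S3 files (placeholder names)
import boto3

glue = boto3.client("glue")

glue.create_crawler(
    Name="etl-training-crawler",
    Role="arn:aws:iam::123456789012:role/GlueCrawlerRole",
    DatabaseName="etl_training_db",
    Targets={"S3Targets": [{"Path": "s3://etl-training-student01-bucket/raw/"}]}
)
glue.start_crawler(Name="etl-training-crawler")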

Day 7: AWS Glue Advanced & PySpark ETL

  • Slide 1: Agenda & Objectives
    Advanced Glue topics and PySpark-based ETL jobs.
  • Slide 2: DynamicFrame vs DataFrame Diagram
    Explain differences, when to use DynamicFrame (schema flexibility) vs DataFrame (performance / Spark APIs).
  • Slide 3: PySpark ETL Transformations (filter, join, aggregate)
    Common transformations with examples and notes about performance and partitioning (a join/aggregate sketch appears after this day's outline).
  • Slide 4: Hands-on Demo: CSV → Parquet → RDS
    Lab: run a PySpark job to convert CSV to Parquet, partition data, and (optionally) push results to RDS.
  • Slide 5: Sample PySpark ETL Job (code snippet)
    Include a short PySpark snippet in the slide for students to review and run (full code in appendix).
    # PySpark (Glue) snippet - runs inside a Glue/Spark session that provides `spark`
    from pyspark.sql.functions import col, to_date

    df = spark.read.csv("s3://bucket/raw/data.csv", header=True)
    df = (df.filter("status = 'active'")
            .withColumn("event_date", to_date(col("timestamp"))))
    df.write.partitionBy("event_date").parquet("s3://bucket/processed/")
  • Slide 6: Integration with Athena
    Show how Athena can query the Parquet output using Glue catalog partitions.
  • Slide 7: Summary & Q&A
    Recap and Q&A.
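
To complement the CSV-to-Parquet snippet on Slide 5, here is a join-and-aggregate sketch for the Slide 3 transformations. The orders/customers datasets and their columns are hypothetical, and an active Glue/Spark session providing `spark` is assumed.

# Day 7 sketch: join two datasets and aggregate (hypothetical paths and columns; assumes `spark` exists)
from pyspark.sql.functions import col, sum as sum_

orders    = spark.read.parquet("s3://bucket/processed/orders/")
customers = spark.read.parquet("s3://bucket/processed/customers/")

daily_revenue = (orders
    .join(customers, on="customer_id", how="inner")
    .filter(col("status") == "active")
    .groupBy("event_date", "region")
    .agg(sum_("amount").alias("total_amount")))

daily_revenue.write.mode("overwrite").parquet("s3://bucket/processed/daily_revenue/")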

Day 8: Amazon Athena

  • Slide 1: Agenda & Objectives
    Introduce Athena, cost model, and best practices for querying data lakes.
  • Slide 2: Athena Overview & Cost Model
    Explain pay-per-query model (data scanned), partitioning, compression, and reducing cost.
  • Slide 3: Querying Glue tables (SELECT, GROUP BY, partitions)
    Examples for common SQL queries over Glue catalog tables and partition-aware queries.
  • Slide 4: Hands-on: Athena SQL Queries
    Lab: run sample queries, test performance, and measure scanned bytes for cost awareness (boto3 query sketch below).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
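
The Day 8 lab runs in the Athena console, but the same queries can be driven from Python, which also makes the scanned-bytes figure easy to capture. A boto3 sketch with placeholder database, table, and results location; the polling loop is deliberately naive.

# Day 8 lab sketch (boto3): run an Athena query and report data scanned (placeholder names)
import time
import boto3

athena = boto3.client("athena")

qid = athena.start_query_execution(
    QueryString="SELECT event_date, COUNT(*) AS events FROM events GROUP BY event_date",
    QueryExecutionContext={"Database": "etl_training_db"},
    ResultConfiguration={"OutputLocation": "s3://etl-training-athena-results/"}
)["QueryExecutionId"]

while True:  # naive polling; real code should time out and surface error messages
    execution = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]
    if execution["Status"]["State"] in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(2)

print(execution["Status"]["State"],
      execution["Statistics"]["DataScannedInBytes"], "bytes scanned")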

Day 9: AWS Lambda & CloudWatch

  • Slide 1: Agenda & Objectives
    Serverless compute basics, event-driven architecture, monitoring & observability.
  • Slide 2: Lambda Lifecycle Diagram
    Diagram: cold start, container reuse, concurrency limits.
  • Slide 3: Triggers: S3, Glue, RDS
    Examples of event sources and patterns to invoke Lambda for ETL steps.
  • Slide 4: CloudWatch Metrics, Logs, Alarms
    How to instrument Lambda with logs, custom metrics, and alarms for failure/latency (custom-metric sketch after the outline).
  • Slide 5: Hands-on: Lambda triggered by S3
    Lab: deploy a Python Lambda, configure S3 trigger, upload object to test, observe CloudWatch logs.
  • Slide 6: Sample Python Lambda Code
    Example code snippet to include on slide:
    # sample lambda handler
    def handler(event, context):
        for record in event['Records']:
            key = record['s3']['object']['key']
            # process object (e.g., read, transform, write)
            print(f"Processing {key}")
  • Slide 7: Summary & Q&A
    Recap and Q&A.
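
For Slide 4, a Lambda can emit a custom metric and an alarm can watch it. A boto3 sketch; the namespace, metric name, and SNS topic ARN are placeholders, and the hard-coded value of 3 just stands in for a real failure count.

# Day 9 sketch (boto3): publish a custom metric from a Lambda and alarm on it (placeholder names/ARN)
import boto3

cloudwatch = boto3.client("cloudwatch")

# Inside the handler: record how many records failed to process
cloudwatch.put_metric_data(
    Namespace="ETLTraining",
    MetricData=[{"MetricName": "FailedRecords", "Value": 3, "Unit": "Count"}]
)

# One-time setup: notify when any failures appear in a 5-minute window
cloudwatch.put_metric_alarm(
    AlarmName="etl-failed-records",
    Namespace="ETLTraining",
    MetricName="FailedRecords",
    Statistic="Sum",
    Period=300,
    EvaluationPeriods=1,
    Threshold=1,
    ComparisonOperator="GreaterThanOrEqualToThreshold",
    AlarmActions=["arn:aws:sns:us-east-1:123456789012:EtlAlerts"]
)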

Day 10: Capstone Project & Wrap-Up

  • Slide 1: Agenda & Objectives
    Overview of final integrated pipeline and evaluation criteria for the capstone.
  • Slide 2: End-to-End ETL Pipeline Diagram (S3 → Glue → Athena → RDS)
    A diagram showing full flow: data ingest → catalog → transform → query → store and monitor.
  • Slide 3: Step-by-Step Demo Script
    Steps for the instructor & students to follow (an end-to-end boto3 sketch follows the outline):
    1. Upload CSV to S3
    2. Glue Crawler → Catalog
    3. Glue PySpark ETL → Parquet
    4. Athena Queries
    5. Optional: Load into RDS
    6. CloudWatch Monitoring
  • Slide 4: Summary of Key Takeaways
    Highlight the major learnings from the course and recommended next steps/resources.
  • Slide 5: Final Q&A
    Open discussion, feedback, and next steps for continued learning.
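
The Slide 3 demo script can also be driven from a single Python script. This is only a skeleton with placeholder bucket, crawler, job, and database names; each start_* call is asynchronous, so a real run would poll for completion between steps, as in the Day 8 example.

# Day 10 capstone sketch (boto3): S3 upload -> Glue crawler -> Glue job -> Athena query (placeholder names)
import boto3

s3 = boto3.client("s3")
glue = boto3.client("glue")
athena = boto3.client("athena")

# 1. Upload raw CSV
s3.upload_file("sales.csv", "etl-training-student01-bucket", "raw/sales.csv")

# 2. Catalog it with the crawler from Day 6
glue.start_crawler(Name="etl-training-crawler")

# 3. Transform CSV -> Parquet with the PySpark job from Day 7
glue.start_job_run(JobName="csv-to-parquet-job")

# 4. Query the processed data with Athena
athena.start_query_execution(
    QueryString="SELECT region, SUM(amount) AS total FROM daily_revenue GROUP BY region",
    QueryExecutionContext={"Database": "etl_training_db"},
    ResultConfiguration={"OutputLocation": "s3://etl-training-athena-results/"}
)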
Editable master deck for instructor use: add diagrams, screenshots and code files as needed.

Wednesday, 3 September 2025

Terraform Commands


 

Init & Setup
  terraform init: Initialize Terraform working directory
  terraform init -reconfigure: Reinitialize and ignore previous backend configs
  terraform init -upgrade: Reinitialize and upgrade providers/modules
  terraform get: Download and update modules

Planning
  terraform plan: Show planned changes
  terraform plan -out=tfplan: Save execution plan to a file

Apply/Destroy
  terraform apply: Apply changes with confirmation
  terraform apply tfplan: Apply using a saved plan file
  terraform apply -auto-approve: Apply without manual approval
  terraform destroy: Destroy infrastructure with confirmation
  terraform destroy -auto-approve: Destroy without confirmation
  terraform destroy -target=aws_instance.example: Destroy specific resource

Validate & Format
  terraform validate: Validate configuration syntax
  terraform fmt: Format Terraform files
  terraform fmt -recursive: Format files in all subdirectories

Output
  terraform output: Show output variables
  terraform output -json: Show outputs in JSON format

State Management
  terraform show: Show full state or plan content
  terraform state list: List all resources in the state file
  terraform state show <resource>: Show specific resource details
  terraform state pull: Download current state file
  terraform state push: Upload local state file (used with care)
  terraform refresh: Update state with real infrastructure
  terraform taint <resource>: Mark a resource for recreation
  terraform untaint <resource>: Remove taint from a resource

Workspace Management
  terraform workspace list: List all workspaces
  terraform workspace new <name>: Create new workspace (e.g., dev, prod)
  terraform workspace select <name>: Switch to another workspace
  terraform workspace delete <name>: Delete a workspace

Debugging & Visuals
  TF_LOG=DEBUG terraform plan: Enable debug logging
  TF_LOG_PATH=log.txt terraform apply: Save logs to a file
  terraform graph | dot -Tpng > graph.png: Visualize resource graph (Graphviz needed)

Terraform Cloud
  terraform login: Authenticate to Terraform Cloud
  terraform logout: Remove local credentials
  terraform state push: Manually upload state file to remote