Wednesday, 17 September 2025

Day 1

main.tf


provider "google" {
  project = var.project_id
  region  = var.region
}

# Enable required services
resource "google_project_service" "services" {
  for_each = toset([
    "pubsub.googleapis.com",
    "logging.googleapis.com",
    "cloudfunctions.googleapis.com"
  ])
  service = each.key
}

    variable "project_id" {
  description = "Your GCP Project ID"
}
variable "region" {
  default     = "us-central1"
}
    
# Pub/Sub topic that receives log events
resource "google_pubsub_topic" "log_topic" {
  name = "storage-policy-violations"
}

# Pub/Sub topic for SOC alerts
resource "google_pubsub_topic" "soc_alerts" {
  name = "soc-alerts"
}

# Log sink to capture public bucket IAM changes
resource "google_logging_project_sink" "storage_sink" {
  name        = "storage-public-bucket-sink"
  destination = "pubsub.googleapis.com/${google_pubsub_topic.log_topic.id}"

  # Filter: only when public access is granted
  filter = <<EOT
resource.type="gcs_bucket"
protoPayload.methodName="storage.setIamPermissions"
(protoPayload.serviceData.policyDelta.bindingDeltas.member="allUsers"
 OR protoPayload.serviceData.policyDelta.bindingDeltas.member="allAuthenticatedUsers")
EOT

  unique_writer_identity = true
}

# Give sink permission to publish
resource "google_pubsub_topic_iam_member" "sink_pub" {
  topic  = google_pubsub_topic.log_topic.name
  role   = "roles/pubsub.publisher"
  member = google_logging_project_sink.storage_sink.writer_identity
}

# Storage bucket for function code
resource "google_storage_bucket" "function_bucket" {
  name          = "${var.project_id}-function-src"
  location      = var.region
  force_destroy = true
}

# Upload function zip
resource "google_storage_bucket_object" "function_source" {
  name   = "function-source.zip"
  bucket = google_storage_bucket.function_bucket.name
  source = "function-source.zip"
}

# Cloud Function
resource "google_cloudfunctions_function" "notify_soc" {
  name        = "storage-public-alert"
  runtime     = "python39"
  region      = var.region
  entry_point = "process_pubsub"

  source_archive_bucket = google_storage_bucket.function_bucket.name
  source_archive_object = google_storage_bucket_object.function_source.name

  event_trigger {
    event_type = "google.pubsub.topic.publish"
    resource   = google_pubsub_topic.log_topic.name
  }

  available_memory_mb = 256
  description         = "Notifies SOC when a bucket is made public"
}

# Allow function to publish to SOC topic
resource "google_pubsub_topic_iam_member" "function_pub" {
  topic  = google_pubsub_topic.soc_alerts.name
  role   = "roles/pubsub.publisher"
  member = "serviceAccount:${google_cloudfunctions_function.notify_soc.service_account_email}"
}




main.py
import base64
import json
from google.cloud import pubsub_v1

SOC_TOPIC = "soc-alerts"

def process_pubsub(event, context):
    """Triggered when a bucket is made public"""
    if "data" not in event:
        print("No data found in event")
        return

    # Decode log entry
    payload = base64.b64decode(event["data"]).decode("utf-8")
    try:
        log_entry = json.loads(payload)
    except Exception as e:
        print(f"Could not parse log entry: {e}")
        return

    bucket_name = log_entry.get("resource", {}).get("labels", {}).get("bucket_name", "unknown")

    # Create alert message
    message = {
        "alert": "PUBLIC_BUCKET_DETECTED",
        "bucket": bucket_name,
        "log": log_entry
    }

    # Publish to SOC topic
    publisher = pubsub_v1.PublisherClient()
    project_id = log_entry.get("resource", {}).get("labels", {}).get("project_id", "")
    topic_path = publisher.topic_path(project_id, SOC_TOPIC)

    future = publisher.publish(topic_path, json.dumps(message).encode("utf-8"))
    future.result()  # wait for the publish to complete before the function returns
    print(f"⚠️ SOC ALERT: Public bucket detected -> {bucket_name}")
    
# Cleanup helper: remove local __pycache__ directories
# python -c "import pathlib, shutil; [shutil.rmtree(p) for p in pathlib.Path('.').rglob('__pycache__')]"
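
To sanity-check the handler before deploying, a synthetic Pub/Sub event can be fed straight to process_pubsub. This is a minimal local sketch, not part of the deployment: the bucket and project names are placeholders, and because it really publishes to soc-alerts it needs Application Default Credentials and the topic to already exist.

# test_local.py (hypothetical helper): smoke-test process_pubsub with a fake log entry
import base64
import json

from main import process_pubsub

# Shaped like the Cloud Logging audit entry the sink forwards
fake_log_entry = {
    "resource": {
        "labels": {
            "bucket_name": "demo-bucket",      # placeholder bucket name
            "project_id": "my-gcp-project"     # placeholder; must be a real project for the publish to succeed
        }
    },
    "protoPayload": {"methodName": "storage.setIamPermissions"}
}

event = {"data": base64.b64encode(json.dumps(fake_log_entry).encode("utf-8"))}
process_pubsub(event, context=None)

Note that a local run publishes with your own credentials, so the user running it also needs roles/pubsub.publisher on the soc-alerts topic.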

  

Monday, 15 September 2025

AWS Data & ETL Training Master Deck

10-Day instructor-led hands-on training — outline & slides

Day 1: AWS Basics & Account Setup

  • Slide 1: Title, Duration, Instructor
    Course title slide showing Day 1, total duration for the session, and instructor name.
  • Slide 2: Agenda & Learning Objectives
    List the day's agenda and measurable learning objectives (account setup, billing monitoring, MFA, AWS infra concepts).
  • Slide 3: What is Cloud & Why AWS?
    High-level cloud concepts, benefits of cloud vs on-prem, reasons to choose AWS (services, scale, ecosystem).
  • Slide 4: AWS Global Infrastructure Diagram
    Diagram illustrating Regions, Availability Zones, and Edge Locations with brief notes on use-cases (latency, fault-isolation).
  • Slide 5: AWS Account Setup Steps (screenshots)
    Step-by-step account creation guidance with placeholders for screenshots: sign-up, billing info, support plan, root account safety.
  • Slide 6: Hands-on Demo: Billing alarm, MFA
    Step-by-step technical tasks students must perform in the lab:
    1. Enable IAM Billing Access — Console: Account settings → activate IAM access to billing info.
    2. Create CloudWatch Billing Alarm — Console: CloudWatch → Alarms → Create Alarm → Metric: Billing → Total Estimated Charge; set threshold (e.g. $5) → create SNS topic for email notifications → subscribe student email.
    3. Enable MFA on Root/Users — Console: IAM → Users → select user (or root) → Security credentials → Manage MFA → choose Virtual MFA → scan QR with Authenticator app (Google Authenticator/Authy) → verify codes.
    4. Test Access — Demonstrate logging in with an IAM user and validate MFA prompts; verify billing alarm notification by temporarily lowering threshold or using simulated billing metric if available.
    # Example AWS CLI (for reference - optional)
    aws cloudwatch put-metric-alarm \
      --alarm-name "EstimatedChargesAlarm" \
      --metric-name "EstimatedCharges" \
      --namespace "AWS/Billing" \
      --statistic Maximum \
      --period 21600 \
      --evaluation-periods 1 \
      --threshold 5 \
      --comparison-operator GreaterThanOrEqualToThreshold \
      --dimensions Name=Currency,Value=USD \
      --alarm-actions arn:aws:sns:us-east-1:123456789012:BillingAlerts
  • Slide 7: Summary & Q&A
    Recap key takeaways: cloud fundamentals, AWS infra, account safety practices (MFA, billing alarms). Open floor for questions.

Day 2: IAM & Security

  • Slide 1: Agenda & Objectives
    Outline of day: IAM concepts, hands-on user & group creation, policies, best practices.
  • Slide 2: IAM Concepts (Users, Groups, Roles, Policies)
    Explain IAM building blocks: Users, Groups, Roles, Policies, trust vs permissions.
  • Slide 3: IAM Architecture Diagram
    Diagram showing relationship between identities, roles, STS, and resources.
  • Slide 4: Hands-on: Create IAM user/group, attach policy
    Lab steps for students (a boto3 version of these steps appears after this day's outline):
    1. Create an IAM group (e.g., etl-developers).
    2. Create an IAM user (e.g., student01) and add to group.
    3. Create and attach an inline or managed policy (least-privilege example: S3 read/write to a specific bucket).
    4. Test access using AWS CLI with generated access key (recommend temporary credentials or role-based cross-account testing).
  • Slide 5: Best Practices: Least Privilege, MFA
    Guidelines: use roles for services, avoid root, enable MFA, rotate keys, use IAM Access Analyzer, and log with CloudTrail.
  • Slide 6: Summary & Q&A
    Recap and Q&A.
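
Scripted version of the Day 2 lab. A rough boto3 sketch assuming admin credentials in the default profile; the group, user, bucket, and policy names below are placeholders, not fixed course values.

# Day 2 lab sketch (boto3): group, user, and a least-privilege S3 policy (placeholder names)
import json
import boto3

iam = boto3.client("iam")

iam.create_group(GroupName="etl-developers")
iam.create_user(UserName="student01")
iam.add_user_to_group(GroupName="etl-developers", UserName="student01")

# Inline policy scoped to a single training bucket
policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Action": ["s3:GetObject", "s3:PutObject", "s3:ListBucket"],
        "Resource": [
            "arn:aws:s3:::etl-training-bucket",
            "arn:aws:s3:::etl-training-bucket/*"
        ]
    }]
}
iam.put_group_policy(GroupName="etl-developers",
                     PolicyName="etl-s3-least-privilege",
                     PolicyDocument=json.dumps(policy))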

Day 3: Amazon S3 Basics

  • Slide 1: Agenda & Objectives
    Intro to S3, storage classes, basic operations, versioning & lifecycle.
  • Slide 2: S3 Overview (Buckets, Objects, Storage Classes)
    Explain buckets, objects, keys, metadata, and storage classes (Standard, Intelligent-Tiering, IA, Glacier).
  • Slide 3: Versioning & Lifecycle Diagram
    Diagram and examples of versioning and lifecycle rules to transition objects to cheaper storage.
  • Slide 4: Hands-on: Create bucket, upload/download objects
    Lab steps: create bucket, set bucket policy, upload/download via console and CLI, enable versioning (scripted boto3 version below, after the outline).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
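
For students who prefer scripting the Day 3 lab, a boto3 sketch is below. The bucket name and region are placeholders; bucket names must be globally unique, and outside us-east-1 create_bucket also needs a LocationConstraint.

# Day 3 lab sketch (boto3): bucket, versioning, upload/download (placeholder names)
import boto3

s3 = boto3.client("s3", region_name="us-east-1")
bucket = "etl-training-student01-bucket"   # must be globally unique

s3.create_bucket(Bucket=bucket)
s3.put_bucket_versioning(Bucket=bucket,
                         VersioningConfiguration={"Status": "Enabled"})

s3.upload_file("sample.csv", bucket, "raw/sample.csv")
s3.download_file(bucket, "raw/sample.csv", "sample_copy.csv")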

Day 4: Amazon S3 Advanced

  • Slide 1: Agenda & Objectives
    Encryption, bucket policies, event notifications and integration with Lambda/SNS/SQS.
  • Slide 2: Encryption & Security (SSE-S3, SSE-KMS, ACL, Bucket Policy)
    Explain server-side encryption options, KMS keys, ACLs vs bucket policies, and public access blocks.
  • Slide 3: Event Notifications Diagram (S3 → Lambda/SNS/SQS)
    Diagram showing S3 event notification flows to Lambda, SNS, and SQS for processing pipelines.
  • Slide 4: Hands-on: Trigger Lambda on S3 upload
    Lab: create Lambda function, add S3 trigger, upload object to test invocation, view CloudWatch logs (trigger-wiring sketch after this day's outline).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
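
The S3 trigger in the Day 4 lab is normally added from the console; the equivalent wiring in boto3 looks roughly like the sketch below. The bucket name and Lambda ARN are placeholders, and it assumes lambda:AddPermission has already been used so S3 is allowed to invoke the function.

# Day 4 lab sketch (boto3): send ObjectCreated events from a bucket to a Lambda (placeholder names/ARN)
import boto3

s3 = boto3.client("s3")

s3.put_bucket_notification_configuration(
    Bucket="etl-training-student01-bucket",
    NotificationConfiguration={
        "LambdaFunctionConfigurations": [{
            "LambdaFunctionArn": "arn:aws:lambda:us-east-1:123456789012:function:s3-upload-handler",
            "Events": ["s3:ObjectCreated:*"]
        }]
    }
)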

Day 5: Amazon RDS

  • Slide 1: Agenda & Objectives
    Relational databases on AWS, engines, HA patterns, backups and restores.
  • Slide 2: RDS Overview (Engines, Multi-AZ, Read Replica)
    Discuss supported engines (MySQL, PostgreSQL, Aurora), Multi-AZ, read replicas, and failover behavior.
  • Slide 3: Security & VPC integration Diagram
    Diagram showing RDS inside VPC, subnets, SGs, route for application access, and IAM authentication options.
  • Slide 4: Hands-on: Launch RDS instance, connect & query
    Lab: launch a small RDS instance (free tier if available), configure security group, connect via psql/mysql client, run sample queries (Python connection sketch below).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
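
Students who want to run the Day 5 queries from Python instead of the mysql/psql client can use something like the sketch below for a MySQL-engine instance. It assumes pymysql is installed and the security group allows the client IP; the endpoint, credentials, and database name are placeholders.

# Day 5 lab sketch: connect to a MySQL-engine RDS instance and run a query (placeholder endpoint/credentials)
import pymysql  # pip install pymysql

conn = pymysql.connect(
    host="mydb.abcdefghijkl.us-east-1.rds.amazonaws.com",
    user="admin",
    password="change-me",
    database="training"
)
with conn.cursor() as cur:
    cur.execute("SELECT VERSION()")
    print(cur.fetchone())
conn.close()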

Day 6: AWS Glue Basics & Data Catalog

  • Slide 1: Agenda & Objectives
    Intro to Glue, Data Catalog, Crawlers, Jobs and Studio.
  • Slide 2: Glue Architecture Diagram
    Architecture showing Glue interacting with S3, Catalog, and compute (Glue jobs).
  • Slide 3: Glue Components (Catalog, Crawler, Jobs, Studio)
    Explain each component and how they fit into ETL workflows.
  • Slide 4: Hands-on: Catalog S3 CSV/JSON → Glue table
    Lab: create a Glue Crawler to catalogue S3 files and validate the Glue table schema (boto3 crawler sketch after the outline).
  • Slide 5: Query with Athena
    Show how to query Glue cataloged tables using Athena.
  • Slide 6: Summary & Q&A
    Recap and Q&A.
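
The Day 6 crawler can also be created programmatically. A boto3 sketch assuming an existing IAM role with Glue and S3 read permissions; the crawler, role, database, and S3 path are placeholders.

# Day 6 lab sketch (boto3): create and start a Glue crawler over the raw S3 files (placeholder names)
import boto3

glue = boto3.client("glue")

glue.create_crawler(
    Name="etl-training-crawler",
    Role="arn:aws:iam::123456789012:role/GlueCrawlerRole",
    DatabaseName="etl_training_db",
    Targets={"S3Targets": [{"Path": "s3://etl-training-student01-bucket/raw/"}]}
)
glue.start_crawler(Name="etl-training-crawler")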

Day 7: AWS Glue Advanced & PySpark ETL

  • Slide 1: Agenda & Objectives
    Advanced Glue topics and PySpark-based ETL jobs.
  • Slide 2: DynamicFrame vs DataFrame Diagram
    Explain differences, when to use DynamicFrame (schema flexibility) vs DataFrame (performance / Spark APIs).
  • Slide 3: PySpark ETL Transformations (filter, join, aggregate)
    Common transformations with examples and notes about performance and partitioning (a join/aggregate sketch appears after this day's outline).
  • Slide 4: Hands-on Demo: CSV → Parquet → RDS
    Lab: run a PySpark job to convert CSV to Parquet, partition data, and (optionally) push results to RDS.
  • Slide 5: Sample PySpark ETL Job (code snippet)
    Include a short PySpark snippet in the slide for students to review and run (full code in appendix).
    # PySpark (Glue) snippet - runs inside a Glue/Spark session that provides `spark`
    from pyspark.sql.functions import col, to_date

    df = spark.read.csv("s3://bucket/raw/data.csv", header=True)
    df = (df.filter("status = 'active'")
            .withColumn("event_date", to_date(col("timestamp"))))
    df.write.partitionBy("event_date").parquet("s3://bucket/processed/")
  • Slide 6: Integration with Athena
    Show how Athena can query the Parquet output using Glue catalog partitions.
  • Slide 7: Summary & Q&A
    Recap and Q&A.
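
To complement the CSV-to-Parquet snippet on Slide 5, here is a join-and-aggregate sketch for the Slide 3 transformations. The orders/customers datasets and their columns are hypothetical, and an active Glue/Spark session providing `spark` is assumed.

# Day 7 sketch: join two datasets and aggregate (hypothetical paths and columns; assumes `spark` exists)
from pyspark.sql.functions import col, sum as sum_

orders    = spark.read.parquet("s3://bucket/processed/orders/")
customers = spark.read.parquet("s3://bucket/processed/customers/")

daily_revenue = (orders
    .join(customers, on="customer_id", how="inner")
    .filter(col("status") == "active")
    .groupBy("event_date", "region")
    .agg(sum_("amount").alias("total_amount")))

daily_revenue.write.mode("overwrite").parquet("s3://bucket/processed/daily_revenue/")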

Day 8: Amazon Athena

  • Slide 1: Agenda & Objectives
    Introduce Athena, cost model, and best practices for querying data lakes.
  • Slide 2: Athena Overview & Cost Model
    Explain pay-per-query model (data scanned), partitioning, compression, and reducing cost.
  • Slide 3: Querying Glue tables (SELECT, GROUP BY, partitions)
    Examples for common SQL queries over Glue catalog tables and partition-aware queries.
  • Slide 4: Hands-on: Athena SQL Queries
    Lab: run sample queries, test performance, and measure scanned bytes for cost awareness (boto3 query sketch below).
  • Slide 5: Summary & Q&A
    Recap and Q&A.
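
The Day 8 lab runs in the Athena console, but the same queries can be driven from Python, which also makes the scanned-bytes figure easy to capture. A boto3 sketch with placeholder database, table, and results location; the polling loop is deliberately naive.

# Day 8 lab sketch (boto3): run an Athena query and report data scanned (placeholder names)
import time
import boto3

athena = boto3.client("athena")

qid = athena.start_query_execution(
    QueryString="SELECT event_date, COUNT(*) AS events FROM events GROUP BY event_date",
    QueryExecutionContext={"Database": "etl_training_db"},
    ResultConfiguration={"OutputLocation": "s3://etl-training-athena-results/"}
)["QueryExecutionId"]

while True:  # naive polling; real code should time out and surface error messages
    execution = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]
    if execution["Status"]["State"] in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(2)

print(execution["Status"]["State"],
      execution["Statistics"]["DataScannedInBytes"], "bytes scanned")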

Day 9: AWS Lambda & CloudWatch

  • Slide 1: Agenda & Objectives
    Serverless compute basics, event-driven architecture, monitoring & observability.
  • Slide 2: Lambda Lifecycle Diagram
    Diagram: cold start, container reuse, concurrency limits.
  • Slide 3: Triggers: S3, Glue, RDS
    Examples of event sources and patterns to invoke Lambda for ETL steps.
  • Slide 4: CloudWatch Metrics, Logs, Alarms
    How to instrument Lambda with logs, custom metrics, and alarms for failure/latency (custom-metric sketch after the outline).
  • Slide 5: Hands-on: Lambda triggered by S3
    Lab: deploy a Python Lambda, configure S3 trigger, upload object to test, observe CloudWatch logs.
  • Slide 6: Sample Python Lambda Code
    Example code snippet to include on slide:
    # sample lambda handler
    def handler(event, context):
        for record in event['Records']:
            key = record['s3']['object']['key']
            # process object (e.g., read, transform, write)
            print(f"Processing {key}")
  • Slide 7: Summary & Q&A
    Recap and Q&A.
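
For Slide 4, a Lambda can emit a custom metric and an alarm can watch it. A boto3 sketch; the namespace, metric name, and SNS topic ARN are placeholders, and the hard-coded value of 3 just stands in for a real failure count.

# Day 9 sketch (boto3): publish a custom metric from a Lambda and alarm on it (placeholder names/ARN)
import boto3

cloudwatch = boto3.client("cloudwatch")

# Inside the handler: record how many records failed to process
cloudwatch.put_metric_data(
    Namespace="ETLTraining",
    MetricData=[{"MetricName": "FailedRecords", "Value": 3, "Unit": "Count"}]
)

# One-time setup: notify when any failures appear in a 5-minute window
cloudwatch.put_metric_alarm(
    AlarmName="etl-failed-records",
    Namespace="ETLTraining",
    MetricName="FailedRecords",
    Statistic="Sum",
    Period=300,
    EvaluationPeriods=1,
    Threshold=1,
    ComparisonOperator="GreaterThanOrEqualToThreshold",
    AlarmActions=["arn:aws:sns:us-east-1:123456789012:EtlAlerts"]
)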

Day 10: Capstone Project & Wrap-Up

  • Slide 1: Agenda & Objectives
    Overview of final integrated pipeline and evaluation criteria for the capstone.
  • Slide 2: End-to-End ETL Pipeline Diagram (S3 → Glue → Athena → RDS)
    A diagram showing full flow: data ingest → catalog → transform → query → store and monitor.
  • Slide 3: Step-by-Step Demo Script
    Steps for the instructor & students to follow (an end-to-end boto3 sketch follows the outline):
    1. Upload CSV to S3
    2. Glue Crawler → Catalog
    3. Glue PySpark ETL → Parquet
    4. Athena Queries
    5. Optional: Load into RDS
    6. CloudWatch Monitoring
  • Slide 4: Summary of Key Takeaways
    Highlight the major learnings from the course and recommended next steps/resources.
  • Slide 5: Final Q&A
    Open discussion, feedback, and next steps for continued learning.
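
The Slide 3 demo script can also be driven from a single Python script. This is only a skeleton with placeholder bucket, crawler, job, and database names; each start_* call is asynchronous, so a real run would poll for completion between steps, as in the Day 8 example.

# Day 10 capstone sketch (boto3): S3 upload -> Glue crawler -> Glue job -> Athena query (placeholder names)
import boto3

s3 = boto3.client("s3")
glue = boto3.client("glue")
athena = boto3.client("athena")

# 1. Upload raw CSV
s3.upload_file("sales.csv", "etl-training-student01-bucket", "raw/sales.csv")

# 2. Catalog it with the crawler from Day 6
glue.start_crawler(Name="etl-training-crawler")

# 3. Transform CSV -> Parquet with the PySpark job from Day 7
glue.start_job_run(JobName="csv-to-parquet-job")

# 4. Query the processed data with Athena
athena.start_query_execution(
    QueryString="SELECT region, SUM(amount) AS total FROM daily_revenue GROUP BY region",
    QueryExecutionContext={"Database": "etl_training_db"},
    ResultConfiguration={"OutputLocation": "s3://etl-training-athena-results/"}
)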
Editable master deck for instructor use: add diagrams, screenshots and code files as needed.

Wednesday, 3 September 2025

Terraform Commands


 

Init & Setup
  terraform init: Initialize Terraform working directory
  terraform init -reconfigure: Reinitialize and ignore previous backend configs
  terraform init -upgrade: Reinitialize and upgrade providers/modules
  terraform get: Download and update modules

Planning
  terraform plan: Show planned changes
  terraform plan -out=tfplan: Save execution plan to a file

Apply/Destroy
  terraform apply: Apply changes with confirmation
  terraform apply tfplan: Apply using a saved plan file
  terraform apply -auto-approve: Apply without manual approval
  terraform destroy: Destroy infrastructure with confirmation
  terraform destroy -auto-approve: Destroy without confirmation
  terraform destroy -target=aws_instance.example: Destroy specific resource

Validate & Format
  terraform validate: Validate configuration syntax
  terraform fmt: Format Terraform files
  terraform fmt -recursive: Format files in all subdirectories

Output
  terraform output: Show output variables
  terraform output -json: Show outputs in JSON format

State Management
  terraform show: Show full state or plan content
  terraform state list: List all resources in the state file
  terraform state show <resource>: Show specific resource details
  terraform state pull: Download current state file
  terraform state push: Upload local state file (used with care)
  terraform refresh: Update state with real infrastructure
  terraform taint <resource>: Mark a resource for recreation
  terraform untaint <resource>: Remove taint from a resource

Workspace Management
  terraform workspace list: List all workspaces
  terraform workspace new <name>: Create new workspace (e.g., dev, prod)
  terraform workspace select <name>: Switch to another workspace
  terraform workspace delete <name>: Delete a workspace

Debugging & Visuals
  TF_LOG=DEBUG terraform plan: Enable debug logging
  TF_LOG_PATH=log.txt terraform apply: Save logs to a file
  terraform graph | dot -Tpng > graph.png: Visualize resource graph (Graphviz needed)

Terraform Cloud
  terraform login: Authenticate to Terraform Cloud
  terraform logout: Remove local credentials
  terraform state push: Manually upload state file to remote