Monday, 20 October 2025

Tokenizer

🧩 What is a Tokenizer?

A tokenizer is a text preprocessing tool used in NLP (Natural Language Processing) that converts human-readable text into numbers so that models like BERT or DistilBERT can understand it.

💬 Why do we need it?

Machine learning models cannot understand raw text like:

"I love this movie!"

So, we must convert text → tokens → numbers (IDs).

🔤 Step-by-step example

Let’s see what a tokenizer does with DistilBERT.

from transformers import AutoTokenizer

# Load the pretrained tokenizer published under the "distilbert-base-uncased" name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Split the sample sentence into tokens and print the resulting list.
text = "I love this movie!"
tokens = tokenizer.tokenize(text)
print(tokens)

📊 Output:

['i', 'love', 'this', 'movie', '!']

So, it breaks the sentence into small pieces called tokens. Note that "I" became "i": this is an uncased model, so the tokenizer lowercases the text first.

🔢 Convert tokens to IDs

Next, we convert these tokens into numeric IDs (each token’s index in the model’s vocabulary).

# Map each token from the previous step to its integer ID and print the list.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

Example:

[1045, 2293, 2023, 3185, 999]

Each number is the ID of the corresponding token in the tokenizer’s vocabulary (DistilBERT uses the same vocabulary as BERT).

Wednesday, 15 October 2025

GCP Pub/Sub subscription data and how to get it

GCP Pub/Sub Subscription JSON & Python Example

GCP Pub/Sub Subscription Example

JSON File

{
  "protoPayload": {
    "@type": "type.googleapis.com/google.cloud.audit.AuditLog",
    "status": {},
    "authenticationInfo": {
      "principalEmail": "admin@gcp-project.iam.gserviceaccount.com",
      "serviceAccountDelegationInfo": []
    },
    "requestMetadata": {
      "callerIp": "35.203.120.14",
      "callerSuppliedUserAgent": "google-cloud-sdk gcloud/463.0.0"
    },
    "serviceName": "pubsub.googleapis.com",
    "methodName": "google.pubsub.v1.Subscriber.CreateSubscription",
    "resourceName": "projects/my-gcp-project/subscriptions/bq_export_subscription",
    "request": {
      "@type": "type.googleapis.com/google.pubsub.v1.Subscription",
      "name": "projects/my-gcp-project/subscriptions/bq_export_subscription",
      "topic": "projects/my-gcp-project/topics/ticket-updates",
      "bigqueryConfig": {
        "table": "projects/my-gcp-project/datasets/pubsub_exports/tables/ticket_updates",
        "writeMetadata": true,
        "useTopicSchema": true,
        "dropUnknownFields": false,
        "state": "ACTIVE",
        "serviceAccountEmail": "pubsub-bq-writer@my-gcp-project.iam.gserviceaccount.com"
      },
      "ackDeadlineSeconds": 30,
      "retainAckedMessages": false,
      "expirationPolicy": {
        "ttl": "2678400s"
      },
      "messageRetentionDuration": "604800s",
      "enableMessageOrdering": false,
      "labels": {
        "env": "prod",
        "team": "data-engineering"
      }
    },
    "response": {
      "@type": "type.googleapis.com/google.pubsub.v1.Subscription",
      "name": "projects/my-gcp-project/subscriptions/bq_export_subscription",
      "topic": "projects/my-gcp-project/topics/ticket-updates",
      "bigqueryConfig": {
        "table": "projects/my-gcp-project/datasets/pubsub_exports/tables/ticket_updates",
        "state": "ACTIVE"
      }
    }
  },
  "insertId": "px1a12bc3d4e56",
  "resource": {
    "type": "pubsub_subscription",
    "labels": {
      "subscription_id": "bq_export_subscription",
      "project_id": "my-gcp-project"
    }
  },
  "timestamp": "2025-10-15T22:03:19.123Z",
  "severity": "INFO",
  "logName": "projects/my-gcp-project/logs/cloudaudit.googleapis.com%2Factivity",
  "operation": {
    "id": "operation-1234567890",
    "producer": "pubsub.googleapis.com",
    "last": true
  },
  "receiveTimestamp": "2025-10-15T22:03:20.456Z"
}

Python Code

from google.cloud import pubsub_v1

def get_subscription_details(project_id: str, subscription_id: str) -> None:
    """
    Retrieve a Pub/Sub subscription and print its details.

    Prints the subscription name, topic and ack deadline. When the
    subscription has a BigQuery configuration with a service account email,
    the BigQuery table and that email are printed as well; otherwise a
    notice is printed instead.

    Args:
        project_id: ID of the GCP project that owns the subscription.
        subscription_id: Short subscription ID (not the full resource path).

    Returns:
        None. Results are printed; errors raised while fetching or printing
        the subscription are caught and printed rather than propagated.
    """
    # Use the client as a context manager so it is closed on exit instead of
    # staying open until the process ends.
    with pubsub_v1.SubscriberClient() as subscriber:
        # Build the fully qualified subscription name
        subscription_path = subscriber.subscription_path(project_id, subscription_id)
        print(f"Fetching subscription: {subscription_path}")

        try:
            # Get subscription details
            subscription = subscriber.get_subscription(
                request={"subscription": subscription_path}
            )

            # A real newline escape: the doubled backslash used previously
            # printed the two characters "\" and "n" instead of a blank line.
            print("\n✅ Subscription Details:")
            print(f"Name: {subscription.name}")
            print(f"Topic: {subscription.topic}")
            print(f"Ack Deadline: {subscription.ack_deadline_seconds} seconds")

            # Check for BigQuery configuration
            bigquery_config = subscription.bigquery_config
            if bigquery_config and bigquery_config.service_account_email:
                print(f"BigQuery Table: {bigquery_config.table}")
                print(f"Service Account Email: {bigquery_config.service_account_email}")
            else:
                print("No BigQuery configuration or service account email found for this subscription.")

        except Exception as e:
            # Deliberately broad: this helper reports any failure to the
            # console instead of raising to the caller.
            print(f"❌ Error fetching subscription: {e}")


if __name__ == "__main__":
    # Identifiers of the subscription to inspect when run as a script.
    PROJECT_ID = "my-gcp-project"
    SUBSCRIPTION_ID = "bq_export_subscription"
    get_subscription_details(
        project_id=PROJECT_ID,
        subscription_id=SUBSCRIPTION_ID,
    )