feat: Cloudflare DNS ad-blocking

karteekiitg · karteekiitg · commit cae52902573f · 2025-05-20T10:41:06.000+05:30
Signed-off-by: Karteek &lt;120569182+karteekiitg@users.noreply.github.com&gt;
diff --git a/.github/workflows/cf_adblock.yaml b/.github/workflows/cf_adblock.yaml
@@ -0,0 +1,109 @@
+# Regenerates the ad-block domain chunks and applies the tofu/cf-adblock
+# configuration, either on a monthly schedule or on manual dispatch.
+name: Monthly Cloudflare Adblock Update
+
+on:
+  workflow_dispatch: # Allows manual triggering
+  schedule:
+    - cron: "0 0 1 * *" # Runs at 00:00 UTC on the 1st day of every month
+
+# Workflow-wide environment, visible to every step of every job.
+env:
+  # The TF_VAR_ prefix exposes this value to OpenTofu as input variable `gcs_env`.
+  TF_VAR_gcs_env: prod
+
+permissions:
+  # Read-only access to the repository is enough for the checkout step.
+  contents: read
+  # Lets the job request an OIDC token; the Google Cloud auth step below
+  # authenticates through a workload identity provider.
+  id-token: write
+
+jobs:
+  update_cf_adblock:
+    runs-on: ubuntu-latest
+    # Every step of this job runs inside this container image.
+    # NOTE(review): `latest` is a mutable tag; presumably the image ships tofu
+    # and the other tools the steps call — confirm, and consider pinning a digest.
+    container:
+      image: ghcr.io/karteekiitg/k8s_setup:latest
+
+    steps:
+      # The action is pinned to a full commit SHA rather than a movable tag.
+      - name: Checkout repository
+        id: checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      # Publishes every KEY=VALUE assignment of the repository's .env file to
+      # the following steps by appending it to the GITHUB_ENV file.
+      - name: Load .env file to environment
+        shell: bash
+        run: |
+          # Guard clause: without a .env file there is nothing to load.
+          if [ ! -f "./.env" ]; then
+            echo -e "\033[31mError: .env file not found at ./.\033[0m"
+            exit 1
+          fi
+
+          echo "Sourcing .env file..."
+          # Drop comment lines and blank lines, keep only lines containing '='.
+          # "$GITHUB_ENV" is quoted so the redirect target is never word-split.
+          # `shell: bash` runs with `-eo pipefail`, so a .env file without any
+          # assignment makes this pipeline, and therefore the step, fail.
+          grep -v '^[[:space:]]*#' ./.env | grep -v '^[[:space:]]*$' | grep '=' >> "$GITHUB_ENV"
+          echo "Finished processing .env file for GITHUB_ENV."
+
+      # Runs setup_infisical.sh and forwards the KEY=VALUE pairs it writes to
+      # $HOME/.infisical_exports.env into GITHUB_ENV for the following steps.
+      - name: Load secrets to environment
+        shell: bash
+        env: # Environment variables specific to THIS step
+          TF_VAR_infisical_client_secret: ${{ secrets.INFISICAL_CLIENT_SECRET }}
+        run: |
+          EXPORT_FILE="$HOME/.infisical_exports.env"
+          # The export file holds secrets: remove it however this step ends,
+          # including when one of the commands below fails.
+          trap 'rm -f "$EXPORT_FILE"' EXIT
+
+          echo "Making setup_infisical.sh executable..."
+          chmod +x ./.devcontainer/setup_infisical.sh
+          echo "Running setup_infisical.sh..."
+          # `shell: bash` runs with `-eo pipefail`: a bare failing command aborts
+          # the step immediately, so a separate `$?` test afterwards would never
+          # run. Testing the command directly keeps the error message reachable.
+          if ! ./.devcontainer/setup_infisical.sh; then
+            echo -e "\033[31mError: setup_infisical.sh failed. See script output above for details.\033[0m"
+            exit 1
+          fi
+
+          if [ ! -f "$EXPORT_FILE" ]; then
+            echo -e "\033[31mError: Secrets export file ($EXPORT_FILE) was not found after running setup_infisical.sh.\033[0m"
+            exit 1
+          fi
+
+          echo "Sourcing secrets from $EXPORT_FILE to GITHUB_ENV (filtering, handling 'export' prefix, and stripping quotes)..."
+
+          # Pre-filter: drop comment lines and blank lines, keep lines with '='.
+          # `|| [ -n "$line" ]` also processes a last line without a newline.
+          grep -v '^[[:space:]]*#' "$EXPORT_FILE" | grep -v '^[[:space:]]*$' | grep '=' | \
+          while IFS= read -r line || [ -n "$line" ]; do
+            # Remove a leading "export " so the line is plain KEY=VALUE.
+            assignment="${line#export }"
+
+            # Split on the first '=': everything before is the key, the rest
+            # (which may itself contain '=') is the value.
+            key="${assignment%%=*}"
+            value="${assignment#*=}"
+
+            # Strip one leading/trailing single quote, then one leading/trailing
+            # double quote, because GITHUB_ENV takes values literally.
+            value="${value#\'}"
+            value="${value%\'}"
+            value="${value#\"}"
+            value="${value%\"}"
+
+            echo "$key=$value" >> "$GITHUB_ENV"
+          done
+
+          echo "Finished processing $EXPORT_FILE for GITHUB_ENV."
+          echo "Removing $EXPORT_FILE..."
+          rm -f "$EXPORT_FILE"
+          echo "Secrets loaded and temporary file removed."
+
+      # Authenticates to Google Cloud through a workload identity provider.
+      # Both inputs are read from the job environment (`env` context), i.e.
+      # from values the earlier steps appended to GITHUB_ENV.
+      - name: Authenticate to Google Cloud
+        id: google-auth
+        uses: google-github-actions/auth@ba79af03959ebeac9769e648f473a284504d9193
+        with:
+          workload_identity_provider: ${{ env.GCP_WORKLOAD_IDENTITY_PROVIDER }} # Now from Infisical via env
+          service_account: ${{ env.GCP_SERVICE_ACCOUNT_EMAIL }} # Now from Infisical via env
+
+      # The argument 1000 is the script's CHUNK_SIZE (lines per chunk file).
+      # The script reads ./adblock_urls.txt and writes ./processed_adblock_chunks/
+      # relative to its working directory.
+      - name: Run Adblock List Chunking Script
+        run: bash chunk_adblock_lists.sh 1000
+        working-directory: ./tofu/cf-adblock # Ensures script is run in the correct context
+
+      - name: OpenTofu Init for cf-adblock
+        run: tofu init
+        working-directory: ./tofu/cf-adblock
+
+      # -auto-approve applies without an interactive confirmation prompt;
+      # this workflow has no separate `tofu plan` step before it.
+      - name: OpenTofu Apply for cf-adblock
+        id: apply
+        shell: bash
+        run: tofu apply -auto-approve
+        working-directory: ./tofu/cf-adblock
diff --git a/.gitignore b/.gitignore
@@ -31,3 +31,5 @@ override.tf.json
 
 *.pem
 *.crt
+
+processed_adblock_chunks
diff --git a/tofu/cf-adblock/README.md b/tofu/cf-adblock/README.md
@@ -0,0 +1,79 @@
+# Cloudflare Adblock & Malware DNS Filtering
+
+OpenTofu module for Cloudflare Zero Trust Gateway DNS policies to block ads/malware. Fetches external domain lists, processes them, and updates Cloudflare.
+
+## My Usage
+
+I generally avoid self-hosting Pi-hole / AdGuard Home: when they go down, the whole network loses DNS (and, effectively, access to the internet), and setting them up for high availability is not straightforward. They also mostly cover only the home network, not mobile networks.
+
+Even if you do run Pi-hole / AdGuard Home, you can set this DoH endpoint as their upstream. After getting the DoH endpoint / IPv6 address from Cloudflare, I use this setup in the following way:
+
+1.  On browsers, Android, iOS, etc., I use the DoH endpoint directly, on top of using uBO and SponsorBlock.
+2.  My router only supports IPv4 addresses as DNS servers, so I use 1.1.1.2 / 1.0.0.2 as DNS servers to block malware by default. If your router / devices support DoH or DoT, always use it instead of plain IPv4 / IPv6.
+3. If you use Cloudflare WARP as your VPN / Zero Trust setup, your devices are automatically protected by WARP. I also use the IPv6 address as the upstream for Tailscale / NetBird, so that I am protected by default when using these as my VPN / Zero Trust setup.
+4. I use a secondary cloudflare account, using a cheap [1.111B class domain](https://gen.xyz/1111b).
+
+## Overview
+
+Enhances network security and user experience by filtering unwanted content at the DNS level using Cloudflare Gateway.
+
+**Key Components & Functionality:**
+
+1.  **`adblock_urls.txt`**:
+    *   Contains URLs to ad/malware domain lists (e.g., Hagezi). Add/Delete lists from here.
+
+2.  **`chunk_adblock_lists.sh` (Shell Script)**:
+    *   **Purpose**: Downloads domains from `adblock_urls.txt`, processes them into a unique sorted list, and splits them into chunk files (e.g., `adblock_chunk_000.txt`) in `./processed_adblock_chunks/`.
+    *   **Usage**: Used by `tofu plan/apply` and GitHub Actions to update domain lists for Cloudflare.
+
+3.  **OpenTofu Configuration (`.tofu` files)**:
+    *   **`cloudflare_zero_trust_list.tofu`**: Creates `cloudflare_zero_trust_list` resources from chunk files in `./processed_adblock_chunks/`, populating them with domains.
+    *   **`cloudflare_zero_trust_gateway_policy.tofu`**: Defines DNS Gateway policies: `block_ads` uses the generated domain lists, and `block_malware` uses Cloudflare's predefined categories.
+    *   **`cloudflare_zero_trust_dns_location.tofu` (Optional/Example)**: Sets up a custom DNS location (e.g., "HomeLab") in Cloudflare Zero Trust for DoH endpoints.
+    *   **`backend.tofu`**: Configures GCS backend for OpenTofu state (prefix: `cf-adblock/prod` or per environment).
+    *   **`providers.tofu`**: Defines Cloudflare and HTTP providers, versions, and state encryption.
+    *   **`variables.tofu`**: Defines input variables (Cloudflare details, GCS bucket, encryption passphrase).
+
+## GitHub Action Automation (`cf_adblock.yaml`)
+
+Automates blocklist updates using [github action](/.github/workflows/cf_adblock.yaml):
+
+1.  **Triggers**: Scheduled (e.g., monthly) and manual (`workflow_dispatch`) triggers.
+2.  **Setup**: Checks out code, loads `.env` variables. Authenticates to Infisical (fetches secrets for `/tofu` and `/tofu_rw`) and Google Cloud (WIF for GCS access). Sets up OpenTofu. **Important: set up a GitHub repository secret named `INFISICAL_CLIENT_SECRET` containing your Infisical client secret, in your GitHub repository settings.**
+3.  **Execution**: Runs `chunk_adblock_lists.sh` (in `tofu/cf-adblock/`) to generate domain chunks. Then runs `tofu init` followed by `tofu apply -auto-approve` to update Cloudflare (the workflow has no separate `tofu plan` step).
+
+## Required Inputs (Variables)
+
+Configure these via Infisical secrets (surfaced as `TF_VAR_...` environment variables):
+
+*   `TF_VAR_cloudflare_secondary_account_id`: Your Cloudflare Account ID for Zero Trust configurations.
+*   `TF_VAR_cloudflare_secondary_api_token`: Cloudflare API Token for Zero Trust management. **Sensitive secret.**
+*   `TF_VAR_bucket_name`: GCS bucket name for OpenTofu remote state.
+*   `TF_VAR_tofu_encryption_passphrase`: Passphrase for OpenTofu state encryption. **Sensitive secret.**
+
+## Manual Setup & Execution (Local Environment)
+
+Note: By default, every month, it updates the list, running as a [github action](/.github/workflows/cf_adblock.yaml). To run manually (e.g., in devcontainer):
+
+1.  **Prerequisites**:
+    *   Follow instructions in [devcontainer](/.devcontainer/README.md) on the steps to setup devcontainer.
+    *   `cd tofu/cf-adblock`.
+
+2.  **Prepare Domain Lists**:
+    *   Run `bash ./chunk_adblock_lists.sh <chunk_size>` (e.g., 1000).
+    *   Verify files in `./processed_adblock_chunks/`.
+
+3.  **Initialize OpenTofu**:
+    *   Run `tofu init` (uses `TF_VAR_bucket_name` & `TF_VAR_gcs_env`).
+
+4.  **Plan Changes**:
+    *   Run `tofu plan`. Review changes.
+
+5.  **Apply Changes**:
+    *   If acceptable, run `tofu apply`.
+
+Provides automated, robust ad/malware blocking via Cloudflare DNS filtering.
+
+## Acknowledgements
+
+This part of cloudflare ad-blocking was inspired by Marco Lancini's [blog post](https://blog.marcolancini.it/2022/blog-serverless-ad-blocking-with-cloudflare-gateway/) on serverless ad-blocking with Cloudflare Gateway.
diff --git a/tofu/cf-adblock/adblock_urls.txt b/tofu/cf-adblock/adblock_urls.txt
@@ -0,0 +1,6 @@
+# Remote state backend: OpenTofu state is stored in a GCS bucket.
+terraform {
+  backend "gcs" {
+    # Bucket name is taken from the `bucket_name` input variable.
+    # NOTE(review): a variable reference inside a backend block relies on
+    # OpenTofu's early (static) evaluation — confirm the tofu version in use supports it.
+    bucket = var.bucket_name
+    # Fixed prefix inside the bucket under which this state is stored.
+    prefix = "cf-adblock/prod"
+  }
+}
diff --git a/tofu/cf-adblock/backend.tofu b/tofu/cf-adblock/backend.tofu
@@ -0,0 +1,5 @@
+# Remote state backend: OpenTofu state is stored in a GCS bucket.
+terraform {
+  backend "gcs" {
+    # Bucket name is taken from the `bucket_name` input variable; no `prefix`
+    # is set in this block.
+    # NOTE(review): a variable reference inside a backend block relies on
+    # OpenTofu's early (static) evaluation — confirm the tofu version in use supports it.
+    bucket = var.bucket_name
+  }
+}
diff --git a/tofu/cf-adblock/chunk_adblock_lists.sh b/tofu/cf-adblock/chunk_adblock_lists.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+#
+# chunk_adblock_lists.sh - download domain lists and split them into chunks.
+#
+# Usage: chunk_adblock_lists.sh <CHUNK_SIZE>
+#   CHUNK_SIZE  positive integer: maximum number of lines per chunk file.
+#
+# Reads list URLs from ./adblock_urls.txt (text after '#' is a comment, blank
+# lines are ignored), downloads every list, drops comment and blank lines,
+# sorts and de-duplicates the remainder, and writes it to
+# ./processed_adblock_chunks/adblock_chunk_NNN.txt in files of at most
+# CHUNK_SIZE lines. All diagnostics go to stderr.
+#
+# Exit status: 0 on success (including "nothing to do"), non-zero on a usage
+# error, a missing URL file, more than MAX_TOTAL_DOMAINS unique entries, or
+# any failing command. A URL that cannot be downloaded only yields a warning.
+
+set -euo pipefail
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <CHUNK_SIZE>" >&2
+  echo "       Reads URLs from ./adblock_urls.txt (must be in current directory)." >&2
+  echo "       Outputs chunk files to ./processed_adblock_chunks/" >&2
+  exit 1
+fi
+
+CHUNK_SIZE="$1"
+URL_SOURCE_FILE="./adblock_urls.txt"
+OUTPUT_DIR="./processed_adblock_chunks"
+FILE_PREFIX="adblock_chunk_"
+MAX_TOTAL_DOMAINS=100000
+
+if ! [[ "$CHUNK_SIZE" =~ ^[0-9]+$ ]] || [ "$CHUNK_SIZE" -lt 1 ]; then
+  echo "Error: Chunk size must be a positive integer." >&2
+  exit 1
+fi
+
+if [ ! -f "$URL_SOURCE_FILE" ]; then
+  echo "Error: URL source file not found at $URL_SOURCE_FILE." >&2
+  exit 1
+fi
+
+# Collect the URLs. `|| [[ -n "$line" ]]` also handles a final line that has
+# no trailing newline.
+URLS=()
+while IFS= read -r line || [[ -n "$line" ]]; do
+  # Cut everything from the first '#', then trim leading and trailing
+  # whitespace with parameter expansion (no sub-processes, and no quote or
+  # backslash interpretation of the URL text).
+  url="${line%%#*}"
+  url="${url#"${url%%[![:space:]]*}"}"
+  url="${url%"${url##*[![:space:]]}"}"
+  if [ -n "$url" ]; then
+    URLS+=("$url")
+  fi
+done < "$URL_SOURCE_FILE"
+
+if [ ${#URLS[@]} -eq 0 ]; then
+  echo "No valid URLs found in $URL_SOURCE_FILE. Creating empty $OUTPUT_DIR and exiting." >&2
+  mkdir -p "$OUTPUT_DIR"
+  exit 0
+fi
+
+mkdir -p "$OUTPUT_DIR"
+echo "Output directory: $OUTPUT_DIR (relative to current directory)" >&2
+
+TMP_MERGED_CONTENT=$(mktemp)
+TMP_SORTED_UNIQUE_DOMAINS=$(mktemp)
+
+# Remove the temporary files on every exit path.
+cleanup() {
+  rm -f "$TMP_MERGED_CONTENT" "$TMP_SORTED_UNIQUE_DOMAINS"
+}
+trap cleanup EXIT
+# A signal handler that merely returns lets the script carry on after the
+# signal; exit explicitly instead (the EXIT trap still performs the cleanup).
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+echo "Downloading content from ${#URLS[@]} URLs specified in $URL_SOURCE_FILE..." >&2
+for URL in "${URLS[@]}"; do
+  echo "  Downloading: $URL" >&2
+  # -f turns HTTP errors into a failing exit status. A failed download is
+  # deliberately non-fatal: warn and continue with the remaining lists.
+  if curl -sSLf "$URL" >> "$TMP_MERGED_CONTENT"; then
+    # Guarantee a newline between lists so two entries never get glued together.
+    echo >> "$TMP_MERGED_CONTENT"
+  else
+    echo "  Warning: Failed to download or got an error for URL: $URL. Skipping." >&2
+  fi
+done
+
+echo "Processing downloaded content (filter, sort, unique)..." >&2
+# grep exits with status 1 when it selects no line. Under `set -e` with
+# `pipefail` that would abort the script before the zero-domain handling
+# below, so status 1 is accepted here; a real grep error (status 2) still fails.
+{ grep -vE '^[[:space:]]*(#|$)' "$TMP_MERGED_CONTENT" || [ "$?" -eq 1 ]; } \
+  | sort -u > "$TMP_SORTED_UNIQUE_DOMAINS"
+
+# The arithmetic expansion strips any padding some `wc` implementations emit.
+TOTAL_DOMAINS_COUNT=$(wc -l < "$TMP_SORTED_UNIQUE_DOMAINS")
+TOTAL_DOMAINS_COUNT=$((TOTAL_DOMAINS_COUNT))
+echo "Total unique domains found: $TOTAL_DOMAINS_COUNT" >&2
+
+if [ "$TOTAL_DOMAINS_COUNT" -gt "$MAX_TOTAL_DOMAINS" ]; then
+  echo "Error: Total unique domains ($TOTAL_DOMAINS_COUNT) exceeds limit of $MAX_TOTAL_DOMAINS." >&2
+  exit 1
+fi
+
+if [ "$TOTAL_DOMAINS_COUNT" -eq 0 ]; then
+    # Existing chunk files are left untouched in this case.
+    echo "No valid domains found after filtering. No chunk files will be created in $OUTPUT_DIR." >&2
+    exit 0
+fi
+
+echo "Splitting into chunks of $CHUNK_SIZE into $OUTPUT_DIR directory..." >&2
+# Remove chunk files left over from an earlier run; otherwise a run that
+# produces fewer chunks would leave stale higher-numbered files behind.
+rm -f "$OUTPUT_DIR/$FILE_PREFIX"*.txt
+# -d -a 3 gives numeric three-digit suffixes (000, 001, ...) and
+# --additional-suffix appends .txt. Passing the output directory as part of
+# the prefix avoids changing the working directory.
+split -l "$CHUNK_SIZE" -a 3 -d --additional-suffix=.txt "$TMP_SORTED_UNIQUE_DOMAINS" "$OUTPUT_DIR/$FILE_PREFIX"
+
+echo "Chunk files (e.g., ${FILE_PREFIX}000.txt) created in $OUTPUT_DIR:" >&2
+# Optional: List the created files if desired for logging
+# for f in "$OUTPUT_DIR/${FILE_PREFIX}"*.txt; do
+#   if [ -f "$f" ]; then # Check if any files were actually created
+#     echo "  $f" >&2
+#   fi
+# done
+
+echo "Script completed successfully. Chunks are in $OUTPUT_DIR" >&2
+exit 0
diff --git a/tofu/cf-adblock/cloudflare_zero_trust_dns_location.tofu b/tofu/cf-adblock/cloudflare_zero_trust_dns_location.tofu
@@ -0,0 +1,36 @@
+# Cloudflare Zero Trust DNS location "HomeLab" in the secondary account, with
+# the DoH and IPv6 endpoints enabled and the DoT and IPv4 endpoints disabled.
+resource "cloudflare_zero_trust_dns_location" "homelab" {
+  account_id     = var.cloudflare_secondary_account_id
+  name           = "HomeLab" # This will be the name in the Cloudflare dashboard
+  client_default = true      # Set to true if this should be the default location for WARP clients
+  # NOTE(review): presumably EDNS Client Subnet support, disabled here — confirm against the provider docs.
+  ecs_support    = false
+
+  endpoints = {
+    doh = {
+      enabled = true # DNS over HTTPS endpoint: enabled
+    }
+    dot = {
+      enabled = false # DNS over TLS endpoint: disabled, can be enabled if needed
+    }
+    ipv4 = {
+      enabled = false # Dedicated IPv4 DNS resolver for this location: disabled
+    }
+    ipv6 = {
+      enabled = true # Dedicated IPv6 DNS resolver for this location: enabled
+    }
+  }
+}
+
+# Exposes the resolver endpoints of the "homelab" DNS location as one object.
+output "dns_location_homelab" {
+  description = "DNS location - HomeLab (Cloudflare-assigned IPs)"
+  value = {
+    # These attributes will be populated with the unique IPs/hostnames assigned by Cloudflare
+    # after a successful 'tofu apply'.
+    # DoH URL built from the location's doh_subdomain attribute.
+    doh                     = "https://${cloudflare_zero_trust_dns_location.homelab.doh_subdomain}.cloudflare-gateway.com/dns-query"
+    # NOTE(review): the ipv4 endpoint is disabled in the homelab resource, so the
+    # two IPv4 values may not be usable resolvers — verify.
+    ipv4_destination        = cloudflare_zero_trust_dns_location.homelab.ipv4_destination
+    ipv4_destination_backup = cloudflare_zero_trust_dns_location.homelab.ipv4_destination_backup # May not be populated if only one IPv4 is assigned
+    # 'ip' might be populated with an IPv6 if ipv6 endpoint is enabled and assigned.
+    # For IPv4, refer to ipv4_destination.
+    ip = cloudflare_zero_trust_dns_location.homelab.ip
+    # dns_destination_ipv6_block_id is not relevant when Cloudflare assigns IPs.
+  }
+}
diff --git a/tofu/cf-adblock/cloudflare_zero_trust_gateway_policy.tofu b/tofu/cf-adblock/cloudflare_zero_trust_gateway_policy.tofu
@@ -0,0 +1,45 @@
+# Turns the chunk files on disk into a map of
+# chunk filename => { domains_in_chunk = [domain, ...] }.
+locals {
+  # Directory where the external chunking script (run before tofu plan/apply)
+  # writes the chunked domain files. This directory should be gitignored.
+  # Example: <module_path>/processed_adblock_chunks/
+  chunk_files_output_dir = "${path.module}/processed_adblock_chunks"
+  chunk_file_name_pattern = "adblock_chunk_*.txt" # Pattern the script uses for output files
+
+  # Discover all chunk files created by the script.
+  # fileset() returns a set of file names relative to chunk_files_output_dir;
+  # the set is empty when no chunk file exists.
+  discovered_chunk_filenames = fileset(local.chunk_files_output_dir, local.chunk_file_name_pattern)
+
+  # Step 1: Map each filename to its file content with leading/trailing
+  # whitespace removed.
+  chunk_file_contents = {
+    for filename in local.discovered_chunk_filenames :
+    filename => trimspace(file("${local.chunk_files_output_dir}/${filename}"))
+  }
+
+  # Step 2: Split each file's content on "\n" and drop empty strings, giving
+  # one list of domain strings per chunk file.
+  list_definitions_from_files = {
+    for filename, content_str in local.chunk_file_contents : # Iterate over the pre-processed content
+    filename => {
+      domains_in_chunk          = [for d in split("\n", content_str) : d if d != ""]
+    }
+  }
+
+  # NOTE(review): no local collecting the IDs of the created lists for the
+  # 'block_ads' policy is defined in this block; presumably that happens where
+  # the policy is declared — verify.
+}
+
+# One Cloudflare Zero Trust list of type DOMAIN per discovered chunk file.
+resource "cloudflare_zero_trust_list" "adblock_domain_lists" {
+  account_id = var.cloudflare_secondary_account_id
+
+  # for_each iterates over the map of discovered chunk filenames and their processed domains.
+  # each.key is the filename (e.g., "adblock_chunk_000.txt").
+  # each.value is an object with the single attribute domains_in_chunk (a list of domain strings).
+  for_each   = local.list_definitions_from_files
+
+  # Create a Cloudflare list name derived from the chunk filename for stability and traceability.
+  # Example: "adblock_chunk_000.txt" becomes "ad-list-adblock-chunk-000".
+  name        = "ad-list-${replace(replace(each.key, ".txt", ""), "_", "-")}"
+  type        = "DOMAIN"
+  # Each domain string is wrapped in an object of the form { value = <domain> }.
+  items       = [for domain_str in each.value.domains_in_chunk : { value = domain_str }]
+  description = "Adblock list. Source chunk file: ${each.key}. Managed by Terraform."
+}
diff --git a/tofu/cf-adblock/cloudflare_zero_trust_list.tofu b/tofu/cf-adblock/cloudflare_zero_trust_list.tofu
diff --git a/tofu/cf-adblock/providers.tofu b/tofu/cf-adblock/providers.tofu
diff --git a/tofu/cf-adblock/variables.tofu b/tofu/cf-adblock/variables.tofu

-Original file line number
+Diff line change
 *.pem
 *.crt
++
 +processed_adblock_chunks