-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_worker.js
More file actions
144 lines (119 loc) · 5.48 KB
/
_worker.js
File metadata and controls
144 lines (119 loc) · 5.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/**
* Copyright Dani Akash (https://github.com/DaniAkash)
* SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
* https://polyformproject.org/licenses/noncommercial/1.0.0
*/
/**
* WikiWall Cloudflare Worker
*
* GET /api/batch?count=N
* Streams filtered, display-ready Wikipedia articles as NDJSON
* (one JSON object per line) as soon as each one passes the filters.
 * The client can start displaying slides as soon as the first article
 * arrives instead of waiting for all N to be fetched.
*
* Everything else falls through to static assets (index.html, etc.)
*/
// ── Wikipedia API ──────────────────────────────────────────────────────────
// REST endpoint that returns one random article summary per request.
const WIKI_URL = 'https://en.wikipedia.org/api/rest_v1/page/random/summary';
// Wikimedia API etiquette asks clients to send a descriptive User-Agent.
const WIKI_HEADERS = {
  'User-Agent': 'WikiWall/1.0 (https://github.com/2dapps/wikiwall)',
  'Accept': 'application/json',
};
// ── Fetch tuning ───────────────────────────────────────────────────────────
const WAVE_SIZE = 10; // parallel Wikipedia requests per wave
const WAVE_DELAY = 300; // ms between waves — polite gap, faster on server-side
// NOTE(review): the handler's default count is 10 but gets clamped to this
// cap, so every request effectively streams at most 5 articles — confirm
// that is intended.
const MAX_COUNT = 5; // hard cap per request — keeps Worker well within time limits
// ── Image quality thresholds ───────────────────────────────────────────────
// Only articles whose original image is at least 720p-sized are accepted.
const MIN_IMG_WIDTH = 1280;
const MIN_IMG_HEIGHT = 720;
// ── NSFW / inappropriate content blocklist ─────────────────────────────────
// Applied to title + description + extract before including an article.
// Entries are lowercase substrings matched case-insensitively; some are
// stems on purpose (e.g. 'decompos' matches 'decomposed'/'decomposition').
const BLOCKLIST = [
  'murder', 'homicide', 'massacre', 'genocide', 'war crime',
  'pornograph', 'nude', 'naked', 'nudity', 'erotic',
  'sexual', 'genitalia', 'penis', 'vagina', 'breast',
  'autopsy', 'corpse', 'cadaver', 'decompos',
  'execution', 'beheading', 'torture', 'mutilat',
  'suicide', 'self-harm',
  'surgery', 'surgical wound', 'graphic injury',
  'drug use', 'overdose',
];
// ── Helpers ────────────────────────────────────────────────────────────────
/**
 * Decide whether a random-summary payload is suitable for display.
 *
 * An article qualifies only if it carries a sufficiently large original
 * image, a usable thumbnail URL, a normalized title, a non-trivial extract,
 * and none of the blocklisted terms anywhere in its visible text.
 *
 * @param {object|null} a - parsed Wikipedia summary response (null on fetch failure)
 * @returns {boolean} true when the article passes every filter
 */
function isGoodArticle(a) {
  if (!a) return false;

  // Reject unless the original image meets the minimum dimensions.
  const original = a.originalimage;
  if (!original || original.width < MIN_IMG_WIDTH || original.height < MIN_IMG_HEIGHT) {
    return false;
  }

  // A thumbnail URL is required for rendering.
  if (!a.thumbnail?.source) return false;

  // Require enough prose to show something meaningful.
  const extract = a.extract ?? '';
  if (extract.length < 80) return false;

  // And a normalized title to label the slide with.
  const title = a.titles?.normalized;
  if (!title) return false;

  // NSFW screen — scan title, description and extract as one lowercase string.
  const haystack = `${title} ${a.description ?? ''} ${extract}`.toLowerCase();
  return !BLOCKLIST.some((term) => haystack.includes(term));
}
/**
 * Convert a filtered Wikipedia summary into the slide shape the client renders.
 *
 * @param {object} a - article that already passed isGoodArticle (thumbnail,
 *   titles.normalized and extract are guaranteed present)
 * @returns {{image: string, title: string, extract: string, category: string}}
 */
function toSlide(a) {
  // Rewrite the thumbnail URL to request a high-res version for crisp display.
  // Wikimedia's thumbnail service rejects widths larger than the original
  // bitmap, so cap the requested width at the source image's width (the
  // filter only guarantees >= 1280px, not >= 1920px).
  const targetWidth = Math.min(1920, a.originalimage?.width ?? 1920);
  const image = a.thumbnail.source.replace(/\/\d+px-/, `/${targetWidth}px-`);

  // First two sentences — enough context without overflowing the screen.
  // Trailing dots are normalized so the result always ends with exactly one.
  const firstTwoSentences = a.extract
    .split('. ')
    .slice(0, 2)
    .join('. ')
    .trim()
    .replace(/\.+$/, '');

  return {
    image,
    title: a.titles.normalized,
    extract: `${firstTwoSentences}.`,
    category: a.description ?? 'Wikipedia',
  };
}
// ── Worker entry point ─────────────────────────────────────────────────────
export default {
  /**
   * Route /api/batch to the streaming NDJSON endpoint; every other path
   * falls through to the static-asset binding.
   *
   * @param {Request} request - incoming HTTP request
   * @param {{ ASSETS: { fetch(req: Request): Promise<Response> } }} env
   * @param {{ waitUntil?(p: Promise<any>): void }} [ctx] - Workers execution context
   * @returns {Promise<Response>}
   */
  async fetch(request, env, ctx) {
    const { pathname, searchParams } = new URL(request.url);

    if (pathname !== '/api/batch') {
      // All other routes — serve static assets (index.html, etc.)
      return env.ASSETS.fetch(request);
    }

    // Parse ?count=N, defaulting to 10 and clamping to [1, MAX_COUNT].
    // A non-numeric value falls back to the default instead of producing
    // NaN (which previously short-circuited the loop and streamed nothing).
    const requested = Number.parseInt(searchParams.get('count') ?? '10', 10);
    const count = Math.min(Math.max(1, Number.isNaN(requested) ? 10 : requested), MAX_COUNT);

    const encoder = new TextEncoder();
    const { readable, writable } = new TransformStream();
    const writer = writable.getWriter();

    // Upper bound on fetch waves so a long run of filtered-out articles (or
    // a Wikipedia outage) cannot loop forever hammering the API.
    const MAX_WAVES = 20;

    // Fetch and filter articles in the background, writing each one to the
    // stream as soon as it passes. The client can start rendering immediately
    // rather than waiting for the full batch to be assembled.
    const pump = (async () => {
      let sent = 0;
      for (let wave = 0; wave < MAX_WAVES && sent < count; wave++) {
        const results = await Promise.all(
          Array.from({ length: WAVE_SIZE }, () =>
            fetch(WIKI_URL, { headers: WIKI_HEADERS })
              .then((r) => (r.ok ? r.json() : null))
              .catch(() => null)
          )
        );
        for (const article of results) {
          if (sent >= count) break;
          if (!isGoodArticle(article)) continue;
          // One JSON object per line (NDJSON) — the client reads these one by one.
          await writer.write(encoder.encode(JSON.stringify(toSlide(article)) + '\n'));
          sent++;
        }
        // Polite gap before the next wave (skipped once we're done or capped).
        if (sent < count && wave + 1 < MAX_WAVES) {
          await new Promise((resolve) => setTimeout(resolve, WAVE_DELAY));
        }
      }
    })()
      // A rejected write usually means the client went away; swallow it so the
      // background task never surfaces as an unhandled promise rejection.
      .catch(() => {})
      .finally(() => writer.close().catch(() => {}));

    // Keep the Worker alive until the producer finishes; optional-chained so
    // the handler still works if invoked without an execution context.
    ctx?.waitUntil?.(pump);

    return new Response(readable, {
      headers: {
        'Content-Type': 'application/x-ndjson',
        'Cache-Control': 'no-store',
      },
    });
  },
};