author    srdusr <trevorgray@srdusr.com>  2025-09-26 13:39:28 +0200
committer srdusr <trevorgray@srdusr.com>  2025-09-26 13:39:28 +0200
commit    8d60c7f93407988ee0232ea90980028f299cb0f3 (patch)
tree      b343b691d1bce64fb3bc9b40324857486f2be244 /scripts/extract_texts.js
parent    76f0d0e902e6ed164704572bd81faa5e5e560cf3 (diff)
download  typerpunk-8d60c7f93407988ee0232ea90980028f299cb0f3.tar.gz
          typerpunk-8d60c7f93407988ee0232ea90980028f299cb0f3.zip
Initial Commit
Diffstat (limited to 'scripts/extract_texts.js')
-rw-r--r--  scripts/extract_texts.js  113
1 file changed, 113 insertions(+), 0 deletions(-)
diff --git a/scripts/extract_texts.js b/scripts/extract_texts.js
new file mode 100644
index 0000000..acd5660
--- /dev/null
+++ b/scripts/extract_texts.js
@@ -0,0 +1,113 @@
+#!/usr/bin/env node
+/*
+  Extract paragraphs from mirrored sites under similar/ to build a large texts.json.
+  - Scans HTML files in similar/play.typeracer.com, similar/monkeytype.com, etc.
+  - Extracts visible text nodes, splits them into paragraphs, and filters by length.
+  - Deduplicates and shuffles the results, attaching a category from the source directory and the source path as attribution.
+  - Writes to repo-root texts.json for both the CLI and the Web to use.
+*/
+const fs = require('fs');
+const path = require('path');
+const { glob } = require('glob');
+const cheerio = require('cheerio');
+
+const ROOT = path.resolve(__dirname, '..');
+const SIMILAR_DIR = path.join(ROOT, 'similar');
+const OUTPUT = path.join(ROOT, 'texts.json');
+
+function isLikelyVisibleText(text) {
+  const t = text.replace(/\s+/g, ' ').trim();
+  if (!t) return false;
+  if (t.length < 60) return false; // avoid too-short snippets
+  // avoid nav/footer boilerplate
+  if (/©|copyright|cookie|privacy|terms|policy|subscribe|sign in|login|menu|footer|header/i.test(t)) return false;
+  return true;
+}
+
+function splitIntoParagraphs(text) {
+  // Split on blank lines; long blocks are chunked at sentence boundaries below
+  const blocks = text
+    .split(/\r?\n\s*\r?\n/)
+    .map(s => s.replace(/\s+/g, ' ').trim())
+    .filter(Boolean);
+  const paras = [];
+  for (const b of blocks) {
+    // Keep blocks of <=400 chars whole; chunk longer ones into ~350-char pieces
+    if (b.length <= 400) {
+      paras.push(b);
+    } else {
+      let start = 0;
+      while (start < b.length) {
+        const end = Math.min(start + 350, b.length);
+        // try to cut at a sentence boundary
+        const slice = b.slice(start, end);
+        const lastPeriod = slice.lastIndexOf('. ');
+        const lastComma = slice.lastIndexOf(', ');
+        const cut = lastPeriod > 150 ? lastPeriod + 1 : (lastComma > 150 ? lastComma + 1 : slice.length);
+        paras.push(slice.slice(0, cut).trim());
+        start += cut;
+      }
+    }
+  }
+  return paras;
+}
+
+(async () => {
+  try {
+    const htmlFiles = await glob('**/*.html', { cwd: SIMILAR_DIR, absolute: true, dot: false, nodir: true });
+    const items = [];
+    const seen = new Set();
+
+    for (const file of htmlFiles) {
+      const rel = path.relative(SIMILAR_DIR, file);
+      const parts = rel.split(path.sep);
+      const category = parts[0]?.replace(/\W+/g, '').toLowerCase() || 'general';
+      const attribution = `similar/${rel}`;
+
+      const html = fs.readFileSync(file, 'utf8');
+      const $ = cheerio.load(html);
+
+      // Remove script/style/nav/footer elements
+      $('script, style, nav, footer, header, noscript').remove();
+      // Collect text from paragraphs and common content containers (nested matches can repeat text; the dedup below drops exact duplicates)
+      const textBits = [];
+      $('p, article, main, section, .content, .text, .article, .post').each((_, el) => {
+        const t = $(el).text();
+        if (isLikelyVisibleText(t)) textBits.push(t);
+      });
+
+      const combined = textBits.join('\n\n');
+      if (!combined.trim()) continue;
+
+      const paras = splitIntoParagraphs(combined)
+        .map(s => s.replace(/\s+/g, ' ').trim())
+        .filter(s => s.length >= 80 && s.length <= 400);
+
+      for (const content of paras) {
+        const key = content.toLowerCase();
+        if (seen.has(key)) continue;
+        seen.add(key);
+        items.push({ category, content, attribution });
+      }
+    }
+
+    // Shuffle (Fisher-Yates)
+    for (let i = items.length - 1; i > 0; i--) {
+      const j = Math.floor(Math.random() * (i + 1));
+      [items[i], items[j]] = [items[j], items[i]];
+    }
+
+    // Merge with any existing texts.json so previously collected entries are kept
+    let existing = [];
+    if (fs.existsSync(OUTPUT)) {
+      try { const parsed = JSON.parse(fs.readFileSync(OUTPUT, 'utf8')); if (Array.isArray(parsed)) existing = parsed; } catch { /* ignore a malformed file */ }
+    }
+    const merged = [...items, ...existing.filter(e => e && typeof e.content === 'string' && !seen.has(e.content.toLowerCase()))].slice(0, 5000); // dedupe against new items, cap at 5k entries
+
+    fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2));
+    console.log(`Wrote ${merged.length} texts to ${OUTPUT}`);
+  } catch (err) {
+    console.error('extract_texts failed:', err);
+    process.exit(1);
+  }
+})();
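
For reference, each entry the script writes has the shape { category, content, attribution }, as built in the items.push call above. Below is a minimal consumer sketch, not part of this commit and given only for illustration; it assumes it is run from the repo root next to the generated texts.json, and it prints one randomly chosen paragraph with its source path.

#!/usr/bin/env node
// Hypothetical consumer sketch: load the texts.json produced by
// scripts/extract_texts.js and print one randomly chosen entry.
const fs = require('fs');

// Assumes the current working directory is the repo root where texts.json is written
const texts = JSON.parse(fs.readFileSync('texts.json', 'utf8'));
if (!Array.isArray(texts) || texts.length === 0) {
  console.error('texts.json is missing or empty; run scripts/extract_texts.js first');
  process.exit(1);
}
const pick = texts[Math.floor(Math.random() * texts.length)];
console.log(`[${pick.category}] ${pick.content}`);
console.log(`source: ${pick.attribution}`);

To generate texts.json in the first place, install the two dependencies (npm install glob cheerio) and run node scripts/extract_texts.js; the script logs how many texts it wrote.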