From 8d60c7f93407988ee0232ea90980028f299cb0f3 Mon Sep 17 00:00:00 2001 From: srdusr Date: Fri, 26 Sep 2025 13:39:28 +0200 Subject: Initial Commit --- scripts/extract_texts.js | 113 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 scripts/extract_texts.js (limited to 'scripts/extract_texts.js') diff --git a/scripts/extract_texts.js b/scripts/extract_texts.js new file mode 100644 index 0000000..acd5660 --- /dev/null +++ b/scripts/extract_texts.js @@ -0,0 +1,113 @@ +#!/usr/bin/env node +/* + Extract paragraphs from mirrored sites under similar/ to build a large texts.json. + - Scans HTML files in similar/play.typeracer.com, similar/monkeytype.com, etc. + - Extracts visible text nodes, splits into paragraphs, filters by length. + - Deduplicates and shuffles, attaches category from source directory, and attribution as the source path. + - Writes to repo-root texts.json for both CLI and Web to use. +*/ +const fs = require('fs'); +const path = require('path'); +const { glob } = require('glob'); +const cheerio = require('cheerio'); + +const ROOT = path.resolve(__dirname, '..'); +const SIMILAR_DIR = path.join(ROOT, 'similar'); +const OUTPUT = path.join(ROOT, 'texts.json'); + +function isLikelyVisibleText(text) { + const t = text.replace(/\s+/g, ' ').trim(); + if (!t) return false; + if (t.length < 60) return false; // avoid too-short snippets + // avoid nav/footer boilerplate + if (/©|copyright|cookie|privacy|terms|policy|subscribe|sign in|login|menu|footer|header/i.test(t)) return false; + return true; +} + +function splitIntoParagraphs(text) { + // Split by double newline or sentence groups + const blocks = text + .split(/\n\s*\n|\r\n\r\n/) + .map(s => s.replace(/\s+/g, ' ').trim()) + .filter(Boolean); + const paras = []; + for (const b of blocks) { + // Further chunk into 80-350 char ranges + if (b.length <= 400) { + paras.push(b); + } else { + let start = 0; + while (start < b.length) { + let end = Math.min(start + 350, b.length); + // try to cut at sentence boundary + const slice = b.slice(start, end); + const lastPeriod = slice.lastIndexOf('. '); + const lastComma = slice.lastIndexOf(', '); + const cut = lastPeriod > 150 ? lastPeriod + 1 : (lastComma > 150 ? lastComma + 1 : slice.length); + paras.push(slice.slice(0, cut).trim()); + start += cut; + } + } + } + return paras; +} + +(async () => { + try { + const htmlFiles = await glob('**/*.html', { cwd: SIMILAR_DIR, absolute: true, dot: false, nodir: true }); + const items = []; + const seen = new Set(); + + for (const file of htmlFiles) { + const rel = path.relative(SIMILAR_DIR, file); + const parts = rel.split(path.sep); + const category = parts[0]?.replace(/\W+/g, '').toLowerCase() || 'general'; + const attribution = `similar/${rel}`; + + const html = fs.readFileSync(file, 'utf8'); + const $ = cheerio.load(html); + + // Remove script/style/nav/footer elements + $('script, style, nav, footer, header, noscript').remove(); + // Collect text from paragraphs and common content containers + const textBits = []; + $('p, article, main, section, .content, .text, .article, .post').each((_, el) => { + const t = $(el).text(); + if (isLikelyVisibleText(t)) textBits.push(t); + }); + + const combined = textBits.join('\n\n'); + if (!combined.trim()) continue; + + const paras = splitIntoParagraphs(combined) + .map(s => s.replace(/\s+/g, ' ').trim()) + .filter(s => s.length >= 80 && s.length <= 400); + + for (const content of paras) { + const key = content.toLowerCase(); + if (seen.has(key)) continue; + seen.add(key); + items.push({ category, content, attribution }); + } + } + + // Shuffle + for (let i = items.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [items[i], items[j]] = [items[j], items[i]]; + } + + // If not enough, keep existing texts.json and merge + let existing = []; + if (fs.existsSync(OUTPUT)) { + try { existing = JSON.parse(fs.readFileSync(OUTPUT, 'utf8')); } catch {} + } + const merged = [...items, ...existing].slice(0, 5000); // cap to 5k entries + + fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2)); + console.log(`Wrote ${merged.length} texts to ${OUTPUT}`); + } catch (err) { + console.error('extract_texts failed:', err); + process.exit(1); + } +})(); -- cgit v1.2.3