<?php

declare(strict_types=1);

// Usage:
//   php /app/data/scripts/generate_indieblog_daily_rss.php dry-run max-results=150 date=YYYY-MM-DD
//
// Fetches the IndieBlog JSON export, drops entries that are older than ten
// days, already listed in the cache file, or whose title matches one of the
// exclude patterns, and writes an RSS feed containing a single "Links for
// <date>" item for the target date (only when at least 10 entries remain).
//
// Arguments (all optional, CLI only):
//   dry-run          Print what would be kept/dropped and exit without writing.
//   max-results=N    Maximum number of titles printed per list in dry-run mode.
//   date=YYYY-MM-DD  Target day for the feed item (defaults to today).
//
// Exit status is 0 on success and 1 on any fetch/parse/write failure.

// Configuration
$jsonUrl = 'https://indieblog.page/export';
$rssFile = '/app/data/public/indieblogv2_feed.xml';
$cacheFile = '/app/data/cache/indieblogv2_cache.txt';
$now = time();
$cutoffDate = strtotime('-10 days', $now); // Only last 10 days

// $argv only exists under the CLI SAPI; use an empty list everywhere else so
// the option parsing below never touches an undefined variable.
$cliArgs = (PHP_SAPI === 'cli' && isset($argv) && is_array($argv)) ? $argv : [];

// Allow target date override via CLI: date=YYYY-MM-DD (first match wins)
$targetDate = null;
foreach ($cliArgs as $arg) {
    if (preg_match('/^date=(\d{4}-\d{2}-\d{2})$/', $arg, $matches) === 1) {
        $targetDate = $matches[1];
        break;
    }
}
if (!$targetDate) {
    $targetDate = date('Y-m-d');
}

// List of exclude patterns (case insensitive).
// NOTE(review): the alternatives are unanchored, so short ones such as "JO",
// "foot" or "may" also match inside longer words - confirm that is intended.
$excludePatterns = [
    '(?i)(azure|powershell|rimworld|Olympique|Olympic|Paris 2024|sportif|windows|twitter|freebsd|mastodon|kirby|flask|wp engine|cobol)',
    '(?i)(metalcore|deathcore|death metal|black metal|death punk|deathpunk|djent|blackmetal|blackened|bitcoin)',
    '(?i)(JO|foot|menace|de mort|iptv|ip tv|automated archives|announcing systemd)',
    '(?i)(guerre|bluesky|combat|terror|craindre|fear|décliner|declin|climat|inégalité|fuite|ont fui|a fui|musk|tesla|voitures|assassiné|dying|indifference|feux|abandon|chômage|scam|arnaque|fraud|inquiét|openai)',
    '(?i)(victime|violence|violent|violemm|pénurie|impôt|taxes|deuil|chasse|bombes|a perdu|vide exist|fake|poursuivi|subir|pauvre|riches|politique|politic|Macron|marine le pen|michel barnier|zemmour)',
    '(?i)(linux mint|Matignon|tremble|aucun n\'a|décès|meurtre|crise|difficile|marre|action contre|irrecevable|pollution|élimination|pourrai|tourisme|mauvaise humeur|stress|dangereux|détruit|insupportable|de droite)',
    '(?i)(souffre|chaleur|cyber|attaque|homophob|extrême.droite|risque|travail|offensi|sacrif|canard pc|photos|videos|crisis|trump|nocif|negatif|négative|poids|plainte|escro|usurp|contamin|advertising)',
    '(?i)(anniversary edition|en voyage|extermin|kamikaze|nationalis|mittal|est prévu pour 202|is planned for|due in 202|incendies|fires|inondation|ligue 1|le suicide|gaza|terreur|capitalism|attentat)',
    '(?i)(calvaire|calvitie|la gauche|esclave|paris|ségrégation|athlète|précarité|Hezbollah|Israël|morts|frappes|marine lepen|dérives|un drame|sans-abri|protobuf|paralympique|nazi|mélenchon|ministre)',
    '(?i)(Review: (?:[1-3]\.\d|[1-2]\.\d|3\.0)/5\.0|licenciement|misère|du jamais vu|il faut|Palestiniens|israél|palestine|vaccin|mpox|épidémie|violeur|rapist|pédo|criminel|poutine|le RN)',
    '(?i)(réseau social x|gendarme|policier|kurde|condamnation|perquisition|incendi|gabriel attal|attaqué|justice|tués par|tué par|snapdragon|qualcomm|porn|barnier|occultes|désinformation|immigration|racism|spacex|islam|condamné|de viol|pour viol)',
    // Year-related filters
    '(?i)(\b\d{4}\b|(?:\b(?:at|in|of)\s\d{4}\b)|(?:\b\d{4}\b\.$|\(\d{4}\)))',
    '(?i)(^202\d\b|\b202\d$)',
    '(?i)(\bin\s202\d|\bat\s202\d)',
    '(?i)(my\s.*\b202\d\b)',
    // Exclude non-ASCII characters, Cyrillic, Greek, Han, etc.
    '[^\x00-\x7F]',
    '[\p{Cyrillic}]',
    '[\p{Greek}]',
    '[\p{Han}\p{Hiragana}\p{Katakana}]',
    '[äöüßÄÖÜẞ]',
    '(?i)(വേമ്പനാട്ട്)',
    '(?i)(the week|week notes|weekly|weeknote|digest)',
    '(?i)(new year|yearly)',
    '(?i)(monday|tuesday|wednesday|thursday|friday|saturday|sunday)',
    '(?i)(january|february|march|april|may|june|july|august|september|october|november|december)',
    '(?i)(today|yesterday|tomorrow|this year|this week)',
    '(?i)(merry christmas)',
];

// Escape the delimiter inside each pattern, then join everything into one
// alternation so every title is tested with a single preg_match() call.
$excludePatterns = array_map(
    static fn (string $pattern): string => str_replace('/', '\/', $pattern),
    $excludePatterns
);
$excludeRegex = '/' . implode('|', $excludePatterns) . '/i';

// Check for dry-run mode
$isDryRun = in_array('dry-run', $cliArgs, true);
$maxResults = 100; // Default limit; the last max-results=N argument wins
foreach ($cliArgs as $arg) {
    if (preg_match('/^max-results=(\d+)$/', $arg, $matches) === 1) {
        $maxResults = (int) $matches[1];
    }
}

$jsonData = file_get_contents($jsonUrl);
if ($jsonData === false || $jsonData === '') {
    abortWithError("Error: Unable to fetch JSON from $jsonUrl\n");
}

// A valid export decodes to an array; an empty array is valid and simply
// produces a feed without items.
$data = json_decode($jsonData, true);
if (!is_array($data)) {
    abortWithError("Error: Invalid JSON data from $jsonUrl\n");
}

// IDs that were already published by earlier runs, one per line.
$includedIds = [];
if (is_file($cacheFile)) {
    $includedIds = file($cacheFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($includedIds === false) {
        abortWithError("Error: Unable to read cache file $cacheFile\n");
    }
}
// Keyed copy of the cache so the membership test below is a hash lookup.
$alreadyIncluded = array_fill_keys($includedIds, true);

// Filter recent articles, not already included, and not matching exclude regex
$newItems = [];
$excludedItems = [];
foreach ($data as $item) {
    if (!is_array($item) || !isset($item['published'], $item['itemid'], $item['itemtitle'])) {
        continue;
    }
    // Entries whose id or title cannot be represented as a string are malformed.
    if (!is_scalar($item['itemid']) || !is_scalar($item['itemtitle'])) {
        continue;
    }

    $itemId = (string) $item['itemid'];
    $itemTitle = (string) $item['itemtitle'];

    $isExcluded = isset($alreadyIncluded[$itemId])
        || publishedTimestamp($item, $now) < $cutoffDate
        || preg_match($excludeRegex, $itemTitle) === 1;

    if ($isExcluded) {
        $excludedItems[] = $item;
    } else {
        $newItems[] = $item;
    }
}

// Newest first in both lists.
$newestFirst = static fn (array $a, array $b): int
    => publishedTimestamp($b, $now) <=> publishedTimestamp($a, $now);
usort($newItems, $newestFirst);
usort($excludedItems, $newestFirst);

// Dry run: display filtered items and stop before anything is written.
if ($isDryRun) {
    echo "=== Dry Run Mode ===\n";

    echo "Filtered IN (" . count($newItems) . " total, showing up to $maxResults):\n";
    foreach (array_slice($newItems, 0, $maxResults) as $item) {
        echo "- " . ($item['itemtitle'] ?? 'No title') . "\n";
    }

    echo "\nFiltered OUT (" . count($excludedItems) . " total, showing up to $maxResults):\n";
    foreach (array_slice($excludedItems, 0, $maxResults) as $item) {
        echo "- " . ($item['itemtitle'] ?? 'No title') . "\n";
    }

    exit(0);
}

// Keep only the items published on the target date
$groupedByDay = [];
foreach ($newItems as $item) {
    $day = date('Y-m-d', publishedTimestamp($item, $now));
    if ($day !== $targetDate) {
        continue;
    }
    $groupedByDay[$day][] = $item;
}

$rss = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"></rss>');
$channel = $rss->addChild('channel');
$channel->addChild('title', 'IndieBlog Feed (Filtered, Last 10 Days)');
$channel->addChild('link', 'https://indieblog.page/');
$channel->addChild('description', 'RSS feed of articles from the last 10 days, excluding unwanted topics');
$channel->addChild('language', 'en');

$newEntries = [];
foreach ($groupedByDay as $day => $items) {
    // A day is only published once it has at least 10 links.
    if (count($items) < 10) {
        continue;
    }

    // The fragment is a hash of the day's items; it is used for link and guid.
    $fingerprint = md5((string) json_encode($items, JSON_INVALID_UTF8_SUBSTITUTE));
    $permalink = "https://indieblog.page/$day#" . $fingerprint;

    $rssItem = $channel->addChild('item');
    $rssItem->addChild('title', "Links for $day");
    $rssItem->addChild('link', $permalink);
    $rssItem->addChild('guid', $permalink);
    $rssItem->addChild('pubDate', date(DATE_RSS, (int) strtotime($day)));

    $description = "<ul>";
    foreach ($items as $item) {
        $postTitle = escapeHtml($item['itemtitle'] ?? null, 'No title');
        $postUrl = escapeHtml($item['itemurl'] ?? null, '#');
        $blogTitle = escapeHtml($item['feedtitle'] ?? null, 'Unknown Blog');
        $blogUrl = escapeHtml($item['feedurl'] ?? null, '#');
        $description .= "<li><a href=\"$postUrl\">$postTitle</a> (<a href=\"$blogUrl\">$blogTitle</a>)</li>";
        $newEntries[] = (string) $item['itemid'];
    }
    $description .= "</ul>";
    addCData($rssItem->addChild('description'), $description);
}

// Output directories are only needed for a real run, so create them here.
foreach ([dirname($rssFile), dirname($cacheFile)] as $directory) {
    if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
        abortWithError("Error: Unable to create directory $directory\n");
    }
}

// Write the feed first: if it fails, the cache stays untouched and the same
// entries are picked up again by the next run instead of being lost.
if ($rss->asXML($rssFile) === false) {
    abortWithError("Error: Unable to write RSS feed to $rssFile\n");
}

if (!empty($newEntries)) {
    $cacheContent = implode("\n", array_merge($includedIds, $newEntries));
    if (file_put_contents($cacheFile, $cacheContent, LOCK_EX) === false) {
        abortWithError("Error: Unable to write cache file $cacheFile\n");
    }
}

/**
 * Appends $content to $node as a CDATA section.
 *
 * @param SimpleXMLElement $node    Element that receives the CDATA child.
 * @param string           $content Raw text to wrap.
 */
function addCData(SimpleXMLElement $node, $content): void
{
    $domNode = dom_import_simplexml($node);
    $domOwner = $domNode->ownerDocument;
    $domNode->appendChild($domOwner->createCDATASection((string) $content));
}

/**
 * Returns the item's "published" value as an integer timestamp.
 *
 * @param array $item     Decoded export entry.
 * @param int   $fallback Timestamp used when "published" is missing or not numeric.
 */
function publishedTimestamp(array $item, int $fallback): int
{
    $published = $item['published'] ?? null;

    return is_numeric($published) ? (int) $published : $fallback;
}

/**
 * Escapes a feed-supplied value for use in HTML text and quoted attributes.
 *
 * Both quote characters are escaped because the result is placed inside
 * href="..."; invalid UTF-8 is substituted instead of yielding an empty string.
 *
 * @param mixed  $value    Raw value from the export; non-scalars use $fallback.
 * @param string $fallback Text used when $value is null or not scalar.
 */
function escapeHtml($value, string $fallback): string
{
    $text = is_scalar($value) ? (string) $value : $fallback;

    return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * Prints $message (to STDERR when available) and terminates with exit code 1.
 */
function abortWithError(string $message): void
{
    if (defined('STDERR')) {
        fwrite(STDERR, $message);
    } else {
        echo $message;
    }

    exit(1);
}