From c6b78460a4f0d8eaaa18f5a023b39bd7415bf695 Mon Sep 17 00:00:00 2001 From: SansGuidon Date: Wed, 8 Jan 2025 15:02:19 +0000 Subject: [PATCH] update(indieblog) enrich exclusion rules and add dry-run --- cron/generate_indieblog_daily_rss.php | 86 ++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 16 deletions(-) diff --git a/cron/generate_indieblog_daily_rss.php b/cron/generate_indieblog_daily_rss.php index d82a174..043b6dc 100644 --- a/cron/generate_indieblog_daily_rss.php +++ b/cron/generate_indieblog_daily_rss.php @@ -1,5 +1,5 @@ str_replace('/', '\/', $pattern), $excludePatterns); // Crée une expression régulière unique pour matcher toutes les exclusions $excludeRegex = '/' . implode('|', $excludePatterns) . '/i'; +// Vérifier si le script est en mode dry run +$isDryRun = in_array('dry-run', $argv); +$maxResults = 100; // Limite par défaut pour la sortie +foreach ($argv as $arg) { + if (preg_match('/^max-results=(\d+)$/', $arg, $matches)) { + $maxResults = (int)$matches[1]; + } +} + @mkdir(dirname($rssFile), 0777, true); @mkdir(dirname($cacheFile), 0777, true); @@ -53,15 +74,48 @@ if (!$data) die("Error: Invalid JSON data from $jsonUrl\n"); $includedIds = file_exists($cacheFile) ? file($cacheFile, FILE_IGNORE_NEW_LINES) : []; // Filtrer les articles récents, non déjà inclus, et ne contenant pas de termes exclus -$newItems = array_filter($data, function ($item) use ($includedIds, $cutoffDate, $excludeRegex) { - return isset($item['published'], $item['itemid'], $item['itemtitle']) && - !in_array($item['itemid'], $includedIds) && - $item['published'] >= $cutoffDate && - !preg_match($excludeRegex, $item['itemtitle']); // Exclure si le titre contient des termes interdits -}); +$newItems = []; +$excludedItems = []; + +foreach ($data as $item) { + if (!isset($item['published'], $item['itemid'], $item['itemtitle'])) { + continue; + } + + $published = $item['published']; + $itemId = $item['itemid']; + $itemTitle = $item['itemtitle']; + + if (in_array($itemId, $includedIds) || $published < $cutoffDate || preg_match($excludeRegex, $itemTitle)) { + $excludedItems[] = $item; + } else { + $newItems[] = $item; + } +} usort($newItems, fn($a, $b) => $b['published'] <=> $a['published']); +usort($excludedItems, fn($a, $b) => $b['published'] <=> $a['published']); +if ($isDryRun) { + // Mode dry-run : afficher les entrées incluses et exclues + echo "=== Dry Run Mode ===\n"; + + $showIncluded = array_slice($newItems, 0, $maxResults); + $showExcluded = array_slice($excludedItems, 0, $maxResults); + + echo "Filtered IN (" . count($newItems) . " total, showing up to $maxResults):\n"; + foreach ($showIncluded as $item) { + echo "- " . ($item['itemtitle'] ?? 'No title') . "\n"; + } + + echo "\nFiltered OUT (" . count($excludedItems) . " total, showing up to $maxResults):\n"; + foreach ($showExcluded as $item) { + echo "- " . ($item['itemtitle'] ?? 'No title') . "\n"; + } + exit(0); +} + +// Groupement par jour pour les nouveaux éléments $groupedByDay = []; foreach ($newItems as $item) { $day = isset($item['published']) && is_numeric($item['published']) @@ -111,4 +165,4 @@ function addCData(SimpleXMLElement $node, $content) $domNode = dom_import_simplexml($node); $domOwner = $domNode->ownerDocument; $domNode->appendChild($domOwner->createCDATASection($content)); -} +} \ No newline at end of file