From e3f0f39a9d5acb74b67a050237d39bbddc6273cf Mon Sep 17 00:00:00 2001
From: Morgan Wattiez
Date: Sun, 18 May 2025 20:47:18 +0200
Subject: [PATCH] add adapter for filmstouspublics

---
 .gitignore                              |   1 +
 aggregators/filmstouspublics-adapter.js | 350 ++++++++++++++++++++++++
 server.js                               |  79 ++----
 3 files changed, 378 insertions(+), 52 deletions(-)
 create mode 100644 aggregators/filmstouspublics-adapter.js

diff --git a/.gitignore b/.gitignore
index 3c3629e..47dfb72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 node_modules
+cache
diff --git a/aggregators/filmstouspublics-adapter.js b/aggregators/filmstouspublics-adapter.js
new file mode 100644
index 0000000..ff5f8bd
--- /dev/null
+++ b/aggregators/filmstouspublics-adapter.js
@@ -0,0 +1,350 @@
+const axios = require('axios');
+const cheerio = require('cheerio');
+const crypto = require('crypto');
+const fs = require('fs');
+const path = require('path');
+
+const BASE_URL = 'https://www.filmstouspublics.fr';
+const REQUEST_OPTIONS = {
+  headers: { 'User-Agent': 'Mozilla/5.0' },
+  // Fail fast instead of letting a slow site stall the whole search.
+  timeout: 10000
+};
+
+// Disk cache: one JSON file per (type, key), valid for 24 hours.
+const CACHE_DIR = path.join(__dirname, '../cache');
+const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
+try {
+  // `recursive: true` makes this a no-op when the directory already exists.
+  fs.mkdirSync(CACHE_DIR, { recursive: true });
+} catch (e) {
+  // The cache is best-effort: keep the adapter loadable without it.
+  console.error('Cache init error:', e.message);
+}
+
+/**
+ * Build the cache file path for a (type, key) pair.
+ * The readable slug is truncated and suffixed with a hash of the raw key so
+ * that distinct keys never share a file ("a b" vs "a_b") and long URLs do
+ * not exceed the file-name length limit.
+ */
+function cacheFile(type, key) {
+  const raw = String(key);
+  const slug = raw.replace(/[^a-z0-9]/gi, '_').slice(0, 80);
+  const hash = crypto.createHash('sha1').update(raw).digest('hex').slice(0, 12);
+  return path.join(CACHE_DIR, `${type}_${slug}_${hash}.json`);
+}
+
+/**
+ * Read a cached value from disk.
+ * @param {string} type - Cache namespace ('search' or 'detail').
+ * @param {string} key - Search query or movie URL.
+ * @returns {*} The cached value, or null when missing, expired or unreadable.
+ */
+function loadCache(type, key) {
+  try {
+    const file = cacheFile(type, key);
+    if (fs.existsSync(file)) {
+      const stats = fs.statSync(file);
+      if (Date.now() - stats.mtime.getTime() < CACHE_TTL_MS) {
+        console.log(`Cache hit for ${type}:`, key);
+        return JSON.parse(fs.readFileSync(file, 'utf8'));
+      }
+    }
+  } catch (e) {
+    console.error('Cache load error:', e.message);
+  }
+  return null;
+}
+
+/**
+ * Write a value to the disk cache. Failures are logged, never thrown.
+ * @param {string} type - Cache namespace ('search' or 'detail').
+ * @param {string} key - Search query or movie URL.
+ * @param {*} data - JSON-serialisable value to store.
+ */
+function saveCache(type, key, data) {
+  try {
+    fs.writeFileSync(cacheFile(type, key), JSON.stringify(data));
+    console.log(`Saved to cache: ${type}/${key}`);
+  } catch (e) {
+    console.error('Cache save error:', e.message);
+  }
+}
+
+/** Fetch a page and return it loaded into cheerio. */
+async function fetchPage(url) {
+  const response = await axios.get(url, REQUEST_OPTIONS);
+  return cheerio.load(response.data);
+}
+
+/**
+ * Compute statistics over the per-country age ratings.
+ * Ratings of 0 ("Tous publics") and non-numeric values are ignored.
+ * @param {Object<string, (number|string)>} ageRatings - Country -> minimum age.
+ * @returns {?Object} `{ average, median, countries, min, max }` (average is a
+ *   one-decimal string), or null when no country has a positive age.
+ */
+function calculateAverageAge(ageRatings) {
+  const ages = Object.values(ageRatings || {})
+    .map(age => (typeof age === 'string' ? Number.parseInt(age, 10) : age))
+    .filter(age => typeof age === 'number' && !Number.isNaN(age) && age > 0);
+
+  if (ages.length === 0) return null;
+
+  const avg = ages.reduce((sum, age) => sum + age, 0) / ages.length;
+
+  // The median is less sensitive than the mean to one very strict country.
+  const sorted = [...ages].sort((a, b) => a - b);
+  const mid = Math.floor(sorted.length / 2);
+  const median = sorted.length % 2 === 0
+    ? (sorted[mid - 1] + sorted[mid]) / 2
+    : sorted[mid];
+
+  return {
+    average: avg.toFixed(1),
+    median,
+    countries: ages.length,
+    min: sorted[0],
+    max: sorted[sorted.length - 1]
+  };
+}
+
+/**
+ * Search the site and return the raw result cards.
+ * @param {string} query - Free-text movie title.
+ * @returns {Promise<Object[]>} `{ title, link, img, rating }` entries;
+ *   an empty array when nothing matches or the request fails.
+ */
+async function searchMovies(query) {
+  const cached = loadCache('search', query);
+  if (cached) return cached;
+
+  const searchUrl = `${BASE_URL}/?s=${encodeURIComponent(query)}`;
+  console.log('Searching FilmsTousPublics:', searchUrl);
+
+  try {
+    const $ = await fetchPage(searchUrl);
+    const results = [];
+
+    // Result cards come in several article layouts; match all of them.
+    $('article[class*="tipi-xs-12"], article[class*="post"]').each((_, el) => {
+      // Use the first matching anchor only: `.text()` on a multi-element
+      // selection would concatenate the text of every match.
+      const titleLink = $(el).find('h3.title a, .title-wrap .title a').first();
+      const title = titleLink.text().trim();
+      const link = titleLink.attr('href');
+
+      // Lazy-loaded images keep their real URL in data-lazy-src.
+      const imgEl = $(el).find('.mask img').first();
+      const img = imgEl.attr('data-lazy-src') || imgEl.attr('src');
+
+      let rating = null;
+      const ratingEl = $(el).find('.lets-review-api-wrap, .lets-review-final-score').first();
+      if (ratingEl.length) {
+        rating = ratingEl.attr('data-api-score') || ratingEl.text().trim();
+      }
+
+      if (title && link) {
+        results.push({ title, link, img, rating });
+      }
+    });
+
+    console.log(`FilmsTousPublics found ${results.length} results`);
+    saveCache('search', query, results);
+    return results;
+  } catch (error) {
+    console.error('Error searching FilmsTousPublics:', error.message);
+    return [];
+  }
+}
+
+// How a country is recognised from the flag image next to its rating. `alt`
+// needles are matched against the lower-cased alt text, `src` needles against
+// the lower-cased image URL. The first matching entry wins.
+const COUNTRY_FLAGS = [
+  { code: 'france', alt: ['france'], src: ['france'] },
+  { code: 'germany', alt: ['allemagne'], src: ['allemagne'] },
+  { code: 'spain', alt: ['espagne'], src: ['espagne'] },
+  { code: 'uk', alt: ['royaume'], src: ['royaume'] },
+  { code: 'sweden', alt: ['suede', 'suède'], src: ['suede'] },
+  { code: 'switzerland', alt: ['suisse'], src: ['suisse'] },
+  { code: 'netherlands', alt: ['pays'], src: ['pays-bas'] },
+  { code: 'usa', alt: ['etats-unis', 'états-unis'], src: ['etats-unis'] }
+];
+
+// Label found in the "Informations" list -> key in the returned metadata.
+const METADATA_LABELS = {
+  'Durée :': 'duration',
+  'Nom original :': 'originalTitle',
+  'Sortie :': 'releaseDate',
+  'Réalisateur :': 'director',
+  'Producteur :': 'producer',
+  'Acteurs :': 'actors',
+  'Studio :': 'studio'
+};
+
+/** Map a flag image (alt text, URL) to a country code, or null if unknown. */
+function identifyCountry(alt, src) {
+  const altLower = alt.toLowerCase();
+  const srcLower = src.toLowerCase();
+  const found = COUNTRY_FLAGS.find(country =>
+    country.alt.some(needle => altLower.includes(needle)) ||
+    country.src.some(needle => srcLower.includes(needle)));
+  return found ? found.code : null;
+}
+
+/**
+ * Scrape age ratings, summary and metadata from a movie page.
+ * @param {string} movieUrl - URL of the movie page.
+ * @returns {Promise<Object>} `{ summary, year, ageRatings, overallRating,
+ *   ...metadata }`, or `{}` when the URL is missing or the fetch fails.
+ */
+async function getMovieClassification(movieUrl) {
+  if (!movieUrl) return {};
+
+  const cached = loadCache('detail', movieUrl);
+  if (cached) return cached;
+
+  console.log('Fetching details for:', movieUrl);
+
+  try {
+    const $ = await fetchPage(movieUrl);
+
+    // Ratings are read from the `p` elements inside the pullquote section.
+    const ageRatings = {};
+    $('aside.pullquote p').each((_, el) => {
+      const text = $(el).text().trim();
+
+      let age = null;
+      if (text.includes('Tous publics')) {
+        age = 0; // "all audiences": kept for display, excluded from stats
+      } else {
+        // The dashes/spaces around the number vary, so anchor on the words.
+        const match = text.match(/Déconseillé aux[^0-9]*(\d+)[^0-9]*ans/i);
+        if (match) age = Number.parseInt(match[1], 10);
+      }
+      if (age === null) return;
+
+      // Resolve the country first so a "Tous publics" rating is credited to
+      // the country whose flag it sits next to. Without a recognisable flag
+      // it is still treated as the French rating.
+      const img = $(el).find('img').first();
+      const alt = img.attr('alt') || '';
+      const src = img.attr('data-lazy-src') || img.attr('src') || '';
+      const country = identifyCountry(alt, src) || (age === 0 ? 'france' : null);
+
+      if (country) {
+        ageRatings[country] = age;
+      } else {
+        console.log('Unidentified country with age rating:', age, 'Alt:', alt, 'Src:', src);
+      }
+    });
+    console.log('Found age ratings:', ageRatings);
+
+    // Summary: the substantial paragraphs among the article's first three.
+    const summaryParts = [];
+    $('.entry-content > p').each((i, el) => {
+      const text = $(el).text().trim();
+      if (i < 3 && text.length > 30 && !$(el).find('.pullquote').length) {
+        summaryParts.push(text);
+      }
+    });
+    const summary = summaryParts.join(' ');
+
+    const metadata = {};
+    $('h3:contains("Informations") + ul li').each((_, el) => {
+      const text = $(el).text().trim();
+      const label = Object.keys(METADATA_LABELS).find(l => text.includes(l));
+      if (label) metadata[METADATA_LABELS[label]] = text.replace(label, '').trim();
+    });
+
+    const ratingEl = $('.lets-review-block__final-score .score').first();
+    const overallRating = ratingEl.length ? ratingEl.text().trim() : null;
+
+    const yearMatch = (metadata.releaseDate || '').match(/\b(19|20)\d{2}\b/);
+    const year = yearMatch ? yearMatch[0] : null;
+
+    const result = { summary, year, ageRatings, overallRating, ...metadata };
+    saveCache('detail', movieUrl, result);
+    return result;
+  } catch (error) {
+    console.error('Error getting FilmsTousPublics movie details:', error.message);
+    return {};
+  }
+}
+
+// French display names (used by the frontend) for the country codes.
+const COUNTRY_NAMES = {
+  france: 'France',
+  germany: 'Allemagne',
+  spain: 'Espagne',
+  uk: 'Royaume-Uni',
+  sweden: 'Suède',
+  switzerland: 'Suisse',
+  netherlands: 'Pays-Bas',
+  usa: 'États-Unis'
+};
+
+/** Format a rating for display: missing -> "Non spécifié", 0 -> "Tous publics", 12 -> "12+". */
+function formatAge(age) {
+  if (age === undefined || age === null) return 'Non spécifié';
+  return age === 0 ? 'Tous publics' : `${age}+`;
+}
+
+/**
+ * Search FilmsTousPublics and enrich every hit with its age classification.
+ * @param {string} query - Free-text movie title.
+ * @returns {Promise<Object[]>} One normalised entry per hit; an empty
+ *   array on failure (errors are logged, never thrown).
+ */
+async function searchAndEnrich(query) {
+  try {
+    const results = await searchMovies(query);
+    return await Promise.all(results.map(async movie => {
+      const details = await getMovieClassification(movie.link);
+      const ageRatings = details.ageRatings || {};
+      const ageStats = calculateAverageAge(ageRatings);
+      console.log(`Movie: ${movie.title}, Age stats:`, ageStats);
+
+      const ageDetails = {};
+      for (const [country, age] of Object.entries(ageRatings)) {
+        ageDetails[COUNTRY_NAMES[country] || country] = formatAge(age);
+      }
+
+      // Recommend the median across countries. When no country sets a
+      // positive age, report "Tous publics" if any country rated it so.
+      let ageRecommended = 'Non spécifié';
+      if (ageStats) {
+        ageRecommended = `${ageStats.median}+`;
+      } else if (Object.values(ageRatings).includes(0)) {
+        ageRecommended = 'Tous publics';
+      }
+
+      return {
+        // Scraped details go first so the normalised fields below win.
+        ...details,
+        title: movie.title,
+        year: details.year,
+        img: movie.img,
+        link: movie.link,
+        source: 'filmstouspublics',
+        rating: movie.rating || details.overallRating,
+        age: ageRecommended.replace('Tous publics', '0+').replace('Non spécifié', '-'),
+        ageFrance: formatAge(ageRatings.france),
+        ageAverage: ageStats?.average || null,
+        ageMedian: ageStats?.median || null,
+        ageRecommended,
+        countriesCount: ageStats?.countries || 0,
+        ageRange: { min: ageStats?.min || null, max: ageStats?.max || null },
+        ageDetails,
+        summary: details.summary
+      };
+    }));
+  } catch (error) {
+    console.error('FilmsTousPublics searchAndEnrich error:', error.message);
+    return [];
+  }
+}
+
+module.exports = { searchAndEnrich };
diff --git a/server.js b/server.js index 91432b6..8b43cd4 100644 --- a/server.js +++ b/server.js @@ -3,80 +3,55 @@ const cors = require('cors'); const cinecheck = require('./aggregators/cinecheck-adapter'); const commonsense = require('./aggregators/commonsense-adapter'); const filmages = require('./aggregators/filmages-adapter'); +const filmstouspublics = require('./aggregators/filmstouspublics-adapter'); const { mergeResults } = require('./merge'); const app = express(); app.use(cors()); -// Helper to normalize text and get words for matching -function getWords(text) { - if (!text || typeof text !== 'string') return []; - return text - .toLowerCase() - // Remove punctuation, keep letters, numbers, and whitespace. Handles Unicode. - .replace(/[^\p{L}\p{N}\s]/gu, '') - .replace(/\s+/g, ' ') // Normalize multiple spaces to single - .trim() - .split(' ') - .filter(Boolean); // Remove empty strings from split -} - app.get('/search', async (req, res) => { const q = req.query.q; if (!q) { - return res.status(400).json({ error: "Missing query. Predictable." }); + return res.status(400).json({ error: "Missing query. Try typing words." 
}); } + + console.log('===== SEARCH LOGS ====='); + console.log('Query:', q); + try { - const [cine, cs, fa] = await Promise.all([ - cinecheck.searchAndEnrich(q).catch(e => { console.error('Cinecheck failed:', e.message); return []; }), - commonsense.searchAndEnrich(q).catch(e => { console.error('Commonsense failed:', e.message); return []; }), - filmages.searchAndEnrich(q).catch(e => { console.error('Filmages failed:', e.message); return []; }) + const [cine, cs, fa, ftp] = await Promise.all([ + cinecheck.searchAndEnrich(q).catch(e => { + console.error('Cinecheck failed:', e.message); + return []; + }), + commonsense.searchAndEnrich(q).catch(e => { + console.error('Commonsense failed:', e.message); + return []; + }), + filmages.searchAndEnrich(q).catch(e => { + console.error('Filmages failed:', e.message); + return []; + }), + filmstouspublics.searchAndEnrich(q).catch(e => { + console.error('FilmsTousPublics failed:', e.message); + return []; + }) ]); - console.log('===== SEARCH LOGS ====='); console.log('Cinecheck results:', cine.length); console.log('CSM results:', cs.length); console.log('Filmages results:', fa.length); - console.log('Raw CSM data:', cs); // Inspect full data - - let merged = mergeResults([cine, cs, fa]); - - // Sort merged results based on query relevance - const queryWords = getWords(q); - if (queryWords.length > 0) { - merged.forEach(item => { - const titleWords = getWords(item.title); - const uniqueQueryWords = [...new Set(queryWords)]; - const uniqueTitleWords = [...new Set(titleWords)]; - - let commonWordCount = 0; - for (const qw of uniqueQueryWords) { - if (uniqueTitleWords.includes(qw)) { - commonWordCount++; - } - } - - item.matchScore1 = uniqueQueryWords.length > 0 ? commonWordCount / uniqueQueryWords.length : 0; - - const unionLength = new Set([...uniqueQueryWords, ...uniqueTitleWords]).size; - item.matchScore2 = unionLength > 0 ? 
commonWordCount / unionLength : 0; - }); - - merged.sort((a, b) => { - if (b.matchScore1 !== a.matchScore1) return b.matchScore1 - a.matchScore1; - if (b.matchScore2 !== a.matchScore2) return b.matchScore2 - a.matchScore2; - return getWords(a.title).length - getWords(b.title).length; // Shorter titles preferred as tertiary sort - }); - } + console.log('FilmsTousPublics results:', ftp.length); + const merged = mergeResults([cine, cs, fa, ftp]); res.json(merged); } catch (e) { console.error('General search error:', e); - res.status(500).json({ error: e.message || "Server's taking a nap." }); + res.status(500).json({ error: e.message || "Server error. You broke something." }); } }); const PORT = 3000; app.listen(PORT, () => { - console.log(`Backend sorting your life out on http://localhost:${PORT}. You're welcome.`); + console.log(`Backend multi-agrégateurs prêt sur http://localhost:${PORT}`); });