From 6c0142c01be24c49ee11af89d5f0c2b612d80a66 Mon Sep 17 00:00:00 2001 From: Morgan Wattiez Date: Sun, 18 May 2025 00:17:42 +0200 Subject: [PATCH] fix commons, hack ajax --- aggregators/commonsense-adapter.js | 148 +++++++++++++++++++++-------- 1 file changed, 111 insertions(+), 37 deletions(-) diff --git a/aggregators/commonsense-adapter.js b/aggregators/commonsense-adapter.js index cea7870..47447d6 100644 --- a/aggregators/commonsense-adapter.js +++ b/aggregators/commonsense-adapter.js @@ -3,54 +3,128 @@ const cheerio = require('cheerio'); const BASE = 'https://www.commonsensemedia.org'; async function searchMovies(query) { - const url = `${BASE}/search/${encodeURIComponent(query)}`; + // Hit the AJAX endpoint directly, not the HTML page + const url = `${BASE}/ajax/search/${encodeURIComponent(query)}`; + console.log('Searching CSM via AJAX endpoint:', url); + const res = await axios.get(url, { - headers: { 'User-Agent': 'Mozilla/5.0', 'accept-language': 'en-US,en;q=0.9' } + headers: { + 'User-Agent': 'Mozilla/5.0', + 'X-Requested-With': 'XMLHttpRequest' + } }); - const $ = cheerio.load(res.data); + + // The response is JSON containing HTML chunks + console.log('Got AJAX response, status:', res.status); + if (!res.data || !Array.isArray(res.data)) { + console.log('Invalid AJAX response format'); + return []; + } + + // Find the HTML insert command with search results + const searchResultsCommand = res.data.find(cmd => + cmd.command === 'insert' && cmd.data && cmd.data.includes('search-main-list-content') + ); + + if (!searchResultsCommand) { + console.log('No search results in AJAX response'); + return []; + } + + // Parse the HTML chunk + const $ = cheerio.load(searchResultsCommand.data); const results = []; - $('.search-results-list__row').each((_, el) => { - const type = $(el).find('.media-type').text().trim(); - if (type.toLowerCase() !== 'movie') return; // ignore non-movies - const title = $(el).find('.search-results-product-title').text().trim(); - const link = $(el).find('a.search-results-product-title').attr('href'); - const absLink = link ? BASE + link : null; - const img = $(el).find('img.search-results-product-image').attr('src'); - // Pas d'année la plupart du temps sur CSM. - results.push({ title, year: null, img, link: absLink }); + + $('.review-teaser').each((_, el) => { + const typeEl = $(el).find('.review-teaser-type'); + if (!typeEl.text().trim().toUpperCase().includes('MOVIE')) return; + + const title = $(el).find('.review-teaser-title a').text().trim(); + const link = $(el).find('.review-teaser-title a').attr('href'); + const fullLink = link ? BASE + link : null; + + // Get image (might be lazy-loaded) + const imgEl = $(el).find('.review-image img'); + const imgSrc = imgEl.attr('data-src') || imgEl.attr('src'); + const img = imgSrc && !imgSrc.includes('ratio_2_3') ? + (imgSrc.startsWith('/') ? BASE + imgSrc : imgSrc) : null; + + // Get age rating + const age = $(el).find('.rating__age').text().trim().replace('age', '').replace('+', '').trim(); + + // Get summary + const summary = $(el).find('.review-teaser-one-liner').text().trim(); + + // Get year + const yearMatch = $(el).text().match(/Release Year:\s*(\d{4})/); + const year = yearMatch ? yearMatch[1] : null; + + if (title && fullLink) { + results.push({ + title, + link: fullLink, + img, + age, + summary, + year + }); + } }); - console.log('CSM search:', results); + + console.log('CSM search found:', results.length, 'results'); + console.log('First result:', results[0]); return results; } -async function getMovieClassification(movieUrl) { +async function getMovieDetails(movieUrl) { if (!movieUrl) return {}; - const res = await axios.get(movieUrl, { headers: { 'User-Agent': 'Mozilla/5.0' } }); - const $ = cheerio.load(res.data); - - const age = $('[data-test="age-rating"]').first().text().replace('age', '').replace('+', '').trim() || null; - const summary = $('[data-test="review-summary"]').first().text().trim(); - const details = []; - $('[data-test="product-rating-section"]').each((_, el) => { - const label = $(el).find('[data-test="rating-section-label"]').text().trim(); - const score = $(el).find('.icon-circle-solid.active,.icon-star-solid.active').length; - const desc = $(el).find('[data-test="rating-section-description"]').text().trim(); - if (label) details.push({ type: label, score, description: desc }); - }); - - return { age, summary, details }; + + try { + const res = await axios.get(movieUrl, { + headers: { 'User-Agent': 'Mozilla/5.0' } + }); + + const $ = cheerio.load(res.data); + + // Additional details from the full page + const parentsNeedToKnow = $('[data-test="parents-need-to-know"]').text().trim(); + + // Get all rating categories + const details = []; + $('[data-test="product-rating-section"]').each((_, section) => { + const type = $(section).find('[data-test="rating-section-label"]').text().trim(); + const score = $(section).find('.icon-circle-solid.active').length; + const description = $(section).find('[data-test="rating-section-description"]').text().trim(); + + if (type) { + details.push({ type, score, description }); + } + }); + + return { parentsNeedToKnow, details }; + } catch (error) { + console.error('Error fetching movie details:', error.message); + return {}; + } } async function searchAndEnrich(query) { - const results = await searchMovies(query); - return await Promise.all(results.map(async m => ({ - title: m.title, - year: m.year, - img: m.img, - link: m.link, - source: 'commonsense', - ...(await getMovieClassification(m.link)) - }))); + try { + const results = await searchMovies(query); + return await Promise.all(results.map(async movie => ({ + title: movie.title, + year: movie.year, + img: movie.img, + link: movie.link, + source: 'commonsense', + summary: movie.summary, + age: movie.age, + ...(await getMovieDetails(movie.link)) + }))); + } catch (error) { + console.error('Error in CSM searchAndEnrich:', error.message); + return []; + } } module.exports = { searchAndEnrich };