aggregate more source

This commit is contained in:
2025-05-17 20:46:23 +02:00
parent 820cc6f209
commit 459b455fee
6 changed files with 422 additions and 52 deletions

View File

@ -0,0 +1,81 @@
const axios = require('axios');
const cheerio = require('cheerio');
// Site origin used to build the search URL and to make scraped image/link paths absolute.
const CINECHECK_BASE = 'https://www.cinecheck.be';
/**
 * Searches cinecheck.be for titles matching `query`.
 *
 * Calls the site's search endpoint (with `producties=0&amount=5`) and parses
 * the returned HTML for `.c-search__result` entries.
 *
 * @param {string} query - Free-text search term; URL-encoded before sending.
 * @returns {Promise<Array<{title: string, year: (string|null), img: (string|null), link: string}>>}
 *   One entry per result that has both a non-empty title and a link.
 *   `year` is the four-digit number found in parentheses in the title, if any.
 * @throws Rejects with the axios error when the HTTP request fails.
 */
async function searchMovies(query) {
  const url = `${CINECHECK_BASE}/umbraco/surface/searchresults/search?query=${encodeURIComponent(query)}&producties=0&amount=5`;
  const res = await axios.get(url, {
    headers: {
      'x-umb-culture': 'fr-BE',
      'x-umb-key': '0a0c11a9-ece8-4dc8-8578-e5aab235d9ff',
      'x-requested-with': 'XMLHttpRequest',
      'User-Agent': 'Mozilla/5.0',
    },
  });
  const $ = cheerio.load(res.data);

  // Resolve scraped paths against the site origin. Plain string concatenation
  // yields a broken URL when the href is already absolute or lacks a leading
  // slash; URL resolution handles both and leaves root-relative paths as before.
  const toAbsolute = (href) => {
    if (!href) return null;
    try {
      return new URL(href, CINECHECK_BASE).href;
    } catch (_err) {
      // An unparsable href is treated like a missing one rather than
      // aborting the whole search.
      return null;
    }
  };

  const results = [];
  $('.c-search__result').each((_, el) => {
    const item = $(el);
    const rawTitle = item.find('.c-search__title').text();
    // Drop a trailing parenthesised suffix such as "(2019)" from the title.
    const title = rawTitle.trim().replace(/\s*\(.+?\)\s*$/, '');
    const yearMatch = rawTitle.match(/\((\d{4})\)/);
    const year = yearMatch ? yearMatch[1] : null;
    const img = toAbsolute(item.find('img.c-search__image').attr('src'));
    const link = toAbsolute(item.find('a.c-search__hiddenlink').attr('href'));
    if (title && link) {
      results.push({ title, year, img, link });
    }
  });
  return results;
}
/**
 * Fetches a cinecheck.be movie page and extracts its classification data.
 *
 * @param {string} movieUrl - Absolute URL of the movie detail page.
 * @returns {Promise<Object>} `{}` when `movieUrl` is falsy; otherwise
 *   `{ year, genres, img, marks, details, summary }` where
 *   - `year` is the text of the first `.c-movie__label` (or `null`),
 *   - `genres` is the comma-separated second label split into non-empty names,
 *   - `img` is the absolute cover URL (or `null`),
 *   - `marks` are the hidden (`span.vh`) labels of the header marks,
 *   - `details` are `{ type, description }` pairs, `type` being the fragment
 *     after `#` in the item's SVG `xlink:href`,
 *   - `summary` is the first intro paragraph's text.
 * @throws Rejects with the axios error when the HTTP request fails.
 */
async function getMovieClassification(movieUrl) {
  // Nothing to fetch without a URL.
  if (!movieUrl) return {};

  const res = await axios.get(movieUrl, { headers: { 'User-Agent': 'Mozilla/5.0' } });
  const $ = cheerio.load(res.data);

  const labels = $('.c-movie__details .c-movie__label');
  const year = labels.first().text().trim() || null;
  // filter(Boolean): splitting an empty label would otherwise produce [''].
  const genres = labels
    .eq(1)
    .text()
    .split(',')
    .map((s) => s.trim())
    .filter(Boolean);

  // Resolve the cover path against the site origin so absolute or
  // slash-less src values do not produce a malformed URL.
  const coverSrc = $('.c-movie__cover img').attr('src');
  let img = null;
  if (coverSrc) {
    try {
      img = new URL(coverSrc, CINECHECK_BASE).href;
    } catch (_err) {
      img = null; // unparsable src: treat as missing
    }
  }

  const marks = [];
  $('.c-header__marks .c-header__mark').each((_, el) => {
    const label = $(el).find('span.vh').text().trim();
    if (label) marks.push(label);
  });

  const details = [];
  $('.c-classificatie__item').each((_, el) => {
    const item = $(el);
    const iconRef = item.find('svg use').first().attr('xlink:href') || '';
    // The icon reference looks like "<sprite>#<name>"; keep the name part.
    const typeName = iconRef.split('#')[1] || '';
    const description = item.find('.js-classificatie-text').text().trim();
    if (typeName && description) {
      details.push({ type: typeName, description });
    }
  });

  const summary = $('.c-movie__introtext p').first().text().trim();

  return {
    year,
    genres,
    img,
    marks,
    details,
    summary,
  };
}
/**
 * Searches cinecheck.be and enriches every hit with its detail-page data.
 *
 * Detail pages are fetched in parallel; if any fetch rejects, the whole
 * call rejects (Promise.all semantics).
 *
 * @param {string} query - Free-text search term.
 * @returns {Promise<Array<Object>>} One object per search hit with
 *   `title`, `year`, `img`, `link`, `source: 'cinecheck'` plus the remaining
 *   fields returned by `getMovieClassification`.
 */
async function searchAndEnrich(query) {
  const results = await searchMovies(query);
  return Promise.all(
    results.map(async (m) => {
      const { year, img, ...rest } = await getMovieClassification(m.link);
      return {
        title: m.title,
        // Detail-page values win when present, but a missing (null) value
        // must not wipe out what the search result already provided.
        year: year || m.year,
        img: img || m.img,
        link: m.link,
        source: 'cinecheck',
        ...rest,
      };
    })
  );
}
// Public API: only the combined search-and-enrich entry point is exported.
module.exports = { searchAndEnrich };

View File

@ -0,0 +1,56 @@
const axios = require('axios');
const cheerio = require('cheerio');
// Site origin used to build the search URL and to make result links absolute.
const BASE = 'https://www.commonsensemedia.org';
/**
 * Searches commonsensemedia.org and returns the hits whose media type is
 * "movie" (case-insensitive); other media types are skipped.
 *
 * @param {string} query - Free-text search term; URL-encoded into the path.
 * @returns {Promise<Array<{title: string, year: null, img: (string|undefined), link: (string|null)}>>}
 *   `year` is always `null`; `link` is absolute, or `null` when the row has
 *   no title link.
 * @throws Rejects with the axios error when the HTTP request fails.
 */
async function searchMovies(query) {
  const url = `${BASE}/search/${encodeURIComponent(query)}`;
  const res = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0', 'accept-language': 'en-US,en;q=0.9' },
  });
  const $ = cheerio.load(res.data);

  // Resolve hrefs against the site origin; plain concatenation breaks when
  // the href is already absolute or lacks a leading slash.
  const toAbsolute = (href) => {
    if (!href) return null;
    try {
      return new URL(href, BASE).href;
    } catch (_err) {
      return null; // unparsable href: treat as missing
    }
  };

  const results = [];
  $('.search-results-list__row').each((_, el) => {
    const row = $(el);
    const type = row.find('.media-type').text().trim();
    if (type.toLowerCase() !== 'movie') return; // ignore non-movies
    const title = row.find('.search-results-product-title').text().trim();
    const link = toAbsolute(row.find('a.search-results-product-title').attr('href'));
    const img = row.find('img.search-results-product-image').attr('src');
    // CSM search results usually do not include a release year.
    results.push({ title, year: null, img, link });
  });
  // The leftover debug console.log of the result set was removed so that
  // library callers do not get every search dumped to stdout.
  return results;
}
/**
 * Fetches a Common Sense Media review page and extracts its rating data.
 *
 * @param {string} movieUrl - Absolute URL of the review page.
 * @returns {Promise<Object>} `{}` when `movieUrl` is falsy; otherwise
 *   `{ age, summary, details }` where `age` is the age-rating text with the
 *   first "age" and first "+" removed (or `null` when empty), and `details`
 *   holds `{ type, score, description }` for each labelled rating section.
 *   `score` is the number of active circle/star icons in the section.
 * @throws Rejects with the axios error when the HTTP request fails.
 */
async function getMovieClassification(movieUrl) {
  if (!movieUrl) {
    return {};
  }

  const { data: html } = await axios.get(movieUrl, { headers: { 'User-Agent': 'Mozilla/5.0' } });
  const $ = cheerio.load(html);

  const ageText = $('[data-test="age-rating"]').first().text();
  const cleanedAge = ageText.replace('age', '').replace('+', '').trim();
  const age = cleanedAge || null;

  const summary = $('[data-test="review-summary"]').first().text().trim();

  const details = [];
  $('[data-test="product-rating-section"]').each((_, section) => {
    const node = $(section);
    const type = node.find('[data-test="rating-section-label"]').text().trim();
    // Sections without a label are skipped.
    if (!type) {
      return;
    }
    const score = node.find('.icon-circle-solid.active,.icon-star-solid.active').length;
    const description = node.find('[data-test="rating-section-description"]').text().trim();
    details.push({ type, score, description });
  });

  return { age, summary, details };
}
/**
 * Searches Common Sense Media and merges each movie hit with the data
 * scraped from its review page.
 *
 * Review pages are requested in parallel; a single rejected request
 * rejects the whole call.
 *
 * @param {string} query - Free-text search term.
 * @returns {Promise<Array<Object>>} One object per hit: `title`, `year`,
 *   `img`, `link`, `source: 'commonsense'`, followed by whatever fields
 *   `getMovieClassification` returned for that link.
 */
async function searchAndEnrich(query) {
  const movies = await searchMovies(query);

  const enrich = async ({ title, year, img, link }) => {
    const classification = await getMovieClassification(link);
    return { title, year, img, link, source: 'commonsense', ...classification };
  };

  return Promise.all(movies.map((movie) => enrich(movie)));
}
// Public API: only the combined search-and-enrich entry point is exported.
module.exports = { searchAndEnrich };

View File

@ -0,0 +1,99 @@
const axios = require('axios');
const cheerio = require('cheerio');
// Site root (note the trailing slash) used to build the search URL and to make result links absolute.
// NOTE(review): this is plain http — confirm whether the site is reachable over https.
const BASE_URL = 'http://www.filmages.ch/';
/**
 * Searches filmages.ch and parses the result table.
 *
 * @param {string} query - Free-text search term; URL-encoded into the path.
 * @returns {Promise<Array<{titleFrench: string, titleOriginal: string, director: string,
 *   ageLegalSearch: string, ageSuggestedSearch: string, link: string}>>}
 *   One entry per table row that has both a French title and a link;
 *   `link` is absolute.
 * @throws Rejects with the axios error when the HTTP request fails.
 */
async function searchMovies(query) {
  const searchUrl = `${BASE_URL}films/recherche/search/${encodeURIComponent(query)}.html`;
  const response = await axios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0' } });
  const $ = cheerio.load(response.data);

  const results = [];
  $('table.layout_simpletable tbody tr.item').each((_, el) => {
    const row = $(el);
    const titleAnchor = row.find('td.field.title_french a');
    const titleFrench = titleAnchor.text().trim();
    const href = titleAnchor.attr('href');
    if (!titleFrench || !href) return;

    // BASE_URL ends with "/", so concatenation would yield "//" for
    // root-relative hrefs and garbage for absolute ones; resolve instead.
    let link;
    try {
      link = new URL(href, BASE_URL).href;
    } catch (_err) {
      return; // unparsable href: skip the row, as for a missing link
    }

    results.push({
      titleFrench,
      titleOriginal: row.find('td.field.title_original').text().trim(),
      director: row.find('td.field.director').text().trim(),
      ageLegalSearch: row.find('td.field.age_legal').text().trim(),
      ageSuggestedSearch: row.find('td.field.age_suggested').text().trim(),
      link,
    });
  });
  return results;
}
/**
 * Fetches a filmages.ch detail page and extracts its classification fields.
 *
 * @param {string} movieUrl - Absolute URL of the detail page.
 * @returns {Promise<Object>} `{}` when `movieUrl` is falsy; otherwise an
 *   object with `titleOriginalPage`, `year`, `summary`, `synthesis`,
 *   `ageLegal`, `ageSuggested`, `indications`, `counterIndications` and
 *   `directorPage`. Text fields are trimmed strings (empty when absent);
 *   the two indication fields are arrays of link texts.
 * @throws Rejects with the axios error when the HTTP request fails.
 */
async function getMovieClassification(movieUrl) {
  if (!movieUrl) {
    return {};
  }

  const { data: html } = await axios.get(movieUrl, { headers: { 'User-Agent': 'Mozilla/5.0' } });
  const $ = cheerio.load(html);

  // The page is read from three containers: main content, ages, criteria.
  const main = $('#reader_main .layout_full .item');
  const ages = $('#reader_right_1 .layout_full .item');
  const criteria = $('#reader_right_2 .layout_full .item');

  // Trimmed text of a field's value inside the given container.
  const valueOf = (scope, field) => scope.find(`.field.${field} .value`).text().trim();
  // Trimmed texts of every link inside a field's value.
  const linkTexts = (scope, field) =>
    scope
      .find(`.field.${field} .value a`)
      .toArray()
      .map((anchor) => $(anchor).text().trim());

  return {
    // The "Page" suffix distinguishes these from the search-result fields.
    titleOriginalPage: valueOf(main, 'title_original'),
    year: valueOf(main, 'year'),
    summary: valueOf(main, 'summary'),
    synthesis: valueOf(main, 'final_remark'),
    ageLegal: valueOf(ages, 'age_legal'),
    ageSuggested: valueOf(ages, 'age_suggested'),
    indications: linkTexts(criteria, 'indication'),
    counterIndications: linkTexts(criteria, 'counter_indication'),
    directorPage: valueOf(main, 'director'),
  };
}
/**
 * Searches filmages.ch and combines each hit with its detail-page data.
 *
 * Detail pages are requested in parallel; a single rejected request
 * rejects the whole call.
 *
 * @param {string} query - Free-text search term.
 * @returns {Promise<Array<Object>>} One object per hit with `title`
 *   (French title, falling back to the original title), `year` (from the
 *   detail page), `img: null`, `link`, `source: 'filmages'` and `details`,
 *   which merges the search-row fields with the detail-page fields
 *   (detail-page fields win on key collisions).
 */
async function searchAndEnrich(query) {
  const hits = await searchMovies(query);

  const enrich = async (hit) => {
    const pageData = await getMovieClassification(hit.link);
    const title = hit.titleFrench || hit.titleOriginal;
    const details = Object.assign({}, hit, pageData);
    return {
      title,
      year: pageData.year, // the year is only read from the detail page
      img: null, // no image is collected for this source
      link: hit.link,
      source: 'filmages',
      details,
    };
  };

  return Promise.all(hits.map((hit) => enrich(hit)));
}
// Public API: only the combined search-and-enrich entry point is exported.
module.exports = { searchAndEnrich };