275 lines
15 KiB
PHP
275 lines
15 KiB
PHP
<?php
|
|
|
|
use Shaarli\Config\ConfigManager;
|
|
|
|
/**
 * Register the default keyword-to-tag rules for the auto-tag plugin.
 *
 * Each array key is a comma-separated list of keywords (English and French);
 * each value is a comma-separated list of tags credited when one of those
 * keywords is found. The rules are stored under 'plugins.AUTO_TAG_KEYWORDS'
 * through ConfigManager::setEmpty() (presumably only applied when no value is
 * configured yet -- confirm against ConfigManager).
 *
 * Notes for maintainers:
 *  - calculate_tags() matches keywords as whole words, so stem-like entries
 *    such as 'collaborat' or 'explor' only match that exact token.
 *  - A keyword must appear only once per rule; calculate_tags() scores every
 *    listed keyword, so a repeated entry inflates the score of its tags.
 *
 * @param ConfigManager $conf Configuration manager receiving the defaults.
 */
function auto_tag_plugin_init(ConfigManager $conf)
{
    $conf->setEmpty('plugins.AUTO_TAG_KEYWORDS', [
        'accessibility,accessibilité,web development,web design,html,css' => 'web',
        'accounting,comptabilité' => 'accounting',
        'addict,addiction,drugs,drogues,cigarettes,bonbons,dopamine,sucre,nicotine,hooked' => 'addiction',
        'adhd,tdah' => 'adhd',
        'alternative,alternatives,compatible,compatibles,migrated,migration' => 'alternatives',
        'ads,advertisements,publicités,spam' => 'ads',
        // 'anxiété' used to be listed twice here, which double-counted every hit.
        'anxiété,anxieux,anxieuse,burnout,méditation,cardio,santé,health,healthy' => 'health-and-wellness',
        'architecture,architectures' => 'architecture',
        'archive,archiving,archives,archivage' => 'archiving,archives',
        'art,arts' => 'culture',
        'ask hn,news.ycombinator.com/item,reddit.com/r/' => 'debate',
        'autism,autisme,autist,autiste' => 'autism',
        'belgian,belgium,belge,belgique' => 'belgium',
        'board game,board games,boardgame,jeu de société,jeux de société' => 'board-games,geek',
        'bookmarks,favoris,signets' => 'bookmarks-management',
        'browsers,navigateurs,web browsers' => 'web-browsers',
        'books,livre,livres,quatrième de couverture,roman,novel,reading-list' => 'reading-and-literature,inspiration,culture',
        'robots.txt,bots,spam,crawling,ddos' => 'spam',
        'bruxelles,brussels' => 'brussels',
        'calm tech,calmness,calm,technologie calme' => 'calm-tech',
        'cars,car bloat,vehicles,véhicule,véhicules,vehicle,automobile,automotive' => 'transport',
        'cheatsheet,cheat sheet,cheat-sheet,antisèche' => 'cheatsheet,guides-and-tips',
        'cleaning,nettoyage' => 'cleaning',
        'cloud,aws,amazon,cloudron' => 'cloud',
        'cloudron' => 'cloudron,hosting',
        '(comic),-comic-,comics,bandes dessinées' => 'comics,reading-and-literature,culture,humor',
        'comparison,comparer, vs ,versus,comparatif,comparaison' => 'comparison',
        'communicate,communication,messaging,messenger,gmail,communiquer,écriture inclusive,la langue,académie française,language' => 'communication',
        'complex,impossible to solve,complicated,very difficult,complexe,compliqué,complexity' => 'complexity,its-complicated',
        'computers,personal computer,the pc' => 'pc',
        'configure,configuration,paramétrer,paramétrage' => 'configuration',
        'database,databases,RDS,base de données' => 'databases',
        'data collection,collecte de données' => 'data-collection',
        'data transfer,transfert de données' => 'data-portability',
        'debug,troubleshoot,diagnose,résoudre,diagnostiquer,troubleshooting,find a solution' => 'problem-solving,guides-and-tips,debugging',
        'design,designs' => 'design',
        'development workflow,devex,flux de développement' => 'devex',
        'disk,disque,disques' => 'storage',
        'distraction,procrastination,procrastine,procrastiner,glander' => 'procrastination',
        'diy,self-host,héberger soi-même,my personal,fait maison' => 'diy',
        'dns,network,tcp,wireshark,réseau' => 'network',
        'docker,docker-compose,docker compose,container,containers,k8s,eks,kubernetes,minikube,k3s,helm,openshift' => 'container-technology,devops',
        'documentation,docs,document,documentation technique' => 'documentation',
        'ses droits,legally,legalement,légal' => 'legal',
        'elixir,python,pip,php,rust,golang,programming,developer,software development,developers,développeurs' => 'software-development',
        'emulator,emulation,émulateur,émulation' => 'emulation',
        'entrepreneurship,entrepreneurs,entrepreneuriat' => 'business',
        'espresso,coffee,café' => 'coffee',
        'ethic,ethique,ethics' => 'ethics',
        'explor,going deep,exploration' => 'discovery',
        'en sécurité,enfin libre,libéré,unsafe,liberté' => 'privacy-and-security,freedom',
        'libre,free,liberté,degoogling,degoogle,degoogler,dégooglisation' => 'freedom',
        'logiciel libre,free software,logiciel gratuit,free to use' => 'free-software',
        'from home,remote work,work remote,travail à distance,télétravail' => 'remote-work',
        'frustrated,frustration,am pissed' => 'rant',
        'big-tech,gafam,degoogling,google,degoogle,grandes entreprises technologiques' => 'big-tech',
        'game,jeu vidéo,game dev,jeux,jeux vidéo' => 'games,geek,culture',
        'gamedev,building game,développement de jeux' => 'gamedev,games,geek,culture',
        'gratuit,free' => 'free',
        'gitops,gitlab,github actions,devops,SRE,ci/cd,platform-engineering,ci pipeline,application deployment,dagger,renovatebot,dependabot,continuous integration,site reliability eng' => 'devops',
        'git,gitlab,jujutsu,pijul,mercurial,svn,version control,contrôle de version' => 'version-control',
        'gpt,chatgpt,llm,llms,artificial intelligence,intelligence artificielle,IA,l\'ia,ai,ai model,an ai' => 'ai',
        'hacking,piratage' => 'hacking',
        'nutrition,food,alimentation,nourriture,recette,recette de cuisine,recettes de cuisine,ingrédients,cette recette,beurre,cuisine' => 'food',
        'history,histoire,documentaire,documentary' => 'history',
        'humans,humains' => 'humans',
        'humor,humour' => 'humor',
        // 'inspiration' used to be listed twice here, which double-counted every hit.
        'inspiration,creativity,creative,créativité' => 'inspiration',
        'leadership,staff engineering,gestion' => 'leadership',
        'lambic,gueuze,beer,bière,bières' => 'beer-and-brewing',
        'linux,ubuntu,debian,linux windows macos' => 'os',
        'list,index of,awesome,installation,GitHub - ,liste' => 'list,discovery',
        'low-tech,low tech,technologie simple' => 'low-tech',
        'merdification,enshittif,AI-generated,crapification,decline in quality,déclin de qualité' => 'enshittification',
        'misinformation,fact-checking,fact checking' => 'misinformation',
        'monitoring,metrics,to monitor,surveillance,métriques' => 'monitoring,metrics',
        'movie,cinéma,cinema,film,films' => 'movies,culture,geek',
        'music,spotify,radios,webradios,soundtrack,bande originale,musique' => 'music',
        'newsletter,news' => 'newsletter,news',
        'nostalgia,nostalgie,things used to be better,internet archive' => 'nostalgia',
        // 'takings notes' was a typo that could never match; corrected to 'taking notes'.
        'obsidian,note taking,note-taking,taking notes,note-geek,capturing knowledge,knowledge management,prise de notes,gestion de connaissances,knowledge transfer,transferring knowledge,your notes,my notes' => 'knowledge-management,note-taking',
        'ocr' => 'ocr',
        'open-source,open source,code source libre' => 'open-source,free',
        'optimize,optimization,speed up,an efficient,optimiser,optimisation' => 'optimization',
        'philosophe,philosophia,philosophy,lifestyle,philosophie' => 'philosophy',
        'photography,photos,photographie' => 'photos',
        'podcast,podcasts' => 'podcast',
        'ego,narcissism,narcissist,narcissisme,psycholog,psychologie' => 'psychology',
        'voting,politic,politique,vote,multiculturalism,culturalism,cultural integration,political' => 'politics',
        'python,logiciels en python' => 'python,software-development,code',
        'privatebin' => 'secrets',
        'programming languages,langages de programmation' => 'code,computer-languages',
        'publishing,publier,publication' => 'publishing',
        'quality,qualité' => 'quality',
        'recommandations,recommendation,conseils' => 'recommandations,recommendations',
        // 'plasic waste' was a typo that could never match; corrected to 'plastic waste'.
        'recycling,sustainable,green web,climate,recyclage,web écologique,climat,ai emissions,water use,emissions produced,amount of co2,global co2,co2 emissions,car bloat,environnement,environment,environmental,environmentally,plastic waste' => 'ecology',
        'relationship,relationships,de rencontre' => 'relationships',
        'religion,chretiens,bible,coran,islam,musulmans,croyants,athée,la foi,église' => 'philosophy',
        'reviews,critique,avis' => 'reviews',
        'rss,rss feed,miniflux,web reader,lecteur web' => 'content-aggregation,content-curation',
        'ruby,rails app,applications ruby' => 'ruby,software-development,code',
        'science,sciences' => 'science',
        'scripting,jq,curl,wget,script,bash,terminal,bash script,script python,python script,lua,script shell,script bash,shell script' => 'scripting',
        'search engine,moteur de recherche' => 'search-engines',
        'security,permission,sécurité,secure,privacy,private,degoogl,gdpr,data protection,online tracking,user profiling,anonymo,anonymi,surveillance,malware,spyware,decentrali,secrets,privacy matters,vpn,passkey,protéger,password manager' => 'privacy-and-security',
        'simplicity,minimal,declutter,stopped using,simple,simplification,simplifier,reduction in,no longer needed,minimalisme,simplicité,réduction' => 'minimalism',
        'small web,indie web,indieweb,petit web' => 'small-web',
        'smartphone,android,mobile,phone,téléphone,sms' => 'mobile',
        'snippet,extrait de code' => 'code',
        'static site,static-site,site statique' => 'static-site,static-web',
        'social media,réseau social,fediverse,fédiverse,réseaux sociaux,social networks,social network' => 'social-media',
        'society,societies' => 'society',
        'teamwork,collaborat,équipe,cooperat,coordinat,travail équipe' => 'collaboration',
        'technology,technologie' => 'technology',
        'template,modèle' => 'template',
        'terminal,tools-and-resources,terminaltrove' => 'terminal,tools-and-resources',
        'test,tester' => 'testing',
        // 'markdown' used to be listed twice here, which double-counted every hit.
        'markdown,text files,fichiers texte,formats,text-based,plaintext,markup language,plain text,basé sur du texte,langage balisé' => 'plaintext,formats,text-files',
        'time to update,maintainers,tech-debt,legacy code,long term software,temps pour mettre à jour' => 'maintenance,tech-debt',
        'to do,to-do,à faire' => 'todo',
        'tool,resources,a script,outil,a collection,a catalog,awesome list,links,outils,ressources,password manager' => 'tools-and-resources',
        'training,course,conference talk,learning,homeschool,expert,specializ,tacit knowledge,tribal knowledge,formation,cours,conférence,apprentissage' => 'education',
        '.txt,text-based,fichiers txt' => 'text-files',
        'ui,interfaces utilisateur' => 'ui',
        'ux,the experience of,usable,uxer,user experience,expérience utilisateur' => 'ux',
        'vps,serveur privé virtuel' => 'cloud,vps,hosting',
        'web archive,web archiving,save any website,bookmarks,wayback machine,archive.org,archivebox,archive web' => 'web-archiving',
        'webring' => 'discovery,small-web,webring',
        'wordpress,personal website,blog roll,blogroll,blogosphere,webring,digital garden,to blog,blogs' => 'blogging,writing,discovery,small-web',
        'work,travail,contract work,previous job,my work,coworkers,coworker,the job,workgroup' => 'work',
        'youtube.com,invidious,peertube,watch?v' => 'video',
        'zoemp,zoemp.be' => 'zoemp',
    ]);
}
|
|
|
|
/**
 * Download a web page and reduce it to plain text for keyword matching.
 *
 * Only absolute http:// and https:// URLs are fetched. Anything else (local
 * paths, file://, data:, relative note URLs, non-strings) yields an empty
 * string, so a bookmark URL can never make the server read a local file.
 * The request uses a short timeout and reads at most 1 MiB so that a slow or
 * huge page cannot stall the caller.
 *
 * @param mixed $url URL of the page to fetch.
 *
 * @return string Text of the page with script/style/head/noscript blocks and
 *                all remaining tags removed, or '' when nothing was fetched.
 */
function fetch_page_content($url): string
{
    if (!is_string($url)) {
        return '';
    }

    $scheme = parse_url($url, PHP_URL_SCHEME);
    if (!is_string($scheme) || !in_array(strtolower($scheme), ['http', 'https'], true)) {
        return '';
    }

    // The 'http' context options are used for both http:// and https:// streams.
    $context = stream_context_create([
        'http' => [
            'timeout' => 5,
        ],
    ]);

    // Fetching is best-effort: failures are deliberately silenced and reported as ''.
    $htmlContent = @file_get_contents($url, false, $context, 0, 1048576);
    if ($htmlContent === false) {
        return '';
    }

    // Drop blocks whose text is never visible to a reader before stripping tags.
    $visibleHtml = preg_replace('/<(script|style|head|noscript)[^>]*>.*?<\/\1>/is', '', $htmlContent);
    if ($visibleHtml === null) {
        // preg_replace() returns null on PCRE errors (e.g. backtrack limit); keep the raw markup.
        $visibleHtml = $htmlContent;
    }

    return strip_tags($visibleHtml);
}
|
|
|
|
/**
 * Score candidate tags by looking for keywords in several pieces of text.
 *
 * For every rule, each distinct non-empty keyword is searched (case-insensitive,
 * whole-word) in each context. Every context in which a keyword is found adds
 * that context's weight to all tags of the rule.
 *
 * Matching details:
 *  - Valid UTF-8 text is matched with the `u` modifier so that word boundaries
 *    and case folding work for accented keywords such as 'équipe' or 'libéré'.
 *    Text that is not valid UTF-8 falls back to byte-wise matching.
 *  - A word boundary is only required on a side of the keyword that is itself
 *    a word character, so keywords like '(comic)' or '.txt' can match.
 *  - Empty keywords/tags (e.g. from a trailing comma) and repeated keywords
 *    within one rule are ignored instead of inflating scores.
 *
 * @param array $keywordsToTags Map of "kw1,kw2,..." => "tag1,tag2,...".
 * @param array $searchContents Map of context name => text to search. Known
 *                              contexts: title, url, description, existing
 *                              (weight 3) and content (weight 1); any other
 *                              context name is weighted 1.
 *
 * @return array Map of tag => accumulated score, in order of first match.
 */
function calculate_tags(array $keywordsToTags, array $searchContents): array
{
    $contextWeights = [
        'title' => 3,
        'url' => 3,
        'description' => 3,
        'existing' => 3,
        'content' => 1,
    ];

    // Prepare each searchable text once: empty or non-string content can never match.
    $haystacks = [];
    foreach ($searchContents as $context => $content) {
        if (!is_string($content) || $content === '') {
            continue;
        }
        $haystacks[] = [
            'text' => $content,
            // preg_match() with the `u` modifier fails on invalid UTF-8 subjects.
            'utf8' => preg_match('//u', $content) === 1,
            'weight' => $contextWeights[$context] ?? 1,
        ];
    }

    // Builds a whole-word pattern; boundaries are only added next to word characters.
    $buildRegex = static function (string $keyword, string $modifiers): string {
        $lead = preg_match('/^\w/' . $modifiers, $keyword) === 1 ? '(?<!\w)' : '';
        $trail = preg_match('/\w$/' . $modifiers, $keyword) === 1 ? '(?!\w)' : '';

        return '/' . $lead . preg_quote($keyword, '/') . $trail . '/' . $modifiers;
    };

    $tagScores = [];

    foreach ($keywordsToTags as $keywords => $tags) {
        if (!is_scalar($tags)) {
            continue;
        }

        $tagList = array_filter(array_map('trim', explode(',', (string) $tags)), 'strlen');
        if ($tagList === []) {
            continue;
        }

        $keywordList = array_unique(
            array_filter(array_map('trim', explode(',', (string) $keywords)), 'strlen')
        );

        foreach ($keywordList as $keyword) {
            $unicodeRegex = preg_match('//u', $keyword) === 1 ? $buildRegex($keyword, 'iu') : null;
            $byteRegex = null;

            foreach ($haystacks as $haystack) {
                if ($haystack['utf8'] && $unicodeRegex !== null) {
                    $regex = $unicodeRegex;
                } else {
                    $byteRegex = $byteRegex ?? $buildRegex($keyword, 'i');
                    $regex = $byteRegex;
                }

                if (preg_match($regex, $haystack['text']) !== 1) {
                    continue;
                }

                foreach ($tagList as $tag) {
                    $tagScores[$tag] = ($tagScores[$tag] ?? 0) + $haystack['weight'];
                }
            }
        }
    }

    return $tagScores;
}
|
|
|
|
/**
 * Pick the strongest tags from a score table.
 *
 * A tag is kept when its score is at least $minScore and strictly greater
 * than half of the best remaining score. Survivors are ordered by descending
 * score and the list is cut to $maxTags entries.
 *
 * @param array $tagScores Map of tag => score.
 * @param int   $minScore  Lowest score a tag needs to be considered.
 * @param int   $maxTags   Maximum number of tags returned.
 *
 * @return array List of tag names, best first.
 */
function filter_and_limit_tags(array $tagScores, int $minScore = 2, int $maxTags = 4): array
{
    // First pass: discard everything below the absolute minimum.
    $eligible = [];
    foreach ($tagScores as $tag => $score) {
        if ($score >= $minScore) {
            $eligible[$tag] = $score;
        }
    }

    // Nothing qualified, so there is no best score to compare against.
    if ($eligible === []) {
        return [];
    }

    // Second pass: keep only tags scoring strictly above half of the best score.
    $threshold = max($eligible) / 2;
    $strongest = [];
    foreach ($eligible as $tag => $score) {
        if ($score > $threshold) {
            $strongest[$tag] = $score;
        }
    }

    // Highest score first, then truncate to the requested size.
    arsort($strongest);
    $orderedTags = array_keys($strongest);

    return array_slice($orderedTags, 0, $maxTags);
}
|
|
|
|
/**
 * Debug helper: append the computed tag scores to the link description.
 *
 * Adds a "Tag Stats:" section with one "tag: score=N" line per entry. A
 * missing description is treated as an empty string, so the helper does not
 * raise an undefined-index warning.
 *
 * @param array $data      Hook payload; the text is written to $data['link']['description'].
 * @param array $tagScores Map of tag => score.
 *
 * @return array The payload with the extended description.
 */
function append_tag_stats(array $data, array $tagScores): array
{
    $stats = [];
    foreach ($tagScores as $tag => $score) {
        $stats[] = "$tag: score=$score";
    }

    $description = $data['link']['description'] ?? '';
    $data['link']['description'] = $description . "\n\nTag Stats:\n" . implode("\n", $stats);

    return $data;
}
|
|
|
|
/**
 * Compute automatic tags for a link and merge them into its tag string.
 *
 * The link's title, URL, description, existing tags and fetched page text are
 * scanned with the rules stored under 'plugins.AUTO_TAG_KEYWORDS'. The
 * best-scoring tags plus the fixed marker tag 'auto-tagged' are appended to
 * the existing space-separated tags, without duplicates or empty tokens.
 *
 * @param array         $data Hook payload; reads and writes $data['link'].
 * @param ConfigManager $conf Configuration manager holding the rules.
 *
 * @return array The payload, unchanged when the link has no URL.
 */
function apply_auto_tags(array $data, ConfigManager $conf): array
{
    if (empty($data['link']['url'])) {
        return $data;
    }

    $keywordsToTags = $conf->get('plugins.AUTO_TAG_KEYWORDS', []);
    if (!is_array($keywordsToTags)) {
        // A malformed setting must not crash the page; behave as if no rule exists.
        $keywordsToTags = [];
    }

    // Split once and drop the empty tokens produced by repeated or stray spaces.
    $existingTags = array_values(array_filter(
        array_map('trim', explode(' ', (string) ($data['link']['tags'] ?? ''))),
        'strlen'
    ));

    $searchContents = [
        'title' => $data['link']['title'] ?? '',
        'url' => $data['link']['url'],
        'description' => $data['link']['description'] ?? '',
        'existing' => implode(' ', $existingTags),
        'content' => fetch_page_content($data['link']['url']),
    ];

    $tagScores = calculate_tags($keywordsToTags, $searchContents);

    // $data = append_tag_stats($data, $tagScores); // UNCOMMENT WHEN DEBUGGING AUTOMATED TAGS

    $tagsToAdd = array_map('trim', filter_and_limit_tags($tagScores));
    $tagsToAdd[] = 'auto-tagged'; // Fixed marker showing the link went through auto-tagging

    $data['link']['tags'] = implode(' ', array_unique(array_merge($existingTags, $tagsToAdd)));

    return $data;
}
|
|
|
|
/**
 * Hook run when the edit-link form is rendered.
 *
 * Auto-tagging is applied unconditionally, for new and already stored links
 * alike, by delegating to apply_auto_tags().
 *
 * @param array         $data Hook payload handed over by the plugin system.
 * @param ConfigManager $conf Configuration manager.
 *
 * @return array The payload returned by apply_auto_tags().
 */
function hook_auto_tag_plugin_render_editlink(array $data, ConfigManager $conf): array
{
    $taggedData = apply_auto_tags($data, $conf);

    return $taggedData;
}
|
|
|
|
/**
 * Hook run when a link is saved; delegates to apply_auto_tags().
 *
 * NOTE(review): apply_auto_tags() only acts when $data['link']['url'] is set.
 * Confirm that the save_link payload nests the link under 'link'; if it does
 * not, this hook returns $data unchanged.
 *
 * @param array         $data Hook payload handed over by the plugin system.
 * @param ConfigManager $conf Configuration manager.
 *
 * @return array The payload returned by apply_auto_tags().
 */
function hook_auto_tag_plugin_save_link(array $data, ConfigManager $conf): array
{
    $taggedData = apply_auto_tags($data, $conf);

    return $taggedData;
}
|
|
|