presse/mediapart.php

133 lines
4.0 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/*ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);*/
/**
 * Return the text of a node with Mediapart's UI boilerplate stripped out.
 *
 * The share / save / PDF button labels that end up in the node's text content
 * are removed, every newline is doubled, and any run of three or more
 * newlines is then collapsed to exactly two.
 *
 * @param object|null $node Any object exposing a `textContent` property (the
 *                          script passes DOM nodes). Null yields ''.
 * @return string The cleaned text.
 */
function cleantext( $node )
{
    // A missing node (the XPath query matched nothing) yields no text instead
    // of a "property on null" warning followed by a null-to-string deprecation.
    if ($node === null) {
        return '';
    }

    $content = (string) $node->textContent;

    // Labels are removed one after another, in this order. The maintenance
    // sentence is handled BEFORE ' PDF': it contains " PDF" itself, so
    // stripping ' PDF' first would prevent the sentence from ever matching.
    // NOTE(review): 'Offrir larticle' looks like it lost a typographic
    // apostrophe somewhere — confirm against the live page text.
    $boilerplate = [
        'Partager sur Facebook',
        'Partager sur X (ex Twitter)',
        'Lire + tard',
        'Offrir larticle',
        'Cette fonctionnalité est actuellement en maintenance.La génération de PDF reste disponible via la fonctionnalité «Imprimer».',
        ' PDF',
        "Fermer\n",
        "Recommander\n",
        "Commenter\n",
    ];
    $content = str_replace($boilerplate, '', $content);

    // NOTE(review): the search string below is presumably a special (e.g.
    // non-breaking) space character; if it is a plain ASCII space this strips
    // every space from the article — verify the byte in the file.
    $content = str_replace(' ', '', $content);

    // Turn single line breaks into paragraph breaks...
    $content = str_replace("\n", "\n\n", $content);

    // ...then squeeze any longer run back down to one blank line. A single
    // regex pass handles runs of any length, unlike a fixed number of
    // "\n\n\n" -> "\n\n" passes. preg_replace returns null on failure, in
    // which case the un-squeezed text is kept.
    return preg_replace('/\n{3,}/', "\n\n", $content) ?? $content;
}
// ---------------------------------------------------------------------------
// Build a plain-text digest of today's Mediapart articles and send it to the
// client as a downloadable .txt file.
// ---------------------------------------------------------------------------

// Value of the MPSESSID cookie sent with every article request (presumably the
// session of a logged-in account — replace the placeholder with a real value).
$sessionid = 'value of cookie MPSESSID';
$feedurl = 'https://www.mediapart.fr/articles/feed';
$opts = [
    'http' => [
        'method' => 'GET',
        // Passing an array lets PHP join the header lines with CRLF itself,
        // instead of sending a bare "\n" between them.
        'header' => [
            'Accept-language: en',
            'Cookie: MPSESSID=' . $sessionid,
        ],
    ],
];
$context = stream_context_create($opts);

// Load the feed (fetched without the cookie context).
$feed = file_get_contents($feedurl);
if ($feed === false || $feed === '') {
    // Fail with an explicit message rather than an opaque XML parse exception.
    throw new RuntimeException('Unable to download the feed: ' . $feedurl);
}
$xml = new SimpleXMLElement($feed);
// xpath() may return false/null on failure; treat that as "no items".
$items = $xml->xpath("/rss/channel/item") ?: [];

// Human-readable French date used in the title and in the file name.
// NOTE(review): strftime() is deprecated as of PHP 8.1; kept because its
// output format is part of the generated file name.
setlocale(LC_ALL, 'fr_FR.UTF8', 'fr_FR','fr','fr','fra','fr_FR@euro');
$localedate = strftime("%A %d %B %Y");
$header = "Mediapart - " . $localedate . "\n\nSommaire:\n";
$result = '';

// Calendar day (in PHP's default timezone) that an item must match.
$today = date('Y-m-d');

// Collect HTML parse problems internally: real-world HTML5 pages make
// loadHTML() emit warnings, and any output before header() would break the
// download headers sent at the end of the script.
$previousLibxmlSetting = libxml_use_internal_errors(true);

// Trimmed text of the first node matching an XPath query, or '' if none.
$firstText = static fn (DOMXPath $xpath, string $query): string
    => trim($xpath->query($query)->item(0)?->textContent ?? '');

foreach ($items as $item) {
    // Parse pubDate instead of comparing string prefixes, so that both
    // "Tue, 3 Jun 2025" and the zero-padded "Tue, 03 Jun 2025" forms match.
    $published = strtotime((string) $item->pubDate);
    if ($published === false || date('Y-m-d', $published) !== $today) {
        continue;
    }

    $title = (string) $item->title;

    $article = file_get_contents((string) $item->link, false, $context);
    if ($article === false || $article === '') {
        // Article could not be fetched: skip it (loadHTML() rejects an empty
        // source) and keep it out of the summary as well.
        continue;
    }

    // Listed in the summary only once the article is actually available.
    $header .= '- ' . $title . "\n";

    $doc = new DOMDocument();
    $doc->loadHTML($article);
    libxml_clear_errors();
    $finder = new DOMXPath($doc);

    $category = $firstText($finder, '//p[@class="news__heading__top__kicker _default"]');
    $summary = $firstText($finder, '//p[@class="news__heading__top__intro margin-top:500"]');
    $author = $firstText($finder, '//a[@class="focus color:text-brand heading-product:100"]');

    // clean images
    foreach ($finder->query('//figure') as $figure) {
        $figure->parentNode->removeChild($figure);
    }

    // clean "à lire aussi" and inline quotes
    foreach ($finder->query('//aside') as $aside) {
        $aside->parentNode->removeChild($aside);
    }

    // improve section titles: wrap each <h2> text in asterisks
    foreach ($finder->query('//h2') as $heading) {
        $heading->textContent = '*' . $heading->textContent . '*';
    }

    // Output
    // old: add header as one block
    //$part1 = $finder->query('//div[@class="news__heading grid"]');
    //$result .= cleantext($part1->item(0));
    $result .= "\n------\n\n";
    $result .= "Article : " . $title . "\n";
    $result .= 'Catégorie: ' . str_replace("\n", '', $category) . "\n";
    $result .= 'Auteur.ice : ' . $author . "\n";
    $result .= "\n";
    $result .= "Introduction :\n" . $summary . "\n";

    // article main text (nothing is appended when the body container is absent)
    $body = $finder->query('//div[@class="news__body__center"]')->item(0);
    if ($body !== null) {
        $result .= cleantext($body);
    }
    //break;
}

libxml_use_internal_errors($previousLibxmlSetting);

// web version:
/*
file_put_contents('mediapart.txt', $header . "\n" . $result);
echo '<html><head><title>Mediapart - ' . $localedate . '</title></head><body>';
echo '<div><a download href="mediapart.txt">mediapart.txt</a></div>';
echo '<div>';
echo str_replace("\n", '<br>', $header) . '<br>' . str_replace("\n", '<br>', $result);
echo '</div>';
echo '</body></html>';
*/

// Download version: summary followed by the articles, as an attachment.
// NOTE(review): "application/text" is not a registered media type
// ("text/plain" is the usual one) — left unchanged, confirm before altering.
$filename = 'mediapart' . str_replace(' ', '', $localedate) . '.txt';
header('Content-Type: application/text; charset=utf-8');
header('Content-Disposition: attachment; filename=' . $filename);
echo $header . "\n" . $result;
?>