improve mediapart: use existing html

This commit is contained in:
quenousimporte 2024-05-28 12:12:41 +02:00
parent b34c11113d
commit 16885de8ad
1 changed file with 34 additions and 25 deletions

View File

@@ -1,6 +1,10 @@
<?php <?php
$sessionid = 'value of cookie MPSESSID'; $sessionid = 'value of cookie MPSESSID';
/*ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);*/
require('TPEpubCreator.php'); require('TPEpubCreator.php');
$feedurl = 'https://www.mediapart.fr/articles/feed'; $feedurl = 'https://www.mediapart.fr/articles/feed';
$opts = [ $opts = [
@@ -37,41 +41,46 @@ foreach ($items as $item)
$doc->loadHTML($article); $doc->loadHTML($article);
$finder = new DomXPath($doc); $finder = new DomXPath($doc);
// clean images $svgs = $finder->query('//svg');
$figures = $finder->query('//figure'); foreach ($svgs as $svg)
foreach ($figures as $figure)
{ {
$figure->parentNode->removeChild($figure); $svg->parentNode->removeChild($svg);
} }
// clean "à lire aussi" and inline quotes
$asides = $finder->query('//aside');
foreach ($asides as $aside)
{
$aside->parentNode->removeChild($aside);
}
// improve section titles
$asides = $finder->query('//h2');
foreach ($asides as $aside)
{
$aside->textContent = "\n*" . $aside->textContent . "*\n";
// todo keep h2
}
$maintext = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
$result = '<h1>' . $title . '</h1>'; $result = '<h1>' . $title . '</h1>';
$result .= '<p>' . $author . '</p>'; $result .= '<p>' . $author . '</p>';
$result .= '<p>' . $item->pubDate . '</p>'; $result .= '<p>' . $item->pubDate . '</p>';
$result .= '<p><b>' . $summary . '</b></p>'; $result .= '<p><b>' . $summary . '</b></p>';
// todo use html instead $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
$result .= '<div>' . str_replace("\n", '<br>', $maintext->item(0)->textContent) . '</div>'; if (!$nodes->length)
{
// articles accès libre
$nodes = $finder->query('//div[contains(@class, "news__body__center__article")]');
}
//echo $result; if (!$nodes->length)
{
echo 'warning: could not get content of "' . $title . '"<br>';
}
else
{
$node = $nodes->item(0);
$innerHTML = '';
foreach ($node->childNodes as $childNode){
$innerHTML .= $childNode->ownerDocument->saveHTML($childNode);
}
$outerHTML = $node->ownerDocument->saveHTML($node);
$textcontent = $node->textContent;
$result .= '<div>' . $innerHTML . '</div>';
$epub->AddPage($result, false, $title); $epub->AddPage($result, false, $title);
}
//echo $result;
//break; //break;
} }