improve mediapart: use existing html
This commit is contained in:
parent
b34c11113d
commit
16885de8ad
|
@ -1,6 +1,10 @@
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
$sessionid = 'value of cookie MPSESSID';
|
$sessionid = 'value of cookie MPSESSID';
|
||||||
|
/*ini_set('display_errors', '1');
|
||||||
|
ini_set('display_startup_errors', '1');
|
||||||
|
error_reporting(E_ALL);*/
|
||||||
|
|
||||||
require('TPEpubCreator.php');
|
require('TPEpubCreator.php');
|
||||||
$feedurl = 'https://www.mediapart.fr/articles/feed';
|
$feedurl = 'https://www.mediapart.fr/articles/feed';
|
||||||
$opts = [
|
$opts = [
|
||||||
|
@ -37,41 +41,46 @@ foreach ($items as $item)
|
||||||
$doc->loadHTML($article);
|
$doc->loadHTML($article);
|
||||||
$finder = new DomXPath($doc);
|
$finder = new DomXPath($doc);
|
||||||
|
|
||||||
// clean images
|
$svgs = $finder->query('//svg');
|
||||||
$figures = $finder->query('//figure');
|
foreach ($svgs as $svg)
|
||||||
foreach ($figures as $figure)
|
|
||||||
{
|
{
|
||||||
$figure->parentNode->removeChild($figure);
|
$svg->parentNode->removeChild($svg);
|
||||||
}
|
}
|
||||||
|
|
||||||
// clean "à lire aussi" and inline quotes
|
|
||||||
$asides = $finder->query('//aside');
|
|
||||||
foreach ($asides as $aside)
|
|
||||||
{
|
|
||||||
$aside->parentNode->removeChild($aside);
|
|
||||||
}
|
|
||||||
|
|
||||||
// improve section titles
|
|
||||||
$asides = $finder->query('//h2');
|
|
||||||
foreach ($asides as $aside)
|
|
||||||
{
|
|
||||||
$aside->textContent = "\n*" . $aside->textContent . "*\n";
|
|
||||||
// todo keep h2
|
|
||||||
}
|
|
||||||
|
|
||||||
$maintext = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
|
|
||||||
$result = '<h1>' . $title . '</h1>';
|
$result = '<h1>' . $title . '</h1>';
|
||||||
$result .= '<p>' . $author . '</p>';
|
$result .= '<p>' . $author . '</p>';
|
||||||
$result .= '<p>' . $item->pubDate . '</p>';
|
$result .= '<p>' . $item->pubDate . '</p>';
|
||||||
$result .= '<p><b>' . $summary . '</b></p>';
|
$result .= '<p><b>' . $summary . '</b></p>';
|
||||||
|
|
||||||
// todo use html instead
|
$nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
|
||||||
$result .= '<div>' . str_replace("\n", '<br>', $maintext->item(0)->textContent) . '</div>';
|
if (!$nodes->length)
|
||||||
|
{
|
||||||
|
// articles accès libre
|
||||||
|
$nodes = $finder->query('//div[contains(@class, "news__body__center__article")]');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!$nodes->length)
|
||||||
|
{
|
||||||
|
echo 'warning: could not get content of "' . $title . '"<br>';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
$node = $nodes->item(0);
|
||||||
|
|
||||||
|
$innerHTML = '';
|
||||||
|
foreach ($node->childNodes as $childNode){
|
||||||
|
$innerHTML .= $childNode->ownerDocument->saveHTML($childNode);
|
||||||
|
}
|
||||||
|
$outerHTML = $node->ownerDocument->saveHTML($node);
|
||||||
|
$textcontent = $node->textContent;
|
||||||
|
|
||||||
|
$result .= '<div>' . $innerHTML . '</div>';
|
||||||
|
|
||||||
|
$epub->AddPage($result, false, $title);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
//echo $result;
|
//echo $result;
|
||||||
|
|
||||||
$epub->AddPage($result, false, $title);
|
|
||||||
|
|
||||||
//break;
|
//break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue