improve mediapart: use existing html
This commit is contained in:
parent
b34c11113d
commit
16885de8ad
|
@ -1,6 +1,10 @@
|
|||
<?php
|
||||
|
||||
$sessionid = 'value of cookie MPSESSID';
|
||||
/*ini_set('display_errors', '1');
|
||||
ini_set('display_startup_errors', '1');
|
||||
error_reporting(E_ALL);*/
|
||||
|
||||
require('TPEpubCreator.php');
|
||||
$feedurl = 'https://www.mediapart.fr/articles/feed';
|
||||
$opts = [
|
||||
|
@ -37,41 +41,46 @@ foreach ($items as $item)
|
|||
$doc->loadHTML($article);
|
||||
$finder = new DomXPath($doc);
|
||||
|
||||
// clean images
|
||||
$figures = $finder->query('//figure');
|
||||
foreach ($figures as $figure)
|
||||
$svgs = $finder->query('//svg');
|
||||
foreach ($svgs as $svg)
|
||||
{
|
||||
$figure->parentNode->removeChild($figure);
|
||||
$svg->parentNode->removeChild($svg);
|
||||
}
|
||||
|
||||
// clean "à lire aussi" and inline quotes
|
||||
$asides = $finder->query('//aside');
|
||||
foreach ($asides as $aside)
|
||||
{
|
||||
$aside->parentNode->removeChild($aside);
|
||||
}
|
||||
|
||||
// improve section titles
|
||||
$asides = $finder->query('//h2');
|
||||
foreach ($asides as $aside)
|
||||
{
|
||||
$aside->textContent = "\n*" . $aside->textContent . "*\n";
|
||||
// TODO: keep the real <h2> element in the output instead of rewriting its text as "*title*"
|
||||
}
|
||||
|
||||
$maintext = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
|
||||
$result = '<h1>' . $title . '</h1>';
|
||||
$result .= '<p>' . $author . '</p>';
|
||||
$result .= '<p>' . $item->pubDate . '</p>';
|
||||
$result .= '<p><b>' . $summary . '</b></p>';
|
||||
|
||||
// TODO: output the content node's HTML markup instead of its plain textContent
|
||||
$result .= '<div>' . str_replace("\n", '<br>', $maintext->item(0)->textContent) . '</div>';
|
||||
$nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
|
||||
if (!$nodes->length)
|
||||
{
|
||||
// free-access articles: no paywall-restricted container was found, so fall back to the article body container
|
||||
$nodes = $finder->query('//div[contains(@class, "news__body__center__article")]');
|
||||
}
|
||||
|
||||
//echo $result;
|
||||
if (!$nodes->length)
|
||||
{
|
||||
echo 'warning: could not get content of "' . $title . '"<br>';
|
||||
}
|
||||
else
|
||||
{
|
||||
$node = $nodes->item(0);
|
||||
|
||||
$innerHTML = '';
|
||||
foreach ($node->childNodes as $childNode){
|
||||
$innerHTML .= $childNode->ownerDocument->saveHTML($childNode);
|
||||
}
|
||||
$outerHTML = $node->ownerDocument->saveHTML($node);
|
||||
$textcontent = $node->textContent;
|
||||
|
||||
$result .= '<div>' . $innerHTML . '</div>';
|
||||
|
||||
$epub->AddPage($result, false, $title);
|
||||
}
|
||||
|
||||
|
||||
//echo $result;
|
||||
//break;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue