diff --git a/mediapart.php b/mediapart.php index c64a638..f470964 100644 --- a/mediapart.php +++ b/mediapart.php @@ -1,6 +1,10 @@ loadHTML($article); $finder = new DomXPath($doc); - // clean images - $figures = $finder->query('//figure'); - foreach ($figures as $figure) + $svgs = $finder->query('//svg'); + foreach ($svgs as $svg) { - $figure->parentNode->removeChild($figure); + $svg->parentNode->removeChild($svg); } - // clean "à lire aussi" and inline quotes - $asides = $finder->query('//aside'); - foreach ($asides as $aside) - { - $aside->parentNode->removeChild($aside); - } - - // improve section titles - $asides = $finder->query('//h2'); - foreach ($asides as $aside) - { - $aside->textContent = "\n*" . $aside->textContent . "*\n"; - // todo keep h2 - } - - $maintext = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); $result = '

' . $title . '

'; $result .= '

' . $author . '

'; $result .= '

' . $item->pubDate . '

'; $result .= '

' . $summary . '

'; - // todo use html instead - $result .= '
' . str_replace("\n", '
', $maintext->item(0)->textContent) . '
'; + $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); + if (!$nodes->length) + { + // articles accès libre + $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); + } + + if (!$nodes->length) + { + echo 'warning: could not get content of "' . $title . '"
'; + } + else + { + $node = $nodes->item(0); + + $innerHTML = ''; + foreach ($node->childNodes as $childNode){ + $innerHTML .= $childNode->ownerDocument->saveHTML($childNode); + } + $outerHTML = $node->ownerDocument->saveHTML($node); + $textcontent = $node->textContent; + + $result .= '
' . $innerHTML . '
'; + + $epub->AddPage($result, false, $title); + } + //echo $result; - - $epub->AddPage($result, false, $title); - //break; }