From e5e7ab27e30e8e35ce9540a8e772806aea0c8e32 Mon Sep 17 00:00:00 2001 From: quenousimporte Date: Mon, 26 Aug 2024 18:15:17 +0200 Subject: [PATCH] refactor mp --- epub.php | 170 ++++++++++++++++++------------------------------------- 1 file changed, 56 insertions(+), 114 deletions(-) diff --git a/epub.php b/epub.php index 7274400..f1518fe 100644 --- a/epub.php +++ b/epub.php @@ -19,7 +19,61 @@ function add_mp_article($url, $context, $epub) { + $article = file_get_contents($url, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($article); + $finder = new DomXPath($doc); + $title = $finder->query('//h1')->item(0)->textContent; + $author = $finder->query('//a[contains(@href, "biographie")]')->item(0)->textContent; + $date = $finder->query('//time')->item(1)->textContent; + $summary = $finder->query('//p[contains(@class, "news__heading__top__intro")]')->item(0)->textContent; + + // strip images + $toremove = $finder->query('//svg'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//figure'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//span[@class="screen-reader-only"]'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + + $result = '

' . $title . '

'; + $result .= '

' . $author . '

'; + $result .= '

' . $date . '

'; + $result .= '

' . $summary . '

'; + + $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); + if (!$nodes->length) + { + // articles accès libre + $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); + } + + if (!$nodes->length) + { + echo 'warning: could not get content of "' . $title . '"
'; + } + else + { + $node = $nodes->item(0); + + $innerHTML = ''; + foreach ($node->childNodes as $childNode){ + $innerHTML .= $childNode->ownerDocument->saveHTML($childNode); + } + $result .= '
' . strip_tags($innerHTML, '

') . '

'; + + $epub->AddPage($result, false, $title); + } } function get_mp_cookie() @@ -330,63 +384,7 @@ foreach ($items as $item) { - $url = $item->link; - - $article = file_get_contents($url, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - - $title = $item->title; - $author = $item->xpath('dc:creator')[0]; - $date = $item->pubDate; - $summary = $item->description; - - // strip images - $toremove = $finder->query('//svg'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//figure'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//span[@class="screen-reader-only"]'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - - $result = '

' . $title . '

'; - $result .= '

' . $author . '

'; - $result .= '

' . $date . '

'; - $result .= '

' . $summary . '

'; - - $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); - if (!$nodes->length) - { - // articles accès libre - $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); - } - - if (!$nodes->length) - { - echo 'warning: could not get content of "' . $title . '"
'; - } - else - { - $node = $nodes->item(0); - - $innerHTML = ''; - foreach ($node->childNodes as $childNode){ - $innerHTML .= $childNode->ownerDocument->saveHTML($childNode); - } - $result .= '
' . strip_tags($innerHTML, '

') . '

'; - - $epub->AddPage($result, false, $title); - } + add_mp_article($item->link, $context, $epub); } write_epub($epub); @@ -420,63 +418,7 @@ foreach ($links as $link) { - $url = 'https://www.mediapart.fr' . $link->value; - - $article = file_get_contents($url, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - - $title = $finder->query('//h1')->item(0)->textContent; - $author = $finder->query('//a[contains(@href, "biographie")]')->item(0)->textContent; - $date = $finder->query('//time')->item(1)->textContent; - $summary = $finder->query('//p[contains(@class, "news__heading__top__intro")]')->item(0)->textContent; - - // strip images - $toremove = $finder->query('//svg'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//figure'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//span[@class="screen-reader-only"]'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - - $result = '

' . $title . '

'; - $result .= '

' . $author . '

'; - $result .= '

' . $date . '

'; - $result .= '

' . $summary . '

'; - - $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); - if (!$nodes->length) - { - // articles accès libre - $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); - } - - if (!$nodes->length) - { - echo 'warning: could not get content of "' . $title . '"
'; - } - else - { - $node = $nodes->item(0); - - $innerHTML = ''; - foreach ($node->childNodes as $childNode){ - $innerHTML .= $childNode->ownerDocument->saveHTML($childNode); - } - $result .= '
' . strip_tags($innerHTML, '

') . '

'; - - $epub->AddPage($result, false, $title); - } + add_mp_article('https://www.mediapart.fr' . $link->value, $context, $epub); } write_epub($epub);