From 1eb8d51d3c11222b3e651b0ee2b707e580759137 Mon Sep 17 00:00:00 2001 From: quenousimporte Date: Mon, 3 Jun 2024 17:38:46 +0200 Subject: [PATCH] merge le monde and mediapart --- lemonde.php => epub.php | 107 +++++++++++++++++++++++++++++++++- mediapart.php | 126 ---------------------------------------- 2 files changed, 105 insertions(+), 128 deletions(-) rename lemonde.php => epub.php (53%) delete mode 100644 mediapart.php diff --git a/lemonde.php b/epub.php similarity index 53% rename from lemonde.php rename to epub.php index aa9df19..e5c8eb2 100644 --- a/lemonde.php +++ b/epub.php @@ -113,6 +113,109 @@ } } + // Mediapart + if (isset($_POST['mediapart']) && $_POST['mediapart']) + { + $sessionid = $_POST['mpsessid']; + + $feedurl = 'https://www.mediapart.fr/articles/feed'; + $opts = [ + 'http' => [ + 'method' => "GET", + 'header' => "Accept-language: en\nCookie: MPSESSID=" . $sessionid, + ] + ]; + $context = stream_context_create($opts); + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/mediapart' . $date . '.epub'; + $epub->title = 'Mediapart'; + + // load feeds + $feed = file_get_contents($feedurl); + $xml = new SimpleXMLElement($feed); + $items = $xml->xpath("/rss/channel/item"); + + foreach ($items as $item) + { + $title = $item->title; + + $category = $item->xpath('dc:subject')[0]; + $author = $item->xpath('dc:creator')[0]; + $summary = $item->description; + + $article = file_get_contents($item->link, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($article); + $finder = new DomXPath($doc); + + // strip images + $toremove = $finder->query('//svg'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//figure'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//span[@class="screen-reader-only"]'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + + $result = '

' . $title . '

'; + $result .= '

' . $author . '

'; + $result .= '

' . $item->pubDate . '

'; + $result .= '

' . $summary . '

'; + + $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); + if (!$nodes->length) + { + // articles accès libre + $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); + } + + if (!$nodes->length) + { + echo 'warning: could not get content of "' . $title . '"
'; + } + else + { + $node = $nodes->item(0); + + $innerHTML = ''; + foreach ($node->childNodes as $childNode){ + $innerHTML .= $childNode->ownerDocument->saveHTML($childNode); + } + $outerHTML = $node->ownerDocument->saveHTML($node); + $textcontent = $node->textContent; + + $result .= '
' . strip_tags($innerHTML, '

') . '

'; + + $epub->AddPage($result, false, $title); + } + } + + if ( ! $epub->error ) { + $epub->CreateEPUB(); + + if ( ! $epub->error ) { + echo 'Success: ' . $epub->epub_file . ' created.
'; + } + else + { + echo $epub->error; + } + + } else { + echo $epub->error; + } + } + // list existing files $files = glob('epub/*'); foreach ($files as $file) @@ -123,8 +226,8 @@ ?>
- Le Monde
- GetPublicationContentItems url:
+ Le Monde. GetPublicationContentItems url:
+ Mediapart.
diff --git a/mediapart.php b/mediapart.php deleted file mode 100644 index 4026707..0000000 --- a/mediapart.php +++ /dev/null @@ -1,126 +0,0 @@ - - - -'; -} - -/*ini_set('display_errors', '1'); -ini_set('display_startup_errors', '1'); -error_reporting(E_ALL);*/ - -require('TPEpubCreator.php'); -$feedurl = 'https://www.mediapart.fr/articles/feed'; -$opts = [ - 'http' => [ - 'method' => "GET", - 'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, - ] -]; -$context = stream_context_create($opts); - -$epub = new TPEpubCreator(); -$epub->temp_folder = 'temp/'; -$epub->epub_file = 'mediapart.epub'; -$epub->title = 'Mediapart'; - -// load feeds -$feed = file_get_contents($feedurl); -$xml = new SimpleXMLElement($feed); -$items = $xml->xpath("/rss/channel/item"); - -setlocale(LC_ALL, 'fr_FR.UTF8', 'fr_FR','fr','fr','fra','fr_FR@euro'); -$localedate = strftime("%A %d %B %Y"); - -foreach ($items as $item) -{ - $title = $item->title; - - $category = $item->xpath('dc:subject')[0]; - $author = $item->xpath('dc:creator')[0]; - $summary = $item->description; - - $article = file_get_contents($item->link, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - - // strip images - $toremove = $finder->query('//svg'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//figure'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//span[@class="screen-reader-only"]'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - - $result = '

' . $title . '

'; - $result .= '

' . $author . '

'; - $result .= '

' . $item->pubDate . '

'; - $result .= '

' . $summary . '

'; - - $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); - if (!$nodes->length) - { - // articles accès libre - $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); - } - - if (!$nodes->length) - { - echo 'warning: could not get content of "' . $title . '"
'; - } - else - { - $node = $nodes->item(0); - - $innerHTML = ''; - foreach ($node->childNodes as $childNode){ - $innerHTML .= $childNode->ownerDocument->saveHTML($childNode); - } - $outerHTML = $node->ownerDocument->saveHTML($node); - $textcontent = $node->textContent; - - $result .= '
' . strip_tags($innerHTML, '

') . '

'; - - $epub->AddPage($result, false, $title); - } - - - //echo $result; - //break; -} - -if ( ! $epub->error ) -{ - $epub->CreateEPUB(); - - if ( ! $epub->error ) { - echo 'Success: Download your book here.'; - } - - } else { - echo $epub->error; -} -?> - -
- Force MPSESSID: - -
- - \ No newline at end of file