From 566287d767700a5753e5827c963600ac74535b26 Mon Sep 17 00:00:00 2001 From: quenousimporte Date: Thu, 22 Aug 2024 12:08:39 +0200 Subject: [PATCH 1/3] add mediapart from home page --- epub.php | 127 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 18 deletions(-) diff --git a/epub.php b/epub.php index 77bb448..d004a04 100644 --- a/epub.php +++ b/epub.php @@ -157,7 +157,7 @@ $content = '

' . $article->title . '

'; //$content .= '' . $article->abstract . ''; $content .= '
Page ' . $article->page . '
'; - + if (count($article->rubrics)) { $content .= '
Rubriques: ' . implode(',', $article->rubrics) . '
'; @@ -175,19 +175,19 @@ { if ($item->class == 'quote') { - $content .= '
' .$item->content . '
'; + $content .= '
' .$item->content . '
'; } else if ($item->class == 'paragraphTitle') { - $content .= '

' .$item->content . '

'; + $content .= '

' .$item->content . '

'; } else if ($item->class == 'introduction') { - $content .= '

' .$item->content . '

'; + $content .= '

' .$item->content . '

'; } else { - $content .= '

' .$item->content . '

'; + $content .= '

' .$item->content . '

'; } } } @@ -206,11 +206,11 @@ else { $epub->AddPage('', false, 'Page ' . $page, true); - } + } $page++; } $epub->AddPage($content, false, $article->title); - + } write_epub($epub); } @@ -291,19 +291,19 @@ write_epub($epub); } - // Mediapart + if (isset($_POST['forcempcookies']) && $_POST['forcempcookies']) + { + $old = $mp_sessionid; + $mp_sessionid = get_mp_cookie(); + $settings = file_get_contents('settings.php'); + $settings = str_replace($old, $mp_sessionid, $settings); + file_put_contents('settings.php', $settings); + echo '
Updated mpsessid in settings.
'; + } + + // Mediapart RSS if (isset($_POST['mediapart']) && $_POST['mediapart']) { - if (isset($_POST['forcempcookies']) && $_POST['forcempcookies']) - { - $old = $mp_sessionid; - $mp_sessionid = get_mp_cookie(); - $settings = file_get_contents('settings.php'); - $settings = str_replace($old, $mp_sessionid, $settings); - file_put_contents('settings.php', $settings); - echo '
Updated mpsessid in settings.
'; - } - $feedurl = 'https://www.mediapart.fr/articles/feed'; $opts = [ 'http' => [ @@ -386,6 +386,94 @@ write_epub($epub); } + // Mediapart home page + if (isset($_POST['mphome']) && $_POST['mphome']) + { + $homeurl = 'https://www.mediapart.fr'; + $opts = [ + 'http' => [ + 'method' => "GET", + 'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, + ] + ]; + $context = stream_context_create($opts); + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/mediaparthome.epub'; + $epub->title = 'Mediapart home ' . $date; + + $home = file_get_contents($homeurl, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($home); + $finder = new DomXPath($doc); + + $links = $finder->query('//section[contains(@class, "_hot")]//a[not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'); + + foreach ($links as $link) + { + $url = 'https://www.mediapart.fr' . $link->value; + + $article = file_get_contents($url, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($article); + $finder = new DomXPath($doc); + + $title = $finder->query('//h1')->item(0)->textContent; + $author = $finder->query('//a[contains(@href, "biographie")]')->item(0)->textContent; + $date = $finder->query('//time')->item(0)->textContent; + $summary = $finder->query('//p[contains(@class, "news__heading__top__intro")]')->item(0)->textContent; + + // strip images + $toremove = $finder->query('//svg'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//figure'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//span[@class="screen-reader-only"]'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + + $result = '

' . $title . '

'; + $result .= '

' . $author . '

'; + $result .= '

' . $date . '

'; + $result .= '

' . $summary . '

'; + + $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); + if (!$nodes->length) + { + // articles accès libre + $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); + } + + if (!$nodes->length) + { + echo 'warning: could not get content of "' . $title . '"
'; + } + else + { + $node = $nodes->item(0); + + $innerHTML = ''; + foreach ($node->childNodes as $childNode){ + $innerHTML .= $childNode->ownerDocument->saveHTML($childNode); + } + $result .= '
' . strip_tags($innerHTML, '

') . '

'; + + $epub->AddPage($result, false, $title); + } + } + + write_epub($epub); + } + // New York Times if (isset($_POST['nyt']) && $_POST['nyt']) { @@ -565,6 +653,9 @@
+ + +
From 5fdfa0fff1c53b60afa73384487da17aa228ca77 Mon Sep 17 00:00:00 2001 From: quenousimporte Date: Thu, 22 Aug 2024 14:23:30 +0200 Subject: [PATCH 2/3] add option to include mp cold articles from home fix article date --- epub.php | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/epub.php b/epub.php index d004a04..1ad81ba 100644 --- a/epub.php +++ b/epub.php @@ -408,7 +408,13 @@ $doc->loadHTML($home); $finder = new DomXPath($doc); - $links = $finder->query('//section[contains(@class, "_hot")]//a[not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'); + $coldcondition = '[contains(@class, "_hot")]'; + if (isset($_POST['mpcold']) && $_POST['mpcold']) + { + $coldcondition = '[contains(@class, "_cold") or contains(@class, "_hot")]'; + } + + $links = $finder->query('//section' . $coldcondition . '//a[not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'); foreach ($links as $link) { @@ -421,7 +427,7 @@ $title = $finder->query('//h1')->item(0)->textContent; $author = $finder->query('//a[contains(@href, "biographie")]')->item(0)->textContent; - $date = $finder->query('//time')->item(0)->textContent; + $date = $finder->query('//time')->item(1)->textContent; $summary = $finder->query('//p[contains(@class, "news__heading__top__intro")]')->item(0)->textContent; // strip images @@ -656,6 +662,9 @@
+ + +
From 21daa9e42698724517fa3cb3e85fbf18b621bd09 Mon Sep 17 00:00:00 2001 From: quenousimporte Date: Fri, 23 Aug 2024 09:31:16 +0200 Subject: [PATCH 3/3] simplify mp home xpath, more inclusive --- epub.php | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/epub.php b/epub.php index 1ad81ba..fb9c2c8 100644 --- a/epub.php +++ b/epub.php @@ -408,13 +408,9 @@ $doc->loadHTML($home); $finder = new DomXPath($doc); - $coldcondition = '[contains(@class, "_hot")]'; - if (isset($_POST['mpcold']) && $_POST['mpcold']) - { - $coldcondition = '[contains(@class, "_cold") or contains(@class, "_hot")]'; - } + $xpath = '//a[@data-js = "teaser-link" and contains(@href, "/journal/") and not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'; - $links = $finder->query('//section' . $coldcondition . '//a[not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'); + $links = $finder->query($xpath); foreach ($links as $link) { @@ -662,9 +658,6 @@
- - -