From bac4ba584ecc85d534ee3d9c2ff551368fdb8629 Mon Sep 17 00:00:00 2001 From: quenousimporte Date: Tue, 1 Oct 2024 13:46:38 +0200 Subject: [PATCH] mp: add AFP in title iff author is AFP --- epub.php | 1507 +++++++++++++++++++++++++++--------------------------- 1 file changed, 756 insertions(+), 751 deletions(-) diff --git a/epub.php b/epub.php index 976ab6b..a887d15 100644 --- a/epub.php +++ b/epub.php @@ -1,752 +1,757 @@ - - - - - - Revue de presse - Epub - - - -
- Quick link: All Mediapart home page with renewed authent -
- -childNodes as $childNode){ - $nodeHTML = $childNode->ownerDocument->saveHTML($childNode); - - $nodeHTML = str_replace('' . strip_tags($innerHTML, '

') . ''; - } - - function add_mp_article($url, $context, $epub) - { - if (isset($_POST['mptoday']) && $_POST['mptoday'] && !str_contains($url, (new DateTime('today'))->format('dmy'))) - { - return; - } - // todo filter on "A la une" ? in rss item dc:subject contains "A la Une" - - $article = file_get_contents($url, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - - $title = $finder->query('//h1')->item(0)->textContent; - $author = $finder->query('//a[contains(@href, "biographie")]')->item(0)->textContent; - $date = trim($finder->query('//time')->item(1)->textContent); - $summary = $finder->query('//p[contains(@class, "news__heading__top__intro")]')->item(0)->textContent; - $category = trim($finder->query('//p[contains(@class,"news__heading__top__kicker")]')->item(0)->textContent); - - // strip images - $toremove = $finder->query('//svg'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//figure'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - $toremove = $finder->query('//span[@class="screen-reader-only"]'); - foreach ($toremove as $elt) - { - $elt->parentNode->removeChild($elt); - } - - $result = '

' . $title . '

'; - $result .= '

' . $category . '

'; - $result .= '

' . $author . '

'; - $result .= '

' . $date . '

'; - $result .= '

' . $summary . '

'; - - $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); - if (!$nodes->length) - { - // articles accès libre - $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); - } - - if (!$nodes->length) - { - echo 'warning: could not get content of "' . $title . '"
'; - } - else - { - $node = $nodes->item(0); - $nodehtml = get_clean_html($node); - $result .= $nodehtml; - - // add boite noire - $nodes = $finder->query('//aside[contains(@class, "_black")]'); - if ($nodes->length) - { - $node = $nodes->item(0); - $nodehtml = get_clean_html($node); - $result .= '
' . $nodehtml . '
'; - } - $epub->AddPage($result, false, $title); - } - } - - function get_mp_cookie() - { - global $mp_user; - global $mp_password; - - $url = 'https://www.mediapart.fr/login_check'; - $postfields = 'email=' . $mp_user . '&password=' . $mp_password . '&submitButton=&_target_path=https%3A%2F%2Fwww.mediapart.fr%2F'; - - $ch = curl_init($url); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_AUTOREFERER, false); - - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_POST, 1); - curl_setopt($ch, CURLOPT_POSTFIELDS, $postfields); - - $result = curl_exec($ch); - $sessid = ''; - - preg_match_all('/^Set-Cookie:\s*([^;]*)/mi', $result, $matches); - foreach($matches[1] as $item) - { - if (str_starts_with($item, 'MPSESSID=')) - { - $sessid = str_replace('MPSESSID=', '', $item); - } - } - return $sessid; - } - - function write_epub($epub) - { - if (file_exists($epub->epub_file)) - { - unlink($epub->epub_file); - } - - if (!$epub->error) - { - $epub->CreateEPUB(); - - if (!$epub->error) - { - echo 'Success: ' . $epub->epub_file . ' created.
'; - } - else - { - echo $epub->error; - } - - } - else - { - echo $epub->error; - } - } - - function lm_download_image($base_url, $url_part, $id) - { - $image_url = preg_replace('/GetPublicationContentItems-.*\.json/', $url_part . $id . '.jpg', $base_url); - $temp_img_path = './temp/' . $id . '.jpg'; - $temp_content = file_get_contents($image_url); - file_put_contents($temp_img_path, $temp_content); - return $temp_img_path; - } - - function get_json($url) - { - // cache json in case url expires - $tempjsonpath = 'temp/' . hash('md5', $url) . '.json'; - $json = ''; - if (file_exists($tempjsonpath)) - { - $json = file_get_contents($tempjsonpath); - } - else - { - $json = file_get_contents($url); - file_put_contents($tempjsonpath, $json); - } - return json_decode($json); - } - - // quick scenario - if (isset($_GET['mp'])) - { - $_POST['forcempcookies'] = 1; - $_POST['mphome'] = 1; - } - - date_default_timezone_set('Europe/Paris'); - $today = (new DateTime('today'))->format('Ymd'); - - if (isset($_POST['emptycache']) && $_POST['emptycache']) - { - $files = glob('temp/*'); - foreach($files as $file) - { - if(is_file($file)) - { - unlink($file); - } - } - } - - // Cafeyn - if (isset($_POST['cafeyn']) && $_POST['cafeyn']) - { - $url = $_POST['cafeyn']; - - // extract url from curl command - if (str_starts_with($url, 'curl ')) - { - $url = explode("'", $url)[1]; - echo '

extracted url from curl command:

' . $url . '

'; - } - - // cache json in case url expires - $publication = get_json($url); - $pubname = $publication->metadata->title; - - $epub = new TPEpubCreator(); - $epub->temp_folder = 'temp/'; - $epub->epub_file = 'epub/' . $pubname . 'cafeyn.epub'; - $epub->title = $pubname . ' Cafeyn - ' . $today ; - - $list = (array)$publication->articles; - $page = 1; - foreach ($list as $entry) - { - $articleurl = str_replace('material', 'article/' . $entry->hash . '.json', $url); - $article = get_json($articleurl); - - $content = '

' . $article->title . '

'; - //$content .= '' . $article->abstract . ''; - $content .= '
Page ' . $article->page . '
'; - - if (count($article->rubrics)) - { - $content .= '
Rubriques: ' . implode(',', $article->rubrics) . '
'; - } - if (count($article->authors)) - { - $content .= '
Auteurs: ' . implode(',', $article->authors) . '
'; - } - - foreach ($article->content->sections as $section) - { - $dropcap = ''; - foreach ($section->items as $item) - { - if ($item->type == 'text') - { - if ($item->class == 'quote') - { - $content .= '
' .$item->content . '
'; - } - else if ($item->class == 'paragraphTitle') - { - $content .= '

' .$item->content . '

'; - } - else if ($item->class == 'introduction') - { - $content .= '

' .$item->content . '

'; - } - else if ($item->class == 'dropcap') - { - $dropcap = $item->content; - } - else - { - if ($dropcap) - { - $item->content = '' . $dropcap . '' . $item->content; - $dropcap = ''; - } - $content .= '

' . $item->content . '

'; - } - } - } - } - - while ($article->page >= $page) - { - $path = $publication->pages[$page - 1]->tn; - $path = str_replace('material', $path, $url); - $path = str_replace('api.', 'content.', $path); - if ($page == 1) - { - // cover - $epub->AddImage($path, 'image/jpeg', true); - } - else - { - $epub->AddPage('', false, 'Page ' . $page, true); - } - $page++; - } - $epub->AddPage($content, false, $article->title); - - } - write_epub($epub); - } - - // Le Monde - if (isset($_POST['lemonde']) && $_POST['lemonde']) - { - $url = $_POST['lemonde']; - - // extract url from curl command - if (str_starts_with($url, 'curl ')) - { - $url = explode("'", $url)[1]; - echo '

extracted url from curl command:

' . $url . '

'; - } - - $epub = new TPEpubCreator(); - $epub->temp_folder = 'temp/'; - $epub->epub_file = 'epub/lemonde.epub'; - $epub->title = 'Le Monde ' . $today; - - $publication = get_json($url); - $content = array_filter($publication->Content, function($item) { return $item->Category == 'Le Monde'; }); - usort($content, function ($a, $b) { return $a->PageNumber - $b->PageNumber; }); - - $pageindex = get_json(str_replace('GetPublicationContentItems', 'GetPublicationPages', $url)); - - $page = 0; - foreach ($content as $article) - { - if ($article->PageNumber > $page) - { - $page = $article->PageNumber; - $pageobj = array_values(array_filter($pageindex->Page, function($p) - { - global $page; - return $page == $p->PageNumber; - }))[0]; - $pageid = $pageobj->PublicationPageID; - - $path = lm_download_image($url, 'Preview-MEDIUM-', $pageid); - - if ($page == 1) - { - // cover - $epub->AddImage($path, 'image/jpeg', true); - } - else - { - $epub->AddPage('', false, 'Page ' . $page, true); - } - } - - $articlebody = array_filter($article->ContentItem, function($item) { return $item->ContentType == 'text/xml'; }); - $articlebody = array_values($articlebody)[0]; - - if ($articlebody->Title && $articlebody->HtmlText) - { - $pagecontent = '

'. strip_tags($articlebody->Title) . '

'; - $pagecontent .= '

Page ' . $article->PageNumber . '

'; - - $author = array_filter($article->ContentItem, function($item) { return $item->ContentType == 'author/xml'; }); - $author = array_values($author)[0]; - if ($author->Author) - { - $pagecontent .= $author->Author; - } - - if ($articlebody->Introduction) - { - $pagecontent .= '' . $articlebody->Introduction . ''; - } - $pagecontent .= $articlebody->HtmlText; - - $epub->AddPage($pagecontent, false, str_replace(' ', ' ', strip_tags($articlebody->Title)), true); - } - } - write_epub($epub); - } - - if (isset($_POST['forcempcookies']) && $_POST['forcempcookies']) - { - $old = $mp_sessionid; - $mp_sessionid = get_mp_cookie(); - $settings = file_get_contents('settings.php'); - $settings = str_replace($old, $mp_sessionid, $settings); - file_put_contents('settings.php', $settings); - echo '
Updated mpsessid in settings.
'; - } - - // Mediapart RSS - if (isset($_POST['mediapart']) && $_POST['mediapart']) - { - $feedurl = 'https://www.mediapart.fr/articles/feed'; - $opts = [ - 'http' => [ - 'method' => "GET", - 'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, - ] - ]; - $context = stream_context_create($opts); - - $epub = new TPEpubCreator(); - $epub->temp_folder = 'temp/'; - $epub->epub_file = 'epub/mediapart.epub'; - $epub->title = 'Mediapart - RSS ' . $today; - - // load feeds - $feed = file_get_contents($feedurl); - $xml = new SimpleXMLElement($feed); - $items = $xml->xpath("/rss/channel/item"); - - foreach (array_reverse($items) as $item) - { - add_mp_article($item->link, $context, $epub); - } - - write_epub($epub); - } - - // Politis RSS - if (isset($_POST['politis']) && $_POST['politis']) - { - $issuenb = $_POST['politis']; - $feedurl = 'https://www.politis.fr/flux-rss-politis-fr/'; - $opts = [ - 'http' => [ - 'method' => "GET", - //'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, // todo? - ] - ]; - $context = stream_context_create($opts); - - $epub = new TPEpubCreator(); - $epub->temp_folder = 'temp/'; - $epub->epub_file = 'epub/politis' . $issuenb . '.epub'; - $epub->title = 'Politis ' . $issuenb . ' - RSS ' . $today; - - // find cover - $issues = file_get_contents('https://www.politis.fr/boutique/magazines-unite/'); - $doc = new DOMDocument(); - $doc->loadHTML($issues); - $finder = new DomXPath($doc); - $cover = $finder->query('//img[contains(@src, "' . $issuenb . '_")]'); - if ($cover->length) - { - $epub->AddImage($cover->item(0)->getAttribute('src'), 'image/jpeg', true); - } - else - { - echo 'warning: no cover for issue ' . $issuenb . '
'; - } - - // load feeds - $feed = file_get_contents($feedurl); - $xml = new SimpleXMLElement($feed); - $items = $xml->xpath("/rss/channel/item"); - - foreach (array_reverse($items) as $item) - { - $url = $item->link; - $article = file_get_contents($url, false, $context); - - if (!str_contains($article, ' dans l’hebdo N° ' . $issuenb)) - { - continue; - } - - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - - $title = $finder->query('//h1')->item(0)->textContent; - $author = $finder->query('//span[@class="nom"]')->item(0)->textContent; - $date = $finder->query('//span[@class="date"]')->item(0)->textContent; - $summary = $finder->query('//div[@class="extrait"]')->item(0)->textContent; - - // transform "zoom" divs into aside tags - $zooms = $finder->query('//div[contains(@class,"gt_zoom")]'); - foreach ($zooms as $zoom) - { - $aside = $doc->createElement('aside'); - while ($zoom->childNodes->length > 0) - { - $aside->appendChild($zoom->childNodes->item(0)); - } - $zoom->parentNode->replaceChild($aside, $zoom); - } - - $result = '

' . $title . '

'; - $result .= '

' . $author . '

'; - $result .= '

' . $date . '

'; - $result .= '

' . $summary . '

'; - - $nodes = $finder->query('//div[@id="content"]'); - $node = $nodes->item(0); - $nodehtml = get_clean_html($node); - $result .= $nodehtml; - $epub->AddPage($result, false, $title); - - } - - write_epub($epub); - } - - // Mediapart home page - if (isset($_POST['mphome']) && $_POST['mphome']) - { - $homeurl = 'https://www.mediapart.fr'; - $opts = [ - 'http' => [ - 'method' => "GET", - 'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, - ] - ]; - $context = stream_context_create($opts); - - $epub = new TPEpubCreator(); - $epub->temp_folder = 'temp/'; - $epub->epub_file = 'epub/mediaparthome.epub'; - $epub->title = 'Mediapart - A la une ' . $today; - - $home = file_get_contents($homeurl, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($home); - $finder = new DomXPath($doc); - - $xpath = '//a[@data-js = "teaser-link" and contains(@href, "/journal/") and not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'; - - $links = $finder->query($xpath); - - foreach ($links as $link) - { - add_mp_article('https://www.mediapart.fr' . $link->value, $context, $epub); - } - - write_epub($epub); - } - - // New York Times RSS - if (isset($_POST['nyt']) && $_POST['nyt']) - { - $feedurl = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml'; - $opts = [ - 'http' => [ - 'method' => "GET", - 'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid, - ] - ]; - $context = stream_context_create($opts); - - $epub = new TPEpubCreator(); - $epub->temp_folder = 'temp/'; - $epub->epub_file = 'epub/newyorktimes.epub'; - $epub->title = 'The New York Times ' . $today; - - // load feeds - $feed = file_get_contents($feedurl); - $xml = new SimpleXMLElement($feed); - $items = $xml->xpath("/rss/channel/item"); - - foreach (array_reverse($items) as $item) - { - $title = $item->title; - $author = $item->xpath('dc:creator')[0]; - $summary = $item->description; - - $article = file_get_contents($item->link, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - - $result = '

' . $title . '

'; - $result .= '

' . $author . '

'; - $result .= '

' . $item->pubDate . '

'; - $result .= '

' . $summary . '

'; - - $nodes = $finder->query('//section[@name="articleBody"]'); - - if (!$nodes->length) - { - echo 'warning: could not get content of "' . $title . '"
'; - } - else - { - $node = $nodes->item(0); - $nodehtml = get_clean_html($node); - $result .= $nodehtml; - $epub->AddPage($result, false, $title); - } - } - - write_epub($epub); - } - - // New York Times today's paper - if (isset($_POST['nyttoday']) && $_POST['nyttoday']) - { - $todaypage = file_get_contents("https://www.nytimes.com/section/todayspaper"); - $opts = [ - 'http' => [ - 'method' => "GET", - 'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid, - ] - ]; - $context = stream_context_create($opts); - - $epub = new TPEpubCreator(); - $epub->temp_folder = 'temp/'; - $epub->epub_file = 'epub/newyorktimestoday.epub'; - $epub->title = 'The New York Times today\'s paper ' . $today; - - // grab today's paper - $start = strpos($todaypage, "window.__preloadedData = ") + 25; - $end = strpos($todaypage, "};", $start) + 1; - $json = str_replace(":undefined", ":null", substr($todaypage, $start, $end - $start)); - $data = json_decode($json); - - foreach($data->initialState as $item) - { - if ($item->__typename == "Article") - { - $headline = ($data->initialState->{ $item->headline->id }); - - $title = $headline->default; - $summary = $item->summary; - - $result = '

' . $title . '

'; - - // authors - $result .= '

'; - $creators = ($data->initialState->{ $item->bylines[0]->id })->creators; - foreach ($creators as $creator) - { - $author = $data->initialState->{ $creator->id }; - $result .= $author->displayName . ' '; - } - $result .= '

'; - - $result .= '

' . $item->lastMajorModification . '

'; - $result .= '

' . $summary . '

'; - - $article = file_get_contents($item->url, false, $context); - $doc = new DOMDocument(); - $doc->loadHTML($article); - $finder = new DomXPath($doc); - $nodes = $finder->query('//section[@name="articleBody"]'); - - if (!$nodes->length) - { - echo 'warning: could not get content of "' . $title . '"
'; - } - else - { - $node = $nodes->item(0); - $nodehtml = get_clean_html($node); - $result .= $nodehtml; - $epub->AddPage($result, false, $title); - } - } - } - write_epub($epub); - } -?> - -

Generate epub

-
-
- - - -

Mediapart

- - -
- - -
- - -
- - - -

Le Monde

- - - -

Cafeyn

- - - -

The New York Times

- - -
- - - -

Politis

- - -

Admin

- - -

- - -
-
- -

Available files

-"; - while (false !== ($file = readdir($handle))) { - if ($file != "." && $file != "..") { - $filePath = 'epub/' . $file; - $modTime = date("F d Y H:i:s.", filemtime($filePath)); - echo "
  • $file - $modTime
  • "; - } - } - echo ""; - closedir($handle); - } -?> - -

    Mediapart RSS summary

    -xpath("/rss/channel/item"); - foreach ($items as $item) - { - echo ''; - } -?> - -

    Links

    -Generated files
    -Press review
    - - + + + + + + Revue de presse - Epub + + + + + +childNodes as $childNode){ + $nodeHTML = $childNode->ownerDocument->saveHTML($childNode); + + $nodeHTML = str_replace('' . strip_tags($innerHTML, '

    ') . ''; + } + + function add_mp_article($url, $context, $epub) + { + if (isset($_POST['mptoday']) && $_POST['mptoday'] && !str_contains($url, (new DateTime('today'))->format('dmy'))) + { + return; + } + // todo filter on "A la une" ? in rss item dc:subject contains "A la Une" + + $article = file_get_contents($url, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($article); + $finder = new DomXPath($doc); + + $title = $finder->query('//h1')->item(0)->textContent; + $author = $finder->query('//a[contains(@href, "biographie")]')->item(0)->textContent; + $date = trim($finder->query('//time')->item(1)->textContent); + $summary = $finder->query('//p[contains(@class, "news__heading__top__intro")]')->item(0)->textContent; + $category = trim($finder->query('//p[contains(@class,"news__heading__top__kicker")]')->item(0)->textContent); + + if (trim($author) == 'Agence France-Presse') + { + $title = 'AFP - ' . $title; + } + + // strip images + $toremove = $finder->query('//svg'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//figure'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + $toremove = $finder->query('//span[@class="screen-reader-only"]'); + foreach ($toremove as $elt) + { + $elt->parentNode->removeChild($elt); + } + + $result = '

    ' . $title . '

    '; + $result .= '

    ' . $category . '

    '; + $result .= '

    ' . $author . '

    '; + $result .= '

    ' . $date . '

    '; + $result .= '

    ' . $summary . '

    '; + + $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]'); + if (!$nodes->length) + { + // articles accès libre + $nodes = $finder->query('//div[contains(@class, "news__body__center__article")]'); + } + + if (!$nodes->length) + { + echo 'warning: could not get content of "' . $title . '"
    '; + } + else + { + $node = $nodes->item(0); + $nodehtml = get_clean_html($node); + $result .= $nodehtml; + + // add boite noire + $nodes = $finder->query('//aside[contains(@class, "_black")]'); + if ($nodes->length) + { + $node = $nodes->item(0); + $nodehtml = get_clean_html($node); + $result .= '
    ' . $nodehtml . '
    '; + } + $epub->AddPage($result, false, $title); + } + } + + function get_mp_cookie() + { + global $mp_user; + global $mp_password; + + $url = 'https://www.mediapart.fr/login_check'; + $postfields = 'email=' . $mp_user . '&password=' . $mp_password . '&submitButton=&_target_path=https%3A%2F%2Fwww.mediapart.fr%2F'; + + $ch = curl_init($url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_AUTOREFERER, false); + + curl_setopt($ch, CURLOPT_HEADER, 1); + curl_setopt($ch, CURLOPT_POST, 1); + curl_setopt($ch, CURLOPT_POSTFIELDS, $postfields); + + $result = curl_exec($ch); + $sessid = ''; + + preg_match_all('/^Set-Cookie:\s*([^;]*)/mi', $result, $matches); + foreach($matches[1] as $item) + { + if (str_starts_with($item, 'MPSESSID=')) + { + $sessid = str_replace('MPSESSID=', '', $item); + } + } + return $sessid; + } + + function write_epub($epub) + { + if (file_exists($epub->epub_file)) + { + unlink($epub->epub_file); + } + + if (!$epub->error) + { + $epub->CreateEPUB(); + + if (!$epub->error) + { + echo 'Success: ' . $epub->epub_file . ' created.
    '; + } + else + { + echo $epub->error; + } + + } + else + { + echo $epub->error; + } + } + + function lm_download_image($base_url, $url_part, $id) + { + $image_url = preg_replace('/GetPublicationContentItems-.*\.json/', $url_part . $id . '.jpg', $base_url); + $temp_img_path = './temp/' . $id . '.jpg'; + $temp_content = file_get_contents($image_url); + file_put_contents($temp_img_path, $temp_content); + return $temp_img_path; + } + + function get_json($url) + { + // cache json in case url expires + $tempjsonpath = 'temp/' . hash('md5', $url) . '.json'; + $json = ''; + if (file_exists($tempjsonpath)) + { + $json = file_get_contents($tempjsonpath); + } + else + { + $json = file_get_contents($url); + file_put_contents($tempjsonpath, $json); + } + return json_decode($json); + } + + // quick scenario + if (isset($_GET['mp'])) + { + $_POST['forcempcookies'] = 1; + $_POST['mphome'] = 1; + } + + date_default_timezone_set('Europe/Paris'); + $today = (new DateTime('today'))->format('Ymd'); + + if (isset($_POST['emptycache']) && $_POST['emptycache']) + { + $files = glob('temp/*'); + foreach($files as $file) + { + if(is_file($file)) + { + unlink($file); + } + } + } + + // Cafeyn + if (isset($_POST['cafeyn']) && $_POST['cafeyn']) + { + $url = $_POST['cafeyn']; + + // extract url from curl command + if (str_starts_with($url, 'curl ')) + { + $url = explode("'", $url)[1]; + echo '

    extracted url from curl command:

    ' . $url . '

    '; + } + + // cache json in case url expires + $publication = get_json($url); + $pubname = $publication->metadata->title; + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/' . $pubname . 'cafeyn.epub'; + $epub->title = $pubname . ' Cafeyn - ' . $today ; + + $list = (array)$publication->articles; + $page = 1; + foreach ($list as $entry) + { + $articleurl = str_replace('material', 'article/' . $entry->hash . '.json', $url); + $article = get_json($articleurl); + + $content = '

    ' . $article->title . '

    '; + //$content .= '' . $article->abstract . ''; + $content .= '
    Page ' . $article->page . '
    '; + + if (count($article->rubrics)) + { + $content .= '
    Rubriques: ' . implode(',', $article->rubrics) . '
    '; + } + if (count($article->authors)) + { + $content .= '
    Auteurs: ' . implode(',', $article->authors) . '
    '; + } + + foreach ($article->content->sections as $section) + { + $dropcap = ''; + foreach ($section->items as $item) + { + if ($item->type == 'text') + { + if ($item->class == 'quote') + { + $content .= '
    ' .$item->content . '
    '; + } + else if ($item->class == 'paragraphTitle') + { + $content .= '

    ' .$item->content . '

    '; + } + else if ($item->class == 'introduction') + { + $content .= '

    ' .$item->content . '

    '; + } + else if ($item->class == 'dropcap') + { + $dropcap = $item->content; + } + else + { + if ($dropcap) + { + $item->content = '' . $dropcap . '' . $item->content; + $dropcap = ''; + } + $content .= '

    ' . $item->content . '

    '; + } + } + } + } + + while ($article->page >= $page) + { + $path = $publication->pages[$page - 1]->tn; + $path = str_replace('material', $path, $url); + $path = str_replace('api.', 'content.', $path); + if ($page == 1) + { + // cover + $epub->AddImage($path, 'image/jpeg', true); + } + else + { + $epub->AddPage('', false, 'Page ' . $page, true); + } + $page++; + } + $epub->AddPage($content, false, $article->title); + + } + write_epub($epub); + } + + // Le Monde + if (isset($_POST['lemonde']) && $_POST['lemonde']) + { + $url = $_POST['lemonde']; + + // extract url from curl command + if (str_starts_with($url, 'curl ')) + { + $url = explode("'", $url)[1]; + echo '

    extracted url from curl command:

    ' . $url . '

    '; + } + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/lemonde.epub'; + $epub->title = 'Le Monde ' . $today; + + $publication = get_json($url); + $content = array_filter($publication->Content, function($item) { return $item->Category == 'Le Monde'; }); + usort($content, function ($a, $b) { return $a->PageNumber - $b->PageNumber; }); + + $pageindex = get_json(str_replace('GetPublicationContentItems', 'GetPublicationPages', $url)); + + $page = 0; + foreach ($content as $article) + { + if ($article->PageNumber > $page) + { + $page = $article->PageNumber; + $pageobj = array_values(array_filter($pageindex->Page, function($p) + { + global $page; + return $page == $p->PageNumber; + }))[0]; + $pageid = $pageobj->PublicationPageID; + + $path = lm_download_image($url, 'Preview-MEDIUM-', $pageid); + + if ($page == 1) + { + // cover + $epub->AddImage($path, 'image/jpeg', true); + } + else + { + $epub->AddPage('', false, 'Page ' . $page, true); + } + } + + $articlebody = array_filter($article->ContentItem, function($item) { return $item->ContentType == 'text/xml'; }); + $articlebody = array_values($articlebody)[0]; + + if ($articlebody->Title && $articlebody->HtmlText) + { + $pagecontent = '

    '. strip_tags($articlebody->Title) . '

    '; + $pagecontent .= '

    Page ' . $article->PageNumber . '

    '; + + $author = array_filter($article->ContentItem, function($item) { return $item->ContentType == 'author/xml'; }); + $author = array_values($author)[0]; + if ($author->Author) + { + $pagecontent .= $author->Author; + } + + if ($articlebody->Introduction) + { + $pagecontent .= '' . $articlebody->Introduction . ''; + } + $pagecontent .= $articlebody->HtmlText; + + $epub->AddPage($pagecontent, false, str_replace(' ', ' ', strip_tags($articlebody->Title)), true); + } + } + write_epub($epub); + } + + if (isset($_POST['forcempcookies']) && $_POST['forcempcookies']) + { + $old = $mp_sessionid; + $mp_sessionid = get_mp_cookie(); + $settings = file_get_contents('settings.php'); + $settings = str_replace($old, $mp_sessionid, $settings); + file_put_contents('settings.php', $settings); + echo '
    Updated mpsessid in settings.
    '; + } + + // Mediapart RSS + if (isset($_POST['mediapart']) && $_POST['mediapart']) + { + $feedurl = 'https://www.mediapart.fr/articles/feed'; + $opts = [ + 'http' => [ + 'method' => "GET", + 'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, + ] + ]; + $context = stream_context_create($opts); + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/mediapart.epub'; + $epub->title = 'Mediapart - RSS ' . $today; + + // load feeds + $feed = file_get_contents($feedurl); + $xml = new SimpleXMLElement($feed); + $items = $xml->xpath("/rss/channel/item"); + + foreach (array_reverse($items) as $item) + { + add_mp_article($item->link, $context, $epub); + } + + write_epub($epub); + } + + // Politis RSS + if (isset($_POST['politis']) && $_POST['politis']) + { + $issuenb = $_POST['politis']; + $feedurl = 'https://www.politis.fr/flux-rss-politis-fr/'; + $opts = [ + 'http' => [ + 'method' => "GET", + //'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, // todo? + ] + ]; + $context = stream_context_create($opts); + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/politis' . $issuenb . '.epub'; + $epub->title = 'Politis ' . $issuenb . ' - RSS ' . $today; + + // find cover + $issues = file_get_contents('https://www.politis.fr/boutique/magazines-unite/'); + $doc = new DOMDocument(); + $doc->loadHTML($issues); + $finder = new DomXPath($doc); + $cover = $finder->query('//img[contains(@src, "' . $issuenb . '_")]'); + if ($cover->length) + { + $epub->AddImage($cover->item(0)->getAttribute('src'), 'image/jpeg', true); + } + else + { + echo 'warning: no cover for issue ' . $issuenb . '
    '; + } + + // load feeds + $feed = file_get_contents($feedurl); + $xml = new SimpleXMLElement($feed); + $items = $xml->xpath("/rss/channel/item"); + + foreach (array_reverse($items) as $item) + { + $url = $item->link; + $article = file_get_contents($url, false, $context); + + if (!str_contains($article, ' dans l’hebdo N° ' . $issuenb)) + { + continue; + } + + $doc = new DOMDocument(); + $doc->loadHTML($article); + $finder = new DomXPath($doc); + + $title = $finder->query('//h1')->item(0)->textContent; + $author = $finder->query('//span[@class="nom"]')->item(0)->textContent; + $date = $finder->query('//span[@class="date"]')->item(0)->textContent; + $summary = $finder->query('//div[@class="extrait"]')->item(0)->textContent; + + // transform "zoom" divs into aside tags + $zooms = $finder->query('//div[contains(@class,"gt_zoom")]'); + foreach ($zooms as $zoom) + { + $aside = $doc->createElement('aside'); + while ($zoom->childNodes->length > 0) + { + $aside->appendChild($zoom->childNodes->item(0)); + } + $zoom->parentNode->replaceChild($aside, $zoom); + } + + $result = '

    ' . $title . '

    '; + $result .= '

    ' . $author . '

    '; + $result .= '

    ' . $date . '

    '; + $result .= '

    ' . $summary . '

    '; + + $nodes = $finder->query('//div[@id="content"]'); + $node = $nodes->item(0); + $nodehtml = get_clean_html($node); + $result .= $nodehtml; + $epub->AddPage($result, false, $title); + + } + + write_epub($epub); + } + + // Mediapart home page + if (isset($_POST['mphome']) && $_POST['mphome']) + { + $homeurl = 'https://www.mediapart.fr'; + $opts = [ + 'http' => [ + 'method' => "GET", + 'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, + ] + ]; + $context = stream_context_create($opts); + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/mediaparthome.epub'; + $epub->title = 'Mediapart - A la une ' . $today; + + $home = file_get_contents($homeurl, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($home); + $finder = new DomXPath($doc); + + $xpath = '//a[@data-js = "teaser-link" and contains(@href, "/journal/") and not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href'; + + $links = $finder->query($xpath); + + foreach ($links as $link) + { + add_mp_article('https://www.mediapart.fr' . $link->value, $context, $epub); + } + + write_epub($epub); + } + + // New York Times RSS + if (isset($_POST['nyt']) && $_POST['nyt']) + { + $feedurl = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml'; + $opts = [ + 'http' => [ + 'method' => "GET", + 'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid, + ] + ]; + $context = stream_context_create($opts); + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/newyorktimes.epub'; + $epub->title = 'The New York Times ' . $today; + + // load feeds + $feed = file_get_contents($feedurl); + $xml = new SimpleXMLElement($feed); + $items = $xml->xpath("/rss/channel/item"); + + foreach (array_reverse($items) as $item) + { + $title = $item->title; + $author = $item->xpath('dc:creator')[0]; + $summary = $item->description; + + $article = file_get_contents($item->link, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($article); + $finder = new DomXPath($doc); + + $result = '

    ' . $title . '

    '; + $result .= '

    ' . $author . '

    '; + $result .= '

    ' . $item->pubDate . '

    '; + $result .= '

    ' . $summary . '

    '; + + $nodes = $finder->query('//section[@name="articleBody"]'); + + if (!$nodes->length) + { + echo 'warning: could not get content of "' . $title . '"
    '; + } + else + { + $node = $nodes->item(0); + $nodehtml = get_clean_html($node); + $result .= $nodehtml; + $epub->AddPage($result, false, $title); + } + } + + write_epub($epub); + } + + // New York Times today's paper + if (isset($_POST['nyttoday']) && $_POST['nyttoday']) + { + $todaypage = file_get_contents("https://www.nytimes.com/section/todayspaper"); + $opts = [ + 'http' => [ + 'method' => "GET", + 'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid, + ] + ]; + $context = stream_context_create($opts); + + $epub = new TPEpubCreator(); + $epub->temp_folder = 'temp/'; + $epub->epub_file = 'epub/newyorktimestoday.epub'; + $epub->title = 'The New York Times today\'s paper ' . $today; + + // grab today's paper + $start = strpos($todaypage, "window.__preloadedData = ") + 25; + $end = strpos($todaypage, "};", $start) + 1; + $json = str_replace(":undefined", ":null", substr($todaypage, $start, $end - $start)); + $data = json_decode($json); + + foreach($data->initialState as $item) + { + if ($item->__typename == "Article") + { + $headline = ($data->initialState->{ $item->headline->id }); + + $title = $headline->default; + $summary = $item->summary; + + $result = '

    ' . $title . '

    '; + + // authors + $result .= '

    '; + $creators = ($data->initialState->{ $item->bylines[0]->id })->creators; + foreach ($creators as $creator) + { + $author = $data->initialState->{ $creator->id }; + $result .= $author->displayName . ' '; + } + $result .= '

    '; + + $result .= '

    ' . $item->lastMajorModification . '

    '; + $result .= '

    ' . $summary . '

    '; + + $article = file_get_contents($item->url, false, $context); + $doc = new DOMDocument(); + $doc->loadHTML($article); + $finder = new DomXPath($doc); + $nodes = $finder->query('//section[@name="articleBody"]'); + + if (!$nodes->length) + { + echo 'warning: could not get content of "' . $title . '"
    '; + } + else + { + $node = $nodes->item(0); + $nodehtml = get_clean_html($node); + $result .= $nodehtml; + $epub->AddPage($result, false, $title); + } + } + } + write_epub($epub); + } +?> + +

    Generate epub

    +
    +
    + + + +

    Mediapart

    + + +
    + + +
    + + +
    + + + +

    Le Monde

    + + + +

    Cafeyn

    + + + +

    The New York Times

    + + +
    + + + +

    Politis

    + + +

    Admin

    + + +

    + + +
    +
    + +

    Available files

    +"; + while (false !== ($file = readdir($handle))) { + if ($file != "." && $file != "..") { + $filePath = 'epub/' . $file; + $modTime = date("F d Y H:i:s.", filemtime($filePath)); + echo "
  • $file - $modTime
  • "; + } + } + echo ""; + closedir($handle); + } +?> + +

    Mediapart RSS summary

    +xpath("/rss/channel/item"); + foreach ($items as $item) + { + echo ''; + } +?> + +

    Links

    +Generated files
    +Press review
    + + \ No newline at end of file