Revue de presse

childNodes as $childNode){ $nodeHTML = $childNode->ownerDocument->saveHTML($childNode); $nodeHTML = str_replace('' . strip_tags($innerHTML, '

') . '';
}

/**
 * Downloads one Mediapart article and appends it to the epub as a page.
 *
 * When the "mptoday" form option is set, articles whose URL does not contain
 * today's date (format dmy) are skipped. Nothing is added when the article
 * cannot be downloaded or when no content node is found (a warning is echoed).
 *
 * NOTE(review): the string literals wrapped around title/author/date/summary
 * contain only line breaks here; markup may have been lost - confirm against
 * the original template.
 *
 * @param mixed    $url     Article URL (string, or an object convertible to string).
 * @param resource $context Stream context passed to file_get_contents().
 * @param object   $epub    Epub builder; only its AddPage() method is used here.
 */
function add_mp_article($url, $context, $epub) {
    if (isset($_POST['mptoday']) && $_POST['mptoday'] && !str_contains($url, (new DateTime('today'))->format('dmy'))) {
        return;
    }
    // todo filter on "A la une" ? in rss item dc:subject contains "A la Une"
    $article = file_get_contents($url, false, $context);
    if ($article === false || $article === '') {
        // DOMDocument::loadHTML() rejects empty input, so stop here
        echo 'warning: could not download "' . $url . '"' . "\n";
        return;
    }
    $doc = new DOMDocument();
    $doc->loadHTML($article);
    $finder = new DomXPath($doc);
    // item() returns null when nothing matched: fall back to '' instead of failing
    $title = $finder->query('//h1')->item(0)?->textContent ?? '';
    $author = $finder->query('//a[contains(@href, "biographie")]')->item(0)?->textContent ?? '';
    $date = trim($finder->query('//time')->item(1)?->textContent ?? '');
    $summary = $finder->query('//p[contains(@class, "newsheadingtopintro")]')->item(0)?->textContent ?? '';
    // strip images and screen-reader-only spans (each query runs after the previous removals)
    foreach (['//svg', '//figure', '//span[@class="screen-reader-only"]'] as $query) {
        foreach ($finder->query($query) as $elt) {
            $elt->parentNode?->removeChild($elt);
        }
    }
    $result = '
' . $title . '
';
    $result .= '
' . $author . '
';
    $result .= '
' . $date . '
';
    $result .= '
' . $summary . '
';
    $nodes = $finder->query('//div[contains(@class, "paywall-restricted-content")]');
    if (!$nodes->length) {
        // free-access articles
        $nodes = $finder->query('//div[contains(@class, "newsbodycenterarticle")]');
    }
    if (!$nodes->length) {
        echo 'warning: could not get content of "' . $title . '"
';
    } else {
        $node = $nodes->item(0);
        $nodehtml = get_clean_html($node);
        $result .= $nodehtml;
        // add the "boite noire" aside when the article has one
        $nodes = $finder->query('//aside[contains(@class, "_black")]');
        if ($nodes->length) {
            $node = $nodes->item(0);
            $nodehtml = get_clean_html($node);
            $result .= '
' . $nodehtml . '
';
        }
        $epub->AddPage($result, false, $title);
    }
}

/**
 * Logs in to Mediapart with the global $mp_user / $mp_password credentials
 * and returns the value of the MPSESSID cookie found in the response headers.
 *
 * @return string The session id, or '' when the request failed or no
 *                MPSESSID cookie was set.
 */
function get_mp_cookie() {
    global $mp_user;
    global $mp_password;
    $url = 'https://www.mediapart.fr/login_check';
    // http_build_query() url-encodes the credentials (e.g. '@', '&', '+')
    $postfields = http_build_query([
        'email' => $mp_user,
        'password' => $mp_password,
        'submitButton' => '',
        '_target_path' => 'https://www.mediapart.fr/',
    ]);
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, false);
    curl_setopt($ch, CURLOPT_HEADER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $postfields);
    $result = curl_exec($ch);
    if ($result === false) {
        return '';
    }
    $sessid = '';
    // capture the whole "name=value" pair of every Set-Cookie header
    // (the previous pattern captured a single character, so nothing ever matched)
    preg_match_all('/^Set-Cookie:\s*([^;]*)/mi', $result, $matches);
    foreach ($matches[1] as $item) {
        if (str_starts_with($item, 'MPSESSID=')) {
            // redirects are followed, so the last MPSESSID seen wins
            $sessid = substr($item, strlen('MPSESSID='));
        }
    }
    return $sessid;
}

/**
 * Writes the epub file, replacing any existing file with the same name,
 * and echoes either a success message or the builder's error.
 *
 * @param object $epub Epub builder exposing epub_file, error and CreateEPUB().
 */
function write_epub($epub) {
    if (file_exists($epub->epub_file)) {
        unlink($epub->epub_file);
    }
    // only attempt creation when no error was recorded while adding content
    if (!$epub->error) {
        $epub->CreateEPUB();
    }
    if ($epub->error) {
        echo $epub->error;
        return;
    }
    echo 'Success: ' . $epub->epub_file . ' created.
';
}

/**
 * Downloads a Le Monde page image into ./temp/ and returns the local path.
 *
 * The image URL is derived from $base_url by replacing its
 * "GetPublicationContentItems-?.json" part with $url_part . $id . '.jpg'.
 *
 * @param string $base_url The "GetPublicationContentItems" json URL.
 * @param string $url_part Image name prefix.
 * @param mixed  $id       Page identifier, also used as the local file name.
 * @return string Path of the local copy.
 */
function lm_download_image($base_url, $url_part, $id) {
    $image_url = preg_replace('/GetPublicationContentItems-.\.json/', $url_part . $id . '.jpg', $base_url);
    $temp_img_path = './temp/' . $id . '.jpg';
    $temp_content = file_get_contents($image_url);
    file_put_contents($temp_img_path, $temp_content);
    return $temp_img_path;
}

/**
 * Fetches and decodes a JSON document, caching the raw response in temp/
 * (file name: md5 of the URL) in case the URL expires.
 *
 * @param string $url
 * @return mixed Decoded JSON, or null when it could not be fetched or decoded.
 */
function get_json($url) {
    // cache json in case url expires
    $tempjsonpath = 'temp/' . hash('md5', $url) . '.json';
    if (file_exists($tempjsonpath)) {
        $json = file_get_contents($tempjsonpath);
    } else {
        $json = file_get_contents($url);
        if ($json !== false) {
            // never cache a failed download: it would shadow the real document
            file_put_contents($tempjsonpath, $json);
        }
    }
    if ($json === false) {
        return null;
    }
    return json_decode($json);
}

date_default_timezone_set('Europe/Paris');
$today = (new DateTime('today'))->format('Ymd');

if (isset($_POST['emptycache']) && $_POST['emptycache']) {
    // glob() needs a wildcard to list the directory entries; it returns false on error
    foreach (glob('temp/*') ?: [] as $file) {
        if (is_file($file)) {
            unlink($file);
        }
    }
}

// Cafeyn
if (isset($_POST['cafeyn']) && $_POST['cafeyn']) {
    $url = $_POST['cafeyn'];
    // extract url from curl command
    if (str_starts_with($url, 'curl ')) {
        $url = explode("'", $url)[1];
        echo '
extracted url from curl command:
' . $url . '
';
    }
    // cache json in case url expires
    $publication = get_json($url);
    if (!isset($publication->metadata->title, $publication->articles)) {
        // download or decoding failed: nothing to build
        echo 'error: could not load Cafeyn publication from ' . $url . "\n";
    } else {
        $pubname = $publication->metadata->title;
        $epub = new TPEpubCreator();
        $epub->temp_folder = 'temp/';
        // the title comes from remote JSON: keep path separators out of the file name
        $epub->epub_file = 'epub/' . str_replace(['/', '\\'], '_', $pubname) . 'cafeyn.epub';
        $epub->title = $pubname . ' Cafeyn - ' . $today ;
        $list = (array)$publication->articles;
        $page = 1;
        foreach ($list as $entry) {
            $articleurl = str_replace('material', 'article/' . $entry->hash . '.json', $url);
            $article = get_json($articleurl);
            if (!is_object($article)) {
                echo 'warning: could not load article ' . $articleurl . "\n";
                continue;
            }
            // NOTE(review): the literals below contain only line breaks and several
            // branches look identical; markup may have been lost - confirm against
            // the original template before merging them.
            $content = '
' . $article->title . '
';
            //$content .= '' . $article->abstract . '';
            $content .= '
Page ' . $article->page . '
';
            // !empty() also covers a missing/null list, where count() would throw
            if (!empty($article->rubrics)) {
                $content .= '
Rubriques: ' . implode(',', $article->rubrics) . '
';
            }
            if (!empty($article->authors)) {
                $content .= '
Auteurs: ' . implode(',', $article->authors) . '
';
            }
            foreach ($article->content->sections ?? [] as $section) {
                // a 'dropcap' item is held back and prepended to the next plain text item
                $dropcap = '';
                foreach ($section->items ?? [] as $item) {
                    if ($item->type === 'text') {
                        if ($item->class == 'quote') {
                            $content .= '
' .$item->content . '
';
                        } else if ($item->class == 'paragraphTitle') {
                            $content .= '
' .$item->content . '
';
                        } else if ($item->class == 'introduction') {
                            $content .= '
' .$item->content . '
';
                        } else if ($item->class == 'dropcap') {
                            $dropcap = $item->content;
                        } else {
                            if ($dropcap) {
                                $item->content = '' . $dropcap . '' . $item->content;
                                $dropcap = '';
                            }
                            $content .= '
' . $item->content . '
';
                        }
                    }
                }
            }
            // emit every page up to and including this article's page, once
            while ($article->page >= $page) {
                if (!isset($publication->pages[$page - 1]->tn)) {
                    // no thumbnail entry for this page number: stop instead of failing
                    break;
                }
                $path = $publication->pages[$page - 1]->tn;
                $path = str_replace('material', $path, $url);
                $path = str_replace('api.', 'content.', $path);
                if ($page == 1) {
                    // cover
                    $epub->AddImage($path, 'image/jpeg', true);
                } else {
                    $epub->AddPage('', false, 'Page ' . $page, true);
                }
                $page++;
            }
            $epub->AddPage($content, false, $article->title);
        }
        write_epub($epub);
    }
}

// Le Monde
if (isset($_POST['lemonde']) && $_POST['lemonde']) {
    $url = $_POST['lemonde'];
    // extract url from curl command
    if (str_starts_with($url, 'curl ')) {
        $url = explode("'", $url)[1];
        echo '
extracted url from curl command:
' . $url . '
';
    }
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/lemonde.epub';
    $epub->title = 'Le Monde ' . $today;
    $publication = get_json($url);
    // keep only the items of the 'Le Monde' category, ordered by page number
    $content = array_filter($publication->Content ?? [], fn($item) => $item->Category == 'Le Monde');
    usort($content, fn($a, $b) => $a->PageNumber <=> $b->PageNumber);
    $pageindex = get_json(str_replace('GetPublicationContentItems', 'GetPublicationPages', $url));
    $page = 0;
    foreach ($content as $article) {
        // first article of a new page: emit the page itself before the article
        if ($article->PageNumber > $page) {
            $page = $article->PageNumber;
            // the arrow function captures the current $page by value (no global needed)
            $pagematches = array_values(array_filter($pageindex->Page ?? [], fn($p) => $page == $p->PageNumber));
            $pageobj = $pagematches[0] ?? null;
            if ($pageobj === null) {
                echo 'warning: page ' . $page . ' not found in page index' . "\n";
            } else {
                $pageid = $pageobj->PublicationPageID;
                $path = lm_download_image($url, 'Preview-MEDIUM-', $pageid);
                if ($page == 1) {
                    // cover
                    $epub->AddImage($path, 'image/jpeg', true);
                } else {
                    $epub->AddPage('', false, 'Page ' . $page, true);
                }
            }
        }
        // the article text is the first content item of type text/xml, if any
        $textitems = array_values(array_filter($article->ContentItem, fn($item) => $item->ContentType == 'text/xml'));
        $articlebody = $textitems[0] ?? null;
        if ($articlebody !== null && $articlebody->Title && $articlebody->HtmlText) {
            $pagecontent = '
'. strip_tags($articlebody->Title) . '
';
            $pagecontent .= '
Page ' . $article->PageNumber . '
';
            // the author is optional: first content item of type author/xml
            $authoritems = array_values(array_filter($article->ContentItem, fn($item) => $item->ContentType == 'author/xml'));
            $author = $authoritems[0] ?? null;
            if ($author !== null && $author->Author) {
                $pagecontent .= $author->Author;
            }
            if ($articlebody->Introduction) {
                $pagecontent .= '' . $articlebody->Introduction . '';
            }
            $pagecontent .= $articlebody->HtmlText;
            $epub->AddPage($pagecontent, false, str_replace(' ', ' ', strip_tags($articlebody->Title)), true);
        }
    }
    write_epub($epub);
}

// Force a new Mediapart login and persist the session id in settings.php
if (isset($_POST['forcempcookies']) && $_POST['forcempcookies']) {
    $old = $mp_sessionid;
    $newsessionid = get_mp_cookie();
    $settingsupdated = false;
    if ($newsessionid === '') {
        // login failed: keep the current id rather than erasing it from settings.php
        echo 'warning: could not get a new mpsessid, settings left unchanged' . "\n";
    } else {
        $mp_sessionid = $newsessionid;
        $settings = file_get_contents('settings.php');
        // str_replace() cannot locate an empty or absent old value
        if ($settings === false || $old === '' || !str_contains($settings, $old)) {
            echo 'warning: could not find the previous mpsessid in settings.php' . "\n";
        } else {
            $settings = str_replace($old, $mp_sessionid, $settings);
            $settingsupdated = file_put_contents('settings.php', $settings) !== false;
        }
    }
    if ($settingsupdated) echo '
Updated mpsessid in settings.
';
}

// Mediapart RSS
if (isset($_POST['mediapart']) && $_POST['mediapart']) {
    $feedurl = 'https://www.mediapart.fr/articles/feed';
    $opts = [
        'http' => [
            'method' => "GET",
            'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid,
        ]
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/mediapart.epub';
    $epub->title = 'Mediapart - RSS ' . $today;
    // load feeds
    $feed = file_get_contents($feedurl);
    $items = [];
    if ($feed === false) {
        echo 'warning: could not load feed ' . $feedurl . "\n";
    } else {
        $xml = new SimpleXMLElement($feed);
        $items = $xml->xpath("/rss/channel/item") ?: [];
    }
    // the feed is processed in reverse order of its items
    foreach (array_reverse($items) as $item) {
        add_mp_article((string) $item->link, $context, $epub);
    }
    write_epub($epub);
}

// Politis RSS
if (isset($_POST['politis']) && $_POST['politis']) {
    // the issue number ends up in a file name and in an XPath expression:
    // keep only characters that are safe in both
    $issuenb = preg_replace('/[^A-Za-z0-9_-]/', '', (string) $_POST['politis']);
    $feedurl = 'https://www.politis.fr/flux-rss-politis-fr/';
    $opts = [
        'http' => [
            'method' => "GET",
            //'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid, // todo?
        ]
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/politis' . $issuenb . '.epub';
    $epub->title = 'Politis ' . $issuenb . ' - RSS ' . $today;
    // find cover: first shop image whose src contains "<issue>_"
    $issues = file_get_contents('https://www.politis.fr/boutique/magazines-unite/');
    $covernode = null;
    if ($issues !== false && $issues !== '') {
        $doc = new DOMDocument();
        $doc->loadHTML($issues);
        $finder = new DomXPath($doc);
        $covernode = $finder->query('//img[contains(@src, "' . $issuenb . '_")]')->item(0);
    }
    if ($covernode === null) {
        echo 'warning: could not find cover of issue ' . $issuenb . "\n";
    } else {
        $cover = $covernode->getAttribute('src');
        $epub->AddImage($cover, 'image/jpeg', true);
    }
    // load feeds
    $feed = file_get_contents($feedurl);
    $items = [];
    if ($feed === false) {
        echo 'warning: could not load feed ' . $feedurl . "\n";
    } else {
        $xml = new SimpleXMLElement($feed);
        $items = $xml->xpath("/rss/channel/item") ?: [];
    }
    foreach (array_reverse($items) as $item) {
        $url = $item->link;
        $article = file_get_contents($url, false, $context);
        if ($article === false) {
            continue;
        }
        // keep only the articles that mention the requested issue
        if (!str_contains($article, ' dans l’hebdo N° ' . 
$issuenb)) {
            continue;
        }
        $doc = new DOMDocument();
        $doc->loadHTML($article);
        $finder = new DomXPath($doc);
        // item() returns null when nothing matched: fall back to '' instead of failing
        $title = $finder->query('//h1')->item(0)?->textContent ?? '';
        $author = $finder->query('//span[@class="nom"]')->item(0)?->textContent ?? '';
        $date = $finder->query('//span[@class="date"]')->item(0)?->textContent ?? '';
        $summary = $finder->query('//div[@class="extrait"]')->item(0)?->textContent ?? '';
        // NOTE(review): these literals contain only line breaks; markup may have
        // been lost - confirm against the original template.
        $result = '
' . $title . '
';
        $result .= '
' . $author . '
';
        $result .= '
' . $date . '
';
        $result .= '
' . $summary . '
';
        $node = $finder->query('//div[@id="content"]')->item(0);
        if ($node === null) {
            echo 'warning: could not get content of "' . $title . '"' . "\n";
            continue;
        }
        $nodehtml = get_clean_html($node);
        $result .= $nodehtml;
        $epub->AddPage($result, false, $title);
    }
    write_epub($epub);
}

// Mediapart home page
if (isset($_POST['mphome']) && $_POST['mphome']) {
    $homeurl = 'https://www.mediapart.fr';
    $opts = [
        'http' => [
            'method' => "GET",
            'header' => "Accept-language: en\nCookie: MPSESSID=" . $mp_sessionid,
        ]
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/mediaparthome.epub';
    $epub->title = 'Mediapart - A la une ' . $today;
    $home = file_get_contents($homeurl, false, $context);
    if ($home === false || $home === '') {
        echo 'warning: could not load ' . $homeurl . "\n";
    } else {
        $doc = new DOMDocument();
        $doc->loadHTML($home);
        $finder = new DomXPath($doc);
        // teaser links to /journal/ articles, excluding dossier, studio, "fil-" and the series index
        $xpath = '//a[@data-js = "teaser-link" and contains(@href, "/journal/") and not(contains(@href, "/dossier/")) and not(contains(@href, "/studio/")) and not(contains(@href, "fil-")) and (@href != "/journal/series")]/@href';
        $links = $finder->query($xpath);
        foreach ($links as $link) {
            add_mp_article('https://www.mediapart.fr' . $link->value, $context, $epub);
        }
    }
    write_epub($epub);
}

// New York Times RSS
if (isset($_POST['nyt']) && $_POST['nyt']) {
    $feedurl = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml';
    $opts = [
        'http' => [
            'method' => "GET",
            'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid,
        ]
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/newyorktimes.epub';
    $epub->title = 'The New York Times ' . 
$today;
    // load feeds
    $feed = file_get_contents($feedurl);
    $items = [];
    if ($feed === false) {
        echo 'warning: could not load feed ' . $feedurl . "\n";
    } else {
        $xml = new SimpleXMLElement($feed);
        $items = $xml->xpath("/rss/channel/item") ?: [];
    }
    foreach (array_reverse($items) as $item) {
        $title = $item->title;
        // dc:creator is optional in a feed item
        $author = $item->xpath('dc:creator')[0] ?? '';
        $summary = $item->description;
        $article = file_get_contents((string) $item->link, false, $context);
        if ($article === false || $article === '') {
            // DOMDocument::loadHTML() rejects empty input
            echo 'warning: could not download "' . $title . '"' . "\n";
            continue;
        }
        $doc = new DOMDocument();
        $doc->loadHTML($article);
        $finder = new DomXPath($doc);
        // NOTE(review): these literals contain only line breaks; markup may have
        // been lost - confirm against the original template.
        $result = '
' . $title . '
';
        $result .= '
' . $author . '
';
        $result .= '
' . $item->pubDate . '
';
        $result .= '
' . $summary . '
';
        $nodes = $finder->query('//section[@name="articleBody"]');
        if (!$nodes->length) {
            echo 'warning: could not get content of "' . $title . '"
';
        } else {
            $node = $nodes->item(0);
            $nodehtml = get_clean_html($node);
            $result .= $nodehtml;
            $epub->AddPage($result, false, $title);
        }
    }
    write_epub($epub);
}

// New York Times today's paper
if (isset($_POST['nyttoday']) && $_POST['nyttoday']) {
    $todaypage = file_get_contents("https://www.nytimes.com/section/todayspaper");
    $opts = [
        'http' => [
            'method' => "GET",
            'header' => "Accept-language: en\nCookie: NYT-S=" . $nyt_sessionid,
        ]
    ];
    $context = stream_context_create($opts);
    $epub = new TPEpubCreator();
    $epub->temp_folder = 'temp/';
    $epub->epub_file = 'epub/newyorktimestoday.epub';
    $epub->title = 'The New York Times today\'s paper ' . $today;
    // grab today's paper: the JSON assigned to window.__preloadedData in the page,
    // taken from just after the marker up to the first "};" that follows it
    $marker = "window.__preloadedData = ";
    $data = null;
    $start = strpos((string) $todaypage, $marker);
    if ($start !== false) {
        $start += strlen($marker);
        $end = strpos($todaypage, "};", $start);
        if ($end !== false) {
            // +1 keeps the closing brace; "undefined" is not valid JSON
            $json = str_replace(":undefined", ":null", substr($todaypage, $start, $end + 1 - $start));
            $data = json_decode($json);
        }
    }
    if (!isset($data->initialState)) {
        echo 'warning: could not extract today\'s paper data' . "\n";
    }
    foreach ($data->initialState ?? [] as $item) {
        if (!is_object($item) || ($item->__typename ?? null) !== "Article") {
            continue;
        }
        // headline, bylines and creators are references (ids) into initialState
        $headline = ($data->initialState->{ $item->headline->id });
        $title = $headline->default;
        $summary = $item->summary;
        $result = '
' . $title . '
';
        // authors
        $result .= '
';
        $bylineid = $item->bylines[0]->id ?? null;
        $creators = $bylineid === null ? [] : ($data->initialState->{ $bylineid }->creators ?? []);
        foreach ($creators as $creator) {
            $author = $data->initialState->{ $creator->id };
            $result .= $author->displayName . ' ';
        }
        $result .= '
';
        $result .= '
' . $item->lastMajorModification . '
';
        $result .= '
' . $summary . '
';
        $article = file_get_contents($item->url, false, $context);
        if ($article === false || $article === '') {
            // DOMDocument::loadHTML() rejects empty input
            echo 'warning: could not download "' . $title . '"' . "\n";
            continue;
        }
        $doc = new DOMDocument();
        $doc->loadHTML($article);
        $finder = new DomXPath($doc);
        $nodes = $finder->query('//section[@name="articleBody"]');
        if (!$nodes->length) {
            echo 'warning: could not get content of "' . $title . '"
';
        } else {
            $node = $nodes->item(0);
            $nodehtml = get_clean_html($node);
            $result .= $nodehtml;
            $epub->AddPage($result, false, $title);
        }
    }
    write_epub($epub);
}
?>
Generate epub

Mediapart
From RSS
From home page
Force cookies
Today only
Le Monde
"GetPublicationContentItems" json url:
Cafeyn
"material" json url:
The New York Times
From RSS
From today's paper embedded script
Politis
Issue number:
Admin
Empty cache

Links
Fichiers générés
Revue de presse

Mediapart RSS summary
xpath("/rss/channel/item"); foreach ($items as $item) { echo '
'.$item->pubDate. ' '.$item->title.'
'; } ?>